biotite 1.5.0__cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (354) hide show
  1. biotite/__init__.py +18 -0
  2. biotite/application/__init__.py +69 -0
  3. biotite/application/application.py +276 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +500 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +92 -0
  8. biotite/application/blast/webapp.py +428 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +223 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +216 -0
  13. biotite/application/localapp.py +342 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +116 -0
  16. biotite/application/msaapp.py +363 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +227 -0
  19. biotite/application/muscle/app5.py +163 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +447 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +199 -0
  24. biotite/application/util.py +77 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +310 -0
  27. biotite/application/viennarna/rnafold.py +254 -0
  28. biotite/application/viennarna/rnaplot.py +208 -0
  29. biotite/application/viennarna/util.py +77 -0
  30. biotite/application/webapp.py +76 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/afdb/__init__.py +12 -0
  34. biotite/database/afdb/download.py +197 -0
  35. biotite/database/entrez/__init__.py +15 -0
  36. biotite/database/entrez/check.py +60 -0
  37. biotite/database/entrez/dbnames.py +101 -0
  38. biotite/database/entrez/download.py +228 -0
  39. biotite/database/entrez/key.py +44 -0
  40. biotite/database/entrez/query.py +263 -0
  41. biotite/database/error.py +16 -0
  42. biotite/database/pubchem/__init__.py +21 -0
  43. biotite/database/pubchem/download.py +258 -0
  44. biotite/database/pubchem/error.py +30 -0
  45. biotite/database/pubchem/query.py +819 -0
  46. biotite/database/pubchem/throttle.py +98 -0
  47. biotite/database/rcsb/__init__.py +13 -0
  48. biotite/database/rcsb/download.py +161 -0
  49. biotite/database/rcsb/query.py +963 -0
  50. biotite/database/uniprot/__init__.py +13 -0
  51. biotite/database/uniprot/check.py +40 -0
  52. biotite/database/uniprot/download.py +126 -0
  53. biotite/database/uniprot/query.py +292 -0
  54. biotite/file.py +244 -0
  55. biotite/interface/__init__.py +19 -0
  56. biotite/interface/openmm/__init__.py +20 -0
  57. biotite/interface/openmm/state.py +93 -0
  58. biotite/interface/openmm/system.py +227 -0
  59. biotite/interface/pymol/__init__.py +201 -0
  60. biotite/interface/pymol/cgo.py +346 -0
  61. biotite/interface/pymol/convert.py +185 -0
  62. biotite/interface/pymol/display.py +267 -0
  63. biotite/interface/pymol/object.py +1228 -0
  64. biotite/interface/pymol/shapes.py +178 -0
  65. biotite/interface/pymol/startup.py +169 -0
  66. biotite/interface/rdkit/__init__.py +19 -0
  67. biotite/interface/rdkit/mol.py +490 -0
  68. biotite/interface/version.py +94 -0
  69. biotite/interface/warning.py +19 -0
  70. biotite/sequence/__init__.py +84 -0
  71. biotite/sequence/align/__init__.py +199 -0
  72. biotite/sequence/align/alignment.py +702 -0
  73. biotite/sequence/align/banded.cpython-311-x86_64-linux-gnu.so +0 -0
  74. biotite/sequence/align/banded.pyx +652 -0
  75. biotite/sequence/align/buckets.py +71 -0
  76. biotite/sequence/align/cigar.py +425 -0
  77. biotite/sequence/align/kmeralphabet.cpython-311-x86_64-linux-gnu.so +0 -0
  78. biotite/sequence/align/kmeralphabet.pyx +595 -0
  79. biotite/sequence/align/kmersimilarity.cpython-311-x86_64-linux-gnu.so +0 -0
  80. biotite/sequence/align/kmersimilarity.pyx +233 -0
  81. biotite/sequence/align/kmertable.cpython-311-x86_64-linux-gnu.so +0 -0
  82. biotite/sequence/align/kmertable.pyx +3411 -0
  83. biotite/sequence/align/localgapped.cpython-311-x86_64-linux-gnu.so +0 -0
  84. biotite/sequence/align/localgapped.pyx +892 -0
  85. biotite/sequence/align/localungapped.cpython-311-x86_64-linux-gnu.so +0 -0
  86. biotite/sequence/align/localungapped.pyx +279 -0
  87. biotite/sequence/align/matrix.py +631 -0
  88. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  89. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  93. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  94. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  95. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  96. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  97. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  98. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  99. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  100. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  101. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  102. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  103. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  104. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  105. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  106. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  107. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  108. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  109. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  110. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  111. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  112. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  113. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  114. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  115. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  116. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  117. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  118. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  119. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  120. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  121. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  122. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  154. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  155. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  156. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  157. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  158. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  159. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  160. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  161. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  162. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  163. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  164. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  165. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  166. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  167. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  168. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  169. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  170. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  171. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  172. biotite/sequence/align/matrix_data/PB.license +21 -0
  173. biotite/sequence/align/matrix_data/PB.mat +18 -0
  174. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  175. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  176. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  177. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  178. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  179. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  180. biotite/sequence/align/multiple.cpython-311-x86_64-linux-gnu.so +0 -0
  181. biotite/sequence/align/multiple.pyx +619 -0
  182. biotite/sequence/align/pairwise.cpython-311-x86_64-linux-gnu.so +0 -0
  183. biotite/sequence/align/pairwise.pyx +585 -0
  184. biotite/sequence/align/permutation.cpython-311-x86_64-linux-gnu.so +0 -0
  185. biotite/sequence/align/permutation.pyx +313 -0
  186. biotite/sequence/align/primes.txt +821 -0
  187. biotite/sequence/align/selector.cpython-311-x86_64-linux-gnu.so +0 -0
  188. biotite/sequence/align/selector.pyx +954 -0
  189. biotite/sequence/align/statistics.py +264 -0
  190. biotite/sequence/align/tracetable.cpython-311-x86_64-linux-gnu.so +0 -0
  191. biotite/sequence/align/tracetable.pxd +64 -0
  192. biotite/sequence/align/tracetable.pyx +370 -0
  193. biotite/sequence/alphabet.py +555 -0
  194. biotite/sequence/annotation.py +836 -0
  195. biotite/sequence/codec.cpython-311-x86_64-linux-gnu.so +0 -0
  196. biotite/sequence/codec.pyx +155 -0
  197. biotite/sequence/codon.py +476 -0
  198. biotite/sequence/codon_tables.txt +202 -0
  199. biotite/sequence/graphics/__init__.py +33 -0
  200. biotite/sequence/graphics/alignment.py +1101 -0
  201. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  202. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  203. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  204. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  205. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  206. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  207. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  208. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  209. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  210. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  211. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  212. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  213. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  214. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  215. biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
  216. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  217. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  218. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  219. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  220. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  221. biotite/sequence/graphics/colorschemes.py +170 -0
  222. biotite/sequence/graphics/dendrogram.py +231 -0
  223. biotite/sequence/graphics/features.py +544 -0
  224. biotite/sequence/graphics/logo.py +102 -0
  225. biotite/sequence/graphics/plasmid.py +712 -0
  226. biotite/sequence/io/__init__.py +12 -0
  227. biotite/sequence/io/fasta/__init__.py +22 -0
  228. biotite/sequence/io/fasta/convert.py +283 -0
  229. biotite/sequence/io/fasta/file.py +265 -0
  230. biotite/sequence/io/fastq/__init__.py +19 -0
  231. biotite/sequence/io/fastq/convert.py +117 -0
  232. biotite/sequence/io/fastq/file.py +507 -0
  233. biotite/sequence/io/genbank/__init__.py +17 -0
  234. biotite/sequence/io/genbank/annotation.py +269 -0
  235. biotite/sequence/io/genbank/file.py +573 -0
  236. biotite/sequence/io/genbank/metadata.py +336 -0
  237. biotite/sequence/io/genbank/sequence.py +173 -0
  238. biotite/sequence/io/general.py +201 -0
  239. biotite/sequence/io/gff/__init__.py +26 -0
  240. biotite/sequence/io/gff/convert.py +128 -0
  241. biotite/sequence/io/gff/file.py +449 -0
  242. biotite/sequence/phylo/__init__.py +36 -0
  243. biotite/sequence/phylo/nj.cpython-311-x86_64-linux-gnu.so +0 -0
  244. biotite/sequence/phylo/nj.pyx +221 -0
  245. biotite/sequence/phylo/tree.cpython-311-x86_64-linux-gnu.so +0 -0
  246. biotite/sequence/phylo/tree.pyx +1169 -0
  247. biotite/sequence/phylo/upgma.cpython-311-x86_64-linux-gnu.so +0 -0
  248. biotite/sequence/phylo/upgma.pyx +164 -0
  249. biotite/sequence/profile.py +561 -0
  250. biotite/sequence/search.py +117 -0
  251. biotite/sequence/seqtypes.py +720 -0
  252. biotite/sequence/sequence.py +373 -0
  253. biotite/setup_ccd.py +197 -0
  254. biotite/structure/__init__.py +135 -0
  255. biotite/structure/alphabet/__init__.py +25 -0
  256. biotite/structure/alphabet/encoder.py +332 -0
  257. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  258. biotite/structure/alphabet/i3d.py +109 -0
  259. biotite/structure/alphabet/layers.py +86 -0
  260. biotite/structure/alphabet/pb.license +21 -0
  261. biotite/structure/alphabet/pb.py +170 -0
  262. biotite/structure/alphabet/unkerasify.py +128 -0
  263. biotite/structure/atoms.py +1562 -0
  264. biotite/structure/basepairs.py +1403 -0
  265. biotite/structure/bonds.cpython-311-x86_64-linux-gnu.so +0 -0
  266. biotite/structure/bonds.pyx +2036 -0
  267. biotite/structure/box.py +724 -0
  268. biotite/structure/celllist.cpython-311-x86_64-linux-gnu.so +0 -0
  269. biotite/structure/celllist.pyx +864 -0
  270. biotite/structure/chains.py +310 -0
  271. biotite/structure/charges.cpython-311-x86_64-linux-gnu.so +0 -0
  272. biotite/structure/charges.pyx +520 -0
  273. biotite/structure/compare.py +683 -0
  274. biotite/structure/density.py +109 -0
  275. biotite/structure/dotbracket.py +213 -0
  276. biotite/structure/error.py +39 -0
  277. biotite/structure/filter.py +591 -0
  278. biotite/structure/geometry.py +817 -0
  279. biotite/structure/graphics/__init__.py +13 -0
  280. biotite/structure/graphics/atoms.py +243 -0
  281. biotite/structure/graphics/rna.py +298 -0
  282. biotite/structure/hbond.py +425 -0
  283. biotite/structure/info/__init__.py +24 -0
  284. biotite/structure/info/atom_masses.json +121 -0
  285. biotite/structure/info/atoms.py +98 -0
  286. biotite/structure/info/bonds.py +149 -0
  287. biotite/structure/info/ccd.py +200 -0
  288. biotite/structure/info/components.bcif +0 -0
  289. biotite/structure/info/groups.py +128 -0
  290. biotite/structure/info/masses.py +121 -0
  291. biotite/structure/info/misc.py +137 -0
  292. biotite/structure/info/radii.py +267 -0
  293. biotite/structure/info/standardize.py +185 -0
  294. biotite/structure/integrity.py +213 -0
  295. biotite/structure/io/__init__.py +29 -0
  296. biotite/structure/io/dcd/__init__.py +13 -0
  297. biotite/structure/io/dcd/file.py +67 -0
  298. biotite/structure/io/general.py +243 -0
  299. biotite/structure/io/gro/__init__.py +14 -0
  300. biotite/structure/io/gro/file.py +343 -0
  301. biotite/structure/io/mol/__init__.py +20 -0
  302. biotite/structure/io/mol/convert.py +112 -0
  303. biotite/structure/io/mol/ctab.py +420 -0
  304. biotite/structure/io/mol/header.py +120 -0
  305. biotite/structure/io/mol/mol.py +149 -0
  306. biotite/structure/io/mol/sdf.py +940 -0
  307. biotite/structure/io/netcdf/__init__.py +13 -0
  308. biotite/structure/io/netcdf/file.py +64 -0
  309. biotite/structure/io/pdb/__init__.py +20 -0
  310. biotite/structure/io/pdb/convert.py +389 -0
  311. biotite/structure/io/pdb/file.py +1380 -0
  312. biotite/structure/io/pdb/hybrid36.cpython-311-x86_64-linux-gnu.so +0 -0
  313. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  314. biotite/structure/io/pdbqt/__init__.py +15 -0
  315. biotite/structure/io/pdbqt/convert.py +113 -0
  316. biotite/structure/io/pdbqt/file.py +688 -0
  317. biotite/structure/io/pdbx/__init__.py +23 -0
  318. biotite/structure/io/pdbx/bcif.py +674 -0
  319. biotite/structure/io/pdbx/cif.py +1091 -0
  320. biotite/structure/io/pdbx/component.py +251 -0
  321. biotite/structure/io/pdbx/compress.py +362 -0
  322. biotite/structure/io/pdbx/convert.py +2113 -0
  323. biotite/structure/io/pdbx/encoding.cpython-311-x86_64-linux-gnu.so +0 -0
  324. biotite/structure/io/pdbx/encoding.pyx +1078 -0
  325. biotite/structure/io/trajfile.py +696 -0
  326. biotite/structure/io/trr/__init__.py +13 -0
  327. biotite/structure/io/trr/file.py +43 -0
  328. biotite/structure/io/util.py +38 -0
  329. biotite/structure/io/xtc/__init__.py +13 -0
  330. biotite/structure/io/xtc/file.py +43 -0
  331. biotite/structure/mechanics.py +72 -0
  332. biotite/structure/molecules.py +337 -0
  333. biotite/structure/pseudoknots.py +622 -0
  334. biotite/structure/rdf.py +245 -0
  335. biotite/structure/repair.py +302 -0
  336. biotite/structure/residues.py +716 -0
  337. biotite/structure/rings.py +451 -0
  338. biotite/structure/sasa.cpython-311-x86_64-linux-gnu.so +0 -0
  339. biotite/structure/sasa.pyx +322 -0
  340. biotite/structure/segments.py +328 -0
  341. biotite/structure/sequence.py +110 -0
  342. biotite/structure/spacegroups.json +1567 -0
  343. biotite/structure/spacegroups.license +26 -0
  344. biotite/structure/sse.py +306 -0
  345. biotite/structure/superimpose.py +511 -0
  346. biotite/structure/tm.py +581 -0
  347. biotite/structure/transform.py +736 -0
  348. biotite/structure/util.py +160 -0
  349. biotite/version.py +34 -0
  350. biotite/visualize.py +375 -0
  351. biotite-1.5.0.dist-info/METADATA +162 -0
  352. biotite-1.5.0.dist-info/RECORD +354 -0
  353. biotite-1.5.0.dist-info/WHEEL +6 -0
  354. biotite-1.5.0.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,3411 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ # distutils: language = c++
6
+
7
+ __name__ = "biotite.sequence.align"
8
+ __author__ = "Patrick Kunzmann"
9
+ __all__ = ["KmerTable", "BucketKmerTable"]
10
+
11
+ cimport cython
12
+ cimport numpy as np
13
+ from cpython.mem cimport PyMem_Malloc as malloc
14
+ from cpython.mem cimport PyMem_Free as free
15
+ from libc.string cimport memcpy
16
+ from libcpp.set cimport set as cpp_set
17
+
18
+ import numpy as np
19
+ from ..alphabet import LetterAlphabet, common_alphabet, AlphabetError
20
+ from .kmeralphabet import KmerAlphabet
21
+ from .buckets import bucket_number
22
+
23
+
24
+ ctypedef np.int32_t int32
25
+ ctypedef np.int64_t int64
26
+ ctypedef np.uint8_t uint8
27
+ ctypedef np.uint32_t uint32
28
+ ctypedef np.uint64_t ptr
29
+
30
+
31
+ cdef enum EntrySize:
32
+ # The size (number of 32 bit elements) for each entry in C-arrays
33
+ # of KmerTable and BucketKmerTable, respectively
34
+ #
35
+ # Size: reference ID (int32) + sequence pos (int32)
36
+ NO_BUCKETS = 2
37
+ # Size: k-mer (int64) + reference ID (int32) + sequence pos (int32)
38
+ BUCKETS = 4
39
+
40
+
41
+ cdef class KmerTable:
42
+ """
43
+ This class represents a *k-mer* index table.
44
+ It maps *k-mers* (subsequences with length *k*) to the sequence
45
+ positions, where the *k-mer* appears.
46
+ It is primarily used to find *k-mer* matches between two sequences.
47
+ A match is defined as a *k-mer* that appears in both sequences.
48
+ Instances of this class are immutable.
49
+
50
+ There are multiple ways to create a :class:`KmerTable`:
51
+
52
+ - :meth:`from_sequences()` iterates through all overlapping
53
+ *k-mers* in a sequence and stores the sequence position of
54
+ each *k-mer* in the table.
55
+ - :meth:`from_kmers()` is similar to :meth:`from_sequences()`
56
+ but directly accepts *k-mers* as input instead of sequences.
57
+ - :meth:`from_kmer_selection()` takes a combination of *k-mers*
58
+ and their positions in a sequence, which can be used to
59
+ apply subset selectors, such as :class:`MinimizerSelector`.
60
+ - :meth:`from_tables()` merges the entries from multiple
61
+ :class:`KmerTable` objects into a new table.
62
+ - :meth:`from_positions()` lets the user provide manual
63
+ *k-mer* positions, which can be useful for loading a
64
+ :class:`KmerTable` from file.
65
+
66
+ The standard constructor merely returns an empty table and is
67
+ reserved for internal use.
68
+
69
+ Each indexed *k-mer* position is represented by a tuple of
70
+
71
+ 1. a unique reference ID that identifies to which sequence a
72
+ position refers and
73
+ 2. the zero-based sequence position of the first symbol in the
74
+ *k-mer*.
75
+
76
+ The :meth:`match()` method iterates through all overlapping *k-mers*
77
+ in another sequence and, for each *k-mer*, looks up the reference
78
+ IDs and positions of this *k-mer* in the table.
79
+ For each matching position, it adds the *k-mer* position in this
80
+ sequence, the matching reference ID and the matching sequence
81
+ position to the array of matches.
82
+ Finally these matches are returned to the user.
83
+ Optionally, a :class:`SimilarityRule` can be supplied, to find
84
+ also matches for similar *k-mers*.
85
+ This is especially useful for protein sequences to match two
86
+ *k-mers* with a high substitution probability.
87
+
88
+ The positions for a given *k-mer* code can be obtained via indexing.
89
+ Iteration over a :class:`KmerTable` yields the *k-mers* that have at
90
+ least one associated position.
91
+ The *k-mer* code for a *k-mer* can be calculated with
92
+ ``table.kmer_alphabet.encode()`` (see :class:`KmerAlphabet`).
93
+
94
+ Attributes
95
+ ----------
96
+ kmer_alphabet : KmerAlphabet
97
+ The internal :class:`KmerAlphabet`, that is used to
98
+ encode all overlapping *k-mers* of an input sequence.
99
+ alphabet : Alphabet
100
+ The base alphabet, from which this :class:`KmerTable` was
101
+ created.
102
+ k : int
103
+ The length of the *k-mers*.
104
+
105
+ See Also
106
+ --------
107
+ BucketKmerTable
108
+
109
+ Notes
110
+ -----
111
+
112
+ The design of the :class:`KmerTable` is inspired by the *MMseqs2*
113
+ software :footcite:`Steinegger2017`.
114
+
115
+ *Memory consumption*
116
+
117
+ For efficient mapping, a :class:`KmerTable` contains a pointer
118
+ array, that contains one 64-bit pointer for each possible *k-mer*.
119
+ If there is at least one position for a *k-mer*, the corresponding
120
+ pointer points to a C-array that contains
121
+
122
+ 1. The length of the C-array *(int64)*
123
+ 2. The reference ID for each position of this *k-mer* *(uint32)*
124
+ 3. The sequence position for each position of this *k-mer* *(uint32)*
125
+
126
+ Hence, the memory requirements can be quite large for long *k-mers*
127
+ or large alphabets.
128
+ The required memory space :math:`S` in bytes is within the bounds of
129
+
130
+ .. math::
131
+
132
+ 8 n^k + 8L \leq S \leq 16 n^k + 8L,
133
+
134
+ where :math:`n` is the number of symbols in the alphabet and
135
+ :math:`L` is the summed length of all sequences added to the table.
136
+
137
+ *Multiprocessing*
138
+
139
+ :class:`KmerTable` objects can be used in multi-processed setups:
140
+ Adding a large database of sequences to a table can be sped up by
141
+ splitting the database into smaller chunks and creating a separate
142
+ table for each chunk in separate processes.
143
+ Eventually, the tables can be merged to one large table using
144
+ :meth:`from_tables()`.
145
+
146
+ Since :class:`KmerTable` supports the *pickle* protocol,
147
+ the matching step can also be divided into multiple processes, if
148
+ multiple sequences need to be matched.
149
+
150
+ *Storage on hard drive*
151
+
152
+ The most time efficient way to read/write a :class:`KmerTable` is
153
+ the *pickle* format.
154
+ If a custom format is desired, the user needs to extract the
155
+ reference IDs and position for each *k-mer*.
156
+ To restrict this task to all *k-mers* that have at least one match,
157
+ :meth:`get_kmers()` can be used.
158
+ Conversely, the reference IDs and positions can be restored via
159
+ :meth:`from_positions()`.
160
+
161
+ References
162
+ ----------
163
+
164
+ .. footbibliography::
165
+
166
+ Examples
167
+ --------
168
+
169
+ Create a *2-mer* index table for some nucleotide sequences:
170
+
171
+ >>> table = KmerTable.from_sequences(
172
+ ... k = 2,
173
+ ... sequences = [NucleotideSequence("TTATA"), NucleotideSequence("CTAG")],
174
+ ... ref_ids = [0, 1]
175
+ ... )
176
+
177
+ Display the contents of the table as
178
+ (reference ID, sequence position) tuples:
179
+
180
+ >>> print(table)
181
+ AG: (1, 2)
182
+ AT: (0, 2)
183
+ CT: (1, 0)
184
+ TA: (0, 1), (0, 3), (1, 1)
185
+ TT: (0, 0)
186
+
187
+ Find matches of the table with a sequence:
188
+
189
+ >>> query = NucleotideSequence("TAG")
190
+ >>> matches = table.match(query)
191
+ >>> for query_pos, table_ref_id, table_pos in matches:
192
+ ... print("Query sequence position:", query_pos)
193
+ ... print("Table reference ID: ", table_ref_id)
194
+ ... print("Table sequence position:", table_pos)
195
+ ... print()
196
+ Query sequence position: 0
197
+ Table reference ID: 0
198
+ Table sequence position: 1
199
+ <BLANKLINE>
200
+ Query sequence position: 0
201
+ Table reference ID: 0
202
+ Table sequence position: 3
203
+ <BLANKLINE>
204
+ Query sequence position: 0
205
+ Table reference ID: 1
206
+ Table sequence position: 1
207
+ <BLANKLINE>
208
+ Query sequence position: 1
209
+ Table reference ID: 1
210
+ Table sequence position: 2
211
+ <BLANKLINE>
212
+
213
+ Get all reference IDs and positions for a given *k-mer*:
214
+
215
+ >>> kmer_code = table.kmer_alphabet.encode("TA")
216
+ >>> print(table[kmer_code])
217
+ [[0 1]
218
+ [0 3]
219
+ [1 1]]
220
+ """
221
+
222
+ cdef object _kmer_alph
223
+ cdef int _k
224
+
225
+ # The pointer array is the core of the index table:
226
+ # It maps each possible k-mer (represented by its code) to a
227
+ # C-array of indices.
228
+ # Each entry in a C-array points to a reference ID and the
229
+ # location in that sequence where the respective k-mer appears
230
+ # The memory layout of each C-array is as follows:
231
+ #
232
+ # (Array length) (RefID 0) (Position 0) (RefID 1) (Position 1) ...
233
+ # -----int64----|---uint32---|---uint32---|---uint32---|---uint32---
234
+ #
235
+ # The array length is based on 32 bit units.
236
+ # If there is no entry for a k-mer, the respective pointer is NULL.
237
+ cdef ptr[:] _ptr_array
238
+
239
+
240
+ def __cinit__(self, kmer_alphabet):
241
+ # This check is necessary for proper memory management
242
+ # of the allocated arrays
243
+ if self._is_initialized():
244
+ raise Exception("Duplicate call of constructor")
245
+
246
+ self._kmer_alph = kmer_alphabet
247
+ self._k = kmer_alphabet.k
248
+ self._ptr_array = np.zeros(len(self._kmer_alph), dtype=np.uint64)
249
+
250
+
251
+ @property
252
+ def kmer_alphabet(self):
253
+ return self._kmer_alph
254
+
255
+ @property
256
+ def alphabet(self):
257
+ return self._kmer_alph.base_alphabet
258
+
259
+ @property
260
+ def k(self):
261
+ return self._k
262
+
263
+ @staticmethod
264
+ def from_sequences(k, sequences, ref_ids=None, ignore_masks=None,
265
+ alphabet=None, spacing=None):
266
+ """
267
+ from_sequences(k, sequences, ref_ids=None, ignore_masks=None,
268
+ alphabet=None, spacing=None)
269
+
270
+ Create a :class:`KmerTable` by storing the positions of all
271
+ overlapping *k-mers* from the input `sequences`.
272
+
273
+ Parameters
274
+ ----------
275
+ k : int
276
+ The length of the *k-mers*.
277
+ sequences : sized iterable object of Sequence, length=m
278
+ The sequences to get the *k-mer* positions from.
279
+ These sequences must have equal alphabets, or one of these
280
+ sequences must have an alphabet that extends the alphabets
281
+ of all other sequences.
282
+ ref_ids : sized iterable object of int, length=m, optional
283
+ The reference IDs for the given sequences.
284
+ These are used to identify the corresponding sequence for a
285
+ *k-mer* match.
286
+ By default, the IDs are counted from *0* to *m-1*.
287
+ ignore_masks : sized iterable object of (ndarray, dtype=bool), length=m, optional
288
+ Sequence positions to ignore.
289
+ *k-mers* that involve these sequence positions are not added
290
+ to the table.
291
+ This is used e.g. to skip repeat regions.
292
+ If provided, the list must contain one boolean mask
293
+ (or ``None``) for each sequence, and each boolean mask must
294
+ have the same length as the sequence.
295
+ By default, no sequence position is ignored.
296
+ alphabet : Alphabet, optional
297
+ The alphabet to use for this table.
298
+ It must extend the alphabets of the input `sequences`.
299
+ By default, an appropriate alphabet is inferred from the
300
+ input `sequences`.
301
+ This option is usually used for compatibility with another
302
+ sequence/table in the matching step.
303
+ spacing : None or str or list or ndarray, dtype=int, shape=(k,)
304
+ If provided, spaced *k-mers* are used instead of continuous
305
+ ones.
306
+ The value contains the *informative* positions relative to
307
+ the start of the *k-mer*, also called the *model*.
308
+ The number of *informative* positions must equal *k*.
309
+ Refer to :class:`KmerAlphabet` for more details.
310
+
311
+ See Also
312
+ --------
313
+ from_kmers : The same functionality based on already created *k-mers*
314
+
315
+ Returns
316
+ -------
317
+ table : KmerTable
318
+ The newly created table.
319
+
320
+ Examples
321
+ --------
322
+
323
+ >>> sequences = [NucleotideSequence("TTATA"), NucleotideSequence("CTAG")]
324
+ >>> table = KmerTable.from_sequences(
325
+ ... 2, sequences, ref_ids=[100, 101]
326
+ ... )
327
+ >>> print(table)
328
+ AG: (101, 2)
329
+ AT: (100, 2)
330
+ CT: (101, 0)
331
+ TA: (100, 1), (100, 3), (101, 1)
332
+ TT: (100, 0)
333
+
334
+ Give an explicit compatible alphabet:
335
+
336
+ >>> table = KmerTable.from_sequences(
337
+ ... 2, sequences, ref_ids=[100, 101],
338
+ ... alphabet=NucleotideSequence.ambiguous_alphabet()
339
+ ... )
340
+
341
+ Ignore all ``N`` in a sequence:
342
+
343
+ >>> sequence = NucleotideSequence("ACCNTANNG")
344
+ >>> table = KmerTable.from_sequences(
345
+ ... 2, [sequence], ignore_masks=[sequence.symbols == "N"]
346
+ ... )
347
+ >>> print(table)
348
+ AC: (0, 0)
349
+ CC: (0, 1)
350
+ TA: (0, 4)
351
+ """
352
+ ref_ids = _compute_ref_ids(ref_ids, sequences)
353
+ ignore_masks = _compute_masks(ignore_masks, sequences)
354
+ alphabet = _compute_alphabet(
355
+ alphabet, (sequence.alphabet for sequence in sequences)
356
+ )
357
+
358
+ table = KmerTable(KmerAlphabet(alphabet, k, spacing))
359
+
360
+ # Calculate k-mers
361
+ kmers_list = [
362
+ table._kmer_alph.create_kmers(sequence.code)
363
+ for sequence in sequences
364
+ ]
365
+
366
+ masks = [
367
+ _prepare_mask(table._kmer_alph, ignore_mask, len(sequence))
368
+ for sequence, ignore_mask in zip(sequences, ignore_masks)
369
+ ]
370
+
371
+ # Count the number of appearances of each k-mer and store the
372
+ # result in the pointer array, that is now used as count array
373
+ for kmers, mask in zip(kmers_list, masks):
374
+ table._count_masked_kmers(kmers, mask)
375
+
376
+ # Transfrom count array into pointer array with C-array of
377
+ # appropriate size
378
+ _init_c_arrays(table._ptr_array, EntrySize.NO_BUCKETS)
379
+
380
+ # Fill the C-arrays with the k-mer positions
381
+ for kmers, ref_id, mask in zip(kmers_list, ref_ids, masks):
382
+ table._add_kmers(kmers, ref_id, mask)
383
+
384
+ return table
385
+
386
+
387
@staticmethod
def from_kmers(kmer_alphabet, kmers, ref_ids=None, masks=None):
    """
    from_kmers(kmer_alphabet, kmers, ref_ids=None, masks=None)

    Create a :class:`KmerTable` by storing the positions of all
    input *k-mers*.

    Parameters
    ----------
    kmer_alphabet : KmerAlphabet
        The :class:`KmerAlphabet` to use for the new table.
        Should be the same alphabet that was used to calculate the
        input *kmers*.
    kmers : sized iterable object of (ndarray, dtype=np.int64), length=m
        List where each array contains the *k-mer* codes from a
        sequence.
        For each array the index of the *k-mer* code in the array
        is stored in the table as sequence position.
    ref_ids : sized iterable object of int, length=m, optional
        The reference IDs for the sequences.
        These are used to identify the corresponding sequence for a
        *k-mer* match.
        By default the IDs are counted from *0* to *m*.
    masks : sized iterable object of (ndarray, dtype=bool), length=m, optional
        A *k-mer* code at a position, where the corresponding mask
        is false, is not added to the table.
        An element may also be ``None``, in which case all *k-mers*
        of the corresponding array are added.
        The masks do not need to be contiguous in memory.
        By default, all positions are added.

    See Also
    --------
    from_sequences : The same functionality based on undecomposed sequences

    Returns
    -------
    table : KmerTable
        The newly created table.

    Examples
    --------

    >>> sequences = [ProteinSequence("BIQTITE"), ProteinSequence("NIQBITE")]
    >>> kmer_alphabet = KmerAlphabet(ProteinSequence.alphabet, 3)
    >>> kmer_codes = [kmer_alphabet.create_kmers(s.code) for s in sequences]
    >>> for code in kmer_codes:
    ...     print(code)
    [11701  4360  7879  9400  4419]
    [ 6517  4364  7975 11704  4419]
    >>> table = KmerTable.from_kmers(
    ...     kmer_alphabet, kmer_codes
    ... )
    >>> print(table)
    IQT: (0, 1)
    IQB: (1, 1)
    ITE: (0, 4), (1, 4)
    NIQ: (1, 0)
    QTI: (0, 2)
    QBI: (1, 2)
    TIT: (0, 3)
    BIQ: (0, 0)
    BIT: (1, 3)
    """
    _check_kmer_alphabet(kmer_alphabet)
    _check_multiple_kmer_bounds(kmers, kmer_alphabet)

    ref_ids = _compute_ref_ids(ref_ids, kmers)
    masks = _compute_masks(masks, kmers)

    table = KmerTable(kmer_alphabet)

    masks = [
        np.ones(len(arr), dtype=np.uint8) if mask is None
        # Reinterpret the boolean mask as uint8 array to be able
        # to handle it as memory view.
        # 'ascontiguousarray()' only copies if required, so that
        # strided masks (e.g. 'mask[::2]') are accepted as well,
        # which 'np.frombuffer()' would reject
        else np.ascontiguousarray(mask, dtype=bool).view(np.uint8)
        for mask, arr in zip(masks, kmers)
    ]

    # Count the number of appearances of each k-mer;
    # the pointer array is temporarily used as count array
    for arr, mask in zip(kmers, masks):
        table._count_masked_kmers(arr, mask)

    # Transform the count array into the pointer array with C-arrays
    # of appropriate size
    _init_c_arrays(table._ptr_array, EntrySize.NO_BUCKETS)

    # Fill the C-arrays with the k-mer positions
    for arr, ref_id, mask in zip(kmers, ref_ids, masks):
        table._add_kmers(arr, ref_id, mask)

    return table
476
+
477
+
478
@staticmethod
def from_kmer_selection(kmer_alphabet, positions, kmers, ref_ids=None):
    """
    from_kmer_selection(kmer_alphabet, positions, kmers, ref_ids=None)

    Create a :class:`KmerTable` from a filtered subset of *k-mers*
    and their explicitly given sequence positions.

    This allows reducing the number of stored *k-mers* by means of
    a *k-mer* subset selector such as :class:`MinimizerSelector`.

    Parameters
    ----------
    kmer_alphabet : KmerAlphabet
        The :class:`KmerAlphabet` to use for the new table.
        Should be the same alphabet that was used to calculate the
        input *kmers*.
    positions : sized iterable object of (ndarray, shape=(n,), dtype=uint32), length=m
        List where each array contains the sequence positions of
        the filtered subset of *k-mers* given in `kmers`.
        The list may contain multiple elements for multiple
        sequences.
    kmers : sized iterable object of (ndarray, shape=(n,), dtype=np.int64), length=m
        List where each array contains the filtered subset of
        *k-mer* codes from a sequence.
        The sequence position stored for each *k-mer* code is the
        corresponding element from `positions`.
        The list may contain multiple elements for multiple
        sequences.
    ref_ids : sized iterable object of int, length=m, optional
        The reference IDs for the sequences.
        These are used to identify the corresponding sequence for a
        *k-mer* match.
        By default the IDs are counted from *0* to *m*.

    Returns
    -------
    table : KmerTable
        The newly created table.

    Examples
    --------

    Reduce the size of sequence data in the table using minimizers:

    >>> sequence1 = ProteinSequence("THIS*IS*A*SEQVENCE")
    >>> kmer_alph = KmerAlphabet(sequence1.alphabet, k=3)
    >>> minimizer = MinimizerSelector(kmer_alph, window=4)
    >>> minimizer_pos, minimizers = minimizer.select(sequence1)
    >>> kmer_table = KmerTable.from_kmer_selection(
    ...     kmer_alph, [minimizer_pos], [minimizers]
    ... )

    Use the same :class:`MinimizerSelector` to select the minimizers
    from the query sequence and match them against the table.
    Although the amount of *k-mers* is reduced, matching is still
    guaranteed to work, if the two sequences share identity in the
    given window:

    >>> sequence2 = ProteinSequence("ANQTHER*SEQVENCE")
    >>> minimizer_pos, minimizers = minimizer.select(sequence2)
    >>> matches = kmer_table.match_kmer_selection(minimizer_pos, minimizers)
    >>> print(matches)
    [[ 9  0 11]
     [12  0 14]]
    >>> for query_pos, _, db_pos in matches:
    ...     print(sequence1)
    ...     print(" " * (db_pos-1) + "^" * kmer_table.k)
    ...     print(sequence2)
    ...     print(" " * (query_pos-1) + "^" * kmer_table.k)
    ...     print()
    THIS*IS*A*SEQVENCE
              ^^^
    ANQTHER*SEQVENCE
            ^^^
    <BLANKLINE>
    THIS*IS*A*SEQVENCE
                 ^^^
    ANQTHER*SEQVENCE
               ^^^
    <BLANKLINE>
    """
    # Validate the input before any table memory is allocated
    _check_kmer_alphabet(kmer_alphabet)
    _check_multiple_kmer_bounds(kmers, kmer_alphabet)
    _check_position_shape(positions, kmers)

    ref_ids = _compute_ref_ids(ref_ids, kmers)
    table = KmerTable(kmer_alphabet)

    # First pass: count the k-mers of each sequence
    for kmer_codes in kmers:
        table._count_kmers(kmer_codes)

    # Set up the C-arrays based on the counts
    _init_c_arrays(table._ptr_array, EntrySize.NO_BUCKETS)

    # Second pass: store reference ID and position for each k-mer
    for seq_positions, kmer_codes, ref_id in zip(positions, kmers, ref_ids):
        seq_positions = seq_positions.astype(np.uint32, copy=False)
        table._add_kmer_selection(seq_positions, kmer_codes, ref_id)

    return table
579
+
580
+
581
@staticmethod
def from_tables(tables):
    """
    from_tables(tables)

    Create a :class:`KmerTable` by merging the *k-mer* positions
    from existing `tables`.

    Parameters
    ----------
    tables : iterable object of KmerTable
        The tables to be merged.
        All tables must have equal :class:`KmerAlphabet` objects,
        i.e. the same *k* and equal base alphabets.
        Any iterable is accepted, including one-shot iterables such
        as generators.

    Returns
    -------
    table : KmerTable
        The newly created table.

    Examples
    --------

    >>> table1 = KmerTable.from_sequences(
    ...     2, [NucleotideSequence("TTATA")], ref_ids=[100]
    ... )
    >>> table2 = KmerTable.from_sequences(
    ...     2, [NucleotideSequence("CTAG")], ref_ids=[101]
    ... )
    >>> merged_table = KmerTable.from_tables([table1, table2])
    >>> print(merged_table)
    AG: (101, 2)
    AT: (100, 2)
    CT: (101, 0)
    TA: (100, 1), (100, 3), (101, 1)
    TT: (100, 0)
    """
    cdef KmerTable table

    # The tables are indexed and traversed multiple times below,
    # hence a one-shot iterable must be materialized first
    tables = list(tables)

    _check_same_kmer_alphabet(tables)

    merged_table = KmerTable(tables[0].kmer_alphabet)

    # Sum the number of appearances of each k-mer from the tables
    for table in tables:
        # 'merged_table._ptr_array' is repurposed as count array,
        # This can be safely done, because in this step the pointers
        # are not initialized yet.
        # This may save a lot of memory because no extra array is
        # required to count the number of positions for each *k-mer*
        _count_table_entries(
            merged_table._ptr_array, table._ptr_array,
            EntrySize.NO_BUCKETS
        )

    # Transform the counts into C-arrays of appropriate size
    _init_c_arrays(merged_table._ptr_array, EntrySize.NO_BUCKETS)

    # Copy the entries of each input table into the merged table
    for table in tables:
        _append_entries(merged_table._ptr_array, table._ptr_array)

    return merged_table
642
+
643
+
644
@cython.boundscheck(False)
@cython.wraparound(False)
@staticmethod
def from_positions(kmer_alphabet, dict kmer_positions):
    """
    from_positions(kmer_alphabet, kmer_positions)

    Build a :class:`KmerTable` directly from *k-mer* reference IDs
    and positions.
    This constructor is especially useful for restoring a table
    from previously serialized data.

    Parameters
    ----------
    kmer_alphabet : KmerAlphabet
        The :class:`KmerAlphabet` to use for the new table
    kmer_positions : dict of (int -> ndarray, shape=(n,2), dtype=int)
        A dictionary representing the *k-mer* reference IDs and
        positions to be stored in the newly created table.
        It maps a *k-mer* code to a :class:`ndarray`.
        To achieve a high performance the data type ``uint32``
        is preferred for the arrays.

    Returns
    -------
    table : KmerTable
        The newly created table.

    Examples
    --------

    >>> sequence = ProteinSequence("BIQTITE")
    >>> table = KmerTable.from_sequences(3, [sequence], ref_ids=[100])
    >>> print(table)
    IQT: (100, 1)
    ITE: (100, 4)
    QTI: (100, 2)
    TIT: (100, 3)
    BIQ: (100, 0)
    >>> data = {kmer: table[kmer] for kmer in table}
    >>> print(data)
    {4360: array([[100,   1]], dtype=uint32), 4419: array([[100,   4]], dtype=uint32), 7879: array([[100,   2]], dtype=uint32), 9400: array([[100,   3]], dtype=uint32), 11701: array([[100,   0]], dtype=uint32)}
    >>> restored_table = KmerTable.from_positions(table.kmer_alphabet, data)
    >>> print(restored_table)
    IQT: (100, 1)
    ITE: (100, 4)
    QTI: (100, 2)
    TIT: (100, 3)
    BIQ: (100, 0)
    """
    cdef int64 n_values
    cdef uint32* entries
    cdef int64 row
    cdef int64 kmer
    cdef uint32[:,:] positions

    table = KmerTable(kmer_alphabet)

    cdef ptr[:] ptr_array = table._ptr_array
    cdef int64 n_kmers = len(kmer_alphabet)

    for kmer, position_array in kmer_positions.items():
        if not (kmer >= 0 and kmer < n_kmers):
            raise AlphabetError(
                f"k-mer code {kmer} does not represent a valid k-mer"
            )
        positions = position_array.astype(np.uint32, copy=False)
        if positions.shape[0] == 0:
            # Nothing to store for this k-mer:
            # its pointer is left untouched
            continue
        if positions.shape[1] != 2:
            raise IndexError(
                f"Each entry in position array has {positions.shape[1]} "
                f"values, but 2 were expected"
            )

        # Two uint32 values per entry (reference ID and position)
        # plus two uint32 slots holding the array length as int64
        n_values = 2 * positions.shape[0] + 2
        entries = <uint32*>malloc(n_values * sizeof(uint32))
        if entries == NULL:
            raise MemoryError()
        ptr_array[kmer] = <ptr>entries
        (<int64*> entries)[0] = n_values

        # The entries start behind the length value
        for row in range(positions.shape[0]):
            entries[2*row + 2] = positions[row, 0]
            entries[2*row + 3] = positions[row, 1]

    return table
738
+
739
+
740
@cython.boundscheck(False)
@cython.wraparound(False)
def match_table(self, KmerTable table, similarity_rule=None):
    """
    match_table(table, similarity_rule=None)

    Find matches between the *k-mers* in this table with the
    *k-mers* in another `table`.

    This means that for each *k-mer* the cartesian product between
    the positions in both tables is added to the matches.

    Parameters
    ----------
    table : KmerTable
        The table to be matched.
        Both tables must have equal :class:`KmerAlphabet` objects,
        i.e. the same *k* and equal base alphabets.
    similarity_rule : SimilarityRule, optional
        If this parameter is given, not only exact *k-mer* matches
        are considered, but also similar ones according to the given
        :class:`SimilarityRule`.

    Returns
    -------
    matches : ndarray, shape=(n,4), dtype=np.int64
        The *k-mer* matches.
        Each row contains one match. Each match has the following
        columns:

            0. The reference ID of the matched sequence in the other
               table
            1. The sequence position of the matched sequence in the
               other table
            2. The reference ID of the matched sequence in this
               table
            3. The sequence position of the matched sequence in this
               table

    Notes
    -----

    There is no guaranteed order of the reference IDs or
    sequence positions in the returned matches.

    Examples
    --------

    >>> sequence1 = ProteinSequence("BIQTITE")
    >>> table1 = KmerTable.from_sequences(3, [sequence1], ref_ids=[100])
    >>> print(table1)
    IQT: (100, 1)
    ITE: (100, 4)
    QTI: (100, 2)
    TIT: (100, 3)
    BIQ: (100, 0)
    >>> sequence2 = ProteinSequence("TITANITE")
    >>> table2 = KmerTable.from_sequences(3, [sequence2], ref_ids=[101])
    >>> print(table2)
    ANI: (101, 3)
    ITA: (101, 1)
    ITE: (101, 5)
    NIT: (101, 4)
    TAN: (101, 2)
    TIT: (101, 0)
    >>> print(table1.match_table(table2))
    [[101   5 100   4]
     [101   0 100   3]]
    """
    # Initial number of rows of the growing 'matches' array
    cdef int INIT_SIZE = 1

    cdef int64 kmer, sim_kmer
    cdef int64 match_i
    cdef int64 i, j, l
    cdef int64 self_length, other_length
    cdef uint32* self_kmer_ptr
    cdef uint32* other_kmer_ptr

    # This variable will only be used if a similarity rule exists
    cdef int64[:] similar_kmers

    # Store in new variables
    # to disable repetitive initialization checks
    cdef ptr[:] self_ptr_array = self._ptr_array
    cdef ptr[:] other_ptr_array = table._ptr_array

    _check_same_kmer_alphabet((self, table))

    # This array will store the match positions
    # As the final number of matches is unknown, a list-like
    # approach is used:
    # The array is initialized with a small initial size
    # and every time the limit would be exceeded its size is doubled
    cdef int64[:,:] matches = np.empty((INIT_SIZE, 4), dtype=np.int64)
    # Index of the next free row in 'matches'
    match_i = 0
    if similarity_rule is None:
        for kmer in range(self_ptr_array.shape[0]):
            self_kmer_ptr = <uint32*>self_ptr_array[kmer]
            other_kmer_ptr = <uint32*>other_ptr_array[kmer]
            # For each k-mer create the cartesian product
            if self_kmer_ptr != NULL and other_kmer_ptr != NULL:
                # This kmer exists for both tables
                # The first two uint32 slots of each C-array hold
                # its total length as int64
                other_length = (<int64*>other_kmer_ptr)[0]
                self_length  = (<int64*>self_kmer_ptr )[0]
                # Entries are (reference ID, position) pairs,
                # starting behind the length value
                for i in range(2, other_length, 2):
                    for j in range(2, self_length, 2):
                        if match_i >= matches.shape[0]:
                            # The 'matches' array is full
                            # -> double its size
                            matches = expand(np.asarray(matches))
                        matches[match_i, 0] = other_kmer_ptr[i]
                        matches[match_i, 1] = other_kmer_ptr[i+1]
                        matches[match_i, 2] = self_kmer_ptr[j]
                        matches[match_i, 3] = self_kmer_ptr[j+1]
                        match_i += 1

    else:
        for kmer in range(self_ptr_array.shape[0]):
            other_kmer_ptr = <uint32*>other_ptr_array[kmer]
            if other_kmer_ptr != NULL:
                # If a similarity rule exists, iterate not only over
                # the exact k-mer, but over all k-mers similar to
                # the current k-mer
                similar_kmers = similarity_rule.similar_kmers(
                    self._kmer_alph, kmer
                )
                for l in range(similar_kmers.shape[0]):
                    sim_kmer = similar_kmers[l]
                    # Actual copy of the code from the other
                    # if-branch:
                    # It cannot be put properly in a cdef-function,
                    # as every function call would perform reference
                    # count changes and would decrease performance
                    self_kmer_ptr = <uint32*>self_ptr_array[sim_kmer]
                    if self_kmer_ptr != NULL:
                        other_length = (<int64*>other_kmer_ptr)[0]
                        self_length  = (<int64*>self_kmer_ptr )[0]
                        for i in range(2, other_length, 2):
                            for j in range(2, self_length, 2):
                                if match_i >= matches.shape[0]:
                                    # The 'matches' array is full
                                    # -> double its size
                                    matches = expand(np.asarray(matches))
                                matches[match_i, 0] = other_kmer_ptr[i]
                                matches[match_i, 1] = other_kmer_ptr[i+1]
                                matches[match_i, 2] = self_kmer_ptr[j]
                                matches[match_i, 3] = self_kmer_ptr[j+1]
                                match_i += 1

    # Trim to correct size and return
    return np.asarray(matches[:match_i])
889
+
890
+
891
@cython.boundscheck(False)
@cython.wraparound(False)
def match(self, sequence, similarity_rule=None, ignore_mask=None):
    """
    match(sequence, similarity_rule=None, ignore_mask=None)

    Find matches between the *k-mers* in this table with all
    overlapping *k-mers* in the given `sequence`.
    *k* is determined by the table.

    Parameters
    ----------
    sequence : Sequence
        The sequence to be matched.
        The table's base alphabet must extend the alphabet of the
        sequence.
    similarity_rule : SimilarityRule, optional
        If this parameter is given, not only exact *k-mer* matches
        are considered, but also similar ones according to the given
        :class:`SimilarityRule`.
    ignore_mask : ndarray, dtype=bool, optional
        Boolean mask of sequence positions to ignore.
        *k-mers* that involve these sequence positions are not
        considered for matching.
        This is used e.g. to skip repeat regions.
        By default, no sequence position is ignored.

    Returns
    -------
    matches : ndarray, shape=(n,3), dtype=np.int64
        The *k-mer* matches.
        Each row contains one match. Each match has the following
        columns:

            0. The sequence position in the input sequence
            1. The reference ID of the matched sequence in the table
            2. The sequence position of the matched sequence in the
               table

    Raises
    ------
    ValueError
        If the sequence code is shorter than *k* or if the table's
        base alphabet does not extend the alphabet of the sequence.

    Notes
    -----

    The matches are ordered by the first column.

    Examples
    --------

    >>> sequence1 = ProteinSequence("BIQTITE")
    >>> table = KmerTable.from_sequences(3, [sequence1], ref_ids=[100])
    >>> print(table)
    IQT: (100, 1)
    ITE: (100, 4)
    QTI: (100, 2)
    TIT: (100, 3)
    BIQ: (100, 0)
    >>> sequence2 = ProteinSequence("TITANITE")
    >>> print(table.match(sequence2))
    [[  0 100   3]
     [  5 100   4]]
    """
    # Initial number of rows of the growing 'matches' array
    cdef int INIT_SIZE = 1

    cdef int64 kmer, sim_kmer
    cdef int64 match_i
    cdef int64 i, j, l
    cdef int64 length
    cdef uint32* kmer_ptr

    # This variable will only be used if a similarity rule exists
    cdef int64[:] similar_kmers

    # Store in new variable
    # to disable repetitive initialization checks
    cdef ptr[:] ptr_array = self._ptr_array

    if len(sequence.code) < self._k:
        raise ValueError("Sequence code is shorter than k")
    # NOTE(review): the check requires the table's base alphabet to
    # *extend* the sequence alphabet, while the message says
    # 'not equal' -- consider rewording the message
    if not self._kmer_alph.base_alphabet.extends(sequence.alphabet):
        raise ValueError(
            "The alphabet used for the k-mer index table is not equal to "
            "the alphabet of the sequence"
        )

    cdef int64[:] kmers = self._kmer_alph.create_kmers(sequence.code)
    # A k-mer is only matched, if its mask element is nonzero
    cdef uint8[:] kmer_mask = _prepare_mask(
        self._kmer_alph, ignore_mask, len(sequence.code)
    )

    # This array will store the match positions
    # As the final number of matches is unknown, a list-like
    # approach is used:
    # The array is initialized with a small initial size
    # and every time the limit would be exceeded its size is doubled
    cdef int64[:,:] matches = np.empty((INIT_SIZE, 3), dtype=np.int64)
    # Index of the next free row in 'matches'
    match_i = 0
    if similarity_rule is None:
        for i in range(kmers.shape[0]):
            if kmer_mask[i]:
                kmer = kmers[i]
                kmer_ptr = <uint32*>ptr_array[kmer]
                if kmer_ptr != NULL:
                    # There is at least one entry for the k-mer
                    # The first two uint32 slots of the C-array
                    # hold its total length as int64
                    length = (<int64*>kmer_ptr)[0]
                    # Entries are (reference ID, position) pairs,
                    # starting behind the length value
                    for j in range(2, length, 2):
                        if match_i >= matches.shape[0]:
                            # The 'matches' array is full
                            # -> double its size
                            matches = expand(np.asarray(matches))
                        matches[match_i, 0] = i
                        matches[match_i, 1] = kmer_ptr[j]
                        matches[match_i, 2] = kmer_ptr[j+1]
                        match_i += 1

    else:
        for i in range(kmers.shape[0]):
            if kmer_mask[i]:
                kmer = kmers[i]
                # If a similarity rule exists, iterate not only over
                # the exact k-mer, but over all k-mers similar to
                # the current k-mer
                similar_kmers = similarity_rule.similar_kmers(
                    self._kmer_alph, kmer
                )
                for l in range(similar_kmers.shape[0]):
                    sim_kmer = similar_kmers[l]
                    # Actual copy of the code from the other
                    # if-branch:
                    # It cannot be put properly in a cdef-function,
                    # as every function call would perform reference
                    # count changes and would decrease performance
                    kmer_ptr = <uint32*>ptr_array[sim_kmer]
                    if kmer_ptr != NULL:
                        # There is at least one entry for the k-mer
                        length = (<int64*>kmer_ptr)[0]
                        for j in range(2, length, 2):
                            if match_i >= matches.shape[0]:
                                # The 'matches' array is full
                                # -> double its size
                                matches = expand(np.asarray(matches))
                            matches[match_i, 0] = i
                            matches[match_i, 1] = kmer_ptr[j]
                            matches[match_i, 2] = kmer_ptr[j+1]
                            match_i += 1

    # Trim to correct size and return
    return np.asarray(matches[:match_i])
1037
+
1038
+
1039
@cython.boundscheck(False)
@cython.wraparound(False)
def match_kmer_selection(self, positions, kmers):
    """
    match_kmer_selection(positions, kmers)

    Find matches between the *k-mers* in this table with the given
    *k-mer* selection.

    It is intended to use this method to find matches in a table
    that was created using :meth:`from_kmer_selection()`.

    Parameters
    ----------
    positions : ndarray, shape=(n,), dtype=uint32
        Sequence positions of the filtered subset of *k-mers* given
        in `kmers`.
    kmers : ndarray, shape=(n,), dtype=np.int64
        Filtered subset of *k-mer* codes to match against.

    Returns
    -------
    matches : ndarray, shape=(n,3), dtype=np.int64
        The *k-mer* matches.
        Each row contains one *k-mer* match.
        Each match has the following columns:

            0. The sequence position of the input *k-mer*, taken
               from `positions`
            1. The reference ID of the matched sequence in the table
            2. The sequence position of the matched *k-mer* in the
               table

    Raises
    ------
    IndexError
        If the number of `positions` differs from the number of
        `kmers`.

    Examples
    --------

    Reduce the size of sequence data in the table using minimizers:

    >>> sequence1 = ProteinSequence("THIS*IS*A*SEQVENCE")
    >>> kmer_alph = KmerAlphabet(sequence1.alphabet, k=3)
    >>> minimizer = MinimizerSelector(kmer_alph, window=4)
    >>> minimizer_pos, minimizers = minimizer.select(sequence1)
    >>> kmer_table = KmerTable.from_kmer_selection(
    ...     kmer_alph, [minimizer_pos], [minimizers]
    ... )

    Use the same :class:`MinimizerSelector` to select the minimizers
    from the query sequence and match them against the table.
    Although the amount of *k-mers* is reduced, matching is still
    guaranteed to work, if the two sequences share identity in the
    given window:

    >>> sequence2 = ProteinSequence("ANQTHER*SEQVENCE")
    >>> minimizer_pos, minimizers = minimizer.select(sequence2)
    >>> matches = kmer_table.match_kmer_selection(minimizer_pos, minimizers)
    >>> print(matches)
    [[ 9  0 11]
     [12  0 14]]
    >>> for query_pos, _, db_pos in matches:
    ...     print(sequence1)
    ...     print(" " * (db_pos-1) + "^" * kmer_table.k)
    ...     print(sequence2)
    ...     print(" " * (query_pos-1) + "^" * kmer_table.k)
    ...     print()
    THIS*IS*A*SEQVENCE
              ^^^
    ANQTHER*SEQVENCE
            ^^^
    <BLANKLINE>
    THIS*IS*A*SEQVENCE
                 ^^^
    ANQTHER*SEQVENCE
               ^^^
    <BLANKLINE>
    """
    # Initial number of rows of the growing 'matches' array
    cdef int INIT_SIZE = 1

    cdef int64 i, j

    cdef int64 kmer
    cdef int64 match_i
    cdef int64 seq_pos
    cdef int64 length
    cdef uint32* kmer_ptr

    # Store in new variable
    # to disable repetitive initialization checks
    cdef ptr[:] ptr_array = self._ptr_array

    # The k-mer codes are used as unchecked indices below,
    # hence they are validated beforehand
    _check_kmer_bounds(kmers, self._kmer_alph)
    if positions.shape[0] != kmers.shape[0]:
        raise IndexError(
            f"{positions.shape[0]} positions were given "
            f"for {kmers.shape[0]} k-mers"
        )

    cdef uint32[:] pos_array = positions.astype(np.uint32, copy=False)
    cdef int64[:] kmer_array = kmers.astype(np.int64, copy=False)

    # This array will store the match positions
    # As the final number of matches is unknown, a list-like
    # approach is used:
    # The array is initialized with a small initial size
    # and every time the limit would be exceeded its size is doubled
    cdef int64[:,:] matches = np.empty((INIT_SIZE, 3), dtype=np.int64)
    # Index of the next free row in 'matches'
    match_i = 0
    for i in range(kmer_array.shape[0]):
        kmer = kmer_array[i]
        seq_pos = pos_array[i]
        kmer_ptr = <uint32*>ptr_array[kmer]
        if kmer_ptr != NULL:
            # There is at least one entry for the k-mer
            # The first two uint32 slots of the C-array hold its
            # total length as int64
            length = (<int64*>kmer_ptr)[0]
            # Entries are (reference ID, position) pairs,
            # starting behind the length value
            for j in range(2, length, 2):
                if match_i >= matches.shape[0]:
                    # The 'matches' array is full
                    # -> double its size
                    matches = expand(np.asarray(matches))
                matches[match_i, 0] = seq_pos
                matches[match_i, 1] = kmer_ptr[j]
                matches[match_i, 2] = kmer_ptr[j+1]
                match_i += 1

    # Trim to correct size and return
    return np.asarray(matches[:match_i])
1164
+
1165
+
1166
@cython.boundscheck(False)
@cython.wraparound(False)
def count(self, kmers=None):
    """
    count(kmers=None)

    Count the number of occurrences for each *k-mer* in the table.

    Parameters
    ----------
    kmers : ndarray, dtype=np.int64, optional
        The count is returned for these *k-mer* codes.
        By default all *k-mers* are counted in ascending order, i.e.
        ``count_for_kmer = counts[kmer]``.

    Returns
    -------
    counts : ndarray, dtype=np.int64
        The counts for each given *k-mer*.

    Examples
    --------
    >>> table = KmerTable.from_sequences(
    ...     k = 2,
    ...     sequences = [NucleotideSequence("TTATA"), NucleotideSequence("CTAG")],
    ...     ref_ids = [0, 1]
    ... )
    >>> print(table)
    AG: (1, 2)
    AT: (0, 2)
    CT: (1, 0)
    TA: (0, 1), (0, 3), (1, 1)
    TT: (0, 0)

    Count two selected *k-mers*:

    >>> print(table.count(table.kmer_alphabet.encode_multiple(["TA", "AG"])))
    [3 1]

    Count all *k-mers*:

    >>> counts = table.count()
    >>> print(counts)
    [0 0 1 1 0 0 0 1 0 0 0 0 3 0 0 1]
    >>> for kmer, count in zip(table.kmer_alphabet.get_symbols(), counts):
    ...     print(kmer, count)
    AA 0
    AC 0
    AG 1
    AT 1
    CA 0
    CC 0
    CG 0
    CT 1
    GA 0
    GC 0
    GG 0
    GT 0
    TA 3
    TC 0
    TG 0
    TT 1
    """
    cdef int64 idx
    cdef int64 kmer
    cdef int64* entry_ptr
    cdef ptr[:] ptr_array = self._ptr_array
    cdef int64[:] kmer_array
    cdef int64[:] counts

    if kmers is not None:
        # Only the requested k-mers are counted
        _check_kmer_bounds(kmers, self._kmer_alph)

        kmer_array = kmers.astype(np.int64, copy=False)
        counts = np.zeros(kmer_array.shape[0], dtype=np.int64)
        for idx in range(kmer_array.shape[0]):
            entry_ptr = <int64*> (ptr_array[kmer_array[idx]])
            if entry_ptr != NULL:
                # See below for the conversion of length to count
                counts[idx] = (entry_ptr[0] - 2) // 2
        return np.asarray(counts)

    # No selection given -> count every possible k-mer
    counts = np.zeros(ptr_array.shape[0], dtype=np.int64)
    for kmer in range(ptr_array.shape[0]):
        entry_ptr = <int64*> (ptr_array[kmer])
        if entry_ptr != NULL:
            # The first 64 bits are the length of the C-array,
            # which is measured in uint32 values:
            # length = 2 * count + 2 -> rearrange formula
            counts[kmer] = (entry_ptr[0] - 2) // 2
    return np.asarray(counts)
1262
+
1263
+
1264
@cython.boundscheck(False)
@cython.wraparound(False)
def get_kmers(self):
    """
    Get the codes of all *k-mers*, for which the table stores at
    least one position.

    Returns
    -------
    kmers : ndarray, shape=(n,), dtype=np.int64
        The *k-mer* codes in ascending order.

    Examples
    --------

    >>> sequence = ProteinSequence("BIQTITE")
    >>> table = KmerTable.from_sequences(3, [sequence], ref_ids=[100])
    >>> print(table)
    IQT: (100, 1)
    ITE: (100, 4)
    QTI: (100, 2)
    TIT: (100, 3)
    BIQ: (100, 0)
    >>> kmer_codes = table.get_kmers()
    >>> print(kmer_codes)
    [ 4360  4419  7879  9400 11701]
    >>> for code in kmer_codes:
    ...     print(table[code])
    [[100   1]]
    [[100   4]]
    [[100   2]]
    [[100   3]]
    [[100   0]]
    """
    cdef int64 code
    cdef int64 n_found = 0
    cdef ptr[:] ptr_array = self._ptr_array

    # The number of occupied k-mers is not known in advance:
    # Reserve space for the worst case, i.e. every possible k-mer
    cdef int64[:] found = np.zeros(ptr_array.shape[0], dtype=np.int64)

    for code in range(ptr_array.shape[0]):
        if <uint32*> (ptr_array[code]) == NULL:
            # No entry for this k-mer
            continue
        found[n_found] = code
        n_found += 1

    # Drop the unused tail of the array
    return np.asarray(found)[:n_found]
1313
+
1314
+
1315
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def __getitem__(self, int64 kmer):
    # Return the entries stored for the given k-mer code as
    # uint32 array with shape (n, 2):
    # the first column is the reference ID, the second one the
    # sequence position
    # Raises an AlphabetError, if the code is not part of the
    # k-mer alphabet
    cdef int64 i, j
    cdef int64 length
    cdef uint32* kmer_ptr
    cdef uint32[:,:] positions

    # Bounds checks and wraparound are disabled for this method:
    # A negative code must be rejected explicitly, as it would
    # otherwise access memory before the start of the pointer array
    if kmer < 0 or kmer >= len(self):
        raise AlphabetError(
            f"k-mer code {kmer} is out of bounds "
            f"for the given KmerAlphabet"
        )

    kmer_ptr = <uint32*>self._ptr_array[kmer]
    if kmer_ptr == NULL:
        # No entry for this k-mer -> empty result
        return np.zeros((0, 2), dtype=np.uint32)
    else:
        # The first 64 bits of the C-array contain its length,
        # which is measured in 32 bit units and includes the
        # length field itself -> (length - 2) // 2 entries
        length = (<int64*>kmer_ptr)[0]
        positions = np.empty(((length - 2) // 2, 2), dtype=np.uint32)
        i = 0
        # Skip the length field and read the entries pairwise
        for j in range(2, length, 2):
            positions[i,0] = kmer_ptr[j]
            positions[i,1] = kmer_ptr[j+1]
            i += 1
        return np.asarray(positions)
1342
+
1343
+
1344
def __len__(self):
    # The length of the table is defined as the length of its
    # k-mer alphabet
    return len(self._kmer_alph)
1346
+
1347
+
1348
def __contains__(self, int64 kmer):
    # A code outside of the k-mer alphabet can never be part of the
    # table: Report this as 'not contained' instead of indexing the
    # pointer array with it, as a negative code may wrap around to
    # an unrelated k-mer and a too large code is out of bounds
    if kmer < 0 or kmer >= len(self):
        return False
    # If there is at least one entry for a k-mer,
    # the pointer is not NULL
    return self._ptr_array[kmer] != 0
1352
+
1353
+
1354
def __iter__(self):
    # Iterate over the codes returned by get_kmers(), i.e. the
    # k-mers with at least one position in the table
    # item() converts each NumPy scalar into a Python scalar
    for kmer in self.get_kmers():
        yield kmer.item()
1357
+
1358
+
1359
def __reversed__(self):
    # Same as __iter__(), but in reversed order:
    # Like __iter__(), yield Python scalars instead of NumPy scalars,
    # so both iteration directions give elements of the same type
    for kmer in reversed(self.get_kmers()):
        yield kmer.item()
1361
+
1362
+
1363
def __eq__(self, item):
    # Statically typed handle to the other table,
    # required to access its statically typed fields
    cdef KmerTable other

    # Trivial cases: identical object or unrelated type
    if item is self:
        return True
    if type(item) != KmerTable:
        return False
    other = item

    # Tables built on a different base alphabet or a different k
    # are never considered equal
    if (
        self._kmer_alph.base_alphabet != other._kmer_alph.base_alphabet
        or self._k != other._k
    ):
        return False
    # The final decision is delegated to the comparison
    # of both pointer arrays
    return _equal_c_arrays(self._ptr_array, other._ptr_array)
1376
+
1377
+
1378
def __str__(self):
    # The string representation is created by the _to_string() helper
    return _to_string(self)
1380
+
1381
+
1382
def __getnewargs_ex__(self):
    # Pickle protocol: positional and keyword arguments that are
    # passed to the constructor when the object is re-created
    # Only the k-mer alphabet is given as positional argument
    return (self._kmer_alph,), {}
1384
+
1385
+
1386
def __getstate__(self):
    # Pickle protocol: the state of the table is whatever
    # _pickle_c_arrays() creates from the pointer array
    return _pickle_c_arrays(self._ptr_array)
1388
+
1389
+
1390
def __setstate__(self, state):
    # Pickle protocol: counterpart of __getstate__()
    # The state is handed to _unpickle_c_arrays() together with the
    # pointer array of this object
    _unpickle_c_arrays(self._ptr_array, state)
1392
+
1393
+
1394
def __dealloc__(self):
    # The pointer array is only passed to _deallocate_ptrs()
    # (presumably releasing the C-arrays), if the memoryview was
    # actually initialized
    if self._is_initialized():
        _deallocate_ptrs(self._ptr_array)
1397
+
1398
+
1399
@cython.boundscheck(False)
@cython.wraparound(False)
def _count_kmers(self, int64[:] kmers):
    """
    Use the pointer array as count array:
    For each *k-mer* code in `kmers` the value at the corresponding
    index of the pointer array is incremented by one.

    This is only valid, as long as the pointer array does not
    contain actual pointers yet.
    The benefit is that no additional array is required for
    counting the positions for each *k-mer*, which may save a lot of
    memory.
    """
    cdef uint32 i
    cdef ptr[:] counts = self._ptr_array

    for i in range(kmers.shape[0]):
        counts[kmers[i]] += 1
1420
+
1421
@cython.boundscheck(False)
@cython.wraparound(False)
def _count_masked_kmers(self, int64[:] kmers, uint8[:] mask):
    """
    Repurpose the pointer array as count array and increment the
    count for each *k-mer* in `kmers`, whose corresponding value in
    `mask` is true.

    Like in :meth:`_count_kmers()`, the pointers must not be
    initialized yet in this step.

    Raises
    ------
    IndexError
        If `mask` and `kmers` have different lengths.
    """
    cdef uint32 seq_pos
    cdef int64 kmer

    cdef ptr[:] count_array = self._ptr_array

    # Bounds checks are disabled for this method:
    # Without this check a mask, that is shorter than the k-mer
    # array, would be read beyond its end
    if mask.shape[0] != kmers.shape[0]:
        raise IndexError(
            f"Mask has length {mask.shape[0]}, "
            f"but there are {kmers.shape[0]} k-mers"
        )

    for seq_pos in range(kmers.shape[0]):
        if mask[seq_pos]:
            kmer = kmers[seq_pos]
            count_array[kmer] += 1
1436
+
1437
+
1438
@cython.boundscheck(False)
@cython.wraparound(False)
def _add_kmers(self, int64[:] kmers, uint32 ref_id, uint8[:] mask):
    """
    Append an entry to the C-array of each *k-mer* in `kmers`, whose
    corresponding value in `mask` is true.
    An entry consists of the reference ID and the index of the
    *k-mer* in `kmers`.
    The length stored at the start of the C-array is updated
    accordingly.
    """
    cdef uint32 pos
    cdef int64 offset
    cdef uint32* entries

    # The local memoryview disables repetitive initialization checks
    cdef ptr[:] ptr_array = self._ptr_array

    if mask.shape[0] != kmers.shape[0]:
        raise IndexError(
            f"Mask has length {mask.shape[0]}, "
            f"but there are {kmers.shape[0]} k-mers"
        )

    for pos in range(kmers.shape[0]):
        if not mask[pos]:
            # Masked out -> nothing to add for this position
            continue
        entries = <uint32*> ptr_array[kmers[pos]]

        # The current length is at the same time
        # the offset of the next free slot
        offset = (<int64*> entries)[0]
        entries[offset    ] = ref_id
        entries[offset + 1] = pos
        (<int64*> entries)[0] = offset + EntrySize.NO_BUCKETS
1471
+
1472
@cython.boundscheck(False)
@cython.wraparound(False)
def _add_kmer_selection(self, uint32[:] positions, int64[:] kmers,
                        uint32 ref_id):
    """
    Append an entry to the C-array of each *k-mer* in `kmers`.
    An entry consists of the reference ID and the corresponding
    element of `positions`.
    The length stored at the start of the C-array is updated
    accordingly.
    """
    cdef uint32 idx
    cdef int64 offset
    cdef uint32* entries
    cdef ptr[:] ptr_array

    if positions.shape[0] != kmers.shape[0]:
        raise IndexError(
            f"{positions.shape[0]} positions were given "
            f"for {kmers.shape[0]} k-mers"
        )

    # The local memoryview disables repetitive initialization checks
    ptr_array = self._ptr_array

    for idx in range(positions.shape[0]):
        entries = <uint32*> ptr_array[kmers[idx]]

        # The current length is at the same time
        # the offset of the next free slot
        offset = (<int64*> entries)[0]
        entries[offset    ] = ref_id
        entries[offset + 1] = positions[idx]
        (<int64*> entries)[0] = offset + EntrySize.NO_BUCKETS
1507
+
1508
+
1509
cdef inline bint _is_initialized(self):
    # Report whether the '_ptr_array' memoryview is usable:
    # Memoryviews are not initialized on class creation, hence
    # accessing the attribute may fail or give None
    try:
        return self._ptr_array is not None
    except AttributeError:
        return False
1520
+
1521
+
1522
+
1523
+
1524
+ cdef class BucketKmerTable:
1525
+ """
1526
+ This class represents a *k-mer* index table.
1527
+ In contrast to :class:`KmerTable` it does not store each unique *k-mer*
1528
+ in a separate C-array, but limits the number of C-arrays instead
1529
+ to a number of buckets.
1530
+ Hence, different *k-mer* may be stored in the same bucket, like in a
1531
+ hash table.
1532
+ This approach makes *k-mer* indices with large *k-mer* alphabets
1533
+ fit into memory.
1534
+
1535
+ Otherwise, the API for creating a :class:`BucketKmerTable` and
1536
+ matching to it is analogous to :class:`KmerTable`.
1537
+
1538
+ Attributes
1539
+ ----------
1540
+ kmer_alphabet : KmerAlphabet
1541
+ The internal :class:`KmerAlphabet`, that is used to
1542
+ encode all overlapping *k-mers* of an input sequence.
1543
+ alphabet : Alphabet
1544
+ The base alphabet, from which this :class:`BucketKmerTable` was
1545
+ created.
1546
+ k : int
1547
+ The length of the *k-mers*.
1548
+ n_buckets : int
1549
+ The number of buckets, the *k-mers* are divided into.
1550
+
1551
+ See Also
1552
+ --------
1553
+ KmerTable
1554
+
1555
+ Notes
1556
+ -----
1557
+
1558
+ *Memory consumption*
1559
+
1560
+ For efficient mapping, a :class:`BucketKmerTable` contains a pointer
1561
+ array, that contains one 64-bit pointer for each bucket.
1562
+ If there is at least one position for a bucket, the corresponding
1563
+ pointer points to a C-array that contains
1564
+
1565
+ 1. The length of the C-array *(int64)*
1566
+ 2. The *k-mers* *(int64)*
1567
+ 3. The reference ID for each *k-mer* *(uint32)*
1568
+ 4. The sequence position for each *k-mer* *(uint32)*
1569
+
1570
+ As buckets are used, the memory requirements are limited to the number
1571
+ of buckets instead of scaling with the :class:`KmerAlphabet` size.
1572
+ If each bucket is used, the required memory space :math:`S` in byte
1573
+ is
1574
+
1575
+ .. math::
1576
+
1577
+ S = 16B + 16L
1578
+
1579
+ where :math:`B` is the number of buckets and :math:`L` is the summed
1580
+ length of all sequences added to the table.
1581
+
1582
+ *Buckets*
1583
+
1584
+ The ratio :math:`L/B` is called *load_factor*.
1585
+ By default :class:`BucketKmerTable` uses a load factor of
1586
+ approximately 0.8 to ensure efficient *k-mer* matching.
1587
+ The number of buckets can be adjusted by setting the
1588
+ `n_buckets` parameter on :class:`BucketKmerTable` creation.
1589
+ It is recommended to use :func:`bucket_number()` to compute an
1590
+ appropriate number of buckets.
1591
+
1592
+ *Multiprocessing*
1593
+
1594
+ :class:`BucketKmerTable` objects can be used in multi-processed
1595
+ setups:
1596
+ Adding a large database of sequences to a table can be sped up by
1597
+ splitting the database into smaller chunks and create a separate
1598
+ table for each chunk in separate processes.
1599
+ Eventually, the tables can be merged to one large table using
1600
+ :meth:`from_tables()`.
1601
+
1602
+ Since :class:`BucketKmerTable` supports the *pickle* protocol,
1603
+ the matching step can also be divided into multiple processes, if
1604
+ multiple sequences need to be matched.
1605
+
1606
+ *Storage on hard drive*
1607
+
1608
+ The most time efficient way to read/write a :class:`BucketKmerTable`
1609
+ is the *pickle* format.
1610
+
1611
+ *Indexing and iteration*
1612
+
1613
+ Due to the higher complexity in the *k-mer* lookup compared to
1614
+ :class:`KmerTable`, this class is still indexable but not iterable.
1615
+
1616
+ Examples
1617
+ --------
1618
+
1619
+ Create a *2-mer* index table for some nucleotide sequences:
1620
+
1621
+ >>> table = BucketKmerTable.from_sequences(
1622
+ ... k = 2,
1623
+ ... sequences = [NucleotideSequence("TTATA"), NucleotideSequence("CTAG")],
1624
+ ... ref_ids = [0, 1]
1625
+ ... )
1626
+
1627
+ Display the contents of the table as
1628
+ (reference ID, sequence position) tuples:
1629
+
1630
+ >>> print(table)
1631
+ AG: (1, 2)
1632
+ AT: (0, 2)
1633
+ CT: (1, 0)
1634
+ TA: (0, 1), (0, 3), (1, 1)
1635
+ TT: (0, 0)
1636
+
1637
+ Find matches of the table with a sequence:
1638
+
1639
+ >>> query = NucleotideSequence("TAG")
1640
+ >>> matches = table.match(query)
1641
+ >>> for query_pos, table_ref_id, table_pos in matches:
1642
+ ... print("Query sequence position:", query_pos)
1643
+ ... print("Table reference ID: ", table_ref_id)
1644
+ ... print("Table sequence position:", table_pos)
1645
+ ... print()
1646
+ Query sequence position: 0
1647
+ Table reference ID: 0
1648
+ Table sequence position: 1
1649
+ <BLANKLINE>
1650
+ Query sequence position: 0
1651
+ Table reference ID: 0
1652
+ Table sequence position: 3
1653
+ <BLANKLINE>
1654
+ Query sequence position: 0
1655
+ Table reference ID: 1
1656
+ Table sequence position: 1
1657
+ <BLANKLINE>
1658
+ Query sequence position: 1
1659
+ Table reference ID: 1
1660
+ Table sequence position: 2
1661
+ <BLANKLINE>
1662
+
1663
+ Get all reference IDs and positions for a given *k-mer*:
1664
+
1665
+ >>> kmer_code = table.kmer_alphabet.encode("TA")
1666
+ >>> print(table[kmer_code])
1667
+ [[0 1]
1668
+ [0 3]
1669
+ [1 1]]
1670
+ """
1671
+
1672
+ cdef object _kmer_alph
1673
+ cdef int _k
1674
+ cdef int64 _n_buckets
1675
+
1676
+ # The pointer array is the core of the index table:
1677
+ # It maps each possible k-mer bucket (represented by its code) to a
1678
+ # C-array of indices.
1679
+ # Each entry in a C-array contains the k-mer code, a reference ID
1680
+ # and the location in that sequence where that k-mer appears
1681
+ # The memory layout of each C-array is as following:
1682
+ #
1683
+ # (Array length) (k-mer 0) (RefID 0) (Position 0) (k-mer 1) ...
1684
+ # -----int64----|--int64--|---uint32---|---uint32---|--int64--
1685
+ #
1686
+ # The array length is based on 32 bit units.
1687
+ # If there is no entry for a k-mer bucket, the respective pointer is
1688
+ # NULL.
1689
+ cdef ptr[:] _ptr_array
1690
+
1691
+
1692
def __cinit__(self, n_buckets, kmer_alphabet):
    # Set up an empty table with the given number of buckets for
    # the given k-mer alphabet
    # Raises a ValueError, if the number of buckets is not positive

    # This check is necessary for proper memory management
    # of the allocated arrays
    if self._is_initialized():
        raise Exception("Duplicate call of constructor")

    # The bucket of a k-mer is computed via modulo by the number of
    # buckets in code with disabled C division checks:
    # Zero buckets would therefore lead to a division by zero
    if n_buckets < 1:
        raise ValueError(
            f"The number of buckets must be positive, got {n_buckets}"
        )

    self._kmer_alph = kmer_alphabet
    self._k = kmer_alphabet.k
    # Having more buckets than possible k-mers gives no benefit,
    # so the number of buckets is capped at the alphabet length
    self._n_buckets = min(len(self._kmer_alph), n_buckets)
    # All pointers are NULL in the beginning
    self._ptr_array = np.zeros(self._n_buckets, dtype=np.uint64)
1705
+
1706
+
1707
@property
def kmer_alphabet(self):
    # The KmerAlphabet object that was given to the constructor
    return self._kmer_alph
1710
+
1711
@property
def alphabet(self):
    # The base alphabet of the internal KmerAlphabet
    return self._kmer_alph.base_alphabet
1714
+
1715
@property
def k(self):
    # The value of k, as taken from the KmerAlphabet on construction
    return self._k
1718
+
1719
@property
def n_buckets(self):
    # The number of buckets that is actually used by this table:
    # It may be smaller than the number requested on construction,
    # as it never exceeds the length of the KmerAlphabet
    return self._n_buckets
1722
+
1723
@staticmethod
def from_sequences(k, sequences, ref_ids=None, ignore_masks=None,
                   alphabet=None, spacing=None, n_buckets=None):
    """
    from_sequences(k, sequences, ref_ids=None, ignore_masks=None,
                   alphabet=None, spacing=None, n_buckets=None)

    Create a :class:`BucketKmerTable` by storing the positions of
    all overlapping *k-mers* from the input `sequences`.

    Parameters
    ----------
    k : int
        The length of the *k-mers*.
    sequences : sized iterable object of Sequence, length=m
        The sequences to get the *k-mer* positions from.
        These sequences must have equal alphabets, or one of these
        sequences must have an alphabet that extends the alphabets
        of all other sequences.
    ref_ids : sized iterable object of int, length=m, optional
        The reference IDs for the given sequences.
        These are used to identify the corresponding sequence for a
        *k-mer* match.
        By default the IDs are counted from *0* to *m*.
    ignore_masks : sized iterable object of (ndarray, dtype=bool), length=m, optional
        Sequence positions to ignore.
        *k-mers* that involve these sequence positions are not added
        to the table.
        This is used e.g. to skip repeat regions.
        If provided, the list must contain one boolean mask
        (or ``None``) for each sequence, and each boolean mask must
        have the same length as the sequence.
        By default, no sequence position is ignored.
    alphabet : Alphabet, optional
        The alphabet to use for this table.
        It must extend the alphabets of the input `sequences`.
        By default, an appropriate alphabet is inferred from the
        input `sequences`.
        This option is usually used for compatibility with another
        sequence/table in the matching step.
    spacing : None or str or list or ndarray, dtype=int, shape=(k,)
        If provided, spaced *k-mers* are used instead of continuous
        ones.
        The value contains the *informative* positions relative to
        the start of the *k-mer*, also called the *model*.
        The number of *informative* positions must equal *k*.
        Refer to :class:`KmerAlphabet` for more details.
    n_buckets : int, optional
        Set the number of buckets in the table, e.g. to use a
        different load factor.
        It is recommended to use :func:`bucket_number()` for this
        purpose.
        By default, a load factor of approximately 0.8 is used.

    See Also
    --------
    from_kmers : The same functionality based on already created *k-mers*

    Returns
    -------
    table : BucketKmerTable
        The newly created table.

    Examples
    --------

    >>> sequences = [NucleotideSequence("TTATA"), NucleotideSequence("CTAG")]
    >>> table = BucketKmerTable.from_sequences(
    ...     2, sequences, ref_ids=[100, 101]
    ... )
    >>> print(table)
    AG: (101, 2)
    AT: (100, 2)
    CT: (101, 0)
    TA: (100, 1), (100, 3), (101, 1)
    TT: (100, 0)

    Give an explicit compatible alphabet:

    >>> table = BucketKmerTable.from_sequences(
    ...     2, sequences, ref_ids=[100, 101],
    ...     alphabet=NucleotideSequence.ambiguous_alphabet()
    ... )

    Ignore all ``N`` in a sequence:

    >>> sequence = NucleotideSequence("ACCNTANNG")
    >>> table = BucketKmerTable.from_sequences(
    ...     2, [sequence], ignore_masks=[sequence.symbols == "N"]
    ... )
    >>> print(table)
    AC: (0, 0)
    CC: (0, 1)
    TA: (0, 4)
    """
    ref_ids = _compute_ref_ids(ref_ids, sequences)
    ignore_masks = _compute_masks(ignore_masks, sequences)
    alphabet = _compute_alphabet(
        alphabet, (sequence.alphabet for sequence in sequences)
    )
    kmer_alphabet = KmerAlphabet(alphabet, k, spacing)

    # Calculate k-mers
    kmers_list = [
        kmer_alphabet.create_kmers(sequence.code)
        for sequence in sequences
    ]

    # The default number of buckets is based on the total number of
    # k-mers in all sequences; the ignore masks are not taken into
    # account for this number
    if n_buckets is None:
        n_kmers = np.sum([len(kmers) for kmers in kmers_list])
        n_buckets = bucket_number(n_kmers)

    table = BucketKmerTable(n_buckets, kmer_alphabet)

    # One prepared mask for each sequence, that is used for both,
    # the counting and the filling step below
    masks = [
        _prepare_mask(kmer_alphabet, ignore_mask, len(sequence))
        for sequence, ignore_mask in zip(sequences, ignore_masks)
    ]

    # Count the number of appearances of each k-mer and store the
    # result in the pointer array, that is now used as count array
    for kmers, mask in zip(kmers_list, masks):
        table._count_masked_kmers(kmers, mask)

    # Transform count array into pointer array with C-array of
    # appropriate size
    _init_c_arrays(table._ptr_array, EntrySize.BUCKETS)

    # Fill the C-arrays with the k-mer positions
    for kmers, ref_id, mask in zip(kmers_list, ref_ids, masks):
        table._add_kmers(kmers, ref_id, mask)

    return table
1856
+
1857
+
1858
@staticmethod
def from_kmers(kmer_alphabet, kmers, ref_ids=None, masks=None,
               n_buckets=None):
    """
    from_kmers(kmer_alphabet, kmers, ref_ids=None, masks=None,
               n_buckets=None)

    Create a :class:`BucketKmerTable` by storing the positions of
    all input *k-mers*.

    Parameters
    ----------
    kmer_alphabet : KmerAlphabet
        The :class:`KmerAlphabet` to use for the new table.
        Should be the same alphabet that was used to calculate the
        input *kmers*.
    kmers : sized iterable object of (ndarray, dtype=np.int64), length=m
        List where each array contains the *k-mer* codes from a
        sequence.
        For each array the index of the *k-mer* code in the array
        is stored in the table as sequence position.
    ref_ids : sized iterable object of int, length=m, optional
        The reference IDs for the sequences.
        These are used to identify the corresponding sequence for a
        *k-mer* match.
        By default the IDs are counted from *0* to *m*.
    masks : sized iterable object of (ndarray, dtype=bool), length=m, optional
        A *k-mer* code at a position, where the corresponding mask
        is false, is not added to the table.
        By default, all positions are added.
    n_buckets : int, optional
        Set the number of buckets in the table, e.g. to use a
        different load factor.
        It is recommended to use :func:`bucket_number()` for this
        purpose.
        By default, a load factor of approximately 0.8 is used.

    See Also
    --------
    from_sequences : The same functionality based on undecomposed sequences

    Returns
    -------
    table : BucketKmerTable
        The newly created table.

    Examples
    --------

    >>> sequences = [ProteinSequence("BIQTITE"), ProteinSequence("NIQBITE")]
    >>> kmer_alphabet = KmerAlphabet(ProteinSequence.alphabet, 3)
    >>> kmer_codes = [kmer_alphabet.create_kmers(s.code) for s in sequences]
    >>> for code in kmer_codes:
    ...     print(code)
    [11701 4360 7879 9400 4419]
    [ 6517 4364 7975 11704 4419]
    >>> table = BucketKmerTable.from_kmers(kmer_alphabet, kmer_codes)
    >>> print(table)
    IQT: (0, 1)
    IQB: (1, 1)
    ITE: (0, 4), (1, 4)
    NIQ: (1, 0)
    QTI: (0, 2)
    QBI: (1, 2)
    TIT: (0, 3)
    BIQ: (0, 0)
    BIT: (1, 3)
    """
    _check_kmer_alphabet(kmer_alphabet)
    _check_multiple_kmer_bounds(kmers, kmer_alphabet)

    ref_ids = _compute_ref_ids(ref_ids, kmers)
    masks = _compute_masks(masks, kmers)

    # The default number of buckets is based on the total number of
    # given k-mers; the masks are not taken into account for this
    # number
    if n_buckets is None:
        n_kmers = np.sum([len(e) for e in kmers])
        n_buckets = bucket_number(n_kmers)

    table = BucketKmerTable(n_buckets, kmer_alphabet)

    # A missing mask is replaced by a mask that accepts each position
    masks = [
        np.ones(len(arr), dtype=np.uint8) if mask is None
        # Convert boolean mask into uint8 array to be able
        # to handle it as memory view
        else np.frombuffer(
            mask.astype(bool, copy=False), dtype=np.uint8
        )
        for mask, arr in zip(masks, kmers)
    ]

    # At first the pointer array is used as count array
    for arr, mask in zip(kmers, masks):
        table._count_masked_kmers(arr, mask)

    # Based on the counts the pointer array is set up
    _init_c_arrays(table._ptr_array, EntrySize.BUCKETS)

    # Add the k-mers with the index in their array as position
    for arr, ref_id, mask in zip(kmers, ref_ids, masks):
        table._add_kmers(arr, ref_id, mask)

    return table
1957
+
1958
+
1959
@staticmethod
def from_kmer_selection(kmer_alphabet, positions, kmers, ref_ids=None,
                        n_buckets=None):
    """
    from_kmer_selection(kmer_alphabet, positions, kmers, ref_ids=None,
                        n_buckets=None)

    Create a :class:`BucketKmerTable` by storing the positions of a
    filtered subset of input *k-mers*.

    This can be used to reduce the number of stored *k-mers* using
    a *k-mer* subset selector such as :class:`MinimizerSelector`.

    Parameters
    ----------
    kmer_alphabet : KmerAlphabet
        The :class:`KmerAlphabet` to use for the new table.
        Should be the same alphabet that was used to calculate the
        input *kmers*.
    positions : sized iterable object of (ndarray, shape=(n,), dtype=uint32), length=m
        List where each array contains the sequence positions of
        the filtered subset of *k-mers* given in `kmers`.
        The list may contain multiple elements for multiple
        sequences.
    kmers : sized iterable object of (ndarray, shape=(n,), dtype=np.int64), length=m
        List where each array contains the filtered subset of
        *k-mer* codes from a sequence.
        For each *k-mer* code the corresponding element of
        `positions` is stored in the table as sequence position.
        The list may contain multiple elements for multiple
        sequences.
    ref_ids : sized iterable object of int, length=m, optional
        The reference IDs for the sequences.
        These are used to identify the corresponding sequence for a
        *k-mer* match.
        By default the IDs are counted from *0* to *m*.
    n_buckets : int, optional
        Set the number of buckets in the table, e.g. to use a
        different load factor.
        It is recommended to use :func:`bucket_number()` for this
        purpose.
        By default, a load factor of approximately 0.8 is used.

    Returns
    -------
    table : BucketKmerTable
        The newly created table.

    Examples
    --------

    Reduce the size of sequence data in the table using minimizers:

    >>> sequence1 = ProteinSequence("THIS*IS*A*SEQVENCE")
    >>> kmer_alph = KmerAlphabet(sequence1.alphabet, k=3)
    >>> minimizer = MinimizerSelector(kmer_alph, window=4)
    >>> minimizer_pos, minimizers = minimizer.select(sequence1)
    >>> kmer_table = BucketKmerTable.from_kmer_selection(
    ...     kmer_alph, [minimizer_pos], [minimizers]
    ... )

    Use the same :class:`MinimizerSelector` to select the minimizers
    from the query sequence and match them against the table.
    Although the amount of *k-mers* is reduced, matching is still
    guaranteed to work, if the two sequences share identity in the
    given window:

    >>> sequence2 = ProteinSequence("ANQTHER*SEQVENCE")
    >>> minimizer_pos, minimizers = minimizer.select(sequence2)
    >>> matches = kmer_table.match_kmer_selection(minimizer_pos, minimizers)
    >>> print(matches)
    [[ 9 0 11]
    [12 0 14]]
    >>> for query_pos, _, db_pos in matches:
    ...     print(sequence1)
    ...     print(" " * (db_pos-1) + "^" * kmer_table.k)
    ...     print(sequence2)
    ...     print(" " * (query_pos-1) + "^" * kmer_table.k)
    ...     print()
    THIS*IS*A*SEQVENCE
              ^^^
    ANQTHER*SEQVENCE
            ^^^
    <BLANKLINE>
    THIS*IS*A*SEQVENCE
                 ^^^
    ANQTHER*SEQVENCE
               ^^^
    <BLANKLINE>
    """
    _check_kmer_alphabet(kmer_alphabet)
    _check_multiple_kmer_bounds(kmers, kmer_alphabet)
    _check_position_shape(positions, kmers)

    ref_ids = _compute_ref_ids(ref_ids, kmers)

    # The default number of buckets is based on the total number of
    # selected k-mers
    if n_buckets is None:
        n_kmers = np.sum([len(e) for e in kmers])
        n_buckets = bucket_number(n_kmers)

    table = BucketKmerTable(n_buckets, kmer_alphabet)

    # At first the pointer array is used as count array
    for arr in kmers:
        table._count_kmers(arr)

    # Based on the counts the pointer array is set up
    _init_c_arrays(table._ptr_array, EntrySize.BUCKETS)

    # Add the k-mers with their explicitly given positions,
    # which are handed over as uint32
    for pos, arr, ref_id in zip(positions, kmers, ref_ids):
        table._add_kmer_selection(
            pos.astype(np.uint32, copy=False), arr, ref_id
        )

    return table
2072
+
2073
+
2074
@staticmethod
def from_tables(tables):
    """
    from_tables(tables)

    Create a :class:`BucketKmerTable` by merging the *k-mer*
    positions from existing `tables`.

    Parameters
    ----------
    tables : iterable object of BucketKmerTable
        The tables to be merged.
        All tables must have equal number of buckets and equal
        :class:`KmerAlphabet` objects, i.e. the same *k* and equal
        base alphabets.

    Returns
    -------
    table : BucketKmerTable
        The newly created table.

    Examples
    --------
    To ensure that all tables have the same number of buckets,
    `n_buckets` needs to be set on table creation.

    >>> # The sequence length is not exactly the length of resulting k-mers,
    >>> # but it is close enough for bucket computation
    >>> n_buckets = bucket_number(len("TTATA") + len("CTAG"))
    >>> table1 = BucketKmerTable.from_sequences(
    ...     2, [NucleotideSequence("TTATA")], ref_ids=[100],
    ...     n_buckets=n_buckets
    ... )
    >>> table2 = BucketKmerTable.from_sequences(
    ...     2, [NucleotideSequence("CTAG")], ref_ids=[101],
    ...     n_buckets=n_buckets
    ... )
    >>> merged_table = BucketKmerTable.from_tables([table1, table2])
    >>> print(merged_table)
    AG: (101, 2)
    AT: (100, 2)
    CT: (101, 0)
    TA: (100, 1), (100, 3), (101, 1)
    TT: (100, 0)
    """
    # Statically typed loop variable to access the statically typed
    # pointer array of each table
    cdef BucketKmerTable table

    _check_same_kmer_alphabet(tables)
    _check_same_buckets(tables)

    # NOTE(review): 'tables' is indexed here and iterated twice
    # below, so a one-shot iterator or an empty collection
    # presumably does not work - confirm against the checks above
    merged_table = BucketKmerTable(
        tables[0].n_buckets,
        tables[0].kmer_alphabet
    )

    # Sum the number of appearances of each k-mer from the tables
    for table in tables:
        _count_table_entries(
            merged_table._ptr_array, table._ptr_array,
            EntrySize.BUCKETS
        )

    # Based on the counts the pointer array is set up
    _init_c_arrays(merged_table._ptr_array, EntrySize.BUCKETS)

    # Add the entries of each table to the merged table
    for table in tables:
        _append_entries(merged_table._ptr_array, table._ptr_array)

    return merged_table
2142
+
2143
+
2144
+ @cython.cdivision(True)
2145
+ @cython.boundscheck(False)
2146
+ @cython.wraparound(False)
2147
+ def match_table(self, BucketKmerTable table, similarity_rule=None):
2148
+ """
2149
+ match_table(table, similarity_rule=None)
2150
+
2151
+ Find matches between the *k-mers* in this table with the
2152
+ *k-mers* in another `table`.
2153
+
2154
+ This means that for each *k-mer* the cartesian product between
2155
+ the positions in both tables is added to the matches.
2156
+
2157
+ Parameters
2158
+ ----------
2159
+ table : BucketKmerTable
2160
+ The table to be matched.
2161
+ Both tables must have equal number of buckets and equal
2162
+ :class:`KmerAlphabet` objects, i.e. the same *k* and equal
2163
+ base alphabets.
2164
+ similarity_rule : SimilarityRule, optional
2165
+ If this parameter is given, not only exact *k-mer* matches
2166
+ are considered, but also similar ones according to the given
2167
+ :class:`SimilarityRule`.
2168
+
2169
+ Returns
2170
+ -------
2171
+ matches : ndarray, shape=(n,4), dtype=np.uint32
2172
+ The *k-mer* matches.
2173
+ Each row contains one match. Each match has the following
2174
+ columns:
2175
+
2176
+ 0. The reference ID of the matched sequence in the other
2177
+ table
2178
+ 1. The sequence position of the matched sequence in the
2179
+ other table
2180
+ 2. The reference ID of the matched sequence in this
2181
+ table
2182
+ 3. The sequence position of the matched sequence in this
2183
+ table
2184
+
2185
+ Notes
2186
+ -----
2187
+
2188
+
2189
+ There is no guaranteed order of the reference IDs or
2190
+ sequence positions in the returned matches.
2191
+
2192
+ Examples
2193
+ --------
2194
+ To ensure that both tables have the same number of buckets,
2195
+ `n_buckets` need to be set on table creation.
2196
+
2197
+ >>> # The sequence length is not exactly the length of resulting k-mers,
2198
+ >>> # but it is close enough for bucket computation
2199
+ >>> n_buckets = bucket_number(max(len("BIQTITE"), len("TITANITE")))
2200
+ >>> sequence1 = ProteinSequence("BIQTITE")
2201
+ >>> table1 = BucketKmerTable.from_sequences(3, [sequence1], ref_ids=[100])
2202
+ >>> print(table1)
2203
+ IQT: (100, 1)
2204
+ ITE: (100, 4)
2205
+ QTI: (100, 2)
2206
+ TIT: (100, 3)
2207
+ BIQ: (100, 0)
2208
+ >>> sequence2 = ProteinSequence("TITANITE")
2209
+ >>> table2 = BucketKmerTable.from_sequences(3, [sequence2], ref_ids=[101])
2210
+ >>> print(table2)
2211
+ ANI: (101, 3)
2212
+ ITA: (101, 1)
2213
+ ITE: (101, 5)
2214
+ NIT: (101, 4)
2215
+ TAN: (101, 2)
2216
+ TIT: (101, 0)
2217
+ >>> print(table1.match_table(table2))
2218
+ [[101 0 100 3]
2219
+ [101 5 100 4]]
2220
+ """
2221
+ cdef int INIT_SIZE = 1
2222
+
2223
+ cdef int64 bucket, sim_bucket
2224
+ cdef int64 self_kmer, other_kmer, sim_kmer
2225
+ cdef int64 match_i
2226
+ cdef int64 i, j, l
2227
+ cdef int64 self_length, other_length
2228
+ cdef uint32* self_bucket_ptr
2229
+ cdef uint32* other_bucket_ptr
2230
+
2231
+ # This variable will only be used if a similarity rule exists
2232
+ cdef int64[:] similar_kmers
2233
+
2234
+ # Store in new variables
2235
+ # to disable repetitive initialization checks
2236
+ cdef ptr[:] self_ptr_array = self._ptr_array
2237
+ cdef ptr[:] other_ptr_array = table._ptr_array
2238
+
2239
+ _check_same_kmer_alphabet((self, table))
2240
+ _check_same_buckets((self, table))
2241
+
2242
+ # This array will store the match positions
2243
+ # As the final number of matches is unknown, a list-like
2244
+ # approach is used:
2245
+ # The array is initialized with a relatively small initial size
2246
+ # and every time the limit would be exceeded its size is doubled
2247
+ cdef int64[:,:] matches = np.empty((INIT_SIZE, 4), dtype=np.int64)
2248
+ match_i = 0
2249
+ if similarity_rule is None:
2250
+ for bucket in range(self_ptr_array.shape[0]):
2251
+ self_bucket_ptr = <uint32*>self_ptr_array[bucket]
2252
+ other_bucket_ptr = <uint32*>other_ptr_array[bucket]
2253
+ if self_bucket_ptr != NULL and other_bucket_ptr != NULL:
2254
+ # This bucket exists for both tables
2255
+ other_length = (<int64*>other_bucket_ptr)[0]
2256
+ self_length = (<int64*>self_bucket_ptr )[0]
2257
+ for i in range(2, other_length, 4):
2258
+ # Hacky syntax to achieve casting to int64*
2259
+ # after offset is applied
2260
+ other_kmer = (<int64*>(other_bucket_ptr + i))[0]
2261
+ for j in range(2, self_length, 4):
2262
+ self_kmer = (<int64*>(self_bucket_ptr + j))[0]
2263
+ if self_kmer == other_kmer:
2264
+ # The k-mers are not only in the same
2265
+ # bucket, but they are actually equal
2266
+ if match_i >= matches.shape[0]:
2267
+ # The 'matches' array is full
2268
+ # -> double its size
2269
+ matches = expand(np.asarray(matches))
2270
+ matches[match_i, 0] = other_bucket_ptr[i+2]
2271
+ matches[match_i, 1] = other_bucket_ptr[i+3]
2272
+ matches[match_i, 2] = self_bucket_ptr[j+2]
2273
+ matches[match_i, 3] = self_bucket_ptr[j+3]
2274
+ match_i += 1
2275
+
2276
+ else:
2277
+ for bucket in range(self_ptr_array.shape[0]):
2278
+ other_bucket_ptr = <uint32*>other_ptr_array[bucket]
2279
+ if other_bucket_ptr != NULL:
2280
+ other_length = (<int64*>other_bucket_ptr)[0]
2281
+ for i in range(2, other_length, 4):
2282
+ other_kmer = (<int64*>(other_bucket_ptr + i))[0]
2283
+ # If a similarity rule exists, iterate not only over
2284
+ # the exact k-mer, but over all k-mers similar to
2285
+ # the current k-mer
2286
+ similar_kmers = similarity_rule.similar_kmers(
2287
+ self._kmer_alph, other_kmer
2288
+ )
2289
+ for l in range(similar_kmers.shape[0]):
2290
+ sim_kmer = similar_kmers[l]
2291
+ sim_bucket = sim_kmer % self._n_buckets
2292
+ self_bucket_ptr = <uint32*>self_ptr_array[sim_bucket]
2293
+ if self_bucket_ptr != NULL:
2294
+ self_length = (<int64*>self_bucket_ptr)[0]
2295
+ for j in range(2, self_length, 4):
2296
+ self_kmer = (<int64*>(self_bucket_ptr + j))[0]
2297
+ if self_kmer == sim_kmer:
2298
+ if match_i >= matches.shape[0]:
2299
+ # The 'matches' array is full
2300
+ # -> double its size
2301
+ matches = expand(np.asarray(matches))
2302
+ matches[match_i, 0] = other_bucket_ptr[i+2]
2303
+ matches[match_i, 1] = other_bucket_ptr[i+3]
2304
+ matches[match_i, 2] = self_bucket_ptr[j+2]
2305
+ matches[match_i, 3] = self_bucket_ptr[j+3]
2306
+ match_i += 1
2307
+
2308
+ # Trim to correct size and return
2309
+ return np.asarray(matches[:match_i])
2310
+
2311
+
2312
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def match(self, sequence, similarity_rule=None, ignore_mask=None):
    """
    match(sequence, similarity_rule=None, ignore_mask=None)

    Find matches between the *k-mers* in this table with all
    overlapping *k-mers* in the given `sequence`.
    *k* is determined by the table.

    Parameters
    ----------
    sequence : Sequence
        The sequence to be matched.
        The table's base alphabet must extend the alphabet of the
        sequence.
    similarity_rule : SimilarityRule, optional
        If this parameter is given, not only exact *k-mer* matches
        are considered, but also similar ones according to the given
        :class:`SimilarityRule`.
    ignore_mask : ndarray, dtype=bool, optional
        Boolean mask of sequence positions to ignore.
        *k-mers* that involve these sequence positions are skipped,
        i.e. they are not matched against the table.
        This is used e.g. to skip repeat regions.
        By default, no sequence position is ignored.

    Returns
    -------
    matches : ndarray, shape=(n,3), dtype=np.int64
        The *k-mer* matches.
        Each row contains one match. Each match has the following
        columns:

            0. The sequence position in the input sequence
            1. The reference ID of the matched sequence in the table
            2. The sequence position of the matched sequence in the
               table

    Raises
    ------
    ValueError
        If the sequence code is shorter than *k* or if the table's
        base alphabet does not extend the alphabet of the sequence.

    Notes
    -----

    The matches are ordered by the first column.

    Examples
    --------

    >>> sequence1 = ProteinSequence("BIQTITE")
    >>> table = BucketKmerTable.from_sequences(3, [sequence1], ref_ids=[100])
    >>> print(table)
    IQT: (100, 1)
    ITE: (100, 4)
    QTI: (100, 2)
    TIT: (100, 3)
    BIQ: (100, 0)
    >>> sequence2 = ProteinSequence("TITANITE")
    >>> print(table.match(sequence2))
    [[  0 100   3]
     [  5 100   4]]
    """
    cdef int INIT_SIZE = 1

    cdef int64 bucket
    cdef int64 self_kmer, other_kmer, sim_kmer
    cdef int64 match_i
    cdef int64 i, l
    cdef int64 length
    cdef uint32* bucket_ptr
    cdef uint32* array_stop

    # This variable will only be used if a similarity rule exists
    cdef int64[:] similar_kmers

    # Store in new variable
    # to disable repetitive initialization checks
    cdef ptr[:] ptr_array = self._ptr_array

    if len(sequence.code) < self._k:
        raise ValueError("Sequence code is shorter than k")
    if not self._kmer_alph.base_alphabet.extends(sequence.alphabet):
        raise ValueError(
            "The alphabet used for the k-mer index table is not equal to "
            "the alphabet of the sequence"
        )

    cdef int64[:] kmers = self._kmer_alph.create_kmers(sequence.code)
    cdef uint8[:] kmer_mask = _prepare_mask(
        self._kmer_alph, ignore_mask, len(sequence.code)
    )

    # This array will store the match positions
    # As the final number of matches is unknown, a list-like
    # approach is used:
    # The array is initialized with a relatively small initial size
    # and every time the limit would be exceeded its size is doubled
    cdef int64[:,:] matches = np.empty((INIT_SIZE, 3), dtype=np.int64)
    match_i = 0
    if similarity_rule is None:
        for i in range(kmers.shape[0]):
            if kmer_mask[i]:
                other_kmer = kmers[i]
                bucket = other_kmer % self._n_buckets
                bucket_ptr = <uint32*>ptr_array[bucket]
                if bucket_ptr != NULL:
                    # There is at least one entry in this bucket
                    # The first 64 bits of the C-array hold its filled
                    # length, counted in uint32 elements
                    length = (<int64*>bucket_ptr)[0]
                    array_stop = bucket_ptr + length
                    # Skip the length value (two uint32 elements)
                    bucket_ptr += 2
                    while bucket_ptr < array_stop:
                        # Each entry starts with the 64 bit k-mer code
                        self_kmer = (<int64*>bucket_ptr)[0]
                        if self_kmer == other_kmer:
                            # The k-mers are not only in the same
                            # bucket, but they are actually equal
                            if match_i >= matches.shape[0]:
                                # The 'matches' array is full
                                # -> double its size
                                matches = expand(np.asarray(matches))
                            matches[match_i, 0] = i
                            # Step over the k-mer code to the
                            # reference ID and then to the position
                            bucket_ptr += 2
                            matches[match_i, 1] = bucket_ptr[0]
                            bucket_ptr += 1
                            matches[match_i, 2] = bucket_ptr[0]
                            bucket_ptr += 1
                            match_i += 1
                        else:
                            # Jump to the next entry
                            # NOTE(review): presumably
                            # EntrySize.BUCKETS == 4, matching the
                            # 2 + 1 + 1 steps above -- confirm
                            bucket_ptr += EntrySize.BUCKETS

    else:
        for i in range(kmers.shape[0]):
            if kmer_mask[i]:
                other_kmer = kmers[i]
                # If a similarity rule exists, iterate not only over
                # the exact k-mer, but over all k-mers similar to
                # the current k-mer
                similar_kmers = similarity_rule.similar_kmers(
                    self._kmer_alph, other_kmer
                )
                for l in range(similar_kmers.shape[0]):
                    sim_kmer = similar_kmers[l]
                    bucket = sim_kmer % self._n_buckets
                    # Actual copy of the code from the other
                    # if-branch:
                    # It cannot be put properly in a cdef-function,
                    # as every function call would perform reference
                    # count changes and would decrease performance
                    bucket_ptr = <uint32*>ptr_array[bucket]
                    if bucket_ptr != NULL:
                        # There is at least one entry in this bucket
                        length = (<int64*>bucket_ptr)[0]
                        array_stop = bucket_ptr + length
                        bucket_ptr += 2
                        while bucket_ptr < array_stop:
                            self_kmer = (<int64*>bucket_ptr)[0]
                            if self_kmer == sim_kmer:
                                # The k-mers are not only in the same
                                # bucket, but they are actually equal
                                if match_i >= matches.shape[0]:
                                    # The 'matches' array is full
                                    # -> double its size
                                    matches = expand(np.asarray(matches))
                                matches[match_i, 0] = i
                                bucket_ptr += 2
                                matches[match_i, 1] = bucket_ptr[0]
                                bucket_ptr += 1
                                matches[match_i, 2] = bucket_ptr[0]
                                bucket_ptr += 1
                                match_i += 1
                            else:
                                bucket_ptr += EntrySize.BUCKETS

    # Trim to correct size and return
    return np.asarray(matches[:match_i])
2485
+
2486
+
2487
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def match_kmer_selection(self, positions, kmers):
    """
    match_kmer_selection(positions, kmers)

    Find matches between the *k-mers* in this table with the given
    *k-mer* selection.

    It is intended to use this method to find matches in a table
    that was created using :meth:`from_kmer_selection()`.

    Parameters
    ----------
    positions : ndarray, shape=(n,), dtype=uint32
        Sequence positions of the filtered subset of *k-mers* given
        in `kmers`.
    kmers : ndarray, shape=(n,), dtype=np.int64
        Filtered subset of *k-mer* codes to match against.

    Returns
    -------
    matches : ndarray, shape=(n,3), dtype=np.int64
        The *k-mer* matches.
        Each row contains one *k-mer* match.
        Each match has the following columns:

            0. The sequence position of the input *k-mer*, taken
               from `positions`
            1. The reference ID of the matched sequence in the table
            2. The sequence position of the matched *k-mer* in the
               table

    Raises
    ------
    IndexError
        If `positions` and `kmers` do not have the same length.

    Examples
    --------

    Reduce the size of sequence data in the table using minimizers:

    >>> sequence1 = ProteinSequence("THIS*IS*A*SEQVENCE")
    >>> kmer_alph = KmerAlphabet(sequence1.alphabet, k=3)
    >>> minimizer = MinimizerSelector(kmer_alph, window=4)
    >>> minimizer_pos, minimizers = minimizer.select(sequence1)
    >>> kmer_table = BucketKmerTable.from_kmer_selection(
    ...     kmer_alph, [minimizer_pos], [minimizers]
    ... )

    Use the same :class:`MinimizerSelector` to select the minimizers
    from the query sequence and match them against the table.
    Although the amount of *k-mers* is reduced, matching is still
    guaranteed to work, if the two sequences share identity in the
    given window:

    >>> sequence2 = ProteinSequence("ANQTHER*SEQVENCE")
    >>> minimizer_pos, minimizers = minimizer.select(sequence2)
    >>> matches = kmer_table.match_kmer_selection(minimizer_pos, minimizers)
    >>> print(matches)
    [[ 9  0 11]
     [12  0 14]]
    >>> for query_pos, _, db_pos in matches:
    ...     print(sequence1)
    ...     print(" " * (db_pos-1) + "^" * kmer_table.k)
    ...     print(sequence2)
    ...     print(" " * (query_pos-1) + "^" * kmer_table.k)
    ...     print()
    THIS*IS*A*SEQVENCE
              ^^^
    ANQTHER*SEQVENCE
            ^^^
    <BLANKLINE>
    THIS*IS*A*SEQVENCE
                 ^^^
    ANQTHER*SEQVENCE
               ^^^
    <BLANKLINE>
    """
    cdef int INIT_SIZE = 1

    cdef int64 i

    cdef int64 bucket
    cdef int64 self_kmer, other_kmer
    cdef int64 match_i
    cdef int64 seq_pos
    cdef int64 length
    cdef uint32* bucket_ptr
    cdef uint32* array_stop

    # Store in new variable
    # to disable repetitive initialization checks
    cdef ptr[:] ptr_array = self._ptr_array

    _check_kmer_bounds(kmers, self._kmer_alph)
    if positions.shape[0] != kmers.shape[0]:
        raise IndexError(
            f"{positions.shape[0]} positions were given "
            f"for {kmers.shape[0]} k-mers"
        )

    cdef uint32[:] pos_array = positions.astype(np.uint32, copy=False)
    cdef int64[:] kmer_array = kmers.astype(np.int64, copy=False)

    # This array will store the match positions
    # As the final number of matches is unknown, a list-like
    # approach is used:
    # The array is initialized with a relatively small initial size
    # and every time the limit would be exceeded its size is doubled
    cdef int64[:,:] matches = np.empty((INIT_SIZE, 3), dtype=np.int64)
    match_i = 0
    for i in range(kmer_array.shape[0]):
        other_kmer = kmer_array[i]
        seq_pos = pos_array[i]
        bucket = other_kmer % self._n_buckets
        bucket_ptr = <uint32*>ptr_array[bucket]
        if bucket_ptr != NULL:
            # There is at least one entry in this bucket
            # The first 64 bits of the C-array hold its filled
            # length, counted in uint32 elements
            length = (<int64*>bucket_ptr)[0]
            array_stop = bucket_ptr + length
            # Skip the length value (two uint32 elements)
            bucket_ptr += 2
            while bucket_ptr < array_stop:
                # Each entry starts with the 64 bit k-mer code
                self_kmer = (<int64*>bucket_ptr)[0]
                if self_kmer == other_kmer:
                    # The k-mers are not only in the same
                    # bucket, but they are actually equal
                    if match_i >= matches.shape[0]:
                        # The 'matches' array is full
                        # -> double its size
                        matches = expand(np.asarray(matches))
                    matches[match_i, 0] = seq_pos
                    # Step over the k-mer code to the
                    # reference ID and then to the position
                    bucket_ptr += 2
                    matches[match_i, 1] = bucket_ptr[0]
                    bucket_ptr += 1
                    matches[match_i, 2] = bucket_ptr[0]
                    bucket_ptr += 1
                    match_i += 1
                else:
                    # Jump to the next entry
                    # NOTE(review): presumably EntrySize.BUCKETS == 4,
                    # matching the 2 + 1 + 1 steps above -- confirm
                    bucket_ptr += EntrySize.BUCKETS

    # Trim to correct size and return
    return np.asarray(matches[:match_i])
2627
+
2628
+
2629
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def count(self, kmers):
    """
    count(kmers)

    Count the number of occurrences of the given *k-mers* in the
    table.

    Parameters
    ----------
    kmers : ndarray, dtype=np.int64
        The count is returned for these *k-mer* codes.

    Returns
    -------
    counts : ndarray, dtype=np.int64
        The counts for each given *k-mer*, in the order of `kmers`.

    Notes
    -----
    As each bucket needs to be inspected for the actual *k-mer*
    entries, this method requires far more computation time than its
    :class:`KmerTable` equivalent.

    Examples
    --------
    >>> table = BucketKmerTable.from_sequences(
    ...     k = 2,
    ...     sequences = [NucleotideSequence("TTATA"), NucleotideSequence("CTAG")],
    ...     ref_ids = [0, 1]
    ... )
    >>> print(table)
    AG: (1, 2)
    AT: (0, 2)
    CT: (1, 0)
    TA: (0, 1), (0, 3), (1, 1)
    TT: (0, 0)

    Count two selected *k-mers*:

    >>> print(table.count(table.kmer_alphabet.encode_multiple(["TA", "AG"])))
    [3 1]
    """
    cdef int64 query_i
    cdef int64 bucket_index
    cdef int64 query_kmer
    cdef int64 filled_length
    cdef int64 n_hits
    cdef uint32* entry_ptr
    cdef uint32* bucket_end
    cdef ptr[:] buckets = self._ptr_array

    _check_kmer_bounds(kmers, self._kmer_alph)
    cdef int64[:] queries = kmers.astype(np.int64, copy=False)
    cdef int64[:] counts = np.zeros(queries.shape[0], dtype=np.int64)

    for query_i in range(queries.shape[0]):
        query_kmer = queries[query_i]
        bucket_index = query_kmer % self._n_buckets
        entry_ptr = <uint32*> (buckets[bucket_index])
        if entry_ptr == NULL:
            # Empty bucket -> the count stays zero
            continue
        # The first 64 bits hold the filled length of the C-array
        filled_length = (<int64*>entry_ptr)[0]
        bucket_end = entry_ptr + filled_length
        entry_ptr += 2
        n_hits = 0
        while entry_ptr < bucket_end:
            # Different k-mers may share a bucket,
            # hence the stored code needs to be compared
            if (<int64*>entry_ptr)[0] == query_kmer:
                n_hits += 1
            entry_ptr += EntrySize.BUCKETS
        counts[query_i] = n_hits

    return np.asarray(counts)
2703
+
2704
+
2705
@cython.boundscheck(False)
@cython.wraparound(False)
def get_kmers(self):
    """
    Get the *k-mer* codes for all *k-mers* that have at least one
    position in the table.

    Returns
    -------
    kmers : ndarray, shape=(n,), dtype=np.int64
        The *k-mer* codes in ascending order.

    Notes
    -----
    As each bucket needs to be inspected for the actual *k-mer*
    entries, this method requires far more computation time than its
    :class:`KmerTable` equivalent.

    Examples
    --------

    >>> sequence = ProteinSequence("BIQTITE")
    >>> table = BucketKmerTable.from_sequences(3, [sequence], ref_ids=[100])
    >>> print(table)
    IQT: (100, 1)
    ITE: (100, 4)
    QTI: (100, 2)
    TIT: (100, 3)
    BIQ: (100, 0)
    >>> kmer_codes = table.get_kmers()
    >>> print(kmer_codes)
    [ 4360  4419  7879  9400 11701]
    >>> for code in kmer_codes:
    ...     print(table[code])
    [[100   1]]
    [[100   4]]
    [[100   2]]
    [[100   3]]
    [[100   0]]
    """
    cdef int64 bucket_index
    cdef int64 code
    cdef int64 filled_length
    cdef uint32* entry_ptr
    cdef uint32* bucket_end
    cdef ptr[:] buckets = self._ptr_array

    # Collects each k-mer code only once,
    # even if it has multiple positions in the table
    cdef cpp_set[int64] unique_codes

    for bucket_index in range(buckets.shape[0]):
        entry_ptr = <uint32*> (buckets[bucket_index])
        if entry_ptr == NULL:
            continue
        # The first 64 bits hold the filled length of the C-array
        filled_length = (<int64*>entry_ptr)[0]
        bucket_end = entry_ptr + filled_length
        entry_ptr += 2
        while entry_ptr < bucket_end:
            unique_codes.insert((<int64*>entry_ptr)[0])
            entry_ptr += EntrySize.BUCKETS

    cdef int64[:] kmers = np.zeros(unique_codes.size(), dtype=np.int64)
    cdef int64 out_i = 0
    for code in unique_codes:
        kmers[out_i] = code
        out_i += 1
    return np.sort(np.asarray(kmers))
2771
+
2772
+
2773
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def __getitem__(self, int64 kmer):
    # Return the (reference ID, sequence position) pairs stored for
    # the given k-mer code as ndarray with shape (n, 2) and dtype
    # uint32.
    # An AlphabetError is raised if the code is outside of the range
    # of the k-mer alphabet.
    cdef int64 i, j
    cdef int64 self_kmer
    cdef int64 length
    cdef uint32* bucket_ptr
    cdef uint32[:,:] positions

    # Negative codes must be rejected as well:
    # With C division semantics the modulo below would become negative
    # and index the pointer array out of bounds
    if kmer < 0 or kmer >= len(self):
        raise AlphabetError(
            f"k-mer code {kmer} is out of bounds "
            f"for the given KmerAlphabet"
        )

    bucket_ptr = <uint32*>self._ptr_array[kmer % self._n_buckets]
    if bucket_ptr == NULL:
        return np.zeros((0, 2), dtype=np.uint32)
    else:
        # The first 64 bits hold the filled length of the C-array
        length = (<int64*>bucket_ptr)[0]
        # Pessimistic array allocation:
        # All k-mer positions in bucket belong to the requested k-mer
        positions = np.empty(((length - 2) // 4, 2), dtype=np.uint32)
        i = 0
        for j in range(2, length, 4):
            # The k-mer code occupies two uint32 elements:
            # Read all 64 bits, otherwise codes >= 2**32 would be
            # truncated and compared incorrectly
            self_kmer = (<int64*>(bucket_ptr + j))[0]
            if self_kmer == kmer:
                positions[i,0] = bucket_ptr[j+2]
                positions[i,1] = bucket_ptr[j+3]
                i += 1
        # Trim to correct size
        return np.asarray(positions)[:i]
2806
+
2807
+
2808
def __len__(self):
    # The length of the table is the number of symbols
    # in its k-mer alphabet
    kmer_alphabet = self._kmer_alph
    return len(kmer_alphabet)
2810
+
2811
+
2812
def __eq__(self, item):
    # Two tables are equal, if they share base alphabet, k and number
    # of buckets and if their buckets contain equal elements
    if self is item:
        return True
    if type(item) is not BucketKmerTable:
        return False

    # Introduce static typing to access statically typed fields
    cdef BucketKmerTable other = item
    if (
        self._kmer_alph.base_alphabet != other._kmer_alph.base_alphabet
        or self._k != other._k
        or self._n_buckets != other._n_buckets
    ):
        return False
    return _equal_c_arrays(self._ptr_array, other._ptr_array)
2827
+
2828
+
2829
def __str__(self):
    # The string representation is created by a module-level helper
    text = _to_string(self)
    return text
2831
+
2832
+
2833
def __getnewargs_ex__(self):
    """
    Provide the positional and keyword arguments used to re-create
    the object upon unpickling: the number of buckets and the
    *k-mer* alphabet, without any keyword arguments.
    """
    positional = (self._n_buckets, self._kmer_alph)
    keywords = {}
    return positional, keywords
2835
+
2836
+
2837
def __getstate__(self):
    """
    Return the picklable state of the table, as created by
    ``_pickle_c_arrays()`` from the pointer array.
    """
    state = _pickle_c_arrays(self._ptr_array)
    return state
2839
+
2840
def __setstate__(self, state):
    """
    Restore the pointer array of the table from a `state` created by
    :meth:`__getstate__()`.
    """
    buckets = self._ptr_array
    _unpickle_c_arrays(buckets, state)
2842
+
2843
+
2844
def __dealloc__(self):
    # Without an initialized pointer array
    # there are no C-arrays that could be freed
    if not self._is_initialized():
        return
    _deallocate_ptrs(self._ptr_array)
2847
+
2848
+
2849
+ ## These private methods work analogously to those of KmerTable
2850
+
2851
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def _count_kmers(self, int64[:] kmers):
    """
    Increment, for each given *k-mer* code, the element of the
    pointer array that belongs to the bucket of this *k-mer*.
    In this phase the pointer array is used as an array of counts.
    """
    cdef uint32 seq_pos

    cdef ptr[:] bucket_counts = self._ptr_array

    for seq_pos in range(kmers.shape[0]):
        # Pool all k-mers that should go into the same bucket
        bucket_counts[kmers[seq_pos] % self._n_buckets] += 1
2864
+
2865
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def _count_masked_kmers(self, int64[:] kmers, uint8[:] mask):
    """
    Increment, for each given *k-mer* code whose `mask` element is
    true, the element of the pointer array that belongs to the
    bucket of this *k-mer*.
    In this phase the pointer array is used as an array of counts.
    """
    cdef uint32 seq_pos

    cdef ptr[:] bucket_counts = self._ptr_array

    for seq_pos in range(kmers.shape[0]):
        if not mask[seq_pos]:
            # Masked out k-mers are not counted
            continue
        # Pool all k-mers that should go into the same bucket
        bucket_counts[kmers[seq_pos] % self._n_buckets] += 1
2879
+
2880
+
2881
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def _add_kmers(self, int64[:] kmers, uint32 ref_id, uint8[:] mask):
    """
    Append an entry (*k-mer* code, reference ID, sequence position)
    to the corresponding bucket for each given *k-mer* whose `mask`
    element is true.
    The sequence position is the index of the *k-mer* in `kmers`.

    The bucket pointer is dereferenced without any check, so the
    C-arrays must already be allocated with sufficient capacity.

    An :class:`IndexError` is raised, if `mask` and `kmers` differ in
    length.
    """
    cdef uint32 seq_pos
    cdef int64 filled_length
    cdef int64 code
    cdef uint32* target

    # Store in new variable
    # to disable repetitive initialization checks
    cdef ptr[:] buckets = self._ptr_array

    if mask.shape[0] != kmers.shape[0]:
        raise IndexError(
            f"Mask has length {mask.shape[0]}, "
            f"but there are {kmers.shape[0]} k-mers"
        )

    for seq_pos in range(kmers.shape[0]):
        if not mask[seq_pos]:
            continue
        code = kmers[seq_pos]
        target = <uint32*> buckets[code % self._n_buckets]

        # The first 64 bits hold the currently filled length
        filled_length = (<int64*> target)[0]
        # Write the entry behind the last filled element:
        # 64 bit k-mer code, reference ID and position
        (<int64*> (target + filled_length))[0] = code
        target[filled_length + 2] = ref_id
        target[filled_length + 3] = seq_pos
        (<int64*> target)[0] = filled_length + EntrySize.BUCKETS
2913
+
2914
+
2915
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def _add_kmer_selection(self, uint32[:] positions, int64[:] kmers,
                        uint32 ref_id):
    """
    Append an entry (*k-mer* code, reference ID, sequence position)
    to the corresponding bucket for each given *k-mer*.
    The sequence position of each *k-mer* is taken from `positions`.

    The bucket pointer is dereferenced without any check, so the
    C-arrays must already be allocated with sufficient capacity.

    An :class:`IndexError` is raised, if `positions` and `kmers`
    differ in length.
    """
    cdef uint32 entry_i
    cdef int64 filled_length
    cdef int64 code
    cdef uint32* target

    if positions.shape[0] != kmers.shape[0]:
        raise IndexError(
            f"{positions.shape[0]} positions were given "
            f"for {kmers.shape[0]} k-mers"
        )

    # Store in new variable
    # to disable repetitive initialization checks
    cdef ptr[:] buckets = self._ptr_array

    for entry_i in range(positions.shape[0]):
        code = kmers[entry_i]
        target = <uint32*> buckets[code % self._n_buckets]

        # The first 64 bits hold the currently filled length
        filled_length = (<int64*> target)[0]
        # Write the entry behind the last filled element:
        # 64 bit k-mer code, reference ID and position
        (<int64*> (target + filled_length))[0] = code
        target[filled_length + 2] = ref_id
        target[filled_length + 3] = positions[entry_i]
        (<int64*> target)[0] = filled_length + EntrySize.BUCKETS
2949
+
2950
+
2951
cdef inline bint _is_initialized(self):
    # The table counts as initialized, if its pointer array can be
    # accessed and is not None
    try:
        return self._ptr_array is not None
    except AttributeError:
        return False
2959
+
2960
+
2961
+
2962
+
2963
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def _count_table_entries(ptr[:] count_array, ptr[:] ptr_array,
                         int64 element_size):
    """
    For each bucket, count the number of entries in `ptr_array` and
    add this number to the corresponding element of `count_array`.
    The `element_size` gives the number of 32 bit elements per entry.
    """
    cdef int64 bucket_index
    cdef int64 filled_length
    cdef uint32* bucket_ptr

    for bucket_index in range(count_array.shape[0]):
        bucket_ptr = <uint32*> (ptr_array[bucket_index])
        if bucket_ptr == NULL:
            # Empty buckets do not contribute any entries
            continue
        # First 64 bits are length of C-array,
        # they occupy two 32 bit elements themselves
        filled_length = (<int64*>bucket_ptr)[0]
        count_array[bucket_index] += (filled_length - 2) // element_size
2985
+
2986
+
2987
@cython.boundscheck(False)
@cython.wraparound(False)
def _init_c_arrays(ptr[:] ptr_array, int64 element_size):
    """
    Transform an array of counts into a pointer array, by replacing
    each nonzero count with a pointer to a newly allocated ``uint32``
    C-array that does not contain any entries yet.
    Elements with a count of zero are left untouched.

    Each C-array has capacity for the count multiplied by the
    `element_size`, plus two elements for the length value:
    The first 64 bits of each C-array hold its currently filled size
    (an ``int64``), measured in number of ``uint32`` elements.

    A :class:`MemoryError` is raised, if an allocation fails.
    """
    cdef int64 bucket_index
    cdef int64 n_entries
    cdef uint32* new_array

    for bucket_index in range(ptr_array.shape[0]):
        # Before the C-array for a bucket is initialized, the element
        # in the pointer array contains the number of entries the
        # C-array should hold
        n_entries = ptr_array[bucket_index]
        if n_entries == 0:
            continue
        # Array size + n x element size
        new_array = <uint32*>malloc(
            (2 + n_entries * element_size) * sizeof(uint32)
        )
        if new_array == NULL:
            raise MemoryError()
        # The initial size is 2,
        # which is the size of the array size value (int64)
        (<int64*> new_array)[0] = 2
        ptr_array[bucket_index] = <ptr>new_array
3020
+
3021
+
3022
@cython.boundscheck(False)
@cython.wraparound(False)
def _equal_c_arrays(ptr[:] self_ptr_array, ptr[:] other_ptr_array):
    """
    Check if two pointer arrays are equal, i.e. they point to C-arrays
    with equal lengths and equal elements.
    The number of compared buckets is taken from `self_ptr_array`.
    """
    cdef int64 bucket_index
    cdef int64 element_i
    cdef int64 length_a, length_b
    cdef uint32* array_a
    cdef uint32* array_b

    for bucket_index in range(self_ptr_array.shape[0]):
        array_a = <uint32*>self_ptr_array[bucket_index]
        array_b = <uint32*>other_ptr_array[bucket_index]
        if array_a == NULL and array_b == NULL:
            # The bucket is empty in both tables
            continue
        if array_a == NULL or array_b == NULL:
            # One of the tables has entries for this bucket
            # while the other one has not
            return False
        # This bucket exists in both tables
        length_a = (<int64*>array_a)[0]
        length_b = (<int64*>array_b)[0]
        if length_a != length_b:
            return False
        # The length value itself has already been compared
        for element_i in range(2, length_a):
            if array_a[element_i] != array_b[element_i]:
                return False

    # If none of the previous checks failed, both objects are equal
    return True
3054
+
3055
+
3056
@cython.boundscheck(False)
@cython.wraparound(False)
def _append_entries(ptr[:] trg_ptr_array, ptr[:] src_ptr_array):
    """
    Append the elements in all C-arrays of the source pointer array to
    the corresponding C-arrays of the target pointer array.

    The target C-arrays are expected to be initialized already with
    sufficient capacity: the target pointer is dereferenced without
    any check.
    """
    cdef int64 bucket_index
    cdef int64 target_length, source_length
    cdef uint32* target
    cdef uint32* source

    for bucket_index in range(trg_ptr_array.shape[0]):
        source = <uint32*>src_ptr_array[bucket_index]
        if source == NULL:
            # Nothing to append for this bucket
            continue
        target = <uint32*>trg_ptr_array[bucket_index]
        target_length = (<int64*>target)[0]
        source_length = (<int64*>source)[0]
        # The combined C-array holds the elements of both arrays,
        # but only one length value
        (<int64*>target)[0] = target_length + source_length - 2

        # Copy the entries of the source array (without its length
        # value) behind the last filled element of the target array
        memcpy(
            target + target_length, source + 2,
            (source_length - 2) * sizeof(uint32)
        )
3090
+
3091
+
3092
@cython.boundscheck(False)
@cython.wraparound(False)
def _pickle_c_arrays(ptr[:] ptr_array):
    """
    Pickle the C-arrays into a single concatenated :class:`ndarray`.
    The length of each C-array within this concatenated array is
    returned as a second array, with a length of zero for each
    ``NULL`` pointer.
    """
    cdef int64 bucket_index, write_offset
    cdef int64 filled_length
    cdef uint32* bucket_ptr

    # First pass: Count the total concatenated size
    cdef int64 total_length = 0
    for bucket_index in range(ptr_array.shape[0]):
        bucket_ptr = <uint32*>ptr_array[bucket_index]
        if bucket_ptr == NULL:
            continue
        # The first element of the C-array is the length
        # of the array
        total_length += (<int64*>bucket_ptr)[0]

    # Second pass: Copy the C-arrays into a single concatenated array
    # and track the length of each C-array
    cdef uint32[:] concatenated_array = np.empty(total_length, dtype=np.uint32)
    cdef int64[:] lengths = np.empty(ptr_array.shape[0], dtype=np.int64)
    write_offset = 0
    for bucket_index in range(ptr_array.shape[0]):
        bucket_ptr = <uint32*>ptr_array[bucket_index]
        if bucket_ptr == NULL:
            lengths[bucket_index] = 0
            continue
        filled_length = (<int64*>bucket_ptr)[0]
        lengths[bucket_index] = filled_length
        memcpy(
            &concatenated_array[write_offset],
            bucket_ptr,
            filled_length * sizeof(uint32),
        )
        write_offset += filled_length

    return np.asarray(concatenated_array), np.asarray(lengths)
3132
+
3133
+
3134
@cython.boundscheck(False)
@cython.wraparound(False)
def _unpickle_c_arrays(ptr[:] ptr_array, state):
    """
    Unpickle the pickled `state` into the given `ptr_array`.

    `state` is a tuple of the concatenated C-array elements and the
    length of each C-array.
    For each nonzero length a new C-array is allocated, filled from
    the concatenated array and stored in `ptr_array`.
    Elements of `ptr_array` with a length of zero are left untouched.
    A :class:`MemoryError` is raised, if an allocation fails.
    """
    cdef int64 bucket_index, read_offset
    cdef int64 filled_length
    cdef uint32* new_array

    cdef uint32[:] concatenated_array = state[0]
    cdef int64[:] lengths = state[1]

    read_offset = 0
    for bucket_index in range(ptr_array.shape[0]):
        filled_length = lengths[bucket_index]
        if filled_length == 0:
            continue
        new_array = <uint32*>malloc(filled_length * sizeof(uint32))
        if new_array == NULL:
            raise MemoryError
        memcpy(
            new_array,
            &concatenated_array[read_offset],
            filled_length * sizeof(uint32),
        )
        read_offset += filled_length
        ptr_array[bucket_index] = <ptr>new_array
3161
+
3162
+
3163
cdef inline void _deallocate_ptrs(ptr[:] ptrs):
    # Free the C-array behind each element of the pointer array
    cdef int64 index
    cdef int64 n_ptrs = ptrs.shape[0]
    for index in range(n_ptrs):
        free(<uint32*>ptrs[index])
3167
+
3168
+
3169
cdef np.ndarray expand(np.ndarray array):
    """
    Return a new 2D array with twice as many rows as the given one.
    The first half is filled with the content of `array`, the second
    half is left uninitialized.
    """
    n_rows = array.shape[0]
    doubled = np.empty((2 * n_rows, array.shape[1]), dtype=array.dtype)
    doubled[:n_rows, :] = array
    return doubled
3178
+
3179
+
3180
def _prepare_mask(kmer_alphabet, ignore_mask, seq_length):
    """
    Convert an ignore mask into a positive *k-mer* mask.

    Parameters
    ----------
    kmer_alphabet : KmerAlphabet
        The alphabet that determines the length of the *k-mer* mask.
    ignore_mask : ndarray, dtype=bool or None
        The sequence positions to ignore.
        If it is ``None``, no position is ignored.
    seq_length : int
        The length of the sequence, the mask refers to.

    Returns
    -------
    kmer_mask : ndarray, dtype=np.uint8
        Nonzero for each *k-mer* that is retained.

    Raises
    ------
    TypeError
        If `ignore_mask` is neither ``None`` nor an ``ndarray``.
    ValueError
        If `ignore_mask` is not a boolean array.
    IndexError
        If the length of `ignore_mask` is not equal to `seq_length`.
    """
    if ignore_mask is None:
        return np.ones(
            kmer_alphabet.kmer_array_length(seq_length), dtype=np.uint8
        )

    if not isinstance(ignore_mask, np.ndarray):
        raise TypeError(
            f"The given mask is a '{type(ignore_mask).__name__}', "
            f"but an ndarray was expected"
        )
    if ignore_mask.dtype != np.dtype(bool):
        raise ValueError("Expected a boolean mask")
    if len(ignore_mask) != seq_length:
        raise IndexError(
            f"ignore mask has length {len(ignore_mask)}, "
            f"but the length of the sequence is {seq_length}"
        )
    # Reinterpret the boolean values as uint8 without copying:
    # In contrast to 'np.frombuffer()' a view also works for
    # non-contiguous masks, e.g. slices with a step
    return _to_kmer_mask(ignore_mask.view(np.uint8), kmer_alphabet)
3210
+
3211
+
3212
@cython.boundscheck(False)
@cython.wraparound(False)
def _to_kmer_mask(uint8[:] mask not None, kmer_alphabet):
    """
    Transform a sequence ignore mask into a *k-mer* mask.

    The difference between those masks is that

    1. the *k-mer* mask is shorter and
    2. a position *i* in the *k-mer* mask is false, if any
       informative position of *k-mer[i]* is true in the ignore
       mask.

    Parameters
    ----------
    mask : ndarray, dtype=uint8
        The ignore mask over the sequence positions.
        Nonzero values mark ignored positions.
    kmer_alphabet : KmerAlphabet
        Its attributes ``k`` and ``spacing`` and its method
        ``kmer_array_length()`` are used.

    Returns
    -------
    kmer_mask : ndarray, dtype=uint8
        The mask over the *k-mer* positions.
    """
    cdef int64 i, j
    cdef bint is_retained

    cdef uint8[:] kmer_mask = np.empty(
        kmer_alphabet.kmer_array_length(mask.shape[0]), dtype=np.uint8
    )
    cdef int64 k = kmer_alphabet.k
    cdef int64[:] spacing

    if kmer_alphabet.spacing is None:
        # Continuous k-mers
        for i in range(kmer_mask.shape[0]):
            is_retained = True
            # If any sequence position of this k-mer is removed,
            # discard this k-mer position
            for j in range(i, i + k):
                if mask[j]:
                    is_retained = False
                    break
            kmer_mask[i] = is_retained

    else:
        # Spaced k-mers
        spacing = kmer_alphabet.spacing
        for i in range(kmer_mask.shape[0]):
            is_retained = True
            # If any informative sequence position of this k-mer is
            # removed, discard this k-mer position
            for j in range(spacing.shape[0]):
                # The spacing offsets are relative to the start
                # position 'i' of the k-mer, not to the index 'j' of
                # the informative position
                if mask[i + spacing[j]]:
                    is_retained = False
                    break
            kmer_mask[i] = is_retained

    return np.asarray(kmer_mask)
3260
+
3261
+
3262
+
3263
+ def _check_position_shape(position_arrays, kmer_arrays):
3264
+ """
3265
+ Check if the given lists and each element have the same length
3266
+ and raise an exception, if this is not teh case.
3267
+ """
3268
+ if len(position_arrays) != len(kmer_arrays):
3269
+ raise IndexError(
3270
+ f"{len(position_arrays)} position arrays "
3271
+ f"for {len(kmer_arrays)} k-mer arrays were given"
3272
+ )
3273
+ for i, (positions, kmers) in enumerate(
3274
+ zip(position_arrays, kmer_arrays)
3275
+ ):
3276
+ if len(positions) != len(kmers):
3277
+ raise IndexError(
3278
+ f"{len(positions)} positions"
3279
+ f"for {len(kmers)} k-mers were given at index {i}"
3280
+ )
3281
+
3282
+
3283
+ def _check_same_kmer_alphabet(tables):
3284
+ """
3285
+ Check if the *k-mer* alphabets of all tables are equal.
3286
+ """
3287
+ ref_alph = tables[0].kmer_alphabet
3288
+ for alph in (table.kmer_alphabet for table in tables):
3289
+ if not alph == ref_alph:
3290
+ raise ValueError(
3291
+ "The *k-mer* alphabets of the tables are not equal "
3292
+ "to each other"
3293
+ )
3294
+
3295
+
3296
+ def _check_same_buckets(tables):
3297
+ """
3298
+ Check if the bucket sizes of all tables are equal.
3299
+ """
3300
+ ref_n_buckets = tables[0].n_buckets
3301
+ for buckets in (table.n_buckets for table in tables):
3302
+ if not buckets == ref_n_buckets:
3303
+ raise ValueError(
3304
+ "The number of buckets of the tables are not equal "
3305
+ "to each other"
3306
+ )
3307
+
3308
+
3309
+ def _check_kmer_bounds(kmers, kmer_alphabet):
3310
+ """
3311
+ Check k-mer codes for out-of-bounds values.
3312
+ """
3313
+ if np.any(kmers < 0) or np.any(kmers >= len(kmer_alphabet)):
3314
+ raise AlphabetError(
3315
+ "Given k-mer codes do not represent valid k-mers"
3316
+ )
3317
+
3318
+
3319
+ def _check_multiple_kmer_bounds(kmer_arrays, kmer_alphabet):
3320
+ """
3321
+ Check given arrays of k-mer codes for out-of-bounds values.
3322
+ """
3323
+ for kmers in kmer_arrays:
3324
+ if np.any(kmers < 0) or np.any(kmers >= len(kmer_alphabet)):
3325
+ raise AlphabetError(
3326
+ "Given k-mer codes do not represent valid k-mers"
3327
+ )
3328
+
3329
+
3330
def _check_kmer_alphabet(kmer_alph):
    """
    Raise a ``TypeError``, if the given object is not a
    ``KmerAlphabet``.
    """
    if isinstance(kmer_alph, KmerAlphabet):
        return
    actual_type = type(kmer_alph).__name__
    raise TypeError(
        f"Got {actual_type}, but KmerAlphabet was expected"
    )
3339
+
3340
+
3341
+ def _compute_masks(masks, sequences):
3342
+ """
3343
+ Check, if the number of masks match the number of sequences, and
3344
+ raise an exception if this is not the case.
3345
+ If no masks are given, create a respective list of ``None`` values.
3346
+ """
3347
+ if masks is None:
3348
+ return [None] * len(sequences)
3349
+ else:
3350
+ if len(masks) != len(sequences):
3351
+ raise IndexError(
3352
+ f"{len(masks)} masks were given, "
3353
+ f"but there are {len(sequences)} sequences"
3354
+ )
3355
+ return masks
3356
+
3357
+
3358
+ def _compute_ref_ids(ref_ids, sequences):
3359
+ """
3360
+ Check, if the number of reference IDs match the number of
3361
+ sequences, and raise an exception, if this is not the case.
3362
+ If no reference IDs are given, create an array that simply
3363
+ enumerates.
3364
+ """
3365
+ if ref_ids is None:
3366
+ return np.arange(len(sequences))
3367
+ else:
3368
+ if len(ref_ids) != len(sequences):
3369
+ raise IndexError(
3370
+ f"{len(ref_ids)} reference IDs were given, "
3371
+ f"but there are {len(sequences)} sequences"
3372
+ )
3373
+ return ref_ids
3374
+
3375
+
3376
+ def _compute_alphabet(given_alphabet, sequence_alphabets):
3377
+ """
3378
+ If `given_alphabet` is None, find a common alphabet among
3379
+ `sequence_alphabets` and raise an exception if this is not possible.
3380
+ Otherwise just check compatibility of alphabets.
3381
+ """
3382
+ if given_alphabet is None:
3383
+ alphabet = common_alphabet(sequence_alphabets)
3384
+ if alphabet is None:
3385
+ raise ValueError(
3386
+ "There is no common alphabet that extends all alphabets"
3387
+ )
3388
+ return alphabet
3389
+ else:
3390
+ for alph in sequence_alphabets:
3391
+ if not given_alphabet.extends(alph):
3392
+ raise ValueError(
3393
+ "The given alphabet is incompatible with a least one "
3394
+ "alphabet of the given sequences"
3395
+ )
3396
+ return given_alphabet
3397
+
3398
+
3399
+ def _to_string(table):
3400
+ lines = []
3401
+ for kmer in table.get_kmers():
3402
+ symbols = table.kmer_alphabet.decode(kmer)
3403
+ if isinstance(table.alphabet, LetterAlphabet):
3404
+ symbols = "".join(symbols)
3405
+ else:
3406
+ symbols = str(tuple(symbols))
3407
+ line = symbols + ": " + ", ".join(
3408
+ [str((ref_id.item(), pos.item())) for ref_id, pos in table[kmer]]
3409
+ )
3410
+ lines.append(line)
3411
+ return "\n".join(lines)