biotite 0.41.1__cp312-cp312-macosx_10_16_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (340) hide show
  1. biotite/__init__.py +19 -0
  2. biotite/application/__init__.py +43 -0
  3. biotite/application/application.py +265 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +505 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +83 -0
  8. biotite/application/blast/webapp.py +421 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +238 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +152 -0
  13. biotite/application/localapp.py +306 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +122 -0
  16. biotite/application/msaapp.py +374 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +254 -0
  19. biotite/application/muscle/app5.py +171 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +456 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +222 -0
  24. biotite/application/util.py +59 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +304 -0
  27. biotite/application/viennarna/rnafold.py +269 -0
  28. biotite/application/viennarna/rnaplot.py +187 -0
  29. biotite/application/viennarna/util.py +72 -0
  30. biotite/application/webapp.py +77 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/entrez/__init__.py +15 -0
  34. biotite/database/entrez/check.py +61 -0
  35. biotite/database/entrez/dbnames.py +89 -0
  36. biotite/database/entrez/download.py +223 -0
  37. biotite/database/entrez/key.py +44 -0
  38. biotite/database/entrez/query.py +223 -0
  39. biotite/database/error.py +15 -0
  40. biotite/database/pubchem/__init__.py +21 -0
  41. biotite/database/pubchem/download.py +260 -0
  42. biotite/database/pubchem/error.py +20 -0
  43. biotite/database/pubchem/query.py +827 -0
  44. biotite/database/pubchem/throttle.py +99 -0
  45. biotite/database/rcsb/__init__.py +13 -0
  46. biotite/database/rcsb/download.py +167 -0
  47. biotite/database/rcsb/query.py +959 -0
  48. biotite/database/uniprot/__init__.py +13 -0
  49. biotite/database/uniprot/check.py +32 -0
  50. biotite/database/uniprot/download.py +134 -0
  51. biotite/database/uniprot/query.py +209 -0
  52. biotite/file.py +251 -0
  53. biotite/sequence/__init__.py +73 -0
  54. biotite/sequence/align/__init__.py +49 -0
  55. biotite/sequence/align/alignment.py +658 -0
  56. biotite/sequence/align/banded.cpython-312-darwin.so +0 -0
  57. biotite/sequence/align/banded.pyx +652 -0
  58. biotite/sequence/align/buckets.py +69 -0
  59. biotite/sequence/align/cigar.py +434 -0
  60. biotite/sequence/align/kmeralphabet.cpython-312-darwin.so +0 -0
  61. biotite/sequence/align/kmeralphabet.pyx +574 -0
  62. biotite/sequence/align/kmersimilarity.cpython-312-darwin.so +0 -0
  63. biotite/sequence/align/kmersimilarity.pyx +233 -0
  64. biotite/sequence/align/kmertable.cpython-312-darwin.so +0 -0
  65. biotite/sequence/align/kmertable.pyx +3400 -0
  66. biotite/sequence/align/localgapped.cpython-312-darwin.so +0 -0
  67. biotite/sequence/align/localgapped.pyx +892 -0
  68. biotite/sequence/align/localungapped.cpython-312-darwin.so +0 -0
  69. biotite/sequence/align/localungapped.pyx +279 -0
  70. biotite/sequence/align/matrix.py +405 -0
  71. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  72. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  73. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  74. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  75. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  76. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  77. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  78. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  79. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  80. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  81. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  82. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  83. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  84. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  85. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  86. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  87. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  88. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  89. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  93. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  94. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  95. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  96. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  97. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  98. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  99. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  100. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  101. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  102. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  103. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  104. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  105. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  106. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  107. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  108. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  109. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  110. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  111. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  112. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  113. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  114. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  115. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  116. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  117. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  118. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  119. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  120. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  121. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  122. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  154. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  155. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  156. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  157. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  158. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  159. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  160. biotite/sequence/align/multiple.cpython-312-darwin.so +0 -0
  161. biotite/sequence/align/multiple.pyx +620 -0
  162. biotite/sequence/align/pairwise.cpython-312-darwin.so +0 -0
  163. biotite/sequence/align/pairwise.pyx +587 -0
  164. biotite/sequence/align/permutation.cpython-312-darwin.so +0 -0
  165. biotite/sequence/align/permutation.pyx +305 -0
  166. biotite/sequence/align/primes.txt +821 -0
  167. biotite/sequence/align/selector.cpython-312-darwin.so +0 -0
  168. biotite/sequence/align/selector.pyx +956 -0
  169. biotite/sequence/align/statistics.py +265 -0
  170. biotite/sequence/align/tracetable.cpython-312-darwin.so +0 -0
  171. biotite/sequence/align/tracetable.pxd +64 -0
  172. biotite/sequence/align/tracetable.pyx +370 -0
  173. biotite/sequence/alphabet.py +566 -0
  174. biotite/sequence/annotation.py +829 -0
  175. biotite/sequence/codec.cpython-312-darwin.so +0 -0
  176. biotite/sequence/codec.pyx +155 -0
  177. biotite/sequence/codon.py +466 -0
  178. biotite/sequence/codon_tables.txt +202 -0
  179. biotite/sequence/graphics/__init__.py +33 -0
  180. biotite/sequence/graphics/alignment.py +1034 -0
  181. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  182. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  183. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  184. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  185. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  186. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  187. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  188. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  189. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  190. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  191. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  192. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  193. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  194. biotite/sequence/graphics/color_schemes/pb_flower.json +39 -0
  195. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  196. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  197. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  198. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  199. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  200. biotite/sequence/graphics/colorschemes.py +139 -0
  201. biotite/sequence/graphics/dendrogram.py +184 -0
  202. biotite/sequence/graphics/features.py +510 -0
  203. biotite/sequence/graphics/logo.py +110 -0
  204. biotite/sequence/graphics/plasmid.py +661 -0
  205. biotite/sequence/io/__init__.py +12 -0
  206. biotite/sequence/io/fasta/__init__.py +22 -0
  207. biotite/sequence/io/fasta/convert.py +273 -0
  208. biotite/sequence/io/fasta/file.py +278 -0
  209. biotite/sequence/io/fastq/__init__.py +19 -0
  210. biotite/sequence/io/fastq/convert.py +120 -0
  211. biotite/sequence/io/fastq/file.py +551 -0
  212. biotite/sequence/io/genbank/__init__.py +17 -0
  213. biotite/sequence/io/genbank/annotation.py +277 -0
  214. biotite/sequence/io/genbank/file.py +575 -0
  215. biotite/sequence/io/genbank/metadata.py +324 -0
  216. biotite/sequence/io/genbank/sequence.py +172 -0
  217. biotite/sequence/io/general.py +192 -0
  218. biotite/sequence/io/gff/__init__.py +26 -0
  219. biotite/sequence/io/gff/convert.py +133 -0
  220. biotite/sequence/io/gff/file.py +434 -0
  221. biotite/sequence/phylo/__init__.py +36 -0
  222. biotite/sequence/phylo/nj.cpython-312-darwin.so +0 -0
  223. biotite/sequence/phylo/nj.pyx +221 -0
  224. biotite/sequence/phylo/tree.cpython-312-darwin.so +0 -0
  225. biotite/sequence/phylo/tree.pyx +1169 -0
  226. biotite/sequence/phylo/upgma.cpython-312-darwin.so +0 -0
  227. biotite/sequence/phylo/upgma.pyx +164 -0
  228. biotite/sequence/profile.py +456 -0
  229. biotite/sequence/search.py +116 -0
  230. biotite/sequence/seqtypes.py +556 -0
  231. biotite/sequence/sequence.py +374 -0
  232. biotite/structure/__init__.py +132 -0
  233. biotite/structure/atoms.py +1455 -0
  234. biotite/structure/basepairs.py +1415 -0
  235. biotite/structure/bonds.cpython-312-darwin.so +0 -0
  236. biotite/structure/bonds.pyx +1933 -0
  237. biotite/structure/box.py +592 -0
  238. biotite/structure/celllist.cpython-312-darwin.so +0 -0
  239. biotite/structure/celllist.pyx +849 -0
  240. biotite/structure/chains.py +298 -0
  241. biotite/structure/charges.cpython-312-darwin.so +0 -0
  242. biotite/structure/charges.pyx +520 -0
  243. biotite/structure/compare.py +274 -0
  244. biotite/structure/density.py +114 -0
  245. biotite/structure/dotbracket.py +216 -0
  246. biotite/structure/error.py +31 -0
  247. biotite/structure/filter.py +585 -0
  248. biotite/structure/geometry.py +697 -0
  249. biotite/structure/graphics/__init__.py +13 -0
  250. biotite/structure/graphics/atoms.py +226 -0
  251. biotite/structure/graphics/rna.py +282 -0
  252. biotite/structure/hbond.py +409 -0
  253. biotite/structure/info/__init__.py +25 -0
  254. biotite/structure/info/atom_masses.json +121 -0
  255. biotite/structure/info/atoms.py +82 -0
  256. biotite/structure/info/bonds.py +145 -0
  257. biotite/structure/info/ccd/README.rst +8 -0
  258. biotite/structure/info/ccd/amino_acids.txt +1663 -0
  259. biotite/structure/info/ccd/carbohydrates.txt +1135 -0
  260. biotite/structure/info/ccd/components.bcif +0 -0
  261. biotite/structure/info/ccd/nucleotides.txt +798 -0
  262. biotite/structure/info/ccd.py +95 -0
  263. biotite/structure/info/groups.py +90 -0
  264. biotite/structure/info/masses.py +123 -0
  265. biotite/structure/info/misc.py +144 -0
  266. biotite/structure/info/radii.py +197 -0
  267. biotite/structure/info/standardize.py +196 -0
  268. biotite/structure/integrity.py +268 -0
  269. biotite/structure/io/__init__.py +30 -0
  270. biotite/structure/io/ctab.py +72 -0
  271. biotite/structure/io/dcd/__init__.py +13 -0
  272. biotite/structure/io/dcd/file.py +65 -0
  273. biotite/structure/io/general.py +257 -0
  274. biotite/structure/io/gro/__init__.py +14 -0
  275. biotite/structure/io/gro/file.py +343 -0
  276. biotite/structure/io/mmtf/__init__.py +21 -0
  277. biotite/structure/io/mmtf/assembly.py +214 -0
  278. biotite/structure/io/mmtf/convertarray.cpython-312-darwin.so +0 -0
  279. biotite/structure/io/mmtf/convertarray.pyx +341 -0
  280. biotite/structure/io/mmtf/convertfile.cpython-312-darwin.so +0 -0
  281. biotite/structure/io/mmtf/convertfile.pyx +501 -0
  282. biotite/structure/io/mmtf/decode.cpython-312-darwin.so +0 -0
  283. biotite/structure/io/mmtf/decode.pyx +152 -0
  284. biotite/structure/io/mmtf/encode.cpython-312-darwin.so +0 -0
  285. biotite/structure/io/mmtf/encode.pyx +183 -0
  286. biotite/structure/io/mmtf/file.py +233 -0
  287. biotite/structure/io/mol/__init__.py +20 -0
  288. biotite/structure/io/mol/convert.py +115 -0
  289. biotite/structure/io/mol/ctab.py +414 -0
  290. biotite/structure/io/mol/header.py +116 -0
  291. biotite/structure/io/mol/mol.py +193 -0
  292. biotite/structure/io/mol/sdf.py +916 -0
  293. biotite/structure/io/netcdf/__init__.py +13 -0
  294. biotite/structure/io/netcdf/file.py +63 -0
  295. biotite/structure/io/npz/__init__.py +20 -0
  296. biotite/structure/io/npz/file.py +152 -0
  297. biotite/structure/io/pdb/__init__.py +20 -0
  298. biotite/structure/io/pdb/convert.py +293 -0
  299. biotite/structure/io/pdb/file.py +1240 -0
  300. biotite/structure/io/pdb/hybrid36.cpython-312-darwin.so +0 -0
  301. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  302. biotite/structure/io/pdbqt/__init__.py +15 -0
  303. biotite/structure/io/pdbqt/convert.py +107 -0
  304. biotite/structure/io/pdbqt/file.py +640 -0
  305. biotite/structure/io/pdbx/__init__.py +23 -0
  306. biotite/structure/io/pdbx/bcif.py +648 -0
  307. biotite/structure/io/pdbx/cif.py +1032 -0
  308. biotite/structure/io/pdbx/component.py +246 -0
  309. biotite/structure/io/pdbx/convert.py +1597 -0
  310. biotite/structure/io/pdbx/encoding.cpython-312-darwin.so +0 -0
  311. biotite/structure/io/pdbx/encoding.pyx +950 -0
  312. biotite/structure/io/pdbx/legacy.py +267 -0
  313. biotite/structure/io/tng/__init__.py +13 -0
  314. biotite/structure/io/tng/file.py +46 -0
  315. biotite/structure/io/trajfile.py +710 -0
  316. biotite/structure/io/trr/__init__.py +13 -0
  317. biotite/structure/io/trr/file.py +46 -0
  318. biotite/structure/io/xtc/__init__.py +13 -0
  319. biotite/structure/io/xtc/file.py +46 -0
  320. biotite/structure/mechanics.py +75 -0
  321. biotite/structure/molecules.py +353 -0
  322. biotite/structure/pseudoknots.py +642 -0
  323. biotite/structure/rdf.py +243 -0
  324. biotite/structure/repair.py +253 -0
  325. biotite/structure/residues.py +562 -0
  326. biotite/structure/resutil.py +178 -0
  327. biotite/structure/sasa.cpython-312-darwin.so +0 -0
  328. biotite/structure/sasa.pyx +322 -0
  329. biotite/structure/sequence.py +112 -0
  330. biotite/structure/sse.py +327 -0
  331. biotite/structure/superimpose.py +727 -0
  332. biotite/structure/transform.py +504 -0
  333. biotite/structure/util.py +98 -0
  334. biotite/temp.py +86 -0
  335. biotite/version.py +16 -0
  336. biotite/visualize.py +251 -0
  337. biotite-0.41.1.dist-info/METADATA +187 -0
  338. biotite-0.41.1.dist-info/RECORD +340 -0
  339. biotite-0.41.1.dist-info/WHEEL +4 -0
  340. biotite-0.41.1.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,956 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence.align"
6
+ __author__ = "Patrick Kunzmann"
7
+ __all__ = ["MinimizerSelector", "SyncmerSelector", "CachedSyncmerSelector",
8
+ "MincodeSelector"]
9
+
10
+ cimport cython
11
+ cimport numpy as np
12
+
13
+ from numbers import Integral
14
+ import numpy as np
15
+ from .kmeralphabet import KmerAlphabet
16
+ from ..alphabet import AlphabetError
17
+
18
+
19
+ ctypedef np.int64_t int64
20
+ ctypedef np.uint32_t uint32
21
+
22
+
23
+ # Obtained from 'np.iinfo(np.int64).max'
24
+ DEF MAX_INT_64 = 9223372036854775807
25
+
26
+
27
+ class MinimizerSelector:
28
+ """
29
+ MinimizerSelector(kmer_alphabet, window, permutation=None)
30
+
31
+ Selects the *minimizers* in sequences.
32
+
33
+ In a rolling window of *k-mers*, the minimizer is defined as the
34
+ *k-mer* with the minimum *k-mer* code :footcite:`Roberts2004`.
35
+ If the same minimum *k-mer* appears twice in a window, the leftmost
36
+ *k-mer* is selected as minimizer.
37
+
38
+ Parameters
39
+ ----------
40
+ kmer_alphabet : KmerAlphabet
41
+ The *k-mer* alphabet that defines the *k-mer* size and the type
42
+ of sequence this :class:`MinimizerSelector` can be applied on.
43
+ window : int
44
+ The size of the rolling window, where the minimizers are
45
+ searched in.
46
+ In other words this is the number of *k-mers* per window.
47
+ The window size must be at least 2.
48
+ permutation : Permutation
49
+ If set, the *k-mer* order is permuted, i.e.
50
+ the minimizer is chosen based on the ordering of the sort keys
51
+ from :meth:`Permutation.permute()`.
52
+ By default, the standard order of the :class:`KmerAlphabet` is
53
+ used.
54
+ This standard order is often the lexicographical order, which is
55
+ known to yield suboptimal *density* in many cases
56
+ :footcite:`Roberts2004`.
57
+
58
+ Attributes
59
+ ----------
60
+ kmer_alphabet : KmerAlphabet
61
+ The *k-mer* alphabet.
62
+ window : int
63
+ The window size.
64
+ permutation : Permutation
65
+ The permutation.
66
+
67
+ Notes
68
+ -----
69
+ For minimizer computation a fast algorithm :footcite:`VanHerk1992`
70
+ is used, whose runtime scales linearly with the length of the
71
+ sequence and is constant with regard to the size of the rolling
72
+ window.
73
+
74
+ References
75
+ ----------
76
+
77
+ .. footbibliography::
78
+
79
+ Examples
80
+ --------
81
+
82
+ The *k-mer* decomposition of a sequence can yield a high number of
83
+ *k-mers*:
84
+
85
+ >>> sequence1 = ProteinSequence("THIS*IS*A*SEQVENCE")
86
+ >>> kmer_alph = KmerAlphabet(sequence1.alphabet, k=3)
87
+ >>> all_kmers = kmer_alph.create_kmers(sequence1.code)
88
+ >>> print(all_kmers)
89
+ [ 9367 3639 4415 9199 13431 4415 9192 13271 567 13611 8725 2057
90
+ 7899 9875 1993 6363]
91
+ >>> print(["".join(kmer_alph.decode(kmer)) for kmer in all_kmers])
92
+ ['THI', 'HIS', 'IS*', 'S*I', '*IS', 'IS*', 'S*A', '*A*', 'A*S', '*SE', 'SEQ', 'EQV', 'QVE', 'VEN', 'ENC', 'NCE']
93
+
94
+ Minimizers can be used to reduce the number of *k-mers* by selecting
95
+ only the minimum *k-mer* in each window *w*:
96
+
97
+ >>> minimizer = MinimizerSelector(kmer_alph, window=4)
98
+ >>> minimizer_pos, minimizers = minimizer.select(sequence1)
99
+ >>> print(minimizer_pos)
100
+ [ 1 2 5 8 11 14]
101
+ >>> print(minimizers)
102
+ [3639 4415 4415 567 2057 1993]
103
+ >>> print(["".join(kmer_alph.decode(kmer)) for kmer in minimizers])
104
+ ['HIS', 'IS*', 'IS*', 'A*S', 'EQV', 'ENC']
105
+
106
+ Although this approach reduces the number of *k-mers*, minimizers
107
+ are still guaranteed to match minimizers in another sequence, if
108
+ they share an equal subsequence of at least length *w + k - 1*:
109
+
110
+ >>> sequence2 = ProteinSequence("ANQTHER*SEQVENCE")
111
+ >>> other_minimizer_pos, other_minimizers = minimizer.select(sequence2)
112
+ >>> print(["".join(kmer_alph.decode(kmer)) for kmer in other_minimizers])
113
+ ['ANQ', 'HER', 'ER*', 'EQV', 'ENC']
114
+ >>> common_minimizers = set.intersection(set(minimizers), set(other_minimizers))
115
+ >>> print(["".join(kmer_alph.decode(kmer)) for kmer in common_minimizers])
116
+ ['EQV', 'ENC']
117
+ """
118
+
119
+ def __init__(self, kmer_alphabet, window, permutation=None):
120
+ if window < 2:
121
+ raise ValueError("Window size must be at least 2")
122
+ self._window = window
123
+ self._kmer_alph = kmer_alphabet
124
+ self._permutation = permutation
125
+
126
+
127
+ @property
128
+ def kmer_alphabet(self):
129
+ return self._kmer_alph
130
+
131
+ @property
132
+ def window(self):
133
+ return self._window
134
+
135
+ @property
136
+ def permutation(self):
137
+ return self._permutation
138
+
139
+
140
+ def select(self, sequence, bint alphabet_check=True):
141
+ """
142
+ select(sequence, alphabet_check=True)
143
+
144
+ Obtain all overlapping *k-mers* from a sequence and select
145
+ the minimizers from them.
146
+
147
+ Parameters
148
+ ----------
149
+ sequence : Sequence
150
+ The sequence to find the minimizers in.
151
+ Must be compatible with the given `kmer_alphabet`
152
+ alphabet_check : bool, optional
153
+ If set to false, the compatibility between the alphabet
154
+ of the sequence and the alphabet of the
155
+ :class:`MinimizerSelector`
156
+ is not checked to gain additional performance.
157
+
158
+ Returns
159
+ -------
160
+ minimizer_indices : ndarray, dtype=np.uint32
161
+ The sequence indices where the minimizer *k-mers* start.
162
+ minimizers : ndarray, dtype=np.int64
163
+ The *k-mers* that are the selected minimizers, returned as
164
+ *k-mer* code.
165
+
166
+ Notes
167
+ -----
168
+ Duplicate minimizers are omitted, i.e. if two windows have the
169
+ same minimizer position, the return values contain this
170
+ minimizer only once.
171
+ """
172
+ if alphabet_check:
173
+ if not self._kmer_alph.base_alphabet.extends(sequence.alphabet):
174
+ raise ValueError(
175
+ "The sequence's alphabet does not fit the k-mer alphabet"
176
+ )
177
+ kmers = self._kmer_alph.create_kmers(sequence.code)
178
+ return self.select_from_kmers(kmers)
179
+
180
+
181
+ def select_from_kmers(self, kmers):
182
+ """
183
+ select_from_kmers(kmers)
184
+
185
+ Select minimizers for the given overlapping *k-mers*.
186
+
187
+ Parameters
188
+ ----------
189
+ kmers : ndarray, dtype=np.int64
190
+ The *k-mer* codes representing the sequence to find the
191
+ minimizers in.
192
+ The *k-mer* codes correspond to the *k-mers* encoded by the
193
+ given `kmer_alphabet`.
194
+
195
+ Returns
196
+ -------
197
+ minimizer_indices : ndarray, dtype=np.uint32
198
+ The indices in the input *k-mer* sequence where a minimizer
199
+ appears.
200
+ minimizers : ndarray, dtype=np.int64
201
+ The corresponding *k-mers* codes of the minimizers.
202
+
203
+ Notes
204
+ -----
205
+ Duplicate minimizers are omitted, i.e. if two windows have the
206
+ same minimizer position, the return values contain this
207
+ minimizer only once.
208
+ """
209
+ if self._permutation is None:
210
+ ordering = kmers
211
+ else:
212
+ ordering = self._permutation.permute(kmers)
213
+ if len(ordering) != len(kmers):
214
+ raise IndexError(
215
+ f"The Permutation is defective, it gave {len(ordering)} "
216
+ f"sort keys for {len(kmers)} k-mers"
217
+ )
218
+
219
+ if len(kmers) < self._window:
220
+ raise ValueError(
221
+ "The number of k-mers is smaller than the window size"
222
+ )
223
+ return _minimize(
224
+ kmers.astype(np.int64, copy=False),
225
+ ordering.astype(np.int64, copy=False),
226
+ self._window,
227
+ include_duplicates=False
228
+ )
229
+
230
+
231
+ class SyncmerSelector:
232
+ """
233
+ SyncmerSelector(alphabet, k, s, permutation=None, offset=(0,))
234
+
235
+ Selects the *syncmers* in sequences.
236
+
237
+ Let the *s-mers* be all overlapping substrings of length *s* in a
238
+ *k-mer*.
239
+ A *k-mer* is a syncmer, if its minimum *s-mer* is at one of the
240
+ given offset positions :footcite:`Edgar2021`.
241
+ If the same minimum *s-mer* appears twice in a *k-mer*, the position
242
+ of the leftmost *s-mer* is taken.
243
+
244
+ Parameters
245
+ ----------
246
+ alphabet : Alphabet
247
+ The base alphabet the *k-mers* and *s-mers* are created from.
248
+ Defines the type of sequence this :class:`SyncmerSelector` can
249
+ be applied on.
250
+ k, s : int
251
+ The length of the *k-mers* and *s-mers*, respectively.
252
+ permutation : Permutation
253
+ If set, the *s-mer* order is permuted, i.e.
254
+ the minimum *s-mer* is chosen based on the ordering of the sort
255
+ keys from :meth:`Permutation.permute()`.
256
+ This :class:`Permutation` must be compatible with *s*
257
+ (not with *k*).
258
+ By default, the standard order of the :class:`KmerAlphabet` is
259
+ used.
260
+ This standard order is often the lexicographical order, which is
261
+ known to yield suboptimal *density* in many cases
262
+ :footcite:`Roberts2004`.
263
+ offset : array-like of int
264
+ If the minimum *s-mer* in a *k-mer* is at one of the given
265
+ offset positions, that *k-mer* is a syncmer.
266
+ Negative values indicate the position from the end of the
267
+ *k-mer*.
268
+ By default, the minimum position needs to be at the start of the
269
+ *k-mer*, which is termed *open syncmer*.
270
+
271
+ Attributes
272
+ ----------
273
+ alphabet : Alphabet
274
+ The base alphabet.
275
+ kmer_alphabet, smer_alphabet : KmerAlphabet
276
+ The :class:`KmerAlphabet` for *k* and *s*, respectively.
277
+ permutation : Permutation
278
+ The permutation.
279
+
280
+ See also
281
+ --------
282
+ CachedSyncmerSelector
283
+ A cached variant with faster syncmer selection at the cost of
284
+ increased initialization time.
285
+
286
+ Notes
287
+ -----
288
+ For syncmer computation from a sequence a fast algorithm
289
+ :footcite:`VanHerk1992` is used, whose runtime scales linearly with
290
+ the length of the sequence and is constant with regard to *k*.
291
+
292
+ References
293
+ ----------
294
+
295
+ .. footbibliography::
296
+
297
+ Examples
298
+ --------
299
+
300
+ This example is taken from :footcite:`Edgar2021`:
301
+ The subset of *k-mers* that are *closed syncmers* are selected.
302
+ Closed syncmers are syncmers, where the minimum *s-mer* is in the
303
+ first or last position of the *k-mer*.
304
+ *s-mers* are ordered lexicographically in this example.
305
+
306
+ >>> sequence = NucleotideSequence("GGCAAGTGACA")
307
+ >>> kmer_alph = KmerAlphabet(sequence.alphabet, k=5)
308
+ >>> kmers = kmer_alph.create_kmers(sequence.code)
309
+ >>> closed_syncmer_selector = SyncmerSelector(
310
+ ... sequence.alphabet,
311
+ ... # The same k as in the KmerAlphabet
312
+ ... k=5,
313
+ ... s=2,
314
+ ... # The offset determines that closed syncmers will be selected
315
+ ... offset=(0, -1)
316
+ ... )
317
+ >>> syncmer_pos, syncmers = closed_syncmer_selector.select(sequence)
318
+ >>> # Print all k-mers in the sequence and mark syncmers with a '*'
319
+ >>> for pos, kmer in enumerate(kmer_alph.create_kmers(sequence.code)):
320
+ ... if pos in syncmer_pos:
321
+ ... print("* " + "".join(kmer_alph.decode(kmer)))
322
+ ... else:
323
+ ... print(" " + "".join(kmer_alph.decode(kmer)))
324
+ * GGCAA
325
+ GCAAG
326
+ CAAGT
327
+ * AAGTG
328
+ * AGTGA
329
+ * GTGAC
330
+ TGACA
331
+ """
332
+
333
+ def __init__(self, alphabet, k, s, permutation=None, offset=(0,)):
334
+ if not s < k:
335
+ raise ValueError("s must be smaller than k")
336
+ self._window = k - s + 1
337
+ self._alphabet = alphabet
338
+ self._kmer_alph = KmerAlphabet(alphabet, k)
339
+ self._smer_alph = KmerAlphabet(alphabet, s)
340
+
341
+ self._permutation = permutation
342
+
343
+ self._offset = np.asarray(offset, dtype=np.int64)
344
+ # Wrap around negative indices
345
+ self._offset = np.where(
346
+ self._offset < 0,
347
+ self._window + self._offset,
348
+ self._offset
349
+ )
350
+ if (self._offset >= self._window).any() or (self._offset < 0).any():
351
+ raise IndexError(
352
+ f"Offset is out of window range"
353
+ )
354
+ if len(np.unique(self._offset)) != len(self._offset):
355
+ raise ValueError("Offset must contain unique values")
356
+
357
+
358
+ @property
359
+ def alphabet(self):
360
+ return self._alphabet
361
+
362
+ @property
363
+ def kmer_alphabet(self):
364
+ return self._kmer_alph
365
+
366
+ @property
367
+ def smer_alphabet(self):
368
+ return self._smer_alph
369
+
370
+ @property
371
+ def permutation(self):
372
+ return self._permutation
373
+
374
+
375
+ def select(self, sequence, bint alphabet_check=True):
376
+ """
377
+ select(sequence, alphabet_check=True)
378
+
379
+ Obtain all overlapping *k-mers* from a sequence and select
380
+ the syncmers from them.
381
+
382
+ Parameters
383
+ ----------
384
+ sequence : Sequence
385
+ The sequence to find the syncmers in.
386
+ Must be compatible with the given `kmer_alphabet`
387
+ alphabet_check : bool, optional
388
+ If set to false, the compatibility between the alphabet
389
+ of the sequence and the alphabet of the
390
+ :class:`SyncmerSelector`
391
+ is not checked to gain additional performance.
392
+
393
+ Returns
394
+ -------
395
+ syncmer_indices : ndarray, dtype=np.uint32
396
+ The sequence indices where the syncmers start.
397
+ syncmers : ndarray, dtype=np.int64
398
+ The corresponding *k-mer* codes of the syncmers.
399
+ """
400
+ if alphabet_check:
401
+ if not self._alphabet.extends(sequence.alphabet):
402
+ raise ValueError(
403
+ "The sequence's alphabet does not fit "
404
+ "the selector's alphabet"
405
+ )
406
+ kmers = self._kmer_alph.create_kmers(sequence.code)
407
+ smers = self._smer_alph.create_kmers(sequence.code)
408
+
409
+ if self._permutation is None:
410
+ ordering = smers
411
+ else:
412
+ ordering = self._permutation.permute(smers)
413
+ if len(ordering) != len(smers):
414
+ raise IndexError(
415
+ f"The Permutation is defective, it gave {len(ordering)} "
416
+ f"sort keys for {len(smers)} s-mers"
417
+ )
418
+
419
+ # The aboslute position of the minimum s-mer for each k-mer
420
+ min_pos, _ = _minimize(
421
+ smers,
422
+ ordering.astype(np.int64, copy=False),
423
+ self._window,
424
+ include_duplicates=True
425
+ )
426
+ # The position of the minimum s-mer relative to the start
427
+ # of the k-mer
428
+ relative_min_pos = min_pos - np.arange(len(kmers))
429
+ syncmer_pos = self._filter_syncmer_pos(relative_min_pos)
430
+ return syncmer_pos, kmers[syncmer_pos]
431
+
432
+
433
def select_from_kmers(self, kmers):
    """
    select_from_kmers(kmers)

    Select syncmers for the given *k-mers*.

    The *k-mers* are not required to overlap.

    Parameters
    ----------
    kmers : ndarray, dtype=np.int64
        The *k-mer* codes to select the syncmers from.

    Returns
    -------
    syncmer_indices : ndarray of int
        The indices in `kmers` that point to syncmers.
    syncmers : ndarray, dtype=np.int64
        The corresponding *k-mer* codes of the syncmers.

    Raises
    ------
    IndexError
        If the permutation returns a different number of sort keys
        than *s-mers* were given to it.

    Notes
    -----
    Since for *s-mer* creation, the *k-mers* need to be converted
    back to symbol codes again and since the input *k-mers* are not
    required to overlap, calling :meth:`select()` is much faster.
    However, :meth:`select()` is only available for
    :class:`Sequence` objects.
    """
    cdef int64 i

    # One row of symbol codes per input k-mer
    kmer_symbols = self._kmer_alph.split(kmers)

    cdef int64[:] min_pos = np.zeros(len(kmer_symbols), dtype=np.int64)
    for i in range(kmer_symbols.shape[0]):
        smers = self._smer_alph.create_kmers(kmer_symbols[i])
        # Without a permutation the s-mer codes are the sort keys
        sort_keys = smers
        if self._permutation is not None:
            sort_keys = self._permutation.permute(smers)
            if len(sort_keys) != len(smers):
                raise IndexError(
                    f"The Permutation is defective, it gave {len(sort_keys)} "
                    f"sort keys for {len(smers)} s-mers"
                )
        # Position of the minimum s-mer within this k-mer
        min_pos[i] = np.argmin(sort_keys)

    syncmer_pos = self._filter_syncmer_pos(min_pos)
    return syncmer_pos, kmers[syncmer_pos]
483
+
484
+
485
def _filter_syncmer_pos(self, min_pos):
    """
    Get indices of *k-mers* that are syncmers, based on `min_pos`,
    the position of the minimum *s-mer* in each *k-mer*.
    Syncmers are *k-mers* whose minimum *s-mer* is at (one of)
    the given offset position(s).

    Parameters
    ----------
    min_pos : array-like of int
        For each *k-mer* the position of its minimum *s-mer*,
        relative to the start of the *k-mer*.
        Any object supporting the buffer protocol (e.g. a typed
        memoryview) is accepted.

    Returns
    -------
    syncmer_indices : ndarray of int
        The indices in `min_pos` that belong to syncmers.
        Empty, if the selector has no offsets.
    """
    # Convert explicitly instead of relying on the reflected '=='
    # of NumPy scalars when a memoryview is passed
    min_pos = np.asarray(min_pos)
    # Starting from an all-False mask keeps the result well-defined,
    # even if there is no offset at all
    syncmer_mask = np.zeros(min_pos.shape, dtype=bool)
    for offset in self._offset:
        # For the usual number of offsets, this 'loop'-approach is
        # faster than np.isin()
        syncmer_mask |= min_pos == offset
    return np.where(syncmer_mask)[0]
501
+
502
+
503
class CachedSyncmerSelector(SyncmerSelector):
    """
    CachedSyncmerSelector(alphabet, k, s, permutation=None, offset=(0,))

    Selects the *syncmers* in sequences.

    Fulfills the same purpose as :class:`SyncmerSelector`, but
    precomputes for each possible *k-mer*, whether it is a syncmer,
    at initialization.
    Hence, syncmer selection is faster at the cost of longer
    initialization time.

    Parameters
    ----------
    alphabet : Alphabet
        The base alphabet the *k-mers* and *s-mers* are created from.
        Defines the type of sequence this
        :class:`CachedSyncmerSelector` can be applied on.
    k, s : int
        The length of the *k-mers* and *s-mers*, respectively.
    permutation : Permutation
        If set, the *s-mer* order is permuted, i.e.
        the minimum *s-mer* is chosen based on the ordering of the sort
        keys from :class:`Permutation.permute()`.
        This :class:`Permutation` must be compatible with *s*
        (not with *k*).
        By default, the standard order of the :class:`KmerAlphabet` is
        used.
        This standard order is often the lexicographical order, which is
        known to yield suboptimal *density* in many cases
        :footcite:`Roberts2004`.
    offset : array-like of int
        If the minimum *s-mer* in a *k-mer* is at one of the given
        offset positions, that *k-mer* is a syncmer.
        Negative values indicate the position from the end of the
        *k-mer*.
        By default, the minimum position needs to be at the start of the
        *k-mer*, which is termed *open syncmer*.

    Attributes
    ----------
    alphabet : Alphabet
        The base alphabet.
    kmer_alphabet, smer_alphabet : KmerAlphabet
        The :class:`KmerAlphabet` for *k* and *s*, respectively.
    permutation : Permutation
        The permutation.

    See also
    --------
    SyncmerSelector
        A standard variant for syncmer selection.

    Notes
    -----
    Both the initialization time and memory requirements are
    proportional to the size of the `kmer_alphabet`, i.e. :math:`n^k`.
    Hence, it is advised to use this class only for rather small
    alphabets.

    References
    ----------

    .. footbibliography::

    Examples
    --------

    >>> sequence = NucleotideSequence("GGCAAGTGACA")
    >>> kmer_alph = KmerAlphabet(sequence.alphabet, k=5)
    >>> # The initialization can take quite a long time for large *k-mer* alphabets...
    >>> closed_syncmer_selector = CachedSyncmerSelector(
    ...     sequence.alphabet,
    ...     # The same k as in the KmerAlphabet
    ...     k=5,
    ...     s=2,
    ...     # The offset determines that closed syncmers will be selected
    ...     offset=(0, -1)
    ... )
    >>> # ...but the actual syncmer identification is very fast
    >>> syncmer_pos, syncmers = closed_syncmer_selector.select(sequence)
    >>> print(["".join(kmer_alph.decode(kmer)) for kmer in syncmers])
    ['GGCAA', 'AAGTG', 'AGTGA', 'GTGAC']
    """

    def __init__(self, alphabet, k, s, permutation=None, offset=(0,)):
        super().__init__(alphabet, k, s, permutation, offset)
        # Run the uncached selection of the parent class once on
        # every code of the k-mer alphabet...
        every_kmer = np.arange(len(self.kmer_alphabet))
        cached_hits, _ = super().select_from_kmers(every_kmer)
        # ...and remember the outcome as boolean lookup table
        # that is indexed by the k-mer code
        lookup = np.zeros(len(self.kmer_alphabet), dtype=bool)
        lookup[cached_hits] = True
        self._syncmer_mask = lookup


    def select(self, sequence, bint alphabet_check=True):
        """
        select(sequence, alphabet_check=True)

        Obtain all overlapping *k-mers* from a sequence and select
        the syncmers from them.

        Parameters
        ----------
        sequence : Sequence
            The sequence to find the syncmers in.
            Its alphabet must be compatible with the alphabet of this
            selector.
        alphabet_check : bool, optional
            If set to false, the compatibility between the alphabet
            of the sequence and the alphabet of the
            :class:`CachedSyncmerSelector`
            is not checked to gain additional performance.

        Returns
        -------
        syncmer_indices : ndarray of int
            The sequence indices where the syncmers start.
        syncmers : ndarray
            The corresponding *k-mer* codes of the syncmers.

        Raises
        ------
        ValueError
            If `alphabet_check` is true and the selector's alphabet
            does not extend the alphabet of the sequence.
        """
        if alphabet_check and not self.alphabet.extends(sequence.alphabet):
            raise ValueError(
                "The sequence's alphabet does not fit "
                "the selector's alphabet"
            )
        return self.select_from_kmers(
            self.kmer_alphabet.create_kmers(sequence.code)
        )


    def select_from_kmers(self, kmers):
        """
        select_from_kmers(kmers)

        Select syncmers for the given *k-mers*.

        The *k-mers* are not required to overlap.

        Parameters
        ----------
        kmers : ndarray, dtype=np.int64
            The *k-mer* codes to select the syncmers from.

        Returns
        -------
        syncmer_indices : ndarray of int
            The indices in `kmers` that point to syncmers.
        syncmers : ndarray, dtype=np.int64
            The corresponding *k-mer* codes of the syncmers.
        """
        # Look up the precomputed answer for each k-mer code
        is_syncmer = self._syncmer_mask[kmers]
        syncmer_pos = np.nonzero(is_syncmer)[0]
        return syncmer_pos, kmers[syncmer_pos]
655
+
656
+
657
class MincodeSelector:
    r"""
    MincodeSelector(kmer_alphabet, compression, permutation=None)

    Selects the :math:`1/\text{compression}` *smallest* *k-mers* from
    :class:`KmerAlphabet`. :footcite:`Edgar2021`

    '*Small*' refers to the lexicographical order, or alternatively a
    custom order if `permutation` is given.
    The *Mincode* approach tries to reduce the number of *k-mers* from a
    sequence by the factor `compression`, while it still ensures that
    a common set of *k-mers* are selected from similar sequences.

    Parameters
    ----------
    kmer_alphabet : KmerAlphabet
        The *k-mer* alphabet that defines the *k-mer* size and the type
        of sequence this :class:`MincodeSelector` can be applied on.
    compression : float
        Defines the compression factor, i.e. approximately
        :math:`1/\text{compression}` of the *k-mers* will be sampled
        from a sequence.
        Must be at least 1.
    permutation : Permutation
        If set, the *k-mer* order is permuted, i.e.
        the *k-mers* are selected based on the ordering of the sort keys
        from :class:`Permutation.permute()`.
        By default, the standard order of the :class:`KmerAlphabet` is
        used.
        This standard order is often the lexicographical order.

    Attributes
    ----------
    kmer_alphabet : KmerAlphabet
        The *k-mer* alphabet.
    compression : float
        The compression factor.
    threshold : float
        Based on the compression factor and the range of (permuted)
        *k-mer* values this threshold is calculated.
        All *k-mers*, that are smaller than this value are selected.
    permutation : Permutation
        The permutation.

    References
    ----------

    .. footbibliography::

    Examples
    --------

    >>> kmer_alph = KmerAlphabet(NucleotideSequence.alphabet_unamb, k=2)
    >>> kmers = np.arange(len(kmer_alph))
    >>> print(["".join(kmer_alph.decode(kmer)) for kmer in kmers])
    ['AA', 'AC', 'AG', 'AT', 'CA', 'CC', 'CG', 'CT', 'GA', 'GC', 'GG', 'GT', 'TA', 'TC', 'TG', 'TT']
    >>> # Select 1/4 of *k-mers* based on lexicographical k-mer order
    >>> selector = MincodeSelector(kmer_alph, 4)
    >>> subset_pos, kmers_subset = selector.select_from_kmers(kmers)
    >>> print(["".join(kmer_alph.decode(kmer)) for kmer in kmers_subset])
    ['AA', 'AC', 'AG', 'AT']
    >>> # Select 1/4 based on randomized k-mer order
    >>> selector = MincodeSelector(kmer_alph, 4, permutation=RandomPermutation())
    >>> subset_pos, kmers_subset = selector.select_from_kmers(kmers)
    >>> print(["".join(kmer_alph.decode(kmer)) for kmer in kmers_subset])
    ['AG', 'CT', 'GA', 'TC']
    """

    def __init__(self, kmer_alphabet, compression, permutation=None):
        if compression < 1:
            raise ValueError(
                "Compression factor must be equal to or larger than 1"
            )
        self._compression = compression
        self._kmer_alph = kmer_alphabet
        self._permutation = permutation
        if permutation is None:
            # Unpermuted sort keys are the k-mer codes themselves
            permutation_offset = 0
            permutation_range = len(kmer_alphabet)
        else:
            permutation_offset = permutation.min
            permutation_range = permutation.max - permutation.min + 1
        # Sort keys below this value make up the lowest
        # 1/compression part of the sort key range
        self._threshold = permutation_offset + permutation_range / compression


    @property
    def kmer_alphabet(self):
        return self._kmer_alph

    @property
    def compression(self):
        return self._compression

    @property
    def threshold(self):
        return self._threshold

    @property
    def permutation(self):
        return self._permutation


    def select(self, sequence, bint alphabet_check=True):
        """
        select(sequence, alphabet_check=True)

        Obtain all overlapping *k-mers* from a sequence and select
        the *Mincode k-mers* from them.

        Parameters
        ----------
        sequence : Sequence
            The sequence to find the *Mincode k-mers* in.
            Must be compatible with the given `kmer_alphabet`
        alphabet_check : bool, optional
            If set to false, the compatibility between the alphabet
            of the sequence and the alphabet of the
            :class:`MincodeSelector`
            is not checked to gain additional performance.

        Returns
        -------
        mincode_indices : ndarray of int
            The sequence indices where the *Mincode k-mers* start.
        mincode : ndarray, dtype=np.int64
            The corresponding *Mincode k-mer* codes.

        Raises
        ------
        ValueError
            If `alphabet_check` is true and the base alphabet of the
            *k-mer* alphabet does not extend the sequence's alphabet.
        """
        if alphabet_check:
            if not self._kmer_alph.base_alphabet.extends(sequence.alphabet):
                raise ValueError(
                    "The sequence's alphabet does not fit the k-mer alphabet"
                )
        kmers = self._kmer_alph.create_kmers(sequence.code)
        return self.select_from_kmers(kmers)


    def select_from_kmers(self, kmers):
        """
        select_from_kmers(kmers)

        Select *Mincode k-mers*.

        The given *k-mers* are not required to overlap.

        Parameters
        ----------
        kmers : ndarray, dtype=np.int64
            The *k-mer* codes to select the *Mincode k-mers* from.

        Returns
        -------
        mincode_indices : ndarray of int
            The indices in `kmers` that point to *Mincode k-mers*.
        mincode : ndarray, dtype=np.int64
            The corresponding *Mincode k-mer* codes.

        Raises
        ------
        IndexError
            If the permutation returns a different number of sort keys
            than *k-mers* were given to it.
        """
        if self._permutation is None:
            ordering = kmers
        else:
            ordering = self._permutation.permute(kmers)
            if len(ordering) != len(kmers):
                raise IndexError(
                    f"The Permutation is defective, it gave {len(ordering)} "
                    f"sort keys for {len(kmers)} k-mers"
                )

        # Return actual indices instead of the boolean mask, as
        # documented and as done by the other selectors in this module
        mincode_pos = np.where(ordering < self._threshold)[0]
        return mincode_pos, kmers[mincode_pos]
823
+
824
+
825
@cython.boundscheck(False)
@cython.wraparound(False)
def _minimize(int64[:] kmers, int64[:] ordering, uint32 window,
              bint include_duplicates):
    """
    Implementation of the algorithm originally devised by
    Marcel van Herk.

    In this implementation the frame is chosen differently:
    For a position 'x' the frame ranges from 'x' to 'x + window-1'
    instead of 'x - (window-1)/2' to 'x + (window-1)/2'.

    Parameters
    ----------
    kmers : int64[:]
        The values the minimizers are taken from.
    ordering : int64[:]
        The sort key for each element of `kmers`.
        Must have the same length as `kmers`.
    window : uint32
        The number of consecutive elements in each frame.
    include_duplicates : bool
        If false, a frame whose minimum is at the same position as
        the minimum of the previous frame is omitted from the result.

    Returns
    -------
    minimizer_pos : ndarray, dtype=np.uint32
        The position of the minimum sort key in each reported frame.
        At ties the leftmost position is taken.
    minimizers : ndarray, dtype=np.int64
        The elements of `kmers` at these positions.

    Raises
    ------
    ValueError
        If `window` is 0 or larger than ``len(kmers) + 1``.
    IndexError
        If `kmers` and `ordering` differ in length.
    """
    cdef uint32 seq_i
    cdef uint32 n_windows
    cdef uint32[:] minimizer_pos
    cdef int64[:] minimizers
    # Counts the actual number of minimizers for later trimming
    cdef uint32 n_minimizers = 0
    # Variable for the position of the previous cumulative minimum
    cdef uint32 prev_argcummin
    # Variables for the position of the current cumulative minimum
    cdef uint32 combined_argcummin, forward_argcummin, reverse_argcummin
    # Variables for the current cumulative minimum
    cdef int64 forward_cummin, reverse_cummin
    # Variables for cumulative minima at all positions
    cdef uint32[:] forward_argcummins
    cdef uint32[:] reverse_argcummins

    # Bounds checking is disabled and the sizes are unsigned:
    # Invalid sizes would underflow and lead to out-of-bounds access,
    # hence they are rejected up front
    if window < 1:
        raise ValueError("The window size must be at least 1")
    if ordering.shape[0] != kmers.shape[0]:
        raise IndexError(
            f"Got {ordering.shape[0]} sort keys "
            f"for {kmers.shape[0]} k-mers"
        )
    if kmers.shape[0] + 1 < window:
        raise ValueError(
            "The number of k-mers is too small for the window size"
        )

    n_windows = kmers.shape[0] - (window - 1)
    # Pessimistic array allocation size
    # -> Expect that every window has a new minimizer
    minimizer_pos = np.empty(n_windows, dtype=np.uint32)
    minimizers = np.empty(n_windows, dtype=np.int64)

    # Assign a value that can never occur for the start,
    # as in the beginning there is no previous value
    prev_argcummin = kmers.shape[0]
    forward_argcummins = _chunk_wise_forward_argcummin(ordering, window)
    reverse_argcummins = _chunk_wise_reverse_argcummin(ordering, window)

    for seq_i in range(n_windows):
        # A frame spans at most two chunks:
        # The reverse pass covers the part from the frame start to
        # the chunk end, the forward pass the part from the chunk
        # start to the frame end
        forward_argcummin = forward_argcummins[seq_i + window - 1]
        reverse_argcummin = reverse_argcummins[seq_i]
        forward_cummin = ordering[forward_argcummin]
        reverse_cummin = ordering[reverse_argcummin]

        # At ties the leftmost position is taken,
        # which stems from the reverse pass
        if forward_cummin < reverse_cummin:
            combined_argcummin = forward_argcummin
        else:
            combined_argcummin = reverse_argcummin

        # If the same minimizer position was observed before, the
        # duplicate is simply ignored, if 'include_duplicates' is false
        if include_duplicates or combined_argcummin != prev_argcummin:
            # Append minimizer to return value
            minimizer_pos[n_minimizers] = combined_argcummin
            minimizers[n_minimizers] = kmers[combined_argcummin]
            n_minimizers += 1
            prev_argcummin = combined_argcummin

    return (
        np.asarray(minimizer_pos)[:n_minimizers],
        np.asarray(minimizers)[:n_minimizers]
    )
889
+
890
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
cdef _chunk_wise_forward_argcummin(int64[:] values, uint32 chunk_size):
    """
    Argument of the cumulative minimum.

    `values` is divided into consecutive chunks of `chunk_size`
    elements and the cumulative minimum restarts at the beginning of
    each chunk.
    At ties the leftmost position within the chunk is kept.
    `chunk_size` must not be 0, which is not checked here.

    Returns a ``uint32`` memoryview with the same length as `values`,
    that contains for each position the index of the minimum value
    between the start of its chunk and the position itself.
    """
    cdef uint32 seq_i

    cdef uint32 current_min_i = 0
    # Only a placeholder: It is overwritten at the first position,
    # before it is used in any comparison
    cdef int64 current_min = 0
    cdef int64 current_val
    cdef uint32[:] min_pos = np.empty(values.shape[0], dtype=np.uint32)

    for seq_i in range(values.shape[0]):
        current_val = values[seq_i]
        # The first element of a chunk is always the minimum so far.
        # Taking it unconditionally (instead of comparing it to a
        # maximum placeholder) ensures that no index of the previous
        # chunk leaks into this one, even if the value is the
        # largest representable integer
        if seq_i % chunk_size == 0 or current_val < current_min:
            current_min_i = seq_i
            current_min = current_val
        min_pos[seq_i] = current_min_i

    return min_pos
916
+
917
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
cdef _chunk_wise_reverse_argcummin(int64[:] values, uint32 chunk_size):
    """
    Argument of the cumulative minimum, iterating backwards from the
    end of `values`, with the minimum being reset at each chunk border.

    This duplicates the forward variant.
    However, a single implementation that takes the reversed `values`
    as input has some disadvantages:

    - Indices must be transformed so that they point to the
      non-reversed `values`
    - There are issues in selecting the leftmost argument
    - An offset is necessary to ensure alignment of chunks with the
      forward pass

    Hence, a separate 'reverse' variant of the function was implemented.
    """
    cdef uint32 seq_i

    cdef uint32 best_i = 0
    cdef int64 best_val, val
    cdef uint32[:] argmins = np.empty(values.shape[0], dtype=np.uint32)

    best_val = MAX_INT_64
    for seq_i in reversed(range(values.shape[0])):
        # In contrast to the forward pass a chunk is entered at its
        # last position, as the iteration comes from the right
        if seq_i % chunk_size == chunk_size - 1:
            best_val = MAX_INT_64
        val = values[seq_i]
        # In contrast to the forward pass '<=' is used,
        # to ensure that the leftmost argument is selected at ties
        if val <= best_val:
            best_i = seq_i
            best_val = val
        argmins[seq_i] = best_i

    return argmins