biotite 1.5.0__cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (354) hide show
  1. biotite/__init__.py +18 -0
  2. biotite/application/__init__.py +69 -0
  3. biotite/application/application.py +276 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +500 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +92 -0
  8. biotite/application/blast/webapp.py +428 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +223 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +216 -0
  13. biotite/application/localapp.py +342 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +116 -0
  16. biotite/application/msaapp.py +363 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +227 -0
  19. biotite/application/muscle/app5.py +163 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +447 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +199 -0
  24. biotite/application/util.py +77 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +310 -0
  27. biotite/application/viennarna/rnafold.py +254 -0
  28. biotite/application/viennarna/rnaplot.py +208 -0
  29. biotite/application/viennarna/util.py +77 -0
  30. biotite/application/webapp.py +76 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/afdb/__init__.py +12 -0
  34. biotite/database/afdb/download.py +197 -0
  35. biotite/database/entrez/__init__.py +15 -0
  36. biotite/database/entrez/check.py +60 -0
  37. biotite/database/entrez/dbnames.py +101 -0
  38. biotite/database/entrez/download.py +228 -0
  39. biotite/database/entrez/key.py +44 -0
  40. biotite/database/entrez/query.py +263 -0
  41. biotite/database/error.py +16 -0
  42. biotite/database/pubchem/__init__.py +21 -0
  43. biotite/database/pubchem/download.py +258 -0
  44. biotite/database/pubchem/error.py +30 -0
  45. biotite/database/pubchem/query.py +819 -0
  46. biotite/database/pubchem/throttle.py +98 -0
  47. biotite/database/rcsb/__init__.py +13 -0
  48. biotite/database/rcsb/download.py +161 -0
  49. biotite/database/rcsb/query.py +963 -0
  50. biotite/database/uniprot/__init__.py +13 -0
  51. biotite/database/uniprot/check.py +40 -0
  52. biotite/database/uniprot/download.py +126 -0
  53. biotite/database/uniprot/query.py +292 -0
  54. biotite/file.py +244 -0
  55. biotite/interface/__init__.py +19 -0
  56. biotite/interface/openmm/__init__.py +20 -0
  57. biotite/interface/openmm/state.py +93 -0
  58. biotite/interface/openmm/system.py +227 -0
  59. biotite/interface/pymol/__init__.py +201 -0
  60. biotite/interface/pymol/cgo.py +346 -0
  61. biotite/interface/pymol/convert.py +185 -0
  62. biotite/interface/pymol/display.py +267 -0
  63. biotite/interface/pymol/object.py +1228 -0
  64. biotite/interface/pymol/shapes.py +178 -0
  65. biotite/interface/pymol/startup.py +169 -0
  66. biotite/interface/rdkit/__init__.py +19 -0
  67. biotite/interface/rdkit/mol.py +490 -0
  68. biotite/interface/version.py +94 -0
  69. biotite/interface/warning.py +19 -0
  70. biotite/sequence/__init__.py +84 -0
  71. biotite/sequence/align/__init__.py +199 -0
  72. biotite/sequence/align/alignment.py +702 -0
  73. biotite/sequence/align/banded.cpython-312-x86_64-linux-gnu.so +0 -0
  74. biotite/sequence/align/banded.pyx +652 -0
  75. biotite/sequence/align/buckets.py +71 -0
  76. biotite/sequence/align/cigar.py +425 -0
  77. biotite/sequence/align/kmeralphabet.cpython-312-x86_64-linux-gnu.so +0 -0
  78. biotite/sequence/align/kmeralphabet.pyx +595 -0
  79. biotite/sequence/align/kmersimilarity.cpython-312-x86_64-linux-gnu.so +0 -0
  80. biotite/sequence/align/kmersimilarity.pyx +233 -0
  81. biotite/sequence/align/kmertable.cpython-312-x86_64-linux-gnu.so +0 -0
  82. biotite/sequence/align/kmertable.pyx +3411 -0
  83. biotite/sequence/align/localgapped.cpython-312-x86_64-linux-gnu.so +0 -0
  84. biotite/sequence/align/localgapped.pyx +892 -0
  85. biotite/sequence/align/localungapped.cpython-312-x86_64-linux-gnu.so +0 -0
  86. biotite/sequence/align/localungapped.pyx +279 -0
  87. biotite/sequence/align/matrix.py +631 -0
  88. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  89. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  93. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  94. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  95. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  96. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  97. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  98. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  99. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  100. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  101. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  102. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  103. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  104. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  105. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  106. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  107. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  108. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  109. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  110. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  111. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  112. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  113. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  114. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  115. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  116. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  117. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  118. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  119. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  120. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  121. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  122. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  154. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  155. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  156. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  157. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  158. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  159. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  160. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  161. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  162. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  163. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  164. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  165. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  166. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  167. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  168. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  169. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  170. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  171. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  172. biotite/sequence/align/matrix_data/PB.license +21 -0
  173. biotite/sequence/align/matrix_data/PB.mat +18 -0
  174. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  175. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  176. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  177. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  178. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  179. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  180. biotite/sequence/align/multiple.cpython-312-x86_64-linux-gnu.so +0 -0
  181. biotite/sequence/align/multiple.pyx +619 -0
  182. biotite/sequence/align/pairwise.cpython-312-x86_64-linux-gnu.so +0 -0
  183. biotite/sequence/align/pairwise.pyx +585 -0
  184. biotite/sequence/align/permutation.cpython-312-x86_64-linux-gnu.so +0 -0
  185. biotite/sequence/align/permutation.pyx +313 -0
  186. biotite/sequence/align/primes.txt +821 -0
  187. biotite/sequence/align/selector.cpython-312-x86_64-linux-gnu.so +0 -0
  188. biotite/sequence/align/selector.pyx +954 -0
  189. biotite/sequence/align/statistics.py +264 -0
  190. biotite/sequence/align/tracetable.cpython-312-x86_64-linux-gnu.so +0 -0
  191. biotite/sequence/align/tracetable.pxd +64 -0
  192. biotite/sequence/align/tracetable.pyx +370 -0
  193. biotite/sequence/alphabet.py +555 -0
  194. biotite/sequence/annotation.py +836 -0
  195. biotite/sequence/codec.cpython-312-x86_64-linux-gnu.so +0 -0
  196. biotite/sequence/codec.pyx +155 -0
  197. biotite/sequence/codon.py +476 -0
  198. biotite/sequence/codon_tables.txt +202 -0
  199. biotite/sequence/graphics/__init__.py +33 -0
  200. biotite/sequence/graphics/alignment.py +1101 -0
  201. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  202. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  203. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  204. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  205. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  206. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  207. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  208. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  209. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  210. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  211. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  212. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  213. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  214. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  215. biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
  216. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  217. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  218. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  219. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  220. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  221. biotite/sequence/graphics/colorschemes.py +170 -0
  222. biotite/sequence/graphics/dendrogram.py +231 -0
  223. biotite/sequence/graphics/features.py +544 -0
  224. biotite/sequence/graphics/logo.py +102 -0
  225. biotite/sequence/graphics/plasmid.py +712 -0
  226. biotite/sequence/io/__init__.py +12 -0
  227. biotite/sequence/io/fasta/__init__.py +22 -0
  228. biotite/sequence/io/fasta/convert.py +283 -0
  229. biotite/sequence/io/fasta/file.py +265 -0
  230. biotite/sequence/io/fastq/__init__.py +19 -0
  231. biotite/sequence/io/fastq/convert.py +117 -0
  232. biotite/sequence/io/fastq/file.py +507 -0
  233. biotite/sequence/io/genbank/__init__.py +17 -0
  234. biotite/sequence/io/genbank/annotation.py +269 -0
  235. biotite/sequence/io/genbank/file.py +573 -0
  236. biotite/sequence/io/genbank/metadata.py +336 -0
  237. biotite/sequence/io/genbank/sequence.py +173 -0
  238. biotite/sequence/io/general.py +201 -0
  239. biotite/sequence/io/gff/__init__.py +26 -0
  240. biotite/sequence/io/gff/convert.py +128 -0
  241. biotite/sequence/io/gff/file.py +449 -0
  242. biotite/sequence/phylo/__init__.py +36 -0
  243. biotite/sequence/phylo/nj.cpython-312-x86_64-linux-gnu.so +0 -0
  244. biotite/sequence/phylo/nj.pyx +221 -0
  245. biotite/sequence/phylo/tree.cpython-312-x86_64-linux-gnu.so +0 -0
  246. biotite/sequence/phylo/tree.pyx +1169 -0
  247. biotite/sequence/phylo/upgma.cpython-312-x86_64-linux-gnu.so +0 -0
  248. biotite/sequence/phylo/upgma.pyx +164 -0
  249. biotite/sequence/profile.py +561 -0
  250. biotite/sequence/search.py +117 -0
  251. biotite/sequence/seqtypes.py +720 -0
  252. biotite/sequence/sequence.py +373 -0
  253. biotite/setup_ccd.py +197 -0
  254. biotite/structure/__init__.py +135 -0
  255. biotite/structure/alphabet/__init__.py +25 -0
  256. biotite/structure/alphabet/encoder.py +332 -0
  257. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  258. biotite/structure/alphabet/i3d.py +109 -0
  259. biotite/structure/alphabet/layers.py +86 -0
  260. biotite/structure/alphabet/pb.license +21 -0
  261. biotite/structure/alphabet/pb.py +170 -0
  262. biotite/structure/alphabet/unkerasify.py +128 -0
  263. biotite/structure/atoms.py +1562 -0
  264. biotite/structure/basepairs.py +1403 -0
  265. biotite/structure/bonds.cpython-312-x86_64-linux-gnu.so +0 -0
  266. biotite/structure/bonds.pyx +2036 -0
  267. biotite/structure/box.py +724 -0
  268. biotite/structure/celllist.cpython-312-x86_64-linux-gnu.so +0 -0
  269. biotite/structure/celllist.pyx +864 -0
  270. biotite/structure/chains.py +310 -0
  271. biotite/structure/charges.cpython-312-x86_64-linux-gnu.so +0 -0
  272. biotite/structure/charges.pyx +520 -0
  273. biotite/structure/compare.py +683 -0
  274. biotite/structure/density.py +109 -0
  275. biotite/structure/dotbracket.py +213 -0
  276. biotite/structure/error.py +39 -0
  277. biotite/structure/filter.py +591 -0
  278. biotite/structure/geometry.py +817 -0
  279. biotite/structure/graphics/__init__.py +13 -0
  280. biotite/structure/graphics/atoms.py +243 -0
  281. biotite/structure/graphics/rna.py +298 -0
  282. biotite/structure/hbond.py +425 -0
  283. biotite/structure/info/__init__.py +24 -0
  284. biotite/structure/info/atom_masses.json +121 -0
  285. biotite/structure/info/atoms.py +98 -0
  286. biotite/structure/info/bonds.py +149 -0
  287. biotite/structure/info/ccd.py +200 -0
  288. biotite/structure/info/components.bcif +0 -0
  289. biotite/structure/info/groups.py +128 -0
  290. biotite/structure/info/masses.py +121 -0
  291. biotite/structure/info/misc.py +137 -0
  292. biotite/structure/info/radii.py +267 -0
  293. biotite/structure/info/standardize.py +185 -0
  294. biotite/structure/integrity.py +213 -0
  295. biotite/structure/io/__init__.py +29 -0
  296. biotite/structure/io/dcd/__init__.py +13 -0
  297. biotite/structure/io/dcd/file.py +67 -0
  298. biotite/structure/io/general.py +243 -0
  299. biotite/structure/io/gro/__init__.py +14 -0
  300. biotite/structure/io/gro/file.py +343 -0
  301. biotite/structure/io/mol/__init__.py +20 -0
  302. biotite/structure/io/mol/convert.py +112 -0
  303. biotite/structure/io/mol/ctab.py +420 -0
  304. biotite/structure/io/mol/header.py +120 -0
  305. biotite/structure/io/mol/mol.py +149 -0
  306. biotite/structure/io/mol/sdf.py +940 -0
  307. biotite/structure/io/netcdf/__init__.py +13 -0
  308. biotite/structure/io/netcdf/file.py +64 -0
  309. biotite/structure/io/pdb/__init__.py +20 -0
  310. biotite/structure/io/pdb/convert.py +389 -0
  311. biotite/structure/io/pdb/file.py +1380 -0
  312. biotite/structure/io/pdb/hybrid36.cpython-312-x86_64-linux-gnu.so +0 -0
  313. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  314. biotite/structure/io/pdbqt/__init__.py +15 -0
  315. biotite/structure/io/pdbqt/convert.py +113 -0
  316. biotite/structure/io/pdbqt/file.py +688 -0
  317. biotite/structure/io/pdbx/__init__.py +23 -0
  318. biotite/structure/io/pdbx/bcif.py +674 -0
  319. biotite/structure/io/pdbx/cif.py +1091 -0
  320. biotite/structure/io/pdbx/component.py +251 -0
  321. biotite/structure/io/pdbx/compress.py +362 -0
  322. biotite/structure/io/pdbx/convert.py +2113 -0
  323. biotite/structure/io/pdbx/encoding.cpython-312-x86_64-linux-gnu.so +0 -0
  324. biotite/structure/io/pdbx/encoding.pyx +1078 -0
  325. biotite/structure/io/trajfile.py +696 -0
  326. biotite/structure/io/trr/__init__.py +13 -0
  327. biotite/structure/io/trr/file.py +43 -0
  328. biotite/structure/io/util.py +38 -0
  329. biotite/structure/io/xtc/__init__.py +13 -0
  330. biotite/structure/io/xtc/file.py +43 -0
  331. biotite/structure/mechanics.py +72 -0
  332. biotite/structure/molecules.py +337 -0
  333. biotite/structure/pseudoknots.py +622 -0
  334. biotite/structure/rdf.py +245 -0
  335. biotite/structure/repair.py +302 -0
  336. biotite/structure/residues.py +716 -0
  337. biotite/structure/rings.py +451 -0
  338. biotite/structure/sasa.cpython-312-x86_64-linux-gnu.so +0 -0
  339. biotite/structure/sasa.pyx +322 -0
  340. biotite/structure/segments.py +328 -0
  341. biotite/structure/sequence.py +110 -0
  342. biotite/structure/spacegroups.json +1567 -0
  343. biotite/structure/spacegroups.license +26 -0
  344. biotite/structure/sse.py +306 -0
  345. biotite/structure/superimpose.py +511 -0
  346. biotite/structure/tm.py +581 -0
  347. biotite/structure/transform.py +736 -0
  348. biotite/structure/util.py +160 -0
  349. biotite/version.py +34 -0
  350. biotite/visualize.py +375 -0
  351. biotite-1.5.0.dist-info/METADATA +162 -0
  352. biotite-1.5.0.dist-info/RECORD +354 -0
  353. biotite-1.5.0.dist-info/WHEEL +6 -0
  354. biotite-1.5.0.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,954 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
# Override the module name reported for this file
# NOTE(review): presumably done so that the classes defined here are
# presented as members of the 'biotite.sequence.align' package - confirm
__name__ = "biotite.sequence.align"
__author__ = "Patrick Kunzmann"
# Names exported by this module
__all__ = ["MinimizerSelector", "SyncmerSelector", "CachedSyncmerSelector",
           "MincodeSelector"]
9
+
10
+ cimport cython
11
+ cimport numpy as np
12
+
13
+ import numpy as np
14
+ from .kmeralphabet import KmerAlphabet
15
+
16
+
17
# Short aliases for the fixed-width NumPy integer types
ctypedef np.int64_t int64
ctypedef np.uint32_t uint32


# Largest value representable as signed 64-bit integer
# Obtained from 'np.iinfo(np.int64).max'
cdef int64 MAX_INT_64 = 9223372036854775807
23
+
24
+
25
class MinimizerSelector:
    """
    MinimizerSelector(kmer_alphabet, window, permutation=None)

    Selects the *minimizers* in sequences.

    In a rolling window of *k-mers*, the minimizer is defined as the
    *k-mer* with the minimum *k-mer* code :footcite:`Roberts2004`.
    If the same minimum *k-mer* appears twice in a window, the leftmost
    *k-mer* is selected as minimizer.

    Parameters
    ----------
    kmer_alphabet : KmerAlphabet
        The *k-mer* alphabet that defines the *k-mer* size and the type
        of sequence this :class:`MinimizerSelector` can be applied on.
    window : int
        The size of the rolling window, where the minimizers are
        searched in.
        In other words this is the number of *k-mers* per window.
        The window size must be at least 2.
    permutation : Permutation
        If set, the *k-mer* order is permuted, i.e.
        the minimizer is chosen based on the ordering of the sort keys
        from :class:`Permutation.permute()`.
        By default, the standard order of the :class:`KmerAlphabet` is
        used.
        This standard order is often the lexicographical order, which is
        known to yield suboptimal *density* in many cases
        :footcite:`Roberts2004`.

    Attributes
    ----------
    kmer_alphabet : KmerAlphabet
        The *k-mer* alphabet.
    window : int
        The window size.
    permutation : Permutation
        The permutation.

    Notes
    -----
    For minimizer computation a fast algorithm :footcite:`VanHerk1992`
    is used, whose runtime scales linearly with the length of the
    sequence and is constant with regard to the size of the rolling
    window.

    References
    ----------

    .. footbibliography::

    Examples
    --------

    The *k-mer* decomposition of a sequence can yield a high number of
    *k-mers*:

    >>> sequence1 = ProteinSequence("THIS*IS*A*SEQVENCE")
    >>> kmer_alph = KmerAlphabet(sequence1.alphabet, k=3)
    >>> all_kmers = kmer_alph.create_kmers(sequence1.code)
    >>> print(all_kmers)
    [ 9367  3639  4415  9199 13431  4415  9192 13271   567 13611  8725  2057
      7899  9875  1993  6363]
    >>> print(["".join(kmer_alph.decode(kmer)) for kmer in all_kmers])
    ['THI', 'HIS', 'IS*', 'S*I', '*IS', 'IS*', 'S*A', '*A*', 'A*S', '*SE', 'SEQ', 'EQV', 'QVE', 'VEN', 'ENC', 'NCE']

    Minimizers can be used to reduce the number of *k-mers* by selecting
    only the minimum *k-mer* in each window *w*:

    >>> minimizer = MinimizerSelector(kmer_alph, window=4)
    >>> minimizer_pos, minimizers = minimizer.select(sequence1)
    >>> print(minimizer_pos)
    [ 1  2  5  8 11 14]
    >>> print(minimizers)
    [3639 4415 4415  567 2057 1993]
    >>> print(["".join(kmer_alph.decode(kmer)) for kmer in minimizers])
    ['HIS', 'IS*', 'IS*', 'A*S', 'EQV', 'ENC']

    Although this approach reduces the number of *k-mers*, minimizers
    are still guaranteed to match minimizers in another sequence, if
    they share an equal subsequence of at least length *w + k - 1*:

    >>> sequence2 = ProteinSequence("ANQTHER*SEQVENCE")
    >>> other_minimizer_pos, other_minimizers = minimizer.select(sequence2)
    >>> print(["".join(kmer_alph.decode(kmer)) for kmer in other_minimizers])
    ['ANQ', 'HER', 'ER*', 'EQV', 'ENC']
    >>> common_minimizers = set.intersection(set(minimizers), set(other_minimizers))
    >>> print(["".join(kmer_alph.decode(kmer)) for kmer in common_minimizers])
    ['EQV', 'ENC']
    """

    def __init__(self, kmer_alphabet, window, permutation=None):
        # A window containing a single k-mer is rejected
        if window < 2:
            raise ValueError("Window size must be at least 2")
        self._kmer_alph = kmer_alphabet
        self._window = window
        self._permutation = permutation


    @property
    def kmer_alphabet(self):
        return self._kmer_alph

    @property
    def window(self):
        return self._window

    @property
    def permutation(self):
        return self._permutation


    def select(self, sequence, bint alphabet_check=True):
        """
        select(sequence, alphabet_check=True)

        Obtain all overlapping *k-mers* from a sequence and select
        the minimizers from them.

        Parameters
        ----------
        sequence : Sequence
            The sequence to find the minimizers in.
            Must be compatible with the given `kmer_alphabet`.
        alphabet_check : bool, optional
            If set to false, the compatibility between the alphabet
            of the sequence and the alphabet of the
            :class:`MinimizerSelector`
            is not checked to gain additional performance.

        Returns
        -------
        minimizer_indices : ndarray, dtype=np.uint32
            The sequence indices where the minimizer *k-mers* start.
        minimizers : ndarray, dtype=np.int64
            The *k-mers* that are the selected minimizers, returned as
            *k-mer* code.

        Raises
        ------
        ValueError
            If `alphabet_check` is true and the base alphabet of the
            `kmer_alphabet` does not extend the alphabet of the
            sequence.

        Notes
        -----
        Duplicate minimizers are omitted, i.e. if two windows have the
        same minimizer position, the return values contain this
        minimizer only once.
        """
        if alphabet_check:
            base_alphabet = self._kmer_alph.base_alphabet
            if not base_alphabet.extends(sequence.alphabet):
                raise ValueError(
                    "The sequence's alphabet does not fit the k-mer alphabet"
                )
        # Decompose the sequence and delegate to the k-mer based variant
        return self.select_from_kmers(
            self._kmer_alph.create_kmers(sequence.code)
        )


    def select_from_kmers(self, kmers):
        """
        select_from_kmers(kmers)

        Select minimizers for the given overlapping *k-mers*.

        Parameters
        ----------
        kmers : ndarray, dtype=np.int64
            The *k-mer* codes representing the sequence to find the
            minimizers in.
            The *k-mer* codes correspond to the *k-mers* encoded by the
            given `kmer_alphabet`.

        Returns
        -------
        minimizer_indices : ndarray, dtype=np.uint32
            The indices in the input *k-mer* sequence where a minimizer
            appears.
        minimizers : ndarray, dtype=np.int64
            The corresponding *k-mers* codes of the minimizers.

        Raises
        ------
        IndexError
            If the permutation returns a different number of sort keys
            than *k-mers* were given.
        ValueError
            If fewer *k-mers* than the window size are given.

        Notes
        -----
        Duplicate minimizers are omitted, i.e. if two windows have the
        same minimizer position, the return values contain this
        minimizer only once.
        """
        permutation = self._permutation
        if permutation is not None:
            # The minimum is searched among the permuted sort keys
            sort_keys = permutation.permute(kmers)
            if len(sort_keys) != len(kmers):
                raise IndexError(
                    f"The Permutation is defective, it gave {len(sort_keys)} "
                    f"sort keys for {len(kmers)} k-mers"
                )
        else:
            # Without permutation the k-mer codes are their own sort keys
            sort_keys = kmers

        if len(kmers) < self._window:
            raise ValueError(
                "The number of k-mers is smaller than the window size"
            )

        # 'copy=False' avoids a copy if the dtype is already 'int64'
        kmer_codes = kmers.astype(np.int64, copy=False)
        sort_keys = sort_keys.astype(np.int64, copy=False)
        return _minimize(
            kmer_codes, sort_keys, self._window, include_duplicates=False
        )
227
+
228
+
229
+ class SyncmerSelector:
230
+ """
231
+ SyncmerSelector(alphabet, k, s, permutation=None, offset=(0,))
232
+
233
+ Selects the *syncmers* in sequences.
234
+
235
+ Let the *s-mers* be all overlapping substrings of length *s* in a
236
+ *k-mer*.
237
+ A *k-mer* is a syncmer, if its minimum *s-mer* is at one of the
238
+ given offset positions :footcite:`Edgar2021`.
239
+ If the same minimum *s-mer* appears twice in a *k-mer*, the position
240
+ of the leftmost *s-mer* is taken.
241
+
242
+ Parameters
243
+ ----------
244
+ alphabet : Alphabet
245
+ The base alphabet the *k-mers* and *s-mers* are created from.
246
+ Defines the type of sequence this :class:`SyncmerSelector` can
247
+ be applied on.
248
+ k, s : int
249
+ The length of the *k-mers* and *s-mers*, respectively.
250
+ permutation : Permutation
251
+ If set, the *s-mer* order is permuted, i.e.
252
+ the minimum *s-mer* is chosen based on the ordering of the sort
253
+ keys from :class:`Permutation.permute()`.
254
+ This :class:`Permutation` must be compatible with *s*
255
+ (not with *k*).
256
+ By default, the standard order of the :class:`KmerAlphabet` is
257
+ used.
258
+ This standard order is often the lexicographical order, which is
259
+ known to yield suboptimal *density* in many cases
260
+ :footcite:`Roberts2004`.
261
+ offset : array-like of int
262
+ If the minimum *s-mer* in a *k-mer* is at one of the given
263
+ offset positions, that *k-mer* is a syncmer.
264
+ Negative values indicate the position from the end of the
265
+ *k-mer*.
266
+ By default, the minimum position needs to be at the start of the
267
+ *k-mer*, which is termed *open syncmer*.
268
+
269
+ Attributes
270
+ ----------
271
+ alphabet : Alphabet
272
+ The base alphabet.
273
+ kmer_alphabet, smer_alphabet : KmerAlphabet
274
+ The :class:`KmerAlphabet` for *k* and *s*, respectively.
275
+ permutation : Permutation
276
+ The permutation.
277
+
278
+ See Also
279
+ --------
280
+ CachedSyncmerSelector
281
+ A cached variant with faster syncmer selection at the cost of
282
+ increased initialization time.
283
+
284
+ Notes
285
+ -----
286
+ For syncmer computation from a sequence a fast algorithm
287
+ :footcite:`VanHerk1992` is used, whose runtime scales linearly with
288
+ the length of the sequence and is constant with regard to *k*.
289
+
290
+ References
291
+ ----------
292
+
293
+ .. footbibliography::
294
+
295
+ Examples
296
+ --------
297
+
298
+ This example is taken from :footcite:`Edgar2021`:
299
+ The subset of *k-mers* that are *closed syncmers* are selected.
300
+ Closed syncmers are syncmers, where the minimum *s-mer* is in the
301
+ first or last position of the *k-mer*.
302
+ *s-mers* are ordered lexicographically in this example.
303
+
304
+ >>> sequence = NucleotideSequence("GGCAAGTGACA")
305
+ >>> kmer_alph = KmerAlphabet(sequence.alphabet, k=5)
306
+ >>> kmers = kmer_alph.create_kmers(sequence.code)
307
+ >>> closed_syncmer_selector = CachedSyncmerSelector(
308
+ ... sequence.alphabet,
309
+ ... # The same k as in the KmerAlphabet
310
+ ... k=5,
311
+ ... s=2,
312
+ ... # The offset determines that closed syncmers will be selected
313
+ ... offset=(0, -1)
314
+ ... )
315
+ >>> syncmer_pos, syncmers = closed_syncmer_selector.select(sequence)
316
+ >>> # Print all k-mers in the sequence and mark syncmers with a '*'
317
+ >>> for pos, kmer in enumerate(kmer_alph.create_kmers(sequence.code)):
318
+ ... if pos in syncmer_pos:
319
+ ... print("* " + "".join(kmer_alph.decode(kmer)))
320
+ ... else:
321
+ ... print(" " + "".join(kmer_alph.decode(kmer)))
322
+ * GGCAA
323
+ GCAAG
324
+ CAAGT
325
+ * AAGTG
326
+ * AGTGA
327
+ * GTGAC
328
+ TGACA
329
+ """
330
+
331
def __init__(self, alphabet, k, s, permutation=None, offset=(0,)):
    """
    Create the *k-mer* and *s-mer* alphabets and normalize the offsets.

    Parameters
    ----------
    alphabet : Alphabet
        The base alphabet the *k-mers* and *s-mers* are created from.
    k, s : int
        The length of the *k-mers* and *s-mers*, respectively.
    permutation : Permutation, optional
        The permutation, stored as given.
    offset : array-like of int, optional
        The accepted positions of the minimum *s-mer* in a *k-mer*.
        Negative values are counted from the end of the window.

    Raises
    ------
    ValueError
        If `s` is not smaller than `k` or if `offset` contains the
        same position more than once.
    IndexError
        If a value in `offset` is outside of the window of
        ``k - s + 1`` positions.
    """
    if not s < k:
        raise ValueError("s must be smaller than k")
    # The number of s-mer positions within a single k-mer
    self._window = k - s + 1
    self._alphabet = alphabet
    self._kmer_alph = KmerAlphabet(alphabet, k)
    self._smer_alph = KmerAlphabet(alphabet, s)

    self._permutation = permutation

    self._offset = np.asarray(offset, dtype=np.int64)
    # Wrap around negative indices
    self._offset = np.where(
        self._offset < 0,
        self._window + self._offset,
        self._offset
    )
    # Values that are still negative after wrapping were below '-window'
    if (self._offset >= self._window).any() or (self._offset < 0).any():
        raise IndexError("Offset is out of window range")
    # The uniqueness check runs on the wrapped values,
    # hence e.g. '0' and '-window' count as duplicates
    if len(np.unique(self._offset)) != len(self._offset):
        raise ValueError("Offset must contain unique values")
354
+
355
+
356
@property
def alphabet(self):
    """
    The base alphabet that was passed to the constructor.
    """
    return self._alphabet
359
+
360
@property
def kmer_alphabet(self):
    """
    The :class:`KmerAlphabet` created from the base alphabet and *k*.
    """
    return self._kmer_alph
363
+
364
@property
def smer_alphabet(self):
    """
    The :class:`KmerAlphabet` created from the base alphabet and *s*.
    """
    return self._smer_alph
367
+
368
@property
def permutation(self):
    """
    The permutation that was passed to the constructor,
    ``None`` if the standard *s-mer* order is used.
    """
    return self._permutation
371
+
372
+
373
def select(self, sequence, bint alphabet_check=True):
    """
    select(sequence, alphabet_check=True)

    Create all overlapping *k-mers* of a sequence and pick those
    that are syncmers.

    Parameters
    ----------
    sequence : Sequence
        The sequence to find the syncmers in.
        Its alphabet must be compatible with the alphabet of this
        selector.
    alphabet_check : bool, optional
        If set to false, the compatibility between the alphabet
        of the sequence and the alphabet of the
        :class:`SyncmerSelector`
        is not checked to gain additional performance.

    Returns
    -------
    syncmer_indices : ndarray of int
        The sequence indices where the syncmers start.
    syncmers : ndarray, dtype=np.int64
        The corresponding *k-mer* codes of the syncmers.

    Raises
    ------
    ValueError
        If `alphabet_check` is true and the alphabets do not fit.
    IndexError
        If the permutation does not return exactly one sort key
        per *s-mer*.
    """
    if alphabet_check and not self._alphabet.extends(sequence.alphabet):
        raise ValueError(
            "The sequence's alphabet does not fit "
            "the selector's alphabet"
        )

    symbol_codes = sequence.code
    kmers = self._kmer_alph.create_kmers(symbol_codes)
    smers = self._smer_alph.create_kmers(symbol_codes)

    # The sort keys decide which s-mer counts as 'minimum'
    permutation = self._permutation
    if permutation is None:
        sort_keys = smers
    else:
        sort_keys = permutation.permute(smers)
        if len(sort_keys) != len(smers):
            raise IndexError(
                f"The Permutation is defective, it gave {len(sort_keys)} "
                f"sort keys for {len(smers)} s-mers"
            )

    # The absolute position of the minimum s-mer for each k-mer:
    # duplicates are kept, so there is exactly one entry per k-mer
    absolute_min_pos, _ = _minimize(
        smers,
        sort_keys.astype(np.int64, copy=False),
        self._window,
        include_duplicates=True
    )
    # Subtracting the k-mer start gives the position of the
    # minimum s-mer relative to the start of its k-mer
    kmer_starts = np.arange(len(kmers))
    syncmer_pos = self._filter_syncmer_pos(absolute_min_pos - kmer_starts)
    return syncmer_pos, kmers[syncmer_pos]
429
+
430
+
431
def select_from_kmers(self, kmers):
    """
    select_from_kmers(kmers)

    Pick the syncmers from the given *k-mers*.

    The *k-mers* are not required to overlap.

    Parameters
    ----------
    kmers : ndarray, dtype=np.int64
        The *k-mer* codes to select the syncmers from.

    Returns
    -------
    syncmer_indices : ndarray of int
        The indices in `kmers` that point to syncmers.
    syncmers : ndarray, dtype=np.int64
        The corresponding *k-mer* codes of the syncmers.

    Raises
    ------
    IndexError
        If the permutation does not return exactly one sort key
        per *s-mer*.

    Notes
    -----
    Since for *s-mer* creation, the *k-mers* need to be converted
    back to symbol codes again and since the input *k-mers* are not
    required to overlap, calling :meth:`select()` is much faster.
    However, :meth:`select()` is only available for
    :class:`Sequence` objects.
    """
    cdef int64 row

    smer_alphabet = self._smer_alph
    permutation = self._permutation
    # Indexed along the first axis below: one entry of symbol codes
    # per input k-mer
    symbols_per_kmer = self._kmer_alph.split(kmers)

    cdef int64[:] relative_min_pos = np.zeros(
        len(symbols_per_kmer), dtype=np.int64
    )
    for row in range(symbols_per_kmer.shape[0]):
        # The s-mers are created independently for each k-mer,
        # as neighboring k-mers need not overlap
        smers = smer_alphabet.create_kmers(symbols_per_kmer[row])
        if permutation is None:
            sort_keys = smers
        else:
            sort_keys = permutation.permute(smers)
            if len(sort_keys) != len(smers):
                raise IndexError(
                    f"The Permutation is defective, it gave {len(sort_keys)} "
                    f"sort keys for {len(smers)} s-mers"
                )
        # Position of the minimum s-mer within this k-mer
        relative_min_pos[row] = np.argmin(sort_keys)

    syncmer_pos = self._filter_syncmer_pos(relative_min_pos)
    return syncmer_pos, kmers[syncmer_pos]
481
+
482
+
483
def _filter_syncmer_pos(self, min_pos):
    """
    Get indices of *k-mers* that are syncmers, based on `min_pos`,
    the position of the minimum *s-mer* in each *k-mer*.
    Syncmers are *k-mers* whose minimum *s-mer* is at (one of)
    the given offset position(s).

    Parameters
    ----------
    min_pos : array-like of int
        For each *k-mer* the position of its minimum *s-mer*,
        relative to the start of the *k-mer*.
        Anything convertible by :func:`numpy.asarray()` is accepted.

    Returns
    -------
    syncmer_indices : ndarray of int
        The indices of those elements in `min_pos`, that are equal
        to one of the offsets.
        Empty, if the selector has no offsets.
    """
    # The callers pass either an ndarray or a typed memoryview:
    # convert explicitly so the elementwise comparison below
    # is well defined in both cases
    min_pos = np.asarray(min_pos)
    # Start with 'nothing selected', so that an empty offset array
    # gives an empty result instead of handing 'None' to np.where()
    syncmer_mask = np.zeros(min_pos.shape, dtype=bool)
    for offset in self._offset:
        # For the usual number of offsets, this 'loop'-approach is
        # faster than np.isin()
        syncmer_mask |= min_pos == offset
    return np.where(syncmer_mask)[0]
499
+
500
+
501
class CachedSyncmerSelector(SyncmerSelector):
    """
    CachedSyncmerSelector(alphabet, k, s, permutation=None, offset=(0,))

    Selects the *syncmers* in sequences.

    Fulfills the same purpose as :class:`SyncmerSelector`, but
    precomputes for each possible *k-mer*, whether it is a syncmer,
    at initialization.
    Hence, syncmer selection is faster at the cost of longer
    initialization time.

    Parameters
    ----------
    alphabet : Alphabet
        The base alphabet the *k-mers* and *s-mers* are created from.
        Defines the type of sequence this
        :class:`CachedSyncmerSelector` can be applied on.
    k, s : int
        The length of the *k-mers* and *s-mers*, respectively.
    permutation : Permutation
        If set, the *s-mer* order is permuted, i.e.
        the minimum *s-mer* is chosen based on the ordering of the sort
        keys from :class:`Permutation.permute()`.
        This :class:`Permutation` must be compatible with *s*
        (not with *k*).
        By default, the standard order of the :class:`KmerAlphabet` is
        used.
        This standard order is often the lexicographical order, which is
        known to yield suboptimal *density* in many cases
        :footcite:`Roberts2004`.
    offset : array-like of int
        If the minimum *s-mer* in a *k-mer* is at one of the given
        offset positions, that *k-mer* is a syncmer.
        Negative values indicate the position from the end of the
        *k-mer*.
        By default, the minimum position needs to be at the start of the
        *k-mer*, which is termed *open syncmer*.

    Attributes
    ----------
    alphabet : Alphabet
        The base alphabet.
    kmer_alphabet, smer_alphabet : KmerAlphabet
        The :class:`KmerAlphabet` for *k* and *s*, respectively.
    permutation : Permutation
        The permutation.

    See Also
    --------
    SyncmerSelector
        A standard variant for syncmer selection.

    Notes
    -----
    Both the initialization time and memory requirements are
    proportional to the size of the `kmer_alphabet`, i.e. :math:`n^k`.
    Hence, it is advised to use this class only for rather small
    alphabets.

    References
    ----------

    .. footbibliography::

    Examples
    --------

    >>> sequence = NucleotideSequence("GGCAAGTGACA")
    >>> kmer_alph = KmerAlphabet(sequence.alphabet, k=5)
    >>> # The initialization can take quite a long time for large *k-mer* alphabets...
    >>> closed_syncmer_selector = CachedSyncmerSelector(
    ...     sequence.alphabet,
    ...     # The same k as in the KmerAlphabet
    ...     k=5,
    ...     s=2,
    ...     # The offset determines that closed syncmers will be selected
    ...     offset=(0, -1)
    ... )
    >>> # ...but the actual syncmer identification is very fast
    >>> syncmer_pos, syncmers = closed_syncmer_selector.select(sequence)
    >>> print(["".join(kmer_alph.decode(kmer)) for kmer in syncmers])
    ['GGCAA', 'AAGTG', 'AGTGA', 'GTGAC']
    """

    def __init__(self, alphabet, k, s, permutation=None, offset=(0,)):
        super().__init__(alphabet, k, s, permutation, offset)
        n_kmers = len(self.kmer_alphabet)
        # Let the uncached base class decide once for every possible
        # k-mer code, whether it is a syncmer
        syncmer_codes, _ = super().select_from_kmers(np.arange(n_kmers))
        # Store the decision as lookup table: the k-mer code is the
        # index, the value tells whether that k-mer is a syncmer
        is_syncmer = np.zeros(n_kmers, dtype=bool)
        is_syncmer[syncmer_codes] = True
        self._syncmer_mask = is_syncmer


    def select(self, sequence, bint alphabet_check=True):
        """
        select(sequence, alphabet_check=True)

        Create all overlapping *k-mers* of a sequence and pick those
        that are syncmers.

        Parameters
        ----------
        sequence : Sequence
            The sequence to find the syncmers in.
            Its alphabet must be compatible with the alphabet of this
            selector.
        alphabet_check : bool, optional
            If set to false, the compatibility between the alphabet
            of the sequence and the alphabet of the
            :class:`CachedSyncmerSelector`
            is not checked to gain additional performance.

        Returns
        -------
        syncmer_indices : ndarray of int
            The sequence indices where the syncmers start.
        syncmers : ndarray, dtype=np.int64
            The corresponding *k-mer* codes of the syncmers.

        Raises
        ------
        ValueError
            If `alphabet_check` is true and the alphabets do not fit.
        """
        if alphabet_check and not self.alphabet.extends(sequence.alphabet):
            raise ValueError(
                "The sequence's alphabet does not fit "
                "the selector's alphabet"
            )
        return self.select_from_kmers(
            self.kmer_alphabet.create_kmers(sequence.code)
        )


    def select_from_kmers(self, kmers):
        """
        select_from_kmers(kmers)

        Pick the syncmers from the given *k-mers*.

        The *k-mers* are not required to overlap.

        Parameters
        ----------
        kmers : ndarray, dtype=np.int64
            The *k-mer* codes to select the syncmers from.

        Returns
        -------
        syncmer_indices : ndarray of int
            The indices in `kmers` that point to syncmers.
        syncmers : ndarray, dtype=np.int64
            The corresponding *k-mer* codes of the syncmers.
        """
        # A single lookup in the precomputed table per k-mer
        is_syncmer = self._syncmer_mask[kmers]
        syncmer_pos = np.nonzero(is_syncmer)[0]
        return syncmer_pos, kmers[syncmer_pos]
653
+
654
+
655
class MincodeSelector:
    r"""
    MincodeSelector(kmer_alphabet, compression, permutation=None)

    Selects the :math:`1/\text{compression}` *smallest* *k-mers* from
    :class:`KmerAlphabet`. :footcite:`Edgar2021`

    '*Small*' refers to the lexicographical order, or alternatively a
    custom order if `permutation` is given.
    The *Mincode* approach tries to reduce the number of *k-mers* from a
    sequence by the factor `compression`, while it still ensures that
    a common set of *k-mers* are selected from similar sequences.

    Parameters
    ----------
    kmer_alphabet : KmerAlphabet
        The *k-mer* alphabet that defines the *k-mer* size and the type
        of sequence this :class:`MincodeSelector` can be applied on.
    compression : float
        Defines the compression factor, i.e. approximately
        :math:`1/\text{compression}` of the *k-mers* will be sampled
        from a sequence.
        Must be at least 1.
    permutation : Permutation
        If set, the *k-mer* order is permuted, i.e.
        the *k-mers* are selected based on the ordering of the sort keys
        from :class:`Permutation.permute()`.
        By default, the standard order of the :class:`KmerAlphabet` is
        used.
        This standard order is often the lexicographical order.

    Attributes
    ----------
    kmer_alphabet : KmerAlphabet
        The *k-mer* alphabet.
    compression : float
        The compression factor.
    threshold : float
        Based on the compression factor and the range of (permuted)
        *k-mer* values this threshold is calculated.
        All *k-mers*, that are smaller than this value are selected.
    permutation : Permutation
        The permutation.

    References
    ----------

    .. footbibliography::

    Examples
    --------

    >>> kmer_alph = KmerAlphabet(NucleotideSequence.alphabet_unamb, k=2)
    >>> kmers = np.arange(len(kmer_alph))
    >>> print(["".join(kmer_alph.decode(kmer)) for kmer in kmers])
    ['AA', 'AC', 'AG', 'AT', 'CA', 'CC', 'CG', 'CT', 'GA', 'GC', 'GG', 'GT', 'TA', 'TC', 'TG', 'TT']
    >>> # Select 1/4 of *k-mers* based on lexicographical k-mer order
    >>> selector = MincodeSelector(kmer_alph, 4)
    >>> subset_pos, kmers_subset = selector.select_from_kmers(kmers)
    >>> print(["".join(kmer_alph.decode(kmer)) for kmer in kmers_subset])
    ['AA', 'AC', 'AG', 'AT']
    >>> # Select 1/4 based on randomized k-mer order
    >>> selector = MincodeSelector(kmer_alph, 4, permutation=RandomPermutation())
    >>> subset_pos, kmers_subset = selector.select_from_kmers(kmers)
    >>> print(["".join(kmer_alph.decode(kmer)) for kmer in kmers_subset])
    ['AG', 'CT', 'GA', 'TC']
    """

    def __init__(self, kmer_alphabet, compression, permutation=None):
        if compression < 1:
            raise ValueError(
                "Compression factor must be equal to or larger than 1"
            )
        self._compression = compression
        self._kmer_alph = kmer_alphabet
        self._permutation = permutation
        # The sort keys cover either the plain k-mer codes
        # '0 .. len(kmer_alphabet)-1' or the value range announced
        # by the permutation
        if permutation is None:
            permutation_offset = 0
            permutation_range = len(kmer_alphabet)
        else:
            permutation_offset = permutation.min
            permutation_range = permutation.max - permutation.min + 1
        # Sort keys below this value make up the lowest
        # '1/compression' part of the sort key range
        self._threshold = permutation_offset + permutation_range / compression


    @property
    def kmer_alphabet(self):
        return self._kmer_alph

    @property
    def compression(self):
        return self._compression

    @property
    def threshold(self):
        return self._threshold

    @property
    def permutation(self):
        return self._permutation


    def select(self, sequence, bint alphabet_check=True):
        """
        select(sequence, alphabet_check=True)

        Obtain all overlapping *k-mers* from a sequence and select
        the *Mincode k-mers* from them.

        Parameters
        ----------
        sequence : Sequence
            The sequence to find the *Mincode k-mers* in.
            Must be compatible with the given `kmer_alphabet`
        alphabet_check : bool, optional
            If set to false, the compatibility between the alphabet
            of the sequence and the alphabet of the
            :class:`MincodeSelector`
            is not checked to gain additional performance.

        Returns
        -------
        mincode_indices : ndarray, dtype=np.uint32
            The sequence indices where the *Mincode k-mers* start.
        mincode : ndarray, dtype=np.int64
            The corresponding *Mincode k-mer* codes.

        Raises
        ------
        ValueError
            If `alphabet_check` is true and the alphabets do not fit.
        """
        if alphabet_check:
            if not self._kmer_alph.base_alphabet.extends(sequence.alphabet):
                raise ValueError(
                    "The sequence's alphabet does not fit the k-mer alphabet"
                )
        kmers = self._kmer_alph.create_kmers(sequence.code)
        return self.select_from_kmers(kmers)


    def select_from_kmers(self, kmers):
        """
        select_from_kmers(kmers)

        Select *Mincode k-mers*.

        The given *k-mers* are not required to overlap.

        Parameters
        ----------
        kmers : ndarray, dtype=np.int64
            The *k-mer* codes to select the *Mincode k-mers* from.

        Returns
        -------
        mincode_indices : ndarray, dtype=np.uint32
            The indices in `kmers` that point to *Mincode k-mers*.
            If `kmers` are the overlapping *k-mers* of a sequence,
            these are the sequence indices where the
            *Mincode k-mers* start.
        mincode : ndarray, dtype=np.int64
            The corresponding *Mincode k-mer* codes.

        Raises
        ------
        IndexError
            If the permutation does not return exactly one sort key
            per *k-mer*.
        """
        if self._permutation is None:
            ordering = kmers
        else:
            ordering = self._permutation.permute(kmers)
            if len(ordering) != len(kmers):
                raise IndexError(
                    f"The Permutation is defective, it gave {len(ordering)} "
                    f"sort keys for {len(kmers)} k-mers"
                )

        # Return positions instead of the boolean mask itself,
        # as documented and as the other selectors do:
        # this keeps positions and k-mers equally long
        mincode_mask = ordering < self._threshold
        mincode_pos = np.where(mincode_mask)[0].astype(np.uint32, copy=False)
        return mincode_pos, kmers[mincode_pos]
821
+
822
+
823
@cython.boundscheck(False)
@cython.wraparound(False)
def _minimize(int64[:] kmers, int64[:] ordering, uint32 window,
              bint include_duplicates):
    """
    Implementation of the algorithm originally devised by
    Marcel van Herk.

    In this implementation the frame is chosen differently:
    For a position 'x' the frame ranges from 'x' to 'x + window-1'
    instead of 'x - (window-1)/2' to 'x + (window-1)/2'.

    Parameters
    ----------
    kmers : ndarray, dtype=np.int64
        The values the minimizers are taken from.
    ordering : ndarray, dtype=np.int64
        The sort key for each element of `kmers`.
        The element with the smallest sort key in a window is the
        minimizer of that window.
    window : int
        The number of consecutive elements in each window.
    include_duplicates : bool
        If true, one minimizer is reported for every window.
        If false, a minimizer is omitted, if it is at the same
        position as the minimizer of the previous window.

    Returns
    -------
    minimizer_pos : ndarray, dtype=np.uint32
        The positions of the minimizers in `kmers`.
    minimizers : ndarray, dtype=np.int64
        The values of `kmers` at these positions.
        Both arrays are empty, if `kmers` is shorter than `window`.

    Raises
    ------
    ValueError
        If `window` is 0.
    IndexError
        If `ordering` has less elements than `kmers`.
    """
    # Bounds checks are disabled in this function and the window is
    # used as divisor in the helper functions:
    # invalid input would corrupt memory instead of raising
    if window == 0:
        raise ValueError("Window size must be at least 1")
    if ordering.shape[0] < kmers.shape[0]:
        raise IndexError(
            f"Got only {ordering.shape[0]} sort keys "
            f"for {kmers.shape[0]} elements"
        )
    if kmers.shape[0] < window:
        # Not a single complete window fits into the input.
        # Without this early return the unsigned window count below
        # would wrap around to a huge number
        return (
            np.empty(0, dtype=np.uint32),
            np.empty(0, dtype=np.int64)
        )

    cdef uint32 seq_i

    cdef uint32 n_windows = kmers.shape[0] - (window - 1)
    # Pessimistic array allocation size
    # -> Expect that every window has a new minimizer
    cdef uint32[:] minimizer_pos = np.empty(n_windows, dtype=np.uint32)
    cdef int64[:] minimizers = np.empty(n_windows, dtype=np.int64)
    # Counts the actual number of minimizers for later trimming
    cdef uint32 n_minimizers = 0

    # Variables for the position of the previous cumulative minimum
    # Assign a value that can never occur for the start,
    # as in the beginning there is no previous value
    cdef uint32 prev_argcummin = kmers.shape[0]
    # Variables for the position of the current cumulative minimum
    cdef uint32 combined_argcummin, forward_argcummin, reverse_argcummin
    # Variables for the current cumulative minimum
    cdef int64 combined_cummin, forward_cummin, reverse_cummin
    # Variables for cumulative minima at all positions
    cdef uint32[:] forward_argcummins = _chunk_wise_forward_argcummin(
        ordering, window
    )
    cdef uint32[:] reverse_argcummins = _chunk_wise_reverse_argcummin(
        ordering, window
    )

    for seq_i in range(n_windows):
        # The window 'seq_i .. seq_i + window-1' is covered by the
        # reverse pass at its first and the forward pass at its
        # last position
        forward_argcummin = forward_argcummins[seq_i + window - 1]
        reverse_argcummin = reverse_argcummins[seq_i]
        forward_cummin = ordering[forward_argcummin]
        reverse_cummin = ordering[reverse_argcummin]

        # At ties the leftmost position is taken,
        # which stems from the reverse pass
        if forward_cummin < reverse_cummin:
            combined_argcummin = forward_argcummin
        else:
            combined_argcummin = reverse_argcummin

        # If the same minimizer position was observed before, the
        # duplicate is simply ignored, if 'include_duplicates' is false
        if include_duplicates or combined_argcummin != prev_argcummin:
            # Append minimizer to return value
            minimizer_pos[n_minimizers] = combined_argcummin
            minimizers[n_minimizers] = kmers[combined_argcummin]
            n_minimizers += 1
            prev_argcummin = combined_argcummin

    return (
        np.asarray(minimizer_pos)[:n_minimizers],
        np.asarray(minimizers)[:n_minimizers]
    )
887
+
888
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
cdef _chunk_wise_forward_argcummin(int64[:] values, uint32 chunk_size):
    """
    Argument of the cumulative minimum.

    `values` is divided into consecutive chunks of `chunk_size`
    elements, starting at index 0.
    For each position the returned array contains the index of the
    minimum value between the start of the chunk that position
    belongs to and the position itself.
    At ties the leftmost index is taken.

    `chunk_size` must not be 0, as it is used as divisor without
    any check.
    """
    cdef uint32 seq_i

    cdef uint32 current_min_i = 0
    # Only a placeholder: index 0 always starts a chunk, so the value
    # is overwritten before it is compared for the first time
    cdef int64 current_min = 0
    cdef int64 current_val
    cdef uint32[:] min_pos = np.empty(values.shape[0], dtype=np.uint32)

    for seq_i in range(values.shape[0]):
        current_val = values[seq_i]
        # At the beginning of a chunk the running minimum is always
        # restarted with the current element.
        # Comparing against a 'maximum value' placeholder instead would
        # keep the index from the previous chunk, if the first value
        # of a chunk is equal to that placeholder
        if seq_i % chunk_size == 0 or current_val < current_min:
            current_min_i = seq_i
            current_min = current_val
        min_pos[seq_i] = current_min_i

    return min_pos
914
+
915
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
cdef _chunk_wise_reverse_argcummin(int64[:] values, uint32 chunk_size):
    """
    Counterpart of the forward variant, that starts from the other
    end and iterates backwards:
    For each position the returned array contains the index of the
    minimum value between the position itself and the end of the
    chunk that position belongs to.
    At ties the leftmost index is taken.

    Separation into two functions leads to code duplication.
    However, a single implementation with reversed `values` as input
    has some disadvantages:

    - Indices must be transformed so that they point to the
      non-reversed `values`
    - There are issues in selecting the leftmost argument
    - An offset is necessary to ensure alignment of chunks with forward
      pass

    Hence, a separate 'reverse' variant of the function was implemented.
    """
    cdef uint32 idx

    cdef uint32 argmin_so_far = 0
    # Placeholder that is replaced by the first visited value
    cdef int64 min_so_far = MAX_INT_64
    cdef int64 value
    cdef uint32[:] argmins = np.empty(values.shape[0], dtype=np.uint32)

    for idx in reversed(range(values.shape[0])):
        # In contrast to the forward implementation the running
        # minimum is reset at the last element of a chunk, as the
        # chunk is entered from its right border
        if idx % chunk_size == chunk_size - 1:
            min_so_far = MAX_INT_64
        value = values[idx]
        # In contrast to the forward implementation '<=' is used
        # to ensure that the leftmost argument is selected at ties
        if value <= min_so_far:
            argmin_so_far = idx
            min_so_far = value
        argmins[idx] = argmin_so_far

    return argmins