biotite 1.5.0__cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (354) hide show
  1. biotite/__init__.py +18 -0
  2. biotite/application/__init__.py +69 -0
  3. biotite/application/application.py +276 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +500 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +92 -0
  8. biotite/application/blast/webapp.py +428 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +223 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +216 -0
  13. biotite/application/localapp.py +342 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +116 -0
  16. biotite/application/msaapp.py +363 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +227 -0
  19. biotite/application/muscle/app5.py +163 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +447 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +199 -0
  24. biotite/application/util.py +77 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +310 -0
  27. biotite/application/viennarna/rnafold.py +254 -0
  28. biotite/application/viennarna/rnaplot.py +208 -0
  29. biotite/application/viennarna/util.py +77 -0
  30. biotite/application/webapp.py +76 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/afdb/__init__.py +12 -0
  34. biotite/database/afdb/download.py +197 -0
  35. biotite/database/entrez/__init__.py +15 -0
  36. biotite/database/entrez/check.py +60 -0
  37. biotite/database/entrez/dbnames.py +101 -0
  38. biotite/database/entrez/download.py +228 -0
  39. biotite/database/entrez/key.py +44 -0
  40. biotite/database/entrez/query.py +263 -0
  41. biotite/database/error.py +16 -0
  42. biotite/database/pubchem/__init__.py +21 -0
  43. biotite/database/pubchem/download.py +258 -0
  44. biotite/database/pubchem/error.py +30 -0
  45. biotite/database/pubchem/query.py +819 -0
  46. biotite/database/pubchem/throttle.py +98 -0
  47. biotite/database/rcsb/__init__.py +13 -0
  48. biotite/database/rcsb/download.py +161 -0
  49. biotite/database/rcsb/query.py +963 -0
  50. biotite/database/uniprot/__init__.py +13 -0
  51. biotite/database/uniprot/check.py +40 -0
  52. biotite/database/uniprot/download.py +126 -0
  53. biotite/database/uniprot/query.py +292 -0
  54. biotite/file.py +244 -0
  55. biotite/interface/__init__.py +19 -0
  56. biotite/interface/openmm/__init__.py +20 -0
  57. biotite/interface/openmm/state.py +93 -0
  58. biotite/interface/openmm/system.py +227 -0
  59. biotite/interface/pymol/__init__.py +201 -0
  60. biotite/interface/pymol/cgo.py +346 -0
  61. biotite/interface/pymol/convert.py +185 -0
  62. biotite/interface/pymol/display.py +267 -0
  63. biotite/interface/pymol/object.py +1228 -0
  64. biotite/interface/pymol/shapes.py +178 -0
  65. biotite/interface/pymol/startup.py +169 -0
  66. biotite/interface/rdkit/__init__.py +19 -0
  67. biotite/interface/rdkit/mol.py +490 -0
  68. biotite/interface/version.py +94 -0
  69. biotite/interface/warning.py +19 -0
  70. biotite/sequence/__init__.py +84 -0
  71. biotite/sequence/align/__init__.py +199 -0
  72. biotite/sequence/align/alignment.py +702 -0
  73. biotite/sequence/align/banded.cpython-313-x86_64-linux-gnu.so +0 -0
  74. biotite/sequence/align/banded.pyx +652 -0
  75. biotite/sequence/align/buckets.py +71 -0
  76. biotite/sequence/align/cigar.py +425 -0
  77. biotite/sequence/align/kmeralphabet.cpython-313-x86_64-linux-gnu.so +0 -0
  78. biotite/sequence/align/kmeralphabet.pyx +595 -0
  79. biotite/sequence/align/kmersimilarity.cpython-313-x86_64-linux-gnu.so +0 -0
  80. biotite/sequence/align/kmersimilarity.pyx +233 -0
  81. biotite/sequence/align/kmertable.cpython-313-x86_64-linux-gnu.so +0 -0
  82. biotite/sequence/align/kmertable.pyx +3411 -0
  83. biotite/sequence/align/localgapped.cpython-313-x86_64-linux-gnu.so +0 -0
  84. biotite/sequence/align/localgapped.pyx +892 -0
  85. biotite/sequence/align/localungapped.cpython-313-x86_64-linux-gnu.so +0 -0
  86. biotite/sequence/align/localungapped.pyx +279 -0
  87. biotite/sequence/align/matrix.py +631 -0
  88. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  89. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  93. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  94. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  95. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  96. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  97. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  98. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  99. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  100. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  101. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  102. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  103. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  104. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  105. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  106. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  107. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  108. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  109. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  110. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  111. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  112. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  113. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  114. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  115. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  116. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  117. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  118. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  119. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  120. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  121. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  122. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  154. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  155. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  156. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  157. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  158. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  159. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  160. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  161. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  162. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  163. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  164. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  165. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  166. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  167. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  168. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  169. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  170. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  171. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  172. biotite/sequence/align/matrix_data/PB.license +21 -0
  173. biotite/sequence/align/matrix_data/PB.mat +18 -0
  174. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  175. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  176. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  177. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  178. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  179. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  180. biotite/sequence/align/multiple.cpython-313-x86_64-linux-gnu.so +0 -0
  181. biotite/sequence/align/multiple.pyx +619 -0
  182. biotite/sequence/align/pairwise.cpython-313-x86_64-linux-gnu.so +0 -0
  183. biotite/sequence/align/pairwise.pyx +585 -0
  184. biotite/sequence/align/permutation.cpython-313-x86_64-linux-gnu.so +0 -0
  185. biotite/sequence/align/permutation.pyx +313 -0
  186. biotite/sequence/align/primes.txt +821 -0
  187. biotite/sequence/align/selector.cpython-313-x86_64-linux-gnu.so +0 -0
  188. biotite/sequence/align/selector.pyx +954 -0
  189. biotite/sequence/align/statistics.py +264 -0
  190. biotite/sequence/align/tracetable.cpython-313-x86_64-linux-gnu.so +0 -0
  191. biotite/sequence/align/tracetable.pxd +64 -0
  192. biotite/sequence/align/tracetable.pyx +370 -0
  193. biotite/sequence/alphabet.py +555 -0
  194. biotite/sequence/annotation.py +836 -0
  195. biotite/sequence/codec.cpython-313-x86_64-linux-gnu.so +0 -0
  196. biotite/sequence/codec.pyx +155 -0
  197. biotite/sequence/codon.py +476 -0
  198. biotite/sequence/codon_tables.txt +202 -0
  199. biotite/sequence/graphics/__init__.py +33 -0
  200. biotite/sequence/graphics/alignment.py +1101 -0
  201. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  202. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  203. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  204. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  205. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  206. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  207. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  208. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  209. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  210. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  211. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  212. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  213. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  214. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  215. biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
  216. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  217. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  218. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  219. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  220. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  221. biotite/sequence/graphics/colorschemes.py +170 -0
  222. biotite/sequence/graphics/dendrogram.py +231 -0
  223. biotite/sequence/graphics/features.py +544 -0
  224. biotite/sequence/graphics/logo.py +102 -0
  225. biotite/sequence/graphics/plasmid.py +712 -0
  226. biotite/sequence/io/__init__.py +12 -0
  227. biotite/sequence/io/fasta/__init__.py +22 -0
  228. biotite/sequence/io/fasta/convert.py +283 -0
  229. biotite/sequence/io/fasta/file.py +265 -0
  230. biotite/sequence/io/fastq/__init__.py +19 -0
  231. biotite/sequence/io/fastq/convert.py +117 -0
  232. biotite/sequence/io/fastq/file.py +507 -0
  233. biotite/sequence/io/genbank/__init__.py +17 -0
  234. biotite/sequence/io/genbank/annotation.py +269 -0
  235. biotite/sequence/io/genbank/file.py +573 -0
  236. biotite/sequence/io/genbank/metadata.py +336 -0
  237. biotite/sequence/io/genbank/sequence.py +173 -0
  238. biotite/sequence/io/general.py +201 -0
  239. biotite/sequence/io/gff/__init__.py +26 -0
  240. biotite/sequence/io/gff/convert.py +128 -0
  241. biotite/sequence/io/gff/file.py +449 -0
  242. biotite/sequence/phylo/__init__.py +36 -0
  243. biotite/sequence/phylo/nj.cpython-313-x86_64-linux-gnu.so +0 -0
  244. biotite/sequence/phylo/nj.pyx +221 -0
  245. biotite/sequence/phylo/tree.cpython-313-x86_64-linux-gnu.so +0 -0
  246. biotite/sequence/phylo/tree.pyx +1169 -0
  247. biotite/sequence/phylo/upgma.cpython-313-x86_64-linux-gnu.so +0 -0
  248. biotite/sequence/phylo/upgma.pyx +164 -0
  249. biotite/sequence/profile.py +561 -0
  250. biotite/sequence/search.py +117 -0
  251. biotite/sequence/seqtypes.py +720 -0
  252. biotite/sequence/sequence.py +373 -0
  253. biotite/setup_ccd.py +197 -0
  254. biotite/structure/__init__.py +135 -0
  255. biotite/structure/alphabet/__init__.py +25 -0
  256. biotite/structure/alphabet/encoder.py +332 -0
  257. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  258. biotite/structure/alphabet/i3d.py +109 -0
  259. biotite/structure/alphabet/layers.py +86 -0
  260. biotite/structure/alphabet/pb.license +21 -0
  261. biotite/structure/alphabet/pb.py +170 -0
  262. biotite/structure/alphabet/unkerasify.py +128 -0
  263. biotite/structure/atoms.py +1562 -0
  264. biotite/structure/basepairs.py +1403 -0
  265. biotite/structure/bonds.cpython-313-x86_64-linux-gnu.so +0 -0
  266. biotite/structure/bonds.pyx +2036 -0
  267. biotite/structure/box.py +724 -0
  268. biotite/structure/celllist.cpython-313-x86_64-linux-gnu.so +0 -0
  269. biotite/structure/celllist.pyx +864 -0
  270. biotite/structure/chains.py +310 -0
  271. biotite/structure/charges.cpython-313-x86_64-linux-gnu.so +0 -0
  272. biotite/structure/charges.pyx +520 -0
  273. biotite/structure/compare.py +683 -0
  274. biotite/structure/density.py +109 -0
  275. biotite/structure/dotbracket.py +213 -0
  276. biotite/structure/error.py +39 -0
  277. biotite/structure/filter.py +591 -0
  278. biotite/structure/geometry.py +817 -0
  279. biotite/structure/graphics/__init__.py +13 -0
  280. biotite/structure/graphics/atoms.py +243 -0
  281. biotite/structure/graphics/rna.py +298 -0
  282. biotite/structure/hbond.py +425 -0
  283. biotite/structure/info/__init__.py +24 -0
  284. biotite/structure/info/atom_masses.json +121 -0
  285. biotite/structure/info/atoms.py +98 -0
  286. biotite/structure/info/bonds.py +149 -0
  287. biotite/structure/info/ccd.py +200 -0
  288. biotite/structure/info/components.bcif +0 -0
  289. biotite/structure/info/groups.py +128 -0
  290. biotite/structure/info/masses.py +121 -0
  291. biotite/structure/info/misc.py +137 -0
  292. biotite/structure/info/radii.py +267 -0
  293. biotite/structure/info/standardize.py +185 -0
  294. biotite/structure/integrity.py +213 -0
  295. biotite/structure/io/__init__.py +29 -0
  296. biotite/structure/io/dcd/__init__.py +13 -0
  297. biotite/structure/io/dcd/file.py +67 -0
  298. biotite/structure/io/general.py +243 -0
  299. biotite/structure/io/gro/__init__.py +14 -0
  300. biotite/structure/io/gro/file.py +343 -0
  301. biotite/structure/io/mol/__init__.py +20 -0
  302. biotite/structure/io/mol/convert.py +112 -0
  303. biotite/structure/io/mol/ctab.py +420 -0
  304. biotite/structure/io/mol/header.py +120 -0
  305. biotite/structure/io/mol/mol.py +149 -0
  306. biotite/structure/io/mol/sdf.py +940 -0
  307. biotite/structure/io/netcdf/__init__.py +13 -0
  308. biotite/structure/io/netcdf/file.py +64 -0
  309. biotite/structure/io/pdb/__init__.py +20 -0
  310. biotite/structure/io/pdb/convert.py +389 -0
  311. biotite/structure/io/pdb/file.py +1380 -0
  312. biotite/structure/io/pdb/hybrid36.cpython-313-x86_64-linux-gnu.so +0 -0
  313. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  314. biotite/structure/io/pdbqt/__init__.py +15 -0
  315. biotite/structure/io/pdbqt/convert.py +113 -0
  316. biotite/structure/io/pdbqt/file.py +688 -0
  317. biotite/structure/io/pdbx/__init__.py +23 -0
  318. biotite/structure/io/pdbx/bcif.py +674 -0
  319. biotite/structure/io/pdbx/cif.py +1091 -0
  320. biotite/structure/io/pdbx/component.py +251 -0
  321. biotite/structure/io/pdbx/compress.py +362 -0
  322. biotite/structure/io/pdbx/convert.py +2113 -0
  323. biotite/structure/io/pdbx/encoding.cpython-313-x86_64-linux-gnu.so +0 -0
  324. biotite/structure/io/pdbx/encoding.pyx +1078 -0
  325. biotite/structure/io/trajfile.py +696 -0
  326. biotite/structure/io/trr/__init__.py +13 -0
  327. biotite/structure/io/trr/file.py +43 -0
  328. biotite/structure/io/util.py +38 -0
  329. biotite/structure/io/xtc/__init__.py +13 -0
  330. biotite/structure/io/xtc/file.py +43 -0
  331. biotite/structure/mechanics.py +72 -0
  332. biotite/structure/molecules.py +337 -0
  333. biotite/structure/pseudoknots.py +622 -0
  334. biotite/structure/rdf.py +245 -0
  335. biotite/structure/repair.py +302 -0
  336. biotite/structure/residues.py +716 -0
  337. biotite/structure/rings.py +451 -0
  338. biotite/structure/sasa.cpython-313-x86_64-linux-gnu.so +0 -0
  339. biotite/structure/sasa.pyx +322 -0
  340. biotite/structure/segments.py +328 -0
  341. biotite/structure/sequence.py +110 -0
  342. biotite/structure/spacegroups.json +1567 -0
  343. biotite/structure/spacegroups.license +26 -0
  344. biotite/structure/sse.py +306 -0
  345. biotite/structure/superimpose.py +511 -0
  346. biotite/structure/tm.py +581 -0
  347. biotite/structure/transform.py +736 -0
  348. biotite/structure/util.py +160 -0
  349. biotite/version.py +34 -0
  350. biotite/visualize.py +375 -0
  351. biotite-1.5.0.dist-info/METADATA +162 -0
  352. biotite-1.5.0.dist-info/RECORD +354 -0
  353. biotite-1.5.0.dist-info/WHEEL +6 -0
  354. biotite-1.5.0.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,595 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence.align"
6
+ __author__ = "Patrick Kunzmann"
7
+ __all__ = ["KmerAlphabet"]
8
+
9
+ cimport cython
10
+ cimport numpy as np
11
+
12
+ import numpy as np
13
+ from ..alphabet import Alphabet, LetterAlphabet, AlphabetError
14
+
15
+
16
+ ctypedef np.uint8_t uint8
17
+ ctypedef np.uint16_t uint16
18
+ ctypedef np.uint32_t uint32
19
+ ctypedef np.uint64_t uint64
20
+ ctypedef np.int64_t int64
21
+
22
+
23
+ ctypedef fused CodeType:
24
+ uint8
25
+ uint16
26
+ uint32
27
+ uint64
28
+
29
+
30
+ class KmerAlphabet(Alphabet):
31
+ """
32
+ __init__(base_alphabet, k, spacing=None)
33
+
34
+ This type of alphabet uses *k-mers* as symbols, i.e. all
35
+ combinations of *k* symbols from its *base alphabet*.
36
+
37
+ Its primary use is its :meth:`create_kmers()` method, that iterates
38
+ over all overlapping *k-mers* in a :class:`Sequence` and encodes
39
+ each one into its corresponding *k-mer* symbol code
40
+ (*k-mer* code in short).
41
+ This functionality is prominently used by a :class:`KmerTable` to
42
+ find *k-mer* matches between two sequences.
43
+
44
+ A :class:`KmerAlphabet` has :math:`n^k` different symbols, where
45
+ :math:`n` is the number of symbols in the base alphabet.
46
+
47
+ Parameters
48
+ ----------
49
+ base_alphabet : Alphabet
50
+ The base alphabet.
51
+ The created :class:`KmerAlphabet` contains all combinations of
52
+ *k* symbols from this alphabet.
53
+ k : int
54
+ An integer greater than 1 that defines the length of the
55
+ *k-mers*.
56
+ spacing : None or str or list or ndarray, dtype=int, shape=(k,)
57
+ If provided, spaced *k-mers* are used instead of continuous
58
+ ones :footcite:`Ma2002`.
59
+ The value contains the *informative* positions relative to the
60
+ start of the *k-mer*, also called the *model*.
61
+ The number of *informative* positions must equal *k*.
62
+
63
+ If a string is given, each ``'1'`` in the string indicates an
64
+ *informative* position.
65
+ For a continuous *k-mer* the `spacing` would be ``'111...'``.
66
+
67
+ If a list or array is given, it must contain unique non-negative
68
+ integers, that indicate the *informative* positions.
69
+ For a continuous *k-mer* the `spacing` would be
70
+ ``[0, 1, 2,...]``.
71
+
72
+ Attributes
73
+ ----------
74
+ base_alphabet : Alphabet
75
+ The base alphabet, from which the :class:`KmerAlphabet` was
76
+ created.
77
+ k : int
78
+ The length of the *k-mers*.
79
+ spacing : None or ndarray, dtype=int
80
+ The *k-mer* model in array form, if spaced *k-mers* are used,
81
+ ``None`` otherwise.
82
+
83
+ Notes
84
+ -----
85
+ The symbol code :math:`c` for a *k-mer* :math:`s` calculates as
86
+
87
+ .. math:: c = \sum_{i=0}^{k-1} n^{k-i-1} s_i
88
+
89
+ where :math:`n` is the length of the base alphabet.
90
+
91
+ Hence the :class:`KmerAlphabet` sorts *k-mers* in the order of the
92
+ base alphabet, where leading positions within the *k-mer* take
93
+ precedence.
94
+
95
+ References
96
+ ----------
97
+
98
+ .. footbibliography::
99
+
100
+ Examples
101
+ --------
102
+ Create an alphabet of nucleobase *2-mers*:
103
+
104
+ >>> base_alphabet = NucleotideSequence.unambiguous_alphabet()
105
+ >>> print(base_alphabet.get_symbols())
106
+ ('A', 'C', 'G', 'T')
107
+ >>> kmer_alphabet = KmerAlphabet(base_alphabet, 2)
108
+ >>> print(kmer_alphabet.get_symbols())
109
+ ('AA', 'AC', 'AG', 'AT', 'CA', 'CC', 'CG', 'CT', 'GA', 'GC', 'GG', 'GT', 'TA', 'TC', 'TG', 'TT')
110
+
111
+ Encode and decode *k-mers*:
112
+
113
+ >>> print(kmer_alphabet.encode("TC"))
114
+ 13
115
+ >>> print(kmer_alphabet.decode(13))
116
+ ['T' 'C']
117
+
118
+ Fuse symbol codes from the base alphabet into a *k-mer* code
119
+ and split the *k-mer* code back into the original symbol codes:
120
+
121
+ >>> symbol_codes = base_alphabet.encode_multiple("TC")
122
+ >>> print(symbol_codes)
123
+ [3 1]
124
+ >>> print(kmer_alphabet.fuse(symbol_codes))
125
+ 13
126
+ >>> print(kmer_alphabet.split(13))
127
+ [3 1]
128
+
129
+ Encode all overlapping continuous k-mers of a sequence:
130
+
131
+ >>> sequence = NucleotideSequence("ATTGCT")
132
+ >>> kmer_codes = kmer_alphabet.create_kmers(sequence.code)
133
+ >>> print(kmer_codes)
134
+ [ 3 15 14 9 7]
135
+ >>> print(["".join(kmer) for kmer in kmer_alphabet.decode_multiple(kmer_codes)])
136
+ ['AT', 'TT', 'TG', 'GC', 'CT']
137
+
138
+ Encode all overlapping k-mers using spacing:
139
+
140
+ >>> base_alphabet = ProteinSequence.alphabet
141
+ >>> kmer_alphabet = KmerAlphabet(base_alphabet, 3, spacing="1101")
142
+ >>> sequence = ProteinSequence("BIQTITE")
143
+ >>> kmer_codes = kmer_alphabet.create_kmers(sequence.code)
144
+ >>> # Pretty print k-mers
145
+ >>> strings = ["".join(kmer) for kmer in kmer_alphabet.decode_multiple(kmer_codes)]
146
+ >>> print([s[0] + s[1] + "_" + s[2] for s in strings])
147
+ ['BI_T', 'IQ_I', 'QT_T', 'TI_E']
148
+ """
149
+
150
def __init__(self, base_alphabet, k, spacing=None):
    """
    Validate the arguments and precompute the radix multipliers.

    Parameters
    ----------
    base_alphabet : Alphabet
        The alphabet the *k-mers* are built from.
    k : int
        The number of informative positions, must be at least 2.
    spacing : None or str or list or ndarray, optional
        The spaced *k-mer* model.
        A string is converted by ``_to_array_form()``, any other value
        is converted into a sorted ``int64`` array.

    Raises
    ------
    TypeError
        If `base_alphabet` is not an :class:`Alphabet`.
    ValueError
        If `k` is smaller than 2, if a non-string `spacing` contains
        negative or duplicate values or if the number of positions in
        `spacing` is not equal to `k`.
    """
    if not isinstance(base_alphabet, Alphabet):
        raise TypeError(
            f"Got {type(base_alphabet).__name__}, "
            f"but Alphabet was expected"
        )
    if k < 2:
        raise ValueError("k must be at least 2")
    self._base_alph = base_alphabet
    self._k = k

    # Positional weights of the k-mer code:
    # the leading position gets the highest power of the alphabet length
    n_symbols = len(base_alphabet)
    self._radix_multiplier = np.array(
        [n_symbols**exponent for exponent in range(k - 1, -1, -1)],
        dtype=np.int64
    )

    if spacing is None:
        # Continuous k-mers: there is no model to validate
        self._spacing = None
        return

    if isinstance(spacing, str):
        self._spacing = _to_array_form(spacing)
    else:
        self._spacing = np.array(spacing, dtype=np.int64)
        self._spacing.sort()
        if (self._spacing < 0).any():
            raise ValueError(
                "Only non-negative integers are allowed for spacing"
            )
        n_distinct = len(np.unique(self._spacing))
        if n_distinct != len(self._spacing):
            raise ValueError(
                "Spacing model contains duplicate values"
            )

    if len(self._spacing) != self._k:
        raise ValueError(
            f"Expected {self._k} informative positions, "
            f"but got {len(self._spacing)} positions in spacing"
        )
190
+
191
+
192
@property
def base_alphabet(self):
    """
    The base alphabet, from which this :class:`KmerAlphabet` was
    created.
    """
    alphabet = self._base_alph
    return alphabet
195
+
196
@property
def k(self):
    """
    The length of the *k-mers*.
    """
    kmer_length = self._k
    return kmer_length
199
+
200
@property
def spacing(self):
    """
    A copy of the *k-mer* model in array form, if spaced *k-mers* are
    used, ``None`` otherwise.
    """
    model = self._spacing
    if model is None:
        return None
    # Hand out a copy, so the internal model cannot be altered
    return model.copy()
203
+
204
+
205
def get_symbols(self):
    """
    get_symbols()

    Get the symbols in the alphabet.

    Returns
    -------
    symbols : tuple
        A tuple of all *k-mer* symbols, i.e. all possible
        combinations of *k* symbols from its *base alphabet*.
        Each *k-mer* is a string, if the base alphabet is a
        :class:`LetterAlphabet`, and a list of symbols otherwise.

    Notes
    -----
    In contrast to the base :class:`Alphabet` and
    :class:`LetterAlphabet` class, :class:`KmerAlphabet` does not
    hold a list of its symbols internally for performance reasons.
    Hence calling :meth:`get_symbols()` may be quite time consuming
    for large base alphabets or large *k* values, as the list needs
    to be created first.
    """
    all_codes = range(len(self))
    if isinstance(self._base_alph, LetterAlphabet):
        # Single-letter symbols are concatenated into one string per k-mer
        return tuple("".join(self.decode(code)) for code in all_codes)
    return tuple(list(self.decode(code)) for code in all_codes)
230
+
231
+
232
def extends(self, alphabet):
    """
    Check whether this alphabet extends the given alphabet.

    Parameters
    ----------
    alphabet : Alphabet
        The alphabet to compare with.

    Returns
    -------
    result : bool
        The result of the equality comparison ``alphabet == self``.
    """
    # A KmerAlphabet cannot really extend another KmerAlphabet:
    # - If k differs, none of the symbols are equal
    # - If the base alphabet has additional symbols, the correct
    #   order is not preserved
    # Hence 'extending' is only possible, if both alphabets are equal
    is_equal = alphabet == self
    return is_equal
240
+
241
+
242
def encode(self, symbol):
    """
    Encode a *k-mer* symbol into its *k-mer* code.

    The symbol is first encoded with the base alphabet and the
    resulting symbol codes are fused via :meth:`fuse()`.
    """
    base_codes = self._base_alph.encode_multiple(symbol)
    return self.fuse(base_codes)
244
+
245
+
246
def decode(self, code):
    """
    Decode a *k-mer* code into its *k-mer* symbol.

    The code is first split via :meth:`split()` and the resulting
    symbol codes are decoded with the base alphabet.
    """
    base_codes = self.split(code)
    return self._base_alph.decode_multiple(base_codes)
248
+
249
+
250
def fuse(self, codes):
    """
    fuse(codes)

    Get the *k-mer* code for *k* symbol codes from the base
    alphabet.

    This method can be used in a vectorized manner to obtain
    *n* *k-mer* codes from an *(n,k)* integer array.

    Parameters
    ----------
    codes : ndarray, dtype=int, shape=(k,) or shape=(n,k)
        The symbol codes from the base alphabet to be fused.

    Returns
    -------
    kmer_codes : int or ndarray, dtype=np.int64, shape=(n,)
        The fused *k-mer* code(s).

    Raises
    ------
    AlphabetError
        If the last dimension of `codes` does not have the length
        *k* or if any symbol code is negative or not smaller than
        the length of the base alphabet.

    See Also
    --------
    split
        The reverse operation.

    Examples
    --------

    >>> base_alphabet = NucleotideSequence.unambiguous_alphabet()
    >>> kmer_alphabet = KmerAlphabet(base_alphabet, 2)
    >>> symbol_codes = base_alphabet.encode_multiple("TC")
    >>> print(symbol_codes)
    [3 1]
    >>> print(kmer_alphabet.fuse(symbol_codes))
    13
    >>> print(kmer_alphabet.split(13))
    [3 1]
    """
    codes = np.asarray(codes)
    if codes.shape[-1] != self._k:
        raise AlphabetError(
            f"Given k-mer(s) has {codes.shape[-1]} symbols, "
            f"but alphabet expects {self._k}-mers"
        )
    # Valid symbol codes are in the range 0 <= code < len(base alphabet),
    # i.e. a code equal to the alphabet length is already invalid
    if np.any(codes >= len(self._base_alph)) or np.any(codes < 0):
        raise AlphabetError("Given k-mer(s) contains invalid symbol code")

    orig_shape = codes.shape
    # Convert to int64 before multiplication:
    # NumPy would promote 'uint64 * int64' to float64 otherwise,
    # e.g. for the uint64 arrays returned by split()
    codes = np.atleast_2d(codes).astype(np.int64, copy=False)
    kmer_code = np.sum(self._radix_multiplier * codes, axis=-1)
    # The last dimension is removed since it collapsed in np.sum
    return kmer_code.reshape(orig_shape[:-1])
301
+
302
def split(self, kmer_code):
    """
    split(kmer_code)

    Convert a *k-mer* code back into *k* symbol codes from the base
    alphabet.

    This method can be used in a vectorized manner to split
    *n* *k-mer* codes into an *(n,k)* integer array.

    Parameters
    ----------
    kmer_code : int or ndarray, dtype=int, shape=(n,)
        The *k-mer* code(s).

    Returns
    -------
    codes : ndarray, dtype=np.uint64, shape=(k,) or shape=(n,k)
        The split symbol codes from the base alphabet.

    Raises
    ------
    AlphabetError
        If any given *k-mer* code is negative or not smaller than the
        length of this alphabet.

    See Also
    --------
    fuse
        The reverse operation.

    Examples
    --------

    >>> base_alphabet = NucleotideSequence.unambiguous_alphabet()
    >>> kmer_alphabet = KmerAlphabet(base_alphabet, 2)
    >>> symbol_codes = base_alphabet.encode_multiple("TC")
    >>> print(symbol_codes)
    [3 1]
    >>> print(kmer_alphabet.fuse(symbol_codes))
    13
    >>> print(kmer_alphabet.split(13))
    [3 1]
    """
    out_of_range = np.any(kmer_code >= len(self)) or np.any(kmer_code < 0)
    if out_of_range:
        raise AlphabetError(
            "Given k-mer symbol code is invalid for this alphabet"
        )

    # _split() operates on a 1D int64 array,
    # so a scalar input is temporarily wrapped into an array
    flat_codes = np.atleast_1d(kmer_code).astype(np.int64, copy=False)
    symbol_codes = self._split(flat_codes)
    # Restore the input shape and append the k-mer dimension
    return symbol_codes.reshape(np.shape(kmer_code) + (self._k,))
350
+
351
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
def _split(self, int64[:] codes not None):
    """
    Split each *k-mer* code in `codes` into its *k* symbol codes.

    Bounds checking is disabled for this function and the codes are
    not validated here.
    NOTE(review): callers are presumably expected to pass only valid,
    non-negative *k-mer* codes, as :meth:`split()` does.

    Parameters
    ----------
    codes : ndarray, dtype=np.int64, shape=(n,)
        The *k-mer* codes to be split.

    Returns
    -------
    split_codes : ndarray, dtype=np.uint64, shape=(n,k)
        The symbol codes from the base alphabet for each *k-mer* code.
    """
    # The index over 'codes' is 64 bit wide, as the array length is not
    # limited to the range of a C int
    cdef int64 i
    cdef int n
    cdef int64 code, val, symbol_code

    cdef int64[:] radix_multiplier = self._radix_multiplier

    cdef uint64[:,:] split_codes = np.empty(
        (codes.shape[0], self._k), dtype=np.uint64
    )

    cdef int k = self._k
    for i in range(codes.shape[0]):
        code = codes[i]
        for n in range(k):
            # Extract the digit at position n of the base-n representation
            # and continue with the remainder
            val = radix_multiplier[n]
            symbol_code = code // val
            split_codes[i,n] = symbol_code
            code -= symbol_code * val

    return np.asarray(split_codes)
374
+
375
+
376
def kmer_array_length(self, int64 length):
    """
    kmer_array_length(length)

    Get the length of the *k-mer* array, created by
    :meth:`create_kmers()`, if a sequence of size `length` would be
    given.

    Parameters
    ----------
    length : int
        The length of the hypothetical sequence

    Returns
    -------
    kmer_length : int
        The length of created *k-mer* array.
    """
    cdef int64 max_offset
    cdef int64[:] spacing

    if self._spacing is not None:
        # A spaced k-mer spans up to its last informative position
        spacing = self._spacing
        max_offset = spacing[spacing.shape[0] - 1] + 1
        return length - max_offset + 1
    # A continuous k-mer spans exactly k positions
    return length - self._k + 1
403
+
404
+
405
def create_kmers(self, seq_code):
    """
    create_kmers(seq_code)

    Decompose the given sequence code into the codes of all
    overlapping *k-mers* it contains.

    Depending on whether this alphabet has a spacing model, either
    the continuous or the spaced implementation is used.

    Parameters
    ----------
    seq_code : ndarray, dtype={np.uint8, np.uint16, np.uint32, np.uint64}
        The sequence code to be converted into *k-mers*.

    Returns
    -------
    kmer_codes : ndarray, dtype=int64
        The symbol codes for the *k-mers*.

    Examples
    --------

    >>> base_alphabet = NucleotideSequence.unambiguous_alphabet()
    >>> kmer_alphabet = KmerAlphabet(base_alphabet, 2)
    >>> sequence = NucleotideSequence("ATTGCT")
    >>> kmer_codes = kmer_alphabet.create_kmers(sequence.code)
    >>> print(kmer_codes)
    [ 3 15 14  9  7]
    >>> print(["".join(kmer) for kmer in kmer_alphabet.decode_multiple(kmer_codes)])
    ['AT', 'TT', 'TG', 'GC', 'CT']
    """
    # Select the implementation first, then run it on the input
    decompose = (
        self._create_continuous_kmers if self._spacing is None
        else self._create_spaced_kmers
    )
    return decompose(seq_code)
438
+
439
@cython.boundscheck(False)
@cython.wraparound(False)
def _create_continuous_kmers(self, CodeType[:] seq_code not None):
    """
    Rolling implementation of the *k-mer* decomposition for
    alphabets without spacing model.

    Only the first *k-mer* is computed symbol by symbol.
    Every following *k-mer* is derived from its predecessor:
    the leading symbol is removed, the remaining symbols are shifted
    by one position and the next sequence symbol is appended.
    Hence, a single pass over the sequence is sufficient.

    Raises
    ------
    ValueError
        If the sequence code contains less than `k` symbols.
    AlphabetError
        If a symbol code is not smaller than the length of the base
        alphabet.
    """
    cdef int64 pos

    cdef int k = self._k
    cdef uint64 n_symbols = len(self._base_alph)
    cdef int64[:] weights = self._radix_multiplier
    # Weight of the first position within a k-mer
    cdef int64 leading_weight = n_symbols**(k-1)

    if len(seq_code) < <unsigned int>k:
        raise ValueError(
            "The length of the sequence code is shorter than k"
        )

    cdef int64[:] kmers = np.empty(
        self.kmer_array_length(len(seq_code)), dtype=np.int64
    )

    cdef CodeType code
    cdef int64 current, previous

    # The first k-mer has no predecessor:
    # Sum up the weighted symbol codes directly
    current = 0
    for pos in range(k):
        code = seq_code[pos]
        if code >= n_symbols:
            raise AlphabetError(f"Symbol code {code} is out of range")
        current += weights[pos] * code
    kmers[0] = current

    # All other k-mers are updated from the preceding k-mer
    previous = current
    for pos in range(1, kmers.shape[0]):
        # The symbol entering the k-mer window
        code = seq_code[pos + k - 1]
        if code >= n_symbols:
            raise AlphabetError(f"Symbol code {code} is out of range")
        current = (
            (
                # Drop the symbol leaving the window...
                (previous - seq_code[pos - 1] * leading_weight)
                # ...move the remaining symbols one position up...
                * n_symbols
            )
            # ...and put the entering symbol at the last position
            + code
        )
        kmers[pos] = current
        previous = current

    return np.asarray(kmers)
495
+
496
@cython.boundscheck(False)
@cython.wraparound(False)
def _create_spaced_kmers(self, CodeType[:] seq_code not None):
    """
    Implementation of the *k-mer* decomposition for alphabets with
    a spacing model.

    Each *k-mer* is computed independently: for every start position
    the symbol codes at the offsets given by the spacing model are
    multiplied with their radix multiplier and summed up.

    Raises
    ------
    ValueError
        If the sequence code is shorter than the span of a *k-mer*.
    AlphabetError
        If a symbol code is not smaller than the length of the base
        alphabet.
    """
    cdef int64 start, pos

    cdef int k = self._k
    cdef int64[:] model = self._spacing
    # The number of sequence positions covered by one k-mer is
    # determined by the last entry of the spacing model
    cdef int64 span = model[len(model)-1] + 1
    cdef uint64 n_symbols = len(self._base_alph)
    cdef int64[:] weights = self._radix_multiplier

    if len(seq_code) < <unsigned int>span:
        raise ValueError(
            "The length of the sequence code is shorter "
            "than the k-mer span"
        )

    cdef int64[:] kmers = np.empty(
        self.kmer_array_length(len(seq_code)), dtype=np.int64
    )

    cdef CodeType code
    cdef int64 current
    cdef int64 shift
    for start in range(kmers.shape[0]):
        current = 0
        for pos in range(k):
            # Offset of this k-mer position relative to the start
            shift = model[pos]
            code = seq_code[start + shift]
            if code >= n_symbols:
                raise AlphabetError(f"Symbol code {code} is out of range")
            current += weights[pos] * code
        kmers[start] = current

    return np.asarray(kmers)
533
+
534
+
535
def __str__(self):
    """
    Get the string representation of the symbols returned by
    :meth:`get_symbols()`.
    """
    symbols = self.get_symbols()
    return str(symbols)
537
+
538
+
539
def __repr__(self):
    """
    Represent the alphabet as ``KmerAlphabet(base_alphabet, k, spacing)``
    using the ``repr()`` of the base alphabet and the spacing.
    """
    return (
        f"KmerAlphabet({self._base_alph!r}, "
        f"{self._k}, {self._spacing!r})"
    )
542
+
543
+
544
def __eq__(self, item):
    """
    Two *k-mer* alphabets are equal, if they have an equal base
    alphabet, the same `k` and the same spacing (or both have no
    spacing at all).
    Objects that are not a :class:`KmerAlphabet` are never equal.
    """
    if item is self:
        return True
    if not isinstance(item, KmerAlphabet):
        return False
    if self._base_alph != item._base_alph or self._k != item._k:
        return False

    if self._spacing is None:
        # Continuous k-mers only match continuous k-mers
        return item._spacing is None
    # A single deviating position makes the spacing models unequal
    return not np.any(self._spacing != item._spacing)
561
+
562
+
563
def __hash__(self):
    """
    Hash the alphabet based on the same attributes that are compared
    in ``__eq__()``: the base alphabet, `k` and the spacing.

    Returns
    -------
    hash_value : int
        The hash of the tuple ``(base_alphabet, k, spacing)``, where
        the spacing is either ``None`` or a tuple of integers.
    """
    # Alphabets for continuous k-mers have no spacing array,
    # so there is nothing to call 'tolist()' on
    if self._spacing is None:
        spacing = None
    else:
        # Arrays are not hashable -> convert into a tuple
        spacing = tuple(self._spacing.tolist())
    return hash((self._base_alph, self._k, spacing))
565
+
566
+
567
def __len__(self):
    """
    Get the number of *k-mers* in this alphabet, i.e. the length of
    the base alphabet to the power of `k`.
    """
    base_size = len(self._base_alph)
    return int(base_size ** self._k)
569
+
570
+
571
def __iter__(self):
    """
    Iterate over all *k-mer* symbols in the order of their code.

    The symbols are decoded lazily by a generator, as creating all
    symbols at once is expensive.
    For a :class:`LetterAlphabet` as base alphabet each symbol is
    a string, otherwise it is a list of base alphabet symbols.
    """
    as_string = isinstance(self._base_alph, LetterAlphabet)
    all_codes = range(len(self))
    if as_string:
        return ("".join(self.decode(code)) for code in all_codes)
    return (list(self.decode(code)) for code in all_codes)
578
+
579
+
580
def __contains__(self, symbol):
    """
    Check whether `symbol` is part of this alphabet.

    The symbol is encoded with the base alphabet and fused into a
    *k-mer* code.
    If this raises an :class:`AlphabetError`, the symbol is
    considered to be not contained.
    """
    try:
        base_codes = self._base_alph.encode_multiple(symbol)
        self.fuse(base_codes)
    except AlphabetError:
        return False
    else:
        return True
586
+
587
+
588
def _to_array_form(model_string):
    """
    Convert the common string representation of a *k-mer* spacing
    model into an array of the positions marked with ``'1'``,
    e.g. ``'1*11'`` into ``[0, 2, 3]``.
    """
    informative_positions = [
        pos for pos, char in enumerate(model_string) if char == "1"
    ]
    return np.array(informative_positions, dtype=np.int64)