biotite 1.5.0__cp314-cp314-macosx_10_13_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (354) hide show
  1. biotite/__init__.py +18 -0
  2. biotite/application/__init__.py +69 -0
  3. biotite/application/application.py +276 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +500 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +92 -0
  8. biotite/application/blast/webapp.py +428 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +223 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +216 -0
  13. biotite/application/localapp.py +342 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +116 -0
  16. biotite/application/msaapp.py +363 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +227 -0
  19. biotite/application/muscle/app5.py +163 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +447 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +199 -0
  24. biotite/application/util.py +77 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +310 -0
  27. biotite/application/viennarna/rnafold.py +254 -0
  28. biotite/application/viennarna/rnaplot.py +208 -0
  29. biotite/application/viennarna/util.py +77 -0
  30. biotite/application/webapp.py +76 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/afdb/__init__.py +12 -0
  34. biotite/database/afdb/download.py +197 -0
  35. biotite/database/entrez/__init__.py +15 -0
  36. biotite/database/entrez/check.py +60 -0
  37. biotite/database/entrez/dbnames.py +101 -0
  38. biotite/database/entrez/download.py +228 -0
  39. biotite/database/entrez/key.py +44 -0
  40. biotite/database/entrez/query.py +263 -0
  41. biotite/database/error.py +16 -0
  42. biotite/database/pubchem/__init__.py +21 -0
  43. biotite/database/pubchem/download.py +258 -0
  44. biotite/database/pubchem/error.py +30 -0
  45. biotite/database/pubchem/query.py +819 -0
  46. biotite/database/pubchem/throttle.py +98 -0
  47. biotite/database/rcsb/__init__.py +13 -0
  48. biotite/database/rcsb/download.py +161 -0
  49. biotite/database/rcsb/query.py +963 -0
  50. biotite/database/uniprot/__init__.py +13 -0
  51. biotite/database/uniprot/check.py +40 -0
  52. biotite/database/uniprot/download.py +126 -0
  53. biotite/database/uniprot/query.py +292 -0
  54. biotite/file.py +244 -0
  55. biotite/interface/__init__.py +19 -0
  56. biotite/interface/openmm/__init__.py +20 -0
  57. biotite/interface/openmm/state.py +93 -0
  58. biotite/interface/openmm/system.py +227 -0
  59. biotite/interface/pymol/__init__.py +201 -0
  60. biotite/interface/pymol/cgo.py +346 -0
  61. biotite/interface/pymol/convert.py +185 -0
  62. biotite/interface/pymol/display.py +267 -0
  63. biotite/interface/pymol/object.py +1228 -0
  64. biotite/interface/pymol/shapes.py +178 -0
  65. biotite/interface/pymol/startup.py +169 -0
  66. biotite/interface/rdkit/__init__.py +19 -0
  67. biotite/interface/rdkit/mol.py +490 -0
  68. biotite/interface/version.py +94 -0
  69. biotite/interface/warning.py +19 -0
  70. biotite/sequence/__init__.py +84 -0
  71. biotite/sequence/align/__init__.py +199 -0
  72. biotite/sequence/align/alignment.py +702 -0
  73. biotite/sequence/align/banded.cpython-314-darwin.so +0 -0
  74. biotite/sequence/align/banded.pyx +652 -0
  75. biotite/sequence/align/buckets.py +71 -0
  76. biotite/sequence/align/cigar.py +425 -0
  77. biotite/sequence/align/kmeralphabet.cpython-314-darwin.so +0 -0
  78. biotite/sequence/align/kmeralphabet.pyx +595 -0
  79. biotite/sequence/align/kmersimilarity.cpython-314-darwin.so +0 -0
  80. biotite/sequence/align/kmersimilarity.pyx +233 -0
  81. biotite/sequence/align/kmertable.cpython-314-darwin.so +0 -0
  82. biotite/sequence/align/kmertable.pyx +3411 -0
  83. biotite/sequence/align/localgapped.cpython-314-darwin.so +0 -0
  84. biotite/sequence/align/localgapped.pyx +892 -0
  85. biotite/sequence/align/localungapped.cpython-314-darwin.so +0 -0
  86. biotite/sequence/align/localungapped.pyx +279 -0
  87. biotite/sequence/align/matrix.py +631 -0
  88. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  89. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  93. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  94. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  95. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  96. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  97. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  98. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  99. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  100. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  101. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  102. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  103. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  104. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  105. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  106. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  107. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  108. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  109. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  110. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  111. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  112. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  113. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  114. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  115. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  116. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  117. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  118. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  119. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  120. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  121. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  122. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  154. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  155. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  156. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  157. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  158. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  159. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  160. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  161. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  162. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  163. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  164. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  165. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  166. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  167. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  168. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  169. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  170. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  171. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  172. biotite/sequence/align/matrix_data/PB.license +21 -0
  173. biotite/sequence/align/matrix_data/PB.mat +18 -0
  174. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  175. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  176. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  177. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  178. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  179. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  180. biotite/sequence/align/multiple.cpython-314-darwin.so +0 -0
  181. biotite/sequence/align/multiple.pyx +619 -0
  182. biotite/sequence/align/pairwise.cpython-314-darwin.so +0 -0
  183. biotite/sequence/align/pairwise.pyx +585 -0
  184. biotite/sequence/align/permutation.cpython-314-darwin.so +0 -0
  185. biotite/sequence/align/permutation.pyx +313 -0
  186. biotite/sequence/align/primes.txt +821 -0
  187. biotite/sequence/align/selector.cpython-314-darwin.so +0 -0
  188. biotite/sequence/align/selector.pyx +954 -0
  189. biotite/sequence/align/statistics.py +264 -0
  190. biotite/sequence/align/tracetable.cpython-314-darwin.so +0 -0
  191. biotite/sequence/align/tracetable.pxd +64 -0
  192. biotite/sequence/align/tracetable.pyx +370 -0
  193. biotite/sequence/alphabet.py +555 -0
  194. biotite/sequence/annotation.py +836 -0
  195. biotite/sequence/codec.cpython-314-darwin.so +0 -0
  196. biotite/sequence/codec.pyx +155 -0
  197. biotite/sequence/codon.py +476 -0
  198. biotite/sequence/codon_tables.txt +202 -0
  199. biotite/sequence/graphics/__init__.py +33 -0
  200. biotite/sequence/graphics/alignment.py +1101 -0
  201. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  202. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  203. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  204. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  205. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  206. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  207. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  208. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  209. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  210. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  211. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  212. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  213. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  214. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  215. biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
  216. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  217. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  218. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  219. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  220. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  221. biotite/sequence/graphics/colorschemes.py +170 -0
  222. biotite/sequence/graphics/dendrogram.py +231 -0
  223. biotite/sequence/graphics/features.py +544 -0
  224. biotite/sequence/graphics/logo.py +102 -0
  225. biotite/sequence/graphics/plasmid.py +712 -0
  226. biotite/sequence/io/__init__.py +12 -0
  227. biotite/sequence/io/fasta/__init__.py +22 -0
  228. biotite/sequence/io/fasta/convert.py +283 -0
  229. biotite/sequence/io/fasta/file.py +265 -0
  230. biotite/sequence/io/fastq/__init__.py +19 -0
  231. biotite/sequence/io/fastq/convert.py +117 -0
  232. biotite/sequence/io/fastq/file.py +507 -0
  233. biotite/sequence/io/genbank/__init__.py +17 -0
  234. biotite/sequence/io/genbank/annotation.py +269 -0
  235. biotite/sequence/io/genbank/file.py +573 -0
  236. biotite/sequence/io/genbank/metadata.py +336 -0
  237. biotite/sequence/io/genbank/sequence.py +173 -0
  238. biotite/sequence/io/general.py +201 -0
  239. biotite/sequence/io/gff/__init__.py +26 -0
  240. biotite/sequence/io/gff/convert.py +128 -0
  241. biotite/sequence/io/gff/file.py +449 -0
  242. biotite/sequence/phylo/__init__.py +36 -0
  243. biotite/sequence/phylo/nj.cpython-314-darwin.so +0 -0
  244. biotite/sequence/phylo/nj.pyx +221 -0
  245. biotite/sequence/phylo/tree.cpython-314-darwin.so +0 -0
  246. biotite/sequence/phylo/tree.pyx +1169 -0
  247. biotite/sequence/phylo/upgma.cpython-314-darwin.so +0 -0
  248. biotite/sequence/phylo/upgma.pyx +164 -0
  249. biotite/sequence/profile.py +561 -0
  250. biotite/sequence/search.py +117 -0
  251. biotite/sequence/seqtypes.py +720 -0
  252. biotite/sequence/sequence.py +373 -0
  253. biotite/setup_ccd.py +197 -0
  254. biotite/structure/__init__.py +135 -0
  255. biotite/structure/alphabet/__init__.py +25 -0
  256. biotite/structure/alphabet/encoder.py +332 -0
  257. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  258. biotite/structure/alphabet/i3d.py +109 -0
  259. biotite/structure/alphabet/layers.py +86 -0
  260. biotite/structure/alphabet/pb.license +21 -0
  261. biotite/structure/alphabet/pb.py +170 -0
  262. biotite/structure/alphabet/unkerasify.py +128 -0
  263. biotite/structure/atoms.py +1562 -0
  264. biotite/structure/basepairs.py +1403 -0
  265. biotite/structure/bonds.cpython-314-darwin.so +0 -0
  266. biotite/structure/bonds.pyx +2036 -0
  267. biotite/structure/box.py +724 -0
  268. biotite/structure/celllist.cpython-314-darwin.so +0 -0
  269. biotite/structure/celllist.pyx +864 -0
  270. biotite/structure/chains.py +310 -0
  271. biotite/structure/charges.cpython-314-darwin.so +0 -0
  272. biotite/structure/charges.pyx +520 -0
  273. biotite/structure/compare.py +683 -0
  274. biotite/structure/density.py +109 -0
  275. biotite/structure/dotbracket.py +213 -0
  276. biotite/structure/error.py +39 -0
  277. biotite/structure/filter.py +591 -0
  278. biotite/structure/geometry.py +817 -0
  279. biotite/structure/graphics/__init__.py +13 -0
  280. biotite/structure/graphics/atoms.py +243 -0
  281. biotite/structure/graphics/rna.py +298 -0
  282. biotite/structure/hbond.py +425 -0
  283. biotite/structure/info/__init__.py +24 -0
  284. biotite/structure/info/atom_masses.json +121 -0
  285. biotite/structure/info/atoms.py +98 -0
  286. biotite/structure/info/bonds.py +149 -0
  287. biotite/structure/info/ccd.py +200 -0
  288. biotite/structure/info/components.bcif +0 -0
  289. biotite/structure/info/groups.py +128 -0
  290. biotite/structure/info/masses.py +121 -0
  291. biotite/structure/info/misc.py +137 -0
  292. biotite/structure/info/radii.py +267 -0
  293. biotite/structure/info/standardize.py +185 -0
  294. biotite/structure/integrity.py +213 -0
  295. biotite/structure/io/__init__.py +29 -0
  296. biotite/structure/io/dcd/__init__.py +13 -0
  297. biotite/structure/io/dcd/file.py +67 -0
  298. biotite/structure/io/general.py +243 -0
  299. biotite/structure/io/gro/__init__.py +14 -0
  300. biotite/structure/io/gro/file.py +343 -0
  301. biotite/structure/io/mol/__init__.py +20 -0
  302. biotite/structure/io/mol/convert.py +112 -0
  303. biotite/structure/io/mol/ctab.py +420 -0
  304. biotite/structure/io/mol/header.py +120 -0
  305. biotite/structure/io/mol/mol.py +149 -0
  306. biotite/structure/io/mol/sdf.py +940 -0
  307. biotite/structure/io/netcdf/__init__.py +13 -0
  308. biotite/structure/io/netcdf/file.py +64 -0
  309. biotite/structure/io/pdb/__init__.py +20 -0
  310. biotite/structure/io/pdb/convert.py +389 -0
  311. biotite/structure/io/pdb/file.py +1380 -0
  312. biotite/structure/io/pdb/hybrid36.cpython-314-darwin.so +0 -0
  313. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  314. biotite/structure/io/pdbqt/__init__.py +15 -0
  315. biotite/structure/io/pdbqt/convert.py +113 -0
  316. biotite/structure/io/pdbqt/file.py +688 -0
  317. biotite/structure/io/pdbx/__init__.py +23 -0
  318. biotite/structure/io/pdbx/bcif.py +674 -0
  319. biotite/structure/io/pdbx/cif.py +1091 -0
  320. biotite/structure/io/pdbx/component.py +251 -0
  321. biotite/structure/io/pdbx/compress.py +362 -0
  322. biotite/structure/io/pdbx/convert.py +2113 -0
  323. biotite/structure/io/pdbx/encoding.cpython-314-darwin.so +0 -0
  324. biotite/structure/io/pdbx/encoding.pyx +1078 -0
  325. biotite/structure/io/trajfile.py +696 -0
  326. biotite/structure/io/trr/__init__.py +13 -0
  327. biotite/structure/io/trr/file.py +43 -0
  328. biotite/structure/io/util.py +38 -0
  329. biotite/structure/io/xtc/__init__.py +13 -0
  330. biotite/structure/io/xtc/file.py +43 -0
  331. biotite/structure/mechanics.py +72 -0
  332. biotite/structure/molecules.py +337 -0
  333. biotite/structure/pseudoknots.py +622 -0
  334. biotite/structure/rdf.py +245 -0
  335. biotite/structure/repair.py +302 -0
  336. biotite/structure/residues.py +716 -0
  337. biotite/structure/rings.py +451 -0
  338. biotite/structure/sasa.cpython-314-darwin.so +0 -0
  339. biotite/structure/sasa.pyx +322 -0
  340. biotite/structure/segments.py +328 -0
  341. biotite/structure/sequence.py +110 -0
  342. biotite/structure/spacegroups.json +1567 -0
  343. biotite/structure/spacegroups.license +26 -0
  344. biotite/structure/sse.py +306 -0
  345. biotite/structure/superimpose.py +511 -0
  346. biotite/structure/tm.py +581 -0
  347. biotite/structure/transform.py +736 -0
  348. biotite/structure/util.py +160 -0
  349. biotite/version.py +34 -0
  350. biotite/visualize.py +375 -0
  351. biotite-1.5.0.dist-info/METADATA +162 -0
  352. biotite-1.5.0.dist-info/RECORD +354 -0
  353. biotite-1.5.0.dist-info/WHEEL +6 -0
  354. biotite-1.5.0.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,71 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence.align"
6
+ __author__ = "Patrick Kunzmann"
7
+ __all__ = ["bucket_number"]
8
+
9
+ from os.path import dirname, join, realpath
10
+ import numpy as np
11
+
12
# Lazily populated cache of the primes listed in ``primes.txt``
_primes = None


def bucket_number(n_kmers, load_factor=0.8):
    """
    Find an appropriate number of buckets for a :class:`BucketKmerTable`
    based on the number of elements (i.e. *k-mers*) that should be
    stored in the table.

    Parameters
    ----------
    n_kmers : int
        The expected number of *k-mers* that will be stored in the
        :class:`BucketKmerTable`.
        If this number deviates from the actual number of *k-mers* that
        will be stored, the load factor of the table will deviate
        by the same percentage.
    load_factor : float, optional
        The desired ratio of *k-mer* number to bucket number.
        The actual load factor will usually be lower, as the number of
        buckets is rounded up to a prime (see *Notes*).

    Returns
    -------
    n_buckets : int
        The recommended number of buckets to use for a
        :class:`BucketKmerTable`, that stores `n_kmers` at the given
        `load_factor`.

    Raises
    ------
    ValueError
        If ``n_kmers / load_factor`` exceeds the largest prime in the
        precomputed list.

    Notes
    -----
    The function returns the smallest prime number from a precomputed
    list of primes, that is at least ``n_kmers / load_factor``, to use
    as the number of buckets.
    The reason is that prime numbers have proven to be good hash table
    sizes, if the hash function is not randomized.

    Let's take unambiguous nucleotide *k-mers* as example.
    If powers of two would be used as table size (another common scheme),
    taking the modulo operation on the *k-mer* code would simply erase
    the upper bits corresponding to the first nucleotide(s) in a
    *k-mer*.
    Hence, all *k-mers* with the same suffix would be stored in the same
    bin.
    """
    global _primes
    if _primes is None:
        # Read the prime list only once and keep it for later calls
        primes_path = join(dirname(realpath(__file__)), "primes.txt")
        with open(primes_path, encoding="utf-8") as file:
            _primes = np.array(
                [
                    int(line)
                    for line in file.read().splitlines()
                    # Skip blank lines and '#' comment lines
                    if len(line) != 0 and line[0] != "#"
                ]
            )

    number = int(n_kmers / load_factor)
    # NOTE: the binary search assumes that 'primes.txt' lists the primes
    # in ascending order
    index = np.searchsorted(_primes, number, side="left")
    if index == len(_primes):
        raise ValueError("Number of buckets too large")
    # Return a built-in int, as documented, instead of a NumPy scalar
    return int(_primes[index])
@@ -0,0 +1,425 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence.align"
6
+ __author__ = "Patrick Kunzmann"
7
+ __all__ = ["CigarOp", "read_alignment_from_cigar", "write_alignment_to_cigar"]
8
+
9
+ import enum
10
+ import numpy as np
11
+ from biotite.sequence.align.alignment import Alignment, get_codes
12
+
13
+
14
class CigarOp(enum.IntEnum):
    """
    Enumeration of the operations that may appear in a CIGAR string.
    """

    MATCH = 0
    INSERTION = 1
    DELETION = 2
    INTRON = 3
    SOFT_CLIP = 4
    HARD_CLIP = 5
    PADDING = 6
    EQUAL = 7
    DIFFERENT = 8
    BACK = 9

    @staticmethod
    def from_cigar_symbol(symbol):
        """
        Look up the operation that is denoted by a CIGAR symbol.

        Parameters
        ----------
        symbol : str
            The single-character CIGAR symbol, e.g. ``"M"``.

        Returns
        -------
        op : CigarOp
            The corresponding enum member.
        """
        return _str_to_op[symbol]

    def to_cigar_symbol(self):
        """
        Get the CIGAR symbol that denotes this operation.

        Returns
        -------
        symbol : str
            The single-character CIGAR symbol, e.g. ``"M"``.
        """
        return _op_to_str[self]


# Operation -> symbol table; the reverse table is derived from it,
# so both directions always stay consistent
_op_to_str = {
    CigarOp.MATCH: "M",
    CigarOp.INSERTION: "I",
    CigarOp.DELETION: "D",
    CigarOp.INTRON: "N",
    CigarOp.SOFT_CLIP: "S",
    CigarOp.HARD_CLIP: "H",
    CigarOp.PADDING: "P",
    CigarOp.EQUAL: "=",
    CigarOp.DIFFERENT: "X",
    CigarOp.BACK: "B",
}
_str_to_op = {symbol: op for op, symbol in _op_to_str.items()}
64
+
65
+
66
def read_alignment_from_cigar(cigar, position, reference_sequence, segment_sequence):
    """
    Create an :class:`Alignment` from a CIGAR string.

    Parameters
    ----------
    cigar : str or array-like, shape=(n,2), dtype=int
        The CIGAR string.
        Alternatively, the operations can be given as
        ``(operation, length)`` pairs, where each operation is a
        :class:`CigarOp` (or its integer value).
    position : int
        0-based position of the first aligned base in the reference.
        0-based equivalent to the ``POS`` field in the SAM/BAM file.
    reference_sequence : Sequence
        The reference sequence.
    segment_sequence : Sequence
        The segment, read or query sequence.

    Returns
    -------
    alignment : Alignment
        The alignment.

    Raises
    ------
    ValueError
        If `cigar` is given as pairs that do not have the shape
        ``(n,2)`` or if it contains an operation that is not
        implemented.

    See Also
    --------
    write_alignment_to_cigar : The reverse operation.

    Notes
    -----
    This function expects that the `segment_sequence` was taken from the
    SAM/BAM file, hence hard-clipped bases are not part of the sequence.
    Therefore, hard clipped bases are simply ignored in the CIGAR
    string.

    Examples
    --------

    >>> ref = NucleotideSequence("TATAAAAGGTTTCCGACCGTAGGTAGCTGA")
    >>> seg = NucleotideSequence("CCCCGGTTTGACCGTATGTAG")
    >>> print(read_alignment_from_cigar("9M2D12M", 3, ref, seg))
    AAAAGGTTTCCGACCGTAGGTAG
    CCCCGGTTT--GACCGTATGTAG
    >>> print(read_alignment_from_cigar("4X5=2D7=1X4=", 3, ref, seg))
    AAAAGGTTTCCGACCGTAGGTAG
    CCCCGGTTT--GACCGTATGTAG

    Explicit terminal deletions are also possible.
    Note that in this case the deleted positions count as aligned bases
    with respect to the `position` parameter.

    >>> print(read_alignment_from_cigar("3D9M2D12M4D", 0, ref, seg))
    TATAAAAGGTTTCCGACCGTAGGTAGCTGA
    ---CCCCGGTTT--GACCGTATGTAG----

    If bases in the segment sequence are soft-clipped, they do not
    appear in the alignment.
    Furthermore, the start of the reference sequence must be adapted.

    >>> print(read_alignment_from_cigar("4S5M2D12M", 7, ref, seg))
    GGTTTCCGACCGTAGGTAG
    GGTTT--GACCGTATGTAG

    Hard-clipped bases are not part of the segment sequence.
    Hence `H` operations are completely ignored.

    >>> seg = NucleotideSequence("GGTTTGACCGTATGTAG")
    >>> print(read_alignment_from_cigar("4H5M2D12M", 7, ref, seg))
    GGTTTCCGACCGTAGGTAG
    GGTTT--GACCGTATGTAG

    Reading from BAM codes is also possible.

    >>> seg = NucleotideSequence("CCCCGGTTTGACCGTATGTAG")
    >>> op_tuples = [
    ...     (CigarOp.MATCH, 9),
    ...     (CigarOp.DELETION, 2),
    ...     (CigarOp.MATCH, 12)
    ... ]
    >>> print(read_alignment_from_cigar(op_tuples, 3, ref, seg))
    AAAAGGTTTCCGACCGTAGGTAG
    CCCCGGTTT--GACCGTATGTAG
    """
    if isinstance(cigar, str):
        operations = _op_tuples_from_cigar(cigar)
    else:
        operations = np.asarray(cigar, dtype=int)
        if operations.shape == (0,):
            # An empty list of pairs is converted into a 1-D array:
            # give it the expected two columns, so that it yields an
            # empty alignment below instead of a shape error
            operations = operations.reshape(0, 2)
        if operations.ndim != 2:
            raise ValueError("Expected array with shape (n,2)")
        if operations.shape[1] != 2:
            raise ValueError("Expected (operation, length) pairs")

    if len(operations) == 0:
        return Alignment(
            [reference_sequence, segment_sequence], np.zeros((0, 2), dtype=int)
        )

    # One trace row per operation repetition;
    # rows belonging to clipped bases are masked out at the end
    trace = np.zeros((np.sum(operations[:, 1]), 2), dtype=int)
    clip_mask = np.ones(trace.shape[0], dtype=bool)

    i = 0
    ref_pos = position
    seg_pos = 0
    for op_code, length in operations:
        op = CigarOp(op_code)
        rows = slice(i, i + length)
        if op in (CigarOp.MATCH, CigarOp.EQUAL, CigarOp.DIFFERENT):
            # Both sequences advance
            trace[rows, 0] = np.arange(ref_pos, ref_pos + length)
            trace[rows, 1] = np.arange(seg_pos, seg_pos + length)
            ref_pos += length
            seg_pos += length
        elif op == CigarOp.INSERTION:
            # Gap in the reference: only the segment advances
            trace[rows, 0] = -1
            trace[rows, 1] = np.arange(seg_pos, seg_pos + length)
            seg_pos += length
        elif op in (CigarOp.DELETION, CigarOp.INTRON):
            # Gap in the segment: only the reference advances
            trace[rows, 0] = np.arange(ref_pos, ref_pos + length)
            trace[rows, 1] = -1
            ref_pos += length
        elif op == CigarOp.SOFT_CLIP:
            # Soft-clipped bases are present in the segment sequence,
            # but are not part of the alignment
            clip_mask[rows] = False
            seg_pos += length
        elif op == CigarOp.HARD_CLIP:
            # Hard-clipped bases are not part of the segment sequence,
            # hence the segment position is not advanced
            clip_mask[rows] = False
        else:
            raise ValueError(f"CIGAR operation {op} is not implemented")
        i += length
    # Remove clipped positions
    trace = trace[clip_mask]
    return Alignment([reference_sequence, segment_sequence], trace)
192
+
193
+
194
+ def write_alignment_to_cigar(
195
+ alignment,
196
+ reference_index=0,
197
+ segment_index=1,
198
+ introns=(),
199
+ distinguish_matches=False,
200
+ hard_clip=False,
201
+ include_terminal_gaps=False,
202
+ as_string=True,
203
+ ):
204
+ """
205
+ Convert an :class:`Alignment` into a CIGAR string.
206
+
207
+ Parameters
208
+ ----------
209
+ alignment : Alignment
210
+ The alignment to be converted.
211
+ reference_index : int, optional
212
+ The index of the reference sequence in the alignment.
213
+ By default the first sequence is used.
214
+ segment_index : int, optional
215
+ The index of the segment, read or query sequence in the
216
+ alignment.
217
+ By default the second sequence is used.
218
+ introns : iterable object of tuple(int, int), optional
219
+ The introns in the reference sequence.
220
+ The introns are given as tuples of start and exclusive stop
221
+ index.
222
+ In those regions gaps in the reference sequence are reflected by
223
+ `'N'` in the CIGAR string.
224
+ By default no introns are assumed.
225
+ distinguish_matches : bool, optional
226
+ If true, matches (`'='`) are distinguished from mismatches
227
+ (`'X'`).
228
+ Otherwise, matches and mismatches are reflected equally by an
229
+ `'M'` in the CIGAR string.
230
+ hard_clip : bool, optional
231
+ If true, clipped bases are hard-clipped.
232
+ Otherwise, clipped bases are soft-clipped.
233
+ include_terminal_gaps : bool, optional
234
+ If true, terminal gaps in the segment sequence are included in
235
+ the CIGAR string.
236
+ These are represented by ``D`` operations at the start and/or
237
+ end of the string.
238
+ By default, those terminal gaps are omitted in the CIGAR, which
239
+ is the way SAM/BAM expects a CIGAR to be.
240
+ as_string : bool, optional
241
+ If true, the CIGAR string is returned.
242
+ Otherwise, a list of tuples is returned, where the first element
243
+ of each tuple specifies the :class:`CigarOp` and the second
244
+ element specifies the number of repetitions.
245
+
246
+ Returns
247
+ -------
248
+ cigar : str or ndarray, shape=(n,2) dtype=int
249
+ If `as_string` is true, the CIGAR string is returned.
250
+ Otherwise, an array is returned, where the first column
251
+ specifies the :class:`CigarOp` and the second column specifies
252
+ the number of repetitions of that operation.
253
+
254
+ See Also
255
+ --------
256
+ read_alignment_from_cigar : The reverse operation.
257
+
258
+ Notes
259
+ -----
260
+ If `include_terminal_gaps` is set to true, you usually want to set
261
+ ``position=0`` in :func:`read_alignment_from_cigar` to get the
262
+ correct alignment.
263
+
264
+ Examples
265
+ --------
266
+
267
+ >>> ref = NucleotideSequence("TATAAAAGGTTTCCGACCGTAGGTAGCTGA")
268
+ >>> seg = NucleotideSequence("CCCCGGTTTGACCGTATGTAG")
269
+ >>> matrix = SubstitutionMatrix.std_nucleotide_matrix()
270
+ >>> semiglobal_alignment = align_optimal(
271
+ ... ref, seg, matrix, local=False, terminal_penalty=False
272
+ ... )[0]
273
+ >>> print(semiglobal_alignment)
274
+ TATAAAAGGTTTCCGACCGTAGGTAGCTGA
275
+ ---CCCCGGTTT--GACCGTATGTAG----
276
+ >>> print(write_alignment_to_cigar(semiglobal_alignment))
277
+ 9M2D12M
278
+ >>> print(write_alignment_to_cigar(semiglobal_alignment, introns=[(12, 14)]))
279
+ 9M2N12M
280
+ >>> print(write_alignment_to_cigar(semiglobal_alignment, distinguish_matches=True))
281
+ 4X5=2D7=1X4=
282
+ >>> print(write_alignment_to_cigar(semiglobal_alignment, include_terminal_gaps=True))
283
+ 3D9M2D12M4D
284
+ >>> local_alignment = align_optimal(ref, seg, matrix, local=True)[0]
285
+ >>> print(local_alignment)
286
+ GGTTTCCGACCGTAGGTAG
287
+ GGTTT--GACCGTATGTAG
288
+ >>> print(write_alignment_to_cigar(local_alignment, hard_clip=False))
289
+ 4S5M2D12M
290
+ >>> print(write_alignment_to_cigar(local_alignment, hard_clip=True))
291
+ 4H5M2D12M
292
+
293
+ Writing operations as BAM codes is also possible:
294
+
295
+ >>> op_tuples = write_alignment_to_cigar(semiglobal_alignment, as_string=False)
296
+ >>> for op, length in op_tuples:
297
+ ... print(CigarOp(op).name, length)
298
+ MATCH 9
299
+ DELETION 2
300
+ MATCH 12
301
+ """
302
+ if not include_terminal_gaps:
303
+ alignment = _remove_terminal_segment_gaps(alignment, segment_index)
304
+
305
+ ref_trace = alignment.trace[:, reference_index]
306
+ seg_trace = alignment.trace[:, segment_index]
307
+ operations = np.full(alignment.trace.shape[0], CigarOp.MATCH, dtype=int)
308
+
309
+ insertion_mask = ref_trace == -1
310
+ deletion_mask = seg_trace == -1
311
+ if np.any(insertion_mask & deletion_mask):
312
+ raise ValueError(
313
+ "Alignment contains insertion and deletion at the same position"
314
+ )
315
+ operations[insertion_mask] = CigarOp.INSERTION
316
+ operations[deletion_mask] = CigarOp.DELETION
317
+
318
+ if introns is not None:
319
+ intron_mask = np.zeros(operations.shape[0], dtype=bool)
320
+ for start, stop in introns:
321
+ if start >= stop:
322
+ raise ValueError("Intron start must be smaller than intron stop")
323
+ if start < 0:
324
+ raise ValueError("Intron start must not be negative")
325
+ intron_mask[(ref_trace >= start) & (ref_trace < stop)] = True
326
+ if np.any(intron_mask & ~deletion_mask):
327
+ raise ValueError("Introns must be within gaps in the reference sequence")
328
+ operations[intron_mask] = CigarOp.INTRON
329
+
330
+ if distinguish_matches:
331
+ symbol_codes = get_codes(alignment)
332
+ ref_codes = symbol_codes[reference_index, :]
333
+ seg_codes = symbol_codes[segment_index, :]
334
+ equal_mask = ref_codes == seg_codes
335
+ match_mask = operations == CigarOp.MATCH
336
+ operations[equal_mask & match_mask] = CigarOp.EQUAL
337
+ operations[~equal_mask & match_mask] = CigarOp.DIFFERENT
338
+
339
+ op_tuples = _aggregate_consecutive(operations)
340
+
341
+ clip_op = CigarOp.HARD_CLIP if hard_clip else CigarOp.SOFT_CLIP
342
+ start_clip_length, end_clip_length = _find_clipped_bases(alignment, segment_index)
343
+ if start_clip_length != 0:
344
+ start_clip = [(clip_op, start_clip_length)]
345
+ else:
346
+ start_clip = np.zeros((0, 2), dtype=int)
347
+ if end_clip_length != 0:
348
+ end_clip = [(clip_op, end_clip_length)]
349
+ else:
350
+ end_clip = np.zeros((0, 2), dtype=int)
351
+ op_tuples = np.concatenate((start_clip, op_tuples, end_clip))
352
+
353
+ if as_string:
354
+ cigar = _cigar_from_op_tuples(op_tuples)
355
+ return cigar
356
+ else:
357
+ return op_tuples
358
+
359
+
360
def _remove_terminal_segment_gaps(alignment, segment_index):
    """
    Trim the alignment to the columns spanned by the segment sequence.

    Leading and trailing alignment columns, where the segment sequence
    has a gap (trace value ``-1``), are cut off.
    Gaps between the first and last aligned segment symbol are kept.
    """
    segment_trace = alignment.trace[:, segment_index]
    aligned_columns = np.nonzero(segment_trace != -1)[0]
    first_column = aligned_columns[0]
    last_column = aligned_columns[-1]
    return alignment[first_column : last_column + 1]
366
+
367
+
368
def _find_clipped_bases(alignment, segment_index):
    """
    Count the segment bases that lie outside of the alignment.

    Returns a tuple ``(start_clip_length, end_clip_length)``:
    the number of segment bases before the first aligned base and the
    number of segment bases after the last aligned base.
    """
    # Strip terminal segment gaps first (has no effect if the caller
    # already did so): afterwards the first and last entry of the
    # segment trace refer to actual segment bases instead of gaps
    trimmed = _remove_terminal_segment_gaps(alignment, segment_index)
    segment_trace = trimmed.trace[:, segment_index]
    first_aligned, last_aligned = segment_trace[0], segment_trace[-1]
    segment_length = len(trimmed.sequences[segment_index])
    # Every base in front of the first aligned base counts as clipped...
    bases_before = first_aligned
    # ...as does every base behind the last aligned base
    bases_after = segment_length - last_aligned - 1
    return bases_before, bases_after
384
+
385
+
386
def _aggregate_consecutive(operations):
    """
    Aggregate consecutive operations of the same type (run-length
    encoding).

    Parameters
    ----------
    operations : array-like, shape=(n,)
        The operation code for each position.

    Returns
    -------
    op_tuples : ndarray, shape=(k,2)
        One row for each run of identical consecutive operations.
        The first column contains the operation code and the second
        column the length of the run.
        For an empty input an array of shape ``(0, 2)`` is returned.
    """
    operations = np.asarray(operations)
    if operations.shape[0] == 0:
        # No positions means no runs; without this guard indexing the
        # (non-existent) first operation below raises an IndexError
        return np.zeros((0, 2), dtype=int)
    # A new run starts wherever an element differs from its predecessor...
    change_indices = np.where(operations[:-1] != operations[1:])[0] + 1
    # ...and the first element always starts a run
    run_starts = np.concatenate(([0], change_indices))
    # Each run ends where the next one starts; the last run ends at
    # the end of the array
    run_lengths = np.diff(np.append(run_starts, len(operations)))
    return np.stack((operations[run_starts], run_lengths), axis=-1)
397
+
398
+
399
def _cigar_from_op_tuples(op_tuples):
    """
    Render BAM integer tuples as a CIGAR string.

    Each tuple consists of the operation code followed by the number
    of repetitions.
    In the string each tuple is represented by the repetition count
    followed by the CIGAR symbol of the operation.
    """
    return "".join(
        str(repetitions) + CigarOp(op_code).to_cigar_symbol()
        for op_code, repetitions in op_tuples
    )
410
+
411
+
412
def _op_tuples_from_cigar(cigar):
    """
    Create an array of BAM integer tuples from a CIGAR string.

    Parameters
    ----------
    cigar : str
        The CIGAR string: a concatenation of items, each consisting of
        a repetition count followed by a single operation symbol.

    Returns
    -------
    op_tuples : ndarray, shape=(n,2), dtype=int
        The first column contains the operation obtained from
        :meth:`CigarOp.from_cigar_symbol()` and the second column
        contains the number of repetitions.
        For an empty CIGAR string an array of shape ``(0, 2)`` is
        returned.

    Raises
    ------
    ValueError
        If an operation symbol is not preceded by a repetition count.

    Notes
    -----
    Digits after the last operation symbol are ignored.
    """
    op_tuples = []
    digits = []
    for char in cigar:
        if char.isdigit():
            digits.append(char)
            continue
        # Every non-digit character is interpreted as operation symbol
        op = CigarOp.from_cigar_symbol(char)
        if not digits:
            raise ValueError(
                f"CIGAR operation '{char}' is not preceded by "
                f"a repetition count"
            )
        op_tuples.append((op, int("".join(digits))))
        digits = []
    # The reshape keeps the two-column layout if there are no
    # operations at all: np.array() alone would give shape (0,)
    return np.array(op_tuples, dtype=int).reshape(-1, 2)