biotite 0.41.1__cp310-cp310-macosx_10_16_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (340) hide show
  1. biotite/__init__.py +19 -0
  2. biotite/application/__init__.py +43 -0
  3. biotite/application/application.py +265 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +505 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +83 -0
  8. biotite/application/blast/webapp.py +421 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +238 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +152 -0
  13. biotite/application/localapp.py +306 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +122 -0
  16. biotite/application/msaapp.py +374 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +254 -0
  19. biotite/application/muscle/app5.py +171 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +456 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +222 -0
  24. biotite/application/util.py +59 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +304 -0
  27. biotite/application/viennarna/rnafold.py +269 -0
  28. biotite/application/viennarna/rnaplot.py +187 -0
  29. biotite/application/viennarna/util.py +72 -0
  30. biotite/application/webapp.py +77 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/entrez/__init__.py +15 -0
  34. biotite/database/entrez/check.py +61 -0
  35. biotite/database/entrez/dbnames.py +89 -0
  36. biotite/database/entrez/download.py +223 -0
  37. biotite/database/entrez/key.py +44 -0
  38. biotite/database/entrez/query.py +223 -0
  39. biotite/database/error.py +15 -0
  40. biotite/database/pubchem/__init__.py +21 -0
  41. biotite/database/pubchem/download.py +260 -0
  42. biotite/database/pubchem/error.py +20 -0
  43. biotite/database/pubchem/query.py +827 -0
  44. biotite/database/pubchem/throttle.py +99 -0
  45. biotite/database/rcsb/__init__.py +13 -0
  46. biotite/database/rcsb/download.py +167 -0
  47. biotite/database/rcsb/query.py +959 -0
  48. biotite/database/uniprot/__init__.py +13 -0
  49. biotite/database/uniprot/check.py +32 -0
  50. biotite/database/uniprot/download.py +134 -0
  51. biotite/database/uniprot/query.py +209 -0
  52. biotite/file.py +251 -0
  53. biotite/sequence/__init__.py +73 -0
  54. biotite/sequence/align/__init__.py +49 -0
  55. biotite/sequence/align/alignment.py +658 -0
  56. biotite/sequence/align/banded.cpython-310-darwin.so +0 -0
  57. biotite/sequence/align/banded.pyx +652 -0
  58. biotite/sequence/align/buckets.py +69 -0
  59. biotite/sequence/align/cigar.py +434 -0
  60. biotite/sequence/align/kmeralphabet.cpython-310-darwin.so +0 -0
  61. biotite/sequence/align/kmeralphabet.pyx +574 -0
  62. biotite/sequence/align/kmersimilarity.cpython-310-darwin.so +0 -0
  63. biotite/sequence/align/kmersimilarity.pyx +233 -0
  64. biotite/sequence/align/kmertable.cpython-310-darwin.so +0 -0
  65. biotite/sequence/align/kmertable.pyx +3400 -0
  66. biotite/sequence/align/localgapped.cpython-310-darwin.so +0 -0
  67. biotite/sequence/align/localgapped.pyx +892 -0
  68. biotite/sequence/align/localungapped.cpython-310-darwin.so +0 -0
  69. biotite/sequence/align/localungapped.pyx +279 -0
  70. biotite/sequence/align/matrix.py +405 -0
  71. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  72. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  73. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  74. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  75. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  76. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  77. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  78. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  79. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  80. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  81. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  82. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  83. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  84. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  85. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  86. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  87. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  88. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  89. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  93. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  94. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  95. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  96. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  97. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  98. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  99. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  100. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  101. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  102. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  103. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  104. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  105. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  106. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  107. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  108. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  109. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  110. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  111. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  112. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  113. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  114. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  115. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  116. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  117. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  118. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  119. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  120. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  121. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  122. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  154. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  155. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  156. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  157. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  158. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  159. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  160. biotite/sequence/align/multiple.cpython-310-darwin.so +0 -0
  161. biotite/sequence/align/multiple.pyx +620 -0
  162. biotite/sequence/align/pairwise.cpython-310-darwin.so +0 -0
  163. biotite/sequence/align/pairwise.pyx +587 -0
  164. biotite/sequence/align/permutation.cpython-310-darwin.so +0 -0
  165. biotite/sequence/align/permutation.pyx +305 -0
  166. biotite/sequence/align/primes.txt +821 -0
  167. biotite/sequence/align/selector.cpython-310-darwin.so +0 -0
  168. biotite/sequence/align/selector.pyx +956 -0
  169. biotite/sequence/align/statistics.py +265 -0
  170. biotite/sequence/align/tracetable.cpython-310-darwin.so +0 -0
  171. biotite/sequence/align/tracetable.pxd +64 -0
  172. biotite/sequence/align/tracetable.pyx +370 -0
  173. biotite/sequence/alphabet.py +566 -0
  174. biotite/sequence/annotation.py +829 -0
  175. biotite/sequence/codec.cpython-310-darwin.so +0 -0
  176. biotite/sequence/codec.pyx +155 -0
  177. biotite/sequence/codon.py +466 -0
  178. biotite/sequence/codon_tables.txt +202 -0
  179. biotite/sequence/graphics/__init__.py +33 -0
  180. biotite/sequence/graphics/alignment.py +1034 -0
  181. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  182. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  183. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  184. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  185. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  186. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  187. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  188. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  189. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  190. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  191. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  192. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  193. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  194. biotite/sequence/graphics/color_schemes/pb_flower.json +39 -0
  195. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  196. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  197. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  198. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  199. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  200. biotite/sequence/graphics/colorschemes.py +139 -0
  201. biotite/sequence/graphics/dendrogram.py +184 -0
  202. biotite/sequence/graphics/features.py +510 -0
  203. biotite/sequence/graphics/logo.py +110 -0
  204. biotite/sequence/graphics/plasmid.py +661 -0
  205. biotite/sequence/io/__init__.py +12 -0
  206. biotite/sequence/io/fasta/__init__.py +22 -0
  207. biotite/sequence/io/fasta/convert.py +273 -0
  208. biotite/sequence/io/fasta/file.py +278 -0
  209. biotite/sequence/io/fastq/__init__.py +19 -0
  210. biotite/sequence/io/fastq/convert.py +120 -0
  211. biotite/sequence/io/fastq/file.py +551 -0
  212. biotite/sequence/io/genbank/__init__.py +17 -0
  213. biotite/sequence/io/genbank/annotation.py +277 -0
  214. biotite/sequence/io/genbank/file.py +575 -0
  215. biotite/sequence/io/genbank/metadata.py +324 -0
  216. biotite/sequence/io/genbank/sequence.py +172 -0
  217. biotite/sequence/io/general.py +192 -0
  218. biotite/sequence/io/gff/__init__.py +26 -0
  219. biotite/sequence/io/gff/convert.py +133 -0
  220. biotite/sequence/io/gff/file.py +434 -0
  221. biotite/sequence/phylo/__init__.py +36 -0
  222. biotite/sequence/phylo/nj.cpython-310-darwin.so +0 -0
  223. biotite/sequence/phylo/nj.pyx +221 -0
  224. biotite/sequence/phylo/tree.cpython-310-darwin.so +0 -0
  225. biotite/sequence/phylo/tree.pyx +1169 -0
  226. biotite/sequence/phylo/upgma.cpython-310-darwin.so +0 -0
  227. biotite/sequence/phylo/upgma.pyx +164 -0
  228. biotite/sequence/profile.py +456 -0
  229. biotite/sequence/search.py +116 -0
  230. biotite/sequence/seqtypes.py +556 -0
  231. biotite/sequence/sequence.py +374 -0
  232. biotite/structure/__init__.py +132 -0
  233. biotite/structure/atoms.py +1455 -0
  234. biotite/structure/basepairs.py +1415 -0
  235. biotite/structure/bonds.cpython-310-darwin.so +0 -0
  236. biotite/structure/bonds.pyx +1933 -0
  237. biotite/structure/box.py +592 -0
  238. biotite/structure/celllist.cpython-310-darwin.so +0 -0
  239. biotite/structure/celllist.pyx +849 -0
  240. biotite/structure/chains.py +298 -0
  241. biotite/structure/charges.cpython-310-darwin.so +0 -0
  242. biotite/structure/charges.pyx +520 -0
  243. biotite/structure/compare.py +274 -0
  244. biotite/structure/density.py +114 -0
  245. biotite/structure/dotbracket.py +216 -0
  246. biotite/structure/error.py +31 -0
  247. biotite/structure/filter.py +585 -0
  248. biotite/structure/geometry.py +697 -0
  249. biotite/structure/graphics/__init__.py +13 -0
  250. biotite/structure/graphics/atoms.py +226 -0
  251. biotite/structure/graphics/rna.py +282 -0
  252. biotite/structure/hbond.py +409 -0
  253. biotite/structure/info/__init__.py +25 -0
  254. biotite/structure/info/atom_masses.json +121 -0
  255. biotite/structure/info/atoms.py +82 -0
  256. biotite/structure/info/bonds.py +145 -0
  257. biotite/structure/info/ccd/README.rst +8 -0
  258. biotite/structure/info/ccd/amino_acids.txt +1663 -0
  259. biotite/structure/info/ccd/carbohydrates.txt +1135 -0
  260. biotite/structure/info/ccd/components.bcif +0 -0
  261. biotite/structure/info/ccd/nucleotides.txt +798 -0
  262. biotite/structure/info/ccd.py +95 -0
  263. biotite/structure/info/groups.py +90 -0
  264. biotite/structure/info/masses.py +123 -0
  265. biotite/structure/info/misc.py +144 -0
  266. biotite/structure/info/radii.py +197 -0
  267. biotite/structure/info/standardize.py +196 -0
  268. biotite/structure/integrity.py +268 -0
  269. biotite/structure/io/__init__.py +30 -0
  270. biotite/structure/io/ctab.py +72 -0
  271. biotite/structure/io/dcd/__init__.py +13 -0
  272. biotite/structure/io/dcd/file.py +65 -0
  273. biotite/structure/io/general.py +257 -0
  274. biotite/structure/io/gro/__init__.py +14 -0
  275. biotite/structure/io/gro/file.py +343 -0
  276. biotite/structure/io/mmtf/__init__.py +21 -0
  277. biotite/structure/io/mmtf/assembly.py +214 -0
  278. biotite/structure/io/mmtf/convertarray.cpython-310-darwin.so +0 -0
  279. biotite/structure/io/mmtf/convertarray.pyx +341 -0
  280. biotite/structure/io/mmtf/convertfile.cpython-310-darwin.so +0 -0
  281. biotite/structure/io/mmtf/convertfile.pyx +501 -0
  282. biotite/structure/io/mmtf/decode.cpython-310-darwin.so +0 -0
  283. biotite/structure/io/mmtf/decode.pyx +152 -0
  284. biotite/structure/io/mmtf/encode.cpython-310-darwin.so +0 -0
  285. biotite/structure/io/mmtf/encode.pyx +183 -0
  286. biotite/structure/io/mmtf/file.py +233 -0
  287. biotite/structure/io/mol/__init__.py +20 -0
  288. biotite/structure/io/mol/convert.py +115 -0
  289. biotite/structure/io/mol/ctab.py +414 -0
  290. biotite/structure/io/mol/header.py +116 -0
  291. biotite/structure/io/mol/mol.py +193 -0
  292. biotite/structure/io/mol/sdf.py +916 -0
  293. biotite/structure/io/netcdf/__init__.py +13 -0
  294. biotite/structure/io/netcdf/file.py +63 -0
  295. biotite/structure/io/npz/__init__.py +20 -0
  296. biotite/structure/io/npz/file.py +152 -0
  297. biotite/structure/io/pdb/__init__.py +20 -0
  298. biotite/structure/io/pdb/convert.py +293 -0
  299. biotite/structure/io/pdb/file.py +1240 -0
  300. biotite/structure/io/pdb/hybrid36.cpython-310-darwin.so +0 -0
  301. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  302. biotite/structure/io/pdbqt/__init__.py +15 -0
  303. biotite/structure/io/pdbqt/convert.py +107 -0
  304. biotite/structure/io/pdbqt/file.py +640 -0
  305. biotite/structure/io/pdbx/__init__.py +23 -0
  306. biotite/structure/io/pdbx/bcif.py +648 -0
  307. biotite/structure/io/pdbx/cif.py +1032 -0
  308. biotite/structure/io/pdbx/component.py +246 -0
  309. biotite/structure/io/pdbx/convert.py +1597 -0
  310. biotite/structure/io/pdbx/encoding.cpython-310-darwin.so +0 -0
  311. biotite/structure/io/pdbx/encoding.pyx +950 -0
  312. biotite/structure/io/pdbx/legacy.py +267 -0
  313. biotite/structure/io/tng/__init__.py +13 -0
  314. biotite/structure/io/tng/file.py +46 -0
  315. biotite/structure/io/trajfile.py +710 -0
  316. biotite/structure/io/trr/__init__.py +13 -0
  317. biotite/structure/io/trr/file.py +46 -0
  318. biotite/structure/io/xtc/__init__.py +13 -0
  319. biotite/structure/io/xtc/file.py +46 -0
  320. biotite/structure/mechanics.py +75 -0
  321. biotite/structure/molecules.py +353 -0
  322. biotite/structure/pseudoknots.py +642 -0
  323. biotite/structure/rdf.py +243 -0
  324. biotite/structure/repair.py +253 -0
  325. biotite/structure/residues.py +562 -0
  326. biotite/structure/resutil.py +178 -0
  327. biotite/structure/sasa.cpython-310-darwin.so +0 -0
  328. biotite/structure/sasa.pyx +322 -0
  329. biotite/structure/sequence.py +112 -0
  330. biotite/structure/sse.py +327 -0
  331. biotite/structure/superimpose.py +727 -0
  332. biotite/structure/transform.py +504 -0
  333. biotite/structure/util.py +98 -0
  334. biotite/temp.py +86 -0
  335. biotite/version.py +16 -0
  336. biotite/visualize.py +251 -0
  337. biotite-0.41.1.dist-info/METADATA +187 -0
  338. biotite-0.41.1.dist-info/RECORD +340 -0
  339. biotite-0.41.1.dist-info/WHEEL +4 -0
  340. biotite-0.41.1.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,69 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence.align"
6
+ __author__ = "Patrick Kunzmann"
7
+ __all__ = ["bucket_number"]
8
+
9
+ from os.path import realpath, dirname, join
10
+ import numpy as np
11
+
12
+
13
# Lazily filled cache of the primes listed in 'primes.txt'
_primes = None


def bucket_number(n_kmers, load_factor=0.8):
    """
    Find an appropriate number of buckets for a :class:`BucketKmerTable`
    based on the number of elements (i.e. *k-mers*) that should be
    stored in the table.

    Parameters
    ----------
    n_kmers : int
        The expected number of *k-mers* that will be stored in the
        :class:`BucketKmerTable`.
        If this number deviates from the actual number of *k-mers* that
        will be stored, the load factor of the table will deviate
        by the same percentage.
    load_factor : float, optional
        The desired ratio of *k-mer* number to bucket number.
        Must be positive.
        The actual load factor will typically be slightly lower, as the
        number of buckets is rounded up to a prime (see *Notes*).

    Returns
    -------
    n_buckets : int
        The recommended number of buckets to use for a
        :class:`BucketKmerTable`, that stores `n_kmers` at the given
        `load_factor`.

    Raises
    ------
    ValueError
        If `load_factor` is not positive or if the required number of
        buckets exceeds the largest precomputed prime.

    Notes
    -----
    The function returns the smallest prime number from a precomputed
    list of primes, that is at least as large as
    ``n_kmers / load_factor``, to use as the number of buckets.
    The reason is that prime numbers have proven to be good hash table
    sizes, if the hash function is not randomized.

    Let's take unambiguous nucleotide *k-mers* as example.
    If powers of two would be used as table size (another common scheme),
    taking the modulo operation on the *k-mer* code would simply erase
    the upper bits corresponding to the first nucleotide(s) in a
    *k-mer*.
    Hence, all *k-mers* with the same suffix would be stored in the same
    bin.
    """
    global _primes

    # A zero load factor would raise an unspecific ZeroDivisionError and
    # a negative one would silently select the smallest prime
    if load_factor <= 0:
        raise ValueError("Load factor must be positive")

    if _primes is None:
        # The prime list is read only once and cached at module level
        with open(
            join(dirname(realpath(__file__)), "primes.txt")
        ) as file:
            stripped_lines = (line.strip() for line in file)
            _primes = np.array([
                int(line) for line in stripped_lines
                # Skip empty lines and comment lines
                if line and not line.startswith("#")
            ])

    number = int(n_kmers / load_factor)
    # 'side="left"' selects the first prime that is >= 'number'
    index = np.searchsorted(_primes, number, side="left")
    if index == len(_primes):
        raise ValueError("Number of buckets too large")
    # Return a built-in 'int' as documented instead of a NumPy scalar
    return int(_primes[index])
@@ -0,0 +1,434 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence.align"
6
+ __author__ = "Patrick Kunzmann"
7
+ __all__ = ["CigarOp", "read_alignment_from_cigar", "write_alignment_to_cigar"]
8
+
9
+ import enum
10
+ import numpy as np
11
+ from .alignment import Alignment, get_codes
12
+
13
+
14
class CigarOp(enum.IntEnum):
    """
    Enumeration of the CIGAR operations.

    Each member can be translated from and into its one-character
    CIGAR symbol via :meth:`from_cigar_symbol()` and
    :meth:`to_cigar_symbol()`, respectively.
    """
    MATCH = 0
    INSERTION = 1
    DELETION = 2
    INTRON = 3
    SOFT_CLIP = 4
    HARD_CLIP = 5
    PADDING = 6
    EQUAL = 7
    DIFFERENT = 8
    BACK = 9

    @staticmethod
    def from_cigar_symbol(symbol):
        """
        Look up the operation that belongs to a CIGAR symbol.

        Parameters
        ----------
        symbol : str
            The one-character CIGAR symbol, e.g. ``'M'``.

        Returns
        -------
        op : CigarOp
            The corresponding enum member.

        Raises
        ------
        KeyError
            If `symbol` is not a known CIGAR symbol.
        """
        return _str_to_op[symbol]

    def to_cigar_symbol(self):
        """
        Get the CIGAR symbol of this operation.

        Returns
        -------
        symbol : str
            The one-character CIGAR symbol, e.g. ``'M'``.
        """
        return _op_to_str[self]


# The n-th symbol belongs to the n-th member in definition order
_str_to_op = dict(zip("MIDNSHP=XB", CigarOp))
# Reverse lookup: operation -> symbol
_op_to_str = {op: symbol for symbol, op in _str_to_op.items()}
62
+
63
+
64
def read_alignment_from_cigar(cigar, position,
                              reference_sequence, segment_sequence):
    """
    Create an :class:`Alignment` from a CIGAR string.

    Parameters
    ----------
    cigar : str or array-like, shape=(n,2), dtype=int
        The CIGAR string.
        Alternatively, the operations can be given as
        ``(operation, length)`` pairs, where each operation is a
        :class:`CigarOp` or its integer value.
    position : int
        0-based position of the first aligned base in the reference.
        0-based equivalent to the ``POS`` field in the SAM/BAM file.
    reference_sequence : Sequence
        The reference sequence.
    segment_sequence : Sequence
        The segment, read or query sequence.

    Returns
    -------
    alignment : Alignment
        The alignment.

    Raises
    ------
    ValueError
        If `cigar` is given as pairs but does not have the shape
        *(n,2)* or if it contains an operation that is not
        implemented (padding and back operations).

    See Also
    --------
    write_alignment_to_cigar

    Notes
    -----
    This function expects that the `segment_sequence` was taken from the
    SAM/BAM file, hence hard-clipped bases are not part of the sequence.
    Therefore, hard clipped bases are simply ignored in the CIGAR
    string.

    Examples
    --------

    >>> ref = NucleotideSequence("TATAAAAGGTTTCCGACCGTAGGTAGCTGA")
    >>> seg = NucleotideSequence("CCCCGGTTTGACCGTATGTAG")
    >>> print(read_alignment_from_cigar("9M2D12M", 3, ref, seg))
    AAAAGGTTTCCGACCGTAGGTAG
    CCCCGGTTT--GACCGTATGTAG
    >>> print(read_alignment_from_cigar("4X5=2D7=1X4=", 3, ref, seg))
    AAAAGGTTTCCGACCGTAGGTAG
    CCCCGGTTT--GACCGTATGTAG

    Explicit terminal deletions are also possible.
    Note that in this case the deleted positions count as aligned bases
    with respect to the `position` parameter.

    >>> print(read_alignment_from_cigar("3D9M2D12M4D", 0, ref, seg))
    TATAAAAGGTTTCCGACCGTAGGTAGCTGA
    ---CCCCGGTTT--GACCGTATGTAG----

    If bases in the segment sequence are soft-clipped, they do not
    appear in the alignment.
    Furthermore, the start of the reference sequence must be adapted.

    >>> print(read_alignment_from_cigar("4S5M2D12M", 7, ref, seg))
    GGTTTCCGACCGTAGGTAG
    GGTTT--GACCGTATGTAG

    Hard-clipped bases are not part of the segment sequence.
    Hence `H` operations are completely ignored.

    >>> seg = NucleotideSequence("GGTTTGACCGTATGTAG")
    >>> print(read_alignment_from_cigar("4H5M2D12M", 7, ref, seg))
    GGTTTCCGACCGTAGGTAG
    GGTTT--GACCGTATGTAG

    Reading from BAM codes is also possible.

    >>> seg = NucleotideSequence("CCCCGGTTTGACCGTATGTAG")
    >>> op_tuples = [
    ...     (CigarOp.MATCH, 9),
    ...     (CigarOp.DELETION, 2),
    ...     (CigarOp.MATCH, 12)
    ... ]
    >>> print(read_alignment_from_cigar(op_tuples, 3, ref, seg))
    AAAAGGTTTCCGACCGTAGGTAG
    CCCCGGTTT--GACCGTATGTAG
    """
    if isinstance(cigar, str):
        operations = _op_tuples_from_cigar(cigar)
    else:
        operations = np.asarray(cigar, dtype=int)
        if operations.ndim == 1 and operations.size == 0:
            # An empty sequence of pairs (e.g. '[]') is converted into a
            # 1D array by NumPy: interpret it as zero operations instead
            # of rejecting it in the shape check below
            operations = operations.reshape(0, 2)
        if operations.ndim != 2:
            raise ValueError(
                "Expected array with shape (n,2)"
            )
        if operations.shape[1] != 2:
            raise ValueError(
                "Expected (operation, length) pairs"
            )

    if len(operations) == 0:
        return Alignment(
            [reference_sequence, segment_sequence], np.zeros((0, 2), dtype=int)
        )

    # Reserve one trace row for each position covered by any operation;
    # rows that belong to clipped bases are removed at the end
    trace = np.zeros((np.sum(operations[:,1]), 2), dtype=int)
    clip_mask = np.ones(trace.shape[0], dtype=bool)

    # 'i' is the current row in the trace,
    # 'ref_pos' and 'seg_pos' are the current sequence positions
    i = 0
    ref_pos = position
    seg_pos = 0
    for op, length in operations:
        op = CigarOp(op)
        if op in (CigarOp.MATCH, CigarOp.EQUAL, CigarOp.DIFFERENT):
            # Both sequences advance
            trace[i : i + length, 0] = np.arange(ref_pos, ref_pos + length)
            trace[i : i + length, 1] = np.arange(seg_pos, seg_pos + length)
            ref_pos += length
            seg_pos += length
        elif op == CigarOp.INSERTION:
            # Gap in the reference, only the segment advances
            trace[i : i + length, 0] = -1
            trace[i : i + length, 1] = np.arange(seg_pos, seg_pos + length)
            seg_pos += length
        elif op in (CigarOp.DELETION, CigarOp.INTRON):
            # Gap in the segment, only the reference advances
            trace[i : i + length, 0] = np.arange(ref_pos, ref_pos + length)
            trace[i : i + length, 1] = -1
            ref_pos += length
        elif op == CigarOp.SOFT_CLIP:
            # Soft-clipped bases are part of the segment sequence,
            # but not part of the alignment
            clip_mask[i : i + length] = False
            seg_pos += length
        elif op == CigarOp.HARD_CLIP:
            # Hard-clipped bases are not even part of the segment
            # sequence, hence the segment position is not advanced
            clip_mask[i : i + length] = False
        else:
            raise ValueError(
                f"CIGAR operation {op} is not implemented"
            )
        i += length
    # Remove clipped positions
    trace = trace[clip_mask]
    return Alignment([reference_sequence, segment_sequence], trace)
197
+
198
+
199
def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
                             introns=(), distinguish_matches=False,
                             hard_clip=False, include_terminal_gaps=False,
                             as_string=True):
    """
    Express an :class:`Alignment` as CIGAR string.

    Parameters
    ----------
    alignment : Alignment
        The alignment to be converted.
    reference_index : int, optional
        The index of the reference sequence in the alignment.
        By default the first sequence is used.
    segment_index : int, optional
        The index of the segment, read or query sequence in the
        alignment.
        By default the second sequence is used.
    introns : iterable object of tuple(int, int), optional
        The introns, given as tuples of start and exclusive stop
        position in the reference sequence.
        Alignment columns within these regions must be gaps in the
        segment sequence and are written as `'N'` instead of `'D'`.
        By default no introns are assumed.
    distinguish_matches : bool, optional
        If true, identical symbols (`'='`) and differing symbols
        (`'X'`) are written as separate operations.
        Otherwise, both are written as `'M'`.
    hard_clip : bool, optional
        If true, clipped bases are written as hard clip.
        Otherwise, they are written as soft clip.
    include_terminal_gaps : bool, optional
        If true, terminal gaps in the segment sequence are kept and
        appear as ``D`` operations at the start and/or end of the
        CIGAR.
        By default, those terminal gaps are omitted in the CIGAR, which
        is the way SAM/BAM expects a CIGAR to be.
    as_string : bool, optional
        If true, the CIGAR string is returned.
        Otherwise, the operations are returned as array of
        ``(operation, length)`` pairs.

    Returns
    -------
    cigar : str or ndarray, shape=(n,2) dtype=int
        If `as_string` is true, the CIGAR string is returned.
        Otherwise, an array is returned, where the first column
        specifies the :class:`CigarOp` and the second column specifies
        the number of repetitions of that operation.

    Raises
    ------
    ValueError
        If an alignment column contains a gap in both sequences,
        if an intron has a negative start, a start that is not smaller
        than its stop, or covers a column that is not a gap in the
        segment sequence.

    See Also
    --------
    read_alignment_from_cigar

    Notes
    -----
    If `include_terminal_gaps` is set to true, you usually want to set
    ``position=0`` in :func:`read_alignment_from_cigar` to get the
    correct alignment.

    Examples
    --------

    >>> ref = NucleotideSequence("TATAAAAGGTTTCCGACCGTAGGTAGCTGA")
    >>> seg = NucleotideSequence("CCCCGGTTTGACCGTATGTAG")
    >>> matrix = SubstitutionMatrix.std_nucleotide_matrix()
    >>> semiglobal_alignment = align_optimal(
    ...     ref, seg, matrix, local=False, terminal_penalty=False
    ... )[0]
    >>> print(semiglobal_alignment)
    TATAAAAGGTTTCCGACCGTAGGTAGCTGA
    ---CCCCGGTTT--GACCGTATGTAG----
    >>> print(write_alignment_to_cigar(semiglobal_alignment))
    9M2D12M
    >>> print(write_alignment_to_cigar(semiglobal_alignment, introns=[(12, 14)]))
    9M2N12M
    >>> print(write_alignment_to_cigar(semiglobal_alignment, distinguish_matches=True))
    4X5=2D7=1X4=
    >>> print(write_alignment_to_cigar(semiglobal_alignment, include_terminal_gaps=True))
    3D9M2D12M4D
    >>> local_alignment = align_optimal(ref, seg, matrix, local=True)[0]
    >>> print(local_alignment)
    GGTTTCCGACCGTAGGTAG
    GGTTT--GACCGTATGTAG
    >>> print(write_alignment_to_cigar(local_alignment, hard_clip=False))
    4S5M2D12M
    >>> print(write_alignment_to_cigar(local_alignment, hard_clip=True))
    4H5M2D12M

    Writing operations as BAM codes is also possible:

    >>> op_tuples = write_alignment_to_cigar(semiglobal_alignment, as_string=False)
    >>> for op, length in op_tuples:
    ...     print(CigarOp(op), length)
    CigarOp.MATCH 9
    CigarOp.DELETION 2
    CigarOp.MATCH 12
    """
    if not include_terminal_gaps:
        alignment = _remove_terminal_segment_gaps(alignment, segment_index)

    ref_trace = alignment.trace[:, reference_index]
    seg_trace = alignment.trace[:, segment_index]

    # Classify each alignment column: a gap in the reference is an
    # insertion, a gap in the segment is a deletion, the rest is a match
    is_insertion = (ref_trace == -1)
    is_deletion = (seg_trace == -1)
    if np.any(is_insertion & is_deletion):
        raise ValueError(
            "Alignment contains insertion and deletion at the same position"
        )
    operations = np.full(len(alignment.trace), CigarOp.MATCH, dtype=int)
    operations[is_insertion] = CigarOp.INSERTION
    operations[is_deletion] = CigarOp.DELETION

    if introns is not None:
        # Mark all columns whose reference position lies in an intron
        in_intron = np.zeros(len(operations), dtype=bool)
        for start, stop in introns:
            if stop <= start:
                raise ValueError(
                    "Intron start must be smaller than intron stop"
                )
            if start < 0:
                raise ValueError(
                    "Intron start must not be negative"
                )
            in_intron |= (ref_trace >= start) & (ref_trace < stop)
        # Only deletions can be relabeled as introns
        if np.any(in_intron & ~is_deletion):
            raise ValueError(
                "Introns must be within gaps in the reference sequence"
            )
        operations[in_intron] = CigarOp.INTRON

    if distinguish_matches:
        # Split the generic matches into identical and differing symbols
        codes = get_codes(alignment)
        is_identical = (codes[reference_index] == codes[segment_index])
        is_match = (operations == CigarOp.MATCH)
        operations[is_match & is_identical] = CigarOp.EQUAL
        operations[is_match & ~is_identical] = CigarOp.DIFFERENT

    op_tuples = _aggregate_consecutive(operations)

    # Segment bases outside of the alignment are added as clip operations
    start_clip_length, end_clip_length = _find_clipped_bases(
        alignment, segment_index
    )
    clip_op = CigarOp.SOFT_CLIP if not hard_clip else CigarOp.HARD_CLIP
    no_clip = np.zeros((0, 2), dtype=int)
    start_clip = (
        [(clip_op, start_clip_length)] if start_clip_length != 0 else no_clip
    )
    end_clip = (
        [(clip_op, end_clip_length)] if end_clip_length != 0 else no_clip
    )
    op_tuples = np.concatenate((start_clip, op_tuples, end_clip))

    if not as_string:
        return op_tuples
    return _cigar_from_op_tuples(op_tuples)
365
+
366
+
367
+ def _remove_terminal_segment_gaps(alignment, segment_index):
368
+ """
369
+ Remove terminal gaps in the segment sequence.
370
+ """
371
+ no_gap_pos = np.where(alignment.trace[:, segment_index] != -1)[0]
372
+ return alignment[no_gap_pos[0] : no_gap_pos[-1] + 1]
373
+
374
+
375
def _find_clipped_bases(alignment, segment_index):
    """
    Count the segment bases that lie before the first aligned base and
    after the last aligned base.

    Returns the tuple ``(start_clip_length, end_clip_length)``.
    """
    # Once terminal segment gaps are stripped (nothing changes if that
    # already happened), the segment trace starts and ends on an
    # aligned base
    trimmed = _remove_terminal_segment_gaps(alignment, segment_index)
    segment_trace = trimmed.trace[:, segment_index]
    # The first trace entry is the sequence index of the first aligned
    # base, so exactly that many bases precede it and count as clipped
    start_clip_length = segment_trace[0]
    # Likewise, every base after the last aligned one counts as clipped
    segment_length = len(trimmed.sequences[segment_index])
    last_aligned = segment_trace[-1]
    end_clip_length = segment_length - last_aligned - 1
    return start_clip_length, end_clip_length
393
+
394
+
395
+ def _aggregate_consecutive(operations):
396
+ """
397
+ Aggregate consecutive operations of the same type.
398
+ """
399
+ op_start_indices = np.where(operations[:-1] != operations[1:])[0]
400
+ # Also include the first operation
401
+ op_start_indices += 1
402
+ op_start_indices = np.concatenate(([0], op_start_indices))
403
+ ops = operations[op_start_indices]
404
+ length = np.diff(np.append(op_start_indices, len(operations)))
405
+ return np.stack((ops, length), axis=-1)
406
+
407
+
408
+ def _cigar_from_op_tuples(op_tuples):
409
+ """
410
+ Create a CIGAR string from a list of BAM integer tuples.
411
+
412
+ The first element of each tuple specifies the operation and the
413
+ second element specifies the number of repetitions.
414
+ """
415
+ cigar = ""
416
+ for op, count in op_tuples:
417
+ cigar += str(count) + CigarOp(op).to_cigar_symbol()
418
+ return cigar
419
+
420
+
421
+ def _op_tuples_from_cigar(cigar):
422
+ """
423
+ Create a list of tuples from a CIGAR string.
424
+ """
425
+ op_tuples = []
426
+ count = ""
427
+ for char in cigar:
428
+ if char.isdigit():
429
+ count += char
430
+ else:
431
+ op = CigarOp.from_cigar_symbol(char)
432
+ op_tuples.append((op, count))
433
+ count = ""
434
+ return np.array(op_tuples, dtype=int)