biotite 0.41.1__cp311-cp311-macosx_10_16_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (340) hide show
  1. biotite/__init__.py +19 -0
  2. biotite/application/__init__.py +43 -0
  3. biotite/application/application.py +265 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +505 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +83 -0
  8. biotite/application/blast/webapp.py +421 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +238 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +152 -0
  13. biotite/application/localapp.py +306 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +122 -0
  16. biotite/application/msaapp.py +374 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +254 -0
  19. biotite/application/muscle/app5.py +171 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +456 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +222 -0
  24. biotite/application/util.py +59 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +304 -0
  27. biotite/application/viennarna/rnafold.py +269 -0
  28. biotite/application/viennarna/rnaplot.py +187 -0
  29. biotite/application/viennarna/util.py +72 -0
  30. biotite/application/webapp.py +77 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/entrez/__init__.py +15 -0
  34. biotite/database/entrez/check.py +61 -0
  35. biotite/database/entrez/dbnames.py +89 -0
  36. biotite/database/entrez/download.py +223 -0
  37. biotite/database/entrez/key.py +44 -0
  38. biotite/database/entrez/query.py +223 -0
  39. biotite/database/error.py +15 -0
  40. biotite/database/pubchem/__init__.py +21 -0
  41. biotite/database/pubchem/download.py +260 -0
  42. biotite/database/pubchem/error.py +20 -0
  43. biotite/database/pubchem/query.py +827 -0
  44. biotite/database/pubchem/throttle.py +99 -0
  45. biotite/database/rcsb/__init__.py +13 -0
  46. biotite/database/rcsb/download.py +167 -0
  47. biotite/database/rcsb/query.py +959 -0
  48. biotite/database/uniprot/__init__.py +13 -0
  49. biotite/database/uniprot/check.py +32 -0
  50. biotite/database/uniprot/download.py +134 -0
  51. biotite/database/uniprot/query.py +209 -0
  52. biotite/file.py +251 -0
  53. biotite/sequence/__init__.py +73 -0
  54. biotite/sequence/align/__init__.py +49 -0
  55. biotite/sequence/align/alignment.py +658 -0
  56. biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
  57. biotite/sequence/align/banded.pyx +652 -0
  58. biotite/sequence/align/buckets.py +69 -0
  59. biotite/sequence/align/cigar.py +434 -0
  60. biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
  61. biotite/sequence/align/kmeralphabet.pyx +574 -0
  62. biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
  63. biotite/sequence/align/kmersimilarity.pyx +233 -0
  64. biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
  65. biotite/sequence/align/kmertable.pyx +3400 -0
  66. biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
  67. biotite/sequence/align/localgapped.pyx +892 -0
  68. biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
  69. biotite/sequence/align/localungapped.pyx +279 -0
  70. biotite/sequence/align/matrix.py +405 -0
  71. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  72. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  73. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  74. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  75. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  76. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  77. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  78. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  79. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  80. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  81. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  82. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  83. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  84. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  85. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  86. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  87. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  88. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  89. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  93. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  94. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  95. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  96. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  97. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  98. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  99. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  100. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  101. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  102. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  103. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  104. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  105. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  106. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  107. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  108. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  109. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  110. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  111. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  112. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  113. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  114. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  115. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  116. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  117. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  118. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  119. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  120. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  121. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  122. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  154. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  155. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  156. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  157. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  158. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  159. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  160. biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
  161. biotite/sequence/align/multiple.pyx +620 -0
  162. biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
  163. biotite/sequence/align/pairwise.pyx +587 -0
  164. biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
  165. biotite/sequence/align/permutation.pyx +305 -0
  166. biotite/sequence/align/primes.txt +821 -0
  167. biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
  168. biotite/sequence/align/selector.pyx +956 -0
  169. biotite/sequence/align/statistics.py +265 -0
  170. biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
  171. biotite/sequence/align/tracetable.pxd +64 -0
  172. biotite/sequence/align/tracetable.pyx +370 -0
  173. biotite/sequence/alphabet.py +566 -0
  174. biotite/sequence/annotation.py +829 -0
  175. biotite/sequence/codec.cpython-311-darwin.so +0 -0
  176. biotite/sequence/codec.pyx +155 -0
  177. biotite/sequence/codon.py +466 -0
  178. biotite/sequence/codon_tables.txt +202 -0
  179. biotite/sequence/graphics/__init__.py +33 -0
  180. biotite/sequence/graphics/alignment.py +1034 -0
  181. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  182. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  183. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  184. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  185. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  186. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  187. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  188. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  189. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  190. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  191. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  192. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  193. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  194. biotite/sequence/graphics/color_schemes/pb_flower.json +39 -0
  195. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  196. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  197. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  198. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  199. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  200. biotite/sequence/graphics/colorschemes.py +139 -0
  201. biotite/sequence/graphics/dendrogram.py +184 -0
  202. biotite/sequence/graphics/features.py +510 -0
  203. biotite/sequence/graphics/logo.py +110 -0
  204. biotite/sequence/graphics/plasmid.py +661 -0
  205. biotite/sequence/io/__init__.py +12 -0
  206. biotite/sequence/io/fasta/__init__.py +22 -0
  207. biotite/sequence/io/fasta/convert.py +273 -0
  208. biotite/sequence/io/fasta/file.py +278 -0
  209. biotite/sequence/io/fastq/__init__.py +19 -0
  210. biotite/sequence/io/fastq/convert.py +120 -0
  211. biotite/sequence/io/fastq/file.py +551 -0
  212. biotite/sequence/io/genbank/__init__.py +17 -0
  213. biotite/sequence/io/genbank/annotation.py +277 -0
  214. biotite/sequence/io/genbank/file.py +575 -0
  215. biotite/sequence/io/genbank/metadata.py +324 -0
  216. biotite/sequence/io/genbank/sequence.py +172 -0
  217. biotite/sequence/io/general.py +192 -0
  218. biotite/sequence/io/gff/__init__.py +26 -0
  219. biotite/sequence/io/gff/convert.py +133 -0
  220. biotite/sequence/io/gff/file.py +434 -0
  221. biotite/sequence/phylo/__init__.py +36 -0
  222. biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
  223. biotite/sequence/phylo/nj.pyx +221 -0
  224. biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
  225. biotite/sequence/phylo/tree.pyx +1169 -0
  226. biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
  227. biotite/sequence/phylo/upgma.pyx +164 -0
  228. biotite/sequence/profile.py +456 -0
  229. biotite/sequence/search.py +116 -0
  230. biotite/sequence/seqtypes.py +556 -0
  231. biotite/sequence/sequence.py +374 -0
  232. biotite/structure/__init__.py +132 -0
  233. biotite/structure/atoms.py +1455 -0
  234. biotite/structure/basepairs.py +1415 -0
  235. biotite/structure/bonds.cpython-311-darwin.so +0 -0
  236. biotite/structure/bonds.pyx +1933 -0
  237. biotite/structure/box.py +592 -0
  238. biotite/structure/celllist.cpython-311-darwin.so +0 -0
  239. biotite/structure/celllist.pyx +849 -0
  240. biotite/structure/chains.py +298 -0
  241. biotite/structure/charges.cpython-311-darwin.so +0 -0
  242. biotite/structure/charges.pyx +520 -0
  243. biotite/structure/compare.py +274 -0
  244. biotite/structure/density.py +114 -0
  245. biotite/structure/dotbracket.py +216 -0
  246. biotite/structure/error.py +31 -0
  247. biotite/structure/filter.py +585 -0
  248. biotite/structure/geometry.py +697 -0
  249. biotite/structure/graphics/__init__.py +13 -0
  250. biotite/structure/graphics/atoms.py +226 -0
  251. biotite/structure/graphics/rna.py +282 -0
  252. biotite/structure/hbond.py +409 -0
  253. biotite/structure/info/__init__.py +25 -0
  254. biotite/structure/info/atom_masses.json +121 -0
  255. biotite/structure/info/atoms.py +82 -0
  256. biotite/structure/info/bonds.py +145 -0
  257. biotite/structure/info/ccd/README.rst +8 -0
  258. biotite/structure/info/ccd/amino_acids.txt +1663 -0
  259. biotite/structure/info/ccd/carbohydrates.txt +1135 -0
  260. biotite/structure/info/ccd/components.bcif +0 -0
  261. biotite/structure/info/ccd/nucleotides.txt +798 -0
  262. biotite/structure/info/ccd.py +95 -0
  263. biotite/structure/info/groups.py +90 -0
  264. biotite/structure/info/masses.py +123 -0
  265. biotite/structure/info/misc.py +144 -0
  266. biotite/structure/info/radii.py +197 -0
  267. biotite/structure/info/standardize.py +196 -0
  268. biotite/structure/integrity.py +268 -0
  269. biotite/structure/io/__init__.py +30 -0
  270. biotite/structure/io/ctab.py +72 -0
  271. biotite/structure/io/dcd/__init__.py +13 -0
  272. biotite/structure/io/dcd/file.py +65 -0
  273. biotite/structure/io/general.py +257 -0
  274. biotite/structure/io/gro/__init__.py +14 -0
  275. biotite/structure/io/gro/file.py +343 -0
  276. biotite/structure/io/mmtf/__init__.py +21 -0
  277. biotite/structure/io/mmtf/assembly.py +214 -0
  278. biotite/structure/io/mmtf/convertarray.cpython-311-darwin.so +0 -0
  279. biotite/structure/io/mmtf/convertarray.pyx +341 -0
  280. biotite/structure/io/mmtf/convertfile.cpython-311-darwin.so +0 -0
  281. biotite/structure/io/mmtf/convertfile.pyx +501 -0
  282. biotite/structure/io/mmtf/decode.cpython-311-darwin.so +0 -0
  283. biotite/structure/io/mmtf/decode.pyx +152 -0
  284. biotite/structure/io/mmtf/encode.cpython-311-darwin.so +0 -0
  285. biotite/structure/io/mmtf/encode.pyx +183 -0
  286. biotite/structure/io/mmtf/file.py +233 -0
  287. biotite/structure/io/mol/__init__.py +20 -0
  288. biotite/structure/io/mol/convert.py +115 -0
  289. biotite/structure/io/mol/ctab.py +414 -0
  290. biotite/structure/io/mol/header.py +116 -0
  291. biotite/structure/io/mol/mol.py +193 -0
  292. biotite/structure/io/mol/sdf.py +916 -0
  293. biotite/structure/io/netcdf/__init__.py +13 -0
  294. biotite/structure/io/netcdf/file.py +63 -0
  295. biotite/structure/io/npz/__init__.py +20 -0
  296. biotite/structure/io/npz/file.py +152 -0
  297. biotite/structure/io/pdb/__init__.py +20 -0
  298. biotite/structure/io/pdb/convert.py +293 -0
  299. biotite/structure/io/pdb/file.py +1240 -0
  300. biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
  301. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  302. biotite/structure/io/pdbqt/__init__.py +15 -0
  303. biotite/structure/io/pdbqt/convert.py +107 -0
  304. biotite/structure/io/pdbqt/file.py +640 -0
  305. biotite/structure/io/pdbx/__init__.py +23 -0
  306. biotite/structure/io/pdbx/bcif.py +648 -0
  307. biotite/structure/io/pdbx/cif.py +1032 -0
  308. biotite/structure/io/pdbx/component.py +246 -0
  309. biotite/structure/io/pdbx/convert.py +1597 -0
  310. biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
  311. biotite/structure/io/pdbx/encoding.pyx +950 -0
  312. biotite/structure/io/pdbx/legacy.py +267 -0
  313. biotite/structure/io/tng/__init__.py +13 -0
  314. biotite/structure/io/tng/file.py +46 -0
  315. biotite/structure/io/trajfile.py +710 -0
  316. biotite/structure/io/trr/__init__.py +13 -0
  317. biotite/structure/io/trr/file.py +46 -0
  318. biotite/structure/io/xtc/__init__.py +13 -0
  319. biotite/structure/io/xtc/file.py +46 -0
  320. biotite/structure/mechanics.py +75 -0
  321. biotite/structure/molecules.py +353 -0
  322. biotite/structure/pseudoknots.py +642 -0
  323. biotite/structure/rdf.py +243 -0
  324. biotite/structure/repair.py +253 -0
  325. biotite/structure/residues.py +562 -0
  326. biotite/structure/resutil.py +178 -0
  327. biotite/structure/sasa.cpython-311-darwin.so +0 -0
  328. biotite/structure/sasa.pyx +322 -0
  329. biotite/structure/sequence.py +112 -0
  330. biotite/structure/sse.py +327 -0
  331. biotite/structure/superimpose.py +727 -0
  332. biotite/structure/transform.py +504 -0
  333. biotite/structure/util.py +98 -0
  334. biotite/temp.py +86 -0
  335. biotite/version.py +16 -0
  336. biotite/visualize.py +251 -0
  337. biotite-0.41.1.dist-info/METADATA +187 -0
  338. biotite-0.41.1.dist-info/RECORD +340 -0
  339. biotite-0.41.1.dist-info/WHEEL +4 -0
  340. biotite-0.41.1.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,164 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence.phylo"
6
+ __author__ = "Patrick Kunzmann"
7
+ __all__ = ["upgma"]
8
+
9
+ cimport cython
10
+ cimport numpy as np
11
+
12
+ from .tree import Tree, TreeNode
13
+ import numpy as np
14
+
15
+ ctypedef np.float32_t float32
16
+ ctypedef np.uint8_t uint8
17
+ ctypedef np.uint32_t uint32
18
+
19
+
20
+ cdef float32 MAX_FLOAT = np.finfo(np.float32).max
21
+
22
+
23
@cython.boundscheck(False)
@cython.wraparound(False)
def upgma(np.ndarray distances):
    """
    upgma(distances)

    Perform hierarchical clustering using the
    *unweighted pair group method with arithmetic mean* (UPGMA).

    This algorithm produces leaf nodes with the same distance to the
    root node.
    In the context of evolution this means a constant evolution rate
    (molecular clock).

    Parameters
    ----------
    distances : ndarray, shape=(n,n)
        Pairwise distance matrix.

    Returns
    -------
    tree : Tree
        A rooted binary tree. The `index` attribute in the leaf
        :class:`TreeNode` objects refer to the indices of `distances`.

    Raises
    ------
    ValueError
        If the distance matrix is not two-dimensional, is empty,
        is not symmetric, contains NaN values, contains values that
        are not smaller than the largest finite ``float32`` value,
        or if any matrix entry is below 0.

    Examples
    --------

    >>> distances = np.array([
    ...     [0, 1, 7, 7, 9],
    ...     [1, 0, 7, 6, 8],
    ...     [7, 7, 0, 2, 4],
    ...     [7, 6, 2, 0, 3],
    ...     [9, 8, 4, 3, 0],
    ... ])
    >>> tree = upgma(distances)
    >>> print(tree.to_newick(include_distance=False))
    ((4,(3,2)),(1,0));
    """
    cdef int i=0, j=0, k=0
    cdef int i_min=0, j_min=0
    cdef float32 dist, dist_min
    cdef float mean
    cdef float height

    # The dimensionality must be verified before 'shape[1]' is read:
    # on an 'np.ndarray' typed argument 'shape' is a plain C array,
    # so indexing past 'ndim' would read invalid memory
    if distances.ndim != 2:
        raise ValueError("Distance matrix must be two-dimensional")
    if distances.shape[0] != distances.shape[1] \
       or not np.allclose(distances.T, distances):
        raise ValueError("Distance matrix must be symmetric")
    # Without any leaf node there is no root node to return
    if distances.shape[0] == 0:
        raise ValueError("Distance matrix must not be empty")
    if np.isnan(distances).any():
        raise ValueError("Distance matrix contains NaN values")
    if (distances >= MAX_FLOAT).any():
        raise ValueError("Distance matrix contains infinity")
    if (distances < 0).any():
        raise ValueError("Distances must be positive")


    # Keep track on clustered indices
    cdef np.ndarray nodes = np.array(
        [TreeNode(index=i) for i in range(distances.shape[0])]
    )
    # Indicates whether an index in the distance matrix has already been
    # clustered and the respective rows and columns can be ignored
    cdef uint8[:] is_clustered_v = np.full(
        distances.shape[0], False, dtype=np.uint8
    )
    # Number of indices in the current node (cardinality)
    # (required for proportional averaging)
    cdef uint32[:] cluster_size_v = np.ones(
        distances.shape[0], dtype=np.uint32
    )
    # Distance of each node from leaf nodes,
    # used for calculation of distance to child nodes
    cdef float32[:] node_heights = np.zeros(
        distances.shape[0], dtype=np.float32
    )


    # Cluster indices
    # Work on a float32 copy, so the input matrix is never modified
    cdef float32[:,:] distances_v = distances.astype(np.float32, copy=True)
    # Exit loop via 'break'
    while True:

        # Find minimum distance
        # Only the lower triangle (j < i) is scanned,
        # as the matrix is symmetric
        dist_min = MAX_FLOAT
        i_min = -1
        j_min = -1
        for i in range(distances_v.shape[0]):
            if is_clustered_v[i]:
                continue
            for j in range(i):
                if is_clustered_v[j]:
                    continue
                dist = distances_v[i,j]
                if dist < dist_min:
                    dist_min = dist
                    i_min = i
                    j_min = j

        if i_min == -1 or j_min == -1:
            # No distance found -> all leaf nodes are clustered
            # -> exit loop
            break

        # Cluster the nodes with minimum distance
        # replacing the node at position i_min
        # leaving the node at position j_min empty
        # (is_clustered_v -> True)
        height = dist_min/2
        nodes[i_min] = TreeNode(
            (nodes[i_min], nodes[j_min]),
            (height-node_heights[i_min], height-node_heights[j_min])
        )
        node_heights[i_min] = height
        # Mark position j_min as clustered
        nodes[j_min] = None
        is_clustered_v[j_min] = True
        # Calculate arithmetic mean distances of child nodes
        # as distances for new node and update matrix
        for k in range(distances_v.shape[0]):
            if not is_clustered_v[k] and k != i_min:
                # Average weighted by the size of both merged clusters
                mean = (
                    (
                          distances_v[i_min,k] * cluster_size_v[i_min]
                        + distances_v[j_min,k] * cluster_size_v[j_min]
                    ) / (cluster_size_v[i_min] + cluster_size_v[j_min])
                )
                distances_v[i_min,k] = mean
                distances_v[k,i_min] = mean
        # Updating cluster size of new node
        cluster_size_v[i_min] = cluster_size_v[i_min] + cluster_size_v[j_min]


    # As each higher level node is always created on position i_min
    # and i is always higher than j in minimum distance calculation,
    # the root node must be at the last index
    return Tree(nodes[len(nodes)-1])
@@ -0,0 +1,456 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ import warnings
6
+ import numpy as np
7
+ from .seqtypes import NucleotideSequence, ProteinSequence, GeneralSequence
8
+ from .alphabet import LetterAlphabet
9
+ from .align.alignment import get_codes
10
+
11
+ __name__ = "biotite.sequence"
12
+ __author__ = "Maximilian Greil"
13
+ __all__ = ["SequenceProfile"]
14
+
15
+ # Abbreviations
16
+ _NUC_DNA_ALPH = NucleotideSequence.alphabet_unamb
17
+ _NUC_RNA_ALPH = LetterAlphabet(["A", "C", "G", "U"])
18
+ _PROT_ALPH = ProteinSequence.alphabet
19
+
20
+
21
+ def _determine_common_alphabet(alphabets):
22
+ """
23
+ Determine the common alphabet from a list of alphabets, that
24
+ extends all alphabets.
25
+ """
26
+ common_alphabet = alphabets[0]
27
+ for alphabet in alphabets[1:]:
28
+ if not common_alphabet.extends(alphabet):
29
+ if alphabet.extends(common_alphabet):
30
+ common_alphabet = alphabet
31
+ else:
32
+ raise ValueError(
33
+ "There is no common alphabet that extends all alphabets"
34
+ )
35
+ return common_alphabet
36
+
37
+
38
+ def _codes_to_iupac(frequency, codes, maxes, row):
39
+ """
40
+ Returns IUPAC code for a row of 'symbols' with none, one or
41
+ multiple maximum positions.
42
+ """
43
+ if np.sum(frequency) == 0:
44
+ raise ValueError(
45
+ f"There is an empty column in the 'symbols' frequency table. "
46
+ f"This doesn't make sense in context of an alignment. "
47
+ f"Please check the 'symbols' frequency table in row {row}."
48
+ )
49
+ key = tuple(np.where(frequency == maxes)[0])
50
+ return codes[key]
51
+
52
+
53
+ class SequenceProfile(object):
54
+ """
55
+ A :class:`SequenceProfile` object stores information about a
56
+ sequence profile of aligned sequences.
57
+ It is possible to calculate and return its consensus sequence.
58
+
59
+ This class saves the position frequency matrix
60
+ (position count matrix) 'symbols' of the occurrences of each
61
+ alphabet symbol at each position.
62
+ It also saves the number of gaps at each position in the array
63
+ 'gaps'.
64
+
65
+ With :meth:`probability_matrix()` the position probability matrix
66
+ can be created based on 'symbols' and a pseudocount.
67
+
68
+ With :meth:`log_odds_matrix()` the position weight matrix can
69
+ be created based on the before calculated position probability
70
+ matrix and the background frequencies.
71
+
72
+ With :meth:`from_alignment()` a :class:`SequenceProfile` object can
73
+ be created from an indefinite number of aligned sequences.
74
+
75
+ With :meth:`sequence_probability_from_matrix()` the probability of a
76
+ sequence can be calculated based on the before calculated position
77
+ probability matrix of this instance of object SequenceProfile.
78
+
79
+ With :meth:`sequence_score_from_matrix()` the score of a sequence
80
+ can be calculated based on the before calculated position weight
81
+ matrix of this instance of object SequenceProfile.
82
+
83
+ All attributes of this class are publicly accessible.
84
+
85
+ Parameters
86
+ ----------
87
+ symbols : ndarray, dtype=int, shape=(n,k)
88
+ This matrix simply saves for each position how often absolutely
89
+ each symbol is present.
90
+ gaps : ndarray, dtype=int, shape=n
91
+ Array which indicates the number of gaps at each position.
92
+ alphabet : Alphabet, length=k
93
+ Alphabet of sequences of sequence profile
94
+
95
+ Attributes
96
+ ----------
97
+ symbols : ndarray, dtype=int, shape=(n,k)
98
+ This matrix simply saves for each position how often absolutely
99
+ each symbol is present.
100
+ gaps : ndarray, dtype=int, shape=n
101
+ Array which indicates the number of gaps at each position.
102
+ alphabet : Alphabet, length=k
103
+ Alphabet of sequences of sequence profile
104
+ """
105
+
106
+ def __init__(self, symbols, gaps, alphabet):
107
+ self._symbols = symbols
108
+ self._gaps = gaps
109
+ self._alphabet = alphabet
110
+
111
+ if len(alphabet) != symbols.shape[1]:
112
+ raise ValueError(
113
+ f"The given alphabet doesn't have the same length "
114
+ f"({len(alphabet)}) as the number of columns "
115
+ f"({symbols.shape[1]}) in the 'symbols' frequency table."
116
+ )
117
+
118
+ if gaps.shape[0] != symbols.shape[0]:
119
+ raise ValueError(
120
+ f"The given 'gaps' position matrix doesn't have the same "
121
+ f"length ({gaps.shape[0]}) as the 'symbols' "
122
+ f"frequency table ({symbols.shape[0]})"
123
+ )
124
+
125
+ @property
126
+ def symbols(self):
127
+ return self._symbols
128
+
129
+ @property
130
+ def gaps(self):
131
+ return self._gaps
132
+
133
+ @property
134
+ def alphabet(self):
135
+ return self._alphabet
136
+
137
+ @symbols.setter
138
+ def symbols(self, new_symbols):
139
+ if not new_symbols.shape == self.symbols.shape:
140
+ raise ValueError(
141
+ f"New ndarray 'symbols' must be of same shape "
142
+ f"{self.symbols.shape} as the old one"
143
+ )
144
+ self._symbols = new_symbols
145
+
146
+ @gaps.setter
147
+ def gaps(self, new_gaps):
148
+ if not new_gaps.shape == self.gaps.shape:
149
+ raise ValueError(
150
+ f"New ndarray 'gaps' must be of same shape "
151
+ f"{self.gaps.shape} as the old one"
152
+ )
153
+ self._gaps = new_gaps
154
+
155
+ def __repr__(self):
156
+ """Represent SequenceProfile as a string for debugging."""
157
+ return f"SequenceProfile(np.{np.array_repr(self.symbols)}, " \
158
+ f"np.{np.array_repr(self.gaps)}, Alphabet({self.alphabet}))"
159
+
160
+ def __eq__(self, item):
161
+ if not isinstance(item, SequenceProfile):
162
+ return False
163
+ if not np.array_equal(self.symbols, item.symbols):
164
+ return False
165
+ if not np.array_equal(self.gaps, item.gaps):
166
+ return False
167
+ if not self.alphabet == item.alphabet:
168
+ return False
169
+ return True
170
+
171
@staticmethod
def from_alignment(alignment, alphabet=None):
    """
    Get an object of :class:`SequenceProfile` from an object of
    :class:`Alignment`.

    Based on the sequences of the alignment, the SequenceProfile
    parameters symbols and gaps are calculated.

    Parameters
    ----------
    alignment : Alignment
        An Alignment object to create the SequenceProfile object
        from.
    alphabet : Alphabet, optional
        This alphabet will be used when creating the SequenceProfile
        object. If no alphabet is selected, the alphabet for this
        SequenceProfile object will be determined from the
        alphabets of the sequences in the alignment.
        (Default: None).

    Returns
    -------
    profile: SequenceProfile
        The created SequenceProfile object

    Raises
    ------
    ValueError
        If the given `alphabet` does not extend the alphabet of
        each sequence in the alignment.
    """
    sequences = get_codes(alignment)
    if alphabet is None:
        alphabet = _determine_common_alphabet(
            [seq.alphabet for seq in alignment.sequences]
        )
    else:
        for alph in (seq.alphabet for seq in alignment.sequences):
            if not alphabet.extends(alph):
                raise ValueError(
                    "The given alphabet is incompatible with at least one "
                    "alphabet of the given sequences"
                )
    n_symbols = len(alphabet)
    n_positions = len(sequences[0])
    symbols = np.zeros((n_positions, n_symbols), dtype=int)
    gaps = np.zeros(n_positions, dtype=int)
    # Iterate over the alignment columns instead of the sequences
    for i, column in enumerate(np.transpose(sequences)):
        # Entries equal to -1 are counted as gaps:
        # They are moved into an additional bin behind the symbol bins
        row = np.where(column == -1, n_symbols, column)
        count = np.bincount(row, minlength=n_symbols + 1)
        symbols[i] = count[:n_symbols]
        gaps[i] = count[-1]
    return SequenceProfile(symbols, gaps, alphabet)
219
+
220
def to_consensus(self, as_general=False):
    """
    Get the consensus sequence for this SequenceProfile object.

    Parameters
    ----------
    as_general : bool
        If true, the consensus sequence is created by the general
        conversion, independent of the alphabet.
        Otherwise, the conversion is chosen based on the alphabet
        of this SequenceProfile object: There are dedicated
        conversions for the unambiguous DNA alphabet, the RNA
        alphabet and the protein alphabet, all other alphabets use
        the general conversion
        (Default: False).

    Returns
    -------
    consensus: Sequence
        The calculated consensus sequence
    """
    # https://en.wikipedia.org/wiki/International_Union_of_Pure_and_Applied_Chemistry#Amino_acid_and_nucleotide_base_codes
    if not as_general:
        profile_alphabet = self.alphabet
        if profile_alphabet == _NUC_DNA_ALPH:
            return NucleotideSequence(self._dna_to_consensus())
        if profile_alphabet == _NUC_RNA_ALPH:
            return NucleotideSequence(self._rna_to_consensus())
        if profile_alphabet == _PROT_ALPH:
            return self._prot_to_consensus()
    # Explicitly requested or no dedicated conversion for the alphabet
    return self._general_to_consensus()
248
+
249
def _dna_to_consensus(self):
    """
    Build the consensus string for a profile over the DNA alphabet.

    For each profile position, ``_codes_to_iupac()`` is called with
    the symbol counts of that position, the lookup table below, the
    maximum count at that position and the position index.
    The returned strings are concatenated in position order.

    Returns
    -------
    consensus : str
        The concatenated result for all profile positions.
    """
    # Keys are tuples of symbol codes, values are one-letter codes.
    # NOTE(review): presumably the key is the set of symbols tied for
    # the maximum count - confirm against '_codes_to_iupac()'
    iupac_table = {
        (0,): 'A', (1,): 'C', (2,): 'G', (3,): 'T',
        (0, 2): 'R', (1, 3): 'Y', (1, 2): 'S',
        (0, 3): 'W', (2, 3): 'K', (0, 1): 'M',
        (1, 2, 3): 'B', (0, 2, 3): 'D', (0, 1, 3): 'H', (0, 1, 2): 'V',
        (0, 1, 2, 3): 'N',
    }
    position_maxima = np.max(self.symbols, axis=1)
    return "".join(
        _codes_to_iupac(counts, iupac_table, maximum, pos)
        for pos, (counts, maximum)
        in enumerate(zip(self.symbols, position_maxima))
    )
261
+
262
def _rna_to_consensus(self):
    """
    Build the consensus string for a profile over the RNA alphabet.

    For each profile position, ``_codes_to_iupac()`` is called with
    the symbol counts of that position, the lookup table below, the
    maximum count at that position and the position index.
    The returned strings are concatenated in position order.

    Returns
    -------
    consensus : str
        The concatenated result for all profile positions.
    """
    # Keys are tuples of symbol codes, values are one-letter codes;
    # code 3 maps to 'U' in this table.
    # NOTE(review): presumably the key is the set of symbols tied for
    # the maximum count - confirm against '_codes_to_iupac()'
    iupac_table = {
        (0,): 'A', (1,): 'C', (2,): 'G', (3,): 'U',
        (0, 2): 'R', (1, 3): 'Y', (1, 2): 'S',
        (0, 3): 'W', (2, 3): 'K', (0, 1): 'M',
        (1, 2, 3): 'B', (0, 2, 3): 'D', (0, 1, 3): 'H', (0, 1, 2): 'V',
        (0, 1, 2, 3): 'N',
    }
    position_maxima = np.max(self.symbols, axis=1)
    return "".join(
        _codes_to_iupac(counts, iupac_table, maximum, pos)
        for pos, (counts, maximum)
        in enumerate(zip(self.symbols, position_maxima))
    )
274
+
275
def _prot_to_consensus(self):
    """
    Build the consensus as :class:`ProteinSequence`.

    At each position the symbol with the highest count is taken.
    If multiple symbols share the highest count, the one with the
    lowest symbol code is chosen.
    Positions where no symbol was counted at all get the code 23.

    Returns
    -------
    consensus : ProteinSequence
        The consensus sequence.
    """
    # 'argmax()' returns the first index on ties
    most_frequent = np.argmax(self.symbols, axis=1)
    is_unpopulated = np.sum(self.symbols, axis=1) == 0
    consensus = ProteinSequence()
    # _PROT_ALPH[23] = 'X'
    consensus.code = np.where(is_unpopulated, 23, most_frequent)
    return consensus
287
+
288
def _general_to_consensus(self):
    """
    Build the consensus as :class:`GeneralSequence` over the
    alphabet of this profile.

    At each position the symbol with the highest count is taken.
    If multiple symbols share the highest count, the one with the
    lowest symbol code is chosen.
    This also applies to positions where all counts are zero:
    there the first symbol of the alphabet is taken.

    Returns
    -------
    consensus : GeneralSequence
        The consensus sequence.
    """
    # 'argmax()' returns the first index on ties
    most_frequent = np.argmax(self.symbols, axis=1)
    consensus = GeneralSequence(self.alphabet)
    consensus.code = most_frequent
    return consensus
300
+
301
def probability_matrix(self, pseudocount=0):
    r"""
    Calculate the position probability matrix (PPM) from the
    symbol counts and the given pseudocount.
    The returned matrix has the same shape as 'symbols'.

    .. math::

        P(S) = \frac {C_S + \frac{c_p}{k}} {\sum_{i} C_i + c_p}

    :math:`S`: The symbol.

    :math:`C_S`: The count of symbol :math:`S` at the sequence
    position.

    :math:`c_p`: The pseudocount.

    :math:`k`: Length of the alphabet.

    Parameters
    ----------
    pseudocount : int, optional
        Amount added to the number of observed cases in order to
        change the expected probability of the PPM.
        (Default: 0)

    Returns
    -------
    ppm : ndarray, dtype=float, shape=(n,k)
        The calculated position probability matrix.

    Raises
    ------
    ValueError
        If `pseudocount` is negative.

    Notes
    -----
    A position without any counted symbol combined with a
    pseudocount of zero results in a division of zero by zero.
    """
    if pseudocount < 0:
        raise ValueError(
            "Pseudocount can not be smaller than zero."
        )
    counts = self.symbols
    alphabet_size = counts.shape[1]
    # The pseudocount is distributed evenly over all symbols
    smoothed_counts = counts + pseudocount / alphabet_size
    # Keep the position axis to divide each row by its own total
    position_totals = np.sum(counts, axis=1)[:, np.newaxis] + pseudocount
    return smoothed_counts / position_totals
338
+
339
def log_odds_matrix(self, background_frequencies=None, pseudocount=0):
    r"""
    Calculate the position weight matrix (PWM) from the
    position probability matrix (PPM) (with given pseudocount) and
    the background frequencies.
    The returned matrix has the same shape as 'symbols'.

    .. math::

        W(S) = \log_2 \left( \frac{P(S)}{B_S} \right)

    :math:`S`: The symbol.

    :math:`P(S)`: The probability of symbol :math:`S` at the
    sequence position.

    :math:`B_S`: The background frequency of symbol :math:`S`.

    Parameters
    ----------
    background_frequencies : ndarray, shape=(k,), dtype=float, optional
        The background frequencies for each symbol in the alphabet.
        By default, a uniform distribution is assumed.
    pseudocount : int, optional
        Amount added to the number of observed cases in order to
        change the expected probability of the PPM.
        (Default: 0)

    Returns
    -------
    pwm : ndarray, dtype=float, shape=(n,k)
        The calculated position weight matrix.
    """
    if background_frequencies is None:
        # Uniform distribution over all symbols of the alphabet
        background = 1 / len(self.alphabet)
    else:
        background = background_frequencies
    probabilities = self.probability_matrix(pseudocount=pseudocount)
    # A symbol that is missing at any position of the profile has a
    # probability of zero, whose logarithm raises a warning:
    # suppress runtime warnings for this calculation
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        odds = probabilities / background
        return np.log2(odds)
380
+
381
def sequence_probability(self, sequence, pseudocount=0):
    r"""
    Calculate the probability of a sequence based on the
    position probability matrix (PPM).

    The sequence probability is the product of the probabilities of
    the respective symbol over all sequence positions.

    Parameters
    ----------
    sequence : Sequence
        The input sequence.
    pseudocount : int, optional
        Amount added to the number of observed cases in order to
        change the expected probability of the PPM.
        (Default: 0)

    Returns
    -------
    probability : float
        The calculated probability for the input sequence based on
        the PPM.

    Raises
    ------
    ValueError
        If the length of `sequence` differs from the length of the
        PPM or if the PPM does not have the shape of 'symbols'.
    """
    ppm = self.probability_matrix(pseudocount=pseudocount)
    if len(sequence) != len(ppm):
        raise ValueError(
            f"The given sequence has a different length ({len(sequence)}) than "
            f"the position probability matrix ({len(ppm)})."
        )
    if ppm.shape != self.symbols.shape:
        raise ValueError(
            f"Position probability matrix {ppm.shape} must be of same shape "
            f"as 'symbols' {self.symbols.shape}"
        )
    # Pick for each position the probability of the symbol that the
    # sequence has at this position
    positions = np.arange(len(sequence))
    symbol_probabilities = ppm[positions, sequence.code]
    return np.prod(symbol_probabilities)
416
+
417
def sequence_score(self, sequence, background_frequencies=None, pseudocount=0):
    """
    Calculate the score of a sequence based on the
    position weight matrix (PWM).

    The score is the sum of the weights (log-odds scores) of
    the respective symbol over all sequence positions.

    Parameters
    ----------
    sequence : Sequence
        The input sequence.
    background_frequencies : ndarray, shape=(k,), dtype=float, optional
        The background frequencies for each symbol in the alphabet.
        By default a uniform distribution is assumed.
    pseudocount : int, optional
        Amount added to the number of observed cases in order to
        change the expected probability of the PPM.
        (Default: 0)

    Returns
    -------
    score : float
        The calculated score for the input sequence based on
        the PWM.

    Raises
    ------
    ValueError
        If the length of `sequence` differs from the length of the
        PWM or if the PWM does not have the shape of 'symbols'.
    """
    if background_frequencies is None:
        # Uniform distribution over all symbols of the alphabet
        background_frequencies = 1 / len(self.alphabet)
    pwm = self.log_odds_matrix(
        background_frequencies=background_frequencies,
        pseudocount=pseudocount,
    )
    if len(sequence) != len(pwm):
        raise ValueError(
            f"The given sequence has a different length ({len(sequence)}) than "
            f"the position weight matrix ({len(pwm)})."
        )
    if pwm.shape != self.symbols.shape:
        raise ValueError(
            f"Position weight matrix {pwm.shape} must be of same shape "
            f"as 'symbols' {self.symbols.shape}"
        )
    # Pick for each position the weight of the symbol that the
    # sequence has at this position
    positions = np.arange(len(sequence))
    symbol_weights = pwm[positions, sequence.code]
    return np.sum(symbol_weights)