biotite 0.41.1__cp310-cp310-macosx_10_16_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (340) hide show
  1. biotite/__init__.py +19 -0
  2. biotite/application/__init__.py +43 -0
  3. biotite/application/application.py +265 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +505 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +83 -0
  8. biotite/application/blast/webapp.py +421 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +238 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +152 -0
  13. biotite/application/localapp.py +306 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +122 -0
  16. biotite/application/msaapp.py +374 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +254 -0
  19. biotite/application/muscle/app5.py +171 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +456 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +222 -0
  24. biotite/application/util.py +59 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +304 -0
  27. biotite/application/viennarna/rnafold.py +269 -0
  28. biotite/application/viennarna/rnaplot.py +187 -0
  29. biotite/application/viennarna/util.py +72 -0
  30. biotite/application/webapp.py +77 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/entrez/__init__.py +15 -0
  34. biotite/database/entrez/check.py +61 -0
  35. biotite/database/entrez/dbnames.py +89 -0
  36. biotite/database/entrez/download.py +223 -0
  37. biotite/database/entrez/key.py +44 -0
  38. biotite/database/entrez/query.py +223 -0
  39. biotite/database/error.py +15 -0
  40. biotite/database/pubchem/__init__.py +21 -0
  41. biotite/database/pubchem/download.py +260 -0
  42. biotite/database/pubchem/error.py +20 -0
  43. biotite/database/pubchem/query.py +827 -0
  44. biotite/database/pubchem/throttle.py +99 -0
  45. biotite/database/rcsb/__init__.py +13 -0
  46. biotite/database/rcsb/download.py +167 -0
  47. biotite/database/rcsb/query.py +959 -0
  48. biotite/database/uniprot/__init__.py +13 -0
  49. biotite/database/uniprot/check.py +32 -0
  50. biotite/database/uniprot/download.py +134 -0
  51. biotite/database/uniprot/query.py +209 -0
  52. biotite/file.py +251 -0
  53. biotite/sequence/__init__.py +73 -0
  54. biotite/sequence/align/__init__.py +49 -0
  55. biotite/sequence/align/alignment.py +658 -0
  56. biotite/sequence/align/banded.cpython-310-darwin.so +0 -0
  57. biotite/sequence/align/banded.pyx +652 -0
  58. biotite/sequence/align/buckets.py +69 -0
  59. biotite/sequence/align/cigar.py +434 -0
  60. biotite/sequence/align/kmeralphabet.cpython-310-darwin.so +0 -0
  61. biotite/sequence/align/kmeralphabet.pyx +574 -0
  62. biotite/sequence/align/kmersimilarity.cpython-310-darwin.so +0 -0
  63. biotite/sequence/align/kmersimilarity.pyx +233 -0
  64. biotite/sequence/align/kmertable.cpython-310-darwin.so +0 -0
  65. biotite/sequence/align/kmertable.pyx +3400 -0
  66. biotite/sequence/align/localgapped.cpython-310-darwin.so +0 -0
  67. biotite/sequence/align/localgapped.pyx +892 -0
  68. biotite/sequence/align/localungapped.cpython-310-darwin.so +0 -0
  69. biotite/sequence/align/localungapped.pyx +279 -0
  70. biotite/sequence/align/matrix.py +405 -0
  71. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  72. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  73. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  74. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  75. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  76. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  77. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  78. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  79. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  80. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  81. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  82. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  83. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  84. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  85. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  86. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  87. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  88. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  89. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  93. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  94. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  95. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  96. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  97. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  98. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  99. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  100. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  101. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  102. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  103. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  104. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  105. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  106. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  107. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  108. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  109. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  110. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  111. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  112. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  113. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  114. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  115. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  116. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  117. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  118. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  119. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  120. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  121. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  122. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  154. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  155. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  156. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  157. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  158. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  159. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  160. biotite/sequence/align/multiple.cpython-310-darwin.so +0 -0
  161. biotite/sequence/align/multiple.pyx +620 -0
  162. biotite/sequence/align/pairwise.cpython-310-darwin.so +0 -0
  163. biotite/sequence/align/pairwise.pyx +587 -0
  164. biotite/sequence/align/permutation.cpython-310-darwin.so +0 -0
  165. biotite/sequence/align/permutation.pyx +305 -0
  166. biotite/sequence/align/primes.txt +821 -0
  167. biotite/sequence/align/selector.cpython-310-darwin.so +0 -0
  168. biotite/sequence/align/selector.pyx +956 -0
  169. biotite/sequence/align/statistics.py +265 -0
  170. biotite/sequence/align/tracetable.cpython-310-darwin.so +0 -0
  171. biotite/sequence/align/tracetable.pxd +64 -0
  172. biotite/sequence/align/tracetable.pyx +370 -0
  173. biotite/sequence/alphabet.py +566 -0
  174. biotite/sequence/annotation.py +829 -0
  175. biotite/sequence/codec.cpython-310-darwin.so +0 -0
  176. biotite/sequence/codec.pyx +155 -0
  177. biotite/sequence/codon.py +466 -0
  178. biotite/sequence/codon_tables.txt +202 -0
  179. biotite/sequence/graphics/__init__.py +33 -0
  180. biotite/sequence/graphics/alignment.py +1034 -0
  181. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  182. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  183. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  184. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  185. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  186. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  187. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  188. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  189. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  190. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  191. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  192. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  193. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  194. biotite/sequence/graphics/color_schemes/pb_flower.json +39 -0
  195. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  196. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  197. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  198. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  199. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  200. biotite/sequence/graphics/colorschemes.py +139 -0
  201. biotite/sequence/graphics/dendrogram.py +184 -0
  202. biotite/sequence/graphics/features.py +510 -0
  203. biotite/sequence/graphics/logo.py +110 -0
  204. biotite/sequence/graphics/plasmid.py +661 -0
  205. biotite/sequence/io/__init__.py +12 -0
  206. biotite/sequence/io/fasta/__init__.py +22 -0
  207. biotite/sequence/io/fasta/convert.py +273 -0
  208. biotite/sequence/io/fasta/file.py +278 -0
  209. biotite/sequence/io/fastq/__init__.py +19 -0
  210. biotite/sequence/io/fastq/convert.py +120 -0
  211. biotite/sequence/io/fastq/file.py +551 -0
  212. biotite/sequence/io/genbank/__init__.py +17 -0
  213. biotite/sequence/io/genbank/annotation.py +277 -0
  214. biotite/sequence/io/genbank/file.py +575 -0
  215. biotite/sequence/io/genbank/metadata.py +324 -0
  216. biotite/sequence/io/genbank/sequence.py +172 -0
  217. biotite/sequence/io/general.py +192 -0
  218. biotite/sequence/io/gff/__init__.py +26 -0
  219. biotite/sequence/io/gff/convert.py +133 -0
  220. biotite/sequence/io/gff/file.py +434 -0
  221. biotite/sequence/phylo/__init__.py +36 -0
  222. biotite/sequence/phylo/nj.cpython-310-darwin.so +0 -0
  223. biotite/sequence/phylo/nj.pyx +221 -0
  224. biotite/sequence/phylo/tree.cpython-310-darwin.so +0 -0
  225. biotite/sequence/phylo/tree.pyx +1169 -0
  226. biotite/sequence/phylo/upgma.cpython-310-darwin.so +0 -0
  227. biotite/sequence/phylo/upgma.pyx +164 -0
  228. biotite/sequence/profile.py +456 -0
  229. biotite/sequence/search.py +116 -0
  230. biotite/sequence/seqtypes.py +556 -0
  231. biotite/sequence/sequence.py +374 -0
  232. biotite/structure/__init__.py +132 -0
  233. biotite/structure/atoms.py +1455 -0
  234. biotite/structure/basepairs.py +1415 -0
  235. biotite/structure/bonds.cpython-310-darwin.so +0 -0
  236. biotite/structure/bonds.pyx +1933 -0
  237. biotite/structure/box.py +592 -0
  238. biotite/structure/celllist.cpython-310-darwin.so +0 -0
  239. biotite/structure/celllist.pyx +849 -0
  240. biotite/structure/chains.py +298 -0
  241. biotite/structure/charges.cpython-310-darwin.so +0 -0
  242. biotite/structure/charges.pyx +520 -0
  243. biotite/structure/compare.py +274 -0
  244. biotite/structure/density.py +114 -0
  245. biotite/structure/dotbracket.py +216 -0
  246. biotite/structure/error.py +31 -0
  247. biotite/structure/filter.py +585 -0
  248. biotite/structure/geometry.py +697 -0
  249. biotite/structure/graphics/__init__.py +13 -0
  250. biotite/structure/graphics/atoms.py +226 -0
  251. biotite/structure/graphics/rna.py +282 -0
  252. biotite/structure/hbond.py +409 -0
  253. biotite/structure/info/__init__.py +25 -0
  254. biotite/structure/info/atom_masses.json +121 -0
  255. biotite/structure/info/atoms.py +82 -0
  256. biotite/structure/info/bonds.py +145 -0
  257. biotite/structure/info/ccd/README.rst +8 -0
  258. biotite/structure/info/ccd/amino_acids.txt +1663 -0
  259. biotite/structure/info/ccd/carbohydrates.txt +1135 -0
  260. biotite/structure/info/ccd/components.bcif +0 -0
  261. biotite/structure/info/ccd/nucleotides.txt +798 -0
  262. biotite/structure/info/ccd.py +95 -0
  263. biotite/structure/info/groups.py +90 -0
  264. biotite/structure/info/masses.py +123 -0
  265. biotite/structure/info/misc.py +144 -0
  266. biotite/structure/info/radii.py +197 -0
  267. biotite/structure/info/standardize.py +196 -0
  268. biotite/structure/integrity.py +268 -0
  269. biotite/structure/io/__init__.py +30 -0
  270. biotite/structure/io/ctab.py +72 -0
  271. biotite/structure/io/dcd/__init__.py +13 -0
  272. biotite/structure/io/dcd/file.py +65 -0
  273. biotite/structure/io/general.py +257 -0
  274. biotite/structure/io/gro/__init__.py +14 -0
  275. biotite/structure/io/gro/file.py +343 -0
  276. biotite/structure/io/mmtf/__init__.py +21 -0
  277. biotite/structure/io/mmtf/assembly.py +214 -0
  278. biotite/structure/io/mmtf/convertarray.cpython-310-darwin.so +0 -0
  279. biotite/structure/io/mmtf/convertarray.pyx +341 -0
  280. biotite/structure/io/mmtf/convertfile.cpython-310-darwin.so +0 -0
  281. biotite/structure/io/mmtf/convertfile.pyx +501 -0
  282. biotite/structure/io/mmtf/decode.cpython-310-darwin.so +0 -0
  283. biotite/structure/io/mmtf/decode.pyx +152 -0
  284. biotite/structure/io/mmtf/encode.cpython-310-darwin.so +0 -0
  285. biotite/structure/io/mmtf/encode.pyx +183 -0
  286. biotite/structure/io/mmtf/file.py +233 -0
  287. biotite/structure/io/mol/__init__.py +20 -0
  288. biotite/structure/io/mol/convert.py +115 -0
  289. biotite/structure/io/mol/ctab.py +414 -0
  290. biotite/structure/io/mol/header.py +116 -0
  291. biotite/structure/io/mol/mol.py +193 -0
  292. biotite/structure/io/mol/sdf.py +916 -0
  293. biotite/structure/io/netcdf/__init__.py +13 -0
  294. biotite/structure/io/netcdf/file.py +63 -0
  295. biotite/structure/io/npz/__init__.py +20 -0
  296. biotite/structure/io/npz/file.py +152 -0
  297. biotite/structure/io/pdb/__init__.py +20 -0
  298. biotite/structure/io/pdb/convert.py +293 -0
  299. biotite/structure/io/pdb/file.py +1240 -0
  300. biotite/structure/io/pdb/hybrid36.cpython-310-darwin.so +0 -0
  301. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  302. biotite/structure/io/pdbqt/__init__.py +15 -0
  303. biotite/structure/io/pdbqt/convert.py +107 -0
  304. biotite/structure/io/pdbqt/file.py +640 -0
  305. biotite/structure/io/pdbx/__init__.py +23 -0
  306. biotite/structure/io/pdbx/bcif.py +648 -0
  307. biotite/structure/io/pdbx/cif.py +1032 -0
  308. biotite/structure/io/pdbx/component.py +246 -0
  309. biotite/structure/io/pdbx/convert.py +1597 -0
  310. biotite/structure/io/pdbx/encoding.cpython-310-darwin.so +0 -0
  311. biotite/structure/io/pdbx/encoding.pyx +950 -0
  312. biotite/structure/io/pdbx/legacy.py +267 -0
  313. biotite/structure/io/tng/__init__.py +13 -0
  314. biotite/structure/io/tng/file.py +46 -0
  315. biotite/structure/io/trajfile.py +710 -0
  316. biotite/structure/io/trr/__init__.py +13 -0
  317. biotite/structure/io/trr/file.py +46 -0
  318. biotite/structure/io/xtc/__init__.py +13 -0
  319. biotite/structure/io/xtc/file.py +46 -0
  320. biotite/structure/mechanics.py +75 -0
  321. biotite/structure/molecules.py +353 -0
  322. biotite/structure/pseudoknots.py +642 -0
  323. biotite/structure/rdf.py +243 -0
  324. biotite/structure/repair.py +253 -0
  325. biotite/structure/residues.py +562 -0
  326. biotite/structure/resutil.py +178 -0
  327. biotite/structure/sasa.cpython-310-darwin.so +0 -0
  328. biotite/structure/sasa.pyx +322 -0
  329. biotite/structure/sequence.py +112 -0
  330. biotite/structure/sse.py +327 -0
  331. biotite/structure/superimpose.py +727 -0
  332. biotite/structure/transform.py +504 -0
  333. biotite/structure/util.py +98 -0
  334. biotite/temp.py +86 -0
  335. biotite/version.py +16 -0
  336. biotite/visualize.py +251 -0
  337. biotite-0.41.1.dist-info/METADATA +187 -0
  338. biotite-0.41.1.dist-info/RECORD +340 -0
  339. biotite-0.41.1.dist-info/WHEEL +4 -0
  340. biotite-0.41.1.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,620 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence.align"
6
+ __author__ = "Patrick Kunzmann"
7
+ __all__ = ["align_multiple"]
8
+
9
+ cimport cython
10
+ cimport numpy as np
11
+ from libc.math cimport log
12
+
13
+ import numpy as np
14
+ from .matrix import SubstitutionMatrix
15
+ from .alignment import Alignment
16
+ from .pairwise import align_optimal
17
+ from ..sequence import Sequence
18
+ from ..alphabet import Alphabet
19
+ from ..phylo.upgma import upgma
20
+ from ..phylo.tree import Tree, TreeNode, as_binary
21
+
22
+
23
+ ctypedef np.int32_t int32
24
+ ctypedef np.int64_t int64
25
+ ctypedef np.uint8_t uint8
26
+ ctypedef np.uint16_t uint16
27
+ ctypedef np.uint32_t uint32
28
+ ctypedef np.uint64_t uint64
29
+ ctypedef np.float32_t float32
30
+
31
+ ctypedef fused CodeType:
32
+ uint8
33
+ uint16
34
+ uint32
35
+ uint64
36
+
37
+
38
+ cdef float32 MAX_FLOAT = np.finfo(np.float32).max
39
+
40
+
41
class GapSymbol:
    """
    Singleton symbol that is rendered as ``-``.

    :func:`align_multiple()` appends the single instance of this class
    to the alphabet of the substitution matrix, where it acts as a
    neutral gap placeholder that scores 0 with every other symbol.

    Use :meth:`instance()` to obtain the object; calling the
    constructor a second time raises a :class:`ValueError`.
    """

    # The one and only instance, created lazily by `instance()`
    _instance = None

    def __init__(self):
        # Guard clause: refuse to create a second object
        if GapSymbol._instance is not None:
            raise ValueError(
                "Cannot instantiate this singleton more than one time"
            )
        GapSymbol._instance = self

    @staticmethod
    def instance():
        """
        Get the singleton instance, creating it on first access.

        Returns
        -------
        symbol : GapSymbol
            The shared gap symbol object.
        """
        existing = GapSymbol._instance
        if existing is None:
            # The constructor registers itself in `_instance`
            existing = GapSymbol()
        return existing

    def __str__(self):
        return "-"

    def __hash__(self):
        # Constant hash: there is only a single instance anyway
        return 0
64
+
65
+
66
def align_multiple(sequences, matrix, gap_penalty=-10, terminal_penalty=True,
                   distances=None, guide_tree=None):
    r"""
    align_multiple(sequences, matrix, gap_penalty=-10,
                   terminal_penalty=True, distances=None,
                   guide_tree=None)

    Perform a multiple sequence alignment using a progressive
    alignment algorithm. :footcite:`Feng1987`

    Based on pairwise sequence distances a guide tree is constructed.
    The sequences are progressively aligned according to the tree,
    following the rule 'Once a gap, always a gap'.

    Parameters
    ----------
    sequences : list of Sequence
        The sequences to be aligned.
        The alphabet of the substitution matrix must be equal or
        extend the alphabet of each sequence.
    matrix : SubstitutionMatrix
        The substitution matrix used for scoring.
        Must be symmetric.
    gap_penalty : int or tuple(int, int), optional
        If an integer is provided, the value will be interpreted as
        general gap penalty. If a tuple is provided, an affine gap
        penalty is used. The first integer in the tuple is the gap
        opening penalty, the second integer is the gap extension
        penalty.
        The values need to be negative. (Default: *-10*)
    terminal_penalty : bool, optional
        If true, gap penalties are applied to terminal gaps.
        (Default: True)
    distances : ndarray, shape=(n,n), optional
        Pairwise distances of the sequences.
        The matrix must be symmetric and all entries must be larger
        than 0.
        By default the pairwise distances are calculated from
        similarities obtained from optimal global pairwise alignments
        (:func:`align_optimal()`).
        The similarities are converted into distances using the method
        proposed by Feng & Doolittle :footcite:`Feng1996`.
    guide_tree : Tree, optional
        The guide tree to be used for the progressive alignment.
        By default the guide tree is constructed from `distances`
        via the UPGMA clustering method.

    Returns
    -------
    alignment : Alignment
        The global multiple sequence alignment of the input sequences.
    order : ndarray, dtype=int
        The sequence order represented by the guide tree.
        When this order is applied to alignment sequence order,
        similar sequences are adjacent to each other.
    tree : Tree
        The guide tree used for progressive alignment.
        Equal to `guide_tree` if provided.
    distance_matrix : ndarray, shape=(n,n), dtype=float32
        The pairwise distance matrix used to construct the guide tree.
        Equal to `distances` if provided.

    Raises
    ------
    ValueError
        If `sequences` is empty, if `matrix` is not symmetric,
        if the code of a sequence is ``None`` or if the alphabet of
        `matrix` does not extend the alphabet of a sequence.

    Notes
    -----
    The similarity to distance conversion is performed according to the
    following formula:

    .. math:: D_{a,b} = -\ln\left(
            \frac
                { S_{a,b} - S_{a,b}^{rand} }
                { S_{a,b}^{max} - S_{a,b}^{rand} }
        \right)

    .. math:: S_{a,b}^{max} = \frac{ S_{a,a} + S_{b,b} }{ 2 }

    .. math:: S_{a,b}^{rand} = \frac{1}{L_{a,b}}
        \left(
            \sum_{x \in \Omega} \sum_{y \in \Omega}
            s_{x,y} \cdot N_a(x) \cdot N_b(y)
        \right)
        + N_{a,b}^{open} \cdot p^{open} + N_{a,b}^{ext} \cdot p^{ext}

    :math:`D_{a,b}` - The distance between the sequences *a* and *b*.

    :math:`S_{a,b}` - The similarity score between the sequences *a* and *b*.

    :math:`s_{x,y}` - The similarity score between the symbols *x* and *y*.

    :math:`\Omega` - The sequence alphabet.

    :math:`N_a(x)` - Number of occurrences of symbol *x* in sequence *a*.

    :math:`N_{a,b}^{open}, N_{a,b}^{ext}` - Number of gap openings/
    extensions, in the alignment of *a* and *b*.

    :math:`p^{open}, p^{ext}` - The penalty for a gap opening/extension.

    :math:`L_{a,b}` - Number of columns in the alignment of *a* and *b*.

    In rare cases of extremely unrelated sequences, :math:`S_{a,b}`
    can be lower than :math:`S_{a,b}^{rand}`.
    In this case the logarithm cannot be calculated and a
    :class:`ValueError` is raised.

    References
    ----------

    .. footbibliography::

    Examples
    --------

    >>> seq1 = ProteinSequence("BIQTITE")
    >>> seq2 = ProteinSequence("TITANITE")
    >>> seq3 = ProteinSequence("BISMITE")
    >>> seq4 = ProteinSequence("IQLITE")
    >>> matrix = SubstitutionMatrix.std_protein_matrix()
    >>>
    >>> alignment, order, tree, distances = align_multiple(
    ...     [seq1, seq2, seq3, seq4], matrix
    ... )
    >>>
    >>> print(alignment)
    BIQT-ITE
    TITANITE
    BISM-ITE
    -IQL-ITE
    >>> print(alignment[:, order.tolist()])
    -IQL-ITE
    BISM-ITE
    BIQT-ITE
    TITANITE
    >>> print(distances)
    [[0.000 1.034 0.382 0.560]
     [1.034 0.000 0.923 1.132]
     [0.382 0.923 0.000 0.632]
     [0.560 1.132 0.632 0.000]]
    >>>
    >>> print(tree.to_newick(
    ...     labels=["seq1", "seq2", "seq3", "seq4"], include_distance=False
    ... ))
    ((seq4,(seq3,seq1)),seq2);
    """
    # Fail early with a clear message instead of an 'IndexError'
    # when the first sequence is accessed further below
    if len(sequences) == 0:
        raise ValueError("At least one sequence is required")
    if not matrix.is_symmetric():
        raise ValueError("A symmetric substitution matrix is required")
    alphabet = matrix.get_alphabet1()
    for i, seq in enumerate(sequences):
        if seq.code is None:
            raise ValueError(f"Code of sequence {i} is 'None'")
        if not alphabet.extends(seq.get_alphabet()):
            raise ValueError(
                f"The substitution matrix and sequence {i} have "
                f"incompatible alphabets"
            )

    # Create guide tree
    # Template parameter workaround:
    # The code of the first sequence selects the fused type
    # specialization of the helper functions
    _T = sequences[0].code
    if distances is None:
        distances = _get_distance_matrix(
            _T, sequences, matrix, gap_penalty, terminal_penalty
        )
    else:
        distances = distances.astype(np.float32, copy=True)
    if guide_tree is None:
        guide_tree = upgma(distances)
    else:
        # Assure that every node in the guide tree is binary
        guide_tree = as_binary(guide_tree)

    # Create new matrix with neutral gap symbol
    gap_symbol = GapSymbol.instance()
    new_alphabet = Alphabet(
        alphabet.get_symbols() + [gap_symbol]
    )
    new_score_matrix = np.zeros(
        (len(new_alphabet), len(new_alphabet)), dtype=np.int32
    )
    # New substitution matrix is the same as the old one,
    # except the neutral gap symbol,
    # that scores 0 with all other symbols
    new_score_matrix[:-1,:-1] = matrix.score_matrix()
    new_matrix = SubstitutionMatrix(
        new_alphabet, new_alphabet, new_score_matrix
    )

    # Progressive alignment
    gap_symbol_code = new_alphabet.encode(gap_symbol)
    order, aligned_seqs = _progressive_align(
        _T, sequences, guide_tree.root, distances, new_matrix,
        gap_symbol_code, gap_penalty, terminal_penalty
    )
    aligned_seq_codes = [seq.code for seq in aligned_seqs]

    # Remove neutral gap symbols and create actual trace:
    # Each column of the trace belongs to one sequence,
    # gap positions keep the initial value of -1,
    # all other positions get the running index in the gap-free sequence
    trace = np.full(
        (len(aligned_seqs[0]), len(aligned_seqs)), -1, dtype=np.int64
    )
    for j, (seq, code) in enumerate(zip(aligned_seqs, aligned_seq_codes)):
        is_symbol = code != gap_symbol_code
        trace[is_symbol, j] = np.arange(np.count_nonzero(is_symbol))
        seq.code = code[is_symbol]

    # Reorder alignments into the original sequence order
    new_order = np.argsort(order)
    aligned_seqs = [aligned_seqs[pos] for pos in new_order]
    trace = trace[:, new_order]

    return Alignment(aligned_seqs, trace), order, guide_tree, distances
285
+
286
+
287
def _get_distance_matrix(CodeType[:] _T, sequences, matrix,
                         gap_penalty, terminal_penalty):
    """
    Create all pairwise alignments for the given sequences and use the
    method proposed by Feng & Doolittle to calculate the pairwise
    distance matrix

    Parameters
    ----------
    _T : ndarray, dtype=VARIABLE
        A little bit hacky workaround to get the correct dtype for the
        sequence code of the sequences in a static way
        (important for Cython).
    sequences : list of Sequence, length=n
        The sequences to get the distance matrix for.
    matrix : SubstitutionMatrix
        The substitution matrix used for the alignments.
    gap_penalty : int or tuple(int, int)
        A linear or affine gap penalty for the alignments.
    terminal_penalty : bool
        Whether to or not count terminal gap penalties for the
        alignments.

    Returns
    -------
    distances : ndarray, shape=(n,n), dtype=float32
        The pairwise distance matrix.

    Raises
    ------
    TypeError
        If `gap_penalty` is neither an integer nor a tuple.
    ValueError
        If the randomized alignment of two sequences scores better
        than their actual pairwise alignment.
    """
    cdef int i, j

    cdef np.ndarray scores = np.zeros(
        (len(sequences), len(sequences)), dtype=np.int32
    )
    cdef np.ndarray alignments = np.full(
        (len(sequences), len(sequences)), None, dtype=object
    )
    for i in range(len(sequences)):
        # Inclusive range:
        # The self-alignment scores on the diagonal are required
        # for the maximum score later
        for j in range(i+1):
            # For this method we only consider one alignment:
            # Score is equal for all alignments
            # Alignment length is equal for most alignments
            alignment = align_optimal(
                sequences[i], sequences[j], matrix,
                gap_penalty, terminal_penalty, max_number=1
            )[0]
            scores[i,j] = alignment.score
            alignments[i,j] = alignment

    ### Distance calculation from similarity scores ###
    # Calculate the occurrences of each symbol code in each sequence
    # This is used later for the random score
    # Both alphabets are the same
    #
    # The alphabet size and the loop variables iterating over it are
    # plain C integers instead of 'CodeType':
    # 'CodeType' is only guaranteed to hold the largest symbol code of
    # the sequences, but not the size of the matrix alphabet
    # (e.g. 256 symbols do not fit into 'uint8')
    cdef int alphabet_size = len(matrix.get_alphabet1())
    cdef np.ndarray code_count = np.zeros(
        (len(sequences), alphabet_size), dtype=np.int32
    )
    for i in range(len(sequences)):
        code_count[i] = np.bincount(sequences[i].code, minlength=alphabet_size)

    cdef int gap_open=0, gap_ext=0
    if type(gap_penalty) == int:
        gap_open = gap_penalty
        gap_ext = gap_penalty
    elif type(gap_penalty) == tuple:
        gap_open = gap_penalty[0]
        gap_ext = gap_penalty[1]
    else:
        raise TypeError("Gap penalty must be either integer or tuple")

    cdef const int32[:,:] score_matrix = matrix.score_matrix()
    cdef int32[:,:] scores_v = scores
    cdef np.ndarray distances = np.zeros(
        (scores.shape[0], scores.shape[1]), dtype=np.float32
    )
    cdef float32[:,:] distances_v = distances
    cdef int code1, code2
    cdef float32 score_rand, score_max

    # Calculate distance
    # i and j are indicating the alignment between the sequences i and j
    for i in range(scores_v.shape[0]):
        for j in range(i):
            score_max = (scores_v[i,i] + scores_v[j,j]) / 2.0
            score_rand = 0
            # NOTE(review): 'code_count' is indexed as untyped ndarray,
            # so the product is evaluated with Python objects;
            # a typed 'int32' view would turn this into C integer
            # arithmetic - check for overflow before optimizing
            for code1 in range(alphabet_size):
                for code2 in range(alphabet_size):
                    score_rand += score_matrix[code1,code2] \
                                  * code_count[i,code1] \
                                  * code_count[j,code2]
            # Normalize by the number of alignment columns
            score_rand /= alignments[i,j].trace.shape[0]
            gap_open_count, gap_ext_count = _count_gaps(
                alignments[i,j].trace.astype(np.int64, copy=False),
                terminal_penalty
            )
            score_rand += gap_open_count * gap_open
            score_rand += gap_ext_count * gap_ext
            if scores_v[i,j] < score_rand:
                # Randomized alignment is better than actual alignment
                # -> the logarithm argument would become negative
                # resulting in an NaN distance
                raise ValueError(
                    f"The randomized alignment of sequences {j} and {i} "
                    f"scores better than the real pairwise alignment, "
                    f"cannot calculate proper pairwise distance"
                )
            else:
                distances_v[i,j] = -log(
                    (scores_v[i,j] - score_rand) / (score_max - score_rand)
                )
                # Pairwise distance matrix is symmetric
                distances_v[j,i] = distances_v[i,j]
    return distances
402
+
403
+
404
def _count_gaps(int64[:,:] trace_v, bint terminal_penalty):
    """
    Count the number of gap openings and gap extensions in a pairwise
    alignment trace.

    Parameters
    ----------
    trace_v : ndarray, shape=(n,2), dtype=int
        The alignment trace.
        A value of ``-1`` marks a gap in the respective sequence.
    terminal_penalty : bool
        Whether or not to count terminal gaps.
        If false, the trace is cut to the range between the first and
        the last column without any gap before counting.

    Returns
    -------
    gap_open_count, gap_ext_count : int
        The number of gap opening and gap extension columns.
        Both are ``0`` if the trace is empty or, with
        `terminal_penalty` being false, if every column contains a gap.
    """
    cdef int i, j
    cdef int gap_open_count=0, gap_ext_count=0
    cdef int start_index=-1, stop_index=-1

    if trace_v.shape[0] == 0:
        # An empty trace contains no gaps
        # Returning early avoids reading a nonexistent first column
        return 0, 0

    if not terminal_penalty:
        # Ignore terminal gaps
        # -> get start and exclusive stop column of the trace
        # excluding terminal gaps
        for i in range(trace_v.shape[0]):
            # Check if all sequences have no gap at the given position
            if trace_v[i,0] != -1 and trace_v[i,1] != -1:
                start_index = i
                break
        # Reverse iteration
        for i in range(trace_v.shape[0]-1, -1, -1):
            # Check if all sequences have no gap at the given position
            if trace_v[i,0] != -1 and trace_v[i,1] != -1:
                stop_index = i+1
                break
        if start_index == -1 or stop_index == -1:
            # There is no column without a gap
            return 0, 0
        trace_v = trace_v[start_index : stop_index]

    for i in range(trace_v.shape[0]):
        # trace_v.shape[1] = 2 due to pairwise alignment
        for j in range(trace_v.shape[1]):
            if trace_v[i,j] == -1:
                # A gap continues the previous one, if the same sequence
                # also has a gap in the preceding column;
                # a gap in the very first column always opens a new one
                if i > 0 and trace_v[i-1,j] == -1:
                    gap_ext_count += 1
                else:
                    gap_open_count += 1
    return gap_open_count, gap_ext_count
457
+
458
+
459
def _progressive_align(CodeType[:] _T, sequences, tree_node,
                       float32[:,:]distances_v, matrix,
                       int gap_symbol_code, gap_penalty, terminal_penalty):
    """
    Recursively build the multiple sequence alignment of all sequences
    below the given guide tree node.

    For an inner node the sub-MSAs of both child nodes are computed
    first.
    Then the pair of sequences, one from each sub-MSA, with the lowest
    pairwise distance is aligned.
    Each gap introduced by this pairwise alignment is inserted at the
    same position into every sequence of the respective sub-MSA,
    which merges both sub-MSAs into one MSA.

    Parameters
    ----------
    _T : ndarray, dtype=VARIABLE
        Only used to fix the dtype of the sequence code statically
        (required by Cython); its content is passed on unchanged.
    sequences : list of Sequence, length=n
        All sequences that should be aligned in the MSA.
    tree_node : TreeNode
        The guide tree node selecting the sequences from `sequences`
        that are aligned in this call.
        This is the only argument that differs between the recursive
        calls.
    distances_v : ndarray, shape=(n,n)
        The pairwise distance matrix.
    matrix : SubstitutionMatrix
        The substitution matrix used for the alignments.
    gap_symbol_code : int
        The symbol code for the gap symbol.
    gap_penalty : int or tuple(int, int)
        A linear or affine gap penalty for the alignments.
    terminal_penalty : bool
        Whether or not to count terminal gap penalties for the
        alignments.

    Returns
    -------
    order : ndarray, shape=(m,), dtype=int
        For each element in `aligned_sequences` its index in the
        original `sequences` parameter.
    aligned_sequences : list of Sequence, length=m
        The aligned sequences.
        Gaps are not represented as ``-1`` in a trace as in an
        :class:`Alignment` object, but as dedicated gap symbols in
        the sequence code.
        This allows the pairwise alignment of gapped sequences.
    """
    cdef int i=0, j=0
    cdef int best_i=0, best_j=0
    cdef float32 lowest_dist, dist
    cdef int32[:] indices1_v, indices2_v
    cdef np.ndarray order1, order2
    cdef list msa1, msa2

    # Recursion anchor: a leaf refers to a single sequence
    # -> nothing to align
    if tree_node.is_leaf():
        leaf_index = tree_node.index
        # Hand out a copy, as the sequence code is overwritten
        # when gap symbols are inserted further up in the tree
        return np.array([leaf_index], dtype=np.int32), \
               [sequences[leaf_index].copy()]

    # Inner node: align the sequences below each child node first
    first_child, second_child = tree_node.children
    order1, msa1 = _progressive_align(
        _T, sequences, first_child, distances_v, matrix,
        gap_symbol_code, gap_penalty, terminal_penalty
    )
    indices1_v = order1
    order2, msa2 = _progressive_align(
        _T, sequences, second_child, distances_v, matrix,
        gap_symbol_code, gap_penalty, terminal_penalty
    )
    indices2_v = order2

    # Search the sequence pair across both sub-MSAs
    # that has the lowest distance
    lowest_dist = MAX_FLOAT
    for i in range(indices1_v.shape[0]):
        for j in range(indices2_v.shape[0]):
            dist = distances_v[indices1_v[i], indices2_v[j]]
            if dist < lowest_dist:
                lowest_dist = dist
                best_i = i
                best_j = j

    # Only a single optimal alignment of this pair is considered
    alignment = align_optimal(
        msa1[best_i], msa2[best_j], matrix,
        gap_penalty, terminal_penalty, max_number=1
    )[0]
    trace = alignment.trace

    # Transfer the new gaps to all sequences of each sub-MSA
    # by placing the gap symbol at the gap positions
    for seq in msa1:
        seq.code = _replace_gaps(
            _T, trace[:,0], seq.code, gap_symbol_code
        )
    for seq in msa2:
        seq.code = _replace_gaps(
            _T, trace[:,1], seq.code, gap_symbol_code
        )
    return np.append(order1, order2), \
           msa1 + msa2
570
+
571
+
572
+
573
def _replace_gaps(CodeType[:] _T,
                  int64[:] partial_trace_v,
                  np.ndarray seq_code,
                  int gap_symbol_code):
    """
    Replace gaps in a sequence in an :class:`Alignment` with a dedicated
    gap symbol.

    The replacement is required by the progressive alignment algorithm
    to be able to align gapped sequences with each other.

    Parameters
    ----------
    _T : ndarray, dtype=VARIABLE
        Only used to fix the dtype of the sequence code statically
        (required by Cython); its content is not read.
    partial_trace_v : ndarray, shape=(m,), dtype=int
        The column of the alignment trace referring to the given
        sequence.
        A value of ``-1`` marks a gap, any other value is an index
        into `seq_code`.
    seq_code : ndarray, shape=(n,)
        The sequence code representing the given sequence.
    gap_symbol_code : int
        The symbol code for the gap symbol.

    Returns
    -------
    new_seq_code : ndarray, shape=(m,)
        The sequence code representing a new sequence, that is the given
        sequence with inserted gap symbols.
        It has the same dtype as `seq_code`.
    """
    cdef int i
    cdef int64 index

    cdef CodeType[:] seq_code_v = seq_code
    cdef np.ndarray new_seq_code = np.zeros(
        partial_trace_v.shape[0], dtype=seq_code.dtype
    )
    cdef CodeType[:] new_seq_code_v = new_seq_code

    for i in range(partial_trace_v.shape[0]):
        index = partial_trace_v[i]
        if index == -1:
            new_seq_code_v[i] = gap_symbol_code
        else:
            # Read via the typed memoryview instead of the ndarray
            # to avoid Python-level indexing for each symbol
            new_seq_code_v[i] = seq_code_v[index]

    return new_seq_code