biotite 0.41.1__cp311-cp311-macosx_10_16_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (340) hide show
  1. biotite/__init__.py +19 -0
  2. biotite/application/__init__.py +43 -0
  3. biotite/application/application.py +265 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +505 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +83 -0
  8. biotite/application/blast/webapp.py +421 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +238 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +152 -0
  13. biotite/application/localapp.py +306 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +122 -0
  16. biotite/application/msaapp.py +374 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +254 -0
  19. biotite/application/muscle/app5.py +171 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +456 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +222 -0
  24. biotite/application/util.py +59 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +304 -0
  27. biotite/application/viennarna/rnafold.py +269 -0
  28. biotite/application/viennarna/rnaplot.py +187 -0
  29. biotite/application/viennarna/util.py +72 -0
  30. biotite/application/webapp.py +77 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/entrez/__init__.py +15 -0
  34. biotite/database/entrez/check.py +61 -0
  35. biotite/database/entrez/dbnames.py +89 -0
  36. biotite/database/entrez/download.py +223 -0
  37. biotite/database/entrez/key.py +44 -0
  38. biotite/database/entrez/query.py +223 -0
  39. biotite/database/error.py +15 -0
  40. biotite/database/pubchem/__init__.py +21 -0
  41. biotite/database/pubchem/download.py +260 -0
  42. biotite/database/pubchem/error.py +20 -0
  43. biotite/database/pubchem/query.py +827 -0
  44. biotite/database/pubchem/throttle.py +99 -0
  45. biotite/database/rcsb/__init__.py +13 -0
  46. biotite/database/rcsb/download.py +167 -0
  47. biotite/database/rcsb/query.py +959 -0
  48. biotite/database/uniprot/__init__.py +13 -0
  49. biotite/database/uniprot/check.py +32 -0
  50. biotite/database/uniprot/download.py +134 -0
  51. biotite/database/uniprot/query.py +209 -0
  52. biotite/file.py +251 -0
  53. biotite/sequence/__init__.py +73 -0
  54. biotite/sequence/align/__init__.py +49 -0
  55. biotite/sequence/align/alignment.py +658 -0
  56. biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
  57. biotite/sequence/align/banded.pyx +652 -0
  58. biotite/sequence/align/buckets.py +69 -0
  59. biotite/sequence/align/cigar.py +434 -0
  60. biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
  61. biotite/sequence/align/kmeralphabet.pyx +574 -0
  62. biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
  63. biotite/sequence/align/kmersimilarity.pyx +233 -0
  64. biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
  65. biotite/sequence/align/kmertable.pyx +3400 -0
  66. biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
  67. biotite/sequence/align/localgapped.pyx +892 -0
  68. biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
  69. biotite/sequence/align/localungapped.pyx +279 -0
  70. biotite/sequence/align/matrix.py +405 -0
  71. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  72. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  73. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  74. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  75. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  76. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  77. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  78. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  79. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  80. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  81. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  82. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  83. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  84. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  85. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  86. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  87. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  88. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  89. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  93. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  94. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  95. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  96. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  97. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  98. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  99. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  100. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  101. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  102. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  103. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  104. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  105. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  106. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  107. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  108. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  109. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  110. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  111. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  112. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  113. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  114. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  115. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  116. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  117. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  118. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  119. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  120. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  121. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  122. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  154. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  155. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  156. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  157. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  158. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  159. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  160. biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
  161. biotite/sequence/align/multiple.pyx +620 -0
  162. biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
  163. biotite/sequence/align/pairwise.pyx +587 -0
  164. biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
  165. biotite/sequence/align/permutation.pyx +305 -0
  166. biotite/sequence/align/primes.txt +821 -0
  167. biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
  168. biotite/sequence/align/selector.pyx +956 -0
  169. biotite/sequence/align/statistics.py +265 -0
  170. biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
  171. biotite/sequence/align/tracetable.pxd +64 -0
  172. biotite/sequence/align/tracetable.pyx +370 -0
  173. biotite/sequence/alphabet.py +566 -0
  174. biotite/sequence/annotation.py +829 -0
  175. biotite/sequence/codec.cpython-311-darwin.so +0 -0
  176. biotite/sequence/codec.pyx +155 -0
  177. biotite/sequence/codon.py +466 -0
  178. biotite/sequence/codon_tables.txt +202 -0
  179. biotite/sequence/graphics/__init__.py +33 -0
  180. biotite/sequence/graphics/alignment.py +1034 -0
  181. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  182. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  183. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  184. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  185. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  186. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  187. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  188. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  189. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  190. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  191. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  192. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  193. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  194. biotite/sequence/graphics/color_schemes/pb_flower.json +39 -0
  195. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  196. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  197. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  198. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  199. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  200. biotite/sequence/graphics/colorschemes.py +139 -0
  201. biotite/sequence/graphics/dendrogram.py +184 -0
  202. biotite/sequence/graphics/features.py +510 -0
  203. biotite/sequence/graphics/logo.py +110 -0
  204. biotite/sequence/graphics/plasmid.py +661 -0
  205. biotite/sequence/io/__init__.py +12 -0
  206. biotite/sequence/io/fasta/__init__.py +22 -0
  207. biotite/sequence/io/fasta/convert.py +273 -0
  208. biotite/sequence/io/fasta/file.py +278 -0
  209. biotite/sequence/io/fastq/__init__.py +19 -0
  210. biotite/sequence/io/fastq/convert.py +120 -0
  211. biotite/sequence/io/fastq/file.py +551 -0
  212. biotite/sequence/io/genbank/__init__.py +17 -0
  213. biotite/sequence/io/genbank/annotation.py +277 -0
  214. biotite/sequence/io/genbank/file.py +575 -0
  215. biotite/sequence/io/genbank/metadata.py +324 -0
  216. biotite/sequence/io/genbank/sequence.py +172 -0
  217. biotite/sequence/io/general.py +192 -0
  218. biotite/sequence/io/gff/__init__.py +26 -0
  219. biotite/sequence/io/gff/convert.py +133 -0
  220. biotite/sequence/io/gff/file.py +434 -0
  221. biotite/sequence/phylo/__init__.py +36 -0
  222. biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
  223. biotite/sequence/phylo/nj.pyx +221 -0
  224. biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
  225. biotite/sequence/phylo/tree.pyx +1169 -0
  226. biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
  227. biotite/sequence/phylo/upgma.pyx +164 -0
  228. biotite/sequence/profile.py +456 -0
  229. biotite/sequence/search.py +116 -0
  230. biotite/sequence/seqtypes.py +556 -0
  231. biotite/sequence/sequence.py +374 -0
  232. biotite/structure/__init__.py +132 -0
  233. biotite/structure/atoms.py +1455 -0
  234. biotite/structure/basepairs.py +1415 -0
  235. biotite/structure/bonds.cpython-311-darwin.so +0 -0
  236. biotite/structure/bonds.pyx +1933 -0
  237. biotite/structure/box.py +592 -0
  238. biotite/structure/celllist.cpython-311-darwin.so +0 -0
  239. biotite/structure/celllist.pyx +849 -0
  240. biotite/structure/chains.py +298 -0
  241. biotite/structure/charges.cpython-311-darwin.so +0 -0
  242. biotite/structure/charges.pyx +520 -0
  243. biotite/structure/compare.py +274 -0
  244. biotite/structure/density.py +114 -0
  245. biotite/structure/dotbracket.py +216 -0
  246. biotite/structure/error.py +31 -0
  247. biotite/structure/filter.py +585 -0
  248. biotite/structure/geometry.py +697 -0
  249. biotite/structure/graphics/__init__.py +13 -0
  250. biotite/structure/graphics/atoms.py +226 -0
  251. biotite/structure/graphics/rna.py +282 -0
  252. biotite/structure/hbond.py +409 -0
  253. biotite/structure/info/__init__.py +25 -0
  254. biotite/structure/info/atom_masses.json +121 -0
  255. biotite/structure/info/atoms.py +82 -0
  256. biotite/structure/info/bonds.py +145 -0
  257. biotite/structure/info/ccd/README.rst +8 -0
  258. biotite/structure/info/ccd/amino_acids.txt +1663 -0
  259. biotite/structure/info/ccd/carbohydrates.txt +1135 -0
  260. biotite/structure/info/ccd/components.bcif +0 -0
  261. biotite/structure/info/ccd/nucleotides.txt +798 -0
  262. biotite/structure/info/ccd.py +95 -0
  263. biotite/structure/info/groups.py +90 -0
  264. biotite/structure/info/masses.py +123 -0
  265. biotite/structure/info/misc.py +144 -0
  266. biotite/structure/info/radii.py +197 -0
  267. biotite/structure/info/standardize.py +196 -0
  268. biotite/structure/integrity.py +268 -0
  269. biotite/structure/io/__init__.py +30 -0
  270. biotite/structure/io/ctab.py +72 -0
  271. biotite/structure/io/dcd/__init__.py +13 -0
  272. biotite/structure/io/dcd/file.py +65 -0
  273. biotite/structure/io/general.py +257 -0
  274. biotite/structure/io/gro/__init__.py +14 -0
  275. biotite/structure/io/gro/file.py +343 -0
  276. biotite/structure/io/mmtf/__init__.py +21 -0
  277. biotite/structure/io/mmtf/assembly.py +214 -0
  278. biotite/structure/io/mmtf/convertarray.cpython-311-darwin.so +0 -0
  279. biotite/structure/io/mmtf/convertarray.pyx +341 -0
  280. biotite/structure/io/mmtf/convertfile.cpython-311-darwin.so +0 -0
  281. biotite/structure/io/mmtf/convertfile.pyx +501 -0
  282. biotite/structure/io/mmtf/decode.cpython-311-darwin.so +0 -0
  283. biotite/structure/io/mmtf/decode.pyx +152 -0
  284. biotite/structure/io/mmtf/encode.cpython-311-darwin.so +0 -0
  285. biotite/structure/io/mmtf/encode.pyx +183 -0
  286. biotite/structure/io/mmtf/file.py +233 -0
  287. biotite/structure/io/mol/__init__.py +20 -0
  288. biotite/structure/io/mol/convert.py +115 -0
  289. biotite/structure/io/mol/ctab.py +414 -0
  290. biotite/structure/io/mol/header.py +116 -0
  291. biotite/structure/io/mol/mol.py +193 -0
  292. biotite/structure/io/mol/sdf.py +916 -0
  293. biotite/structure/io/netcdf/__init__.py +13 -0
  294. biotite/structure/io/netcdf/file.py +63 -0
  295. biotite/structure/io/npz/__init__.py +20 -0
  296. biotite/structure/io/npz/file.py +152 -0
  297. biotite/structure/io/pdb/__init__.py +20 -0
  298. biotite/structure/io/pdb/convert.py +293 -0
  299. biotite/structure/io/pdb/file.py +1240 -0
  300. biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
  301. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  302. biotite/structure/io/pdbqt/__init__.py +15 -0
  303. biotite/structure/io/pdbqt/convert.py +107 -0
  304. biotite/structure/io/pdbqt/file.py +640 -0
  305. biotite/structure/io/pdbx/__init__.py +23 -0
  306. biotite/structure/io/pdbx/bcif.py +648 -0
  307. biotite/structure/io/pdbx/cif.py +1032 -0
  308. biotite/structure/io/pdbx/component.py +246 -0
  309. biotite/structure/io/pdbx/convert.py +1597 -0
  310. biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
  311. biotite/structure/io/pdbx/encoding.pyx +950 -0
  312. biotite/structure/io/pdbx/legacy.py +267 -0
  313. biotite/structure/io/tng/__init__.py +13 -0
  314. biotite/structure/io/tng/file.py +46 -0
  315. biotite/structure/io/trajfile.py +710 -0
  316. biotite/structure/io/trr/__init__.py +13 -0
  317. biotite/structure/io/trr/file.py +46 -0
  318. biotite/structure/io/xtc/__init__.py +13 -0
  319. biotite/structure/io/xtc/file.py +46 -0
  320. biotite/structure/mechanics.py +75 -0
  321. biotite/structure/molecules.py +353 -0
  322. biotite/structure/pseudoknots.py +642 -0
  323. biotite/structure/rdf.py +243 -0
  324. biotite/structure/repair.py +253 -0
  325. biotite/structure/residues.py +562 -0
  326. biotite/structure/resutil.py +178 -0
  327. biotite/structure/sasa.cpython-311-darwin.so +0 -0
  328. biotite/structure/sasa.pyx +322 -0
  329. biotite/structure/sequence.py +112 -0
  330. biotite/structure/sse.py +327 -0
  331. biotite/structure/superimpose.py +727 -0
  332. biotite/structure/transform.py +504 -0
  333. biotite/structure/util.py +98 -0
  334. biotite/temp.py +86 -0
  335. biotite/version.py +16 -0
  336. biotite/visualize.py +251 -0
  337. biotite-0.41.1.dist-info/METADATA +187 -0
  338. biotite-0.41.1.dist-info/RECORD +340 -0
  339. biotite-0.41.1.dist-info/WHEEL +4 -0
  340. biotite-0.41.1.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,658 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence.align"
6
+ __author__ = "Patrick Kunzmann"
7
+
8
+ import numpy as np
9
+ import numbers
10
+ import copy
11
+ import textwrap
12
+ from ..alphabet import LetterAlphabet
13
+
14
+
15
+ __all__ = ["Alignment", "get_codes", "get_symbols",
16
+ "get_sequence_identity", "get_pairwise_sequence_identity",
17
+ "score", "find_terminal_gaps", "remove_terminal_gaps"]
18
+
19
+
20
class Alignment(object):
    """
    An :class:`Alignment` object stores information about which symbols
    of *n* sequences are aligned to each other and it stores the
    corresponding alignment score.

    Instead of saving a list of aligned symbols, this class saves the
    original *n* sequences, that were aligned, and a so called *trace*,
    which indicates the aligned symbols of these sequences.
    The trace is a *(m x n)* :class:`ndarray` with alignment length
    *m* and sequence count *n*.
    Each element of the trace is the index in the corresponding
    sequence.
    A gap is represented by the value -1.

    Furthermore this class provides multiple utility functions for
    conversion into strings in order to make the alignment human
    readable.

    Unless an :class:`Alignment` object is the result of a multiple
    sequence alignment, the object will contain only two sequences.

    All attributes of this class are publicly accessible.

    Parameters
    ----------
    sequences : list
        A list of aligned sequences.
    trace : ndarray, dtype=int, shape=(m,n)
        The alignment trace.
    score : int, optional
        Alignment score.

    Attributes
    ----------
    sequences : list
        A list of aligned sequences.
    trace : ndarray, dtype=int, shape=(m,n)
        The alignment trace.
    score : int
        Alignment score.

    Examples
    --------

    >>> seq1 = NucleotideSequence("CGTCAT")
    >>> seq2 = NucleotideSequence("TCATGC")
    >>> matrix = SubstitutionMatrix.std_nucleotide_matrix()
    >>> ali = align_optimal(seq1, seq2, matrix)[0]
    >>> print(ali)
    CGTCAT--
    --TCATGC
    >>> print(ali.trace)
    [[ 0 -1]
     [ 1 -1]
     [ 2  0]
     [ 3  1]
     [ 4  2]
     [ 5  3]
     [-1  4]
     [-1  5]]
    >>> print(ali[1:4].trace)
    [[ 1 -1]
     [ 2  0]
     [ 3  1]]
    >>> print(ali[1:4, 0:1].trace)
    [[1]
     [2]
     [3]]
    """

    def __init__(self, sequences, trace, score=None):
        # Shallow copy: later changes to the caller's list do not
        # affect this alignment, the sequence objects are shared
        self.sequences = sequences.copy()
        self.trace = trace
        self.score = score

    def __repr__(self):
        """Represent the Alignment as a string for debugging."""
        seq_reprs = ", ".join([seq.__repr__() for seq in self.sequences])
        return (
            f"Alignment([{seq_reprs}], "
            f"np.{np.array_repr(self.trace)}, score={self.score})"
        )

    def _gapped_str(self, seq_index):
        """
        Build the gapped string of the sequence at `seq_index`:
        the symbol for each trace index, ``-`` for each gap.
        """
        return "".join(
            self.sequences[seq_index][row[seq_index]]
            if row[seq_index] != -1 else "-"
            for row in self.trace
        )

    def get_gapped_sequences(self):
        """
        Get the string representation of the gapped sequences.

        Returns
        -------
        sequences : list of str
            The list of gapped sequence strings. The order is the same
            as in `Alignment.sequences`.
        """
        return [self._gapped_str(i) for i in range(len(self.sequences))]

    def __str__(self):
        # The compact text representation is only used,
        # if every sequence has a single letter alphabet
        all_single_letter = all(
            isinstance(seq.get_alphabet(), LetterAlphabet)
            for seq in self.sequences
        )
        if not all_single_letter:
            return super().__str__()

        # First dimension: sequence number,
        # second dimension: line number
        wrapper = textwrap.TextWrapper(break_on_hyphens=False)
        seq_str_lines_list = [
            wrapper.wrap(self._gapped_str(i))
            for i in range(len(self.sequences))
        ]
        # Each text block contains one wrapped line of every sequence,
        # the blocks are separated by an empty line
        blocks = [
            "\n".join(lines[row_i] for lines in seq_str_lines_list)
            for row_i in range(len(seq_str_lines_list[0]))
        ]
        return "\n\n".join(blocks)

    def __getitem__(self, index):
        """
        Select alignment columns (first index) and optionally
        sequences (second index).

        Raises
        ------
        IndexError
            If more than two indices are given or an element of a
            2D index is an integer.
        """
        if isinstance(index, tuple):
            if len(index) > 2:
                raise IndexError("Only 1D or 2D indices are allowed")
            # Both the column and the sequence index are checked
            if isinstance(index[0], numbers.Integral) or \
               isinstance(index[1], numbers.Integral):
                    raise IndexError(
                        "Integers are invalid indices for alignments, "
                        "a single sequence or alignment column cannot be "
                        "selected"
                    )
            return Alignment(
                Alignment._index_sequences(self.sequences, index[1]),
                self.trace[index],
                self.score
            )
        else:
            return Alignment(self.sequences, self.trace[index], self.score)

    def __iter__(self):
        raise TypeError("'Alignment' object is not iterable")

    def __len__(self):
        return len(self.trace)

    def __eq__(self, item):
        if not isinstance(item, Alignment):
            return False
        if self.sequences != item.sequences:
            return False
        if not np.array_equal(self.trace, item.trace):
            return False
        if self.score != item.score:
            return False
        return True

    @staticmethod
    def _index_sequences(sequences, index):
        """
        Apply a sequence index (list, tuple, index array, boolean mask
        or slice) to the list of sequences.

        Raises
        ------
        IndexError
            If the type of `index` is not supported.
        """
        if isinstance(index, np.ndarray) and index.dtype == bool:
            return [seq for seq, mask in zip(sequences, index) if mask]
        elif isinstance(index, (list, tuple, np.ndarray)):
            return [sequences[i] for i in index]
        elif isinstance(index, slice):
            return sequences[index]
        else:
            raise IndexError(
                f"Invalid alignment index type '{type(index).__name__}'"
            )

    @staticmethod
    def trace_from_strings(seq_str_list):
        """
        Create a trace from strings that represent aligned sequences.

        Parameters
        ----------
        seq_str_list : list of str
            The strings, where each one represents a sequence
            (with gaps) in an alignment.
            A ``-`` is interpreted as gap.
            All strings must have the same length.

        Returns
        -------
        trace : ndarray, dtype=int, shape=(m,n)
            The created trace for *n* strings of length *m*.

        Raises
        ------
        ValueError
            If less than two strings are given or the strings have
            different lengths.
        """
        if len(seq_str_list) < 2:
            raise ValueError(
                "An alignment must contain at least two sequences"
            )
        # Length of the strings, not the number of strings
        n_columns = len(seq_str_list[0])
        if any(len(seq_str) != n_columns for seq_str in seq_str_list):
            raise ValueError(
                "All sequence strings must have the same length"
            )
        trace = np.full((n_columns, len(seq_str_list)), -1, dtype=int)
        for str_j, seq_str in enumerate(seq_str_list):
            # Index of the next non-gap symbol in this sequence
            seq_i = 0
            for pos_i, symbol in enumerate(seq_str):
                if symbol != "-":
                    trace[pos_i, str_j] = seq_i
                    seq_i += 1
        return trace
231
+
232
+
233
def get_codes(alignment):
    """
    Get the sequence codes of the sequences in the alignment.

    The codes are built from the trace:
    Instead of the indices of the aligned symbols (trace), the return
    value contains the corresponding symbol codes for each index.
    Gaps are still represented by *-1*.

    Parameters
    ----------
    alignment : Alignment
        The alignment to get the sequence codes for.

    Returns
    -------
    codes : ndarray, dtype=int, shape=(n,m)
        The sequence codes for the alignment.
        The shape is *(n,m)* for *n* sequences and *m* alignment columns.
        The array uses *-1* values for gaps.

    Examples
    --------

    >>> seq1 = NucleotideSequence("CGTCAT")
    >>> seq2 = NucleotideSequence("TCATGC")
    >>> matrix = SubstitutionMatrix.std_nucleotide_matrix()
    >>> ali = align_optimal(seq1, seq2, matrix)[0]
    >>> print(ali)
    CGTCAT--
    --TCATGC
    >>> print(get_codes(ali))
    [[ 1  2  3  1  0  3 -1 -1]
     [-1 -1  3  1  0  3  2  1]]
    """
    trace = alignment.trace
    sequences = alignment.sequences

    # The number of sequences is the first dimension
    # Every position starts as gap and is overwritten below
    codes = np.full((trace.shape[1], trace.shape[0]), -1, dtype=int)
    for i in range(len(sequences)):
        seq_trace = trace[:, i]
        not_gap = seq_trace != -1
        # Only look up the code at non-gap positions:
        # using -1 as index would fail for an empty sequence
        codes[i, not_gap] = sequences[i].code[seq_trace[not_gap]]

    return codes
279
+
280
+
281
def get_symbols(alignment):
    """
    Get the decoded symbols of the sequences in the alignment.

    This works like :func:`get_codes()`, but each code is replaced
    by the symbol it decodes to.
    Gaps are represented by *None* values.

    Parameters
    ----------
    alignment : Alignment
        The alignment to get the symbols for.

    Returns
    -------
    symbols : list of list
        The nested list of symbols.

    See Also
    --------
    get_codes

    Examples
    --------

    >>> seq1 = NucleotideSequence("CGTCAT")
    >>> seq2 = NucleotideSequence("TCATGC")
    >>> matrix = SubstitutionMatrix.std_nucleotide_matrix()
    >>> ali = align_optimal(seq1, seq2, matrix)[0]
    >>> print(ali)
    CGTCAT--
    --TCATGC
    >>> print(get_symbols(ali))
    [['C', 'G', 'T', 'C', 'A', 'T', None, None], [None, None, 'T', 'C', 'A', 'T', 'G', 'C']]
    """
    codes = get_codes(alignment)
    symbols = []
    for seq_index, seq_codes in enumerate(codes):
        alphabet = alignment.sequences[seq_index].get_alphabet()
        is_symbol = seq_codes != -1
        # Only the non-gap codes can be decoded by the alphabet
        decoded = alphabet.decode_multiple(seq_codes[is_symbol])
        if not isinstance(decoded, list):
            decoded = list(decoded)
        # Gap positions keep the initial None
        row = np.full(len(seq_codes), None, dtype=object)
        row[is_symbol] = decoded
        symbols.append(row.tolist())
    return symbols
326
+
327
+
328
def get_sequence_identity(alignment, mode="not_terminal"):
    """
    Calculate the sequence identity for an alignment.

    The identity is equal to the matches divided by a measure for the
    length of the alignment that depends on the `mode` parameter.

    Parameters
    ----------
    alignment : Alignment
        The alignment to calculate the identity for.
    mode : {'all', 'not_terminal', 'shortest'}, optional
        The calculation mode for alignment length.

            - **all** - The number of matches divided by the number of
              all alignment columns.
            - **not_terminal** - The number of matches divided by the
              number of alignment columns that are not terminal gaps in
              any of the sequences.
            - **shortest** - The number of matches divided by the
              length of the shortest sequence.

        Default is *not_terminal*.

    Returns
    -------
    identity : float
        The sequence identity, ranging between 0 and 1.

    Raises
    ------
    ValueError
        If `mode` is invalid or if, in *not_terminal* mode, at least
        two sequences have no overlap.

    See Also
    --------
    get_pairwise_sequence_identity
    """
    codes = get_codes(alignment)

    # Count matches:
    # A column is a match, if all sequences have the same code as the
    # first sequence and this code is not a gap
    if codes.shape[0] == 0:
        matches = 0
    else:
        reference = codes[0]
        matches = np.count_nonzero(
            np.all(codes == reference, axis=0) & (reference != -1)
        )

    # Calculate length
    if mode == "all":
        length = len(alignment)
    elif mode == "not_terminal":
        start, stop = find_terminal_gaps(alignment)
        if stop <= start:
            raise ValueError(
                "Cannot calculate non-terminal identity, "
                "at least two sequences have no overlap"
            )
        length = stop - start
    elif mode == "shortest":
        length = min([len(seq) for seq in alignment.sequences])
    else:
        raise ValueError(f"'{mode}' is an invalid calculation mode")

    return matches / length
389
+
390
+
391
def get_pairwise_sequence_identity(alignment, mode="not_terminal"):
    """
    Calculate the pairwise sequence identity for an alignment.

    The identity is equal to the matches divided by a measure for the
    length of the alignment that depends on the `mode` parameter.

    Parameters
    ----------
    alignment : Alignment, length=n
        The alignment to calculate the pairwise sequence identity for.
    mode : {'all', 'not_terminal', 'shortest'}, optional
        The calculation mode for alignment length.

            - **all** - The number of matches divided by the number of
              all alignment columns.
            - **not_terminal** - The number of matches divided by the
              number of alignment columns that are not terminal gaps in
              any of the two considered sequences.
            - **shortest** - The number of matches divided by the
              length of the shortest one of the two sequences.

        Default is *not_terminal*.

    Returns
    -------
    identity : ndarray, dtype=float, shape=(n,n)
        The pairwise sequence identity, ranging between 0 and 1.

    Raises
    ------
    ValueError
        If `mode` is invalid or if, in *not_terminal* mode, two
        sequences have no overlap.

    See Also
    --------
    get_sequence_identity
    """
    codes = get_codes(alignment)
    n_seq = len(codes)

    # Count matches
    # Calculate at which positions the sequences are identical
    # and are not gaps
    first = codes[:, np.newaxis, :]
    second = codes[np.newaxis, :, :]
    equality_matrix = (first == second) & (first != -1) & (second != -1)
    # Sum these positions up
    matches = np.count_nonzero(equality_matrix, axis=-1)

    # Calculate length
    if mode == "all":
        length = len(alignment)
    elif mode == "not_terminal":
        length = np.zeros((n_seq, n_seq))
        for i in range(n_seq):
            for j in range(n_seq):
                # Find latest start and earliest stop of all sequences
                start, stop = find_terminal_gaps(alignment[:, [i,j]])
                if stop <= start:
                    raise ValueError(
                        "Cannot calculate non-terminal identity, "
                        "as the two sequences have no overlap"
                    )
                length[i,j] = stop - start
    elif mode == "shortest":
        seq_lengths = np.array(
            [len(alignment.sequences[i]) for i in range(n_seq)],
            dtype=float
        )
        # Element (i,j) is the length of the shorter one
        # of the sequences i and j
        length = np.minimum.outer(seq_lengths, seq_lengths)
    else:
        raise ValueError(f"'{mode}' is an invalid calculation mode")

    return matches / length
463
+
464
+
465
def score(alignment, matrix, gap_penalty=-10, terminal_penalty=True):
    """
    Calculate the similarity score of an alignment.

    If the alignment contains more than two sequences,
    all pairwise scores are counted.

    Parameters
    ----------
    alignment : Alignment
        The alignment to calculate the score for.
    matrix : SubstitutionMatrix
        The substitution matrix used for scoring.
    gap_penalty : int or (tuple, dtype=int), optional
        If an integer is provided, the value will be interpreted as
        general gap penalty. If a tuple is provided, an affine gap
        penalty is used. The first integer in the tuple is the gap
        opening penalty, the second integer is the gap extension
        penalty.
        NumPy integer scalars are accepted as well as Python integers.
        The values need to be negative. (Default: *-10*)
    terminal_penalty : bool, optional
        If true, gap penalties are applied to terminal gaps.
        If false, only gaps inside the column range returned by
        :func:`find_terminal_gaps()` for the whole alignment are
        penalized.
        (Default: True)

    Returns
    -------
    score : int
        The similarity score.

    Raises
    ------
    TypeError
        If `gap_penalty` is neither an integer nor a tuple.
    """
    codes = get_codes(alignment)
    matrix = matrix.score_matrix()

    # Sum similarity scores (without gaps)
    score = 0
    # Iterate over all positions
    for pos in range(codes.shape[1]):
        column = codes[:, pos]
        # Iterate over all possible pairs
        # Do not count self-similarity
        # and do not count similarity twice (not S(i,j) and S(j,i))
        for i in range(codes.shape[0]):
            for j in range(i+1, codes.shape[0]):
                code_i = column[i]
                code_j = column[j]
                # Ignore gaps
                if code_i != -1 and code_j != -1:
                    score += matrix[code_i, code_j]

    # Sum gap penalties
    # Use isinstance() so that NumPy integers and tuple subclasses are
    # accepted; booleans are integers in Python, but are no meaningful
    # penalty and hence are still rejected
    if isinstance(gap_penalty, (int, np.integer)) \
       and not isinstance(gap_penalty, bool):
        gap_open = gap_penalty
        gap_ext = gap_penalty
    elif isinstance(gap_penalty, tuple):
        gap_open = gap_penalty[0]
        gap_ext = gap_penalty[1]
    else:
        raise TypeError("Gap penalty must be either integer or tuple")

    # The penalized column range is identical for all sequences,
    # so it is determined only once
    # For an alignment without sequences the range is never used,
    # hence the terminal gap search is skipped in this case
    if terminal_penalty or len(codes) == 0:
        start_index = 0
        stop_index = codes.shape[1]
    else:
        # Find a start and stop index excluding terminal gaps
        start_index, stop_index = find_terminal_gaps(alignment)

    # Iterate over all sequences
    for seq_code in codes:
        in_gap = False
        for i in range(start_index, stop_index):
            if seq_code[i] == -1:
                if in_gap:
                    # Continuation of an already opened gap
                    score += gap_ext
                else:
                    # First gap symbol of a new gap
                    score += gap_open
                in_gap = True
            else:
                in_gap = False
    return score
541
+
542
+
543
def find_terminal_gaps(alignment):
    """
    Determine the slice indices that cut terminal gaps off an
    alignment.

    Terminal gaps are gaps that appear before all sequences start and
    after any sequence ends.

    Parameters
    ----------
    alignment : Alignment
        The alignment to find the slice indices for.

    Returns
    -------
    start, stop : int
        Indices that point to the start and exclusive stop of the
        alignment columns without terminal gaps.
        When these indices are used as slice index for an alignment or
        trace, the index would remove terminal gaps.

    See also
    --------
    remove_terminal_gaps

    Examples
    --------

    >>> sequences = [
    ...     NucleotideSequence(seq_string) for seq_string in (
    ...         "AAAAACTGATTC",
    ...         "AAACTGTTCA",
    ...         "CTGATTCAAA"
    ...     )
    ... ]
    >>> trace = np.transpose([
    ...     ( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, -1, -1, -1),
    ...     (-1, -1,  0,  1,  2,  3,  4,  5, -1,  6,  7,  8,  9, -1, -1),
    ...     (-1, -1, -1, -1, -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9),
    ... ])
    >>> alignment = Alignment(sequences, trace)
    >>> print(alignment)
    AAAAACTGATTC---
    --AAACTG-TTCA--
    -----CTGATTCAAA
    >>> print(find_terminal_gaps(alignment))
    (5, 12)
    """
    trace = alignment.trace
    first_columns = []
    last_columns = []
    for seq_index in range(trace.shape[1]):
        # Alignment columns in which this sequence has a symbol,
        # i.e. the trace entry is not the gap value -1
        symbol_columns = np.flatnonzero(trace[:, seq_index] != -1)
        # The sequence spans from its first to its last symbol column
        first_columns.append(symbol_columns[0])
        last_columns.append(symbol_columns[-1])
    # The region without terminal gaps begins where the latest sequence
    # starts and ends where the earliest sequence ends
    # The stop index is exclusive -> +1
    start = np.max(first_columns)
    stop = np.min(last_columns) + 1
    return start, stop
602
+
603
+
604
def remove_terminal_gaps(alignment):
    """
    Remove terminal gaps from an alignment.

    Terminal gaps are gaps that appear before all sequences start and
    after any sequence ends.

    Parameters
    ----------
    alignment : Alignment
        The alignment, where the terminal gaps should be removed from.

    Returns
    -------
    truncated_alignment : Alignment
        A shallow copy of the input `alignment` with a truncated trace,
        that does not contain alignment columns with terminal gaps.

    Raises
    ------
    ValueError
        If at least two sequences have no overlap, so that no alignment
        column would remain after truncation.

    See also
    --------
    find_terminal_gaps

    Examples
    --------

    >>> sequences = [
    ...     NucleotideSequence(seq_string) for seq_string in (
    ...         "AAAAACTGATTC",
    ...         "AAACTGTTCA",
    ...         "CTGATTCAAA"
    ...     )
    ... ]
    >>> trace = np.transpose([
    ...     ( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, -1, -1, -1),
    ...     (-1, -1,  0,  1,  2,  3,  4,  5, -1,  6,  7,  8,  9, -1, -1),
    ...     (-1, -1, -1, -1, -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9),
    ... ])
    >>> alignment = Alignment(sequences, trace)
    >>> print(alignment)
    AAAAACTGATTC---
    --AAACTG-TTCA--
    -----CTGATTCAAA
    >>> truncated_alignment = remove_terminal_gaps(alignment)
    >>> print(truncated_alignment)
    CTGATTC
    CTG-TTC
    CTGATTC
    """
    start, stop = find_terminal_gaps(alignment)
    # 'stop' is exclusive: 'stop == start' already means that no column
    # is left, e.g. if one sequence ends directly before another one
    # starts, so this case must be rejected as well
    if stop <= start:
        raise ValueError(
            "Cannot remove terminal gaps, since at least two sequences have "
            "no overlap and the resulting alignment would be empty"
        )
    return alignment[start : stop]