biotite 1.1.0__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (332) hide show
  1. biotite/__init__.py +18 -0
  2. biotite/application/__init__.py +69 -0
  3. biotite/application/application.py +276 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +500 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +92 -0
  8. biotite/application/blast/webapp.py +428 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +223 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +159 -0
  13. biotite/application/localapp.py +342 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +116 -0
  16. biotite/application/msaapp.py +363 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +227 -0
  19. biotite/application/muscle/app5.py +163 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +452 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +199 -0
  24. biotite/application/util.py +57 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +310 -0
  27. biotite/application/viennarna/rnafold.py +254 -0
  28. biotite/application/viennarna/rnaplot.py +206 -0
  29. biotite/application/viennarna/util.py +77 -0
  30. biotite/application/webapp.py +76 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/entrez/__init__.py +15 -0
  34. biotite/database/entrez/check.py +60 -0
  35. biotite/database/entrez/dbnames.py +91 -0
  36. biotite/database/entrez/download.py +229 -0
  37. biotite/database/entrez/key.py +44 -0
  38. biotite/database/entrez/query.py +262 -0
  39. biotite/database/error.py +16 -0
  40. biotite/database/pubchem/__init__.py +21 -0
  41. biotite/database/pubchem/download.py +258 -0
  42. biotite/database/pubchem/error.py +20 -0
  43. biotite/database/pubchem/query.py +830 -0
  44. biotite/database/pubchem/throttle.py +98 -0
  45. biotite/database/rcsb/__init__.py +13 -0
  46. biotite/database/rcsb/download.py +159 -0
  47. biotite/database/rcsb/query.py +964 -0
  48. biotite/database/uniprot/__init__.py +13 -0
  49. biotite/database/uniprot/check.py +40 -0
  50. biotite/database/uniprot/download.py +129 -0
  51. biotite/database/uniprot/query.py +293 -0
  52. biotite/file.py +232 -0
  53. biotite/sequence/__init__.py +84 -0
  54. biotite/sequence/align/__init__.py +203 -0
  55. biotite/sequence/align/alignment.py +680 -0
  56. biotite/sequence/align/banded.cpython-313-darwin.so +0 -0
  57. biotite/sequence/align/banded.pyx +652 -0
  58. biotite/sequence/align/buckets.py +71 -0
  59. biotite/sequence/align/cigar.py +425 -0
  60. biotite/sequence/align/kmeralphabet.cpython-313-darwin.so +0 -0
  61. biotite/sequence/align/kmeralphabet.pyx +595 -0
  62. biotite/sequence/align/kmersimilarity.cpython-313-darwin.so +0 -0
  63. biotite/sequence/align/kmersimilarity.pyx +233 -0
  64. biotite/sequence/align/kmertable.cpython-313-darwin.so +0 -0
  65. biotite/sequence/align/kmertable.pyx +3411 -0
  66. biotite/sequence/align/localgapped.cpython-313-darwin.so +0 -0
  67. biotite/sequence/align/localgapped.pyx +892 -0
  68. biotite/sequence/align/localungapped.cpython-313-darwin.so +0 -0
  69. biotite/sequence/align/localungapped.pyx +279 -0
  70. biotite/sequence/align/matrix.py +622 -0
  71. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  72. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  73. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  74. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  75. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  76. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  77. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  78. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  79. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  80. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  81. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  82. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  83. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  84. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  85. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  86. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  87. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  88. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  89. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  93. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  94. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  95. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  96. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  97. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  98. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  99. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  100. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  101. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  102. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  103. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  104. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  105. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  106. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  107. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  108. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  109. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  110. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  111. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  112. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  113. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  114. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  115. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  116. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  117. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  118. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  119. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  120. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  121. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  122. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  154. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  155. biotite/sequence/align/matrix_data/PB.license +21 -0
  156. biotite/sequence/align/matrix_data/PB.mat +18 -0
  157. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  158. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  159. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  160. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  161. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  162. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  163. biotite/sequence/align/multiple.cpython-313-darwin.so +0 -0
  164. biotite/sequence/align/multiple.pyx +620 -0
  165. biotite/sequence/align/pairwise.cpython-313-darwin.so +0 -0
  166. biotite/sequence/align/pairwise.pyx +587 -0
  167. biotite/sequence/align/permutation.cpython-313-darwin.so +0 -0
  168. biotite/sequence/align/permutation.pyx +313 -0
  169. biotite/sequence/align/primes.txt +821 -0
  170. biotite/sequence/align/selector.cpython-313-darwin.so +0 -0
  171. biotite/sequence/align/selector.pyx +954 -0
  172. biotite/sequence/align/statistics.py +264 -0
  173. biotite/sequence/align/tracetable.cpython-313-darwin.so +0 -0
  174. biotite/sequence/align/tracetable.pxd +64 -0
  175. biotite/sequence/align/tracetable.pyx +370 -0
  176. biotite/sequence/alphabet.py +555 -0
  177. biotite/sequence/annotation.py +830 -0
  178. biotite/sequence/codec.cpython-313-darwin.so +0 -0
  179. biotite/sequence/codec.pyx +155 -0
  180. biotite/sequence/codon.py +477 -0
  181. biotite/sequence/codon_tables.txt +202 -0
  182. biotite/sequence/graphics/__init__.py +33 -0
  183. biotite/sequence/graphics/alignment.py +1115 -0
  184. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  185. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  186. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  187. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  188. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  189. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  190. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  191. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  192. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  193. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  194. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  195. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  196. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  197. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  198. biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
  199. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  200. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  201. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  202. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  203. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  204. biotite/sequence/graphics/colorschemes.py +170 -0
  205. biotite/sequence/graphics/dendrogram.py +229 -0
  206. biotite/sequence/graphics/features.py +544 -0
  207. biotite/sequence/graphics/logo.py +104 -0
  208. biotite/sequence/graphics/plasmid.py +712 -0
  209. biotite/sequence/io/__init__.py +12 -0
  210. biotite/sequence/io/fasta/__init__.py +22 -0
  211. biotite/sequence/io/fasta/convert.py +284 -0
  212. biotite/sequence/io/fasta/file.py +265 -0
  213. biotite/sequence/io/fastq/__init__.py +19 -0
  214. biotite/sequence/io/fastq/convert.py +117 -0
  215. biotite/sequence/io/fastq/file.py +507 -0
  216. biotite/sequence/io/genbank/__init__.py +17 -0
  217. biotite/sequence/io/genbank/annotation.py +269 -0
  218. biotite/sequence/io/genbank/file.py +573 -0
  219. biotite/sequence/io/genbank/metadata.py +336 -0
  220. biotite/sequence/io/genbank/sequence.py +171 -0
  221. biotite/sequence/io/general.py +201 -0
  222. biotite/sequence/io/gff/__init__.py +26 -0
  223. biotite/sequence/io/gff/convert.py +128 -0
  224. biotite/sequence/io/gff/file.py +450 -0
  225. biotite/sequence/phylo/__init__.py +36 -0
  226. biotite/sequence/phylo/nj.cpython-313-darwin.so +0 -0
  227. biotite/sequence/phylo/nj.pyx +221 -0
  228. biotite/sequence/phylo/tree.cpython-313-darwin.so +0 -0
  229. biotite/sequence/phylo/tree.pyx +1169 -0
  230. biotite/sequence/phylo/upgma.cpython-313-darwin.so +0 -0
  231. biotite/sequence/phylo/upgma.pyx +164 -0
  232. biotite/sequence/profile.py +567 -0
  233. biotite/sequence/search.py +118 -0
  234. biotite/sequence/seqtypes.py +713 -0
  235. biotite/sequence/sequence.py +374 -0
  236. biotite/setup_ccd.py +197 -0
  237. biotite/structure/__init__.py +133 -0
  238. biotite/structure/alphabet/__init__.py +25 -0
  239. biotite/structure/alphabet/encoder.py +332 -0
  240. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  241. biotite/structure/alphabet/i3d.py +110 -0
  242. biotite/structure/alphabet/layers.py +86 -0
  243. biotite/structure/alphabet/pb.license +21 -0
  244. biotite/structure/alphabet/pb.py +171 -0
  245. biotite/structure/alphabet/unkerasify.py +122 -0
  246. biotite/structure/atoms.py +1554 -0
  247. biotite/structure/basepairs.py +1404 -0
  248. biotite/structure/bonds.cpython-313-darwin.so +0 -0
  249. biotite/structure/bonds.pyx +1972 -0
  250. biotite/structure/box.py +588 -0
  251. biotite/structure/celllist.cpython-313-darwin.so +0 -0
  252. biotite/structure/celllist.pyx +849 -0
  253. biotite/structure/chains.py +314 -0
  254. biotite/structure/charges.cpython-313-darwin.so +0 -0
  255. biotite/structure/charges.pyx +520 -0
  256. biotite/structure/compare.py +274 -0
  257. biotite/structure/density.py +109 -0
  258. biotite/structure/dotbracket.py +214 -0
  259. biotite/structure/error.py +39 -0
  260. biotite/structure/filter.py +590 -0
  261. biotite/structure/geometry.py +655 -0
  262. biotite/structure/graphics/__init__.py +13 -0
  263. biotite/structure/graphics/atoms.py +243 -0
  264. biotite/structure/graphics/rna.py +295 -0
  265. biotite/structure/hbond.py +428 -0
  266. biotite/structure/info/__init__.py +24 -0
  267. biotite/structure/info/atom_masses.json +121 -0
  268. biotite/structure/info/atoms.py +81 -0
  269. biotite/structure/info/bonds.py +149 -0
  270. biotite/structure/info/ccd.py +202 -0
  271. biotite/structure/info/components.bcif +0 -0
  272. biotite/structure/info/groups.py +131 -0
  273. biotite/structure/info/masses.py +121 -0
  274. biotite/structure/info/misc.py +138 -0
  275. biotite/structure/info/radii.py +197 -0
  276. biotite/structure/info/standardize.py +186 -0
  277. biotite/structure/integrity.py +215 -0
  278. biotite/structure/io/__init__.py +29 -0
  279. biotite/structure/io/dcd/__init__.py +13 -0
  280. biotite/structure/io/dcd/file.py +67 -0
  281. biotite/structure/io/general.py +243 -0
  282. biotite/structure/io/gro/__init__.py +14 -0
  283. biotite/structure/io/gro/file.py +344 -0
  284. biotite/structure/io/mol/__init__.py +20 -0
  285. biotite/structure/io/mol/convert.py +112 -0
  286. biotite/structure/io/mol/ctab.py +415 -0
  287. biotite/structure/io/mol/header.py +120 -0
  288. biotite/structure/io/mol/mol.py +149 -0
  289. biotite/structure/io/mol/sdf.py +914 -0
  290. biotite/structure/io/netcdf/__init__.py +13 -0
  291. biotite/structure/io/netcdf/file.py +64 -0
  292. biotite/structure/io/pdb/__init__.py +20 -0
  293. biotite/structure/io/pdb/convert.py +307 -0
  294. biotite/structure/io/pdb/file.py +1290 -0
  295. biotite/structure/io/pdb/hybrid36.cpython-313-darwin.so +0 -0
  296. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  297. biotite/structure/io/pdbqt/__init__.py +15 -0
  298. biotite/structure/io/pdbqt/convert.py +113 -0
  299. biotite/structure/io/pdbqt/file.py +688 -0
  300. biotite/structure/io/pdbx/__init__.py +23 -0
  301. biotite/structure/io/pdbx/bcif.py +656 -0
  302. biotite/structure/io/pdbx/cif.py +1075 -0
  303. biotite/structure/io/pdbx/component.py +245 -0
  304. biotite/structure/io/pdbx/compress.py +321 -0
  305. biotite/structure/io/pdbx/convert.py +1745 -0
  306. biotite/structure/io/pdbx/encoding.cpython-313-darwin.so +0 -0
  307. biotite/structure/io/pdbx/encoding.pyx +1031 -0
  308. biotite/structure/io/trajfile.py +693 -0
  309. biotite/structure/io/trr/__init__.py +13 -0
  310. biotite/structure/io/trr/file.py +43 -0
  311. biotite/structure/io/xtc/__init__.py +13 -0
  312. biotite/structure/io/xtc/file.py +43 -0
  313. biotite/structure/mechanics.py +73 -0
  314. biotite/structure/molecules.py +352 -0
  315. biotite/structure/pseudoknots.py +628 -0
  316. biotite/structure/rdf.py +245 -0
  317. biotite/structure/repair.py +304 -0
  318. biotite/structure/residues.py +572 -0
  319. biotite/structure/sasa.cpython-313-darwin.so +0 -0
  320. biotite/structure/sasa.pyx +322 -0
  321. biotite/structure/segments.py +178 -0
  322. biotite/structure/sequence.py +111 -0
  323. biotite/structure/sse.py +308 -0
  324. biotite/structure/superimpose.py +689 -0
  325. biotite/structure/transform.py +530 -0
  326. biotite/structure/util.py +168 -0
  327. biotite/version.py +16 -0
  328. biotite/visualize.py +265 -0
  329. biotite-1.1.0.dist-info/METADATA +190 -0
  330. biotite-1.1.0.dist-info/RECORD +332 -0
  331. biotite-1.1.0.dist-info/WHEEL +4 -0
  332. biotite-1.1.0.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,680 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence.align"
6
+ __author__ = "Patrick Kunzmann"
7
+
8
+ import numbers
9
+ import textwrap
10
+ from collections.abc import Sequence
11
+ import numpy as np
12
+
13
+ __all__ = [
14
+ "Alignment",
15
+ "get_codes",
16
+ "get_symbols",
17
+ "get_sequence_identity",
18
+ "get_pairwise_sequence_identity",
19
+ "score",
20
+ "find_terminal_gaps",
21
+ "remove_terminal_gaps",
22
+ ]
23
+
24
+
25
class Alignment(object):
    """
    An :class:`Alignment` object stores information about which symbols
    of *n* sequences are aligned to each other and it stores the
    corresponding alignment score.

    Instead of saving a list of aligned symbols, this class saves the
    original *n* sequences, that were aligned, and a so called *trace*,
    which indicates the aligned symbols of these sequences.
    The trace is a *(m x n)* :class:`ndarray` with alignment length
    *m* and sequence count *n*.
    Each element of the trace is the index in the corresponding
    sequence.
    A gap is represented by the value -1.

    Furthermore this class provides multiple utility functions for
    conversion into strings in order to make the alignment human
    readable.

    Unless an :class:`Alignment` object is the result of a multiple
    sequence alignment, the object will contain only two sequences.

    All attributes of this class are publicly accessible.

    Parameters
    ----------
    sequences : list
        A list of aligned sequences.
    trace : ndarray, dtype=int, shape=(m,n)
        The alignment trace.
    score : int, optional
        Alignment score.

    Attributes
    ----------
    sequences : list
        A list of aligned sequences.
    trace : ndarray, dtype=int, shape=(m,n)
        The alignment trace.
    score : int
        Alignment score.

    Examples
    --------

    >>> seq1 = NucleotideSequence("CGTCAT")
    >>> seq2 = NucleotideSequence("TCATGC")
    >>> matrix = SubstitutionMatrix.std_nucleotide_matrix()
    >>> ali = align_optimal(seq1, seq2, matrix)[0]
    >>> print(ali)
    CGTCAT--
    --TCATGC
    >>> print(ali.trace)
    [[ 0 -1]
     [ 1 -1]
     [ 2  0]
     [ 3  1]
     [ 4  2]
     [ 5  3]
     [-1  4]
     [-1  5]]
    >>> print(ali[1:4].trace)
    [[ 1 -1]
     [ 2  0]
     [ 3  1]]
    >>> print(ali[1:4, 0:1].trace)
    [[1]
     [2]
     [3]]
    """

    def __init__(self, sequences, trace, score=None):
        # Shallow copy of the container: later changes to the caller's
        # list do not affect this alignment
        self.sequences = sequences.copy()
        self.trace = trace
        self.score = score

    def __repr__(self):
        """Represent the alignment as a string for debugging."""
        return (
            f"Alignment([{', '.join([seq.__repr__() for seq in self.sequences])}], "
            f"np.{np.array_repr(self.trace)}, score={self.score})"
        )

    def _gapped_str(self, seq_index):
        """
        Build the gapped string for the sequence at `seq_index`:
        the symbol for each trace position, ``-`` for each gap.
        """
        sequence = self.sequences[seq_index]
        return "".join(
            "-" if row[seq_index] == -1 else str(sequence[row[seq_index]])
            for row in self.trace
        )

    def get_gapped_sequences(self):
        """
        Get the string representation of the gapped sequences.

        Returns
        -------
        sequences : list of str
            The list of gapped sequence strings. The order is the same
            as in `Alignment.sequences`.
        """
        return [self._gapped_str(i) for i in range(len(self.sequences))]

    def __str__(self):
        # The pretty representation is only possible, if each symbol
        # of each sequence is represented by a single letter
        if not all(_is_single_letter(seq.alphabet) for seq in self.sequences):
            return super().__str__()

        # First dimension: sequence number,
        # second dimension: line number
        wrapper = textwrap.TextWrapper(break_on_hyphens=False)
        seq_str_lines_list = [
            wrapper.wrap(self._gapped_str(i)) for i in range(len(self.sequences))
        ]
        # One block per wrapped line, containing that line of each
        # sequence; blocks are separated by an empty line
        blocks = [
            "\n".join(seq_lines[row_i] for seq_lines in seq_str_lines_list)
            for row_i in range(len(seq_str_lines_list[0]))
        ]
        return "\n\n".join(blocks)

    def __getitem__(self, index):
        """
        Select a sub-alignment.

        A 1D index selects alignment columns (rows of the trace).
        A 2D index additionally selects sequences with its second
        element.

        Raises
        ------
        IndexError
            If a tuple index has more than two elements, contains an
            integer or its second element has an unsupported type.
        """
        if isinstance(index, tuple):
            if len(index) > 2:
                raise IndexError("Only 1D or 2D indices are allowed")
            # Neither the column nor the sequence index may be an
            # integer, as this would reduce the trace dimensionality
            if any(isinstance(i, numbers.Integral) for i in index):
                raise IndexError(
                    "Integers are invalid indices for alignments, "
                    "a single sequence or alignment column cannot be "
                    "selected"
                )
            return Alignment(
                Alignment._index_sequences(self.sequences, index[1]),
                self.trace[index],
                self.score,
            )
        else:
            return Alignment(self.sequences, self.trace[index], self.score)

    def __iter__(self):
        # Explicitly forbidden: otherwise Python would fall back to
        # iteration via '__getitem__()' with integer indices
        raise TypeError("'Alignment' object is not iterable")

    def __len__(self):
        return len(self.trace)

    def __eq__(self, item):
        if not isinstance(item, Alignment):
            return False
        if self.sequences != item.sequences:
            return False
        if not np.array_equal(self.trace, item.trace):
            return False
        if self.score != item.score:
            return False
        return True

    @staticmethod
    def _index_sequences(sequences, index):
        """
        Apply `index` (boolean mask array, index array/list/tuple or
        slice) to the list of sequences.
        """
        if isinstance(index, np.ndarray) and index.dtype == bool:
            return [seq for seq, mask in zip(sequences, index) if mask]
        if isinstance(index, (list, tuple, np.ndarray)):
            return [sequences[i] for i in index]
        if isinstance(index, slice):
            return sequences[index]
        raise IndexError(f"Invalid alignment index type '{type(index).__name__}'")

    @staticmethod
    def trace_from_strings(seq_str_list):
        """
        Create a trace from strings that represent aligned sequences.

        Parameters
        ----------
        seq_str_list : list of str
            The strings, where each one represents a sequence
            (with gaps) in an alignment.
            A ``-`` is interpreted as gap.
            All strings are expected to have the same length.

        Returns
        -------
        trace : ndarray, dtype=int, shape=(m,n)
            The created trace for *n* strings of length *m*.

        Raises
        ------
        ValueError
            If less than two strings are given.
        """
        if len(seq_str_list) < 2:
            raise ValueError("An alignment must contain at least two sequences")
        # The index of the next symbol in each ungapped sequence
        seq_i = np.zeros(len(seq_str_list), dtype=int)
        # Get length of string (same length for all strings)
        # rather than length of list
        n_columns = len(seq_str_list[0])
        # Every position is a gap until a symbol is found
        trace = np.full((n_columns, len(seq_str_list)), -1, dtype=int)
        for pos_i in range(n_columns):
            for str_j, seq_str in enumerate(seq_str_list):
                if seq_str[pos_i] != "-":
                    trace[pos_i, str_j] = seq_i[str_j]
                    seq_i[str_j] += 1
        return trace
235
+
236
+
237
def get_codes(alignment):
    """
    Get the sequence codes of the sequences in the alignment.

    The codes are built from the trace:
    Instead of the indices of the aligned symbols (trace), the return
    value contains the corresponding symbol codes for each index.
    Gaps are still represented by *-1*.

    Parameters
    ----------
    alignment : Alignment
        The alignment to get the sequence codes for.

    Returns
    -------
    codes : ndarray, dtype=int, shape=(n,m)
        The sequence codes for the alignment.
        The shape is *(n,m)* for *n* sequences and *m* alignment columns.
        The array uses *-1* values for gaps.

    Examples
    --------

    >>> seq1 = NucleotideSequence("CGTCAT")
    >>> seq2 = NucleotideSequence("TCATGC")
    >>> matrix = SubstitutionMatrix.std_nucleotide_matrix()
    >>> ali = align_optimal(seq1, seq2, matrix)[0]
    >>> print(ali)
    CGTCAT--
    --TCATGC
    >>> print(get_codes(ali))
    [[ 1  2  3  1  0  3 -1 -1]
     [-1 -1  3  1  0  3  2  1]]
    """
    trace = alignment.trace
    sequences = alignment.sequences

    # The number of sequences is the first dimension
    # The array is explicitly int64, so that the gap value -1 is not
    # affected by the (possibly unsigned) dtype of the sequence code
    codes = np.full((trace.shape[1], trace.shape[0]), -1, dtype=np.int64)
    for i, sequence in enumerate(sequences):
        seq_indices = trace[:, i]
        no_gap_mask = seq_indices != -1
        # Only look up non-gap positions:
        # a gap (-1) is not a valid position in the sequence code and
        # would even be out of bounds for an empty sequence
        codes[i, no_gap_mask] = sequence.code[seq_indices[no_gap_mask]]

    return codes
286
+
287
+
288
def get_symbols(alignment):
    """
    Get the symbols of the sequences in the alignment.

    This works like :func:`get_codes()`, but the return value contains
    the decoded symbols instead of the symbol codes.
    Gaps are represented by *None* values.

    Parameters
    ----------
    alignment : Alignment
        The alignment to get the symbols for.

    Returns
    -------
    symbols : list of list
        The nested list of symbols.

    See Also
    --------
    get_codes

    Examples
    --------

    >>> seq1 = NucleotideSequence("CGTCAT")
    >>> seq2 = NucleotideSequence("TCATGC")
    >>> matrix = SubstitutionMatrix.std_nucleotide_matrix()
    >>> ali = align_optimal(seq1, seq2, matrix)[0]
    >>> print(ali)
    CGTCAT--
    --TCATGC
    >>> print(get_symbols(ali))
    [['C', 'G', 'T', 'C', 'A', 'T', None, None], [None, None, 'T', 'C', 'A', 'T', 'G', 'C']]
    """
    symbols = []
    for i, seq_codes in enumerate(get_codes(alignment)):
        is_symbol = seq_codes != -1
        # Only the non-gap codes can be decoded by the alphabet
        decoded = alignment.sequences[i].get_alphabet().decode_multiple(
            seq_codes[is_symbol]
        )
        if isinstance(decoded, np.ndarray):
            decoded = decoded.tolist()
        # Start with gaps everywhere and fill in the decoded symbols
        row = np.full(len(seq_codes), None, dtype=object)
        row[is_symbol] = decoded
        symbols.append(row.tolist())
    return symbols
333
+
334
+
335
def get_sequence_identity(alignment, mode="not_terminal"):
    """
    Calculate the sequence identity for an alignment.

    The identity is the number of matching columns divided by a
    measure for the alignment length, which is chosen via the `mode`
    parameter.
    A column counts as match, if all sequences have the same symbol
    in this column and this symbol is not a gap.

    Parameters
    ----------
    alignment : Alignment
        The alignment to calculate the identity for.
    mode : {'all', 'not_terminal', 'shortest'}, optional
        The calculation mode for alignment length.

            - **all** - The number of matches divided by the number of
              all alignment columns.
            - **not_terminal** - The number of matches divided by the
              number of alignment columns that are not terminal gaps in
              any of the sequences.
            - **shortest** - The number of matches divided by the
              length of the shortest sequence.

        Default is *not_terminal*.

    Returns
    -------
    identity : float
        The sequence identity, ranging between 0 and 1.

    Raises
    ------
    ValueError
        If `mode` is unknown or, in *not_terminal* mode, if there is
        no column range without terminal gaps.

    See Also
    --------
    get_pairwise_sequence_identity
    """

    def _is_match(column):
        # One unique value -> all symbols match
        distinct_codes = np.unique(column)
        return len(distinct_codes) == 1 and distinct_codes[0] != -1

    # Iterating over the transposed codes gives one alignment column
    # per iteration
    matches = sum(1 for column in get_codes(alignment).T if _is_match(column))

    if mode == "all":
        return matches / len(alignment)

    if mode == "shortest":
        return matches / min([len(seq) for seq in alignment.sequences])

    if mode == "not_terminal":
        start, stop = find_terminal_gaps(alignment)
        if stop <= start:
            raise ValueError(
                "Cannot calculate non-terminal identity, "
                "at least two sequences have no overlap"
            )
        return matches / (stop - start)

    raise ValueError(f"'{mode}' is an invalid calculation mode")
396
+
397
+
398
def get_pairwise_sequence_identity(alignment, mode="not_terminal"):
    """
    Calculate the pairwise sequence identity for an alignment.

    For each pair of sequences the identity is the number of matching
    columns divided by a measure for the alignment length, which is
    chosen via the `mode` parameter.
    A column counts as match for a pair, if both sequences have the
    same symbol in this column and this symbol is not a gap.

    Parameters
    ----------
    alignment : Alignment, length=n
        The alignment to calculate the pairwise sequence identity for.
    mode : {'all', 'not_terminal', 'shortest'}, optional
        The calculation mode for alignment length.

            - **all** - The number of matches divided by the number of
              all alignment columns.
            - **not_terminal** - The number of matches divided by the
              number of alignment columns that are not terminal gaps in
              any of the two considered sequences.
            - **shortest** - The number of matches divided by the
              length of the shortest one of the two sequences.

        Default is *not_terminal*.

    Returns
    -------
    identity : ndarray, dtype=float, shape=(n,n)
        The pairwise sequence identity, ranging between 0 and 1.

    Raises
    ------
    ValueError
        If `mode` is unknown or, in *not_terminal* mode, if a pair of
        sequences has no column range without terminal gaps.

    See Also
    --------
    get_sequence_identity
    """
    codes = get_codes(alignment)
    n_seq = len(codes)

    # Broadcast the codes against each other:
    # element [i, j, k] compares column k of sequence i and sequence j
    first = codes[:, np.newaxis, :]
    second = codes[np.newaxis, :, :]
    # A match requires identical codes, where none of them is a gap
    is_match = (first == second) & (first != -1) & (second != -1)
    matches = np.count_nonzero(is_match, axis=-1)

    if mode == "all":
        length = len(alignment)
    elif mode == "not_terminal":
        length = np.zeros((n_seq, n_seq))
        for i, j in np.ndindex(n_seq, n_seq):
            # Restrict the alignment to the considered sequence pair
            # to find the latest start and earliest stop of both
            pair_alignment = alignment[:, [i, j]]
            start, stop = find_terminal_gaps(pair_alignment)
            if stop <= start:
                raise ValueError(
                    "Cannot calculate non-terminal identity, "
                    "as the two sequences have no overlap"
                )
            length[i, j] = stop - start
    elif mode == "shortest":
        length = np.zeros((n_seq, n_seq))
        for i, j in np.ndindex(n_seq, n_seq):
            length[i, j] = min(
                [len(alignment.sequences[i]), len(alignment.sequences[j])]
            )
    else:
        raise ValueError(f"'{mode}' is an invalid calculation mode")

    return matches / length
471
+
472
+
473
def score(alignment, matrix, gap_penalty=-10, terminal_penalty=True):
    """
    Calculate the similarity score of an alignment.

    If the alignment contains more than two sequences,
    all pairwise scores are counted.

    Parameters
    ----------
    alignment : Alignment
        The alignment to calculate the score for.
    matrix : SubstitutionMatrix
        The substitution matrix used for scoring.
    gap_penalty : int or (tuple, dtype=int), optional
        If an integer is provided, the value will be interpreted as
        general gap penalty. If a tuple is provided, an affine gap
        penalty is used. The first integer in the tuple is the gap
        opening penalty, the second integer is the gap extension
        penalty.
        The values need to be negative. (Default: *-10*)
    terminal_penalty : bool, optional
        If true, gap penalties are applied to terminal gaps.
        (Default: True)

    Returns
    -------
    score : int
        The similarity score.

    Raises
    ------
    TypeError
        If `gap_penalty` is neither a number nor a sequence.
    """
    codes = get_codes(alignment)
    matrix = matrix.score_matrix()

    # Sum similarity scores (without gaps)
    score = 0
    # Iterate over all positions
    for pos in range(codes.shape[1]):
        column = codes[:, pos]
        # Iterate over all possible pairs
        # Do not count self-similarity
        # and do not count similarity twice (not S(i,j) and S(j,i))
        for i in range(codes.shape[0]):
            for j in range(i + 1, codes.shape[0]):
                code_i = column[i]
                code_j = column[j]
                # Ignore gaps
                if code_i != -1 and code_j != -1:
                    score += matrix[code_i, code_j]

    # Sum gap penalties
    if isinstance(gap_penalty, numbers.Real):
        # Linear gap penalty: opening and extension cost the same
        gap_open = gap_penalty
        gap_ext = gap_penalty
    elif isinstance(gap_penalty, Sequence):
        gap_open = gap_penalty[0]
        gap_ext = gap_penalty[1]
    else:
        raise TypeError("Gap penalty must be either integer or tuple")

    # The penalized column range is identical for every sequence,
    # hence it is determined once instead of once per sequence.
    # Without any sequence there is nothing to penalize, so the
    # terminal gap search is skipped in that case.
    if terminal_penalty or len(codes) == 0:
        start_index = 0
        stop_index = codes.shape[1]
    else:
        # Find a start and stop index excluding terminal gaps
        start_index, stop_index = find_terminal_gaps(alignment)

    # Iterate over all sequences
    for seq_code in codes:
        in_gap = False
        for i in range(start_index, stop_index):
            if seq_code[i] == -1:
                # The first gap column opens the gap,
                # each directly following one extends it
                if in_gap:
                    score += gap_ext
                else:
                    score += gap_open
                in_gap = True
            else:
                in_gap = False
    return score
549
+
550
+
551
def find_terminal_gaps(alignment):
    """
    Find the slice indices that would remove terminal gaps from an
    alignment.

    A column belongs to the terminal gaps, if it lies before the
    latest start or after the earliest end of any sequence in the
    alignment.

    Parameters
    ----------
    alignment : Alignment
        The alignment, where the slice indices should be found in.

    Returns
    -------
    start, stop : int
        Indices that point to the start and exclusive stop of the
        alignment columns without terminal gaps.
        When these indices are used as slice index for an alignment or
        trace, the index would remove terminal gaps.

    See also
    --------
    remove_terminal_gaps

    Examples
    --------

    >>> sequences = [
    ...     NucleotideSequence(seq_string) for seq_string in (
    ...         "AAAAACTGATTC",
    ...         "AAACTGTTCA",
    ...         "CTGATTCAAA"
    ...     )
    ... ]
    >>> trace = np.transpose([
    ...     ( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, -1, -1, -1),
    ...     (-1, -1,  0,  1,  2,  3,  4,  5, -1,  6,  7,  8,  9, -1, -1),
    ...     (-1, -1, -1, -1, -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9),
    ... ])
    >>> alignment = Alignment(sequences, trace)
    >>> print(alignment)
    AAAAACTGATTC---
    --AAACTG-TTCA--
    -----CTGATTCAAA
    >>> print(find_terminal_gaps(alignment))
    (5, 12)
    """
    first_columns = []
    last_columns = []
    # Each column of the trace belongs to one sequence:
    # collect the alignment positions of its first and last symbol
    for seq_trace in alignment.trace.T:
        symbol_positions = np.flatnonzero(seq_trace != -1)
        first_columns.append(symbol_positions[0])
        last_columns.append(symbol_positions[-1])
    # Terminal gaps end where the last sequence starts...
    start = np.max(first_columns).item()
    # ...and begin again right after the first sequence ends;
    # the stop index is exclusive -> +1
    stop = np.min(last_columns).item() + 1
    return start, stop
610
+
611
+
612
def remove_terminal_gaps(alignment):
    """
    Remove terminal gaps from an alignment.

    A column belongs to the terminal gaps, if it lies before the
    latest start or after the earliest end of any sequence in the
    alignment.

    Parameters
    ----------
    alignment : Alignment
        The alignment, where the terminal gaps should be removed from.

    Returns
    -------
    truncated_alignment : Alignment
        A shallow copy of the input `alignment` with a truncated trace,
        that does not contain alignment columns with terminal gaps.

    Raises
    ------
    ValueError
        If at least two sequences have no overlap, i.e. no alignment
        column would remain after the truncation.

    See also
    --------
    find_terminal_gaps

    Examples
    --------

    >>> sequences = [
    ...     NucleotideSequence(seq_string) for seq_string in (
    ...         "AAAAACTGATTC",
    ...         "AAACTGTTCA",
    ...         "CTGATTCAAA"
    ...     )
    ... ]
    >>> trace = np.transpose([
    ...     ( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, -1, -1, -1),
    ...     (-1, -1,  0,  1,  2,  3,  4,  5, -1,  6,  7,  8,  9, -1, -1),
    ...     (-1, -1, -1, -1, -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9),
    ... ])
    >>> alignment = Alignment(sequences, trace)
    >>> print(alignment)
    AAAAACTGATTC---
    --AAACTG-TTCA--
    -----CTGATTCAAA
    >>> truncated_alignment = remove_terminal_gaps(alignment)
    >>> print(truncated_alignment)
    CTGATTC
    CTG-TTC
    CTGATTC
    """
    start, stop = find_terminal_gaps(alignment)
    # 'stop' is exclusive: 'stop == start' already means that one
    # sequence ends directly before another one starts, so the
    # truncated alignment would contain no column at all
    if stop <= start:
        raise ValueError(
            "Cannot remove terminal gaps, since at least two sequences have "
            "no overlap and the resulting alignment would be empty"
        )
    return alignment[start:stop]
667
+
668
+
669
def _is_single_letter(alphabet):
    """
    Relaxed variant of the ``is_letter_alphabet()`` check of the given
    alphabet: It is sufficient that the string representation of each
    symbol consists of exactly one character.
    """
    if alphabet.is_letter_alphabet():
        return True
    # Fall back to inspecting the string form of every single symbol
    return all(len(str(symbol)) == 1 for symbol in alphabet)