biotite 1.1.0__cp313-cp313-macosx_10_13_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (332) hide show
  1. biotite/__init__.py +18 -0
  2. biotite/application/__init__.py +69 -0
  3. biotite/application/application.py +276 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +500 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +92 -0
  8. biotite/application/blast/webapp.py +428 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +223 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +159 -0
  13. biotite/application/localapp.py +342 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +116 -0
  16. biotite/application/msaapp.py +363 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +227 -0
  19. biotite/application/muscle/app5.py +163 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +452 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +199 -0
  24. biotite/application/util.py +57 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +310 -0
  27. biotite/application/viennarna/rnafold.py +254 -0
  28. biotite/application/viennarna/rnaplot.py +206 -0
  29. biotite/application/viennarna/util.py +77 -0
  30. biotite/application/webapp.py +76 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/entrez/__init__.py +15 -0
  34. biotite/database/entrez/check.py +60 -0
  35. biotite/database/entrez/dbnames.py +91 -0
  36. biotite/database/entrez/download.py +229 -0
  37. biotite/database/entrez/key.py +44 -0
  38. biotite/database/entrez/query.py +262 -0
  39. biotite/database/error.py +16 -0
  40. biotite/database/pubchem/__init__.py +21 -0
  41. biotite/database/pubchem/download.py +258 -0
  42. biotite/database/pubchem/error.py +20 -0
  43. biotite/database/pubchem/query.py +830 -0
  44. biotite/database/pubchem/throttle.py +98 -0
  45. biotite/database/rcsb/__init__.py +13 -0
  46. biotite/database/rcsb/download.py +159 -0
  47. biotite/database/rcsb/query.py +964 -0
  48. biotite/database/uniprot/__init__.py +13 -0
  49. biotite/database/uniprot/check.py +40 -0
  50. biotite/database/uniprot/download.py +129 -0
  51. biotite/database/uniprot/query.py +293 -0
  52. biotite/file.py +232 -0
  53. biotite/sequence/__init__.py +84 -0
  54. biotite/sequence/align/__init__.py +203 -0
  55. biotite/sequence/align/alignment.py +680 -0
  56. biotite/sequence/align/banded.cpython-313-darwin.so +0 -0
  57. biotite/sequence/align/banded.pyx +652 -0
  58. biotite/sequence/align/buckets.py +71 -0
  59. biotite/sequence/align/cigar.py +425 -0
  60. biotite/sequence/align/kmeralphabet.cpython-313-darwin.so +0 -0
  61. biotite/sequence/align/kmeralphabet.pyx +595 -0
  62. biotite/sequence/align/kmersimilarity.cpython-313-darwin.so +0 -0
  63. biotite/sequence/align/kmersimilarity.pyx +233 -0
  64. biotite/sequence/align/kmertable.cpython-313-darwin.so +0 -0
  65. biotite/sequence/align/kmertable.pyx +3411 -0
  66. biotite/sequence/align/localgapped.cpython-313-darwin.so +0 -0
  67. biotite/sequence/align/localgapped.pyx +892 -0
  68. biotite/sequence/align/localungapped.cpython-313-darwin.so +0 -0
  69. biotite/sequence/align/localungapped.pyx +279 -0
  70. biotite/sequence/align/matrix.py +622 -0
  71. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  72. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  73. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  74. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  75. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  76. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  77. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  78. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  79. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  80. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  81. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  82. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  83. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  84. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  85. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  86. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  87. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  88. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  89. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  93. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  94. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  95. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  96. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  97. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  98. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  99. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  100. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  101. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  102. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  103. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  104. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  105. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  106. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  107. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  108. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  109. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  110. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  111. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  112. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  113. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  114. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  115. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  116. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  117. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  118. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  119. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  120. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  121. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  122. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  154. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  155. biotite/sequence/align/matrix_data/PB.license +21 -0
  156. biotite/sequence/align/matrix_data/PB.mat +18 -0
  157. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  158. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  159. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  160. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  161. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  162. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  163. biotite/sequence/align/multiple.cpython-313-darwin.so +0 -0
  164. biotite/sequence/align/multiple.pyx +620 -0
  165. biotite/sequence/align/pairwise.cpython-313-darwin.so +0 -0
  166. biotite/sequence/align/pairwise.pyx +587 -0
  167. biotite/sequence/align/permutation.cpython-313-darwin.so +0 -0
  168. biotite/sequence/align/permutation.pyx +313 -0
  169. biotite/sequence/align/primes.txt +821 -0
  170. biotite/sequence/align/selector.cpython-313-darwin.so +0 -0
  171. biotite/sequence/align/selector.pyx +954 -0
  172. biotite/sequence/align/statistics.py +264 -0
  173. biotite/sequence/align/tracetable.cpython-313-darwin.so +0 -0
  174. biotite/sequence/align/tracetable.pxd +64 -0
  175. biotite/sequence/align/tracetable.pyx +370 -0
  176. biotite/sequence/alphabet.py +555 -0
  177. biotite/sequence/annotation.py +830 -0
  178. biotite/sequence/codec.cpython-313-darwin.so +0 -0
  179. biotite/sequence/codec.pyx +155 -0
  180. biotite/sequence/codon.py +477 -0
  181. biotite/sequence/codon_tables.txt +202 -0
  182. biotite/sequence/graphics/__init__.py +33 -0
  183. biotite/sequence/graphics/alignment.py +1115 -0
  184. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  185. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  186. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  187. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  188. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  189. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  190. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  191. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  192. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  193. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  194. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  195. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  196. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  197. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  198. biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
  199. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  200. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  201. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  202. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  203. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  204. biotite/sequence/graphics/colorschemes.py +170 -0
  205. biotite/sequence/graphics/dendrogram.py +229 -0
  206. biotite/sequence/graphics/features.py +544 -0
  207. biotite/sequence/graphics/logo.py +104 -0
  208. biotite/sequence/graphics/plasmid.py +712 -0
  209. biotite/sequence/io/__init__.py +12 -0
  210. biotite/sequence/io/fasta/__init__.py +22 -0
  211. biotite/sequence/io/fasta/convert.py +284 -0
  212. biotite/sequence/io/fasta/file.py +265 -0
  213. biotite/sequence/io/fastq/__init__.py +19 -0
  214. biotite/sequence/io/fastq/convert.py +117 -0
  215. biotite/sequence/io/fastq/file.py +507 -0
  216. biotite/sequence/io/genbank/__init__.py +17 -0
  217. biotite/sequence/io/genbank/annotation.py +269 -0
  218. biotite/sequence/io/genbank/file.py +573 -0
  219. biotite/sequence/io/genbank/metadata.py +336 -0
  220. biotite/sequence/io/genbank/sequence.py +171 -0
  221. biotite/sequence/io/general.py +201 -0
  222. biotite/sequence/io/gff/__init__.py +26 -0
  223. biotite/sequence/io/gff/convert.py +128 -0
  224. biotite/sequence/io/gff/file.py +450 -0
  225. biotite/sequence/phylo/__init__.py +36 -0
  226. biotite/sequence/phylo/nj.cpython-313-darwin.so +0 -0
  227. biotite/sequence/phylo/nj.pyx +221 -0
  228. biotite/sequence/phylo/tree.cpython-313-darwin.so +0 -0
  229. biotite/sequence/phylo/tree.pyx +1169 -0
  230. biotite/sequence/phylo/upgma.cpython-313-darwin.so +0 -0
  231. biotite/sequence/phylo/upgma.pyx +164 -0
  232. biotite/sequence/profile.py +567 -0
  233. biotite/sequence/search.py +118 -0
  234. biotite/sequence/seqtypes.py +713 -0
  235. biotite/sequence/sequence.py +374 -0
  236. biotite/setup_ccd.py +197 -0
  237. biotite/structure/__init__.py +133 -0
  238. biotite/structure/alphabet/__init__.py +25 -0
  239. biotite/structure/alphabet/encoder.py +332 -0
  240. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  241. biotite/structure/alphabet/i3d.py +110 -0
  242. biotite/structure/alphabet/layers.py +86 -0
  243. biotite/structure/alphabet/pb.license +21 -0
  244. biotite/structure/alphabet/pb.py +171 -0
  245. biotite/structure/alphabet/unkerasify.py +122 -0
  246. biotite/structure/atoms.py +1554 -0
  247. biotite/structure/basepairs.py +1404 -0
  248. biotite/structure/bonds.cpython-313-darwin.so +0 -0
  249. biotite/structure/bonds.pyx +1972 -0
  250. biotite/structure/box.py +588 -0
  251. biotite/structure/celllist.cpython-313-darwin.so +0 -0
  252. biotite/structure/celllist.pyx +849 -0
  253. biotite/structure/chains.py +314 -0
  254. biotite/structure/charges.cpython-313-darwin.so +0 -0
  255. biotite/structure/charges.pyx +520 -0
  256. biotite/structure/compare.py +274 -0
  257. biotite/structure/density.py +109 -0
  258. biotite/structure/dotbracket.py +214 -0
  259. biotite/structure/error.py +39 -0
  260. biotite/structure/filter.py +590 -0
  261. biotite/structure/geometry.py +655 -0
  262. biotite/structure/graphics/__init__.py +13 -0
  263. biotite/structure/graphics/atoms.py +243 -0
  264. biotite/structure/graphics/rna.py +295 -0
  265. biotite/structure/hbond.py +428 -0
  266. biotite/structure/info/__init__.py +24 -0
  267. biotite/structure/info/atom_masses.json +121 -0
  268. biotite/structure/info/atoms.py +81 -0
  269. biotite/structure/info/bonds.py +149 -0
  270. biotite/structure/info/ccd.py +202 -0
  271. biotite/structure/info/components.bcif +0 -0
  272. biotite/structure/info/groups.py +131 -0
  273. biotite/structure/info/masses.py +121 -0
  274. biotite/structure/info/misc.py +138 -0
  275. biotite/structure/info/radii.py +197 -0
  276. biotite/structure/info/standardize.py +186 -0
  277. biotite/structure/integrity.py +215 -0
  278. biotite/structure/io/__init__.py +29 -0
  279. biotite/structure/io/dcd/__init__.py +13 -0
  280. biotite/structure/io/dcd/file.py +67 -0
  281. biotite/structure/io/general.py +243 -0
  282. biotite/structure/io/gro/__init__.py +14 -0
  283. biotite/structure/io/gro/file.py +344 -0
  284. biotite/structure/io/mol/__init__.py +20 -0
  285. biotite/structure/io/mol/convert.py +112 -0
  286. biotite/structure/io/mol/ctab.py +415 -0
  287. biotite/structure/io/mol/header.py +120 -0
  288. biotite/structure/io/mol/mol.py +149 -0
  289. biotite/structure/io/mol/sdf.py +914 -0
  290. biotite/structure/io/netcdf/__init__.py +13 -0
  291. biotite/structure/io/netcdf/file.py +64 -0
  292. biotite/structure/io/pdb/__init__.py +20 -0
  293. biotite/structure/io/pdb/convert.py +307 -0
  294. biotite/structure/io/pdb/file.py +1290 -0
  295. biotite/structure/io/pdb/hybrid36.cpython-313-darwin.so +0 -0
  296. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  297. biotite/structure/io/pdbqt/__init__.py +15 -0
  298. biotite/structure/io/pdbqt/convert.py +113 -0
  299. biotite/structure/io/pdbqt/file.py +688 -0
  300. biotite/structure/io/pdbx/__init__.py +23 -0
  301. biotite/structure/io/pdbx/bcif.py +656 -0
  302. biotite/structure/io/pdbx/cif.py +1075 -0
  303. biotite/structure/io/pdbx/component.py +245 -0
  304. biotite/structure/io/pdbx/compress.py +321 -0
  305. biotite/structure/io/pdbx/convert.py +1745 -0
  306. biotite/structure/io/pdbx/encoding.cpython-313-darwin.so +0 -0
  307. biotite/structure/io/pdbx/encoding.pyx +1031 -0
  308. biotite/structure/io/trajfile.py +693 -0
  309. biotite/structure/io/trr/__init__.py +13 -0
  310. biotite/structure/io/trr/file.py +43 -0
  311. biotite/structure/io/xtc/__init__.py +13 -0
  312. biotite/structure/io/xtc/file.py +43 -0
  313. biotite/structure/mechanics.py +73 -0
  314. biotite/structure/molecules.py +352 -0
  315. biotite/structure/pseudoknots.py +628 -0
  316. biotite/structure/rdf.py +245 -0
  317. biotite/structure/repair.py +304 -0
  318. biotite/structure/residues.py +572 -0
  319. biotite/structure/sasa.cpython-313-darwin.so +0 -0
  320. biotite/structure/sasa.pyx +322 -0
  321. biotite/structure/segments.py +178 -0
  322. biotite/structure/sequence.py +111 -0
  323. biotite/structure/sse.py +308 -0
  324. biotite/structure/superimpose.py +689 -0
  325. biotite/structure/transform.py +530 -0
  326. biotite/structure/util.py +168 -0
  327. biotite/version.py +16 -0
  328. biotite/visualize.py +265 -0
  329. biotite-1.1.0.dist-info/METADATA +190 -0
  330. biotite-1.1.0.dist-info/RECORD +332 -0
  331. biotite-1.1.0.dist-info/WHEEL +4 -0
  332. biotite-1.1.0.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,567 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ import warnings
6
+ from numbers import Integral
7
+ import numpy as np
8
+ from biotite.sequence.align.alignment import get_codes
9
+ from biotite.sequence.alphabet import LetterAlphabet
10
+ from biotite.sequence.seqtypes import (
11
+ GeneralSequence,
12
+ NucleotideSequence,
13
+ ProteinSequence,
14
+ )
15
+
16
# Module metadata
__name__ = "biotite.sequence"
__author__ = "Maximilian Greil"
__all__ = ["SequenceProfile"]

# Abbreviations for the alphabets that are compared against the
# alphabet of a profile when its consensus sequence is requested
_NUC_DNA_ALPH = NucleotideSequence.alphabet_unamb
_NUC_RNA_ALPH = LetterAlphabet(list("ACGU"))
_PROT_ALPH = ProteinSequence.alphabet
24
+
25
+
26
+ def _determine_common_alphabet(alphabets):
27
+ """
28
+ Determine the common alphabet from a list of alphabets, that
29
+ extends all alphabets.
30
+ """
31
+ common_alphabet = alphabets[0]
32
+ for alphabet in alphabets[1:]:
33
+ if not common_alphabet.extends(alphabet):
34
+ if alphabet.extends(common_alphabet):
35
+ common_alphabet = alphabet
36
+ else:
37
+ raise ValueError(
38
+ "There is no common alphabet that extends all alphabets"
39
+ )
40
+ return common_alphabet
41
+
42
+
43
+ def _codes_to_iupac(frequency, codes, maxes, row):
44
+ """
45
+ Returns IUPAC code for a row of 'symbols' with none, one or
46
+ multiple maximum positions.
47
+ """
48
+ if np.sum(frequency) == 0:
49
+ raise ValueError(
50
+ f"There is an empty column in the 'symbols' frequency table. "
51
+ f"This doesn't make sense in context of an alignment. "
52
+ f"Please check the 'symbols' frequency table in row {row}."
53
+ )
54
+ key = tuple(np.where(frequency == maxes)[0])
55
+ return codes[key]
56
+
57
+
58
+ class SequenceProfile(object):
59
+ """
60
+ A :class:`SequenceProfile` object stores information about a
61
+ sequence profile of aligned sequences.
62
+ It is possible to calculate and return its consensus sequence.
63
+
64
+ This class saves the position frequency matrix
65
+ (position count matrix) 'symbols' of the occurrences of each
66
+ alphabet symbol at each position.
67
+ It also saves the number of gaps at each position in the array
68
+ 'gaps'.
69
+
70
+ With :meth:`from_alignment()` a :class:`SequenceProfile` object can
71
+ be created from an indefinite number of aligned sequences.
72
+
73
+ With :meth:`probability_matrix()` the position probability matrix
74
+ can be created based on 'symbols' and a pseudocount.
75
+
76
+ With :meth:`log_odds_matrix()` the position weight matrix can
77
+ be created based on the before calculated position probability
78
+ matrix and the background frequencies.
79
+
80
+ With :meth:`sequence_probability_from_matrix()` the probability of a
81
+ sequence can be calculated based on the before calculated position
82
+ probability matrix of this instance of object SequenceProfile.
83
+
84
+ With :meth:`sequence_score_from_matrix()` the score of a sequence
85
+ can be calculated based on the before calculated position weight
86
+ matrix of this instance of object SequenceProfile.
87
+
88
+ All attributes of this class are publicly accessible.
89
+
90
+ Parameters
91
+ ----------
92
+ symbols : ndarray, dtype=int, shape=(n,k)
93
+ This matrix simply saves for each position how often absolutely
94
+ each symbol is present.
95
+ gaps : ndarray, dtype=int, shape=n
96
+ Array which indicates the number of gaps at each position.
97
+ alphabet : Alphabet, length=k
98
+ Alphabet of sequences of sequence profile
99
+
100
+ Attributes
101
+ ----------
102
+ symbols : ndarray, dtype=int, shape=(n,k)
103
+ This matrix simply saves for each position how often absolutely
104
+ each symbol is present.
105
+ gaps : ndarray, dtype=int, shape=n
106
+ Array which indicates the number of gaps at each position.
107
+ alphabet : Alphabet, length=k
108
+ Alphabet of sequences of sequence profile
109
+
110
+ Examples
111
+ --------
112
+
113
+ Create a profile from a multiple sequence alignment:
114
+
115
+ >>> sequences = [
116
+ ... NucleotideSequence("CGCTCATTC"),
117
+ ... NucleotideSequence("CGCTATTC"),
118
+ ... NucleotideSequence("CCCTCAATC"),
119
+ ... ]
120
+ >>> msa, _, _, _ = align_multiple(
121
+ ... sequences, SubstitutionMatrix.std_nucleotide_matrix(), gap_penalty=-5
122
+ ... )
123
+ >>> print(msa)
124
+ CGCTCATTC
125
+ CGCT-ATTC
126
+ CCCTCAATC
127
+ >>> profile = SequenceProfile.from_alignment(msa)
128
+ >>> print(profile)
129
+ A C G T
130
+ 0 0 3 0 0
131
+ 1 0 1 2 0
132
+ 2 0 3 0 0
133
+ 3 0 0 0 3
134
+ 4 0 2 0 0
135
+ 5 3 0 0 0
136
+ 6 1 0 0 2
137
+ 7 0 0 0 3
138
+ 8 0 3 0 0
139
+ >>> print(profile.gaps)
140
+ [0 0 0 0 1 0 0 0 0]
141
+
142
+ Slice the profile (masks and index arrays are also supported):
143
+
144
+ >>> print(profile[2:])
145
+ A C G T
146
+ 0 0 3 0 0
147
+ 1 0 0 0 3
148
+ 2 0 2 0 0
149
+ 3 3 0 0 0
150
+ 4 1 0 0 2
151
+ 5 0 0 0 3
152
+ 6 0 3 0 0
153
+
154
+ Use the profile to compute the position probability matrix:
155
+
156
+ >>> print(profile.probability_matrix())
157
+ [[0.000 1.000 0.000 0.000]
158
+ [0.000 0.333 0.667 0.000]
159
+ [0.000 1.000 0.000 0.000]
160
+ [0.000 0.000 0.000 1.000]
161
+ [0.000 1.000 0.000 0.000]
162
+ [1.000 0.000 0.000 0.000]
163
+ [0.333 0.000 0.000 0.667]
164
+ [0.000 0.000 0.000 1.000]
165
+ [0.000 1.000 0.000 0.000]]
166
+ """
167
+
168
+ def __init__(self, symbols, gaps, alphabet):
169
+ self._symbols = symbols
170
+ self._gaps = gaps
171
+ self._alphabet = alphabet
172
+
173
+ if len(alphabet) != symbols.shape[1]:
174
+ raise ValueError(
175
+ f"The given alphabet doesn't have the same length "
176
+ f"({len(alphabet)}) as the number of columns "
177
+ f"({symbols.shape[1]}) in the 'symbols' frequency table."
178
+ )
179
+
180
+ if gaps.shape[0] != symbols.shape[0]:
181
+ raise ValueError(
182
+ f"The given 'gaps' position matrix doesn't have the same "
183
+ f"length ({gaps.shape[0]}) as the 'symbols' "
184
+ f"frequency table ({symbols.shape[0]})"
185
+ )
186
+
187
@property
def symbols(self):
    # The stored frequency table
    return self._symbols

@symbols.setter
def symbols(self, new_symbols):
    # A replacement table is only accepted if it keeps the shape
    if new_symbols.shape != self.symbols.shape:
        raise ValueError(
            f"New ndarray 'symbols' must be of same shape "
            f"{self.symbols.shape} as the old one"
        )
    self._symbols = new_symbols

@property
def gaps(self):
    # The stored gap counts
    return self._gaps

@gaps.setter
def gaps(self, new_gaps):
    # A replacement array is only accepted if it keeps the shape
    if new_gaps.shape != self.gaps.shape:
        raise ValueError(
            f"New ndarray 'gaps' must be of same shape "
            f"{self.gaps.shape} as the old one"
        )
    self._gaps = new_gaps

@property
def alphabet(self):
    # The stored alphabet; no setter is defined for it
    return self._alphabet
216
+
217
+ def __str__(self):
218
+ # Add an additional row and column for the position and symbol indicators
219
+ print_matrix = np.full(
220
+ (self.symbols.shape[0] + 1, self.symbols.shape[1] + 1), "", dtype=object
221
+ )
222
+ print_matrix[1:, 1:] = self.symbols.astype(str)
223
+ print_matrix[0, 1:] = [str(sym) for sym in self.alphabet]
224
+ print_matrix[1:, 0] = [str(i) for i in range(self.symbols.shape[0])]
225
+ max_len = len(max(print_matrix.flatten(), key=len))
226
+ return "\n".join(
227
+ [
228
+ " ".join([str(cell).rjust(max_len) for cell in row])
229
+ for row in print_matrix
230
+ ]
231
+ )
232
+
233
+ def __repr__(self):
234
+ return (
235
+ f"SequenceProfile(np.{np.array_repr(self.symbols)}, "
236
+ f"np.{np.array_repr(self.gaps)}, Alphabet({self.alphabet}))"
237
+ )
238
+
239
def __eq__(self, item):
    """
    Two profiles are equal, if their 'symbols', 'gaps' and alphabets
    are equal.
    Objects of any other type are never equal to a profile.
    """
    if not isinstance(item, SequenceProfile):
        return False
    return (
        np.array_equal(self.symbols, item.symbols)
        and np.array_equal(self.gaps, item.gaps)
        and bool(self.alphabet == item.alphabet)
    )
249
+
250
@staticmethod
def from_alignment(alignment, alphabet=None):
    """
    Get an object of :class:`SequenceProfile` from an object of
    :class:`Alignment`.

    Based on the sequences of the alignment, the SequenceProfile
    parameters symbols and gaps are calculated.

    Parameters
    ----------
    alignment : Alignment
        An Alignment object to create the SequenceProfile object
        from.
    alphabet : Alphabet, optional
        This alphabet will be used when creating the SequenceProfile
        object.
        It must extend the alphabet of every sequence in the
        alignment.
        If no alphabet is selected, the alphabet for this
        SequenceProfile object will be determined from the
        alphabets of the sequences in the alignment.
        (Default: None).

    Returns
    -------
    profile: SequenceProfile
        The created SequenceProfile object

    Raises
    ------
    ValueError
        If the given `alphabet` does not extend the alphabet of at
        least one sequence in the alignment.
    """
    codes = get_codes(alignment)
    sequence_alphabets = [seq.alphabet for seq in alignment.sequences]
    if alphabet is None:
        alphabet = _determine_common_alphabet(sequence_alphabets)
    elif not all(alphabet.extends(alph) for alph in sequence_alphabets):
        raise ValueError(
            "The given alphabet is incompatible with at least one "
            "alphabet of the given sequences"
        )

    n_symbols = len(alphabet)
    n_positions = len(codes[0])
    symbols = np.zeros((n_positions, n_symbols), dtype=int)
    gaps = np.zeros(n_positions, dtype=int)
    # Iterate over the alignment columns instead of the sequences
    for i, column in enumerate(np.transpose(codes)):
        # Entries equal to -1 are counted as gaps:
        # move them into an additional bin behind the symbol bins
        binned = np.where(column == -1, n_symbols, column)
        counts = np.bincount(binned, minlength=n_symbols + 1)
        symbols[i] = counts[:n_symbols]
        # Read the gap bin explicitly instead of the last bin
        gaps[i] = counts[n_symbols]
    return SequenceProfile(symbols, gaps, alphabet)
298
+
299
def to_consensus(self, as_general=False):
    """
    Get the consensus sequence for this SequenceProfile object.

    Parameters
    ----------
    as_general : bool
        If true, the consensus sequence is created by the general
        routine, irrespective of the alphabet of this profile.
        Otherwise, the routine is chosen based on the alphabet of
        this SequenceProfile object: a dedicated one for the DNA,
        RNA and protein alphabet, the general one for any other
        alphabet.
        (Default: False).

    Returns
    -------
    consensus: Sequence
        The calculated consensus sequence
    """
    # https://en.wikipedia.org/wiki/International_Union_of_Pure_and_Applied_Chemistry#Amino_acid_and_nucleotide_base_codes
    if as_general:
        return self._general_to_consensus()

    if self.alphabet == _NUC_DNA_ALPH:
        return NucleotideSequence(self._dna_to_consensus())
    if self.alphabet == _NUC_RNA_ALPH:
        return NucleotideSequence(self._rna_to_consensus())
    if self.alphabet == _PROT_ALPH:
        return self._prot_to_consensus()
    # No dedicated routine for this alphabet
    return self._general_to_consensus()
327
+
328
def _dna_to_consensus(self):
    """
    Create the consensus string for a DNA profile.

    For each row the columns with the maximum count are determined
    and translated into the corresponding (ambiguous) IUPAC letter.
    """
    codes = {
        # Single most frequent base
        (0,): "A",
        (1,): "C",
        (2,): "G",
        (3,): "T",
        # Two bases share the maximum
        (0, 2): "R",
        (1, 3): "Y",
        (1, 2): "S",
        (0, 3): "W",
        (2, 3): "K",
        (0, 1): "M",
        # Three bases share the maximum
        (1, 2, 3): "B",
        (0, 2, 3): "D",
        (0, 1, 3): "H",
        (0, 1, 2): "V",
        # All bases share the maximum
        (0, 1, 2, 3): "N",
    }
    row_maxima = np.max(self.symbols, axis=1)
    letters = [
        _codes_to_iupac(self.symbols[pos, :], codes, row_maxima[pos], pos)
        for pos in range(len(self.symbols))
    ]
    return "".join(letters)
351
+
352
def _rna_to_consensus(self):
    """
    Create the consensus string for an RNA profile.

    For each row the columns with the maximum count are determined
    and translated into the corresponding (ambiguous) IUPAC letter.
    """
    codes = {
        # Single most frequent base
        (0,): "A",
        (1,): "C",
        (2,): "G",
        (3,): "U",
        # Two bases share the maximum
        (0, 2): "R",
        (1, 3): "Y",
        (1, 2): "S",
        (0, 3): "W",
        (2, 3): "K",
        (0, 1): "M",
        # Three bases share the maximum
        (1, 2, 3): "B",
        (0, 2, 3): "D",
        (0, 1, 3): "H",
        (0, 1, 2): "V",
        # All bases share the maximum
        (0, 1, 2, 3): "N",
    }
    row_maxima = np.max(self.symbols, axis=1)
    letters = [
        _codes_to_iupac(self.symbols[pos, :], codes, row_maxima[pos], pos)
        for pos in range(len(self.symbols))
    ]
    return "".join(letters)
375
+
376
def _prot_to_consensus(self):
    """
    Create the consensus sequence for a protein profile.

    In case there is more than one symbol with the same maximal
    occurrences, the symbol with the lowest symbol code will be
    taken for the consensus sequence.
    In case the sum of occurrences of all symbols at a position is
    zero, the symbol ``'X'`` will be taken for the consensus sequence.

    Returns
    -------
    consensus : ProteinSequence
        The consensus sequence.
    """
    # Look the code of 'X' up instead of hard-coding its index,
    # so it cannot get out of sync with the protein alphabet
    unknown_code = _PROT_ALPH.encode("X")
    consensus = ProteinSequence()
    consensus.code = np.where(
        np.sum(self.symbols, axis=1) == 0,
        unknown_code,
        np.argmax(self.symbols, axis=1),
    )
    return consensus
388
+
389
def _general_to_consensus(self):
    """
    Create the consensus sequence as :class:`GeneralSequence`.

    For each position the symbol with the maximal count is taken.
    ``argmax`` picks the first maximum, hence ties (including
    positions where all counts are zero) resolve to the symbol with
    the lowest symbol code.
    """
    general_sequence = GeneralSequence(self.alphabet)
    most_frequent_codes = np.argmax(self.symbols, axis=1)
    general_sequence.code = most_frequent_codes
    return general_sequence
401
+
402
def probability_matrix(self, pseudocount=0):
    r"""
    Calculate the position probability matrix (PPM) from 'symbols'
    and the given pseudocount.
    The returned matrix has the same shape as 'symbols'.

    .. math::

        P(S) = \frac {C_S + \frac{c_p}{k}} {\sum_{i} C_i + c_p}

    :math:`S`: The symbol.

    :math:`C_S`: The count of symbol :math:`S` at the sequence
    position.

    :math:`c_p`: The pseudocount.

    :math:`k`: Length of the alphabet.

    Parameters
    ----------
    pseudocount: int, optional
        Amount that is distributed evenly over all symbols of a
        position in addition to the observed counts.
        (Default: 0)

    Returns
    -------
    ppm: ndarray, dtype=float, shape=(n,k)
        The calculated position probability matrix.

    Raises
    ------
    ValueError
        If `pseudocount` is negative.
    """
    if pseudocount < 0:
        raise ValueError("Pseudocount can not be smaller than zero.")
    n_symbols = self.symbols.shape[1]
    # Each symbol receives an equal share of the pseudocount
    adjusted_counts = self.symbols + pseudocount / n_symbols
    # Kept as column vector to divide each row by its own total
    position_totals = np.sum(self.symbols, axis=1, keepdims=True) + pseudocount
    return adjusted_counts / position_totals
438
+
439
def log_odds_matrix(self, background_frequencies=None, pseudocount=0):
    r"""
    Calculate the position weight matrix (PWM) from the
    position probability matrix (PPM) (with given pseudocount) and
    the background frequencies.
    The returned matrix has the same shape as 'symbols'.

    .. math::

        W(S) = \log_2 \left( \frac{P(S)}{B_S} \right)

    :math:`S`: The symbol.

    :math:`P(S)`: The probability of symbol :math:`S` at the
    sequence position.

    :math:`B_S`: The background frequency of symbol :math:`S`.

    Parameters
    ----------
    background_frequencies : ndarray, shape=(k,), dtype=float, optional
        The background frequencies for each symbol in the alphabet.
        By default, a uniform distribution is assumed.
    pseudocount : int, optional
        Amount added to the number of observed cases in order to change
        the expected probability of the PPM.
        (Default: 0)

    Returns
    -------
    pwm : ndarray, dtype=float, shape=(n,k)
        The calculated position weight matrix.
    """
    if background_frequencies is None:
        # Uniform distribution over all symbols of the alphabet
        background_frequencies = 1 / len(self.alphabet)
    ppm = self.probability_matrix(pseudocount=pseudocount)
    # A symbol that is missing at any position of the profile has a
    # probability of zero, so the logarithm would emit a warning:
    # silence it for this computation only
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        log_odds = np.log2(ppm / background_frequencies)
    return log_odds
480
+
481
def sequence_probability(self, sequence, pseudocount=0):
    r"""
    Calculate the probability of a sequence based on the
    position probability matrix (PPM).

    The sequence probability is the product of the probabilities of
    the respective symbol over all sequence positions.

    Parameters
    ----------
    sequence : Sequence
        The input sequence.
    pseudocount : int, optional
        Amount added to the number of observed cases in order to change
        the expected probability of the PPM.
        (Default: 0)

    Returns
    -------
    probability : float
        The calculated probability for the input sequence based on
        the PPM.

    Raises
    ------
    ValueError
        If the length of `sequence` differs from the number of
        positions in the PPM or if the PPM does not have the same
        shape as 'symbols'.
    """
    ppm = self.probability_matrix(pseudocount=pseudocount)
    seq_length = len(sequence)
    ppm_length = len(ppm)
    if seq_length != ppm_length:
        raise ValueError(
            f"The given sequence has a different length ({seq_length}) than "
            f"the position probability matrix ({ppm_length})."
        )
    if ppm.shape != self.symbols.shape:
        raise ValueError(
            f"Position probability matrix {ppm.shape} must be of same shape "
            f"as 'symbols' {self.symbols.shape}"
        )
    # Pick for each position the probability of the symbol found there
    positions = np.arange(seq_length)
    symbol_probabilities = ppm[positions, sequence.code]
    return np.prod(symbol_probabilities)
516
+
517
def sequence_score(self, sequence, background_frequencies=None, pseudocount=0):
    """
    Calculate the score of a sequence based on the
    position weight matrix (PWM).

    The score is the sum of the weights (log-odds scores) of
    the respective symbol over all sequence positions.

    Parameters
    ----------
    sequence : Sequence
        The input sequence.
    background_frequencies : ndarray, shape=(k,), dtype=float, optional
        The background frequencies for each symbol in the alphabet.
        By default a uniform distribution is assumed.
    pseudocount : int, optional
        Amount added to the number of observed cases in order to change
        the expected probability of the PPM.
        (Default: 0)

    Returns
    -------
    score : float
        The calculated score for the input sequence based on
        the PWM.

    Raises
    ------
    ValueError
        If the length of `sequence` differs from the number of
        positions in the PWM or if the PWM does not have the same
        shape as 'symbols'.
    """
    if background_frequencies is None:
        # Uniform distribution over all symbols of the alphabet
        background_frequencies = 1 / len(self.alphabet)
    pwm = self.log_odds_matrix(
        background_frequencies=background_frequencies, pseudocount=pseudocount
    )
    seq_length = len(sequence)
    pwm_length = len(pwm)
    if seq_length != pwm_length:
        raise ValueError(
            f"The given sequence has a different length ({seq_length}) than "
            f"the position weight matrix ({pwm_length})."
        )
    if pwm.shape != self.symbols.shape:
        raise ValueError(
            f"Position weight matrix {pwm.shape} must be of same shape "
            f"as 'symbols' {self.symbols.shape}"
        )
    # Pick for each position the weight of the symbol found there
    positions = np.arange(seq_length)
    symbol_weights = pwm[positions, sequence.code]
    return np.sum(symbol_weights)
559
+
560
def __getitem__(self, index):
    """
    Get the part of the profile selected by `index`.

    Parameters
    ----------
    index : int or slice or ndarray
        The index applied to the first axis of both 'symbols' and
        'gaps'.
        An integer selects a single position, but the result is still
        a profile (of length one): the position dimension is never
        collapsed.

    Returns
    -------
    profile : SequenceProfile
        A profile built from the selected rows of 'symbols', the
        selected entries of 'gaps' and the alphabet of this profile.
    """
    if isinstance(index, Integral):
        # Do not allow to collapse dimensions:
        # turn the integer into a slice covering exactly one position
        stop = index + 1
        # For 'index == -1' the stop would be 0, i.e. the slice
        # '-1:0' which is always empty: use an open end instead to
        # select the last position
        index = slice(index, stop if stop != 0 else None)
    return SequenceProfile(self.symbols[index], self.gaps[index], self.alphabet)
565
+
566
def __len__(self):
    """
    Get the length of the profile, i.e. the length of 'symbols'
    along its first axis.
    """
    n_positions = len(self.symbols)
    return n_positions
@@ -0,0 +1,118 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence"
6
+ __author__ = "Patrick Kunzmann"
7
+ __all__ = ["find_subsequence", "find_symbol", "find_symbol_first", "find_symbol_last"]
8
+
9
+ import numpy as np
10
+
11
+
12
def find_subsequence(sequence, query):
    """
    Find a subsequence in a sequence.

    Parameters
    ----------
    sequence : Sequence
        The sequence to find the subsequence in.
    query : Sequence
        The potential subsequence. The alphabet of `sequence` must
        extend the alphabet of `query`.

    Returns
    -------
    match_indices : ndarray, dtype=int
        The starting indices in `sequence`, where `query` has been
        found. The array is empty if no match has been found.

    Raises
    ------
    ValueError
        If the `sequence` alphabet does not extend the `query` alphabet.

    Examples
    --------

    >>> main_seq = NucleotideSequence("ACTGAATGA")
    >>> sub_seq = NucleotideSequence("TGA")
    >>> print(find_subsequence(main_seq, sub_seq))
    [2 6]

    """
    if not sequence.get_alphabet().extends(query.get_alphabet()):
        raise ValueError(
            "The alphabet of the sequence does not extend the alphabet of the query"
        )
    # Fetch the codes once instead of in every iteration
    sequence_code = sequence.code
    query_code = query.code
    frame_size = len(query)
    # Slide a frame of the query length over the sequence and compare
    # the symbol codes within the frame with the query codes
    match_indices = [
        i
        for i in range(len(sequence) - frame_size + 1)
        if np.array_equal(query_code, sequence_code[i : i + frame_size])
    ]
    # The integer dtype must be given explicitly: for an empty list
    # NumPy would create a float array, which is unusable as index array
    return np.array(match_indices, dtype=int)
53
+
54
+
55
def find_symbol(sequence, symbol):
    """
    Find all occurrences of a symbol in a sequence.

    Parameters
    ----------
    sequence : Sequence
        The sequence to find the symbol in.
    symbol : object
        The symbol to be found in `sequence`.

    Returns
    -------
    match_indices : ndarray
        The indices in `sequence`, where `symbol` has been found.
    """
    # Compare symbol codes instead of the symbols themselves
    symbol_code = sequence.get_alphabet().encode(symbol)
    is_match = sequence.code == symbol_code
    return np.nonzero(is_match)[0]
73
+
74
+
75
def find_symbol_first(sequence, symbol):
    """
    Find the first occurrence of a symbol in a sequence.

    Parameters
    ----------
    sequence : Sequence
        The sequence to find the symbol in.
    symbol : object
        The symbol to be found in `sequence`.

    Returns
    -------
    first_index : int
        The first index of `symbol` in `sequence`. If `symbol` is not in
        `sequence`, -1 is returned.
    """
    occurrences = find_symbol(sequence, symbol)
    # -1 signals that the symbol does not appear at all
    return np.min(occurrences) if len(occurrences) > 0 else -1
96
+
97
+
98
def find_symbol_last(sequence, symbol):
    """
    Find the last occurrence of a symbol in a sequence.

    Parameters
    ----------
    sequence : Sequence
        The sequence to find the symbol in.
    symbol : object
        The symbol to be found in `sequence`.

    Returns
    -------
    last_index : int
        The last index of `symbol` in `sequence`. If `symbol` is not in
        `sequence`, -1 is returned.
    """
    occurrences = find_symbol(sequence, symbol)
    # -1 signals that the symbol does not appear at all
    return np.max(occurrences) if len(occurrences) > 0 else -1