biotite 0.41.1__cp311-cp311-macosx_10_16_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (340) hide show
  1. biotite/__init__.py +19 -0
  2. biotite/application/__init__.py +43 -0
  3. biotite/application/application.py +265 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +505 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +83 -0
  8. biotite/application/blast/webapp.py +421 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +238 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +152 -0
  13. biotite/application/localapp.py +306 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +122 -0
  16. biotite/application/msaapp.py +374 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +254 -0
  19. biotite/application/muscle/app5.py +171 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +456 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +222 -0
  24. biotite/application/util.py +59 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +304 -0
  27. biotite/application/viennarna/rnafold.py +269 -0
  28. biotite/application/viennarna/rnaplot.py +187 -0
  29. biotite/application/viennarna/util.py +72 -0
  30. biotite/application/webapp.py +77 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/entrez/__init__.py +15 -0
  34. biotite/database/entrez/check.py +61 -0
  35. biotite/database/entrez/dbnames.py +89 -0
  36. biotite/database/entrez/download.py +223 -0
  37. biotite/database/entrez/key.py +44 -0
  38. biotite/database/entrez/query.py +223 -0
  39. biotite/database/error.py +15 -0
  40. biotite/database/pubchem/__init__.py +21 -0
  41. biotite/database/pubchem/download.py +260 -0
  42. biotite/database/pubchem/error.py +20 -0
  43. biotite/database/pubchem/query.py +827 -0
  44. biotite/database/pubchem/throttle.py +99 -0
  45. biotite/database/rcsb/__init__.py +13 -0
  46. biotite/database/rcsb/download.py +167 -0
  47. biotite/database/rcsb/query.py +959 -0
  48. biotite/database/uniprot/__init__.py +13 -0
  49. biotite/database/uniprot/check.py +32 -0
  50. biotite/database/uniprot/download.py +134 -0
  51. biotite/database/uniprot/query.py +209 -0
  52. biotite/file.py +251 -0
  53. biotite/sequence/__init__.py +73 -0
  54. biotite/sequence/align/__init__.py +49 -0
  55. biotite/sequence/align/alignment.py +658 -0
  56. biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
  57. biotite/sequence/align/banded.pyx +652 -0
  58. biotite/sequence/align/buckets.py +69 -0
  59. biotite/sequence/align/cigar.py +434 -0
  60. biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
  61. biotite/sequence/align/kmeralphabet.pyx +574 -0
  62. biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
  63. biotite/sequence/align/kmersimilarity.pyx +233 -0
  64. biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
  65. biotite/sequence/align/kmertable.pyx +3400 -0
  66. biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
  67. biotite/sequence/align/localgapped.pyx +892 -0
  68. biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
  69. biotite/sequence/align/localungapped.pyx +279 -0
  70. biotite/sequence/align/matrix.py +405 -0
  71. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  72. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  73. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  74. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  75. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  76. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  77. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  78. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  79. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  80. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  81. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  82. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  83. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  84. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  85. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  86. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  87. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  88. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  89. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  93. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  94. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  95. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  96. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  97. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  98. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  99. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  100. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  101. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  102. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  103. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  104. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  105. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  106. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  107. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  108. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  109. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  110. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  111. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  112. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  113. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  114. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  115. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  116. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  117. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  118. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  119. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  120. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  121. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  122. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  154. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  155. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  156. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  157. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  158. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  159. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  160. biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
  161. biotite/sequence/align/multiple.pyx +620 -0
  162. biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
  163. biotite/sequence/align/pairwise.pyx +587 -0
  164. biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
  165. biotite/sequence/align/permutation.pyx +305 -0
  166. biotite/sequence/align/primes.txt +821 -0
  167. biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
  168. biotite/sequence/align/selector.pyx +956 -0
  169. biotite/sequence/align/statistics.py +265 -0
  170. biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
  171. biotite/sequence/align/tracetable.pxd +64 -0
  172. biotite/sequence/align/tracetable.pyx +370 -0
  173. biotite/sequence/alphabet.py +566 -0
  174. biotite/sequence/annotation.py +829 -0
  175. biotite/sequence/codec.cpython-311-darwin.so +0 -0
  176. biotite/sequence/codec.pyx +155 -0
  177. biotite/sequence/codon.py +466 -0
  178. biotite/sequence/codon_tables.txt +202 -0
  179. biotite/sequence/graphics/__init__.py +33 -0
  180. biotite/sequence/graphics/alignment.py +1034 -0
  181. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  182. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  183. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  184. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  185. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  186. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  187. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  188. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  189. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  190. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  191. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  192. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  193. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  194. biotite/sequence/graphics/color_schemes/pb_flower.json +39 -0
  195. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  196. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  197. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  198. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  199. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  200. biotite/sequence/graphics/colorschemes.py +139 -0
  201. biotite/sequence/graphics/dendrogram.py +184 -0
  202. biotite/sequence/graphics/features.py +510 -0
  203. biotite/sequence/graphics/logo.py +110 -0
  204. biotite/sequence/graphics/plasmid.py +661 -0
  205. biotite/sequence/io/__init__.py +12 -0
  206. biotite/sequence/io/fasta/__init__.py +22 -0
  207. biotite/sequence/io/fasta/convert.py +273 -0
  208. biotite/sequence/io/fasta/file.py +278 -0
  209. biotite/sequence/io/fastq/__init__.py +19 -0
  210. biotite/sequence/io/fastq/convert.py +120 -0
  211. biotite/sequence/io/fastq/file.py +551 -0
  212. biotite/sequence/io/genbank/__init__.py +17 -0
  213. biotite/sequence/io/genbank/annotation.py +277 -0
  214. biotite/sequence/io/genbank/file.py +575 -0
  215. biotite/sequence/io/genbank/metadata.py +324 -0
  216. biotite/sequence/io/genbank/sequence.py +172 -0
  217. biotite/sequence/io/general.py +192 -0
  218. biotite/sequence/io/gff/__init__.py +26 -0
  219. biotite/sequence/io/gff/convert.py +133 -0
  220. biotite/sequence/io/gff/file.py +434 -0
  221. biotite/sequence/phylo/__init__.py +36 -0
  222. biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
  223. biotite/sequence/phylo/nj.pyx +221 -0
  224. biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
  225. biotite/sequence/phylo/tree.pyx +1169 -0
  226. biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
  227. biotite/sequence/phylo/upgma.pyx +164 -0
  228. biotite/sequence/profile.py +456 -0
  229. biotite/sequence/search.py +116 -0
  230. biotite/sequence/seqtypes.py +556 -0
  231. biotite/sequence/sequence.py +374 -0
  232. biotite/structure/__init__.py +132 -0
  233. biotite/structure/atoms.py +1455 -0
  234. biotite/structure/basepairs.py +1415 -0
  235. biotite/structure/bonds.cpython-311-darwin.so +0 -0
  236. biotite/structure/bonds.pyx +1933 -0
  237. biotite/structure/box.py +592 -0
  238. biotite/structure/celllist.cpython-311-darwin.so +0 -0
  239. biotite/structure/celllist.pyx +849 -0
  240. biotite/structure/chains.py +298 -0
  241. biotite/structure/charges.cpython-311-darwin.so +0 -0
  242. biotite/structure/charges.pyx +520 -0
  243. biotite/structure/compare.py +274 -0
  244. biotite/structure/density.py +114 -0
  245. biotite/structure/dotbracket.py +216 -0
  246. biotite/structure/error.py +31 -0
  247. biotite/structure/filter.py +585 -0
  248. biotite/structure/geometry.py +697 -0
  249. biotite/structure/graphics/__init__.py +13 -0
  250. biotite/structure/graphics/atoms.py +226 -0
  251. biotite/structure/graphics/rna.py +282 -0
  252. biotite/structure/hbond.py +409 -0
  253. biotite/structure/info/__init__.py +25 -0
  254. biotite/structure/info/atom_masses.json +121 -0
  255. biotite/structure/info/atoms.py +82 -0
  256. biotite/structure/info/bonds.py +145 -0
  257. biotite/structure/info/ccd/README.rst +8 -0
  258. biotite/structure/info/ccd/amino_acids.txt +1663 -0
  259. biotite/structure/info/ccd/carbohydrates.txt +1135 -0
  260. biotite/structure/info/ccd/components.bcif +0 -0
  261. biotite/structure/info/ccd/nucleotides.txt +798 -0
  262. biotite/structure/info/ccd.py +95 -0
  263. biotite/structure/info/groups.py +90 -0
  264. biotite/structure/info/masses.py +123 -0
  265. biotite/structure/info/misc.py +144 -0
  266. biotite/structure/info/radii.py +197 -0
  267. biotite/structure/info/standardize.py +196 -0
  268. biotite/structure/integrity.py +268 -0
  269. biotite/structure/io/__init__.py +30 -0
  270. biotite/structure/io/ctab.py +72 -0
  271. biotite/structure/io/dcd/__init__.py +13 -0
  272. biotite/structure/io/dcd/file.py +65 -0
  273. biotite/structure/io/general.py +257 -0
  274. biotite/structure/io/gro/__init__.py +14 -0
  275. biotite/structure/io/gro/file.py +343 -0
  276. biotite/structure/io/mmtf/__init__.py +21 -0
  277. biotite/structure/io/mmtf/assembly.py +214 -0
  278. biotite/structure/io/mmtf/convertarray.cpython-311-darwin.so +0 -0
  279. biotite/structure/io/mmtf/convertarray.pyx +341 -0
  280. biotite/structure/io/mmtf/convertfile.cpython-311-darwin.so +0 -0
  281. biotite/structure/io/mmtf/convertfile.pyx +501 -0
  282. biotite/structure/io/mmtf/decode.cpython-311-darwin.so +0 -0
  283. biotite/structure/io/mmtf/decode.pyx +152 -0
  284. biotite/structure/io/mmtf/encode.cpython-311-darwin.so +0 -0
  285. biotite/structure/io/mmtf/encode.pyx +183 -0
  286. biotite/structure/io/mmtf/file.py +233 -0
  287. biotite/structure/io/mol/__init__.py +20 -0
  288. biotite/structure/io/mol/convert.py +115 -0
  289. biotite/structure/io/mol/ctab.py +414 -0
  290. biotite/structure/io/mol/header.py +116 -0
  291. biotite/structure/io/mol/mol.py +193 -0
  292. biotite/structure/io/mol/sdf.py +916 -0
  293. biotite/structure/io/netcdf/__init__.py +13 -0
  294. biotite/structure/io/netcdf/file.py +63 -0
  295. biotite/structure/io/npz/__init__.py +20 -0
  296. biotite/structure/io/npz/file.py +152 -0
  297. biotite/structure/io/pdb/__init__.py +20 -0
  298. biotite/structure/io/pdb/convert.py +293 -0
  299. biotite/structure/io/pdb/file.py +1240 -0
  300. biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
  301. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  302. biotite/structure/io/pdbqt/__init__.py +15 -0
  303. biotite/structure/io/pdbqt/convert.py +107 -0
  304. biotite/structure/io/pdbqt/file.py +640 -0
  305. biotite/structure/io/pdbx/__init__.py +23 -0
  306. biotite/structure/io/pdbx/bcif.py +648 -0
  307. biotite/structure/io/pdbx/cif.py +1032 -0
  308. biotite/structure/io/pdbx/component.py +246 -0
  309. biotite/structure/io/pdbx/convert.py +1597 -0
  310. biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
  311. biotite/structure/io/pdbx/encoding.pyx +950 -0
  312. biotite/structure/io/pdbx/legacy.py +267 -0
  313. biotite/structure/io/tng/__init__.py +13 -0
  314. biotite/structure/io/tng/file.py +46 -0
  315. biotite/structure/io/trajfile.py +710 -0
  316. biotite/structure/io/trr/__init__.py +13 -0
  317. biotite/structure/io/trr/file.py +46 -0
  318. biotite/structure/io/xtc/__init__.py +13 -0
  319. biotite/structure/io/xtc/file.py +46 -0
  320. biotite/structure/mechanics.py +75 -0
  321. biotite/structure/molecules.py +353 -0
  322. biotite/structure/pseudoknots.py +642 -0
  323. biotite/structure/rdf.py +243 -0
  324. biotite/structure/repair.py +253 -0
  325. biotite/structure/residues.py +562 -0
  326. biotite/structure/resutil.py +178 -0
  327. biotite/structure/sasa.cpython-311-darwin.so +0 -0
  328. biotite/structure/sasa.pyx +322 -0
  329. biotite/structure/sequence.py +112 -0
  330. biotite/structure/sse.py +327 -0
  331. biotite/structure/superimpose.py +727 -0
  332. biotite/structure/transform.py +504 -0
  333. biotite/structure/util.py +98 -0
  334. biotite/temp.py +86 -0
  335. biotite/version.py +16 -0
  336. biotite/visualize.py +251 -0
  337. biotite-0.41.1.dist-info/METADATA +187 -0
  338. biotite-0.41.1.dist-info/RECORD +340 -0
  339. biotite-0.41.1.dist-info/WHEEL +4 -0
  340. biotite-0.41.1.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,566 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence"
6
+ __author__ = "Patrick Kunzmann"
7
+ __all__ = ["Alphabet", "LetterAlphabet", "AlphabetMapper", "AlphabetError",
8
+ "common_alphabet"]
9
+
10
+ import copy
11
+ from numbers import Integral
12
+ import string
13
+ import numpy as np
14
+ from .codec import encode_chars, decode_to_chars, map_sequence_code
15
+
16
+
17
class Alphabet(object):
    """
    This class defines the allowed symbols for a :class:`Sequence` and
    handles the encoding/decoding between symbols and symbol codes.

    An :class:`Alphabet` is created with the list of symbols, that can
    be used in this context.
    In most cases a symbol will be simply a letter, hence a string of
    length 1. But in principle every hashable Python object can serve
    as symbol.

    The encoding of a symbol into a symbol code is
    done in the following way: Find the first index in the symbol list,
    where the list element equals the symbol. This index is the
    symbol code. If the symbol is not found in the list, an
    :class:`AlphabetError` is raised.

    Internally, a dictionary is used for encoding, with symbols as keys
    and symbol codes as values. Therefore, every symbol must be
    hashable. For decoding the symbol list is indexed with the symbol
    code.

    If an alphabet *1* contains the same symbols and the same
    symbol-code-mappings like another alphabet *2*, but alphabet *1*
    introduces also new symbols, then alphabet *1* *extends* alphabet
    *2*.
    Per definition, every alphabet also extends itself.

    Objects of this class are immutable.

    Parameters
    ----------
    symbols : iterable object
        The symbols, that are allowed in this alphabet. The
        corresponding code for a symbol, is the index of that symbol
        in this list.
        Any iterable is accepted, including one-shot iterables such as
        generators.

    Examples
    --------
    Create an Alphabet containing DNA letters and encode/decode a
    letter/code:

    >>> alph = Alphabet(["A","C","G","T"])
    >>> print(alph.encode("G"))
    2
    >>> print(alph.decode(2))
    G
    >>> try:
    ...    alph.encode("foo")
    ... except Exception as e:
    ...    print(e)
    Symbol 'foo' is not in the alphabet

    Create an Alphabet of arbitrary objects:

    >>> alph = Alphabet(["foo", 42, (1,2,3), 5, 3.141])
    >>> print(alph.encode((1,2,3)))
    2
    >>> print(alph.decode(4))
    3.141

    On the subject of alphabet extension:
    An alphabet always extends itself.

    >>> Alphabet(["A","C","G","T"]).extends(Alphabet(["A","C","G","T"]))
    True

    An alphabet extends an alphabet when it contains additional symbols...

    >>> Alphabet(["A","C","G","T","U"]).extends(Alphabet(["A","C","G","T"]))
    True

    ...but not vice versa

    >>> Alphabet(["A","C","G","T"]).extends(Alphabet(["A","C","G","T","U"]))
    False

    Two alphabets with same symbols but different symbol-code-mappings

    >>> Alphabet(["A","C","G","T"]).extends(Alphabet(["A","C","T","G"]))
    False
    """

    def __init__(self, symbols):
        # Materialize the input exactly once:
        # 'symbols' may be an iterable without '__len__' or one that
        # is exhausted after a single pass (e.g. a generator)
        symbols = list(symbols)
        if len(symbols) == 0:
            raise ValueError("Symbol list is empty")
        # The stored list is a deep copy, so later changes to the
        # objects given by the caller do not affect the alphabet
        self._symbols = copy.deepcopy(symbols)
        self._symbol_dict = {}
        for i, symbol in enumerate(symbols):
            # 'setdefault()' keeps the first index of a repeated symbol,
            # as stated in the class documentation
            self._symbol_dict.setdefault(symbol, i)

    def __repr__(self):
        """Represent Alphabet as a string for debugging."""
        return f'Alphabet({self._symbols})'

    def get_symbols(self):
        """
        Get the symbols in the alphabet.

        Returns
        -------
        symbols : list
            Copy of the internal list of symbols.
        """
        return copy.deepcopy(self._symbols)

    def extends(self, alphabet):
        """
        Check, if this alphabet extends another alphabet.

        Parameters
        ----------
        alphabet : Alphabet
            The potential parent alphabet.

        Returns
        -------
        result : bool
            True, if this object extends `alphabet`, false otherwise.
        """
        if alphabet is self:
            return True
        elif len(alphabet) > len(self):
            return False
        else:
            # This alphabet extends the other one, if the symbol list
            # of the other one is a prefix of this symbol list
            return alphabet.get_symbols() \
                == self.get_symbols()[:len(alphabet)]

    def encode(self, symbol):
        """
        Use the alphabet to encode a symbol.

        Parameters
        ----------
        symbol : object
            The object to encode into a symbol code.

        Returns
        -------
        code : int
            The symbol code of `symbol`.

        Raises
        ------
        AlphabetError
            If `symbol` is not in the alphabet.
        """
        try:
            return self._symbol_dict[symbol]
        except KeyError:
            raise AlphabetError(
                f"Symbol {repr(symbol)} is not in the alphabet"
            )

    def decode(self, code):
        """
        Use the alphabet to decode a symbol code.

        Parameters
        ----------
        code : int
            The symbol code to be decoded.

        Returns
        -------
        symbol : object
            The symbol corresponding to `code`.

        Raises
        ------
        AlphabetError
            If `code` is not a valid code in the alphabet.
        """
        # Negative codes are rejected explicitly, as list indexing
        # would otherwise silently count from the end
        if code < 0 or code >= len(self._symbols):
            raise AlphabetError(f"'{code:d}' is not a valid code")
        return self._symbols[code]

    def encode_multiple(self, symbols, dtype=np.int64):
        """
        Encode a list of symbols.

        Parameters
        ----------
        symbols : array-like
            The symbols to encode.
        dtype : dtype, optional
            The dtype of the output ndarray. (Default: `int64`)

        Returns
        -------
        code : ndarray
            The sequence code.
        """
        return np.array([self.encode(e) for e in symbols], dtype=dtype)

    def decode_multiple(self, code):
        """
        Decode a sequence code into a list of symbols.

        Parameters
        ----------
        code : ndarray
            The sequence code to decode.

        Returns
        -------
        symbols : list
            The decoded list of symbols.
        """
        return [self.decode(c) for c in code]

    def is_letter_alphabet(self):
        """
        Check whether the symbols in this alphabet are single printable
        letters.
        If so, the alphabet could be expressed by a `LetterAlphabet`.

        Returns
        -------
        is_letter_alphabet : bool
            True, if all symbols in the alphabet are 'str' or 'bytes',
            have length 1 and are printable.
        """
        for symbol in self:
            # The length must be exactly 1: An empty string would
            # otherwise pass the containment check below, because an
            # empty 'bytes' object is contained in every 'bytes' object
            if not isinstance(symbol, (str, bytes)) or len(symbol) != 1:
                return False
            if isinstance(symbol, str):
                try:
                    symbol = symbol.encode("ASCII")
                except UnicodeEncodeError:
                    # A non-ASCII character is not a printable letter
                    return False
            if symbol not in LetterAlphabet.PRINATBLES:
                return False
        return True

    # The following methods go through 'get_symbols()' instead of the
    # internal attributes, so they also work for subclasses that
    # only override 'get_symbols()'

    def __str__(self):
        return str(self.get_symbols())

    def __len__(self):
        return len(self.get_symbols())

    def __iter__(self):
        return self.get_symbols().__iter__()

    def __contains__(self, symbol):
        return symbol in self.get_symbols()

    def __hash__(self):
        return hash(tuple(self._symbols))

    def __eq__(self, item):
        if item is self:
            return True
        if not isinstance(item, Alphabet):
            return False
        return self.get_symbols() == item.get_symbols()
271
+
272
+
273
class LetterAlphabet(Alphabet):
    """
    :class:`LetterAlphabet` is an :class:`Alphabet` subclass
    specialized for letter based alphabets, like DNA or protein
    sequence alphabets.
    The alphabet size is limited to the 94 printable, non-whitespace
    characters.
    Internally the symbols are saved as `bytes` objects.
    The encoding and decoding process is a lot faster than for a
    normal :class:`Alphabet`.

    The performance gain comes through the use of *NumPy* and *Cython*
    for encoding and decoding, without the need of a dictionary.

    Parameters
    ----------
    symbols : iterable object or str or bytes
        The symbols, that are allowed in this alphabet. The
        corresponding code for a symbol, is the index of that symbol
        in this list.
    """

    PRINATBLES = (string.digits + string.ascii_letters + string.punctuation) \
                 .encode("ASCII")
    # Correctly spelled alias: The misspelled name is kept, as it is
    # referenced by existing code
    PRINTABLES = PRINATBLES

    def __init__(self, symbols):
        if isinstance(symbols, bytes):
            # Iterating over a 'bytes' object yields integers
            # -> slice it into 'bytes' objects of length 1 instead
            symbols = [symbols[i:i+1] for i in range(len(symbols))]
        else:
            # Supports iterables without '__len__' as well
            symbols = list(symbols)
        if len(symbols) == 0:
            raise ValueError("Symbol list is empty")
        letters = []
        for symbol in symbols:
            # The length must be exactly 1: An empty symbol would
            # otherwise pass the containment check below
            if not isinstance(symbol, (str, bytes)) or len(symbol) != 1:
                raise ValueError(f"Symbol '{symbol}' is not a single letter")
            if isinstance(symbol, str):
                try:
                    symbol = symbol.encode("ASCII")
                except UnicodeEncodeError:
                    # Non-ASCII characters are reported like any other
                    # unsupported character
                    raise ValueError(
                        f"Symbol {repr(symbol)} is not printable or whitespace"
                    ) from None
            if symbol not in LetterAlphabet.PRINATBLES:
                raise ValueError(
                    f"Symbol {repr(symbol)} is not printable or whitespace"
                )
            letters.append(symbol)
        # Direct 'astype' conversion is not allowed by numpy
        # -> frombuffer()
        self._symbols = np.frombuffer(
            np.array(letters, dtype="|S1"),
            dtype=np.ubyte
        )

    def __repr__(self):
        """Represent LetterAlphabet as a string for debugging."""
        return f'LetterAlphabet({self.get_symbols()})'

    def extends(self, alphabet):
        """
        Check, if this alphabet extends another alphabet.

        Parameters
        ----------
        alphabet : Alphabet
            The potential parent alphabet.

        Returns
        -------
        result : bool
            True, if this object extends `alphabet`, false otherwise.
        """
        if alphabet is self:
            return True
        elif type(alphabet) is LetterAlphabet:
            # Fast path: Compare the byte arrays directly
            if len(alphabet._symbols) > len(self._symbols):
                return False
            # Convert to a builtin 'bool', as 'np.all()' gives 'np.bool_'
            return bool(np.all(
                alphabet._symbols == self._symbols[:len(alphabet._symbols)]
            ))
        else:
            return super().extends(alphabet)

    def get_symbols(self):
        """
        Get the symbols in the alphabet.

        Returns
        -------
        symbols : list
            The symbols of the alphabet as list of `str` objects.
        """
        return [symbol.decode("ASCII") for symbol
                in self._symbols_as_bytes()]

    def encode(self, symbol):
        """
        Use the alphabet to encode a symbol.

        Parameters
        ----------
        symbol : str or bytes
            The single letter to encode into a symbol code.

        Returns
        -------
        code : int
            The symbol code of `symbol`.

        Raises
        ------
        AlphabetError
            If `symbol` is not a single letter or is not in the
            alphabet.
        """
        # 'ord()' requires exactly one character
        if not isinstance(symbol, (str, bytes)) or len(symbol) != 1:
            raise AlphabetError(f"Symbol '{symbol}' is not a single letter")
        indices = np.where(self._symbols == ord(symbol))[0]
        if len(indices) == 0:
            raise AlphabetError(
                f"Symbol {repr(symbol)} is not in the alphabet"
            )
        # Return a builtin 'int' like the superclass does
        return int(indices[0])

    def decode(self, code, as_bytes=False):
        """
        Use the alphabet to decode a symbol code.

        Parameters
        ----------
        code : int
            The symbol code to be decoded.
        as_bytes : bool, optional
            If true, the symbol is returned as `bytes` object of
            length 1.
            Otherwise, the symbol is returned as `str`.

        Returns
        -------
        symbol : str or bytes
            The symbol corresponding to `code`.

        Raises
        ------
        AlphabetError
            If `code` is not a valid code in the alphabet.
        """
        if code < 0 or code >= len(self._symbols):
            raise AlphabetError(f"'{code:d}' is not a valid code")
        value = int(self._symbols[code])
        if as_bytes:
            return bytes((value,))
        return chr(value)

    def encode_multiple(self, symbols, dtype=None):
        """
        Encode multiple symbols.

        Parameters
        ----------
        symbols : iterable object or str or bytes
            The symbols to encode. The method is fastest when a
            :class:`ndarray`, :class:`str` or :class:`bytes` object
            containing the symbols is provided, instead of e.g. a list.
        dtype : dtype, optional
            For compatibility with superclass. The value is ignored

        Returns
        -------
        code : ndarray
            The sequence code.
        """
        # Bring every supported input type into the same form:
        # an array of byte values
        if isinstance(symbols, str):
            symbols = np.frombuffer(symbols.encode("ASCII"), dtype=np.ubyte)
        elif isinstance(symbols, bytes):
            symbols = np.frombuffer(symbols, dtype=np.ubyte)
        elif isinstance(symbols, np.ndarray):
            symbols = np.frombuffer(
                symbols.astype(dtype="|S1"), dtype=np.ubyte
            )
        else:
            symbols = np.frombuffer(
                np.array(list(symbols), dtype="|S1"),
                dtype=np.ubyte
            )
        return encode_chars(alphabet=self._symbols, symbols=symbols)

    def decode_multiple(self, code, as_bytes=False):
        """
        Decode a sequence code into a list of symbols.

        Parameters
        ----------
        code : ndarray, dtype=uint8
            The sequence code to decode.
            Works fastest if a :class:`ndarray` is provided.
        as_bytes : bool, optional
            If true, the output array will contain `bytes`
            (dtype 'S1').
            Otherwise, the output array will contain `str`
            (dtype 'U1').

        Returns
        -------
        symbols : ndarray, dtype='U1' or dtype='S1'
            The decoded list of symbols.
        """
        if not isinstance(code, np.ndarray):
            code = np.array(code, dtype=np.uint8)
        code = code.astype(np.uint8, copy=False)
        symbols = decode_to_chars(alphabet=self._symbols, code=code)
        # Symbols must be converted from 'np.ubyte' to '|S1'
        symbols = np.frombuffer(symbols, dtype="|S1")
        if not as_bytes:
            symbols = symbols.astype("U1")
        return symbols

    def __contains__(self, symbol):
        # Anything that is not a single letter cannot be part of the
        # alphabet; this check also prevents a 'TypeError' in 'ord()'
        if not isinstance(symbol, (str, bytes)) or len(symbol) != 1:
            return False
        return ord(symbol) in self._symbols

    def __len__(self):
        return len(self._symbols)

    def __hash__(self):
        # The inherited '__eq__()' compares the decoded symbols
        # -> hash the same values, so that equal alphabets have equal
        # hashes, regardless of the internal byte representation
        return hash(tuple(self.get_symbols()))

    def _symbols_as_bytes(self):
        "Properly convert from dtype 'np.ubyte' to '|S1'"
        return np.frombuffer(self._symbols, dtype="|S1")
436
+
437
+
438
+
439
class AlphabetMapper(object):
    """
    Convert symbol codes of a source alphabet into the symbol codes
    of a target alphabet.

    The conversion keeps the symbol itself unchanged, only the code
    that represents it is translated.
    An instance can be indexed with a single symbol code as well as
    with an entire sequence code.

    Parameters
    ----------
    source_alphabet, target_alphabet : Alphabet
        The codes are converted from the source alphabet into the
        target alphabet.
        The target alphabet must contain at least all symbols of the
        source alphabet, but it is not required that the shared symbols
        are in the same order.

    Examples
    --------

    >>> source_alph = Alphabet(["A","C","G","T"])
    >>> target_alph = Alphabet(["T","U","A","G","C"])
    >>> mapper = AlphabetMapper(source_alph, target_alph)
    >>> print(mapper[0])
    2
    >>> print(mapper[1])
    4
    >>> print(mapper[[1,1,3]])
    [4 4 0]
    >>> in_sequence = GeneralSequence(source_alph, "GCCTAT")
    >>> print(in_sequence.code)
    [2 1 1 3 0 3]
    >>> print(in_sequence)
    GCCTAT
    >>> out_sequence = GeneralSequence(target_alph)
    >>> out_sequence.code = mapper[in_sequence.code]
    >>> print(out_sequence.code)
    [3 4 4 0 2 0]
    >>> print(out_sequence)
    GCCTAT
    """

    def __init__(self, source_alphabet, target_alphabet):
        # If the target extends the source, the codes are passed
        # through unchanged and no lookup table is built
        self._necessary_mapping = not target_alphabet.extends(
            source_alphabet
        )
        if self._necessary_mapping:
            source_size = len(source_alphabet)
            self._mapper = np.zeros(
                source_size,
                dtype=AlphabetMapper._dtype(len(target_alphabet))
            )
            # Lookup table: index = source code, value = target code
            for source_code in range(source_size):
                self._mapper[source_code] = target_alphabet.encode(
                    source_alphabet.decode(source_code)
                )

    def __getitem__(self, code):
        # Single symbol code
        if isinstance(code, Integral):
            return self._mapper[code] if self._necessary_mapping else code

        # Sequence code: anything but an unsigned integer array is
        # converted into one
        is_unsigned_array = isinstance(code, np.ndarray) and code.dtype in (
            np.uint8, np.uint16, np.uint32, np.uint64
        )
        if not is_unsigned_array:
            code = np.array(code, dtype=np.uint64)

        if not self._necessary_mapping:
            return code

        mapped_code = np.empty(len(code), dtype=self._mapper.dtype)
        map_sequence_code(self._mapper, code, mapped_code)
        return mapped_code

    @staticmethod
    def _dtype(alphabet_size):
        # Pick the first unsigned integer type, from narrow to wide,
        # whose number of representable values covers the alphabet size
        for candidate in (np.uint8, np.uint16, np.uint32):
            if alphabet_size <= np.iinfo(candidate).max + 1:
                return candidate
        return np.uint64
531
+
532
+
533
class AlphabetError(Exception):
    """
    Raised when a code or a symbol is not part of an
    :class:`Alphabet`.
    """
539
+
540
+
541
def common_alphabet(alphabets):
    """
    Find the alphabet among the given alphabets that extends all
    of them.

    Parameters
    ----------
    alphabets : iterable of Alphabet
        The alphabets from which the common one should be identified.

    Returns
    -------
    common_alphabet : Alphabet or None
        The alphabet from `alphabets` that extends all alphabets.
        ``None`` if no such common alphabet exists.
    """
    candidate = None
    for current in alphabets:
        if candidate is None:
            # Nothing chosen yet: start with this alphabet
            candidate = current
            continue
        if candidate.extends(current):
            # The present candidate already covers this alphabet
            continue
        if not current.extends(candidate):
            # Neither one extends the other
            return None
        # This alphabet covers the present candidate, so it takes over
        candidate = current
    return candidate