biotite 1.6.0__cp314-cp314-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (354) hide show
  1. biotite/__init__.py +18 -0
  2. biotite/application/__init__.py +69 -0
  3. biotite/application/application.py +276 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +500 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +92 -0
  8. biotite/application/blast/webapp.py +426 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +223 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +216 -0
  13. biotite/application/localapp.py +342 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +116 -0
  16. biotite/application/msaapp.py +363 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +227 -0
  19. biotite/application/muscle/app5.py +163 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +447 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +199 -0
  24. biotite/application/util.py +77 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +310 -0
  27. biotite/application/viennarna/rnafold.py +254 -0
  28. biotite/application/viennarna/rnaplot.py +208 -0
  29. biotite/application/viennarna/util.py +77 -0
  30. biotite/application/webapp.py +76 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/afdb/__init__.py +12 -0
  34. biotite/database/afdb/download.py +202 -0
  35. biotite/database/entrez/__init__.py +15 -0
  36. biotite/database/entrez/check.py +66 -0
  37. biotite/database/entrez/dbnames.py +101 -0
  38. biotite/database/entrez/download.py +224 -0
  39. biotite/database/entrez/key.py +44 -0
  40. biotite/database/entrez/query.py +263 -0
  41. biotite/database/error.py +16 -0
  42. biotite/database/pubchem/__init__.py +21 -0
  43. biotite/database/pubchem/download.py +259 -0
  44. biotite/database/pubchem/error.py +30 -0
  45. biotite/database/pubchem/query.py +819 -0
  46. biotite/database/pubchem/throttle.py +98 -0
  47. biotite/database/rcsb/__init__.py +13 -0
  48. biotite/database/rcsb/download.py +191 -0
  49. biotite/database/rcsb/query.py +963 -0
  50. biotite/database/uniprot/__init__.py +13 -0
  51. biotite/database/uniprot/check.py +40 -0
  52. biotite/database/uniprot/download.py +127 -0
  53. biotite/database/uniprot/query.py +292 -0
  54. biotite/file.py +244 -0
  55. biotite/interface/__init__.py +19 -0
  56. biotite/interface/openmm/__init__.py +20 -0
  57. biotite/interface/openmm/state.py +93 -0
  58. biotite/interface/openmm/system.py +227 -0
  59. biotite/interface/pymol/__init__.py +201 -0
  60. biotite/interface/pymol/cgo.py +346 -0
  61. biotite/interface/pymol/convert.py +185 -0
  62. biotite/interface/pymol/display.py +267 -0
  63. biotite/interface/pymol/object.py +1228 -0
  64. biotite/interface/pymol/shapes.py +178 -0
  65. biotite/interface/pymol/startup.py +169 -0
  66. biotite/interface/rdkit/__init__.py +19 -0
  67. biotite/interface/rdkit/mol.py +491 -0
  68. biotite/interface/version.py +94 -0
  69. biotite/interface/warning.py +19 -0
  70. biotite/sequence/__init__.py +84 -0
  71. biotite/sequence/align/__init__.py +199 -0
  72. biotite/sequence/align/alignment.py +763 -0
  73. biotite/sequence/align/banded.cp314-win_amd64.pyd +0 -0
  74. biotite/sequence/align/banded.pyx +652 -0
  75. biotite/sequence/align/buckets.py +71 -0
  76. biotite/sequence/align/cigar.py +425 -0
  77. biotite/sequence/align/kmeralphabet.cp314-win_amd64.pyd +0 -0
  78. biotite/sequence/align/kmeralphabet.pyx +595 -0
  79. biotite/sequence/align/kmersimilarity.cp314-win_amd64.pyd +0 -0
  80. biotite/sequence/align/kmersimilarity.pyx +233 -0
  81. biotite/sequence/align/kmertable.cp314-win_amd64.pyd +0 -0
  82. biotite/sequence/align/kmertable.pyx +3411 -0
  83. biotite/sequence/align/localgapped.cp314-win_amd64.pyd +0 -0
  84. biotite/sequence/align/localgapped.pyx +892 -0
  85. biotite/sequence/align/localungapped.cp314-win_amd64.pyd +0 -0
  86. biotite/sequence/align/localungapped.pyx +279 -0
  87. biotite/sequence/align/matrix.py +631 -0
  88. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  89. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  93. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  94. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  95. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  96. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  97. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  98. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  99. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  100. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  101. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  102. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  103. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  104. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  105. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  106. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  107. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  108. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  109. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  110. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  111. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  112. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  113. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  114. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  115. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  116. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  117. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  118. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  119. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  120. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  121. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  122. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  154. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  155. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  156. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  157. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  158. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  159. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  160. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  161. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  162. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  163. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  164. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  165. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  166. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  167. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  168. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  169. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  170. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  171. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  172. biotite/sequence/align/matrix_data/PB.license +21 -0
  173. biotite/sequence/align/matrix_data/PB.mat +18 -0
  174. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  175. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  176. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  177. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  178. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  179. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  180. biotite/sequence/align/multiple.cp314-win_amd64.pyd +0 -0
  181. biotite/sequence/align/multiple.pyx +619 -0
  182. biotite/sequence/align/pairwise.cp314-win_amd64.pyd +0 -0
  183. biotite/sequence/align/pairwise.pyx +585 -0
  184. biotite/sequence/align/permutation.cp314-win_amd64.pyd +0 -0
  185. biotite/sequence/align/permutation.pyx +313 -0
  186. biotite/sequence/align/primes.txt +821 -0
  187. biotite/sequence/align/selector.cp314-win_amd64.pyd +0 -0
  188. biotite/sequence/align/selector.pyx +954 -0
  189. biotite/sequence/align/statistics.py +264 -0
  190. biotite/sequence/align/tracetable.cp314-win_amd64.pyd +0 -0
  191. biotite/sequence/align/tracetable.pxd +64 -0
  192. biotite/sequence/align/tracetable.pyx +370 -0
  193. biotite/sequence/alphabet.py +555 -0
  194. biotite/sequence/annotation.py +836 -0
  195. biotite/sequence/codec.cp314-win_amd64.pyd +0 -0
  196. biotite/sequence/codec.pyx +155 -0
  197. biotite/sequence/codon.py +476 -0
  198. biotite/sequence/codon_tables.txt +202 -0
  199. biotite/sequence/graphics/__init__.py +33 -0
  200. biotite/sequence/graphics/alignment.py +1101 -0
  201. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  202. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  203. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  204. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  205. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  206. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  207. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  208. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  209. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  210. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  211. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  212. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  213. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  214. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  215. biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
  216. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  217. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  218. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  219. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  220. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  221. biotite/sequence/graphics/colorschemes.py +170 -0
  222. biotite/sequence/graphics/dendrogram.py +231 -0
  223. biotite/sequence/graphics/features.py +544 -0
  224. biotite/sequence/graphics/logo.py +102 -0
  225. biotite/sequence/graphics/plasmid.py +712 -0
  226. biotite/sequence/io/__init__.py +12 -0
  227. biotite/sequence/io/fasta/__init__.py +22 -0
  228. biotite/sequence/io/fasta/convert.py +462 -0
  229. biotite/sequence/io/fasta/file.py +265 -0
  230. biotite/sequence/io/fastq/__init__.py +19 -0
  231. biotite/sequence/io/fastq/convert.py +117 -0
  232. biotite/sequence/io/fastq/file.py +507 -0
  233. biotite/sequence/io/genbank/__init__.py +17 -0
  234. biotite/sequence/io/genbank/annotation.py +269 -0
  235. biotite/sequence/io/genbank/file.py +573 -0
  236. biotite/sequence/io/genbank/metadata.py +336 -0
  237. biotite/sequence/io/genbank/sequence.py +173 -0
  238. biotite/sequence/io/general.py +201 -0
  239. biotite/sequence/io/gff/__init__.py +26 -0
  240. biotite/sequence/io/gff/convert.py +128 -0
  241. biotite/sequence/io/gff/file.py +449 -0
  242. biotite/sequence/phylo/__init__.py +36 -0
  243. biotite/sequence/phylo/nj.cp314-win_amd64.pyd +0 -0
  244. biotite/sequence/phylo/nj.pyx +221 -0
  245. biotite/sequence/phylo/tree.cp314-win_amd64.pyd +0 -0
  246. biotite/sequence/phylo/tree.pyx +1169 -0
  247. biotite/sequence/phylo/upgma.cp314-win_amd64.pyd +0 -0
  248. biotite/sequence/phylo/upgma.pyx +164 -0
  249. biotite/sequence/profile.py +561 -0
  250. biotite/sequence/search.py +117 -0
  251. biotite/sequence/seqtypes.py +720 -0
  252. biotite/sequence/sequence.py +373 -0
  253. biotite/setup_ccd.py +197 -0
  254. biotite/structure/__init__.py +135 -0
  255. biotite/structure/alphabet/__init__.py +25 -0
  256. biotite/structure/alphabet/encoder.py +332 -0
  257. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  258. biotite/structure/alphabet/i3d.py +109 -0
  259. biotite/structure/alphabet/layers.py +86 -0
  260. biotite/structure/alphabet/pb.license +21 -0
  261. biotite/structure/alphabet/pb.py +170 -0
  262. biotite/structure/alphabet/unkerasify.py +128 -0
  263. biotite/structure/atoms.py +1596 -0
  264. biotite/structure/basepairs.py +1403 -0
  265. biotite/structure/bonds.cp314-win_amd64.pyd +0 -0
  266. biotite/structure/bonds.pyx +2036 -0
  267. biotite/structure/box.py +724 -0
  268. biotite/structure/celllist.cp314-win_amd64.pyd +0 -0
  269. biotite/structure/celllist.pyx +864 -0
  270. biotite/structure/chains.py +310 -0
  271. biotite/structure/charges.cp314-win_amd64.pyd +0 -0
  272. biotite/structure/charges.pyx +521 -0
  273. biotite/structure/compare.py +683 -0
  274. biotite/structure/density.py +109 -0
  275. biotite/structure/dotbracket.py +213 -0
  276. biotite/structure/error.py +39 -0
  277. biotite/structure/filter.py +646 -0
  278. biotite/structure/geometry.py +817 -0
  279. biotite/structure/graphics/__init__.py +13 -0
  280. biotite/structure/graphics/atoms.py +243 -0
  281. biotite/structure/graphics/rna.py +298 -0
  282. biotite/structure/hbond.py +426 -0
  283. biotite/structure/info/__init__.py +24 -0
  284. biotite/structure/info/atom_masses.json +121 -0
  285. biotite/structure/info/atoms.py +98 -0
  286. biotite/structure/info/bonds.py +149 -0
  287. biotite/structure/info/ccd.py +200 -0
  288. biotite/structure/info/components.bcif +0 -0
  289. biotite/structure/info/groups.py +128 -0
  290. biotite/structure/info/masses.py +121 -0
  291. biotite/structure/info/misc.py +137 -0
  292. biotite/structure/info/radii.py +267 -0
  293. biotite/structure/info/standardize.py +185 -0
  294. biotite/structure/integrity.py +213 -0
  295. biotite/structure/io/__init__.py +29 -0
  296. biotite/structure/io/dcd/__init__.py +13 -0
  297. biotite/structure/io/dcd/file.py +67 -0
  298. biotite/structure/io/general.py +243 -0
  299. biotite/structure/io/gro/__init__.py +14 -0
  300. biotite/structure/io/gro/file.py +343 -0
  301. biotite/structure/io/mol/__init__.py +20 -0
  302. biotite/structure/io/mol/convert.py +112 -0
  303. biotite/structure/io/mol/ctab.py +420 -0
  304. biotite/structure/io/mol/header.py +120 -0
  305. biotite/structure/io/mol/mol.py +149 -0
  306. biotite/structure/io/mol/sdf.py +940 -0
  307. biotite/structure/io/netcdf/__init__.py +13 -0
  308. biotite/structure/io/netcdf/file.py +64 -0
  309. biotite/structure/io/pdb/__init__.py +20 -0
  310. biotite/structure/io/pdb/convert.py +389 -0
  311. biotite/structure/io/pdb/file.py +1380 -0
  312. biotite/structure/io/pdb/hybrid36.cp314-win_amd64.pyd +0 -0
  313. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  314. biotite/structure/io/pdbqt/__init__.py +15 -0
  315. biotite/structure/io/pdbqt/convert.py +113 -0
  316. biotite/structure/io/pdbqt/file.py +688 -0
  317. biotite/structure/io/pdbx/__init__.py +23 -0
  318. biotite/structure/io/pdbx/bcif.py +674 -0
  319. biotite/structure/io/pdbx/cif.py +1091 -0
  320. biotite/structure/io/pdbx/component.py +251 -0
  321. biotite/structure/io/pdbx/compress.py +362 -0
  322. biotite/structure/io/pdbx/convert.py +2122 -0
  323. biotite/structure/io/pdbx/encoding.cp314-win_amd64.pyd +0 -0
  324. biotite/structure/io/pdbx/encoding.pyx +1078 -0
  325. biotite/structure/io/trajfile.py +696 -0
  326. biotite/structure/io/trr/__init__.py +13 -0
  327. biotite/structure/io/trr/file.py +43 -0
  328. biotite/structure/io/util.py +38 -0
  329. biotite/structure/io/xtc/__init__.py +13 -0
  330. biotite/structure/io/xtc/file.py +43 -0
  331. biotite/structure/mechanics.py +72 -0
  332. biotite/structure/molecules.py +337 -0
  333. biotite/structure/pseudoknots.py +622 -0
  334. biotite/structure/rdf.py +245 -0
  335. biotite/structure/repair.py +302 -0
  336. biotite/structure/residues.py +716 -0
  337. biotite/structure/rings.py +452 -0
  338. biotite/structure/sasa.cp314-win_amd64.pyd +0 -0
  339. biotite/structure/sasa.pyx +322 -0
  340. biotite/structure/segments.py +328 -0
  341. biotite/structure/sequence.py +110 -0
  342. biotite/structure/spacegroups.json +1567 -0
  343. biotite/structure/spacegroups.license +26 -0
  344. biotite/structure/sse.py +306 -0
  345. biotite/structure/superimpose.py +511 -0
  346. biotite/structure/tm.py +581 -0
  347. biotite/structure/transform.py +736 -0
  348. biotite/structure/util.py +160 -0
  349. biotite/version.py +34 -0
  350. biotite/visualize.py +375 -0
  351. biotite-1.6.0.dist-info/METADATA +162 -0
  352. biotite-1.6.0.dist-info/RECORD +354 -0
  353. biotite-1.6.0.dist-info/WHEEL +4 -0
  354. biotite-1.6.0.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,12 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ """
6
+ A subpackage for reading and writing sequence related data.
7
+ """
8
+
9
+ __name__ = "biotite.sequence.io"
10
+ __author__ = "Patrick Kunzmann"
11
+
12
+ from .general import *
@@ -0,0 +1,22 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ """
6
+ This subpackage is used for reading and writing sequence objects
7
+ using the popular FASTA format.
8
+
9
+ This package contains the :class:`FastaFile`, which provides a
10
+ dictionary like interface to FASTA files, where the header lines are
11
+ keys and the strings containing sequence data are the corresponding
12
+ values.
13
+
14
+ Furthermore, the package contains convenience functions for
15
+ getting/setting directly :class:`Sequence` objects, rather than strings.
16
+ """
17
+
18
+ __name__ = "biotite.sequence.io.fasta"
19
+ __author__ = "Patrick Kunzmann"
20
+
21
+ from .convert import *
22
+ from .file import *
@@ -0,0 +1,462 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence.io.fasta"
6
+ __author__ = "Patrick Kunzmann"
7
+
8
+ import functools
9
+ import warnings
10
+ from collections import OrderedDict
11
+ import numpy as np
12
+ from biotite.sequence.align.alignment import Alignment, get_codes
13
+ from biotite.sequence.alphabet import AlphabetError, LetterAlphabet
14
+ from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
15
+
16
+ __all__ = [
17
+ "get_sequence",
18
+ "get_sequences",
19
+ "set_sequence",
20
+ "set_sequences",
21
+ "get_alignment",
22
+ "set_alignment",
23
+ "get_a3m_alignments",
24
+ "set_a3m_alignments",
25
+ ]
26
+
27
+
28
def get_sequence(fasta_file, header=None, seq_type=None):
    """
    Read a single sequence from a :class:`FastaFile` instance.

    Unless `seq_type` is given, the sequence type is guessed from the
    sequence string: a conversion into a :class:`NucleotideSequence` is
    attempted first, followed by a conversion into a
    :class:`ProteinSequence`.

    Parameters
    ----------
    fasta_file : FastaFile
        The :class:`FastaFile` to read from.
    header : str, optional
        The header of the sequence to read.
        If omitted, the first sequence in the file is used.
    seq_type : type[Sequence], optional
        The :class:`Sequence` subclass to create.
        If omitted, the type is inferred as either
        :class:`NucleotideSequence` or :class:`ProteinSequence`.
        Setting this parameter is recommended for large sequence data.

    Returns
    -------
    sequence : Sequence
        The requested sequence.
        Without `seq_type` this is a :class:`NucleotideSequence`, if the
        sequence string fits the corresponding alphabet, and a
        :class:`ProteinSequence` otherwise.

    Raises
    ------
    ValueError
        If the file contains no sequence at all or if the sequence
        string can be converted neither into a
        :class:`NucleotideSequence` nor into a :class:`ProteinSequence`.
    """
    if header is None:
        # No header requested -> fall back to the first (and probably
        # only) entry of the file
        seq_str = next(iter(fasta_file.values()), None)
        if seq_str is None:
            raise ValueError("File does not contain any sequences")
    else:
        seq_str = fasta_file[header]
    # The conversion also takes care of guessing the sequence type,
    # if none was given
    return _convert_to_sequence(seq_str, seq_type)
75
+
76
+
77
def get_sequences(fasta_file, seq_type=None):
    """
    Convert all entries of a :class:`FastaFile` instance into sequences.

    Unless `seq_type` is given, the type of each sequence is guessed
    from its sequence string: a conversion into a
    :class:`NucleotideSequence` is attempted first, followed by a
    conversion into a :class:`ProteinSequence`.

    Parameters
    ----------
    fasta_file : FastaFile
        The :class:`FastaFile` to read from.
    seq_type : type[Sequence], optional
        The :class:`Sequence` subclass to create.
        If omitted, the type is inferred as either
        :class:`NucleotideSequence` or :class:`ProteinSequence`.
        Setting this parameter is recommended for large sequence data.

    Returns
    -------
    seq_dict : dict
        An ordered dictionary mapping each header to the sequence
        created from the corresponding sequence string.
        The entries keep the order of the file.

    Raises
    ------
    ValueError
        If at least one of the sequence strings can be converted
        neither into a :class:`NucleotideSequence` nor into a
        :class:`ProteinSequence`.
    """
    return OrderedDict(
        (name, _convert_to_sequence(raw_sequence, seq_type))
        for name, raw_sequence in fasta_file.items()
    )
114
+
115
+
116
def set_sequence(fasta_file, sequence, header=None, as_rna=False):
    """
    Store a single sequence in a :class:`FastaFile` instance.

    Parameters
    ----------
    fasta_file : FastaFile
        The :class:`FastaFile` to write into.
    sequence : Sequence
        The sequence to be stored.
    header : str, optional
        The header line for the sequence.
        If omitted, the header ``'sequence'`` is used.
    as_rna : bool, optional
        If set to true, ``'T'`` will be replaced by ``'U'``,
        if a :class:`NucleotideSequence` was given.

    Raises
    ------
    ValueError
        If the sequence's alphabet uses symbols other than single
        characters.
    """
    entry_name = "sequence" if header is None else header
    fasta_file[entry_name] = _convert_to_string(sequence, as_rna)
141
+
142
+
143
def set_sequences(fasta_file, sequence_dict, as_rna=False):
    """
    Store multiple sequences in a :class:`FastaFile` instance.

    Parameters
    ----------
    fasta_file : FastaFile
        The :class:`FastaFile` to write into.
    sequence_dict : dict
        The sequences to be stored:
        Headers are the keys, :class:`Sequence` instances are the
        values.
    as_rna : bool, optional
        If set to true, ``'T'`` will be replaced by ``'U'``,
        if a :class:`NucleotideSequence` was given.

    Raises
    ------
    ValueError
        If the alphabet of a sequence uses symbols other than single
        characters.
    """
    # Entries are written one by one in the iteration order of the
    # input dictionary
    for entry_name, seq in sequence_dict.items():
        seq_str = _convert_to_string(seq, as_rna)
        fasta_file[entry_name] = seq_str
166
+
167
+
168
def get_alignment(fasta_file, additional_gap_chars=("_",), seq_type=None):
    """
    Interpret the entries of a :class:`FastaFile` as one alignment.

    Parameters
    ----------
    fasta_file : FastaFile
        The :class:`FastaFile` to read from.
    additional_gap_chars : iterable of str, optional
        Characters that are treated as gaps in addition to the default
        gap symbol ``'-'``.
    seq_type : type[Sequence], optional
        The :class:`Sequence` subclass to create.
        If omitted, the type is inferred as either
        :class:`ProteinSequence` or :class:`NucleotideSequence`.
        Setting this parameter is recommended for large sequence data.

    Returns
    -------
    alignment : Alignment
        The alignment built from the gapped sequence strings of the
        :class:`FastaFile`.
    """
    # Materialize the gap characters once, as they are reused for
    # every sequence string
    gap_chars = tuple(additional_gap_chars)

    gapped_strings = []
    for gapped in fasta_file.values():
        # Normalize every additional gap symbol to the default one ('-')
        for gap_char in gap_chars:
            gapped = gapped.replace(gap_char, "-")
        gapped_strings.append(gapped)

    sequence_factory = functools.partial(_convert_to_sequence, seq_type=seq_type)
    return Alignment.from_strings(gapped_strings, sequence_factory)
197
+
198
+
199
def set_alignment(fasta_file, alignment, seq_names):
    """
    Fill a :class:`FastaFile` with gapped sequence strings from an alignment.

    Parameters
    ----------
    fasta_file : FastaFile
        The :class:`FastaFile` to be accessed.
    alignment : Alignment
        The alignment to be set.
    seq_names : iterable object of str
        The names for the sequences in the alignment.
        Must have the same length as the sequence count in `alignment`.
        Any iterable is accepted, including generators.

    Raises
    ------
    ValueError
        If the number of names differs from the number of sequences in
        `alignment`.
    """
    gapped_seq_strings = alignment.get_gapped_sequences()
    # The names are documented as arbitrary iterable:
    # materialize them once, so that objects without ``len()`` or
    # integer indexing (e.g. generators) are supported as well
    seq_names = list(seq_names)
    if len(gapped_seq_strings) != len(seq_names):
        raise ValueError(
            f"Alignment has {len(gapped_seq_strings)} sequences, "
            f"but {len(seq_names)} names were given"
        )
    for name, gapped_seq_string in zip(seq_names, gapped_seq_strings):
        fasta_file[name] = gapped_seq_string
221
+
222
+
223
def get_a3m_alignments(a3m_file, seq_type=None):
    """
    Get pairwise sequence alignments from an *A3M*-formatted FASTA file.

    The *i*-th alignment is an alignment of the first sequence in the file (the query)
    to the *i+1*-th sequence in the file (the target).

    Parameters
    ----------
    a3m_file : FastaFile
        The A3M file to parse.
        The first sequence (the query) must not contain any deletions or insertions.
        In all subsequent sequences a ``-`` indicates a gap in the target and
        a lower case character indicates a gap in the query.
    seq_type : type[Sequence], optional
        The :class:`Sequence` subclass contained in the file.
        If not set, the type is automatically inferred as
        :class:`ProteinSequence` or :class:`NucleotideSequence` from the query sequence.
        For large sequence data it is recommended to set this parameter.

    Returns
    -------
    alignments : list of Alignment
        Alignments of all sequences (excluding the query itself) to the query sequence.
        Each alignment is between the query (first element) and each target sequence
        (second element).
        The list is empty, if the file contains only the query.

    Raises
    ------
    ValueError
        If the file does not contain any sequences.
    """
    sequence_iterator = iter(a3m_file.values())
    try:
        query_str = next(sequence_iterator)
    except StopIteration:
        # Do not leak a bare 'StopIteration' to the caller,
        # but report the empty file in the same way as 'get_sequence()'
        raise ValueError("File does not contain any sequences") from None
    query = _convert_to_sequence(query_str, seq_type)
    # All targets are converted into the same sequence type as the query
    if isinstance(query, NucleotideSequence):
        factory = _convert_to_nucleotide
    elif isinstance(query, ProteinSequence):
        factory = _convert_to_protein
    else:
        factory = seq_type

    alignments = []
    for target_str in sequence_iterator:
        # The target sequence provides all information about the alignment
        # - matches/mismatches -> upper case
        # - gaps in query -> lower case
        # - gaps in target -> '-'
        target_byte_array = np.frombuffer(target_str.encode("ASCII"), dtype=np.ubyte)
        # Both helpers are defined elsewhere in this module;
        # their results are used as boolean masks over the target characters
        query_gaps = _is_lower(target_byte_array)
        target_gaps = _is_gap(target_byte_array)

        # Start with a trace filled with gaps (-1)
        trace = np.full((len(target_str), 2), -1, dtype=np.int64)
        # Fill the trace with the positions of the query sequence where there is no gap
        trace[~query_gaps, 0] = np.arange(len(query))
        # Do the same for the target sequence: its actual length is the number of
        # characters that are not gap indicators
        trace[~target_gaps, 1] = np.arange(np.count_nonzero(~target_gaps))

        alignments.append(
            Alignment([query, factory(target_str.replace("-", ""))], trace)
        )

    return alignments
283
+
284
+
285
def set_a3m_alignments(a3m_file, alignments, query_label, target_labels):
    """
    Write pairwise alignments into a :class:`FastaFile` using the *A3M* format.

    Parameters
    ----------
    a3m_file : FastaFile
        The A3M file to write into.
    alignments : list of Alignment, length=n
        The pairwise alignments to be written.
        All alignments must share the same first sequence, which becomes
        the first sequence (the query) in the file.
    query_label : str
        The header for the query sequence.
    target_labels : iterable object of str, length=n
        The headers for the target sequences, one for each alignment.

    Raises
    ------
    ValueError
        If an alignment is not pairwise, if its first sequence is not equal
        to the query or if the number of labels differs from the number of
        alignments.
    """
    # The query is taken from the first alignment and written before any target
    query = alignments[0].sequences[0]
    a3m_file[query_label] = _convert_to_string(query, as_rna=False)

    for pairwise, label in zip(alignments, target_labels, strict=True):
        sequences = pairwise.sequences
        if len(sequences) != 2:
            raise ValueError("Each alignment must be pairwise")
        if sequences[0] != query:
            raise ValueError(
                "The first sequence of each alignment must be the same as the query"
            )

        codes = get_codes(_as_global(pairwise))
        query_code, target_code = codes[0], codes[1]
        # A code of -1 marks a gap at the respective alignment column
        gap_in_query = query_code == -1
        gap_in_target = target_code == -1
        aligned = ~(gap_in_query | gap_in_target)

        # One byte per alignment column
        # The order of the following assignments is kept deliberately
        a3m_chars = np.zeros(len(query_code), dtype="S1")
        # '-' -> gap in the target sequence
        a3m_chars[gap_in_target] = "-"
        # Lower case letter -> target symbol opposite to a gap in the query
        inserted_symbols = query.alphabet.decode_multiple(
            target_code[gap_in_query], as_bytes=True
        )
        a3m_chars[gap_in_query] = np.char.lower(inserted_symbols)
        # Upper case letter -> target symbol in a match/mismatch column
        a3m_chars[aligned] = query.alphabet.decode_multiple(
            target_code[aligned], as_bytes=True
        )
        a3m_file[label] = a3m_chars.tobytes().decode("ASCII")
334
+
335
+
336
def _warn_if_selenocysteine(seq_str):
    # Warn about the selenocysteine substitution, irrespective of letter case
    if "U" in seq_str.upper():
        warnings.warn(
            "ProteinSequence objects do not support selenocysteine "
            "(U), occurrences were substituted by cysteine (C)",
            # Attribute the warning to '_convert_to_sequence()'
            stacklevel=2,
        )


def _convert_to_sequence(seq_str, seq_type=None):
    """
    Convert a sequence string into a sequence object.

    Parameters
    ----------
    seq_str : str
        The sequence string.
    seq_type : type, optional
        The type of the sequence to create.
        :class:`NucleotideSequence` and :class:`ProteinSequence` get the same
        preprocessing as in the automatic mode, any other type is directly
        called with `seq_str`.
        By default, a :class:`NucleotideSequence` is attempted first and a
        :class:`ProteinSequence` second.

    Returns
    -------
    sequence
        The created sequence object.

    Raises
    ------
    ValueError
        If the type is determined automatically and the string is neither a
        valid nucleotide nor a valid protein sequence.
    """
    # Set manually selected sequence type
    if seq_type is not None:
        # Do preprocessing as done without manual selection
        if seq_type == NucleotideSequence:
            return _convert_to_nucleotide(seq_str)
        elif seq_type == ProteinSequence:
            _warn_if_selenocysteine(seq_str)
            return _convert_to_protein(seq_str)
        else:
            return seq_type(seq_str)

    # Attempt to automatically determine sequence type
    try:
        return _convert_to_nucleotide(seq_str)
    except AlphabetError:
        pass
    try:
        prot_seq = _convert_to_protein(seq_str)
    except AlphabetError as err:
        raise ValueError(
            "FASTA data cannot be converted either to "
            "'NucleotideSequence' nor to 'ProteinSequence'"
        ) from err
    # Raise Warning after conversion into 'ProteinSequence'
    # to wait for potential 'AlphabetError'
    _warn_if_selenocysteine(seq_str)
    return prot_seq
373
+
374
+
375
def _convert_to_protein(seq_str):
    """
    Create a :class:`ProteinSequence` from the upper-cased string, where
    selenocysteine (U) is substituted by cysteine (C) and pyrrolysine (O)
    is substituted by lysine (K).
    """
    substitutions = str.maketrans({"U": "C", "O": "K"})
    return ProteinSequence(seq_str.upper().translate(substitutions))
380
+
381
+
382
def _convert_to_nucleotide(seq_str):
    """
    Create a :class:`NucleotideSequence` from the upper-cased string, where
    uracil (U) is represented by thymine (T) and the unknown nucleotide
    letter X is unified to N.
    """
    substitutions = str.maketrans({"U": "T", "X": "N"})
    return NucleotideSequence(seq_str.upper().translate(substitutions))
388
+
389
+
390
def _convert_to_string(sequence, as_rna):
    """
    Convert a sequence into its string representation.

    A :class:`ValueError` is raised if the sequence does not use a
    :class:`LetterAlphabet`.
    For a :class:`NucleotideSequence` every ``T`` is written as ``U``,
    if `as_rna` is true.
    """
    alphabet = sequence.get_alphabet()
    if not isinstance(alphabet, LetterAlphabet):
        raise ValueError(
            "Only sequences using single letter alphabets "
            "can be stored in a FASTA file"
        )

    write_as_rna = isinstance(sequence, NucleotideSequence) and as_rna
    seq_str = str(sequence)
    if write_as_rna:
        return seq_str.replace("T", "U")
    return seq_str
399
+
400
+
401
def _as_global(alignment):
    """
    Convert a semi-global alignment into a global alignment.

    A semi-global alignment is an alignment, where alignment columns for terminal
    gaps are not included.

    Parameters
    ----------
    alignment : Alignment
        The semi-global alignment.

    Returns
    -------
    Alignment
        The alignment with the same sequences, whose trace is extended by
        the missing terminal columns.

    Raises
    ------
    ValueError
        If more than one sequence does not start at its first position or
        more than one sequence does not end at its last position.
    """
    trace = alignment.trace
    sequence_lengths = np.array([len(sequence) for sequence in alignment.sequences])
    # The trace index a sequence ends with, if it is completely covered
    last_positions = sequence_lengths - 1

    start_positions = []
    end_positions = []
    for trace_for_seq in trace.T:
        trace_wo_gaps = trace_for_seq[trace_for_seq != -1]
        start_positions.append(trace_wo_gaps[0])
        end_positions.append(trace_wo_gaps[-1])
    start_positions = np.array(start_positions)
    end_positions = np.array(end_positions)

    missing_start_mask = start_positions != 0
    missing_end_mask = end_positions != last_positions
    if (
        np.count_nonzero(missing_start_mask) > 1
        or np.count_nonzero(missing_end_mask) > 1
    ):
        # If multiple sequences do not run from beginning to end,
        # the alignment is not semi-global, but local
        raise ValueError("Alignment is local, but a semi-global alignment is required")

    trace_parts = [trace]
    if missing_start_mask.any():
        # We need to add a prefix to the alignment, which has gaps for all sequences
        # except for one
        seq_index_with_missing_start = np.where(missing_start_mask)[0][0]
        trace_prefix = np.full(
            (start_positions[seq_index_with_missing_start], trace.shape[1]),
            -1,
            dtype=int,
        )
        trace_prefix[:, seq_index_with_missing_start] = np.arange(len(trace_prefix))
        trace_parts.insert(0, trace_prefix)
    if missing_end_mask.any():
        # The same needs to be done for the end of the alignment:
        # The end position is an index, so it is compared to the last valid
        # index instead of the sequence length, otherwise always the first
        # sequence would be selected here
        seq_index_with_missing_end = np.where(missing_end_mask)[0][0]
        end_position = end_positions[seq_index_with_missing_end]
        seq_length = sequence_lengths[seq_index_with_missing_end]
        trace_suffix = np.full(
            (seq_length - end_position - 1, trace.shape[1]), -1, dtype=int
        )
        trace_suffix[:, seq_index_with_missing_end] = np.arange(
            end_position + 1, end_position + 1 + len(trace_suffix)
        )
        trace_parts.append(trace_suffix)

    trace = np.concatenate(trace_parts, axis=0)
    return Alignment(alignment.sequences, trace)
455
+
456
+
457
def _is_lower(characters):
    """
    Check for each character code, whether it is an ASCII lower case letter.
    """
    first_lower, last_lower = ord("a"), ord("z")
    not_below_a = characters >= first_lower
    not_above_z = characters <= last_lower
    return not_below_a & not_above_z
459
+
460
+
461
def _is_gap(characters):
    """
    Check for each character code, whether it is the gap symbol ``-``.
    """
    gap_code = ord("-")
    return characters == gap_code