biotite 0.41.1__cp312-cp312-macosx_10_16_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (340) hide show
  1. biotite/__init__.py +19 -0
  2. biotite/application/__init__.py +43 -0
  3. biotite/application/application.py +265 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +505 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +83 -0
  8. biotite/application/blast/webapp.py +421 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +238 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +152 -0
  13. biotite/application/localapp.py +306 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +122 -0
  16. biotite/application/msaapp.py +374 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +254 -0
  19. biotite/application/muscle/app5.py +171 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +456 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +222 -0
  24. biotite/application/util.py +59 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +304 -0
  27. biotite/application/viennarna/rnafold.py +269 -0
  28. biotite/application/viennarna/rnaplot.py +187 -0
  29. biotite/application/viennarna/util.py +72 -0
  30. biotite/application/webapp.py +77 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/entrez/__init__.py +15 -0
  34. biotite/database/entrez/check.py +61 -0
  35. biotite/database/entrez/dbnames.py +89 -0
  36. biotite/database/entrez/download.py +223 -0
  37. biotite/database/entrez/key.py +44 -0
  38. biotite/database/entrez/query.py +223 -0
  39. biotite/database/error.py +15 -0
  40. biotite/database/pubchem/__init__.py +21 -0
  41. biotite/database/pubchem/download.py +260 -0
  42. biotite/database/pubchem/error.py +20 -0
  43. biotite/database/pubchem/query.py +827 -0
  44. biotite/database/pubchem/throttle.py +99 -0
  45. biotite/database/rcsb/__init__.py +13 -0
  46. biotite/database/rcsb/download.py +167 -0
  47. biotite/database/rcsb/query.py +959 -0
  48. biotite/database/uniprot/__init__.py +13 -0
  49. biotite/database/uniprot/check.py +32 -0
  50. biotite/database/uniprot/download.py +134 -0
  51. biotite/database/uniprot/query.py +209 -0
  52. biotite/file.py +251 -0
  53. biotite/sequence/__init__.py +73 -0
  54. biotite/sequence/align/__init__.py +49 -0
  55. biotite/sequence/align/alignment.py +658 -0
  56. biotite/sequence/align/banded.cpython-312-darwin.so +0 -0
  57. biotite/sequence/align/banded.pyx +652 -0
  58. biotite/sequence/align/buckets.py +69 -0
  59. biotite/sequence/align/cigar.py +434 -0
  60. biotite/sequence/align/kmeralphabet.cpython-312-darwin.so +0 -0
  61. biotite/sequence/align/kmeralphabet.pyx +574 -0
  62. biotite/sequence/align/kmersimilarity.cpython-312-darwin.so +0 -0
  63. biotite/sequence/align/kmersimilarity.pyx +233 -0
  64. biotite/sequence/align/kmertable.cpython-312-darwin.so +0 -0
  65. biotite/sequence/align/kmertable.pyx +3400 -0
  66. biotite/sequence/align/localgapped.cpython-312-darwin.so +0 -0
  67. biotite/sequence/align/localgapped.pyx +892 -0
  68. biotite/sequence/align/localungapped.cpython-312-darwin.so +0 -0
  69. biotite/sequence/align/localungapped.pyx +279 -0
  70. biotite/sequence/align/matrix.py +405 -0
  71. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  72. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  73. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  74. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  75. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  76. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  77. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  78. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  79. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  80. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  81. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  82. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  83. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  84. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  85. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  86. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  87. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  88. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  89. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  93. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  94. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  95. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  96. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  97. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  98. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  99. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  100. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  101. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  102. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  103. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  104. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  105. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  106. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  107. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  108. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  109. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  110. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  111. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  112. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  113. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  114. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  115. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  116. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  117. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  118. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  119. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  120. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  121. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  122. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  154. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  155. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  156. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  157. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  158. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  159. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  160. biotite/sequence/align/multiple.cpython-312-darwin.so +0 -0
  161. biotite/sequence/align/multiple.pyx +620 -0
  162. biotite/sequence/align/pairwise.cpython-312-darwin.so +0 -0
  163. biotite/sequence/align/pairwise.pyx +587 -0
  164. biotite/sequence/align/permutation.cpython-312-darwin.so +0 -0
  165. biotite/sequence/align/permutation.pyx +305 -0
  166. biotite/sequence/align/primes.txt +821 -0
  167. biotite/sequence/align/selector.cpython-312-darwin.so +0 -0
  168. biotite/sequence/align/selector.pyx +956 -0
  169. biotite/sequence/align/statistics.py +265 -0
  170. biotite/sequence/align/tracetable.cpython-312-darwin.so +0 -0
  171. biotite/sequence/align/tracetable.pxd +64 -0
  172. biotite/sequence/align/tracetable.pyx +370 -0
  173. biotite/sequence/alphabet.py +566 -0
  174. biotite/sequence/annotation.py +829 -0
  175. biotite/sequence/codec.cpython-312-darwin.so +0 -0
  176. biotite/sequence/codec.pyx +155 -0
  177. biotite/sequence/codon.py +466 -0
  178. biotite/sequence/codon_tables.txt +202 -0
  179. biotite/sequence/graphics/__init__.py +33 -0
  180. biotite/sequence/graphics/alignment.py +1034 -0
  181. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  182. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  183. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  184. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  185. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  186. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  187. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  188. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  189. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  190. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  191. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  192. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  193. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  194. biotite/sequence/graphics/color_schemes/pb_flower.json +39 -0
  195. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  196. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  197. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  198. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  199. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  200. biotite/sequence/graphics/colorschemes.py +139 -0
  201. biotite/sequence/graphics/dendrogram.py +184 -0
  202. biotite/sequence/graphics/features.py +510 -0
  203. biotite/sequence/graphics/logo.py +110 -0
  204. biotite/sequence/graphics/plasmid.py +661 -0
  205. biotite/sequence/io/__init__.py +12 -0
  206. biotite/sequence/io/fasta/__init__.py +22 -0
  207. biotite/sequence/io/fasta/convert.py +273 -0
  208. biotite/sequence/io/fasta/file.py +278 -0
  209. biotite/sequence/io/fastq/__init__.py +19 -0
  210. biotite/sequence/io/fastq/convert.py +120 -0
  211. biotite/sequence/io/fastq/file.py +551 -0
  212. biotite/sequence/io/genbank/__init__.py +17 -0
  213. biotite/sequence/io/genbank/annotation.py +277 -0
  214. biotite/sequence/io/genbank/file.py +575 -0
  215. biotite/sequence/io/genbank/metadata.py +324 -0
  216. biotite/sequence/io/genbank/sequence.py +172 -0
  217. biotite/sequence/io/general.py +192 -0
  218. biotite/sequence/io/gff/__init__.py +26 -0
  219. biotite/sequence/io/gff/convert.py +133 -0
  220. biotite/sequence/io/gff/file.py +434 -0
  221. biotite/sequence/phylo/__init__.py +36 -0
  222. biotite/sequence/phylo/nj.cpython-312-darwin.so +0 -0
  223. biotite/sequence/phylo/nj.pyx +221 -0
  224. biotite/sequence/phylo/tree.cpython-312-darwin.so +0 -0
  225. biotite/sequence/phylo/tree.pyx +1169 -0
  226. biotite/sequence/phylo/upgma.cpython-312-darwin.so +0 -0
  227. biotite/sequence/phylo/upgma.pyx +164 -0
  228. biotite/sequence/profile.py +456 -0
  229. biotite/sequence/search.py +116 -0
  230. biotite/sequence/seqtypes.py +556 -0
  231. biotite/sequence/sequence.py +374 -0
  232. biotite/structure/__init__.py +132 -0
  233. biotite/structure/atoms.py +1455 -0
  234. biotite/structure/basepairs.py +1415 -0
  235. biotite/structure/bonds.cpython-312-darwin.so +0 -0
  236. biotite/structure/bonds.pyx +1933 -0
  237. biotite/structure/box.py +592 -0
  238. biotite/structure/celllist.cpython-312-darwin.so +0 -0
  239. biotite/structure/celllist.pyx +849 -0
  240. biotite/structure/chains.py +298 -0
  241. biotite/structure/charges.cpython-312-darwin.so +0 -0
  242. biotite/structure/charges.pyx +520 -0
  243. biotite/structure/compare.py +274 -0
  244. biotite/structure/density.py +114 -0
  245. biotite/structure/dotbracket.py +216 -0
  246. biotite/structure/error.py +31 -0
  247. biotite/structure/filter.py +585 -0
  248. biotite/structure/geometry.py +697 -0
  249. biotite/structure/graphics/__init__.py +13 -0
  250. biotite/structure/graphics/atoms.py +226 -0
  251. biotite/structure/graphics/rna.py +282 -0
  252. biotite/structure/hbond.py +409 -0
  253. biotite/structure/info/__init__.py +25 -0
  254. biotite/structure/info/atom_masses.json +121 -0
  255. biotite/structure/info/atoms.py +82 -0
  256. biotite/structure/info/bonds.py +145 -0
  257. biotite/structure/info/ccd/README.rst +8 -0
  258. biotite/structure/info/ccd/amino_acids.txt +1663 -0
  259. biotite/structure/info/ccd/carbohydrates.txt +1135 -0
  260. biotite/structure/info/ccd/components.bcif +0 -0
  261. biotite/structure/info/ccd/nucleotides.txt +798 -0
  262. biotite/structure/info/ccd.py +95 -0
  263. biotite/structure/info/groups.py +90 -0
  264. biotite/structure/info/masses.py +123 -0
  265. biotite/structure/info/misc.py +144 -0
  266. biotite/structure/info/radii.py +197 -0
  267. biotite/structure/info/standardize.py +196 -0
  268. biotite/structure/integrity.py +268 -0
  269. biotite/structure/io/__init__.py +30 -0
  270. biotite/structure/io/ctab.py +72 -0
  271. biotite/structure/io/dcd/__init__.py +13 -0
  272. biotite/structure/io/dcd/file.py +65 -0
  273. biotite/structure/io/general.py +257 -0
  274. biotite/structure/io/gro/__init__.py +14 -0
  275. biotite/structure/io/gro/file.py +343 -0
  276. biotite/structure/io/mmtf/__init__.py +21 -0
  277. biotite/structure/io/mmtf/assembly.py +214 -0
  278. biotite/structure/io/mmtf/convertarray.cpython-312-darwin.so +0 -0
  279. biotite/structure/io/mmtf/convertarray.pyx +341 -0
  280. biotite/structure/io/mmtf/convertfile.cpython-312-darwin.so +0 -0
  281. biotite/structure/io/mmtf/convertfile.pyx +501 -0
  282. biotite/structure/io/mmtf/decode.cpython-312-darwin.so +0 -0
  283. biotite/structure/io/mmtf/decode.pyx +152 -0
  284. biotite/structure/io/mmtf/encode.cpython-312-darwin.so +0 -0
  285. biotite/structure/io/mmtf/encode.pyx +183 -0
  286. biotite/structure/io/mmtf/file.py +233 -0
  287. biotite/structure/io/mol/__init__.py +20 -0
  288. biotite/structure/io/mol/convert.py +115 -0
  289. biotite/structure/io/mol/ctab.py +414 -0
  290. biotite/structure/io/mol/header.py +116 -0
  291. biotite/structure/io/mol/mol.py +193 -0
  292. biotite/structure/io/mol/sdf.py +916 -0
  293. biotite/structure/io/netcdf/__init__.py +13 -0
  294. biotite/structure/io/netcdf/file.py +63 -0
  295. biotite/structure/io/npz/__init__.py +20 -0
  296. biotite/structure/io/npz/file.py +152 -0
  297. biotite/structure/io/pdb/__init__.py +20 -0
  298. biotite/structure/io/pdb/convert.py +293 -0
  299. biotite/structure/io/pdb/file.py +1240 -0
  300. biotite/structure/io/pdb/hybrid36.cpython-312-darwin.so +0 -0
  301. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  302. biotite/structure/io/pdbqt/__init__.py +15 -0
  303. biotite/structure/io/pdbqt/convert.py +107 -0
  304. biotite/structure/io/pdbqt/file.py +640 -0
  305. biotite/structure/io/pdbx/__init__.py +23 -0
  306. biotite/structure/io/pdbx/bcif.py +648 -0
  307. biotite/structure/io/pdbx/cif.py +1032 -0
  308. biotite/structure/io/pdbx/component.py +246 -0
  309. biotite/structure/io/pdbx/convert.py +1597 -0
  310. biotite/structure/io/pdbx/encoding.cpython-312-darwin.so +0 -0
  311. biotite/structure/io/pdbx/encoding.pyx +950 -0
  312. biotite/structure/io/pdbx/legacy.py +267 -0
  313. biotite/structure/io/tng/__init__.py +13 -0
  314. biotite/structure/io/tng/file.py +46 -0
  315. biotite/structure/io/trajfile.py +710 -0
  316. biotite/structure/io/trr/__init__.py +13 -0
  317. biotite/structure/io/trr/file.py +46 -0
  318. biotite/structure/io/xtc/__init__.py +13 -0
  319. biotite/structure/io/xtc/file.py +46 -0
  320. biotite/structure/mechanics.py +75 -0
  321. biotite/structure/molecules.py +353 -0
  322. biotite/structure/pseudoknots.py +642 -0
  323. biotite/structure/rdf.py +243 -0
  324. biotite/structure/repair.py +253 -0
  325. biotite/structure/residues.py +562 -0
  326. biotite/structure/resutil.py +178 -0
  327. biotite/structure/sasa.cpython-312-darwin.so +0 -0
  328. biotite/structure/sasa.pyx +322 -0
  329. biotite/structure/sequence.py +112 -0
  330. biotite/structure/sse.py +327 -0
  331. biotite/structure/superimpose.py +727 -0
  332. biotite/structure/transform.py +504 -0
  333. biotite/structure/util.py +98 -0
  334. biotite/temp.py +86 -0
  335. biotite/version.py +16 -0
  336. biotite/visualize.py +251 -0
  337. biotite-0.41.1.dist-info/METADATA +187 -0
  338. biotite-0.41.1.dist-info/RECORD +340 -0
  339. biotite-0.41.1.dist-info/WHEEL +4 -0
  340. biotite-0.41.1.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,273 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence.io.fasta"
6
+ __author__ = "Patrick Kunzmann"
7
+
8
+ import warnings
9
+ from collections import OrderedDict
10
+ from ...sequence import Sequence
11
+ from ...alphabet import AlphabetError, LetterAlphabet
12
+ from ...seqtypes import NucleotideSequence, ProteinSequence
13
+ from ...align.alignment import Alignment
14
+
15
+ __all__ = ["get_sequence", "get_sequences", "set_sequence", "set_sequences",
16
+ "get_alignment", "set_alignment"]
17
+
18
+
19
+ def get_sequence(fasta_file, header=None, seq_type=None):
20
+ """
21
+ Get a sequence from a :class:`FastaFile` instance.
22
+
23
+ The type of sequence is guessed from the sequence string:
24
+ First, a conversion into a :class:`NucleotideSequence` and
25
+ second a conversion into a :class:`ProteinSequence` is tried.
26
+
27
+ Parameters
28
+ ----------
29
+ fasta_file : FastaFile
30
+ The :class:`FastaFile` to be accessed.
31
+ header : str, optional
32
+ The header to get the sequence from. By default, the first
33
+ sequence of the file is returned.
34
+ seq_type : Class, optional
35
+ The :class:`Sequence` subclass contained in the file. If not
36
+ set, biotite will attempt to automatically detect whether a
37
+ nucleotide or protein sequence is present.
38
+
39
+ Returns
40
+ -------
41
+ sequence : NucleotideSequence or ProteinSequence
42
+ The requested sequence in the `FastaFile`.
43
+ :class:`NucleotideSequence` if the sequence string fits the
44
+ corresponding alphabet, :class:`ProteinSequence` otherwise.
45
+
46
+ Raises
47
+ ------
48
+ ValueError
49
+ If the sequence data can be neither converted into a
50
+ :class:`NucleotideSequence` nor a :class:`ProteinSequence`.
51
+ """
52
+ if header is not None:
53
+ seq_str = fasta_file[header]
54
+ else:
55
+ # Return first (and probably only) sequence of file
56
+ seq_str = None
57
+ for seq_str in fasta_file.values():
58
+ break
59
+ if seq_str is None:
60
+ raise ValueError("File does not contain any sequences")
61
+ # Determine the sequence type:
62
+ # If NucleotideSequence can be created it is a DNA sequence,
63
+ # otherwise protein sequence
64
+ return _convert_to_sequence(seq_str, seq_type)
65
+
66
+
67
def get_sequences(fasta_file, seq_type=None):
    """
    Convert all entries of a :class:`FastaFile` instance into
    sequence objects.

    Unless `seq_type` is given, the type of each sequence is inferred
    from its sequence string: a conversion into a
    :class:`NucleotideSequence` is attempted first, a conversion into
    a :class:`ProteinSequence` second.

    Parameters
    ----------
    fasta_file : FastaFile
        The :class:`FastaFile` to be accessed.
    seq_type : Class, optional
        The :class:`Sequence` subclass contained in the file. If not
        set, biotite will attempt to automatically detect whether a
        nucleotide or protein sequence is present.

    Returns
    -------
    seq_dict : dict
        An ordered dictionary that maps each header to its
        :class:`NucleotideSequence` or :class:`ProteinSequence`,
        in the order the entries are iterated in the file.

    Raises
    ------
    ValueError
        If at least one of the sequence strings can be neither
        converted into a :class:`NucleotideSequence` nor a
        :class:`ProteinSequence`.
    """
    return OrderedDict(
        (entry_header, _convert_to_sequence(entry_str, seq_type))
        for entry_header, entry_str in fasta_file.items()
    )
103
+
104
+
105
def set_sequence(fasta_file, sequence, header=None, as_rna=False):
    """
    Store a single sequence as an entry of a :class:`FastaFile`
    instance.

    Parameters
    ----------
    fasta_file : FastaFile
        The :class:`FastaFile` to be accessed.
    sequence : Sequence
        The sequence to be stored.
    header : str, optional
        The header for the sequence. Default is ``'sequence'``.
    as_rna : bool, optional
        If set to true, ``'T'`` will be replaced by ``'U'``,
        if a :class:`NucleotideSequence` was given.

    Raises
    ------
    ValueError
        If the sequence's alphabet uses symbols other than single
        characters.
    """
    entry_header = "sequence" if header is None else header
    fasta_file[entry_header] = _convert_to_string(sequence, as_rna)
130
+
131
+
132
def set_sequences(fasta_file, sequence_dict, as_rna=False):
    """
    Store every sequence of a dictionary as an entry of a
    :class:`FastaFile` instance.

    Parameters
    ----------
    fasta_file : FastaFile
        The :class:`FastaFile` to be accessed.
    sequence_dict : dict
        A dictionary containing the sequences to be stored.
        Headers are keys, :class:`Sequence` instances are values.
    as_rna : bool, optional
        If set to true, ``'T'`` will be replaced by ``'U'``,
        if a :class:`NucleotideSequence` was given.

    Raises
    ------
    ValueError
        If the alphabet of a sequence uses symbols other than single
        characters.
    """
    for entry_header, entry_sequence in sequence_dict.items():
        # Entries are written one by one, in the dictionary's order
        fasta_file[entry_header] = _convert_to_string(
            entry_sequence, as_rna
        )
155
+
156
+
157
def get_alignment(fasta_file, additional_gap_chars=("_",), seq_type=None):
    """
    Build an alignment from the gapped sequence strings of a
    :class:`FastaFile` instance.

    Parameters
    ----------
    fasta_file : FastaFile
        The :class:`FastaFile` to be accessed.
    additional_gap_chars : iterable of str, optional
        The characters to be treated as gaps, in addition to the
        default gap symbol ``'-'``.
    seq_type : Class, optional
        The :class:`Sequence` subclass contained in the file. If not
        set, biotite will attempt to automatically detect whether a
        nucleotide or protein sequence is present.

    Returns
    -------
    alignment : Alignment
        The alignment from the :class:`FastaFile`.
    """
    gapped_strings = list(fasta_file.values())
    # Normalize every additional gap symbol to the default one ('-')
    for gap_char in additional_gap_chars:
        gapped_strings = [
            gapped.replace(gap_char, "-") for gapped in gapped_strings
        ]

    # The sequence objects themselves must not contain gap symbols
    ungapped_sequences = []
    for gapped in gapped_strings:
        ungapped = gapped.replace("-", "")
        ungapped_sequences.append(_convert_to_sequence(ungapped, seq_type))

    # The trace is derived from the gapped strings
    alignment_trace = Alignment.trace_from_strings(gapped_strings)
    return Alignment(ungapped_sequences, alignment_trace, score=None)
187
+
188
+
189
def set_alignment(fasta_file, alignment, seq_names):
    """
    Fill a :class:`FastaFile` with gapped sequence strings from an
    alignment.

    Parameters
    ----------
    fasta_file : FastaFile
        The :class:`FastaFile` to be accessed.
    alignment : Alignment
        The alignment to be set.
    seq_names : iterable object of str
        The names for the sequences in the alignment.
        Must have the same length as the sequence count in `alignment`.
        Any iterable is accepted, including generators.

    Raises
    ------
    ValueError
        If the number of names differs from the number of sequences
        in `alignment`.
    """
    gapped_seq_strings = alignment.get_gapped_sequences()
    # Materialize the names once, so that iterables without 'len()'
    # or integer indexing (e.g. generators) are supported as well
    seq_names = list(seq_names)
    if len(gapped_seq_strings) != len(seq_names):
        raise ValueError(
            f"Alignment has {len(gapped_seq_strings)} sequences, "
            f"but {len(seq_names)} names were given"
        )
    for name, gapped_seq_string in zip(seq_names, gapped_seq_strings):
        fasta_file[name] = gapped_seq_string
212
+
213
+
214
def _convert_to_sequence(seq_str, seq_type=None):
    """
    Convert a sequence string from a FASTA file into a sequence
    object.

    Parameters
    ----------
    seq_str : str
        The sequence string to be converted.
    seq_type : Class, optional
        The class that is instantiated with the sequence string.
        For :class:`NucleotideSequence` and :class:`ProteinSequence`
        the string is preprocessed before, for any other class it is
        passed on unchanged.
        If not set, a conversion into a :class:`NucleotideSequence`
        is tried first and a conversion into a
        :class:`ProteinSequence` second.

    Returns
    -------
    sequence : Sequence
        The converted sequence.

    Raises
    ------
    ValueError
        If `seq_type` is not set and the string can be neither
        converted into a :class:`NucleotideSequence` nor a
        :class:`ProteinSequence`.

    Warns
    -----
    UserWarning
        If a protein sequence contains selenocysteine (``U``, in
        upper or lower case), as it is substituted by cysteine.
    """
    # Define preprocessing of preimplemented sequence types

    def process_protein_sequence(x):
        # Replace selenocysteine with cysteine
        # and pyrrolysine with lysine
        return x.upper().replace("U", "C").replace("O", "K")

    def process_nucleotide_sequence(x):
        # For nucleotides uracil is represented by thymine and there
        # is only one letter for completely unknown nucleotides
        return x.upper().replace("U", "T").replace("X", "N")

    # Set manually selected sequence type

    if seq_type is not None:
        # Do preprocessing as done without manual selection
        if seq_type == NucleotideSequence:
            seq_str = process_nucleotide_sequence(seq_str)
        elif seq_type == ProteinSequence:
            # The preprocessing upper-cases the string before the
            # substitution, hence the check must be case-insensitive,
            # otherwise a lowercase 'u' would be replaced silently
            if "U" in seq_str.upper():
                warnings.warn(
                    "ProteinSequence objects do not support selenocysteine "
                    "(U), occurrences were substituted by cysteine (C)"
                )
            seq_str = process_protein_sequence(seq_str)
        # Return the converted sequence
        return seq_type(seq_str)

    # Attempt to automatically determine sequence type

    try:
        return NucleotideSequence(process_nucleotide_sequence(seq_str))
    except AlphabetError:
        pass
    try:
        prot_seq = ProteinSequence(process_protein_sequence(seq_str))
        # Raise Warning after conversion into 'ProteinSequence'
        # to wait for potential 'AlphabetError'
        # The check is case-insensitive, as the preprocessing
        # substitutes lowercase 'u' as well
        if "U" in seq_str.upper():
            warnings.warn(
                "ProteinSequence objects do not support selenocysteine (U), "
                "occurrences were substituted by cysteine (C)"
            )
        return prot_seq
    except AlphabetError:
        raise ValueError("FASTA data cannot be converted either to "
                         "'NucleotideSequence' nor to 'ProteinSequence'")
264
+
265
+
266
def _convert_to_string(sequence, as_rna):
    """
    Convert a sequence object into the string stored in a FASTA file.

    Parameters
    ----------
    sequence : Sequence
        The sequence to be converted.
    as_rna : bool
        If true and `sequence` is a :class:`NucleotideSequence`,
        ``'T'`` is replaced by ``'U'`` in the returned string.

    Returns
    -------
    seq_str : str
        The string representation of the sequence.

    Raises
    ------
    ValueError
        If the alphabet of the sequence is not a
        :class:`LetterAlphabet`.
    """
    if not isinstance(sequence.get_alphabet(), LetterAlphabet):
        raise ValueError("Only sequences using single letter alphabets "
                         "can be stored in a FASTA file")
    seq_str = str(sequence)
    if as_rna and isinstance(sequence, NucleotideSequence):
        return seq_str.replace("T", "U")
    return seq_str
@@ -0,0 +1,278 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence.io.fasta"
6
+ __author__ = "Patrick Kunzmann"
7
+ __all__ = ["FastaFile"]
8
+
9
+ from ....file import TextFile, InvalidFileError, wrap_string
10
+ from collections import OrderedDict
11
+ from collections.abc import MutableMapping
12
+
13
+
14
class FastaFile(TextFile, MutableMapping):
    """
    This class represents a file in FASTA format.

    A FASTA file contains so called *header* lines, beginning with
    ``>``, that describe the following sequence.
    The corresponding sequence starts at the line after the header line
    and ends at the next header line or at the end of file.
    The header along with its sequence forms an entry.

    This class is used in a dictionary like manner, implementing the
    :class:`MutableMapping` interface:
    Headers (without the leading ``>``) are used as keys,
    and strings containing the sequences are the corresponding values.
    Entries can be accessed using indexing,
    ``del`` deletes the entry at the given index.

    A header is written to the file without line breaks and without
    surrounding whitespace and the entry is registered under this
    cleaned header.
    A key that is not found literally is also looked up in its cleaned
    form, so the key used for setting an entry can always be used to
    access it again.

    Parameters
    ----------
    chars_per_line : int, optional
        The number of characters in a line containing sequence data
        after which a line break is inserted.
        Only relevant, when adding sequences to a file.
        Default is 80.

    Examples
    --------

    >>> import os.path
    >>> file = FastaFile()
    >>> file["seq1"] = "ATACT"
    >>> print(file["seq1"])
    ATACT
    >>> file["seq2"] = "AAAATT"
    >>> print(file)
    >seq1
    ATACT
    >seq2
    AAAATT
    >>> print(dict(file.items()))
    {'seq1': 'ATACT', 'seq2': 'AAAATT'}
    >>> for header, seq in file.items():
    ...     print(header, seq)
    seq1 ATACT
    seq2 AAAATT
    >>> del file["seq1"]
    >>> print(dict(file.items()))
    {'seq2': 'AAAATT'}
    >>> file.write(os.path.join(path_to_directory, "test.fasta"))
    """

    def __init__(self, chars_per_line=80):
        super().__init__()
        self._chars_per_line = chars_per_line
        # Maps each header to the (start, stop) line range of its entry,
        # where 'start' is the index of the header line itself
        self._entries = OrderedDict()

    @classmethod
    def read(cls, file, chars_per_line=80):
        """
        Read a FASTA file.

        Parameters
        ----------
        file : file-like object or str
            The file to be read.
            Alternatively a file path can be supplied.
        chars_per_line : int, optional
            The number of characters in a line containing sequence data
            after which a line break is inserted.
            Only relevant, when adding sequences to a file.
            Default is 80.

        Returns
        -------
        file_object : FastaFile
            The parsed file.

        Raises
        ------
        InvalidFileError
            If no line is left after removing empty and comment lines
            or if the first remaining line does not start with ``>``.
        """
        file = super().read(file, chars_per_line)
        # Filter out empty and comment lines
        file.lines = [
            line for line in file.lines
            if len(line.strip()) != 0 and line[0] != ";"
        ]
        if len(file.lines) == 0:
            raise InvalidFileError("File is empty or contains only comments")
        file._find_entries()
        return file

    @staticmethod
    def _clean_header(header):
        # The header text exactly as it is written behind the '>'
        return header.replace("\n", "").strip()

    def _resolve_key(self, header):
        # Prefer the literal key, so every header parsed from a file
        # stays accessible; fall back to the cleaned form under which
        # '__setitem__()' registers entries
        if isinstance(header, str) and header not in self._entries:
            cleaned = self._clean_header(header)
            if cleaned in self._entries:
                return cleaned
        return header

    def __setitem__(self, header, seq_str):
        if not isinstance(header, str):
            raise IndexError(
                "'FastaFile' only supports header strings as keys"
            )
        if not isinstance(seq_str, str):
            raise TypeError("'FastaFile' only supports sequence strings "
                            "as values")
        # The entry is registered under the same text that is written
        # into the header line, otherwise the key would silently change
        # with the next call of '_find_entries()'
        key = self._clean_header(header)
        # Create lines for new header and sequence (with line breaks)
        new_lines = [">" + key] + \
                    wrap_string(seq_str, width=self._chars_per_line)
        if header in self._entries or key in self._entries:
            # Delete lines of the entries corresponding to the header,
            # no matter whether they are registered under the given
            # or the cleaned spelling, to avoid duplicate header lines
            for old_key in (header, key):
                if old_key in self._entries:
                    del self[old_key]
            self.lines += new_lines
            self._find_entries()
        else:
            # Simply append lines
            # Add entry in a more efficient way than '_find_entries()'
            # for this simple case
            start = len(self.lines)
            self._entries[key] = (start, start + len(new_lines))
            self.lines += new_lines

    def __getitem__(self, header):
        if not isinstance(header, str):
            raise IndexError(
                "'FastaFile' only supports header strings as keys"
            )
        start, stop = self._entries[self._resolve_key(header)]
        # Concatenate sequence string from the lines following the header
        return "".join(
            [line.strip() for line in self.lines[start+1 : stop]]
        )

    def __delitem__(self, header):
        key = self._resolve_key(header)
        start, stop = self._entries[key]
        del self.lines[start:stop]
        del self._entries[key]
        # The line ranges of all following entries have shifted
        self._find_entries()

    def __len__(self):
        return len(self._entries)

    def __iter__(self):
        return iter(self._entries)

    def __contains__(self, identifer):
        return self._resolve_key(identifer) in self._entries

    def _find_entries(self):
        """
        Rebuild the mapping from headers to line ranges based on the
        current content of ``self.lines``.

        Raises
        ------
        InvalidFileError
            If the first line does not start with ``>``.
        """
        lines = self.lines
        # 'startswith()' and slicing also work for an empty line,
        # where indexing the first character would fail
        if len(lines) > 0 and not lines[0].startswith(">"):
            raise InvalidFileError(
                f"File starts with '{lines[0][:1]}' instead of '>'"
            )

        header_i = [
            i for i, line in enumerate(lines) if line.startswith(">")
        ]
        # An entry stops at the start of the next header,
        # the last entry stops at the end of the file
        stops = header_i[1:] + [len(lines)]

        self._entries = OrderedDict()
        for start, stop in zip(header_i, stops):
            # Remove leading '>' from header
            header = lines[start].strip()[1:]
            self._entries[header] = (start, stop)


    @staticmethod
    def read_iter(file):
        """
        Create an iterator over each sequence of the given FASTA file.

        Parameters
        ----------
        file : file-like object or str
            The file to be read.
            Alternatively a file path can be supplied.

        Yields
        ------
        header : str
            The header of the current sequence.
        seq_str : str
            The current sequence as string.

        Notes
        -----
        This approach gives the same results as
        `FastaFile.read(file).items()`, but is slightly faster and much
        more memory efficient.
        """
        header = None
        seq_str_list = []
        for line in TextFile.read_iter(file):
            line = line.strip()
            # Ignore empty and comment lines
            if len(line) == 0 or line[0] == ";":
                continue
            if line[0] == ">":
                # New entry
                # -> yield previous entry
                if header is not None:
                    yield header, "".join(seq_str_list)
                # Track new header and reset sequence
                header = line[1:]
                seq_str_list = []
            else:
                seq_str_list.append(line)
        # Yield final entry
        if header is not None:
            yield header, "".join(seq_str_list)


    @staticmethod
    def write_iter(file, items, chars_per_line=80):
        """
        Iterate over the given `items` and write each item into
        the specified `file`.

        In contrast to :meth:`write()`, the lines of text are not stored
        in an intermediate :class:`TextFile`, but are directly written
        to the file.
        Hence, this static method may save a large amount of memory if
        a large file should be written, especially if the `items`
        are provided as generator.

        Parameters
        ----------
        file : file-like object or str
            The file to be written to.
            Alternatively a file path can be supplied.
        items : generator or array-like of tuple(str, str)
            The entries to be written into the file.
            Each entry consists of a header string and a sequence
            string.
        chars_per_line : int, optional
            The number of characters in a line containing sequence data
            after which a line break is inserted.
            Only relevant, when adding sequences to a file.
            Default is 80.

        Notes
        -----
        This method does not test, whether the given identifiers are
        unambiguous.
        """
        def line_generator():
            for item in items:
                header, seq_str = item
                if not isinstance(header, str):
                    raise IndexError(
                        "'FastaFile' only supports header strings"
                    )
                if not isinstance(seq_str, str):
                    raise TypeError(
                        "'FastaFile' only supports sequence strings"
                    )

                # Yield header line
                yield ">" + header.replace("\n", "").strip()

                # Yield sequence line(s)
                yield from wrap_string(seq_str, width=chars_per_line)

        TextFile.write_iter(file, line_generator())
@@ -0,0 +1,19 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ """
6
+ This subpackage is used for reading and writing sequencing data
7
+ using the popular FASTQ format.
8
+
9
+ This package contains the :class:`FastqFile`, which provides a
10
+ dictionary like interface to FASTQ files, with the sequence identifier
11
+ strings being the keys and the sequences and quality scores being the
12
+ values.
13
+ """
14
+
15
+ __name__ = "biotite.sequence.io.fastq"
16
+ __author__ = "Patrick Kunzmann"
17
+
18
+ from .file import *
19
+ from .convert import *
@@ -0,0 +1,120 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence.io.fastq"
6
+ __author__ = "Patrick Kunzmann"
7
+
8
+ from collections import OrderedDict
9
+ from ...sequence import Sequence
10
+ from ...alphabet import AlphabetError, LetterAlphabet
11
+ from ...seqtypes import NucleotideSequence
12
+ from ...align.alignment import Alignment
13
+
14
+ __all__ = ["get_sequence", "get_sequences", "set_sequence", "set_sequences"]
15
+
16
+
17
def get_sequence(fastq_file, header=None):
    """
    Obtain one sequence together with its quality scores from a
    `FastqFile` instance.

    Parameters
    ----------
    fastq_file : FastqFile
        The `FastqFile` to be accessed.
    header : str, optional
        The identifier of the entry to be returned.
        By default, the first entry of the file is returned.

    Returns
    -------
    sequence : NucleotideSequence
        The requested sequence.
        Before conversion each ``'U'`` is replaced by ``'T'`` and
        each ``'X'`` by ``'N'``.
    scores : ndarray, dtype=int
        The requested scores.

    Raises
    ------
    ValueError
        If no `header` is given and the file contains no entry.
    """
    if header is not None:
        seq_str, scores = fastq_file[header]
    else:
        # Take the first (and probably only) entry of the file
        seq_str, scores = next(iter(fastq_file.values()), (None, None))
        if seq_str is None:
            raise ValueError("File does not contain any sequences")
    dna_str = seq_str.replace("U", "T").replace("X", "N")
    return NucleotideSequence(dna_str), scores
48
+
49
+
50
def get_sequences(fastq_file):
    """
    Convert all entries of a `FastqFile` instance into a dictionary
    that maps each identifier to a sequence-score-tuple.

    Parameters
    ----------
    fastq_file : FastqFile
        The `FastqFile` to be accessed.

    Returns
    -------
    seq_dict : dict
        A dictionary containing identifiers as keys and
        (`NucleotideSequence`, `ndarray`) tuples as values.
        Before conversion each ``'U'`` is replaced by ``'T'`` and
        each ``'X'`` by ``'N'``.
    """
    return OrderedDict(
        (
            header,
            (
                NucleotideSequence(
                    seq_str.replace("U", "T").replace("X", "N")
                ),
                scores,
            ),
        )
        for header, (seq_str, scores) in fastq_file.items()
    )
71
+
72
+
73
def set_sequence(fastq_file, sequence, scores, header=None, as_rna=False):
    """
    Store a sequence and its quality score array in a `FastqFile`
    instance.

    Parameters
    ----------
    fastq_file : FastqFile
        The `FastqFile` to be accessed.
    sequence : NucleotideSequence
        The sequence to be set.
    scores : ndarray, dtype=int
        The quality scores to be set.
    header : str, optional
        The identifier for the sequence. Default is 'sequence'.
    as_rna : bool, optional
        If set to true, the sequence symbol ``'T'`` will be replaced
        by ``'U'``.
    """
    identifier = "sequence" if header is None else header
    seq_str = _convert_to_string(sequence, as_rna)
    fastq_file[identifier] = (seq_str, scores)
94
+
95
+
96
def set_sequences(fastq_file, sequence_dict, as_rna=False):
    """
    Store every entry of a dictionary in a `FastqFile` instance.

    Parameters
    ----------
    fastq_file : FastqFile
        The `FastqFile` to be accessed.
    sequence_dict : dict
        A dictionary containing the sequences and scores to be set.
        Identifiers are keys,
        (`NucleotideSequence`, `ndarray`) tuples are values.
    as_rna : bool, optional
        If set to true, the sequence symbol ``'T'`` will be replaced
        by ``'U'``.
    """
    for identifier, entry in sequence_dict.items():
        sequence, scores = entry
        seq_str = _convert_to_string(sequence, as_rna)
        fastq_file[identifier] = (seq_str, scores)
114
+
115
+
116
def _convert_to_string(sequence, as_rna):
    """
    Convert a sequence into the string stored in a FASTQ file.

    Parameters
    ----------
    sequence : object
        The sequence to be converted via ``str()``.
    as_rna : bool
        If true, each ``'T'`` in the string is replaced by ``'U'``.

    Returns
    -------
    seq_str : str
        The string representation of the sequence.
    """
    seq_str = str(sequence)
    return seq_str.replace("T", "U") if as_rna else seq_str