biotite 0.41.1__cp310-cp310-macosx_10_16_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (340) hide show
  1. biotite/__init__.py +19 -0
  2. biotite/application/__init__.py +43 -0
  3. biotite/application/application.py +265 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +505 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +83 -0
  8. biotite/application/blast/webapp.py +421 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +238 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +152 -0
  13. biotite/application/localapp.py +306 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +122 -0
  16. biotite/application/msaapp.py +374 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +254 -0
  19. biotite/application/muscle/app5.py +171 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +456 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +222 -0
  24. biotite/application/util.py +59 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +304 -0
  27. biotite/application/viennarna/rnafold.py +269 -0
  28. biotite/application/viennarna/rnaplot.py +187 -0
  29. biotite/application/viennarna/util.py +72 -0
  30. biotite/application/webapp.py +77 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/entrez/__init__.py +15 -0
  34. biotite/database/entrez/check.py +61 -0
  35. biotite/database/entrez/dbnames.py +89 -0
  36. biotite/database/entrez/download.py +223 -0
  37. biotite/database/entrez/key.py +44 -0
  38. biotite/database/entrez/query.py +223 -0
  39. biotite/database/error.py +15 -0
  40. biotite/database/pubchem/__init__.py +21 -0
  41. biotite/database/pubchem/download.py +260 -0
  42. biotite/database/pubchem/error.py +20 -0
  43. biotite/database/pubchem/query.py +827 -0
  44. biotite/database/pubchem/throttle.py +99 -0
  45. biotite/database/rcsb/__init__.py +13 -0
  46. biotite/database/rcsb/download.py +167 -0
  47. biotite/database/rcsb/query.py +959 -0
  48. biotite/database/uniprot/__init__.py +13 -0
  49. biotite/database/uniprot/check.py +32 -0
  50. biotite/database/uniprot/download.py +134 -0
  51. biotite/database/uniprot/query.py +209 -0
  52. biotite/file.py +251 -0
  53. biotite/sequence/__init__.py +73 -0
  54. biotite/sequence/align/__init__.py +49 -0
  55. biotite/sequence/align/alignment.py +658 -0
  56. biotite/sequence/align/banded.cpython-310-darwin.so +0 -0
  57. biotite/sequence/align/banded.pyx +652 -0
  58. biotite/sequence/align/buckets.py +69 -0
  59. biotite/sequence/align/cigar.py +434 -0
  60. biotite/sequence/align/kmeralphabet.cpython-310-darwin.so +0 -0
  61. biotite/sequence/align/kmeralphabet.pyx +574 -0
  62. biotite/sequence/align/kmersimilarity.cpython-310-darwin.so +0 -0
  63. biotite/sequence/align/kmersimilarity.pyx +233 -0
  64. biotite/sequence/align/kmertable.cpython-310-darwin.so +0 -0
  65. biotite/sequence/align/kmertable.pyx +3400 -0
  66. biotite/sequence/align/localgapped.cpython-310-darwin.so +0 -0
  67. biotite/sequence/align/localgapped.pyx +892 -0
  68. biotite/sequence/align/localungapped.cpython-310-darwin.so +0 -0
  69. biotite/sequence/align/localungapped.pyx +279 -0
  70. biotite/sequence/align/matrix.py +405 -0
  71. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  72. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  73. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  74. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  75. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  76. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  77. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  78. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  79. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  80. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  81. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  82. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  83. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  84. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  85. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  86. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  87. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  88. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  89. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  93. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  94. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  95. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  96. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  97. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  98. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  99. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  100. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  101. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  102. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  103. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  104. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  105. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  106. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  107. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  108. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  109. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  110. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  111. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  112. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  113. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  114. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  115. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  116. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  117. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  118. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  119. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  120. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  121. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  122. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  154. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  155. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  156. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  157. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  158. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  159. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  160. biotite/sequence/align/multiple.cpython-310-darwin.so +0 -0
  161. biotite/sequence/align/multiple.pyx +620 -0
  162. biotite/sequence/align/pairwise.cpython-310-darwin.so +0 -0
  163. biotite/sequence/align/pairwise.pyx +587 -0
  164. biotite/sequence/align/permutation.cpython-310-darwin.so +0 -0
  165. biotite/sequence/align/permutation.pyx +305 -0
  166. biotite/sequence/align/primes.txt +821 -0
  167. biotite/sequence/align/selector.cpython-310-darwin.so +0 -0
  168. biotite/sequence/align/selector.pyx +956 -0
  169. biotite/sequence/align/statistics.py +265 -0
  170. biotite/sequence/align/tracetable.cpython-310-darwin.so +0 -0
  171. biotite/sequence/align/tracetable.pxd +64 -0
  172. biotite/sequence/align/tracetable.pyx +370 -0
  173. biotite/sequence/alphabet.py +566 -0
  174. biotite/sequence/annotation.py +829 -0
  175. biotite/sequence/codec.cpython-310-darwin.so +0 -0
  176. biotite/sequence/codec.pyx +155 -0
  177. biotite/sequence/codon.py +466 -0
  178. biotite/sequence/codon_tables.txt +202 -0
  179. biotite/sequence/graphics/__init__.py +33 -0
  180. biotite/sequence/graphics/alignment.py +1034 -0
  181. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  182. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  183. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  184. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  185. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  186. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  187. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  188. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  189. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  190. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  191. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  192. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  193. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  194. biotite/sequence/graphics/color_schemes/pb_flower.json +39 -0
  195. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  196. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  197. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  198. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  199. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  200. biotite/sequence/graphics/colorschemes.py +139 -0
  201. biotite/sequence/graphics/dendrogram.py +184 -0
  202. biotite/sequence/graphics/features.py +510 -0
  203. biotite/sequence/graphics/logo.py +110 -0
  204. biotite/sequence/graphics/plasmid.py +661 -0
  205. biotite/sequence/io/__init__.py +12 -0
  206. biotite/sequence/io/fasta/__init__.py +22 -0
  207. biotite/sequence/io/fasta/convert.py +273 -0
  208. biotite/sequence/io/fasta/file.py +278 -0
  209. biotite/sequence/io/fastq/__init__.py +19 -0
  210. biotite/sequence/io/fastq/convert.py +120 -0
  211. biotite/sequence/io/fastq/file.py +551 -0
  212. biotite/sequence/io/genbank/__init__.py +17 -0
  213. biotite/sequence/io/genbank/annotation.py +277 -0
  214. biotite/sequence/io/genbank/file.py +575 -0
  215. biotite/sequence/io/genbank/metadata.py +324 -0
  216. biotite/sequence/io/genbank/sequence.py +172 -0
  217. biotite/sequence/io/general.py +192 -0
  218. biotite/sequence/io/gff/__init__.py +26 -0
  219. biotite/sequence/io/gff/convert.py +133 -0
  220. biotite/sequence/io/gff/file.py +434 -0
  221. biotite/sequence/phylo/__init__.py +36 -0
  222. biotite/sequence/phylo/nj.cpython-310-darwin.so +0 -0
  223. biotite/sequence/phylo/nj.pyx +221 -0
  224. biotite/sequence/phylo/tree.cpython-310-darwin.so +0 -0
  225. biotite/sequence/phylo/tree.pyx +1169 -0
  226. biotite/sequence/phylo/upgma.cpython-310-darwin.so +0 -0
  227. biotite/sequence/phylo/upgma.pyx +164 -0
  228. biotite/sequence/profile.py +456 -0
  229. biotite/sequence/search.py +116 -0
  230. biotite/sequence/seqtypes.py +556 -0
  231. biotite/sequence/sequence.py +374 -0
  232. biotite/structure/__init__.py +132 -0
  233. biotite/structure/atoms.py +1455 -0
  234. biotite/structure/basepairs.py +1415 -0
  235. biotite/structure/bonds.cpython-310-darwin.so +0 -0
  236. biotite/structure/bonds.pyx +1933 -0
  237. biotite/structure/box.py +592 -0
  238. biotite/structure/celllist.cpython-310-darwin.so +0 -0
  239. biotite/structure/celllist.pyx +849 -0
  240. biotite/structure/chains.py +298 -0
  241. biotite/structure/charges.cpython-310-darwin.so +0 -0
  242. biotite/structure/charges.pyx +520 -0
  243. biotite/structure/compare.py +274 -0
  244. biotite/structure/density.py +114 -0
  245. biotite/structure/dotbracket.py +216 -0
  246. biotite/structure/error.py +31 -0
  247. biotite/structure/filter.py +585 -0
  248. biotite/structure/geometry.py +697 -0
  249. biotite/structure/graphics/__init__.py +13 -0
  250. biotite/structure/graphics/atoms.py +226 -0
  251. biotite/structure/graphics/rna.py +282 -0
  252. biotite/structure/hbond.py +409 -0
  253. biotite/structure/info/__init__.py +25 -0
  254. biotite/structure/info/atom_masses.json +121 -0
  255. biotite/structure/info/atoms.py +82 -0
  256. biotite/structure/info/bonds.py +145 -0
  257. biotite/structure/info/ccd/README.rst +8 -0
  258. biotite/structure/info/ccd/amino_acids.txt +1663 -0
  259. biotite/structure/info/ccd/carbohydrates.txt +1135 -0
  260. biotite/structure/info/ccd/components.bcif +0 -0
  261. biotite/structure/info/ccd/nucleotides.txt +798 -0
  262. biotite/structure/info/ccd.py +95 -0
  263. biotite/structure/info/groups.py +90 -0
  264. biotite/structure/info/masses.py +123 -0
  265. biotite/structure/info/misc.py +144 -0
  266. biotite/structure/info/radii.py +197 -0
  267. biotite/structure/info/standardize.py +196 -0
  268. biotite/structure/integrity.py +268 -0
  269. biotite/structure/io/__init__.py +30 -0
  270. biotite/structure/io/ctab.py +72 -0
  271. biotite/structure/io/dcd/__init__.py +13 -0
  272. biotite/structure/io/dcd/file.py +65 -0
  273. biotite/structure/io/general.py +257 -0
  274. biotite/structure/io/gro/__init__.py +14 -0
  275. biotite/structure/io/gro/file.py +343 -0
  276. biotite/structure/io/mmtf/__init__.py +21 -0
  277. biotite/structure/io/mmtf/assembly.py +214 -0
  278. biotite/structure/io/mmtf/convertarray.cpython-310-darwin.so +0 -0
  279. biotite/structure/io/mmtf/convertarray.pyx +341 -0
  280. biotite/structure/io/mmtf/convertfile.cpython-310-darwin.so +0 -0
  281. biotite/structure/io/mmtf/convertfile.pyx +501 -0
  282. biotite/structure/io/mmtf/decode.cpython-310-darwin.so +0 -0
  283. biotite/structure/io/mmtf/decode.pyx +152 -0
  284. biotite/structure/io/mmtf/encode.cpython-310-darwin.so +0 -0
  285. biotite/structure/io/mmtf/encode.pyx +183 -0
  286. biotite/structure/io/mmtf/file.py +233 -0
  287. biotite/structure/io/mol/__init__.py +20 -0
  288. biotite/structure/io/mol/convert.py +115 -0
  289. biotite/structure/io/mol/ctab.py +414 -0
  290. biotite/structure/io/mol/header.py +116 -0
  291. biotite/structure/io/mol/mol.py +193 -0
  292. biotite/structure/io/mol/sdf.py +916 -0
  293. biotite/structure/io/netcdf/__init__.py +13 -0
  294. biotite/structure/io/netcdf/file.py +63 -0
  295. biotite/structure/io/npz/__init__.py +20 -0
  296. biotite/structure/io/npz/file.py +152 -0
  297. biotite/structure/io/pdb/__init__.py +20 -0
  298. biotite/structure/io/pdb/convert.py +293 -0
  299. biotite/structure/io/pdb/file.py +1240 -0
  300. biotite/structure/io/pdb/hybrid36.cpython-310-darwin.so +0 -0
  301. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  302. biotite/structure/io/pdbqt/__init__.py +15 -0
  303. biotite/structure/io/pdbqt/convert.py +107 -0
  304. biotite/structure/io/pdbqt/file.py +640 -0
  305. biotite/structure/io/pdbx/__init__.py +23 -0
  306. biotite/structure/io/pdbx/bcif.py +648 -0
  307. biotite/structure/io/pdbx/cif.py +1032 -0
  308. biotite/structure/io/pdbx/component.py +246 -0
  309. biotite/structure/io/pdbx/convert.py +1597 -0
  310. biotite/structure/io/pdbx/encoding.cpython-310-darwin.so +0 -0
  311. biotite/structure/io/pdbx/encoding.pyx +950 -0
  312. biotite/structure/io/pdbx/legacy.py +267 -0
  313. biotite/structure/io/tng/__init__.py +13 -0
  314. biotite/structure/io/tng/file.py +46 -0
  315. biotite/structure/io/trajfile.py +710 -0
  316. biotite/structure/io/trr/__init__.py +13 -0
  317. biotite/structure/io/trr/file.py +46 -0
  318. biotite/structure/io/xtc/__init__.py +13 -0
  319. biotite/structure/io/xtc/file.py +46 -0
  320. biotite/structure/mechanics.py +75 -0
  321. biotite/structure/molecules.py +353 -0
  322. biotite/structure/pseudoknots.py +642 -0
  323. biotite/structure/rdf.py +243 -0
  324. biotite/structure/repair.py +253 -0
  325. biotite/structure/residues.py +562 -0
  326. biotite/structure/resutil.py +178 -0
  327. biotite/structure/sasa.cpython-310-darwin.so +0 -0
  328. biotite/structure/sasa.pyx +322 -0
  329. biotite/structure/sequence.py +112 -0
  330. biotite/structure/sse.py +327 -0
  331. biotite/structure/superimpose.py +727 -0
  332. biotite/structure/transform.py +504 -0
  333. biotite/structure/util.py +98 -0
  334. biotite/temp.py +86 -0
  335. biotite/version.py +16 -0
  336. biotite/visualize.py +251 -0
  337. biotite-0.41.1.dist-info/METADATA +187 -0
  338. biotite-0.41.1.dist-info/RECORD +340 -0
  339. biotite-0.41.1.dist-info/WHEEL +4 -0
  340. biotite-0.41.1.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,551 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence.io.fastq"
6
+ __author__ = "Patrick Kunzmann"
7
+
8
+ import warnings
9
+ from numbers import Integral
10
+ from collections import OrderedDict
11
+ from collections.abc import MutableMapping
12
+ import numpy as np
13
+ from ....file import TextFile, InvalidFileError, wrap_string
14
+ from ...seqtypes import NucleotideSequence
15
+
16
+ __all__ = ["FastqFile"]
17
+
18
+
19
+ _OFFSETS = {
20
+ "Sanger" : 33,
21
+ "Solexa" : 64,
22
+ "Illumina-1.3" : 64,
23
+ "Illumina-1.5" : 64,
24
+ "Illumina-1.8" : 33,
25
+ }
26
+
27
+
28
+ class FastqFile(TextFile, MutableMapping):
29
+ """
30
+ This class represents a file in FASTQ format.
31
+
32
+ A FASTQ file stores one or multiple sequences (base calls) along
33
+ with sequencing quality scores.
34
+ Each sequence is associated with an identifier string,
35
+ beginning with an ``@``.
36
+
37
+ The quality scores are encoded as ASCII characters,
38
+ with each actual score being the ASCII code subtracted by an
39
+ `offset` value.
40
+ The offset is format dependent.
41
+ As the offset is not reliably deducible from the file contents, it
42
+ must be provided explicitly, either as number or format
43
+ (e.g. ``'Illumina-1.8'``).
44
+
45
+ Similar to the :class:`FastaFile` class, this class implements the
46
+ :class:`MutableMapping` interface:
47
+ An identifier string (without the leading ``@``) is used as index
48
+ to get and set the corresponding sequence and quality.
49
+ ``del`` removes an entry in the file.
50
+
51
+ Parameters
52
+ ----------
53
+ offset : int or {'Sanger', 'Solexa', 'Illumina-1.3', 'Illumina-1.5', 'Illumina-1.8'}
54
+ This value is added to the quality score to obtain the
55
+ ASCII code.
56
+ Can either be directly the value, or a string that indicates
57
+ the score format.
58
+ chars_per_line : int, optional
59
+ The number of characters in a line containing sequence data
60
+ after which a line break is inserted.
61
+ Only relevant, when adding sequences to a file.
62
+ By default each sequence (and score string)
63
+ is put into one line.
64
+
65
+ Examples
66
+ --------
67
+
68
+ >>> import os.path
69
+ >>> file = FastqFile(offset="Sanger")
70
+ >>> file["seq1"] = str(NucleotideSequence("ATACT")), [0,3,10,7,12]
71
+ >>> file["seq2"] = str(NucleotideSequence("TTGTAGG")), [15,13,24,21,28,38,35]
72
+ >>> print(file)
73
+ @seq1
74
+ ATACT
75
+ +
76
+ !$+(-
77
+ @seq2
78
+ TTGTAGG
79
+ +
80
+ 0.96=GD
81
+ >>> sequence, scores = file["seq1"]
82
+ >>> print(sequence)
83
+ ATACT
84
+ >>> print(scores)
85
+ [ 0 3 10 7 12]
86
+ >>> del file["seq1"]
87
+ >>> print(file)
88
+ @seq2
89
+ TTGTAGG
90
+ +
91
+ 0.96=GD
92
+ >>> file.write(os.path.join(path_to_directory, "test.fastq"))
93
+ """
94
+
95
def __init__(self, offset, chars_per_line=None):
    """
    Create an empty FASTQ file object.

    Parameters
    ----------
    offset : int or str
        The score offset, either as number or as name of the score
        format.
        The value is converted by ``_convert_offset()``.
    chars_per_line : int, optional
        The line width used when new entries are added.
        If omitted, each sequence and score string is put into a
        single line.
    """
    super().__init__()
    # Maps each identifier to the line indices
    # (seq_start, seq_stop, score_start, score_stop) of its entry
    self._entries = OrderedDict()
    self._offset = _convert_offset(offset)
    self._chars_per_line = chars_per_line
100
+
101
@classmethod
def read(cls, file, offset, chars_per_line=None):
    """
    Read a FASTQ file.

    Surrounding whitespace is removed from every line and empty lines
    are discarded, before the entries are indexed.

    Parameters
    ----------
    file : file-like object or str
        The file to be read.
        Alternatively a file path can be supplied.
    offset : int or {'Sanger', 'Solexa', 'Illumina-1.3', 'Illumina-1.5', 'Illumina-1.8'}
        This value is added to the quality score to obtain the
        ASCII code.
        Can either be directly the value, or a string that indicates
        the score format.
    chars_per_line : int, optional
        The number of characters in a line containing sequence data
        after which a line break is inserted.
        Only relevant, when adding sequences to a file.
        By default each sequence (and score string)
        is put into one line.

    Returns
    -------
    file_object : FastqFile
        The parsed file.

    Raises
    ------
    InvalidFileError
        If the file contains no non-empty lines.
    """
    fastq_file = super().read(file, offset, chars_per_line)
    # Strip each line and drop those that are empty afterwards
    stripped_lines = (line.strip() for line in fastq_file.lines)
    fastq_file.lines = [line for line in stripped_lines if line]
    if not fastq_file.lines:
        raise InvalidFileError("File is empty")
    fastq_file._find_entries()
    return fastq_file
137
+
138
def get_sequence(self, identifier):
    """
    Get the sequence for the specified identifier.

    DEPRECATED: Use :meth:`get_seq_string()` or
    :func:`get_sequence()` instead.

    Parameters
    ----------
    identifier : str
        The identifier of the sequence.

    Returns
    -------
    sequence : NucleotideSequence
        The sequence corresponding to the identifier.

    Warns
    -----
    DeprecationWarning
        Always, as this method is deprecated.
    """
    warnings.warn(
        # The separating space and the module name were wrong before:
        # this is the FASTQ module, not the FASTA one
        "'get_sequence()' is deprecated, use the 'get_seq_string()' "
        "method or 'fastq.get_sequence()' function instead",
        DeprecationWarning,
        # Attribute the warning to the caller of this method
        stacklevel=2,
    )
    return NucleotideSequence(self.get_seq_string(identifier))
161
+
162
def get_seq_string(self, identifier):
    """
    Get the sequence of the specified identifier as plain string.

    Parameters
    ----------
    identifier : str
        The identifier of the sequence.

    Returns
    -------
    sequence : str
        The sequence corresponding to the identifier.

    Raises
    ------
    IndexError
        If `identifier` is not a string.
    """
    if isinstance(identifier, str):
        seq_start, seq_stop, _, _ = self._entries[identifier]
        # A sequence may span multiple lines -> join them
        return "".join(self.lines[seq_start:seq_stop])
    raise IndexError(
        "'FastqFile' only supports identifier strings as keys"
    )
186
+
187
def get_quality(self, identifier):
    """
    Get the quality scores for the specified identifier.

    Parameters
    ----------
    identifier : str
        The identifier of the quality scores.

    Returns
    -------
    scores : ndarray, dtype=int
        The quality scores corresponding to the identifier.

    Raises
    ------
    IndexError
        If `identifier` is not a string.
    """
    if not isinstance(identifier, str):
        raise IndexError(
            "'FastqFile' only supports identifier strings as keys"
        )
    _, _, score_start, score_stop = self._entries[identifier]
    # The score string may span multiple lines -> join them
    score_str = "".join(self.lines[score_start:score_stop])
    return _score_str_to_scores(score_str, self._offset)
212
+
213
def __setitem__(self, identifier, item):
    """
    Set the sequence and quality scores for the given identifier.

    An already existing entry with the same identifier is removed and
    the new entry is appended to the end of the file.

    Parameters
    ----------
    identifier : str
        The identifier of the entry (without the leading ``@``).
        Line breaks and surrounding whitespace are removed, so that
        the key of the entry is equal to the identifier that is
        written into the file.
    item : tuple(str, array-like of int)
        The sequence and the corresponding quality scores.
        Both must have the same length.

    Raises
    ------
    ValueError
        If the sequence and the scores differ in length.
    IndexError
        If `identifier` is not a string.
    """
    sequence, scores = item
    if len(sequence) != len(scores):
        raise ValueError(
            f"Sequence has length {len(sequence)}, "
            f"but score length is {len(scores)}"
        )
    if not isinstance(identifier, str):
        raise IndexError(
            "'FastqFile' only supports strings as identifier"
        )
    # Use the same normalized identifier for the written line and for
    # the entry key: '_find_entries()' derives the keys from the lines,
    # so a raw key would silently change on the next re-indexing
    identifier = identifier.replace("\n", "").strip()

    # Build all new lines before the file is modified, so that a
    # failing score conversion does not remove an existing entry
    seq_str = str(sequence)
    score_str = _scores_to_score_str(scores, self._offset)
    if self._chars_per_line is None:
        seq_lines = [seq_str]
        score_lines = [score_str]
    else:
        # NOTE(review): the sequence is converted to a string before
        # wrapping, as done in the unwrapped case - confirm that no
        # caller relies on passing a non-string to 'wrap_string()'
        seq_lines = list(
            wrap_string(seq_str, width=self._chars_per_line)
        )
        score_lines = list(
            wrap_string(score_str, width=self._chars_per_line)
        )

    # Replace an existing entry:
    # '__delitem__()' also re-indexes the remaining entries
    if identifier in self:
        del self[identifier]

    # The entry is appended, hence its line indices can be computed
    # directly, which is cheaper than calling '_find_entries()'
    seq_start = len(self.lines) + 1  # +1 for the identifier line
    seq_stop = seq_start + len(seq_lines)
    score_start = seq_stop + 1  # +1 for the '+' separator line
    score_stop = score_start + len(score_lines)
    self._entries[identifier] = (
        seq_start, seq_stop, score_start, score_stop
    )
    self.lines += ["@" + identifier, *seq_lines, "+", *score_lines]
267
+
268
+ def __getitem__(self, identifier):
269
+ return self.get_seq_string(identifier), self.get_quality(identifier)
270
+
271
+ def __delitem__(self, identifier):
272
+ seq_start, seq_stop, score_start, score_stop \
273
+ = self._entries[identifier]
274
+ del self.lines[seq_start-1 : score_stop]
275
+ del self._entries[identifier]
276
+ self._find_entries()
277
+
278
+ def __len__(self):
279
+ return len(self._entries)
280
+
281
+ def __iter__(self):
282
+ return self._entries.__iter__()
283
+
284
+ def __contains__(self, identifer):
285
+ return identifer in self._entries
286
+
287
+ def _find_entries(self):
288
+ self._entries = OrderedDict()
289
+ in_sequence = False
290
+ # Record if the parser is currently in a quality score section,
291
+ # as the '@' character at the start of a line may also be a
292
+ # score instead of the start of an identifier
293
+ in_scores = False
294
+ seq_len = 0
295
+ score_len = 0
296
+ seq_start_i = None
297
+ seq_stop_i = None
298
+ score_start_i = None
299
+ score_stop_i = None
300
+ identifier = None
301
+ for i, line in enumerate(self.lines):
302
+ if not in_scores and not in_sequence and line[0] == "@":
303
+ # Identifier line
304
+ identifier = line[1:]
305
+ seq_start_i = i+1
306
+ # Next line is sequence
307
+ in_sequence = True
308
+ # Reset
309
+ seq_len = 0
310
+ score_len = 0
311
+ elif in_sequence:
312
+ if line[0] == "+":
313
+ # End of sequence start of scores
314
+ in_sequence = False
315
+ in_scores = True
316
+ seq_stop_i = i
317
+ score_start_i = i+1
318
+ else:
319
+ # Still in sequence
320
+ seq_len += len(line)
321
+ elif in_scores:
322
+ score_len += len(line)
323
+ if score_len < seq_len:
324
+ # Scores have not ended yet
325
+ pass
326
+ elif score_len == seq_len:
327
+ # End of scores
328
+ # -> End of entry
329
+ score_stop_i = i + 1
330
+ in_scores = False
331
+ # Record this entry
332
+ self._entries[identifier] = (
333
+ seq_start_i, seq_stop_i, score_start_i, score_stop_i
334
+ )
335
+ else: # score_len > seq_len
336
+ raise InvalidFileError(
337
+ f"The amount of scores is not equal to the sequence "
338
+ f"length for the sequence in line {seq_start_i+1} "
339
+ )
340
+ else:
341
+ raise InvalidFileError(f"Line {i+1} in FASTQ file is invalid")
342
+ # At the end of the file, the last sequence or score block
343
+ # must have properly ended
344
+ if in_sequence or in_scores:
345
+ raise InvalidFileError("The last entry in the file is incomplete")
346
+
347
+
348
@staticmethod
def read_iter(file, offset):
    """
    Create an iterator over each sequence (and corresponding scores)
    of the given FASTQ file.

    Parameters
    ----------
    file : file-like object or str
        The file to be read.
        Alternatively a file path can be supplied.
    offset : int or {'Sanger', 'Solexa', 'Illumina-1.3', 'Illumina-1.5', 'Illumina-1.8'}
        This value is added to the quality score to obtain the
        ASCII code.
        Can either be directly the value, or a string that indicates
        the score format.

    Yields
    ------
    identifier : str
        The identifier of the current sequence.
    sequence : tuple(str, ndarray)
        The current sequence as string and its corresponding quality
        scores as :class:`ndarray`.

    Raises
    ------
    InvalidFileError
        If a line is found where an identifier line is expected,
        if an entry has more scores than sequence symbols or
        if the file ends within an entry.

    Notes
    -----
    This approach gives the same results as
    `FastqFile.read(file, offset).items()`, but is slightly faster
    and much more memory efficient.
    """
    offset = _convert_offset(offset)

    identifier = None
    seq_str_list = []
    score_str_list = []
    in_sequence = False
    # Record if the parser is currently in a quality score section,
    # as the '@' character at the start of a line may also be a
    # score instead of the start of an identifier
    in_scores = False
    seq_len = 0
    score_len = 0

    for line in TextFile.read_iter(file):
        line = line.strip()
        # Ignore empty lines
        if len(line) == 0:
            continue

        if not in_scores and not in_sequence and line[0] == "@":
            # Track new entry
            identifier = line[1:]
            in_sequence = True
            # Reset
            seq_len = 0
            score_len = 0
            seq_str_list = []
            score_str_list = []

        elif in_sequence:
            if line[0] == "+":
                # End of sequence start of scores
                in_sequence = False
                in_scores = True
            else:
                # Still in sequence
                seq_len += len(line)
                seq_str_list.append(line)

        elif in_scores:
            score_len += len(line)
            score_str_list.append(line)
            if score_len < seq_len:
                # Scores have not ended yet
                pass
            elif score_len == seq_len:
                # End of scores
                # -> End of entry
                in_scores = False
                # yield this entry
                scores = _score_str_to_scores(
                    "".join(score_str_list),
                    offset
                )
                yield identifier, ("".join(seq_str_list), scores)
            else: # score_len > seq_len
                raise InvalidFileError(
                    f"The amount of scores is not equal to the sequence "
                    f"length"
                )

        else:
            raise InvalidFileError(f"FASTQ file is invalid")

    # At the end of the file, the last sequence or score block
    # must have properly ended:
    # Without this check a truncated last entry would be silently
    # dropped, while 'FastqFile.read()' raises an error for such a file
    if in_sequence or in_scores:
        raise InvalidFileError("The last entry in the file is incomplete")
438
+
439
+
440
@staticmethod
def write_iter(file, items, offset, chars_per_line=None):
    """
    Write the given `items` one after another into the specified
    `file`.

    Unlike :meth:`write()`, no intermediate :class:`TextFile` holding
    all lines is created: the lines are produced by a generator and
    handed directly to :meth:`TextFile.write_iter()`.
    Hence, this static method may save a large amount of memory if
    a large file should be written, especially if the `items`
    are provided as generator.

    Parameters
    ----------
    file : file-like object or str
        The file to be written to.
        Alternatively a file path can be supplied.
    items : generator or array-like of tuple(str, tuple(str, ndarray))
        The entries to be written into the file.
        Each entry consists of an identifier string and a tuple
        containing a sequence (as string) and a score array.
    offset : int or {'Sanger', 'Solexa', 'Illumina-1.3', 'Illumina-1.5', 'Illumina-1.8'}
        This value is added to the quality score to obtain the
        ASCII code.
        Can either be directly the value, or a string that indicates
        the score format.
    chars_per_line : int, optional
        The number characters in a line containing sequence data
        after which a line break is inserted.
        By default each sequence (and score string)
        is put into one line.

    Raises
    ------
    ValueError
        If the sequence and the score array of an entry differ in
        length.
    IndexError
        If an identifier is not a string.

    Notes
    -----
    This method does not test, whether the given identifiers are
    unambiguous.
    The entries are validated lazily, i.e. while the lines are being
    generated for writing.
    """
    offset = _convert_offset(offset)
    wrap_lines = chars_per_line is not None

    def line_generator():
        for identifier, (sequence, scores) in items:
            n_symbols = len(sequence)
            n_scores = len(scores)
            if n_symbols != n_scores:
                raise ValueError(
                    f"Sequence has length {n_symbols}, "
                    f"but score length is {n_scores}"
                )
            if not isinstance(identifier, str):
                raise IndexError(
                    "'FastqFile' only supports strings as identifier"
                )

            # Header line: line breaks inside the identifier would
            # break the entry structure, so they are removed
            yield "@" + identifier.replace("\n", "").strip()

            # Sequence, optionally split into multiple lines
            if wrap_lines:
                yield from wrap_string(sequence, width=chars_per_line)
            else:
                yield str(sequence)

            # Separator between sequence and scores
            yield "+"

            # Scores, encoded as characters and split the same way
            score_chars = _scores_to_score_str(scores, offset)
            if wrap_lines:
                yield from wrap_string(score_chars, width=chars_per_line)
            else:
                yield score_chars

    TextFile.write_iter(file, line_generator())
516
+
517
+
518
+ def _score_str_to_scores(score_str, offset):
519
+ """
520
+ Convert an ASCII string into actual score values.
521
+ """
522
+ scores = np.frombuffer(
523
+ bytearray(
524
+ score_str, encoding="ascii"
525
+ ),
526
+ dtype=np.int8
527
+ )
528
+ scores -= offset
529
+ return scores
530
+
531
+ def _scores_to_score_str(scores, offset):
532
+ """
533
+ Convert score values into an ASCII string.
534
+ """
535
+ scores = np.asarray(scores) + offset
536
+ return scores.astype(np.int8, copy=False).tobytes().decode("ascii")
537
+
538
+ def _convert_offset(offset_val_or_string):
539
+ """
540
+ If the given offset is a string return the corresponding numerical
541
+ value.
542
+ """
543
+ if isinstance(offset_val_or_string, Integral):
544
+ return offset_val_or_string
545
+ elif isinstance(offset_val_or_string, str):
546
+ return _OFFSETS[offset_val_or_string]
547
+ else:
548
+ raise TypeError(
549
+ f"The offset must be either an integer or a string "
550
+ f"indicating the format, not {type(offset_val_or_string).__name__}"
551
+ )
@@ -0,0 +1,17 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ """
6
+ This subpackage is used for reading/writing information
7
+ (especially sequence features) from/to files in the *GenBank*
8
+ and *GenPept* format.
9
+ """
10
+
11
+ __name__ = "biotite.sequence.io.genbank"
12
+ __author__ = "Patrick Kunzmann"
13
+
14
+ from .file import *
15
+ from .annotation import *
16
+ from .sequence import *
17
+ from .metadata import *