biotite 1.5.0__cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +18 -0
- biotite/application/__init__.py +69 -0
- biotite/application/application.py +276 -0
- biotite/application/autodock/__init__.py +12 -0
- biotite/application/autodock/app.py +500 -0
- biotite/application/blast/__init__.py +14 -0
- biotite/application/blast/alignment.py +92 -0
- biotite/application/blast/webapp.py +428 -0
- biotite/application/clustalo/__init__.py +12 -0
- biotite/application/clustalo/app.py +223 -0
- biotite/application/dssp/__init__.py +12 -0
- biotite/application/dssp/app.py +216 -0
- biotite/application/localapp.py +342 -0
- biotite/application/mafft/__init__.py +12 -0
- biotite/application/mafft/app.py +116 -0
- biotite/application/msaapp.py +363 -0
- biotite/application/muscle/__init__.py +13 -0
- biotite/application/muscle/app3.py +227 -0
- biotite/application/muscle/app5.py +163 -0
- biotite/application/sra/__init__.py +18 -0
- biotite/application/sra/app.py +447 -0
- biotite/application/tantan/__init__.py +12 -0
- biotite/application/tantan/app.py +199 -0
- biotite/application/util.py +77 -0
- biotite/application/viennarna/__init__.py +18 -0
- biotite/application/viennarna/rnaalifold.py +310 -0
- biotite/application/viennarna/rnafold.py +254 -0
- biotite/application/viennarna/rnaplot.py +208 -0
- biotite/application/viennarna/util.py +77 -0
- biotite/application/webapp.py +76 -0
- biotite/copyable.py +71 -0
- biotite/database/__init__.py +23 -0
- biotite/database/afdb/__init__.py +12 -0
- biotite/database/afdb/download.py +197 -0
- biotite/database/entrez/__init__.py +15 -0
- biotite/database/entrez/check.py +60 -0
- biotite/database/entrez/dbnames.py +101 -0
- biotite/database/entrez/download.py +228 -0
- biotite/database/entrez/key.py +44 -0
- biotite/database/entrez/query.py +263 -0
- biotite/database/error.py +16 -0
- biotite/database/pubchem/__init__.py +21 -0
- biotite/database/pubchem/download.py +258 -0
- biotite/database/pubchem/error.py +30 -0
- biotite/database/pubchem/query.py +819 -0
- biotite/database/pubchem/throttle.py +98 -0
- biotite/database/rcsb/__init__.py +13 -0
- biotite/database/rcsb/download.py +161 -0
- biotite/database/rcsb/query.py +963 -0
- biotite/database/uniprot/__init__.py +13 -0
- biotite/database/uniprot/check.py +40 -0
- biotite/database/uniprot/download.py +126 -0
- biotite/database/uniprot/query.py +292 -0
- biotite/file.py +244 -0
- biotite/interface/__init__.py +19 -0
- biotite/interface/openmm/__init__.py +20 -0
- biotite/interface/openmm/state.py +93 -0
- biotite/interface/openmm/system.py +227 -0
- biotite/interface/pymol/__init__.py +201 -0
- biotite/interface/pymol/cgo.py +346 -0
- biotite/interface/pymol/convert.py +185 -0
- biotite/interface/pymol/display.py +267 -0
- biotite/interface/pymol/object.py +1228 -0
- biotite/interface/pymol/shapes.py +178 -0
- biotite/interface/pymol/startup.py +169 -0
- biotite/interface/rdkit/__init__.py +19 -0
- biotite/interface/rdkit/mol.py +490 -0
- biotite/interface/version.py +94 -0
- biotite/interface/warning.py +19 -0
- biotite/sequence/__init__.py +84 -0
- biotite/sequence/align/__init__.py +199 -0
- biotite/sequence/align/alignment.py +702 -0
- biotite/sequence/align/banded.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/banded.pyx +652 -0
- biotite/sequence/align/buckets.py +71 -0
- biotite/sequence/align/cigar.py +425 -0
- biotite/sequence/align/kmeralphabet.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmeralphabet.pyx +595 -0
- biotite/sequence/align/kmersimilarity.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmersimilarity.pyx +233 -0
- biotite/sequence/align/kmertable.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmertable.pyx +3411 -0
- biotite/sequence/align/localgapped.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/localgapped.pyx +892 -0
- biotite/sequence/align/localungapped.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/localungapped.pyx +279 -0
- biotite/sequence/align/matrix.py +631 -0
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
- biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
- biotite/sequence/align/matrix_data/GONNET.mat +26 -0
- biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
- biotite/sequence/align/matrix_data/MATCH.mat +25 -0
- biotite/sequence/align/matrix_data/NUC.mat +25 -0
- biotite/sequence/align/matrix_data/PAM10.mat +34 -0
- biotite/sequence/align/matrix_data/PAM100.mat +34 -0
- biotite/sequence/align/matrix_data/PAM110.mat +34 -0
- biotite/sequence/align/matrix_data/PAM120.mat +34 -0
- biotite/sequence/align/matrix_data/PAM130.mat +34 -0
- biotite/sequence/align/matrix_data/PAM140.mat +34 -0
- biotite/sequence/align/matrix_data/PAM150.mat +34 -0
- biotite/sequence/align/matrix_data/PAM160.mat +34 -0
- biotite/sequence/align/matrix_data/PAM170.mat +34 -0
- biotite/sequence/align/matrix_data/PAM180.mat +34 -0
- biotite/sequence/align/matrix_data/PAM190.mat +34 -0
- biotite/sequence/align/matrix_data/PAM20.mat +34 -0
- biotite/sequence/align/matrix_data/PAM200.mat +34 -0
- biotite/sequence/align/matrix_data/PAM210.mat +34 -0
- biotite/sequence/align/matrix_data/PAM220.mat +34 -0
- biotite/sequence/align/matrix_data/PAM230.mat +34 -0
- biotite/sequence/align/matrix_data/PAM240.mat +34 -0
- biotite/sequence/align/matrix_data/PAM250.mat +34 -0
- biotite/sequence/align/matrix_data/PAM260.mat +34 -0
- biotite/sequence/align/matrix_data/PAM270.mat +34 -0
- biotite/sequence/align/matrix_data/PAM280.mat +34 -0
- biotite/sequence/align/matrix_data/PAM290.mat +34 -0
- biotite/sequence/align/matrix_data/PAM30.mat +34 -0
- biotite/sequence/align/matrix_data/PAM300.mat +34 -0
- biotite/sequence/align/matrix_data/PAM310.mat +34 -0
- biotite/sequence/align/matrix_data/PAM320.mat +34 -0
- biotite/sequence/align/matrix_data/PAM330.mat +34 -0
- biotite/sequence/align/matrix_data/PAM340.mat +34 -0
- biotite/sequence/align/matrix_data/PAM350.mat +34 -0
- biotite/sequence/align/matrix_data/PAM360.mat +34 -0
- biotite/sequence/align/matrix_data/PAM370.mat +34 -0
- biotite/sequence/align/matrix_data/PAM380.mat +34 -0
- biotite/sequence/align/matrix_data/PAM390.mat +34 -0
- biotite/sequence/align/matrix_data/PAM40.mat +34 -0
- biotite/sequence/align/matrix_data/PAM400.mat +34 -0
- biotite/sequence/align/matrix_data/PAM410.mat +34 -0
- biotite/sequence/align/matrix_data/PAM420.mat +34 -0
- biotite/sequence/align/matrix_data/PAM430.mat +34 -0
- biotite/sequence/align/matrix_data/PAM440.mat +34 -0
- biotite/sequence/align/matrix_data/PAM450.mat +34 -0
- biotite/sequence/align/matrix_data/PAM460.mat +34 -0
- biotite/sequence/align/matrix_data/PAM470.mat +34 -0
- biotite/sequence/align/matrix_data/PAM480.mat +34 -0
- biotite/sequence/align/matrix_data/PAM490.mat +34 -0
- biotite/sequence/align/matrix_data/PAM50.mat +34 -0
- biotite/sequence/align/matrix_data/PAM500.mat +34 -0
- biotite/sequence/align/matrix_data/PAM60.mat +34 -0
- biotite/sequence/align/matrix_data/PAM70.mat +34 -0
- biotite/sequence/align/matrix_data/PAM80.mat +34 -0
- biotite/sequence/align/matrix_data/PAM90.mat +34 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
- biotite/sequence/align/multiple.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/multiple.pyx +619 -0
- biotite/sequence/align/pairwise.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/pairwise.pyx +585 -0
- biotite/sequence/align/permutation.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/permutation.pyx +313 -0
- biotite/sequence/align/primes.txt +821 -0
- biotite/sequence/align/selector.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/selector.pyx +954 -0
- biotite/sequence/align/statistics.py +264 -0
- biotite/sequence/align/tracetable.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/tracetable.pxd +64 -0
- biotite/sequence/align/tracetable.pyx +370 -0
- biotite/sequence/alphabet.py +555 -0
- biotite/sequence/annotation.py +836 -0
- biotite/sequence/codec.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/codec.pyx +155 -0
- biotite/sequence/codon.py +476 -0
- biotite/sequence/codon_tables.txt +202 -0
- biotite/sequence/graphics/__init__.py +33 -0
- biotite/sequence/graphics/alignment.py +1101 -0
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/autumn.json +51 -0
- biotite/sequence/graphics/color_schemes/blossom.json +51 -0
- biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
- biotite/sequence/graphics/color_schemes/flower.json +51 -0
- biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
- biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
- biotite/sequence/graphics/color_schemes/ocean.json +51 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
- biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
- biotite/sequence/graphics/color_schemes/spring.json +51 -0
- biotite/sequence/graphics/color_schemes/sunset.json +51 -0
- biotite/sequence/graphics/color_schemes/wither.json +51 -0
- biotite/sequence/graphics/colorschemes.py +170 -0
- biotite/sequence/graphics/dendrogram.py +231 -0
- biotite/sequence/graphics/features.py +544 -0
- biotite/sequence/graphics/logo.py +102 -0
- biotite/sequence/graphics/plasmid.py +712 -0
- biotite/sequence/io/__init__.py +12 -0
- biotite/sequence/io/fasta/__init__.py +22 -0
- biotite/sequence/io/fasta/convert.py +283 -0
- biotite/sequence/io/fasta/file.py +265 -0
- biotite/sequence/io/fastq/__init__.py +19 -0
- biotite/sequence/io/fastq/convert.py +117 -0
- biotite/sequence/io/fastq/file.py +507 -0
- biotite/sequence/io/genbank/__init__.py +17 -0
- biotite/sequence/io/genbank/annotation.py +269 -0
- biotite/sequence/io/genbank/file.py +573 -0
- biotite/sequence/io/genbank/metadata.py +336 -0
- biotite/sequence/io/genbank/sequence.py +173 -0
- biotite/sequence/io/general.py +201 -0
- biotite/sequence/io/gff/__init__.py +26 -0
- biotite/sequence/io/gff/convert.py +128 -0
- biotite/sequence/io/gff/file.py +449 -0
- biotite/sequence/phylo/__init__.py +36 -0
- biotite/sequence/phylo/nj.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/nj.pyx +221 -0
- biotite/sequence/phylo/tree.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/tree.pyx +1169 -0
- biotite/sequence/phylo/upgma.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/upgma.pyx +164 -0
- biotite/sequence/profile.py +561 -0
- biotite/sequence/search.py +117 -0
- biotite/sequence/seqtypes.py +720 -0
- biotite/sequence/sequence.py +373 -0
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +135 -0
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +109 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +170 -0
- biotite/structure/alphabet/unkerasify.py +128 -0
- biotite/structure/atoms.py +1562 -0
- biotite/structure/basepairs.py +1403 -0
- biotite/structure/bonds.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/structure/bonds.pyx +2036 -0
- biotite/structure/box.py +724 -0
- biotite/structure/celllist.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/structure/celllist.pyx +864 -0
- biotite/structure/chains.py +310 -0
- biotite/structure/charges.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/structure/charges.pyx +520 -0
- biotite/structure/compare.py +683 -0
- biotite/structure/density.py +109 -0
- biotite/structure/dotbracket.py +213 -0
- biotite/structure/error.py +39 -0
- biotite/structure/filter.py +591 -0
- biotite/structure/geometry.py +817 -0
- biotite/structure/graphics/__init__.py +13 -0
- biotite/structure/graphics/atoms.py +243 -0
- biotite/structure/graphics/rna.py +298 -0
- biotite/structure/hbond.py +425 -0
- biotite/structure/info/__init__.py +24 -0
- biotite/structure/info/atom_masses.json +121 -0
- biotite/structure/info/atoms.py +98 -0
- biotite/structure/info/bonds.py +149 -0
- biotite/structure/info/ccd.py +200 -0
- biotite/structure/info/components.bcif +0 -0
- biotite/structure/info/groups.py +128 -0
- biotite/structure/info/masses.py +121 -0
- biotite/structure/info/misc.py +137 -0
- biotite/structure/info/radii.py +267 -0
- biotite/structure/info/standardize.py +185 -0
- biotite/structure/integrity.py +213 -0
- biotite/structure/io/__init__.py +29 -0
- biotite/structure/io/dcd/__init__.py +13 -0
- biotite/structure/io/dcd/file.py +67 -0
- biotite/structure/io/general.py +243 -0
- biotite/structure/io/gro/__init__.py +14 -0
- biotite/structure/io/gro/file.py +343 -0
- biotite/structure/io/mol/__init__.py +20 -0
- biotite/structure/io/mol/convert.py +112 -0
- biotite/structure/io/mol/ctab.py +420 -0
- biotite/structure/io/mol/header.py +120 -0
- biotite/structure/io/mol/mol.py +149 -0
- biotite/structure/io/mol/sdf.py +940 -0
- biotite/structure/io/netcdf/__init__.py +13 -0
- biotite/structure/io/netcdf/file.py +64 -0
- biotite/structure/io/pdb/__init__.py +20 -0
- biotite/structure/io/pdb/convert.py +389 -0
- biotite/structure/io/pdb/file.py +1380 -0
- biotite/structure/io/pdb/hybrid36.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/structure/io/pdb/hybrid36.pyx +242 -0
- biotite/structure/io/pdbqt/__init__.py +15 -0
- biotite/structure/io/pdbqt/convert.py +113 -0
- biotite/structure/io/pdbqt/file.py +688 -0
- biotite/structure/io/pdbx/__init__.py +23 -0
- biotite/structure/io/pdbx/bcif.py +674 -0
- biotite/structure/io/pdbx/cif.py +1091 -0
- biotite/structure/io/pdbx/component.py +251 -0
- biotite/structure/io/pdbx/compress.py +362 -0
- biotite/structure/io/pdbx/convert.py +2113 -0
- biotite/structure/io/pdbx/encoding.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/structure/io/pdbx/encoding.pyx +1078 -0
- biotite/structure/io/trajfile.py +696 -0
- biotite/structure/io/trr/__init__.py +13 -0
- biotite/structure/io/trr/file.py +43 -0
- biotite/structure/io/util.py +38 -0
- biotite/structure/io/xtc/__init__.py +13 -0
- biotite/structure/io/xtc/file.py +43 -0
- biotite/structure/mechanics.py +72 -0
- biotite/structure/molecules.py +337 -0
- biotite/structure/pseudoknots.py +622 -0
- biotite/structure/rdf.py +245 -0
- biotite/structure/repair.py +302 -0
- biotite/structure/residues.py +716 -0
- biotite/structure/rings.py +451 -0
- biotite/structure/sasa.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/structure/sasa.pyx +322 -0
- biotite/structure/segments.py +328 -0
- biotite/structure/sequence.py +110 -0
- biotite/structure/spacegroups.json +1567 -0
- biotite/structure/spacegroups.license +26 -0
- biotite/structure/sse.py +306 -0
- biotite/structure/superimpose.py +511 -0
- biotite/structure/tm.py +581 -0
- biotite/structure/transform.py +736 -0
- biotite/structure/util.py +160 -0
- biotite/version.py +34 -0
- biotite/visualize.py +375 -0
- biotite-1.5.0.dist-info/METADATA +162 -0
- biotite-1.5.0.dist-info/RECORD +354 -0
- biotite-1.5.0.dist-info/WHEEL +6 -0
- biotite-1.5.0.dist-info/licenses/LICENSE.rst +30 -0
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
__name__ = "biotite.sequence.io.fastq"
|
|
6
|
+
__author__ = "Patrick Kunzmann"
|
|
7
|
+
|
|
8
|
+
from collections import OrderedDict
|
|
9
|
+
from biotite.sequence.seqtypes import NucleotideSequence
|
|
10
|
+
|
|
11
|
+
__all__ = ["get_sequence", "get_sequences", "set_sequence", "set_sequences"]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def get_sequence(fastq_file, header=None):
    """
    Extract a single sequence together with its quality scores from a
    `FastqFile` instance.

    Parameters
    ----------
    fastq_file : FastqFile
        The `FastqFile` to read the entry from.
    header : str, optional
        The identifier of the entry to be returned.
        If omitted, the first entry of the file is used.

    Returns
    -------
    sequence : NucleotideSequence
        The sequence of the selected entry.
    scores : ndarray, dtype=int
        The quality scores of the selected entry.

    Raises
    ------
    ValueError
        If no `header` is given and the file does not contain any entry.
    """
    if header is None:
        # No identifier requested:
        # fall back to the first (and probably only) entry of the file
        seq_str, scores = next(iter(fastq_file.values()), (None, None))
        if seq_str is None:
            raise ValueError("File does not contain any sequences")
    else:
        seq_str, scores = fastq_file[header]
    # 'U' and 'X' are mapped to 'T' and 'N' before the sequence is created
    dna_str = seq_str.replace("U", "T").replace("X", "N")
    return NucleotideSequence(dna_str), scores
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def get_sequences(fastq_file):
    """
    Convert all entries of a `FastqFile` instance into a dictionary
    that maps each identifier to a sequence-score-tuple.

    Parameters
    ----------
    fastq_file : FastqFile
        The `FastqFile` to be accessed.

    Returns
    -------
    seq_dict : dict
        A dictionary containing identifiers as keys and
        (`NucleotideSequence`, `ndarray`) tuples as values.
        The entries appear in the iteration order of `fastq_file`.
    """

    def _to_entry(seq_str, scores):
        # 'U' and 'X' are mapped to 'T' and 'N' before the sequence is created
        return NucleotideSequence(seq_str.replace("U", "T").replace("X", "N")), scores

    return OrderedDict(
        (header, _to_entry(seq_str, scores))
        for header, (seq_str, scores) in fastq_file.items()
    )
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def set_sequence(fastq_file, sequence, scores, header=None, as_rna=False):
    """
    Store one sequence and its quality score array in a `FastqFile`
    instance.

    Parameters
    ----------
    fastq_file : FastqFile
        The `FastqFile` to be accessed.
    sequence : NucleotideSequence
        The sequence to be set.
    scores : ndarray, dtype=int
        The quality scores to be set.
    header : str, optional
        The identifier for the sequence. Default is 'sequence'.
    as_rna : bool, optional
        If set to true, the sequence symbol ``'T'`` will be replaced
        by ``'U'``.
    """
    # Fall back to the generic identifier if the caller did not give one
    identifier = "sequence" if header is None else header
    fastq_file[identifier] = (_convert_to_string(sequence, as_rna), scores)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def set_sequences(fastq_file, sequence_dict, as_rna=False):
    """
    Store all entries of a dictionary in a `FastqFile` instance.

    Parameters
    ----------
    fastq_file : FastqFile
        The `FastqFile` to be accessed.
    sequence_dict : dict
        A dictionary containing the sequences and scores to be set.
        Identifiers are keys,
        (`NucleotideSequence`, `ndarray`) tuples are values.
    as_rna : bool, optional
        If set to true, the sequence symbol ``'T'`` will be replaced
        by ``'U'``.
    """
    for header, entry in sequence_dict.items():
        sequence, scores = entry
        fastq_file[header] = (_convert_to_string(sequence, as_rna), scores)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _convert_to_string(sequence, as_rna):
|
|
114
|
+
if as_rna:
|
|
115
|
+
return str(sequence).replace("T", "U")
|
|
116
|
+
else:
|
|
117
|
+
return str(sequence)
|
|
@@ -0,0 +1,507 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
__name__ = "biotite.sequence.io.fastq"
|
|
6
|
+
__author__ = "Patrick Kunzmann"
|
|
7
|
+
|
|
8
|
+
from collections import OrderedDict
|
|
9
|
+
from collections.abc import MutableMapping
|
|
10
|
+
from numbers import Integral
|
|
11
|
+
import numpy as np
|
|
12
|
+
from biotite.file import InvalidFileError, TextFile, wrap_string
|
|
13
|
+
|
|
14
|
+
__all__ = ["FastqFile"]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Offset values for the named quality score formats.
# Presumably looked up when `offset` is given as format name instead of a
# number - the consuming function is not visible here, confirm there.
_OFFSETS = {
    "Sanger": 33,
    "Solexa": 64,
    "Illumina-1.3": 64,
    "Illumina-1.5": 64,
    "Illumina-1.8": 33,
}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class FastqFile(TextFile, MutableMapping):
|
|
27
|
+
"""
|
|
28
|
+
This class represents a file in FASTQ format.
|
|
29
|
+
|
|
30
|
+
A FASTQ file stores one or multiple sequences (base calls) along
|
|
31
|
+
with sequencing quality scores.
|
|
32
|
+
Each sequence is associated with an identifier string,
|
|
33
|
+
beginning with an ``@``.
|
|
34
|
+
|
|
35
|
+
The quality scores are encoded as ASCII characters,
|
|
36
|
+
with each actual score being the ASCII code subtracted by an
|
|
37
|
+
`offset` value.
|
|
38
|
+
The offset is format dependent.
|
|
39
|
+
As the offset is not reliably deducible from the file contents, it
|
|
40
|
+
must be provided explicitly, either as number or format
|
|
41
|
+
(e.g. ``'Illumina-1.8'``).
|
|
42
|
+
|
|
43
|
+
Similar to the :class:`FastaFile` class, this class implements the
|
|
44
|
+
:class:`MutableMapping` interface:
|
|
45
|
+
An identifier string (without the leading ``@``) is used as index
|
|
46
|
+
to get and set the corresponding sequence and quality.
|
|
47
|
+
``del`` removes an entry in the file.
|
|
48
|
+
|
|
49
|
+
Parameters
|
|
50
|
+
----------
|
|
51
|
+
offset : int or {'Sanger', 'Solexa', 'Illumina-1.3', 'Illumina-1.5', 'Illumina-1.8'}
|
|
52
|
+
This value is added to the quality score to obtain the
|
|
53
|
+
ASCII code.
|
|
54
|
+
Can either be directly the value, or a string that indicates
|
|
55
|
+
the score format.
|
|
56
|
+
chars_per_line : int, optional
|
|
57
|
+
The number of characters in a line containing sequence data
|
|
58
|
+
after which a line break is inserted.
|
|
59
|
+
Only relevant, when adding sequences to a file.
|
|
60
|
+
By default each sequence (and score string)
|
|
61
|
+
is put into one line.
|
|
62
|
+
|
|
63
|
+
Examples
|
|
64
|
+
--------
|
|
65
|
+
|
|
66
|
+
>>> import os.path
|
|
67
|
+
>>> file = FastqFile(offset="Sanger")
|
|
68
|
+
>>> file["seq1"] = str(NucleotideSequence("ATACT")), [0,3,10,7,12]
|
|
69
|
+
>>> file["seq2"] = str(NucleotideSequence("TTGTAGG")), [15,13,24,21,28,38,35]
|
|
70
|
+
>>> print(file)
|
|
71
|
+
@seq1
|
|
72
|
+
ATACT
|
|
73
|
+
+
|
|
74
|
+
!$+(-
|
|
75
|
+
@seq2
|
|
76
|
+
TTGTAGG
|
|
77
|
+
+
|
|
78
|
+
0.96=GD
|
|
79
|
+
>>> sequence, scores = file["seq1"]
|
|
80
|
+
>>> print(sequence)
|
|
81
|
+
ATACT
|
|
82
|
+
>>> print(scores)
|
|
83
|
+
[ 0 3 10 7 12]
|
|
84
|
+
>>> del file["seq1"]
|
|
85
|
+
>>> print(file)
|
|
86
|
+
@seq2
|
|
87
|
+
TTGTAGG
|
|
88
|
+
+
|
|
89
|
+
0.96=GD
|
|
90
|
+
>>> file.write(os.path.join(path_to_directory, "test.fastq"))
|
|
91
|
+
"""
|
|
92
|
+
|
|
93
|
+
def __init__(self, offset, chars_per_line=None):
    """
    Create an empty FASTQ file representation.

    `offset` is passed through `_convert_offset()` before it is stored.
    `chars_per_line` is stored unchanged.
    """
    super().__init__()
    # Line width used when entries are added; `None` means no wrapping
    self._chars_per_line = chars_per_line
    # Maps each identifier to the tuple
    # (seq_start, seq_stop, score_start, score_stop) of line indices
    self._entries = OrderedDict()
    self._offset = _convert_offset(offset)
|
|
98
|
+
|
|
99
|
+
@classmethod
def read(cls, file, offset, chars_per_line=None):
    """
    Read a FASTQ file.

    Parameters
    ----------
    file : file-like object or str
        The file to be read.
        Alternatively a file path can be supplied.
    offset : int or {'Sanger', 'Solexa', 'Illumina-1.3', 'Illumina-1.5', 'Illumina-1.8'}
        This value is added to the quality score to obtain the
        ASCII code.
        Can either be directly the value, or a string that indicates
        the score format.
    chars_per_line : int, optional
        The number of characters in a line containing sequence data
        after which a line break is inserted.
        Only relevant, when adding sequences to a file.
        By default each sequence (and score string)
        is put into one line.

    Returns
    -------
    file_object : FastqFile
        The parsed file.

    Raises
    ------
    InvalidFileError
        If no line is left after whitespace-only lines are removed.
    """
    file = super().read(file, offset, chars_per_line)
    # Strip leading and trailing whitespace from every line
    # and drop the lines that are empty afterwards
    stripped_lines = (line.strip() for line in file.lines)
    file.lines = [line for line in stripped_lines if len(line) != 0]
    if not file.lines:
        raise InvalidFileError("File is empty")
    file._find_entries()
    return file
|
|
135
|
+
|
|
136
|
+
def get_seq_string(self, identifier):
    """
    Get the string representing the sequence for the specified
    identifier.

    Parameters
    ----------
    identifier : str
        The identifier of the sequence.

    Returns
    -------
    sequence : str
        The sequence corresponding to the identifier.

    Raises
    ------
    IndexError
        If `identifier` is not a string.
    """
    if not isinstance(identifier, str):
        raise IndexError("'FastqFile' only supports identifier strings as keys")
    seq_start, seq_stop, _, _ = self._entries[identifier]
    # A sequence may span multiple lines, which are joined without separator
    return "".join(self.lines[seq_start:seq_stop])
|
|
157
|
+
|
|
158
|
+
def get_quality(self, identifier):
    """
    Get the quality scores for the specified identifier.

    Parameters
    ----------
    identifier : str
        The identifier of the quality scores.

    Returns
    -------
    scores : ndarray, dtype=int
        The quality scores corresponding to the identifier.

    Raises
    ------
    IndexError
        If `identifier` is not a string.
    """
    if not isinstance(identifier, str):
        raise IndexError("'FastqFile' only supports identifier strings as keys")
    _, _, score_start, score_stop = self._entries[identifier]
    # Concatenate the score string from the score lines
    score_str = "".join(self.lines[score_start:score_stop])
    return _score_str_to_scores(score_str, self._offset)
|
|
179
|
+
|
|
180
|
+
def __setitem__(self, identifier, item):
|
|
181
|
+
sequence, scores = item
|
|
182
|
+
if len(sequence) != len(scores):
|
|
183
|
+
raise ValueError(
|
|
184
|
+
f"Sequence has length {len(sequence)}, "
|
|
185
|
+
f"but score length is {len(scores)}"
|
|
186
|
+
)
|
|
187
|
+
if not isinstance(identifier, str):
|
|
188
|
+
raise IndexError("'FastqFile' only supports strings as identifier")
|
|
189
|
+
# Delete lines of entry corresponding to the identifier,
|
|
190
|
+
# if already existing
|
|
191
|
+
if identifier in self:
|
|
192
|
+
del self[identifier]
|
|
193
|
+
|
|
194
|
+
# Create new lines
|
|
195
|
+
# Start with identifier line
|
|
196
|
+
new_lines = ["@" + identifier.replace("\n", "").strip()]
|
|
197
|
+
# Append new lines with sequence string (with line breaks)
|
|
198
|
+
seq_start_i = len(new_lines)
|
|
199
|
+
if self._chars_per_line is None:
|
|
200
|
+
new_lines.append(str(sequence))
|
|
201
|
+
else:
|
|
202
|
+
new_lines += wrap_string(sequence, width=self._chars_per_line)
|
|
203
|
+
seq_stop_i = len(new_lines)
|
|
204
|
+
# Append sequence-score separator
|
|
205
|
+
new_lines += ["+"]
|
|
206
|
+
# Append scores
|
|
207
|
+
score_chars = _scores_to_score_str(scores, self._offset)
|
|
208
|
+
score_start_i = len(new_lines)
|
|
209
|
+
if self._chars_per_line is None:
|
|
210
|
+
new_lines.append(score_chars)
|
|
211
|
+
else:
|
|
212
|
+
new_lines += wrap_string(score_chars, width=self._chars_per_line)
|
|
213
|
+
score_stop_i = len(new_lines)
|
|
214
|
+
|
|
215
|
+
if identifier in self:
|
|
216
|
+
# Delete lines of entry corresponding to the header,
|
|
217
|
+
# if existing
|
|
218
|
+
del self[identifier]
|
|
219
|
+
self.lines += new_lines
|
|
220
|
+
self._find_entries()
|
|
221
|
+
else:
|
|
222
|
+
# Simply append lines
|
|
223
|
+
# Add entry in a more efficient way than '_find_entries()'
|
|
224
|
+
# for this simple case
|
|
225
|
+
self._entries[identifier] = (
|
|
226
|
+
len(self.lines) + seq_start_i,
|
|
227
|
+
len(self.lines) + seq_stop_i,
|
|
228
|
+
len(self.lines) + score_start_i,
|
|
229
|
+
len(self.lines) + score_stop_i,
|
|
230
|
+
)
|
|
231
|
+
self.lines += new_lines
|
|
232
|
+
|
|
233
|
+
def __getitem__(self, identifier):
|
|
234
|
+
return self.get_seq_string(identifier), self.get_quality(identifier)
|
|
235
|
+
|
|
236
|
+
def __delitem__(self, identifier):
|
|
237
|
+
seq_start, seq_stop, score_start, score_stop = self._entries[identifier]
|
|
238
|
+
del self.lines[seq_start - 1 : score_stop]
|
|
239
|
+
del self._entries[identifier]
|
|
240
|
+
self._find_entries()
|
|
241
|
+
|
|
242
|
+
def __len__(self):
    """
    Return the number of entries currently recorded in
    ``self._entries``.
    """
    n_entries = len(self._entries)
    return n_entries
|
|
244
|
+
|
|
245
|
+
def __iter__(self):
    """
    Iterate over the identifiers, i.e. the keys of
    ``self._entries``.
    """
    return iter(self._entries)
|
|
247
|
+
|
|
248
|
+
def __contains__(self, identifer):
    """
    Return whether an entry with the given identifier is recorded in
    ``self._entries``.
    """
    known_entries = self._entries
    return identifer in known_entries
|
|
250
|
+
|
|
251
|
+
def _find_entries(self):
    """
    Rebuild ``self._entries`` by scanning ``self.lines``.

    Each identifier (the text after the leading ``'@'``) is mapped to
    a tuple ``(seq_start, seq_stop, score_start, score_stop)`` of
    indices into ``self.lines``, where the stop indices are
    exclusive.
    The sequence block ends at the first line starting with ``'+'``.
    The score block ends as soon as the accumulated number of score
    characters equals the number of sequence characters.

    Raises
    ------
    InvalidFileError
        If a line outside of an entry does not start with ``'@'``,
        if an entry has more score characters than sequence
        characters or if the last entry is incomplete.
    """
    self._entries = OrderedDict()
    in_sequence = False
    # Record if the parser is currently in a quality score section,
    # as the '@' character at the start of a line may also be a
    # score instead of the start of an identifier
    in_scores = False
    seq_len = 0
    score_len = 0
    seq_start_i = None
    seq_stop_i = None
    score_start_i = None
    identifier = None
    for i, line in enumerate(self.lines):
        # 'startswith()' is used instead of 'line[0]', as an empty
        # line (e.g. an empty sequence) must not raise an IndexError
        if not in_scores and not in_sequence and line.startswith("@"):
            # Identifier line -> the next line is sequence
            identifier = line[1:]
            seq_start_i = i + 1
            in_sequence = True
            # Reset the character counters for the new entry
            seq_len = 0
            score_len = 0
        elif in_sequence:
            if line.startswith("+"):
                # End of sequence, start of scores
                in_sequence = False
                in_scores = True
                seq_stop_i = i
                score_start_i = i + 1
            else:
                # Still in sequence
                seq_len += len(line)
        elif in_scores:
            score_len += len(line)
            if score_len == seq_len:
                # End of scores -> end of entry
                in_scores = False
                self._entries[identifier] = (
                    seq_start_i,
                    seq_stop_i,
                    score_start_i,
                    i + 1,
                )
            elif score_len > seq_len:
                raise InvalidFileError(
                    "The amount of scores is not equal to the sequence "
                    f"length for the sequence in line {seq_start_i + 1} "
                )
            # Otherwise the scores have not ended yet
        else:
            raise InvalidFileError(f"Line {i + 1} in FASTQ file is invalid")
    # At the end of the file, the last sequence or score block
    # must have properly ended
    if in_sequence or in_scores:
        raise InvalidFileError("The last entry in the file is incomplete")
|
|
313
|
+
|
|
314
|
+
@staticmethod
def read_iter(file, offset):
    """
    Create an iterator over each sequence (and corresponding scores)
    of the given FASTQ file.

    Parameters
    ----------
    file : file-like object or str
        The file to be read.
        Alternatively a file path can be supplied.
    offset : int or {'Sanger', 'Solexa', 'Illumina-1.3', 'Illumina-1.5', 'Illumina-1.8'}
        This value is added to the quality score to obtain the
        ASCII code.
        Can either be directly the value, or a string that indicates
        the score format.

    Yields
    ------
    identifier : str
        The identifier of the current sequence.
    sequence : tuple(str, ndarray)
        The current sequence as string and its corresponding quality
        scores as :class:`ndarray`.

    Raises
    ------
    InvalidFileError
        If a line outside of an entry does not start with ``'@'``,
        if an entry has more score characters than sequence
        characters or if the input ends in the middle of an entry.

    Notes
    -----
    This approach gives the same results as
    `FastqFile.read(file, offset).items()`, but is slightly faster
    and much more memory efficient.
    """
    offset = _convert_offset(offset)

    identifier = None
    seq_str_list = []
    score_str_list = []
    in_sequence = False
    # A score line may start with '@' as well, hence the parser state
    # needs to be tracked to recognize identifier lines
    in_scores = False
    seq_len = 0
    score_len = 0

    for line in TextFile.read_iter(file):
        line = line.strip()
        # Ignore empty lines
        if not line:
            continue

        if not in_scores and not in_sequence and line.startswith("@"):
            # Track new entry
            identifier = line[1:]
            in_sequence = True
            # Reset
            seq_len = 0
            score_len = 0
            seq_str_list = []
            score_str_list = []

        elif in_sequence:
            if line.startswith("+"):
                # End of sequence, start of scores
                in_sequence = False
                in_scores = True
            else:
                # Still in sequence
                seq_len += len(line)
                seq_str_list.append(line)

        elif in_scores:
            score_len += len(line)
            score_str_list.append(line)
            if score_len == seq_len:
                # End of scores -> end of entry -> yield this entry
                in_scores = False
                scores = _score_str_to_scores("".join(score_str_list), offset)
                yield identifier, ("".join(seq_str_list), scores)
            elif score_len > seq_len:
                raise InvalidFileError(
                    "The amount of scores is not equal to the sequence length"
                )
            # Otherwise the scores have not ended yet

        else:
            raise InvalidFileError("FASTQ file is invalid")

    # A truncated last entry must not be dropped silently:
    # '_find_entries()' rejects such input in the same way
    if in_sequence or in_scores:
        raise InvalidFileError("The last entry in the file is incomplete")
|
|
400
|
+
|
|
401
|
+
@staticmethod
def write_iter(file, items, offset, chars_per_line=None):
    """
    Write every entry of `items` into the specified `file`.

    Unlike :meth:`write()`, no intermediate :class:`TextFile` holding
    all lines is built: the lines are produced by a generator and
    handed directly to ``TextFile.write_iter()``.
    Hence, this static method may save a large amount of memory if
    a large file should be written, especially if the `items`
    are provided as generator.

    Parameters
    ----------
    file : file-like object or str
        The file to be written to.
        Alternatively a file path can be supplied.
    items : generator or array-like of tuple(str, tuple(str, ndarray))
        The entries to be written into the file.
        Each entry consists of an identifier string and a tuple
        containing a sequence (as string) and a score array.
    offset : int or {'Sanger', 'Solexa', 'Illumina-1.3', 'Illumina-1.5', 'Illumina-1.8'}
        This value is added to the quality score to obtain the
        ASCII code.
        Can either be directly the value, or a string that indicates
        the score format.
    chars_per_line : int, optional
        The number of characters in a line containing sequence data
        after which a line break is inserted.
        By default each sequence (and score string)
        is put into one line.

    Notes
    -----
    This method does not test, whether the given identifiers are
    unambiguous.
    """
    offset = _convert_offset(offset)

    def entry_lines():
        # The entries are validated lazily, i.e. while the lines are
        # consumed by the writer
        for name, (seq, quality) in items:
            n_symbols = len(seq)
            n_scores = len(quality)
            if n_symbols != n_scores:
                raise ValueError(
                    f"Sequence has length {n_symbols}, "
                    f"but score length is {n_scores}"
                )
            if not isinstance(name, str):
                raise IndexError("'FastqFile' only supports strings as identifier")

            # Identifier line
            yield "@" + name.replace("\n", "").strip()

            # Sequence line(s)
            if chars_per_line is None:
                yield str(seq)
            else:
                yield from wrap_string(seq, width=chars_per_line)

            # Sequence-score separator
            yield "+"

            # Score line(s)
            score_text = _scores_to_score_str(quality, offset)
            if chars_per_line is None:
                yield score_text
            else:
                yield from wrap_string(score_text, width=chars_per_line)

    TextFile.write_iter(file, entry_lines())
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
def _score_str_to_scores(score_str, offset):
    """
    Convert an ASCII string into actual score values.

    Each character is interpreted as its ASCII code, from which
    `offset` is subtracted.
    The result is an ``int8`` :class:`ndarray` with one element per
    character.
    """
    # The array shares memory with the mutable bytearray,
    # so the offset can be subtracted in-place
    ascii_codes = bytearray(score_str, "ascii")
    values = np.frombuffer(ascii_codes, dtype=np.int8)
    values -= offset
    return values
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
def _scores_to_score_str(scores, offset):
    """
    Convert score values into an ASCII string.

    `offset` is added to each score and the resulting values are
    interpreted as ASCII codes, one character per score.
    """
    shifted = np.asarray(scores) + offset
    raw_bytes = shifted.astype(np.int8, copy=False).tobytes()
    return raw_bytes.decode("ascii")
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
def _convert_offset(offset_val_or_string):
    """
    Return the numerical offset for the given value.

    An integer is returned unchanged, a string is looked up in
    ``_OFFSETS``.
    Any other type raises a :class:`TypeError`.
    """
    if isinstance(offset_val_or_string, Integral):
        return offset_val_or_string
    if isinstance(offset_val_or_string, str):
        return _OFFSETS[offset_val_or_string]
    type_name = type(offset_val_or_string).__name__
    raise TypeError(
        "The offset must be either an integer or a string "
        f"indicating the format, not {type_name}"
    )
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
This subpackage is used for reading/writing information
|
|
7
|
+
(especially sequence features) from/to files in the *GenBank*
|
|
8
|
+
and *GenPept* format.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
__name__ = "biotite.sequence.io.genbank"
|
|
12
|
+
__author__ = "Patrick Kunzmann"
|
|
13
|
+
|
|
14
|
+
from .annotation import *
|
|
15
|
+
from .file import *
|
|
16
|
+
from .metadata import *
|
|
17
|
+
from .sequence import *
|