biotite 0.41.1__cp311-cp311-macosx_10_16_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +19 -0
- biotite/application/__init__.py +43 -0
- biotite/application/application.py +265 -0
- biotite/application/autodock/__init__.py +12 -0
- biotite/application/autodock/app.py +505 -0
- biotite/application/blast/__init__.py +14 -0
- biotite/application/blast/alignment.py +83 -0
- biotite/application/blast/webapp.py +421 -0
- biotite/application/clustalo/__init__.py +12 -0
- biotite/application/clustalo/app.py +238 -0
- biotite/application/dssp/__init__.py +12 -0
- biotite/application/dssp/app.py +152 -0
- biotite/application/localapp.py +306 -0
- biotite/application/mafft/__init__.py +12 -0
- biotite/application/mafft/app.py +122 -0
- biotite/application/msaapp.py +374 -0
- biotite/application/muscle/__init__.py +13 -0
- biotite/application/muscle/app3.py +254 -0
- biotite/application/muscle/app5.py +171 -0
- biotite/application/sra/__init__.py +18 -0
- biotite/application/sra/app.py +456 -0
- biotite/application/tantan/__init__.py +12 -0
- biotite/application/tantan/app.py +222 -0
- biotite/application/util.py +59 -0
- biotite/application/viennarna/__init__.py +18 -0
- biotite/application/viennarna/rnaalifold.py +304 -0
- biotite/application/viennarna/rnafold.py +269 -0
- biotite/application/viennarna/rnaplot.py +187 -0
- biotite/application/viennarna/util.py +72 -0
- biotite/application/webapp.py +77 -0
- biotite/copyable.py +71 -0
- biotite/database/__init__.py +23 -0
- biotite/database/entrez/__init__.py +15 -0
- biotite/database/entrez/check.py +61 -0
- biotite/database/entrez/dbnames.py +89 -0
- biotite/database/entrez/download.py +223 -0
- biotite/database/entrez/key.py +44 -0
- biotite/database/entrez/query.py +223 -0
- biotite/database/error.py +15 -0
- biotite/database/pubchem/__init__.py +21 -0
- biotite/database/pubchem/download.py +260 -0
- biotite/database/pubchem/error.py +20 -0
- biotite/database/pubchem/query.py +827 -0
- biotite/database/pubchem/throttle.py +99 -0
- biotite/database/rcsb/__init__.py +13 -0
- biotite/database/rcsb/download.py +167 -0
- biotite/database/rcsb/query.py +959 -0
- biotite/database/uniprot/__init__.py +13 -0
- biotite/database/uniprot/check.py +32 -0
- biotite/database/uniprot/download.py +134 -0
- biotite/database/uniprot/query.py +209 -0
- biotite/file.py +251 -0
- biotite/sequence/__init__.py +73 -0
- biotite/sequence/align/__init__.py +49 -0
- biotite/sequence/align/alignment.py +658 -0
- biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
- biotite/sequence/align/banded.pyx +652 -0
- biotite/sequence/align/buckets.py +69 -0
- biotite/sequence/align/cigar.py +434 -0
- biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
- biotite/sequence/align/kmeralphabet.pyx +574 -0
- biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
- biotite/sequence/align/kmersimilarity.pyx +233 -0
- biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
- biotite/sequence/align/kmertable.pyx +3400 -0
- biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
- biotite/sequence/align/localgapped.pyx +892 -0
- biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
- biotite/sequence/align/localungapped.pyx +279 -0
- biotite/sequence/align/matrix.py +405 -0
- biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
- biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
- biotite/sequence/align/matrix_data/GONNET.mat +26 -0
- biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
- biotite/sequence/align/matrix_data/MATCH.mat +25 -0
- biotite/sequence/align/matrix_data/NUC.mat +25 -0
- biotite/sequence/align/matrix_data/PAM10.mat +34 -0
- biotite/sequence/align/matrix_data/PAM100.mat +34 -0
- biotite/sequence/align/matrix_data/PAM110.mat +34 -0
- biotite/sequence/align/matrix_data/PAM120.mat +34 -0
- biotite/sequence/align/matrix_data/PAM130.mat +34 -0
- biotite/sequence/align/matrix_data/PAM140.mat +34 -0
- biotite/sequence/align/matrix_data/PAM150.mat +34 -0
- biotite/sequence/align/matrix_data/PAM160.mat +34 -0
- biotite/sequence/align/matrix_data/PAM170.mat +34 -0
- biotite/sequence/align/matrix_data/PAM180.mat +34 -0
- biotite/sequence/align/matrix_data/PAM190.mat +34 -0
- biotite/sequence/align/matrix_data/PAM20.mat +34 -0
- biotite/sequence/align/matrix_data/PAM200.mat +34 -0
- biotite/sequence/align/matrix_data/PAM210.mat +34 -0
- biotite/sequence/align/matrix_data/PAM220.mat +34 -0
- biotite/sequence/align/matrix_data/PAM230.mat +34 -0
- biotite/sequence/align/matrix_data/PAM240.mat +34 -0
- biotite/sequence/align/matrix_data/PAM250.mat +34 -0
- biotite/sequence/align/matrix_data/PAM260.mat +34 -0
- biotite/sequence/align/matrix_data/PAM270.mat +34 -0
- biotite/sequence/align/matrix_data/PAM280.mat +34 -0
- biotite/sequence/align/matrix_data/PAM290.mat +34 -0
- biotite/sequence/align/matrix_data/PAM30.mat +34 -0
- biotite/sequence/align/matrix_data/PAM300.mat +34 -0
- biotite/sequence/align/matrix_data/PAM310.mat +34 -0
- biotite/sequence/align/matrix_data/PAM320.mat +34 -0
- biotite/sequence/align/matrix_data/PAM330.mat +34 -0
- biotite/sequence/align/matrix_data/PAM340.mat +34 -0
- biotite/sequence/align/matrix_data/PAM350.mat +34 -0
- biotite/sequence/align/matrix_data/PAM360.mat +34 -0
- biotite/sequence/align/matrix_data/PAM370.mat +34 -0
- biotite/sequence/align/matrix_data/PAM380.mat +34 -0
- biotite/sequence/align/matrix_data/PAM390.mat +34 -0
- biotite/sequence/align/matrix_data/PAM40.mat +34 -0
- biotite/sequence/align/matrix_data/PAM400.mat +34 -0
- biotite/sequence/align/matrix_data/PAM410.mat +34 -0
- biotite/sequence/align/matrix_data/PAM420.mat +34 -0
- biotite/sequence/align/matrix_data/PAM430.mat +34 -0
- biotite/sequence/align/matrix_data/PAM440.mat +34 -0
- biotite/sequence/align/matrix_data/PAM450.mat +34 -0
- biotite/sequence/align/matrix_data/PAM460.mat +34 -0
- biotite/sequence/align/matrix_data/PAM470.mat +34 -0
- biotite/sequence/align/matrix_data/PAM480.mat +34 -0
- biotite/sequence/align/matrix_data/PAM490.mat +34 -0
- biotite/sequence/align/matrix_data/PAM50.mat +34 -0
- biotite/sequence/align/matrix_data/PAM500.mat +34 -0
- biotite/sequence/align/matrix_data/PAM60.mat +34 -0
- biotite/sequence/align/matrix_data/PAM70.mat +34 -0
- biotite/sequence/align/matrix_data/PAM80.mat +34 -0
- biotite/sequence/align/matrix_data/PAM90.mat +34 -0
- biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
- biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
- biotite/sequence/align/multiple.pyx +620 -0
- biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
- biotite/sequence/align/pairwise.pyx +587 -0
- biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
- biotite/sequence/align/permutation.pyx +305 -0
- biotite/sequence/align/primes.txt +821 -0
- biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
- biotite/sequence/align/selector.pyx +956 -0
- biotite/sequence/align/statistics.py +265 -0
- biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
- biotite/sequence/align/tracetable.pxd +64 -0
- biotite/sequence/align/tracetable.pyx +370 -0
- biotite/sequence/alphabet.py +566 -0
- biotite/sequence/annotation.py +829 -0
- biotite/sequence/codec.cpython-311-darwin.so +0 -0
- biotite/sequence/codec.pyx +155 -0
- biotite/sequence/codon.py +466 -0
- biotite/sequence/codon_tables.txt +202 -0
- biotite/sequence/graphics/__init__.py +33 -0
- biotite/sequence/graphics/alignment.py +1034 -0
- biotite/sequence/graphics/color_schemes/autumn.json +51 -0
- biotite/sequence/graphics/color_schemes/blossom.json +51 -0
- biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
- biotite/sequence/graphics/color_schemes/flower.json +51 -0
- biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
- biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
- biotite/sequence/graphics/color_schemes/ocean.json +51 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +39 -0
- biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
- biotite/sequence/graphics/color_schemes/spring.json +51 -0
- biotite/sequence/graphics/color_schemes/sunset.json +51 -0
- biotite/sequence/graphics/color_schemes/wither.json +51 -0
- biotite/sequence/graphics/colorschemes.py +139 -0
- biotite/sequence/graphics/dendrogram.py +184 -0
- biotite/sequence/graphics/features.py +510 -0
- biotite/sequence/graphics/logo.py +110 -0
- biotite/sequence/graphics/plasmid.py +661 -0
- biotite/sequence/io/__init__.py +12 -0
- biotite/sequence/io/fasta/__init__.py +22 -0
- biotite/sequence/io/fasta/convert.py +273 -0
- biotite/sequence/io/fasta/file.py +278 -0
- biotite/sequence/io/fastq/__init__.py +19 -0
- biotite/sequence/io/fastq/convert.py +120 -0
- biotite/sequence/io/fastq/file.py +551 -0
- biotite/sequence/io/genbank/__init__.py +17 -0
- biotite/sequence/io/genbank/annotation.py +277 -0
- biotite/sequence/io/genbank/file.py +575 -0
- biotite/sequence/io/genbank/metadata.py +324 -0
- biotite/sequence/io/genbank/sequence.py +172 -0
- biotite/sequence/io/general.py +192 -0
- biotite/sequence/io/gff/__init__.py +26 -0
- biotite/sequence/io/gff/convert.py +133 -0
- biotite/sequence/io/gff/file.py +434 -0
- biotite/sequence/phylo/__init__.py +36 -0
- biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
- biotite/sequence/phylo/nj.pyx +221 -0
- biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
- biotite/sequence/phylo/tree.pyx +1169 -0
- biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
- biotite/sequence/phylo/upgma.pyx +164 -0
- biotite/sequence/profile.py +456 -0
- biotite/sequence/search.py +116 -0
- biotite/sequence/seqtypes.py +556 -0
- biotite/sequence/sequence.py +374 -0
- biotite/structure/__init__.py +132 -0
- biotite/structure/atoms.py +1455 -0
- biotite/structure/basepairs.py +1415 -0
- biotite/structure/bonds.cpython-311-darwin.so +0 -0
- biotite/structure/bonds.pyx +1933 -0
- biotite/structure/box.py +592 -0
- biotite/structure/celllist.cpython-311-darwin.so +0 -0
- biotite/structure/celllist.pyx +849 -0
- biotite/structure/chains.py +298 -0
- biotite/structure/charges.cpython-311-darwin.so +0 -0
- biotite/structure/charges.pyx +520 -0
- biotite/structure/compare.py +274 -0
- biotite/structure/density.py +114 -0
- biotite/structure/dotbracket.py +216 -0
- biotite/structure/error.py +31 -0
- biotite/structure/filter.py +585 -0
- biotite/structure/geometry.py +697 -0
- biotite/structure/graphics/__init__.py +13 -0
- biotite/structure/graphics/atoms.py +226 -0
- biotite/structure/graphics/rna.py +282 -0
- biotite/structure/hbond.py +409 -0
- biotite/structure/info/__init__.py +25 -0
- biotite/structure/info/atom_masses.json +121 -0
- biotite/structure/info/atoms.py +82 -0
- biotite/structure/info/bonds.py +145 -0
- biotite/structure/info/ccd/README.rst +8 -0
- biotite/structure/info/ccd/amino_acids.txt +1663 -0
- biotite/structure/info/ccd/carbohydrates.txt +1135 -0
- biotite/structure/info/ccd/components.bcif +0 -0
- biotite/structure/info/ccd/nucleotides.txt +798 -0
- biotite/structure/info/ccd.py +95 -0
- biotite/structure/info/groups.py +90 -0
- biotite/structure/info/masses.py +123 -0
- biotite/structure/info/misc.py +144 -0
- biotite/structure/info/radii.py +197 -0
- biotite/structure/info/standardize.py +196 -0
- biotite/structure/integrity.py +268 -0
- biotite/structure/io/__init__.py +30 -0
- biotite/structure/io/ctab.py +72 -0
- biotite/structure/io/dcd/__init__.py +13 -0
- biotite/structure/io/dcd/file.py +65 -0
- biotite/structure/io/general.py +257 -0
- biotite/structure/io/gro/__init__.py +14 -0
- biotite/structure/io/gro/file.py +343 -0
- biotite/structure/io/mmtf/__init__.py +21 -0
- biotite/structure/io/mmtf/assembly.py +214 -0
- biotite/structure/io/mmtf/convertarray.cpython-311-darwin.so +0 -0
- biotite/structure/io/mmtf/convertarray.pyx +341 -0
- biotite/structure/io/mmtf/convertfile.cpython-311-darwin.so +0 -0
- biotite/structure/io/mmtf/convertfile.pyx +501 -0
- biotite/structure/io/mmtf/decode.cpython-311-darwin.so +0 -0
- biotite/structure/io/mmtf/decode.pyx +152 -0
- biotite/structure/io/mmtf/encode.cpython-311-darwin.so +0 -0
- biotite/structure/io/mmtf/encode.pyx +183 -0
- biotite/structure/io/mmtf/file.py +233 -0
- biotite/structure/io/mol/__init__.py +20 -0
- biotite/structure/io/mol/convert.py +115 -0
- biotite/structure/io/mol/ctab.py +414 -0
- biotite/structure/io/mol/header.py +116 -0
- biotite/structure/io/mol/mol.py +193 -0
- biotite/structure/io/mol/sdf.py +916 -0
- biotite/structure/io/netcdf/__init__.py +13 -0
- biotite/structure/io/netcdf/file.py +63 -0
- biotite/structure/io/npz/__init__.py +20 -0
- biotite/structure/io/npz/file.py +152 -0
- biotite/structure/io/pdb/__init__.py +20 -0
- biotite/structure/io/pdb/convert.py +293 -0
- biotite/structure/io/pdb/file.py +1240 -0
- biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
- biotite/structure/io/pdb/hybrid36.pyx +242 -0
- biotite/structure/io/pdbqt/__init__.py +15 -0
- biotite/structure/io/pdbqt/convert.py +107 -0
- biotite/structure/io/pdbqt/file.py +640 -0
- biotite/structure/io/pdbx/__init__.py +23 -0
- biotite/structure/io/pdbx/bcif.py +648 -0
- biotite/structure/io/pdbx/cif.py +1032 -0
- biotite/structure/io/pdbx/component.py +246 -0
- biotite/structure/io/pdbx/convert.py +1597 -0
- biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
- biotite/structure/io/pdbx/encoding.pyx +950 -0
- biotite/structure/io/pdbx/legacy.py +267 -0
- biotite/structure/io/tng/__init__.py +13 -0
- biotite/structure/io/tng/file.py +46 -0
- biotite/structure/io/trajfile.py +710 -0
- biotite/structure/io/trr/__init__.py +13 -0
- biotite/structure/io/trr/file.py +46 -0
- biotite/structure/io/xtc/__init__.py +13 -0
- biotite/structure/io/xtc/file.py +46 -0
- biotite/structure/mechanics.py +75 -0
- biotite/structure/molecules.py +353 -0
- biotite/structure/pseudoknots.py +642 -0
- biotite/structure/rdf.py +243 -0
- biotite/structure/repair.py +253 -0
- biotite/structure/residues.py +562 -0
- biotite/structure/resutil.py +178 -0
- biotite/structure/sasa.cpython-311-darwin.so +0 -0
- biotite/structure/sasa.pyx +322 -0
- biotite/structure/sequence.py +112 -0
- biotite/structure/sse.py +327 -0
- biotite/structure/superimpose.py +727 -0
- biotite/structure/transform.py +504 -0
- biotite/structure/util.py +98 -0
- biotite/temp.py +86 -0
- biotite/version.py +16 -0
- biotite/visualize.py +251 -0
- biotite-0.41.1.dist-info/METADATA +187 -0
- biotite-0.41.1.dist-info/RECORD +340 -0
- biotite-0.41.1.dist-info/WHEEL +4 -0
- biotite-0.41.1.dist-info/licenses/LICENSE.rst +30 -0
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
__name__ = "biotite.sequence.io.fasta"
|
|
6
|
+
__author__ = "Patrick Kunzmann"
|
|
7
|
+
|
|
8
|
+
import warnings
|
|
9
|
+
from collections import OrderedDict
|
|
10
|
+
from ...sequence import Sequence
|
|
11
|
+
from ...alphabet import AlphabetError, LetterAlphabet
|
|
12
|
+
from ...seqtypes import NucleotideSequence, ProteinSequence
|
|
13
|
+
from ...align.alignment import Alignment
|
|
14
|
+
|
|
15
|
+
__all__ = ["get_sequence", "get_sequences", "set_sequence", "set_sequences",
|
|
16
|
+
"get_alignment", "set_alignment"]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def get_sequence(fasta_file, header=None, seq_type=None):
    """
    Get a sequence from a :class:`FastaFile` instance.

    If `seq_type` is not given, the type of sequence is guessed from
    the sequence string:
    First, a conversion into a :class:`NucleotideSequence` and
    second a conversion into a :class:`ProteinSequence` is tried.

    Parameters
    ----------
    fasta_file : FastaFile
        The :class:`FastaFile` to be accessed.
    header : str, optional
        The header to get the sequence from. By default, the first
        sequence of the file is returned.
    seq_type : Class, optional
        The :class:`Sequence` subclass contained in the file. If not
        set, biotite will attempt to automatically detect whether a
        nucleotide or protein sequence is present.

    Returns
    -------
    sequence : NucleotideSequence or ProteinSequence
        The requested sequence in the `FastaFile`.
        Without `seq_type`, this is a :class:`NucleotideSequence` if
        the sequence string fits the corresponding alphabet and a
        :class:`ProteinSequence` otherwise.

    Raises
    ------
    ValueError
        If no `header` is given and the file does not contain any
        sequences, or if the sequence data can be neither converted
        into a :class:`NucleotideSequence` nor a
        :class:`ProteinSequence`.
    """
    if header is not None:
        seq_str = fasta_file[header]
    else:
        # Take the first (and probably only) sequence of the file;
        # 'None' marks a file without any entries
        seq_str = next(iter(fasta_file.values()), None)
        if seq_str is None:
            raise ValueError("File does not contain any sequences")
    # The converter either uses the given type or determines it:
    # If a NucleotideSequence can be created it is a DNA sequence,
    # otherwise a protein sequence
    return _convert_to_sequence(seq_str, seq_type)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def get_sequences(fasta_file, seq_type=None):
    """
    Convert all entries of a :class:`FastaFile` instance into
    sequence objects.

    If `seq_type` is not given, the type of each sequence is guessed
    from its sequence string:
    First, a conversion into a :class:`NucleotideSequence` and
    second a conversion into a :class:`ProteinSequence` is tried.

    Parameters
    ----------
    fasta_file : FastaFile
        The :class:`FastaFile` to be accessed.
    seq_type : Class, optional
        The :class:`Sequence` subclass contained in the file. If not
        set, biotite will attempt to automatically detect whether a
        nucleotide or protein sequence is present.

    Returns
    -------
    seq_dict : dict
        An ordered dictionary that maps each header to the
        :class:`NucleotideSequence` or :class:`ProteinSequence`
        created from the corresponding entry.
        The entries keep the iteration order of `fasta_file`.

    Raises
    ------
    ValueError
        If at least one of the sequence strings can be neither
        converted into a :class:`NucleotideSequence` nor a
        :class:`ProteinSequence`.
    """
    # Entries are converted lazily, one after another,
    # in the iteration order of the file
    return OrderedDict(
        (name, _convert_to_sequence(raw_seq, seq_type))
        for name, raw_seq in fasta_file.items()
    )
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def set_sequence(fasta_file, sequence, header=None, as_rna=False):
    """
    Store a single sequence as entry in a :class:`FastaFile` instance.

    Parameters
    ----------
    fasta_file : FastaFile
        The :class:`FastaFile` to be accessed.
    sequence : Sequence
        The sequence to be set.
    header : str, optional
        The header for the sequence. Default is ``'sequence'``.
    as_rna : bool, optional
        If set to true, ``'T'`` will be replaced by ``'U'``,
        if a :class:`NucleotideSequence` was given.

    Raises
    ------
    ValueError
        If the sequence's alphabet uses symbols other than single
        characters.
    """
    # Fall back to the generic header, if none is given
    entry_name = "sequence" if header is None else header
    fasta_file[entry_name] = _convert_to_string(sequence, as_rna)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def set_sequences(fasta_file, sequence_dict, as_rna=False):
    """
    Store multiple sequences as entries in a :class:`FastaFile`
    instance.

    Parameters
    ----------
    fasta_file : FastaFile
        The :class:`FastaFile` to be accessed.
    sequence_dict : dict
        A dictionary containing the sequences to be set.
        Headers are keys, :class:`Sequence` instances are values.
    as_rna : bool, optional
        If set to true, ``'T'`` will be replaced by ``'U'``,
        if a :class:`NucleotideSequence` was given.

    Raises
    ------
    ValueError
        If the alphabet of a sequence uses symbols other than single
        characters.
    """
    for entry_name, seq in sequence_dict.items():
        # Each sequence is converted right before it is stored, so
        # entries preceding an invalid sequence are already written
        seq_str = _convert_to_string(seq, as_rna)
        fasta_file[entry_name] = seq_str
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def get_alignment(fasta_file, additional_gap_chars=("_",), seq_type=None):
    """
    Create an alignment from the gapped sequence strings of a
    :class:`FastaFile` instance.

    Parameters
    ----------
    fasta_file : FastaFile
        The :class:`FastaFile` to be accessed.
    additional_gap_chars : iterable of str, optional
        The strings to be treated as gaps in addition to the default
        gap symbol ``'-'``.
    seq_type : Class, optional
        The :class:`Sequence` subclass contained in the file. If not
        set, biotite will attempt to automatically detect whether a
        nucleotide or protein sequence is present.

    Returns
    -------
    alignment : Alignment
        The alignment from the :class:`FastaFile`.
        Its score is ``None``.
    """
    gapped_strings = list(fasta_file.values())
    # Unify all gap symbols to the default gap symbol ('-')
    for gap_char in additional_gap_chars:
        gapped_strings = [
            gapped.replace(gap_char, "-") for gapped in gapped_strings
        ]
    # The sequence objects themselves must not contain gaps
    ungapped_sequences = []
    for gapped in gapped_strings:
        ungapped = gapped.replace("-", "")
        ungapped_sequences.append(_convert_to_sequence(ungapped, seq_type))
    # The gap positions are solely encoded in the trace
    alignment_trace = Alignment.trace_from_strings(gapped_strings)
    return Alignment(ungapped_sequences, alignment_trace, score=None)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def set_alignment(fasta_file, alignment, seq_names):
    """
    Fill a :class:`FastaFile` with gapped sequence strings from an
    alignment.

    Parameters
    ----------
    fasta_file : FastaFile
        The :class:`FastaFile` to be accessed.
    alignment : Alignment
        The alignment to be set.
    seq_names : iterable object of str
        The names for the sequences in the alignment.
        Must have the same length as the sequence count in `alignment`.
        Any iterable is accepted, including those without a length
        or index access (e.g. generators).

    Raises
    ------
    ValueError
        If the number of names differs from the number of sequences
        in `alignment`.
    """
    gapped_seq_strings = alignment.get_gapped_sequences()
    # Materialize the names, so that iterables without 'len()' or
    # indexing support can be validated and used as well
    seq_names = list(seq_names)
    if len(gapped_seq_strings) != len(seq_names):
        raise ValueError(
            f"Alignment has {len(gapped_seq_strings)} sequences, "
            f"but {len(seq_names)} names were given"
        )
    for name, gapped_seq_string in zip(seq_names, gapped_seq_strings):
        fasta_file[name] = gapped_seq_string
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _convert_to_sequence(seq_str, seq_type=None):
    """
    Convert a sequence string from a FASTA file into a sequence
    object.

    Parameters
    ----------
    seq_str : str
        The sequence string to be converted.
    seq_type : Class, optional
        The class to create the sequence with.
        For :class:`NucleotideSequence` and :class:`ProteinSequence`
        the string is preprocessed before the conversion
        (upper case, substitution of unsupported symbols).
        Any other class is called with the unmodified string.
        If omitted, a conversion into a :class:`NucleotideSequence`
        is tried first and a conversion into a
        :class:`ProteinSequence` second.

    Returns
    -------
    sequence : Sequence
        The converted sequence.

    Raises
    ------
    ValueError
        If `seq_type` is omitted and the string can be neither
        converted into a :class:`NucleotideSequence` nor a
        :class:`ProteinSequence`.

    Warns
    -----
    UserWarning
        If a protein sequence contains selenocysteine (``U``,
        case-insensitive), as it is substituted by cysteine.
    """
    selenocysteine_warning = (
        "ProteinSequence objects do not support selenocysteine (U), "
        "occurrences were substituted by cysteine (C)"
    )

    # Define preprocessing of preimplemented sequence types

    def process_protein_sequence(raw):
        # Replace selenocysteine with cysteine
        # and pyrrolysine with lysine
        return raw.upper().replace("U", "C").replace("O", "K")

    def process_nucleotide_sequence(raw):
        # For nucleotides uracil is represented by thymine and there
        # is only one letter for completely unknown nucleotides
        return raw.upper().replace("U", "T").replace("X", "N")

    def contains_selenocysteine(raw):
        # The preprocessing converts to upper case before the
        # substitution, hence a lower case 'u' is substituted as well
        # and must be reported, too
        return "U" in raw.upper()

    # Use manually selected sequence type
    if seq_type is not None:
        # Do preprocessing as done without manual selection
        if seq_type == NucleotideSequence:
            seq_str = process_nucleotide_sequence(seq_str)
        elif seq_type == ProteinSequence:
            if contains_selenocysteine(seq_str):
                warnings.warn(selenocysteine_warning)
            seq_str = process_protein_sequence(seq_str)
        return seq_type(seq_str)

    # Attempt to automatically determine sequence type
    try:
        return NucleotideSequence(process_nucleotide_sequence(seq_str))
    except AlphabetError:
        pass
    try:
        prot_seq = ProteinSequence(process_protein_sequence(seq_str))
    except AlphabetError:
        raise ValueError("FASTA data cannot be converted either to "
                         "'NucleotideSequence' nor to 'ProteinSequence'")
    # Raise warning only after the conversion into 'ProteinSequence'
    # succeeded, i.e. no 'AlphabetError' was raised
    if contains_selenocysteine(seq_str):
        warnings.warn(selenocysteine_warning)
    return prot_seq
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _convert_to_string(sequence, as_rna):
    """
    Convert a sequence object into the string stored in a FASTA file.

    Parameters
    ----------
    sequence : Sequence
        The sequence to be converted.
    as_rna : bool
        If true and `sequence` is a :class:`NucleotideSequence`,
        ``'T'`` is replaced by ``'U'`` in the returned string.

    Returns
    -------
    seq_str : str
        The string representation of the sequence.

    Raises
    ------
    ValueError
        If the alphabet of the sequence is not a
        :class:`LetterAlphabet`.
    """
    alphabet = sequence.get_alphabet()
    if not isinstance(alphabet, LetterAlphabet):
        raise ValueError("Only sequences using single letter alphabets "
                         "can be stored in a FASTA file")
    seq_str = str(sequence)
    if isinstance(sequence, NucleotideSequence) and as_rna:
        # RNA uses uracil in place of thymine
        seq_str = seq_str.replace("T", "U")
    return seq_str
|
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
__name__ = "biotite.sequence.io.fasta"
|
|
6
|
+
__author__ = "Patrick Kunzmann"
|
|
7
|
+
__all__ = ["FastaFile"]
|
|
8
|
+
|
|
9
|
+
from ....file import TextFile, InvalidFileError, wrap_string
|
|
10
|
+
from collections import OrderedDict
|
|
11
|
+
from collections.abc import MutableMapping
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class FastaFile(TextFile, MutableMapping):
|
|
15
|
+
"""
|
|
16
|
+
This class represents a file in FASTA format.
|
|
17
|
+
|
|
18
|
+
A FASTA file contains so called *header* lines, beginning with
|
|
19
|
+
``>``, that describe following sequence.
|
|
20
|
+
The corresponding sequence starts at the line after the header line
|
|
21
|
+
and ends at the next header line or at the end of file.
|
|
22
|
+
The header along with its sequence forms an entry.
|
|
23
|
+
|
|
24
|
+
This class is used in a dictionary like manner, implementing the
|
|
25
|
+
:class:`MutableMapping` interface:
|
|
26
|
+
Headers (without the leading ``>``) are used as keys,
|
|
27
|
+
and strings containing the sequences are the corresponding values.
|
|
28
|
+
Entries can be accessed using indexing,
|
|
29
|
+
``del`` deletes the entry at the given index.
|
|
30
|
+
|
|
31
|
+
Parameters
|
|
32
|
+
----------
|
|
33
|
+
chars_per_line : int, optional
|
|
34
|
+
The number characters in a line containing sequence data
|
|
35
|
+
after which a line break is inserted.
|
|
36
|
+
Only relevant, when adding sequences to a file.
|
|
37
|
+
Default is 80.
|
|
38
|
+
|
|
39
|
+
Examples
|
|
40
|
+
--------
|
|
41
|
+
|
|
42
|
+
>>> import os.path
|
|
43
|
+
>>> file = FastaFile()
|
|
44
|
+
>>> file["seq1"] = "ATACT"
|
|
45
|
+
>>> print(file["seq1"])
|
|
46
|
+
ATACT
|
|
47
|
+
>>> file["seq2"] = "AAAATT"
|
|
48
|
+
>>> print(file)
|
|
49
|
+
>seq1
|
|
50
|
+
ATACT
|
|
51
|
+
>seq2
|
|
52
|
+
AAAATT
|
|
53
|
+
>>> print(dict(file.items()))
|
|
54
|
+
{'seq1': 'ATACT', 'seq2': 'AAAATT'}
|
|
55
|
+
>>> for header, seq in file.items():
|
|
56
|
+
... print(header, seq)
|
|
57
|
+
seq1 ATACT
|
|
58
|
+
seq2 AAAATT
|
|
59
|
+
>>> del file["seq1"]
|
|
60
|
+
>>> print(dict(file.items()))
|
|
61
|
+
{'seq2': 'AAAATT'}
|
|
62
|
+
>>> file.write(os.path.join(path_to_directory, "test.fasta"))
|
|
63
|
+
"""
|
|
64
|
+
|
|
65
|
+
def __init__(self, chars_per_line=80):
    """
    Create an empty FASTA file representation.

    Parameters
    ----------
    chars_per_line : int, optional
        The number of characters in a line containing sequence data
        after which a line break is inserted.
        Default is 80.
    """
    super().__init__()
    # Maps each header to the '(start, stop)' line range of its entry
    self._entries = OrderedDict()
    # Line width used when sequence strings are wrapped into lines
    self._chars_per_line = chars_per_line
|
|
69
|
+
|
|
70
|
+
@classmethod
def read(cls, file, chars_per_line=80):
    """
    Read a FASTA file.

    Empty lines and comment lines (starting with ``;``) are
    discarded while reading.

    Parameters
    ----------
    file : file-like object or str
        The file to be read.
        Alternatively a file path can be supplied.
    chars_per_line : int, optional
        The number of characters in a line containing sequence data
        after which a line break is inserted.
        Only relevant, when adding sequences to a file.
        Default is 80.

    Returns
    -------
    file_object : FastaFile
        The parsed file.

    Raises
    ------
    InvalidFileError
        If no line is left after empty lines and comment lines were
        removed.
    """
    fasta_file = super().read(file, chars_per_line)
    # Keep only lines with content that are not comments
    content_lines = []
    for line in fasta_file.lines:
        if line.strip() and line[0] != ";":
            content_lines.append(line)
    fasta_file.lines = content_lines
    if not fasta_file.lines:
        raise InvalidFileError("File is empty or contains only comments")
    # Index the header and sequence lines of each entry
    fasta_file._find_entries()
    return fasta_file
|
|
99
|
+
|
|
100
|
+
def __setitem__(self, header, seq_str):
    """
    Add an entry or replace the entry with the same header.

    The header is normalized (line breaks removed, surrounding
    whitespace stripped) before it is used, so that the key stored
    in the entry index is always identical to the key that
    ``_find_entries()`` derives from the written header line.
    Without this, a header such as ``" seq1 "`` would be indexed
    under its raw form first and silently renamed to ``"seq1"`` the
    next time the index is rebuilt.

    Parameters
    ----------
    header : str
        The header of the entry.
    seq_str : str
        The sequence; it is wrapped into lines of at most
        `chars_per_line` characters via ``wrap_string()``.

    Raises
    ------
    IndexError
        If `header` is not a string.
    TypeError
        If `seq_str` is not a string.
    """
    if not isinstance(header, str):
        raise IndexError(
            "'FastaFile' only supports header strings as keys"
        )
    if not isinstance(seq_str, str):
        raise TypeError(
            "'FastaFile' only supports sequence strings as values"
        )
    # Use the same normalized header for the written line, the
    # membership test and the index key
    header = header.replace("\n", "").strip()
    # Create lines for new header and sequence (with line breaks)
    new_lines = [">" + header] + wrap_string(
        seq_str, width=self._chars_per_line
    )
    if header in self:
        # Replace: remove the lines of the existing entry,
        # append the new ones and rebuild the complete index
        del self[header]
        self.lines += new_lines
        self._find_entries()
    else:
        # Simply append lines
        # Add entry in a more efficient way than '_find_entries()'
        # for this simple case
        first = len(self.lines)
        self._entries[header] = (first, first + len(new_lines))
        self.lines += new_lines
|
|
126
|
+
|
|
127
|
+
def __getitem__(self, header):
    """
    Get the sequence string of the entry with the given header.

    Parameters
    ----------
    header : str
        The header of the entry.

    Returns
    -------
    seq_string : str
        The sequence lines of the entry, each stripped of
        surrounding whitespace and joined without separator.

    Raises
    ------
    IndexError
        If `header` is not a string.
    KeyError
        If there is no entry with this header.
    """
    if not isinstance(header, str):
        raise IndexError(
            "'FastaFile' only supports header strings as keys"
        )
    first, end = self._entries[header]
    # The first line of the range is the header line -> skip it
    pieces = []
    for text in self.lines[first + 1:end]:
        pieces.append(text.strip())
    return "".join(pieces)
|
|
138
|
+
|
|
139
|
+
def __delitem__(self, header):
    """
    Remove the entry with the given header.

    The lines of the entry are removed and the index is rebuilt
    afterwards, as the line ranges of the remaining entries may
    have shifted.

    Raises
    ------
    KeyError
        If there is no entry with this header.
    """
    first, end = self._entries[header]
    # Cut the header line and the sequence lines out of the file
    self.lines[first:end] = []
    self._entries.pop(header)
    self._find_entries()
|
|
144
|
+
|
|
145
|
+
def __len__(self):
    """
    Get the number of indexed entries.
    """
    entries = self._entries
    return len(entries)
|
|
147
|
+
|
|
148
|
+
def __iter__(self):
    """
    Iterate over the headers of the indexed entries.
    """
    return iter(self._entries)
|
|
150
|
+
|
|
151
|
+
def __contains__(self, identifer):
    """
    Check whether an entry with the given header is indexed.
    """
    known_headers = self._entries
    return identifer in known_headers
|
|
153
|
+
|
|
154
|
+
def _find_entries(self):
    """
    Rebuild the entry index from the current lines.

    Every line starting with ``'>'`` is a header line.
    An entry spans from its header line (inclusive) to the next
    header line or the end of the lines (exclusive).
    The index maps the header (stripped, without the leading
    ``'>'``) to this ``(start, stop)`` line range.
    If a header appears multiple times, the last occurrence wins.

    Raises
    ------
    InvalidFileError
        If the first line does not start with ``'>'``.
    """
    # 'startswith()' instead of indexing with '[0]':
    # an empty string in the lines must not raise an 'IndexError'
    if self.lines and not self.lines[0].startswith(">"):
        raise InvalidFileError(
            f"File starts with '{self.lines[0][:1]}' instead of '>'"
        )

    header_i = [
        i for i, line in enumerate(self.lines) if line.startswith(">")
    ]
    # An entry stops at the start of the next header,
    # the last entry stops at the end of the file
    stops = header_i[1:] + [len(self.lines)]

    self._entries = OrderedDict()
    for start, stop in zip(header_i, stops):
        # Remove leading '>' from header
        header = self.lines[start].strip()[1:]
        self._entries[header] = (start, stop)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
@staticmethod
def read_iter(file):
    """
    Create an iterator over each sequence of the given FASTA file.

    Parameters
    ----------
    file : file-like object or str
        The file to be read.
        Alternatively a file path can be supplied.

    Yields
    ------
    header : str
        The header of the current sequence.
    seq_str : str
        The current sequence as string.

    Notes
    -----
    The lines are consumed one by one from ``TextFile.read_iter()``
    and only the lines of the current entry are kept, instead of
    holding all lines of the file at once.
    """
    current_header = None
    chunks = []
    for raw in TextFile.read_iter(file):
        text = raw.strip()
        # Blank lines and ';' comment lines carry no data
        if not text or text[0] == ";":
            continue
        if text[0] != ">":
            # Sequence line of the current entry
            chunks.append(text)
            continue
        # A header line closes the previous entry (if any)...
        if current_header is not None:
            yield current_header, "".join(chunks)
        # ...and opens a new one
        current_header = text[1:]
        chunks = []
    # The last entry is not followed by a header line
    if current_header is not None:
        yield current_header, "".join(chunks)
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
@staticmethod
def write_iter(file, items, chars_per_line=80):
    """
    Write the given `items` one after another into the specified
    `file`.

    The lines are produced lazily by a generator and handed to
    ``TextFile.write_iter()``, so no list containing all lines of
    the file is built by this method.

    Parameters
    ----------
    file : file-like object or str
        The file to be written to.
        Alternatively a file path can be supplied.
    items : generator or array-like of tuple(str, str)
        The entries to be written into the file.
        Each entry consists of a header string and a sequence
        string.
    chars_per_line : int, optional
        The number of characters in a line containing sequence data
        after which a line break is inserted.
        Default is 80.

    Notes
    -----
    This method does not test, whether the given identifiers are
    unambiguous.
    The type checks run while the generator is consumed, i.e. an
    invalid item raises ``IndexError`` (header) or ``TypeError``
    (sequence) only when it is reached.
    """
    def fasta_lines():
        for name, sequence in items:
            if not isinstance(name, str):
                raise IndexError(
                    "'FastaFile' only supports header strings"
                )
            if not isinstance(sequence, str):
                raise TypeError(
                    "'FastaFile' only supports sequence strings"
                )
            # Header line: line breaks removed, whitespace stripped
            yield ">" + name.replace("\n", "").strip()
            # Sequence line(s)
            yield from wrap_string(sequence, width=chars_per_line)

    TextFile.write_iter(file, fasta_lines())
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
This subpackage is used for reading and writing sequencing data
|
|
7
|
+
using the popular FASTQ format.
|
|
8
|
+
|
|
9
|
+
This package contains the :class:`FastqFile`, which provides a
|
|
10
|
+
dictionary-like interface to FASTQ files, with the sequence identifier
|
|
11
|
+
strings being the keys and the sequences and quality scores being the
|
|
12
|
+
values.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
__name__ = "biotite.sequence.io.fastq"
|
|
16
|
+
__author__ = "Patrick Kunzmann"
|
|
17
|
+
|
|
18
|
+
from .file import *
|
|
19
|
+
from .convert import *
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
__name__ = "biotite.sequence.io.fastq"
|
|
6
|
+
__author__ = "Patrick Kunzmann"
|
|
7
|
+
|
|
8
|
+
from collections import OrderedDict
|
|
9
|
+
from ...sequence import Sequence
|
|
10
|
+
from ...alphabet import AlphabetError, LetterAlphabet
|
|
11
|
+
from ...seqtypes import NucleotideSequence
|
|
12
|
+
from ...align.alignment import Alignment
|
|
13
|
+
|
|
14
|
+
__all__ = ["get_sequence", "get_sequences", "set_sequence", "set_sequences"]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def get_sequence(fastq_file, header=None):
    """
    Get a sequence and quality scores from a `FastqFile` instance.

    Parameters
    ----------
    fastq_file : FastqFile
        The `FastqFile` to be accessed.
    header : str, optional
        The identifier to get the sequence and scores from.
        By default, the first sequence of the file is returned.

    Returns
    -------
    sequence : NucleotideSequence
        The requested sequence.
        ``'U'`` is replaced by ``'T'`` and ``'X'`` by ``'N'``
        before the sequence object is created.
    scores : ndarray, dtype=int
        The requested scores, as stored in the file object.

    Raises
    ------
    ValueError
        If no `header` is given and the file has no entries.
    """
    if header is None:
        # Take the first (and probably only) entry of the file
        seq_str, scores = next(iter(fastq_file.values()), (None, None))
        if seq_str is None:
            raise ValueError("File does not contain any sequences")
    else:
        seq_str, scores = fastq_file[header]
    dna_str = seq_str.replace("U", "T").replace("X", "N")
    return NucleotideSequence(dna_str), scores
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def get_sequences(fastq_file):
    """
    Get a dictionary from a `FastqFile` instance,
    where identifiers are keys and sequence-score-tuples are values.

    Parameters
    ----------
    fastq_file : FastqFile
        The `FastqFile` to be accessed.

    Returns
    -------
    seq_dict : dict
        A dictionary containing identifiers as keys and
        (`NucleotideSequence`, `ndarray`) tuples as values.
        In each sequence ``'U'`` is replaced by ``'T'`` and ``'X'``
        by ``'N'`` before the sequence object is created.
    """
    return OrderedDict(
        (
            header,
            (
                NucleotideSequence(
                    seq_str.replace("U", "T").replace("X", "N")
                ),
                scores,
            ),
        )
        for header, (seq_str, scores) in fastq_file.items()
    )
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def set_sequence(fastq_file, sequence, scores, header=None, as_rna=False):
    """
    Set a sequence and a quality score array in a `FastqFile` instance.

    Parameters
    ----------
    fastq_file : FastqFile
        The `FastqFile` to be accessed.
    sequence : NucleotideSequence
        The sequence to be set.
    scores : ndarray, dtype=int
        The quality scores to be set.
    header : str, optional
        The identifier for the sequence. Default is 'sequence'.
    as_rna : bool, optional
        If set to true, the sequence symbol ``'T'`` will be replaced
        by ``'U'``.
    """
    identifier = "sequence" if header is None else header
    seq_str = _convert_to_string(sequence, as_rna)
    fastq_file[identifier] = (seq_str, scores)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def set_sequences(fastq_file, sequence_dict, as_rna=False):
    """
    Set sequences in a `FastqFile` instance from a dictionary.

    Parameters
    ----------
    fastq_file : FastqFile
        The `FastqFile` to be accessed.
    sequence_dict : dict
        A dictionary containing the sequences and scores to be set.
        Identifiers are keys,
        (`NucleotideSequence`, `ndarray`) tuples are values.
    as_rna : bool, optional
        If set to true, the sequence symbol ``'T'`` will be replaced
        by ``'U'``.
    """
    for identifier, pair in sequence_dict.items():
        sequence, scores = pair
        seq_str = _convert_to_string(sequence, as_rna)
        fastq_file[identifier] = (seq_str, scores)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _convert_to_string(sequence, as_rna):
    """
    Convert a sequence into its string representation.

    Parameters
    ----------
    sequence : object
        The sequence; it is converted via ``str()``.
    as_rna : bool
        If true, each ``'T'`` in the string is replaced by ``'U'``.

    Returns
    -------
    seq_str : str
        The (optionally converted) sequence string.
    """
    seq_str = str(sequence)
    return seq_str.replace("T", "U") if as_rna else seq_str
|