biotite 1.6.0__cp314-cp314-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biotite/__init__.py +18 -0
- biotite/application/__init__.py +69 -0
- biotite/application/application.py +276 -0
- biotite/application/autodock/__init__.py +12 -0
- biotite/application/autodock/app.py +500 -0
- biotite/application/blast/__init__.py +14 -0
- biotite/application/blast/alignment.py +92 -0
- biotite/application/blast/webapp.py +426 -0
- biotite/application/clustalo/__init__.py +12 -0
- biotite/application/clustalo/app.py +223 -0
- biotite/application/dssp/__init__.py +12 -0
- biotite/application/dssp/app.py +216 -0
- biotite/application/localapp.py +342 -0
- biotite/application/mafft/__init__.py +12 -0
- biotite/application/mafft/app.py +116 -0
- biotite/application/msaapp.py +363 -0
- biotite/application/muscle/__init__.py +13 -0
- biotite/application/muscle/app3.py +227 -0
- biotite/application/muscle/app5.py +163 -0
- biotite/application/sra/__init__.py +18 -0
- biotite/application/sra/app.py +447 -0
- biotite/application/tantan/__init__.py +12 -0
- biotite/application/tantan/app.py +199 -0
- biotite/application/util.py +77 -0
- biotite/application/viennarna/__init__.py +18 -0
- biotite/application/viennarna/rnaalifold.py +310 -0
- biotite/application/viennarna/rnafold.py +254 -0
- biotite/application/viennarna/rnaplot.py +208 -0
- biotite/application/viennarna/util.py +77 -0
- biotite/application/webapp.py +76 -0
- biotite/copyable.py +71 -0
- biotite/database/__init__.py +23 -0
- biotite/database/afdb/__init__.py +12 -0
- biotite/database/afdb/download.py +202 -0
- biotite/database/entrez/__init__.py +15 -0
- biotite/database/entrez/check.py +66 -0
- biotite/database/entrez/dbnames.py +101 -0
- biotite/database/entrez/download.py +224 -0
- biotite/database/entrez/key.py +44 -0
- biotite/database/entrez/query.py +263 -0
- biotite/database/error.py +16 -0
- biotite/database/pubchem/__init__.py +21 -0
- biotite/database/pubchem/download.py +259 -0
- biotite/database/pubchem/error.py +30 -0
- biotite/database/pubchem/query.py +819 -0
- biotite/database/pubchem/throttle.py +98 -0
- biotite/database/rcsb/__init__.py +13 -0
- biotite/database/rcsb/download.py +191 -0
- biotite/database/rcsb/query.py +963 -0
- biotite/database/uniprot/__init__.py +13 -0
- biotite/database/uniprot/check.py +40 -0
- biotite/database/uniprot/download.py +127 -0
- biotite/database/uniprot/query.py +292 -0
- biotite/file.py +244 -0
- biotite/interface/__init__.py +19 -0
- biotite/interface/openmm/__init__.py +20 -0
- biotite/interface/openmm/state.py +93 -0
- biotite/interface/openmm/system.py +227 -0
- biotite/interface/pymol/__init__.py +201 -0
- biotite/interface/pymol/cgo.py +346 -0
- biotite/interface/pymol/convert.py +185 -0
- biotite/interface/pymol/display.py +267 -0
- biotite/interface/pymol/object.py +1228 -0
- biotite/interface/pymol/shapes.py +178 -0
- biotite/interface/pymol/startup.py +169 -0
- biotite/interface/rdkit/__init__.py +19 -0
- biotite/interface/rdkit/mol.py +491 -0
- biotite/interface/version.py +94 -0
- biotite/interface/warning.py +19 -0
- biotite/sequence/__init__.py +84 -0
- biotite/sequence/align/__init__.py +199 -0
- biotite/sequence/align/alignment.py +763 -0
- biotite/sequence/align/banded.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/banded.pyx +652 -0
- biotite/sequence/align/buckets.py +71 -0
- biotite/sequence/align/cigar.py +425 -0
- biotite/sequence/align/kmeralphabet.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/kmeralphabet.pyx +595 -0
- biotite/sequence/align/kmersimilarity.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/kmersimilarity.pyx +233 -0
- biotite/sequence/align/kmertable.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.pyx +3411 -0
- biotite/sequence/align/localgapped.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/localgapped.pyx +892 -0
- biotite/sequence/align/localungapped.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/localungapped.pyx +279 -0
- biotite/sequence/align/matrix.py +631 -0
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
- biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
- biotite/sequence/align/matrix_data/GONNET.mat +26 -0
- biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
- biotite/sequence/align/matrix_data/MATCH.mat +25 -0
- biotite/sequence/align/matrix_data/NUC.mat +25 -0
- biotite/sequence/align/matrix_data/PAM10.mat +34 -0
- biotite/sequence/align/matrix_data/PAM100.mat +34 -0
- biotite/sequence/align/matrix_data/PAM110.mat +34 -0
- biotite/sequence/align/matrix_data/PAM120.mat +34 -0
- biotite/sequence/align/matrix_data/PAM130.mat +34 -0
- biotite/sequence/align/matrix_data/PAM140.mat +34 -0
- biotite/sequence/align/matrix_data/PAM150.mat +34 -0
- biotite/sequence/align/matrix_data/PAM160.mat +34 -0
- biotite/sequence/align/matrix_data/PAM170.mat +34 -0
- biotite/sequence/align/matrix_data/PAM180.mat +34 -0
- biotite/sequence/align/matrix_data/PAM190.mat +34 -0
- biotite/sequence/align/matrix_data/PAM20.mat +34 -0
- biotite/sequence/align/matrix_data/PAM200.mat +34 -0
- biotite/sequence/align/matrix_data/PAM210.mat +34 -0
- biotite/sequence/align/matrix_data/PAM220.mat +34 -0
- biotite/sequence/align/matrix_data/PAM230.mat +34 -0
- biotite/sequence/align/matrix_data/PAM240.mat +34 -0
- biotite/sequence/align/matrix_data/PAM250.mat +34 -0
- biotite/sequence/align/matrix_data/PAM260.mat +34 -0
- biotite/sequence/align/matrix_data/PAM270.mat +34 -0
- biotite/sequence/align/matrix_data/PAM280.mat +34 -0
- biotite/sequence/align/matrix_data/PAM290.mat +34 -0
- biotite/sequence/align/matrix_data/PAM30.mat +34 -0
- biotite/sequence/align/matrix_data/PAM300.mat +34 -0
- biotite/sequence/align/matrix_data/PAM310.mat +34 -0
- biotite/sequence/align/matrix_data/PAM320.mat +34 -0
- biotite/sequence/align/matrix_data/PAM330.mat +34 -0
- biotite/sequence/align/matrix_data/PAM340.mat +34 -0
- biotite/sequence/align/matrix_data/PAM350.mat +34 -0
- biotite/sequence/align/matrix_data/PAM360.mat +34 -0
- biotite/sequence/align/matrix_data/PAM370.mat +34 -0
- biotite/sequence/align/matrix_data/PAM380.mat +34 -0
- biotite/sequence/align/matrix_data/PAM390.mat +34 -0
- biotite/sequence/align/matrix_data/PAM40.mat +34 -0
- biotite/sequence/align/matrix_data/PAM400.mat +34 -0
- biotite/sequence/align/matrix_data/PAM410.mat +34 -0
- biotite/sequence/align/matrix_data/PAM420.mat +34 -0
- biotite/sequence/align/matrix_data/PAM430.mat +34 -0
- biotite/sequence/align/matrix_data/PAM440.mat +34 -0
- biotite/sequence/align/matrix_data/PAM450.mat +34 -0
- biotite/sequence/align/matrix_data/PAM460.mat +34 -0
- biotite/sequence/align/matrix_data/PAM470.mat +34 -0
- biotite/sequence/align/matrix_data/PAM480.mat +34 -0
- biotite/sequence/align/matrix_data/PAM490.mat +34 -0
- biotite/sequence/align/matrix_data/PAM50.mat +34 -0
- biotite/sequence/align/matrix_data/PAM500.mat +34 -0
- biotite/sequence/align/matrix_data/PAM60.mat +34 -0
- biotite/sequence/align/matrix_data/PAM70.mat +34 -0
- biotite/sequence/align/matrix_data/PAM80.mat +34 -0
- biotite/sequence/align/matrix_data/PAM90.mat +34 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
- biotite/sequence/align/multiple.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/multiple.pyx +619 -0
- biotite/sequence/align/pairwise.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/pairwise.pyx +585 -0
- biotite/sequence/align/permutation.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.pyx +313 -0
- biotite/sequence/align/primes.txt +821 -0
- biotite/sequence/align/selector.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.pyx +954 -0
- biotite/sequence/align/statistics.py +264 -0
- biotite/sequence/align/tracetable.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/tracetable.pxd +64 -0
- biotite/sequence/align/tracetable.pyx +370 -0
- biotite/sequence/alphabet.py +555 -0
- biotite/sequence/annotation.py +836 -0
- biotite/sequence/codec.cp314-win_amd64.pyd +0 -0
- biotite/sequence/codec.pyx +155 -0
- biotite/sequence/codon.py +476 -0
- biotite/sequence/codon_tables.txt +202 -0
- biotite/sequence/graphics/__init__.py +33 -0
- biotite/sequence/graphics/alignment.py +1101 -0
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/autumn.json +51 -0
- biotite/sequence/graphics/color_schemes/blossom.json +51 -0
- biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
- biotite/sequence/graphics/color_schemes/flower.json +51 -0
- biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
- biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
- biotite/sequence/graphics/color_schemes/ocean.json +51 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
- biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
- biotite/sequence/graphics/color_schemes/spring.json +51 -0
- biotite/sequence/graphics/color_schemes/sunset.json +51 -0
- biotite/sequence/graphics/color_schemes/wither.json +51 -0
- biotite/sequence/graphics/colorschemes.py +170 -0
- biotite/sequence/graphics/dendrogram.py +231 -0
- biotite/sequence/graphics/features.py +544 -0
- biotite/sequence/graphics/logo.py +102 -0
- biotite/sequence/graphics/plasmid.py +712 -0
- biotite/sequence/io/__init__.py +12 -0
- biotite/sequence/io/fasta/__init__.py +22 -0
- biotite/sequence/io/fasta/convert.py +462 -0
- biotite/sequence/io/fasta/file.py +265 -0
- biotite/sequence/io/fastq/__init__.py +19 -0
- biotite/sequence/io/fastq/convert.py +117 -0
- biotite/sequence/io/fastq/file.py +507 -0
- biotite/sequence/io/genbank/__init__.py +17 -0
- biotite/sequence/io/genbank/annotation.py +269 -0
- biotite/sequence/io/genbank/file.py +573 -0
- biotite/sequence/io/genbank/metadata.py +336 -0
- biotite/sequence/io/genbank/sequence.py +173 -0
- biotite/sequence/io/general.py +201 -0
- biotite/sequence/io/gff/__init__.py +26 -0
- biotite/sequence/io/gff/convert.py +128 -0
- biotite/sequence/io/gff/file.py +449 -0
- biotite/sequence/phylo/__init__.py +36 -0
- biotite/sequence/phylo/nj.cp314-win_amd64.pyd +0 -0
- biotite/sequence/phylo/nj.pyx +221 -0
- biotite/sequence/phylo/tree.cp314-win_amd64.pyd +0 -0
- biotite/sequence/phylo/tree.pyx +1169 -0
- biotite/sequence/phylo/upgma.cp314-win_amd64.pyd +0 -0
- biotite/sequence/phylo/upgma.pyx +164 -0
- biotite/sequence/profile.py +561 -0
- biotite/sequence/search.py +117 -0
- biotite/sequence/seqtypes.py +720 -0
- biotite/sequence/sequence.py +373 -0
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +135 -0
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +109 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +170 -0
- biotite/structure/alphabet/unkerasify.py +128 -0
- biotite/structure/atoms.py +1596 -0
- biotite/structure/basepairs.py +1403 -0
- biotite/structure/bonds.cp314-win_amd64.pyd +0 -0
- biotite/structure/bonds.pyx +2036 -0
- biotite/structure/box.py +724 -0
- biotite/structure/celllist.cp314-win_amd64.pyd +0 -0
- biotite/structure/celllist.pyx +864 -0
- biotite/structure/chains.py +310 -0
- biotite/structure/charges.cp314-win_amd64.pyd +0 -0
- biotite/structure/charges.pyx +521 -0
- biotite/structure/compare.py +683 -0
- biotite/structure/density.py +109 -0
- biotite/structure/dotbracket.py +213 -0
- biotite/structure/error.py +39 -0
- biotite/structure/filter.py +646 -0
- biotite/structure/geometry.py +817 -0
- biotite/structure/graphics/__init__.py +13 -0
- biotite/structure/graphics/atoms.py +243 -0
- biotite/structure/graphics/rna.py +298 -0
- biotite/structure/hbond.py +426 -0
- biotite/structure/info/__init__.py +24 -0
- biotite/structure/info/atom_masses.json +121 -0
- biotite/structure/info/atoms.py +98 -0
- biotite/structure/info/bonds.py +149 -0
- biotite/structure/info/ccd.py +200 -0
- biotite/structure/info/components.bcif +0 -0
- biotite/structure/info/groups.py +128 -0
- biotite/structure/info/masses.py +121 -0
- biotite/structure/info/misc.py +137 -0
- biotite/structure/info/radii.py +267 -0
- biotite/structure/info/standardize.py +185 -0
- biotite/structure/integrity.py +213 -0
- biotite/structure/io/__init__.py +29 -0
- biotite/structure/io/dcd/__init__.py +13 -0
- biotite/structure/io/dcd/file.py +67 -0
- biotite/structure/io/general.py +243 -0
- biotite/structure/io/gro/__init__.py +14 -0
- biotite/structure/io/gro/file.py +343 -0
- biotite/structure/io/mol/__init__.py +20 -0
- biotite/structure/io/mol/convert.py +112 -0
- biotite/structure/io/mol/ctab.py +420 -0
- biotite/structure/io/mol/header.py +120 -0
- biotite/structure/io/mol/mol.py +149 -0
- biotite/structure/io/mol/sdf.py +940 -0
- biotite/structure/io/netcdf/__init__.py +13 -0
- biotite/structure/io/netcdf/file.py +64 -0
- biotite/structure/io/pdb/__init__.py +20 -0
- biotite/structure/io/pdb/convert.py +389 -0
- biotite/structure/io/pdb/file.py +1380 -0
- biotite/structure/io/pdb/hybrid36.cp314-win_amd64.pyd +0 -0
- biotite/structure/io/pdb/hybrid36.pyx +242 -0
- biotite/structure/io/pdbqt/__init__.py +15 -0
- biotite/structure/io/pdbqt/convert.py +113 -0
- biotite/structure/io/pdbqt/file.py +688 -0
- biotite/structure/io/pdbx/__init__.py +23 -0
- biotite/structure/io/pdbx/bcif.py +674 -0
- biotite/structure/io/pdbx/cif.py +1091 -0
- biotite/structure/io/pdbx/component.py +251 -0
- biotite/structure/io/pdbx/compress.py +362 -0
- biotite/structure/io/pdbx/convert.py +2122 -0
- biotite/structure/io/pdbx/encoding.cp314-win_amd64.pyd +0 -0
- biotite/structure/io/pdbx/encoding.pyx +1078 -0
- biotite/structure/io/trajfile.py +696 -0
- biotite/structure/io/trr/__init__.py +13 -0
- biotite/structure/io/trr/file.py +43 -0
- biotite/structure/io/util.py +38 -0
- biotite/structure/io/xtc/__init__.py +13 -0
- biotite/structure/io/xtc/file.py +43 -0
- biotite/structure/mechanics.py +72 -0
- biotite/structure/molecules.py +337 -0
- biotite/structure/pseudoknots.py +622 -0
- biotite/structure/rdf.py +245 -0
- biotite/structure/repair.py +302 -0
- biotite/structure/residues.py +716 -0
- biotite/structure/rings.py +452 -0
- biotite/structure/sasa.cp314-win_amd64.pyd +0 -0
- biotite/structure/sasa.pyx +322 -0
- biotite/structure/segments.py +328 -0
- biotite/structure/sequence.py +110 -0
- biotite/structure/spacegroups.json +1567 -0
- biotite/structure/spacegroups.license +26 -0
- biotite/structure/sse.py +306 -0
- biotite/structure/superimpose.py +511 -0
- biotite/structure/tm.py +581 -0
- biotite/structure/transform.py +736 -0
- biotite/structure/util.py +160 -0
- biotite/version.py +34 -0
- biotite/visualize.py +375 -0
- biotite-1.6.0.dist-info/METADATA +162 -0
- biotite-1.6.0.dist-info/RECORD +354 -0
- biotite-1.6.0.dist-info/WHEEL +4 -0
- biotite-1.6.0.dist-info/licenses/LICENSE.rst +30 -0
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
__name__ = "biotite.sequence.io.fasta"
|
|
6
|
+
__author__ = "Patrick Kunzmann"
|
|
7
|
+
__all__ = ["FastaFile"]
|
|
8
|
+
|
|
9
|
+
from collections import OrderedDict
|
|
10
|
+
from collections.abc import MutableMapping
|
|
11
|
+
from biotite.file import InvalidFileError, TextFile, wrap_string
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class FastaFile(TextFile, MutableMapping):
    """
    This class represents a file in FASTA format.

    A FASTA file contains so called *header* lines, beginning with
    ``>``, that describe the following sequence.
    The corresponding sequence starts at the line after the header line
    and ends at the next header line or at the end of file.
    The header along with its sequence forms an entry.

    This class is used in a dictionary like manner, implementing the
    :class:`MutableMapping` interface:
    Headers (without the leading ``>``) are used as keys,
    and strings containing the sequences are the corresponding values.
    Entries can be accessed using indexing,
    ``del`` deletes the entry at the given index.

    When an entry is assigned, line breaks are removed from the header
    and surrounding whitespace is stripped.
    The resulting string is the key under which the entry is stored.

    Parameters
    ----------
    chars_per_line : int, optional
        The number of characters in a line containing sequence data
        after which a line break is inserted.
        Only relevant, when adding sequences to a file.
        Default is 80.

    Examples
    --------

    >>> import os.path
    >>> file = FastaFile()
    >>> file["seq1"] = "ATACT"
    >>> print(file["seq1"])
    ATACT
    >>> file["seq2"] = "AAAATT"
    >>> print(file)
    >seq1
    ATACT
    >seq2
    AAAATT
    >>> print(dict(file.items()))
    {'seq1': 'ATACT', 'seq2': 'AAAATT'}
    >>> for header, seq in file.items():
    ...     print(header, seq)
    seq1 ATACT
    seq2 AAAATT
    >>> del file["seq1"]
    >>> print(dict(file.items()))
    {'seq2': 'AAAATT'}
    >>> file.write(os.path.join(path_to_directory, "test.fasta"))
    """

    def __init__(self, chars_per_line=80):
        super().__init__()
        self._chars_per_line = chars_per_line
        # Maps each header to the half-open line range '(start, stop)'
        # of its entry in 'self.lines', including the header line
        self._entries = OrderedDict()

    @classmethod
    def read(cls, file, chars_per_line=80):
        """
        Read a FASTA file.

        Parameters
        ----------
        file : file-like object or str
            The file to be read.
            Alternatively a file path can be supplied.
        chars_per_line : int, optional
            The number of characters in a line containing sequence data
            after which a line break is inserted.
            Only relevant, when adding sequences to a file.
            Default is 80.

        Returns
        -------
        file_object : FastaFile
            The parsed file.

        Raises
        ------
        InvalidFileError
            If the file contains no lines besides empty and comment
            lines, or if the first remaining line is not a header line.
        """
        file = super().read(file, chars_per_line)
        # Filter out empty and comment lines
        file.lines = [
            line for line in file.lines if len(line.strip()) != 0 and line[0] != ";"
        ]
        if len(file.lines) == 0:
            raise InvalidFileError("File is empty or contains only comments")
        file._find_entries()
        return file

    def __setitem__(self, header, seq_str):
        """
        Add an entry or replace the entry with the same header.

        A replaced entry is removed from its old position and the new
        lines are appended to the end of the file.

        Raises
        ------
        IndexError
            If `header` is not a string.
        TypeError
            If `seq_str` is not a string.
        """
        if not isinstance(header, str):
            raise IndexError("'FastaFile' only supports header strings as keys")
        if not isinstance(seq_str, str):
            raise TypeError("'FastaFile' only supports sequence strings as values")
        # Sanitize the header once and use the same value for the
        # written line, the membership test and the index key:
        # This is exactly the key '_find_entries()' derives from the
        # written line, so the key stays stable across re-indexing
        header = header.replace("\n", "").strip()
        # Create lines for new header and sequence (with line breaks)
        new_lines = [">" + header] + wrap_string(seq_str, width=self._chars_per_line)
        if header in self:
            # Delete lines of entry corresponding to the header,
            # if existing; this also rebuilds the index
            del self[header]
        # The index is up to date at this point, so the appended entry
        # can be registered directly instead of re-scanning all lines
        start = len(self.lines)
        self.lines += new_lines
        self._entries[header] = (start, start + len(new_lines))

    def __getitem__(self, header):
        """
        Get the sequence string of the entry with the given header.

        Raises
        ------
        IndexError
            If `header` is not a string.
        KeyError
            If there is no entry with the given header.
        """
        if not isinstance(header, str):
            raise IndexError("'FastaFile' only supports header strings as keys")
        start, stop = self._entries[header]
        # Concatenate sequence string from the lines following the header
        seq_string = "".join([line.strip() for line in self.lines[start + 1 : stop]])
        return seq_string

    def __delitem__(self, header):
        """
        Remove the entry with the given header including its lines.
        """
        start, stop = self._entries[header]
        del self.lines[start:stop]
        del self._entries[header]
        # The line ranges of the following entries have shifted
        self._find_entries()

    def __len__(self):
        return len(self._entries)

    def __iter__(self):
        return self._entries.__iter__()

    def __contains__(self, identifer):
        return identifer in self._entries

    def _find_entries(self):
        """
        Rebuild the header index from the current content of
        ``self.lines``.

        Raises
        ------
        InvalidFileError
            If the first line is not a header line.
        """
        # 'startswith()' instead of indexing the first character,
        # so an empty line cannot raise an 'IndexError'
        if len(self.lines) > 0 and not self.lines[0].startswith(">"):
            raise InvalidFileError(
                f"File starts with '{self.lines[0][:1]}' instead of '>'"
            )

        header_i = [i for i, line in enumerate(self.lines) if line.startswith(">")]
        # An entry stops at the start of the next header,
        # the last entry stops at the end of the file
        stop_i = header_i[1:] + [len(self.lines)]

        self._entries = OrderedDict()
        for start, stop in zip(header_i, stop_i):
            # Remove leading '>' from header
            header = self.lines[start].strip()[1:]
            self._entries[header] = (start, stop)

    @staticmethod
    def read_iter(file):
        """
        Create an iterator over each sequence of the given FASTA file.

        Parameters
        ----------
        file : file-like object or str
            The file to be read.
            Alternatively a file path can be supplied.

        Yields
        ------
        header : str
            The header of the current sequence.
        seq_str : str
            The current sequence as string.

        Notes
        -----
        This approach gives the same results as
        `FastaFile.read(file).items()`, but is slightly faster and much
        more memory efficient.
        """
        header = None
        seq_str_list = []
        for line in TextFile.read_iter(file):
            line = line.strip()
            # Ignore empty and comment lines
            if len(line) == 0 or line[0] == ";":
                continue
            if line[0] == ">":
                # New entry
                # -> yield previous entry
                if header is not None:
                    yield header, "".join(seq_str_list)
                # Track new header and reset sequence
                header = line[1:]
                seq_str_list = []
            else:
                seq_str_list.append(line)
        # Yield final entry
        if header is not None:
            yield header, "".join(seq_str_list)

    @staticmethod
    def write_iter(file, items, chars_per_line=80):
        """
        Iterate over the given `items` and write each item into
        the specified `file`.

        In contrast to :meth:`write()`, the lines of text are not stored
        in an intermediate :class:`TextFile`, but are directly written
        to the file.
        Hence, this static method may save a large amount of memory if
        a large file should be written, especially if the `items`
        are provided as generator.

        Parameters
        ----------
        file : file-like object or str
            The file to be written to.
            Alternatively a file path can be supplied.
        items : generator or array-like of tuple(str, str)
            The entries to be written into the file.
            Each entry consists of a header string and a sequence
            string.
        chars_per_line : int, optional
            The number of characters in a line containing sequence data
            after which a line break is inserted.
            Only relevant, when adding sequences to a file.
            Default is 80.

        Raises
        ------
        IndexError
            If a header is not a string.
        TypeError
            If a sequence is not a string.

        Notes
        -----
        This method does not test, whether the given identifiers are
        unambiguous.
        """

        def line_generator():
            for item in items:
                header, seq_str = item
                if not isinstance(header, str):
                    raise IndexError("'FastaFile' only supports header strings")
                if not isinstance(seq_str, str):
                    raise TypeError("'FastaFile' only supports sequence strings")

                # Yield header line
                yield ">" + header.replace("\n", "").strip()

                # Yield sequence line(s)
                yield from wrap_string(seq_str, width=chars_per_line)

        TextFile.write_iter(file, line_generator())
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
This subpackage is used for reading and writing sequencing data
|
|
7
|
+
using the popular FASTQ format.
|
|
8
|
+
|
|
9
|
+
This package contains the :class:`FastqFile`, which provides a
|
|
10
|
+
dictionary-like interface to FASTQ files, with the sequence identifier
|
|
11
|
+
strings being the keys and the sequences and quality scores being the
|
|
12
|
+
values.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
__name__ = "biotite.sequence.io.fastq"
|
|
16
|
+
__author__ = "Patrick Kunzmann"
|
|
17
|
+
|
|
18
|
+
from .convert import *
|
|
19
|
+
from .file import *
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
__name__ = "biotite.sequence.io.fastq"
|
|
6
|
+
__author__ = "Patrick Kunzmann"
|
|
7
|
+
|
|
8
|
+
from collections import OrderedDict
|
|
9
|
+
from biotite.sequence.seqtypes import NucleotideSequence
|
|
10
|
+
|
|
11
|
+
__all__ = ["get_sequence", "get_sequences", "set_sequence", "set_sequences"]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def get_sequence(fastq_file, header=None):
    """
    Fetch a single sequence together with its quality scores from a
    `FastqFile` instance.

    Parameters
    ----------
    fastq_file : FastqFile
        The `FastqFile` to be accessed.
    header : str, optional
        The identifier of the entry to be fetched.
        If omitted, the first entry of the file is used.

    Returns
    -------
    sequence : NucleotideSequence
        The requested sequence.
    scores : ndarray, dtype=int
        The requested scores.

    Raises
    ------
    ValueError
        If no `header` is given and the file has no entries.
    """
    if header is None:
        # Take the first (and probably only) entry of the file
        seq_str, scores = next(iter(fastq_file.values()), (None, None))
        if seq_str is None:
            raise ValueError("File does not contain any sequences")
    else:
        seq_str, scores = fastq_file[header]
    # Map the symbols 'U' and 'X' to 'T' and 'N', respectively
    dna_str = seq_str.replace("U", "T").replace("X", "N")
    return NucleotideSequence(dna_str), scores
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def get_sequences(fastq_file):
    """
    Convert all entries of a `FastqFile` instance into a dictionary
    that maps each identifier to its sequence and quality scores.

    Parameters
    ----------
    fastq_file : FastqFile
        The `FastqFile` to be accessed.

    Returns
    -------
    seq_dict : dict
        A dictionary containing identifiers as keys and
        (`NucleotideSequence`, `ndarray`) tuples as values.
        The order of the entries in the file is kept.
    """
    result = OrderedDict()
    for identifier, entry in fastq_file.items():
        raw_seq, quality = entry
        # Map the symbols 'U' and 'X' to 'T' and 'N', respectively
        dna_str = raw_seq.replace("U", "T").replace("X", "N")
        result[identifier] = (NucleotideSequence(dna_str), quality)
    return result
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def set_sequence(fastq_file, sequence, scores, header=None, as_rna=False):
    """
    Store a sequence along with its quality score array in a
    `FastqFile` instance.

    Parameters
    ----------
    fastq_file : FastqFile
        The `FastqFile` to be accessed.
    sequence : NucleotideSequence
        The sequence to be set.
    scores : ndarray, dtype=int
        The quality scores to be set.
    header : str, optional
        The identifier for the sequence. Default is 'sequence'.
    as_rna : bool, optional
        If set to true, the sequence symbol ``'T'`` will be replaced
        by ``'U'``.
    """
    identifier = "sequence" if header is None else header
    seq_str = _convert_to_string(sequence, as_rna)
    fastq_file[identifier] = (seq_str, scores)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def set_sequences(fastq_file, sequence_dict, as_rna=False):
    """
    Store every entry of a dictionary in a `FastqFile` instance.

    Parameters
    ----------
    fastq_file : FastqFile
        The `FastqFile` to be accessed.
    sequence_dict : dict
        A dictionary containing the sequences and scores to be set.
        Identifiers are keys,
        (`NucleotideSequence`, `ndarray`) tuples are values.
    as_rna : bool, optional
        If set to true, the sequence symbol ``'T'`` will be replaced
        by ``'U'``.
    """
    for identifier, entry in sequence_dict.items():
        sequence, scores = entry
        seq_str = _convert_to_string(sequence, as_rna)
        fastq_file[identifier] = (seq_str, scores)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _convert_to_string(sequence, as_rna):
    """
    Get the string representation of `sequence`, with each ``'T'``
    replaced by ``'U'`` if `as_rna` is true.
    """
    seq_str = str(sequence)
    return seq_str.replace("T", "U") if as_rna else seq_str
|