biotite 0.41.2__cp311-cp311-macosx_11_0_arm64.whl → 1.0.0__cp311-cp311-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +2 -3
- biotite/application/__init__.py +1 -1
- biotite/application/application.py +20 -10
- biotite/application/autodock/__init__.py +1 -1
- biotite/application/autodock/app.py +74 -79
- biotite/application/blast/__init__.py +1 -1
- biotite/application/blast/alignment.py +19 -10
- biotite/application/blast/webapp.py +92 -85
- biotite/application/clustalo/__init__.py +1 -1
- biotite/application/clustalo/app.py +46 -61
- biotite/application/dssp/__init__.py +1 -1
- biotite/application/dssp/app.py +8 -11
- biotite/application/localapp.py +62 -60
- biotite/application/mafft/__init__.py +1 -1
- biotite/application/mafft/app.py +16 -22
- biotite/application/msaapp.py +78 -89
- biotite/application/muscle/__init__.py +1 -1
- biotite/application/muscle/app3.py +50 -64
- biotite/application/muscle/app5.py +23 -31
- biotite/application/sra/__init__.py +1 -1
- biotite/application/sra/app.py +64 -68
- biotite/application/tantan/__init__.py +1 -1
- biotite/application/tantan/app.py +22 -45
- biotite/application/util.py +7 -9
- biotite/application/viennarna/rnaalifold.py +34 -28
- biotite/application/viennarna/rnafold.py +24 -39
- biotite/application/viennarna/rnaplot.py +36 -21
- biotite/application/viennarna/util.py +17 -12
- biotite/application/webapp.py +13 -14
- biotite/copyable.py +13 -13
- biotite/database/__init__.py +1 -1
- biotite/database/entrez/__init__.py +1 -1
- biotite/database/entrez/check.py +2 -3
- biotite/database/entrez/dbnames.py +7 -5
- biotite/database/entrez/download.py +55 -49
- biotite/database/entrez/key.py +1 -1
- biotite/database/entrez/query.py +62 -23
- biotite/database/error.py +2 -1
- biotite/database/pubchem/__init__.py +1 -1
- biotite/database/pubchem/download.py +43 -45
- biotite/database/pubchem/error.py +2 -2
- biotite/database/pubchem/query.py +34 -31
- biotite/database/pubchem/throttle.py +3 -4
- biotite/database/rcsb/__init__.py +1 -1
- biotite/database/rcsb/download.py +44 -52
- biotite/database/rcsb/query.py +85 -80
- biotite/database/uniprot/check.py +6 -3
- biotite/database/uniprot/download.py +6 -11
- biotite/database/uniprot/query.py +115 -31
- biotite/file.py +12 -31
- biotite/sequence/__init__.py +3 -3
- biotite/sequence/align/__init__.py +2 -2
- biotite/sequence/align/alignment.py +99 -90
- biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
- biotite/sequence/align/buckets.py +12 -10
- biotite/sequence/align/cigar.py +43 -52
- biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
- biotite/sequence/align/kmeralphabet.pyx +55 -51
- biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
- biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
- biotite/sequence/align/kmertable.pyx +3 -2
- biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
- biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
- biotite/sequence/align/matrix.py +81 -82
- biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
- biotite/sequence/align/multiple.pyx +1 -1
- biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
- biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
- biotite/sequence/align/permutation.pyx +12 -4
- biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
- biotite/sequence/align/selector.pyx +52 -54
- biotite/sequence/align/statistics.py +32 -33
- biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
- biotite/sequence/alphabet.py +51 -65
- biotite/sequence/annotation.py +78 -77
- biotite/sequence/codec.cpython-311-darwin.so +0 -0
- biotite/sequence/codon.py +90 -79
- biotite/sequence/graphics/__init__.py +1 -1
- biotite/sequence/graphics/alignment.py +184 -103
- biotite/sequence/graphics/colorschemes.py +10 -12
- biotite/sequence/graphics/dendrogram.py +79 -34
- biotite/sequence/graphics/features.py +133 -99
- biotite/sequence/graphics/logo.py +22 -28
- biotite/sequence/graphics/plasmid.py +229 -178
- biotite/sequence/io/fasta/__init__.py +1 -1
- biotite/sequence/io/fasta/convert.py +44 -33
- biotite/sequence/io/fasta/file.py +42 -55
- biotite/sequence/io/fastq/__init__.py +1 -1
- biotite/sequence/io/fastq/convert.py +11 -14
- biotite/sequence/io/fastq/file.py +68 -112
- biotite/sequence/io/genbank/__init__.py +2 -2
- biotite/sequence/io/genbank/annotation.py +12 -20
- biotite/sequence/io/genbank/file.py +74 -76
- biotite/sequence/io/genbank/metadata.py +74 -62
- biotite/sequence/io/genbank/sequence.py +13 -14
- biotite/sequence/io/general.py +39 -30
- biotite/sequence/io/gff/__init__.py +2 -2
- biotite/sequence/io/gff/convert.py +10 -15
- biotite/sequence/io/gff/file.py +81 -65
- biotite/sequence/phylo/__init__.py +1 -1
- biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
- biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
- biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
- biotite/sequence/profile.py +57 -28
- biotite/sequence/search.py +17 -15
- biotite/sequence/seqtypes.py +200 -164
- biotite/sequence/sequence.py +15 -17
- biotite/structure/__init__.py +3 -3
- biotite/structure/atoms.py +221 -235
- biotite/structure/basepairs.py +260 -271
- biotite/structure/bonds.cpython-311-darwin.so +0 -0
- biotite/structure/bonds.pyx +29 -32
- biotite/structure/box.py +67 -71
- biotite/structure/celllist.cpython-311-darwin.so +0 -0
- biotite/structure/chains.py +55 -39
- biotite/structure/charges.cpython-311-darwin.so +0 -0
- biotite/structure/compare.py +32 -32
- biotite/structure/density.py +13 -18
- biotite/structure/dotbracket.py +20 -22
- biotite/structure/error.py +10 -2
- biotite/structure/filter.py +82 -77
- biotite/structure/geometry.py +130 -119
- biotite/structure/graphics/atoms.py +60 -43
- biotite/structure/graphics/rna.py +81 -68
- biotite/structure/hbond.py +112 -93
- biotite/structure/info/__init__.py +0 -2
- biotite/structure/info/atoms.py +10 -11
- biotite/structure/info/bonds.py +41 -43
- biotite/structure/info/ccd.py +4 -5
- biotite/structure/info/groups.py +1 -3
- biotite/structure/info/masses.py +5 -10
- biotite/structure/info/misc.py +1 -1
- biotite/structure/info/radii.py +20 -20
- biotite/structure/info/standardize.py +15 -26
- biotite/structure/integrity.py +18 -71
- biotite/structure/io/__init__.py +3 -4
- biotite/structure/io/dcd/__init__.py +1 -1
- biotite/structure/io/dcd/file.py +22 -20
- biotite/structure/io/general.py +47 -61
- biotite/structure/io/gro/__init__.py +1 -1
- biotite/structure/io/gro/file.py +73 -72
- biotite/structure/io/mol/__init__.py +1 -1
- biotite/structure/io/mol/convert.py +8 -11
- biotite/structure/io/mol/ctab.py +37 -36
- biotite/structure/io/mol/header.py +14 -10
- biotite/structure/io/mol/mol.py +9 -53
- biotite/structure/io/mol/sdf.py +47 -50
- biotite/structure/io/netcdf/__init__.py +1 -1
- biotite/structure/io/netcdf/file.py +24 -23
- biotite/structure/io/pdb/__init__.py +1 -1
- biotite/structure/io/pdb/convert.py +32 -20
- biotite/structure/io/pdb/file.py +151 -172
- biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
- biotite/structure/io/pdbqt/__init__.py +1 -1
- biotite/structure/io/pdbqt/convert.py +17 -11
- biotite/structure/io/pdbqt/file.py +128 -80
- biotite/structure/io/pdbx/__init__.py +1 -2
- biotite/structure/io/pdbx/bcif.py +36 -44
- biotite/structure/io/pdbx/cif.py +64 -62
- biotite/structure/io/pdbx/component.py +10 -16
- biotite/structure/io/pdbx/convert.py +235 -246
- biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
- biotite/structure/io/trajfile.py +76 -93
- biotite/structure/io/trr/__init__.py +1 -1
- biotite/structure/io/trr/file.py +12 -15
- biotite/structure/io/xtc/__init__.py +1 -1
- biotite/structure/io/xtc/file.py +11 -14
- biotite/structure/mechanics.py +9 -11
- biotite/structure/molecules.py +3 -4
- biotite/structure/pseudoknots.py +53 -67
- biotite/structure/rdf.py +23 -21
- biotite/structure/repair.py +137 -86
- biotite/structure/residues.py +26 -16
- biotite/structure/sasa.cpython-311-darwin.so +0 -0
- biotite/structure/{resutil.py → segments.py} +24 -23
- biotite/structure/sequence.py +10 -11
- biotite/structure/sse.py +100 -119
- biotite/structure/superimpose.py +39 -77
- biotite/structure/transform.py +97 -71
- biotite/structure/util.py +11 -13
- biotite/version.py +2 -2
- biotite/visualize.py +69 -55
- {biotite-0.41.2.dist-info → biotite-1.0.0.dist-info}/METADATA +5 -5
- biotite-1.0.0.dist-info/RECORD +322 -0
- biotite/structure/io/ctab.py +0 -72
- biotite/structure/io/mmtf/__init__.py +0 -21
- biotite/structure/io/mmtf/assembly.py +0 -214
- biotite/structure/io/mmtf/convertarray.cpython-311-darwin.so +0 -0
- biotite/structure/io/mmtf/convertarray.pyx +0 -341
- biotite/structure/io/mmtf/convertfile.cpython-311-darwin.so +0 -0
- biotite/structure/io/mmtf/convertfile.pyx +0 -501
- biotite/structure/io/mmtf/decode.cpython-311-darwin.so +0 -0
- biotite/structure/io/mmtf/decode.pyx +0 -152
- biotite/structure/io/mmtf/encode.cpython-311-darwin.so +0 -0
- biotite/structure/io/mmtf/encode.pyx +0 -183
- biotite/structure/io/mmtf/file.py +0 -233
- biotite/structure/io/npz/__init__.py +0 -20
- biotite/structure/io/npz/file.py +0 -152
- biotite/structure/io/pdbx/legacy.py +0 -267
- biotite/structure/io/tng/__init__.py +0 -13
- biotite/structure/io/tng/file.py +0 -46
- biotite/temp.py +0 -86
- biotite-0.41.2.dist-info/RECORD +0 -340
- {biotite-0.41.2.dist-info → biotite-1.0.0.dist-info}/WHEEL +0 -0
- {biotite-0.41.2.dist-info → biotite-1.0.0.dist-info}/licenses/LICENSE.rst +0 -0
|
@@ -7,13 +7,18 @@ __author__ = "Patrick Kunzmann"
|
|
|
7
7
|
|
|
8
8
|
import warnings
|
|
9
9
|
from collections import OrderedDict
|
|
10
|
-
from
|
|
11
|
-
from
|
|
12
|
-
from
|
|
13
|
-
from ...align.alignment import Alignment
|
|
10
|
+
from biotite.sequence.align.alignment import Alignment
|
|
11
|
+
from biotite.sequence.alphabet import AlphabetError, LetterAlphabet
|
|
12
|
+
from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
|
|
14
13
|
|
|
15
|
-
__all__ = [
|
|
16
|
-
|
|
14
|
+
__all__ = [
|
|
15
|
+
"get_sequence",
|
|
16
|
+
"get_sequences",
|
|
17
|
+
"set_sequence",
|
|
18
|
+
"set_sequences",
|
|
19
|
+
"get_alignment",
|
|
20
|
+
"set_alignment",
|
|
21
|
+
]
|
|
17
22
|
|
|
18
23
|
|
|
19
24
|
def get_sequence(fasta_file, header=None, seq_type=None):
|
|
@@ -180,8 +185,10 @@ def get_alignment(fasta_file, additional_gap_chars=("_",), seq_type=None):
|
|
|
180
185
|
for i, seq_str in enumerate(seq_strings):
|
|
181
186
|
seq_strings[i] = seq_str.replace(char, "-")
|
|
182
187
|
# Remove gaps for creation of sequences
|
|
183
|
-
sequences = [
|
|
184
|
-
|
|
188
|
+
sequences = [
|
|
189
|
+
_convert_to_sequence(seq_str.replace("-", ""), seq_type)
|
|
190
|
+
for seq_str in seq_strings
|
|
191
|
+
]
|
|
185
192
|
trace = Alignment.trace_from_strings(seq_strings)
|
|
186
193
|
return Alignment(sequences, trace, score=None)
|
|
187
194
|
|
|
@@ -212,44 +219,29 @@ def set_alignment(fasta_file, alignment, seq_names):
|
|
|
212
219
|
|
|
213
220
|
|
|
214
221
|
def _convert_to_sequence(seq_str, seq_type=None):
|
|
215
|
-
|
|
216
|
-
# Define preprocessing of preimplemented sequence types
|
|
217
|
-
|
|
218
|
-
# Replace selenocysteine with cysteine
|
|
219
|
-
# and pyrrolysine with lysine
|
|
220
|
-
process_protein_sequence = (
|
|
221
|
-
lambda x : x.upper().replace("U", "C").replace("O", "K")
|
|
222
|
-
)
|
|
223
|
-
# For nucleotides uracil is represented by thymine and there is only
|
|
224
|
-
# one letter for completely unknown nucleotides
|
|
225
|
-
process_nucleotide_sequence = (
|
|
226
|
-
lambda x : x.upper().replace("U","T").replace("X","N")
|
|
227
|
-
)
|
|
228
|
-
|
|
229
222
|
# Set manually selected sequence type
|
|
230
|
-
|
|
231
223
|
if seq_type is not None:
|
|
232
224
|
# Do preprocessing as done without manual selection
|
|
233
225
|
if seq_type == NucleotideSequence:
|
|
234
|
-
seq_str =
|
|
226
|
+
seq_str = _process_nucleotide_sequence(seq_str)
|
|
235
227
|
elif seq_type == ProteinSequence:
|
|
236
228
|
if "U" in seq_str:
|
|
237
229
|
warnings.warn(
|
|
238
230
|
"ProteinSequence objects do not support selenocysteine "
|
|
239
231
|
"(U), occurrences were substituted by cysteine (C)"
|
|
240
232
|
)
|
|
241
|
-
seq_str =
|
|
233
|
+
seq_str = _process_protein_sequence(seq_str)
|
|
242
234
|
# Return the converted sequence
|
|
243
235
|
return seq_type(seq_str)
|
|
244
236
|
|
|
245
237
|
# Attempt to automatically determine sequence type
|
|
246
238
|
|
|
247
239
|
try:
|
|
248
|
-
return NucleotideSequence(
|
|
240
|
+
return NucleotideSequence(_process_nucleotide_sequence(seq_str))
|
|
249
241
|
except AlphabetError:
|
|
250
242
|
pass
|
|
251
243
|
try:
|
|
252
|
-
prot_seq = ProteinSequence(
|
|
244
|
+
prot_seq = ProteinSequence(_process_protein_sequence(seq_str))
|
|
253
245
|
# Raise Warning after conversion into 'ProteinSequence'
|
|
254
246
|
# to wait for potential 'AlphabetError'
|
|
255
247
|
if "U" in seq_str:
|
|
@@ -259,15 +251,34 @@ def _convert_to_sequence(seq_str, seq_type=None):
|
|
|
259
251
|
)
|
|
260
252
|
return prot_seq
|
|
261
253
|
except AlphabetError:
|
|
262
|
-
raise ValueError(
|
|
263
|
-
|
|
254
|
+
raise ValueError(
|
|
255
|
+
"FASTA data cannot be converted either to "
|
|
256
|
+
"'NucleotideSequence' nor to 'ProteinSequence'"
|
|
257
|
+
)
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _process_protein_sequence(x):
|
|
261
|
+
"""
|
|
262
|
+
Replace selenocysteine with cysteine and pyrrolysine with lysine.
|
|
263
|
+
"""
|
|
264
|
+
return x.upper().replace("U", "C").replace("O", "K")
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _process_nucleotide_sequence(x):
|
|
268
|
+
"""
|
|
269
|
+
For nucleotides uracil is represented by thymine and there is only
|
|
270
|
+
one letter for completely unknown nucleotides
|
|
271
|
+
"""
|
|
272
|
+
return x.upper().replace("U", "T").replace("X", "N")
|
|
264
273
|
|
|
265
274
|
|
|
266
275
|
def _convert_to_string(sequence, as_rna):
|
|
267
276
|
if not isinstance(sequence.get_alphabet(), LetterAlphabet):
|
|
268
|
-
raise ValueError(
|
|
269
|
-
|
|
277
|
+
raise ValueError(
|
|
278
|
+
"Only sequences using single letter alphabets "
|
|
279
|
+
"can be stored in a FASTA file"
|
|
280
|
+
)
|
|
270
281
|
if isinstance(sequence, NucleotideSequence) and as_rna:
|
|
271
|
-
return
|
|
282
|
+
return str(sequence).replace("T", "U")
|
|
272
283
|
else:
|
|
273
|
-
return
|
|
284
|
+
return str(sequence)
|
|
@@ -6,21 +6,21 @@ __name__ = "biotite.sequence.io.fasta"
|
|
|
6
6
|
__author__ = "Patrick Kunzmann"
|
|
7
7
|
__all__ = ["FastaFile"]
|
|
8
8
|
|
|
9
|
-
from ....file import TextFile, InvalidFileError, wrap_string
|
|
10
9
|
from collections import OrderedDict
|
|
11
10
|
from collections.abc import MutableMapping
|
|
11
|
+
from biotite.file import InvalidFileError, TextFile, wrap_string
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
class FastaFile(TextFile, MutableMapping):
|
|
15
15
|
"""
|
|
16
16
|
This class represents a file in FASTA format.
|
|
17
|
-
|
|
17
|
+
|
|
18
18
|
A FASTA file contains so called *header* lines, beginning with
|
|
19
19
|
``>``, that describe following sequence.
|
|
20
20
|
The corresponding sequence starts at the line after the header line
|
|
21
21
|
and ends at the next header line or at the end of file.
|
|
22
22
|
The header along with its sequence forms an entry.
|
|
23
|
-
|
|
23
|
+
|
|
24
24
|
This class is used in a dictionary like manner, implementing the
|
|
25
25
|
:class:`MutableMapping` interface:
|
|
26
26
|
Headers (without the leading ``>``) are used as keys,
|
|
@@ -35,10 +35,10 @@ class FastaFile(TextFile, MutableMapping):
|
|
|
35
35
|
after which a line break is inserted.
|
|
36
36
|
Only relevant, when adding sequences to a file.
|
|
37
37
|
Default is 80.
|
|
38
|
-
|
|
38
|
+
|
|
39
39
|
Examples
|
|
40
40
|
--------
|
|
41
|
-
|
|
41
|
+
|
|
42
42
|
>>> import os.path
|
|
43
43
|
>>> file = FastaFile()
|
|
44
44
|
>>> file["seq1"] = "ATACT"
|
|
@@ -61,17 +61,17 @@ class FastaFile(TextFile, MutableMapping):
|
|
|
61
61
|
{'seq2': 'AAAATT'}
|
|
62
62
|
>>> file.write(os.path.join(path_to_directory, "test.fasta"))
|
|
63
63
|
"""
|
|
64
|
-
|
|
64
|
+
|
|
65
65
|
def __init__(self, chars_per_line=80):
|
|
66
66
|
super().__init__()
|
|
67
67
|
self._chars_per_line = chars_per_line
|
|
68
68
|
self._entries = OrderedDict()
|
|
69
|
-
|
|
69
|
+
|
|
70
70
|
@classmethod
|
|
71
71
|
def read(cls, file, chars_per_line=80):
|
|
72
72
|
"""
|
|
73
73
|
Read a FASTA file.
|
|
74
|
-
|
|
74
|
+
|
|
75
75
|
Parameters
|
|
76
76
|
----------
|
|
77
77
|
file : file-like object or str
|
|
@@ -82,7 +82,7 @@ class FastaFile(TextFile, MutableMapping):
|
|
|
82
82
|
after which a line break is inserted.
|
|
83
83
|
Only relevant, when adding sequences to a file.
|
|
84
84
|
Default is 80.
|
|
85
|
-
|
|
85
|
+
|
|
86
86
|
Returns
|
|
87
87
|
-------
|
|
88
88
|
file_object : FastaFile
|
|
@@ -90,24 +90,23 @@ class FastaFile(TextFile, MutableMapping):
|
|
|
90
90
|
"""
|
|
91
91
|
file = super().read(file, chars_per_line)
|
|
92
92
|
# Filter out empty and comment lines
|
|
93
|
-
file.lines = [
|
|
94
|
-
|
|
93
|
+
file.lines = [
|
|
94
|
+
line for line in file.lines if len(line.strip()) != 0 and line[0] != ";"
|
|
95
|
+
]
|
|
95
96
|
if len(file.lines) == 0:
|
|
96
97
|
raise InvalidFileError("File is empty or contains only comments")
|
|
97
98
|
file._find_entries()
|
|
98
99
|
return file
|
|
99
|
-
|
|
100
|
+
|
|
100
101
|
def __setitem__(self, header, seq_str):
|
|
101
102
|
if not isinstance(header, str):
|
|
102
|
-
raise IndexError(
|
|
103
|
-
"'FastaFile' only supports header strings as keys"
|
|
104
|
-
)
|
|
103
|
+
raise IndexError("'FastaFile' only supports header strings as keys")
|
|
105
104
|
if not isinstance(seq_str, str):
|
|
106
|
-
raise TypeError("'FastaFile' only supports sequence strings "
|
|
107
|
-
"as values")
|
|
105
|
+
raise TypeError("'FastaFile' only supports sequence strings " "as values")
|
|
108
106
|
# Create lines for new header and sequence (with line breaks)
|
|
109
|
-
new_lines = [">" + header.replace("\n","").strip()] +
|
|
110
|
-
|
|
107
|
+
new_lines = [">" + header.replace("\n", "").strip()] + wrap_string(
|
|
108
|
+
seq_str, width=self._chars_per_line
|
|
109
|
+
)
|
|
111
110
|
if header in self:
|
|
112
111
|
# Delete lines of entry corresponding to the header,
|
|
113
112
|
# if existing
|
|
@@ -118,83 +117,75 @@ class FastaFile(TextFile, MutableMapping):
|
|
|
118
117
|
# Simply append lines
|
|
119
118
|
# Add entry in a more efficient way than '_find_entries()'
|
|
120
119
|
# for this simple case
|
|
121
|
-
self._entries[header] = (
|
|
122
|
-
len(self.lines),
|
|
123
|
-
len(self.lines) + len(new_lines)
|
|
124
|
-
)
|
|
120
|
+
self._entries[header] = (len(self.lines), len(self.lines) + len(new_lines))
|
|
125
121
|
self.lines += new_lines
|
|
126
|
-
|
|
122
|
+
|
|
127
123
|
def __getitem__(self, header):
|
|
128
124
|
if not isinstance(header, str):
|
|
129
|
-
raise IndexError(
|
|
130
|
-
"'FastaFile' only supports header strings as keys"
|
|
131
|
-
)
|
|
125
|
+
raise IndexError("'FastaFile' only supports header strings as keys")
|
|
132
126
|
start, stop = self._entries[header]
|
|
133
127
|
# Concatenate sequence string from following lines
|
|
134
|
-
seq_string = "".join(
|
|
135
|
-
[line.strip() for line in self.lines[start+1 : stop]]
|
|
136
|
-
)
|
|
128
|
+
seq_string = "".join([line.strip() for line in self.lines[start + 1 : stop]])
|
|
137
129
|
return seq_string
|
|
138
|
-
|
|
130
|
+
|
|
139
131
|
def __delitem__(self, header):
|
|
140
132
|
start, stop = self._entries[header]
|
|
141
133
|
del self.lines[start:stop]
|
|
142
134
|
del self._entries[header]
|
|
143
135
|
self._find_entries()
|
|
144
|
-
|
|
136
|
+
|
|
145
137
|
def __len__(self):
|
|
146
138
|
return len(self._entries)
|
|
147
|
-
|
|
139
|
+
|
|
148
140
|
def __iter__(self):
|
|
149
141
|
return self._entries.__iter__()
|
|
150
|
-
|
|
142
|
+
|
|
151
143
|
def __contains__(self, identifer):
|
|
152
144
|
return identifer in self._entries
|
|
153
|
-
|
|
145
|
+
|
|
154
146
|
def _find_entries(self):
|
|
155
147
|
if len(self.lines) > 0 and self.lines[0][0] != ">":
|
|
156
148
|
raise InvalidFileError(
|
|
157
149
|
f"File starts with '{self.lines[0][0]}' instead of '>'"
|
|
158
150
|
)
|
|
159
|
-
|
|
151
|
+
|
|
160
152
|
header_i = []
|
|
161
153
|
for i, line in enumerate(self.lines):
|
|
162
154
|
if line[0] == ">":
|
|
163
155
|
header_i.append(i)
|
|
164
|
-
|
|
156
|
+
|
|
165
157
|
self._entries = OrderedDict()
|
|
166
158
|
for j in range(len(header_i)):
|
|
167
159
|
# Remove leading '>' from header
|
|
168
160
|
header = self.lines[header_i[j]].strip()[1:]
|
|
169
161
|
start = header_i[j]
|
|
170
|
-
if j < len(header_i) -1:
|
|
162
|
+
if j < len(header_i) - 1:
|
|
171
163
|
# Header in mid or start of file
|
|
172
164
|
# -> stop is start of next header
|
|
173
|
-
stop = header_i[j+1]
|
|
165
|
+
stop = header_i[j + 1]
|
|
174
166
|
else:
|
|
175
167
|
# Last header -> entry stops at end of file
|
|
176
168
|
stop = len(self.lines)
|
|
177
169
|
self._entries[header] = (start, stop)
|
|
178
170
|
|
|
179
|
-
|
|
180
171
|
@staticmethod
|
|
181
172
|
def read_iter(file):
|
|
182
173
|
"""
|
|
183
174
|
Create an iterator over each sequence of the given FASTA file.
|
|
184
|
-
|
|
175
|
+
|
|
185
176
|
Parameters
|
|
186
177
|
----------
|
|
187
178
|
file : file-like object or str
|
|
188
179
|
The file to be read.
|
|
189
180
|
Alternatively a file path can be supplied.
|
|
190
|
-
|
|
181
|
+
|
|
191
182
|
Yields
|
|
192
183
|
------
|
|
193
184
|
header : str
|
|
194
185
|
The header of the current sequence.
|
|
195
186
|
seq_str : str
|
|
196
187
|
The current sequence as string.
|
|
197
|
-
|
|
188
|
+
|
|
198
189
|
Notes
|
|
199
190
|
-----
|
|
200
191
|
This approach gives the same results as
|
|
@@ -221,7 +212,6 @@ class FastaFile(TextFile, MutableMapping):
|
|
|
221
212
|
# Yield final entry
|
|
222
213
|
if header is not None:
|
|
223
214
|
yield header, "".join(seq_str_list)
|
|
224
|
-
|
|
225
215
|
|
|
226
216
|
@staticmethod
|
|
227
217
|
def write_iter(file, items, chars_per_line=80):
|
|
@@ -235,7 +225,7 @@ class FastaFile(TextFile, MutableMapping):
|
|
|
235
225
|
Hence, this static method may save a large amount of memory if
|
|
236
226
|
a large file should be written, especially if the `items`
|
|
237
227
|
are provided as generator.
|
|
238
|
-
|
|
228
|
+
|
|
239
229
|
Parameters
|
|
240
230
|
----------
|
|
241
231
|
file : file-like object or str
|
|
@@ -256,23 +246,20 @@ class FastaFile(TextFile, MutableMapping):
|
|
|
256
246
|
This method does not test, whether the given identifiers are
|
|
257
247
|
unambiguous.
|
|
258
248
|
"""
|
|
249
|
+
|
|
259
250
|
def line_generator():
|
|
260
251
|
for item in items:
|
|
261
252
|
header, seq_str = item
|
|
262
253
|
if not isinstance(header, str):
|
|
263
|
-
raise IndexError(
|
|
264
|
-
"'FastaFile' only supports header strings"
|
|
265
|
-
)
|
|
254
|
+
raise IndexError("'FastaFile' only supports header strings")
|
|
266
255
|
if not isinstance(seq_str, str):
|
|
267
|
-
raise TypeError(
|
|
268
|
-
|
|
269
|
-
)
|
|
270
|
-
|
|
256
|
+
raise TypeError("'FastaFile' only supports sequence strings")
|
|
257
|
+
|
|
271
258
|
# Yield header line
|
|
272
|
-
yield ">" + header.replace("\n","").strip()
|
|
259
|
+
yield ">" + header.replace("\n", "").strip()
|
|
273
260
|
|
|
274
261
|
# Yield sequence line(s)
|
|
275
262
|
for line in wrap_string(seq_str, width=chars_per_line):
|
|
276
263
|
yield line
|
|
277
|
-
|
|
278
|
-
TextFile.write_iter(file, line_generator())
|
|
264
|
+
|
|
265
|
+
TextFile.write_iter(file, line_generator())
|
|
@@ -6,10 +6,7 @@ __name__ = "biotite.sequence.io.fastq"
|
|
|
6
6
|
__author__ = "Patrick Kunzmann"
|
|
7
7
|
|
|
8
8
|
from collections import OrderedDict
|
|
9
|
-
from
|
|
10
|
-
from ...alphabet import AlphabetError, LetterAlphabet
|
|
11
|
-
from ...seqtypes import NucleotideSequence
|
|
12
|
-
from ...align.alignment import Alignment
|
|
9
|
+
from biotite.sequence.seqtypes import NucleotideSequence
|
|
13
10
|
|
|
14
11
|
__all__ = ["get_sequence", "get_sequences", "set_sequence", "set_sequences"]
|
|
15
12
|
|
|
@@ -17,7 +14,7 @@ __all__ = ["get_sequence", "get_sequences", "set_sequence", "set_sequences"]
|
|
|
17
14
|
def get_sequence(fastq_file, header=None):
|
|
18
15
|
"""
|
|
19
16
|
Get a sequence and quality scores from a `FastqFile` instance.
|
|
20
|
-
|
|
17
|
+
|
|
21
18
|
Parameters
|
|
22
19
|
----------
|
|
23
20
|
fastq_file : FastqFile
|
|
@@ -25,7 +22,7 @@ def get_sequence(fastq_file, header=None):
|
|
|
25
22
|
header : str, optional
|
|
26
23
|
The identifier to get the sequence and scores from.
|
|
27
24
|
By default, the first sequence of the file is returned.
|
|
28
|
-
|
|
25
|
+
|
|
29
26
|
Returns
|
|
30
27
|
-------
|
|
31
28
|
sequence : NucleotideSequence
|
|
@@ -43,7 +40,7 @@ def get_sequence(fastq_file, header=None):
|
|
|
43
40
|
break
|
|
44
41
|
if seq_str is None:
|
|
45
42
|
raise ValueError("File does not contain any sequences")
|
|
46
|
-
processed_seq_str = seq_str.replace("U","T").replace("X","N")
|
|
43
|
+
processed_seq_str = seq_str.replace("U", "T").replace("X", "N")
|
|
47
44
|
return NucleotideSequence(processed_seq_str), scores
|
|
48
45
|
|
|
49
46
|
|
|
@@ -51,12 +48,12 @@ def get_sequences(fastq_file):
|
|
|
51
48
|
"""
|
|
52
49
|
Get a dictionary from a `FastqFile` instance,
|
|
53
50
|
where identifiers are keys and sequence-score-tuples are values.
|
|
54
|
-
|
|
51
|
+
|
|
55
52
|
Parameters
|
|
56
53
|
----------
|
|
57
54
|
fastq_file : FastqFile
|
|
58
55
|
The `Fastqile` to be accessed.
|
|
59
|
-
|
|
56
|
+
|
|
60
57
|
Returns
|
|
61
58
|
-------
|
|
62
59
|
seq_dict : dict
|
|
@@ -65,7 +62,7 @@ def get_sequences(fastq_file):
|
|
|
65
62
|
"""
|
|
66
63
|
seq_dict = OrderedDict()
|
|
67
64
|
for header, (seq_str, scores) in fastq_file.items():
|
|
68
|
-
processed_seq_str = seq_str.replace("U","T").replace("X","N")
|
|
65
|
+
processed_seq_str = seq_str.replace("U", "T").replace("X", "N")
|
|
69
66
|
seq_dict[header] = NucleotideSequence(processed_seq_str), scores
|
|
70
67
|
return seq_dict
|
|
71
68
|
|
|
@@ -73,7 +70,7 @@ def get_sequences(fastq_file):
|
|
|
73
70
|
def set_sequence(fastq_file, sequence, scores, header=None, as_rna=False):
|
|
74
71
|
"""
|
|
75
72
|
Set a sequence and a quality score array in a `FastqFile` instance.
|
|
76
|
-
|
|
73
|
+
|
|
77
74
|
Parameters
|
|
78
75
|
----------
|
|
79
76
|
fastq_file : FastqFile
|
|
@@ -96,7 +93,7 @@ def set_sequence(fastq_file, sequence, scores, header=None, as_rna=False):
|
|
|
96
93
|
def set_sequences(fastq_file, sequence_dict, as_rna=False):
|
|
97
94
|
"""
|
|
98
95
|
Set sequences in a `FastqFile` instance from a dictionary.
|
|
99
|
-
|
|
96
|
+
|
|
100
97
|
Parameters
|
|
101
98
|
----------
|
|
102
99
|
fastq_file : FastqFile
|
|
@@ -115,6 +112,6 @@ def set_sequences(fastq_file, sequence_dict, as_rna=False):
|
|
|
115
112
|
|
|
116
113
|
def _convert_to_string(sequence, as_rna):
|
|
117
114
|
if as_rna:
|
|
118
|
-
return
|
|
115
|
+
return str(sequence).replace("T", "U")
|
|
119
116
|
else:
|
|
120
|
-
return
|
|
117
|
+
return str(sequence)
|