biotite 0.41.1__cp312-cp312-win_amd64.whl → 1.0.0__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +2 -3
- biotite/application/__init__.py +36 -10
- biotite/application/application.py +22 -11
- biotite/application/autodock/__init__.py +1 -1
- biotite/application/autodock/app.py +74 -79
- biotite/application/blast/__init__.py +1 -1
- biotite/application/blast/alignment.py +19 -10
- biotite/application/blast/webapp.py +92 -85
- biotite/application/clustalo/__init__.py +1 -1
- biotite/application/clustalo/app.py +46 -61
- biotite/application/dssp/__init__.py +1 -1
- biotite/application/dssp/app.py +8 -11
- biotite/application/localapp.py +62 -60
- biotite/application/mafft/__init__.py +1 -1
- biotite/application/mafft/app.py +16 -22
- biotite/application/msaapp.py +78 -89
- biotite/application/muscle/__init__.py +1 -1
- biotite/application/muscle/app3.py +50 -64
- biotite/application/muscle/app5.py +23 -31
- biotite/application/sra/__init__.py +1 -1
- biotite/application/sra/app.py +64 -68
- biotite/application/tantan/__init__.py +1 -1
- biotite/application/tantan/app.py +22 -45
- biotite/application/util.py +7 -9
- biotite/application/viennarna/rnaalifold.py +34 -28
- biotite/application/viennarna/rnafold.py +24 -39
- biotite/application/viennarna/rnaplot.py +36 -21
- biotite/application/viennarna/util.py +17 -12
- biotite/application/webapp.py +13 -14
- biotite/copyable.py +13 -13
- biotite/database/__init__.py +1 -1
- biotite/database/entrez/__init__.py +1 -1
- biotite/database/entrez/check.py +2 -3
- biotite/database/entrez/dbnames.py +7 -5
- biotite/database/entrez/download.py +55 -49
- biotite/database/entrez/key.py +1 -1
- biotite/database/entrez/query.py +62 -23
- biotite/database/error.py +2 -1
- biotite/database/pubchem/__init__.py +1 -1
- biotite/database/pubchem/download.py +43 -45
- biotite/database/pubchem/error.py +2 -2
- biotite/database/pubchem/query.py +34 -31
- biotite/database/pubchem/throttle.py +3 -4
- biotite/database/rcsb/__init__.py +1 -1
- biotite/database/rcsb/download.py +44 -52
- biotite/database/rcsb/query.py +85 -80
- biotite/database/uniprot/check.py +6 -3
- biotite/database/uniprot/download.py +6 -11
- biotite/database/uniprot/query.py +115 -31
- biotite/file.py +12 -31
- biotite/sequence/__init__.py +16 -5
- biotite/sequence/align/__init__.py +160 -6
- biotite/sequence/align/alignment.py +99 -90
- biotite/sequence/align/banded.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/buckets.py +12 -10
- biotite/sequence/align/cigar.py +43 -52
- biotite/sequence/align/kmeralphabet.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/kmeralphabet.pyx +55 -51
- biotite/sequence/align/kmersimilarity.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.pyx +3 -2
- biotite/sequence/align/localgapped.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/localungapped.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/matrix.py +81 -82
- biotite/sequence/align/multiple.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/multiple.pyx +35 -35
- biotite/sequence/align/pairwise.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.pyx +12 -4
- biotite/sequence/align/selector.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.pyx +52 -54
- biotite/sequence/align/statistics.py +32 -33
- biotite/sequence/align/tracetable.cp312-win_amd64.pyd +0 -0
- biotite/sequence/alphabet.py +112 -126
- biotite/sequence/annotation.py +78 -77
- biotite/sequence/codec.cp312-win_amd64.pyd +0 -0
- biotite/sequence/codon.py +90 -79
- biotite/sequence/graphics/__init__.py +1 -1
- biotite/sequence/graphics/alignment.py +184 -103
- biotite/sequence/graphics/colorschemes.py +10 -12
- biotite/sequence/graphics/dendrogram.py +79 -34
- biotite/sequence/graphics/features.py +133 -99
- biotite/sequence/graphics/logo.py +22 -28
- biotite/sequence/graphics/plasmid.py +229 -178
- biotite/sequence/io/fasta/__init__.py +1 -1
- biotite/sequence/io/fasta/convert.py +44 -33
- biotite/sequence/io/fasta/file.py +42 -55
- biotite/sequence/io/fastq/__init__.py +1 -1
- biotite/sequence/io/fastq/convert.py +11 -14
- biotite/sequence/io/fastq/file.py +68 -112
- biotite/sequence/io/genbank/__init__.py +2 -2
- biotite/sequence/io/genbank/annotation.py +12 -20
- biotite/sequence/io/genbank/file.py +74 -76
- biotite/sequence/io/genbank/metadata.py +74 -62
- biotite/sequence/io/genbank/sequence.py +13 -14
- biotite/sequence/io/general.py +39 -30
- biotite/sequence/io/gff/__init__.py +2 -2
- biotite/sequence/io/gff/convert.py +10 -15
- biotite/sequence/io/gff/file.py +81 -65
- biotite/sequence/phylo/__init__.py +1 -1
- biotite/sequence/phylo/nj.cp312-win_amd64.pyd +0 -0
- biotite/sequence/phylo/tree.cp312-win_amd64.pyd +0 -0
- biotite/sequence/phylo/upgma.cp312-win_amd64.pyd +0 -0
- biotite/sequence/profile.py +57 -28
- biotite/sequence/search.py +17 -15
- biotite/sequence/seqtypes.py +200 -164
- biotite/sequence/sequence.py +64 -64
- biotite/structure/__init__.py +3 -3
- biotite/structure/atoms.py +226 -240
- biotite/structure/basepairs.py +260 -271
- biotite/structure/bonds.cp312-win_amd64.pyd +0 -0
- biotite/structure/bonds.pyx +88 -100
- biotite/structure/box.py +67 -71
- biotite/structure/celllist.cp312-win_amd64.pyd +0 -0
- biotite/structure/chains.py +55 -39
- biotite/structure/charges.cp312-win_amd64.pyd +0 -0
- biotite/structure/compare.py +32 -32
- biotite/structure/density.py +13 -18
- biotite/structure/dotbracket.py +20 -22
- biotite/structure/error.py +10 -2
- biotite/structure/filter.py +82 -77
- biotite/structure/geometry.py +130 -119
- biotite/structure/graphics/atoms.py +60 -43
- biotite/structure/graphics/rna.py +81 -68
- biotite/structure/hbond.py +112 -93
- biotite/structure/info/__init__.py +0 -2
- biotite/structure/info/atoms.py +10 -11
- biotite/structure/info/bonds.py +41 -43
- biotite/structure/info/ccd.py +21 -7
- biotite/structure/info/groups.py +10 -15
- biotite/structure/info/masses.py +5 -10
- biotite/structure/info/misc.py +1 -1
- biotite/structure/info/radii.py +20 -20
- biotite/structure/info/standardize.py +15 -26
- biotite/structure/integrity.py +18 -71
- biotite/structure/io/__init__.py +3 -4
- biotite/structure/io/dcd/__init__.py +1 -1
- biotite/structure/io/dcd/file.py +22 -20
- biotite/structure/io/general.py +47 -61
- biotite/structure/io/gro/__init__.py +1 -1
- biotite/structure/io/gro/file.py +73 -72
- biotite/structure/io/mol/__init__.py +1 -1
- biotite/structure/io/mol/convert.py +8 -11
- biotite/structure/io/mol/ctab.py +37 -36
- biotite/structure/io/mol/header.py +14 -10
- biotite/structure/io/mol/mol.py +9 -53
- biotite/structure/io/mol/sdf.py +47 -50
- biotite/structure/io/netcdf/__init__.py +1 -1
- biotite/structure/io/netcdf/file.py +24 -23
- biotite/structure/io/pdb/__init__.py +1 -1
- biotite/structure/io/pdb/convert.py +32 -20
- biotite/structure/io/pdb/file.py +151 -172
- biotite/structure/io/pdb/hybrid36.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/pdbqt/__init__.py +1 -1
- biotite/structure/io/pdbqt/convert.py +17 -11
- biotite/structure/io/pdbqt/file.py +128 -80
- biotite/structure/io/pdbx/__init__.py +1 -2
- biotite/structure/io/pdbx/bcif.py +36 -52
- biotite/structure/io/pdbx/cif.py +64 -62
- biotite/structure/io/pdbx/component.py +10 -16
- biotite/structure/io/pdbx/convert.py +235 -246
- biotite/structure/io/pdbx/encoding.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/trajfile.py +76 -93
- biotite/structure/io/trr/__init__.py +1 -1
- biotite/structure/io/trr/file.py +12 -15
- biotite/structure/io/xtc/__init__.py +1 -1
- biotite/structure/io/xtc/file.py +11 -14
- biotite/structure/mechanics.py +9 -11
- biotite/structure/molecules.py +3 -4
- biotite/structure/pseudoknots.py +53 -67
- biotite/structure/rdf.py +23 -21
- biotite/structure/repair.py +137 -86
- biotite/structure/residues.py +26 -16
- biotite/structure/sasa.cp312-win_amd64.pyd +0 -0
- biotite/structure/{resutil.py → segments.py} +24 -23
- biotite/structure/sequence.py +10 -11
- biotite/structure/sse.py +100 -119
- biotite/structure/superimpose.py +39 -77
- biotite/structure/transform.py +97 -71
- biotite/structure/util.py +11 -13
- biotite/version.py +2 -2
- biotite/visualize.py +69 -55
- {biotite-0.41.1.dist-info → biotite-1.0.0.dist-info}/METADATA +6 -6
- biotite-1.0.0.dist-info/RECORD +322 -0
- {biotite-0.41.1.dist-info → biotite-1.0.0.dist-info}/WHEEL +1 -1
- biotite/structure/io/ctab.py +0 -72
- biotite/structure/io/mmtf/__init__.py +0 -21
- biotite/structure/io/mmtf/assembly.py +0 -214
- biotite/structure/io/mmtf/convertarray.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/convertarray.pyx +0 -341
- biotite/structure/io/mmtf/convertfile.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/convertfile.pyx +0 -501
- biotite/structure/io/mmtf/decode.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/decode.pyx +0 -152
- biotite/structure/io/mmtf/encode.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/encode.pyx +0 -183
- biotite/structure/io/mmtf/file.py +0 -233
- biotite/structure/io/npz/__init__.py +0 -20
- biotite/structure/io/npz/file.py +0 -152
- biotite/structure/io/pdbx/legacy.py +0 -267
- biotite/structure/io/tng/__init__.py +0 -13
- biotite/structure/io/tng/file.py +0 -46
- biotite/temp.py +0 -86
- biotite-0.41.1.dist-info/RECORD +0 -340
- {biotite-0.41.1.dist-info → biotite-1.0.0.dist-info}/licenses/LICENSE.rst +0 -0
|
@@ -5,23 +5,21 @@
|
|
|
5
5
|
__name__ = "biotite.sequence.io.fastq"
|
|
6
6
|
__author__ = "Patrick Kunzmann"
|
|
7
7
|
|
|
8
|
-
import warnings
|
|
9
|
-
from numbers import Integral
|
|
10
8
|
from collections import OrderedDict
|
|
11
9
|
from collections.abc import MutableMapping
|
|
10
|
+
from numbers import Integral
|
|
12
11
|
import numpy as np
|
|
13
|
-
from
|
|
14
|
-
from ...seqtypes import NucleotideSequence
|
|
12
|
+
from biotite.file import InvalidFileError, TextFile, wrap_string
|
|
15
13
|
|
|
16
14
|
__all__ = ["FastqFile"]
|
|
17
15
|
|
|
18
16
|
|
|
19
17
|
_OFFSETS = {
|
|
20
|
-
"Sanger"
|
|
21
|
-
"Solexa"
|
|
22
|
-
"Illumina-1.3"
|
|
23
|
-
"Illumina-1.5"
|
|
24
|
-
"Illumina-1.8"
|
|
18
|
+
"Sanger": 33,
|
|
19
|
+
"Solexa": 64,
|
|
20
|
+
"Illumina-1.3": 64,
|
|
21
|
+
"Illumina-1.5": 64,
|
|
22
|
+
"Illumina-1.8": 33,
|
|
25
23
|
}
|
|
26
24
|
|
|
27
25
|
|
|
@@ -47,7 +45,7 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
47
45
|
An identifier string (without the leading ``@``) is used as index
|
|
48
46
|
to get and set the corresponding sequence and quality.
|
|
49
47
|
``del`` removes an entry in the file.
|
|
50
|
-
|
|
48
|
+
|
|
51
49
|
Parameters
|
|
52
50
|
----------
|
|
53
51
|
offset : int or {'Sanger', 'Solexa', 'Illumina-1.3', 'Illumina-1.5', 'Illumina-1.8'}
|
|
@@ -61,10 +59,10 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
61
59
|
Only relevant, when adding sequences to a file.
|
|
62
60
|
By default each sequence (and score string)
|
|
63
61
|
is put into one line.
|
|
64
|
-
|
|
62
|
+
|
|
65
63
|
Examples
|
|
66
64
|
--------
|
|
67
|
-
|
|
65
|
+
|
|
68
66
|
>>> import os.path
|
|
69
67
|
>>> file = FastqFile(offset="Sanger")
|
|
70
68
|
>>> file["seq1"] = str(NucleotideSequence("ATACT")), [0,3,10,7,12]
|
|
@@ -91,18 +89,18 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
91
89
|
0.96=GD
|
|
92
90
|
>>> file.write(os.path.join(path_to_directory, "test.fastq"))
|
|
93
91
|
"""
|
|
94
|
-
|
|
92
|
+
|
|
95
93
|
def __init__(self, offset, chars_per_line=None):
|
|
96
94
|
super().__init__()
|
|
97
95
|
self._chars_per_line = chars_per_line
|
|
98
96
|
self._entries = OrderedDict()
|
|
99
97
|
self._offset = _convert_offset(offset)
|
|
100
|
-
|
|
98
|
+
|
|
101
99
|
@classmethod
|
|
102
100
|
def read(cls, file, offset, chars_per_line=None):
|
|
103
101
|
"""
|
|
104
102
|
Read a FASTQ file.
|
|
105
|
-
|
|
103
|
+
|
|
106
104
|
Parameters
|
|
107
105
|
----------
|
|
108
106
|
file : file-like object or str
|
|
@@ -119,7 +117,7 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
119
117
|
Only relevant, when adding sequences to a file.
|
|
120
118
|
By default each sequence (and score string)
|
|
121
119
|
is put into one line.
|
|
122
|
-
|
|
120
|
+
|
|
123
121
|
Returns
|
|
124
122
|
-------
|
|
125
123
|
file_object : FastqFile
|
|
@@ -134,31 +132,7 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
134
132
|
raise InvalidFileError("File is empty")
|
|
135
133
|
file._find_entries()
|
|
136
134
|
return file
|
|
137
|
-
|
|
138
|
-
def get_sequence(self, identifier):
|
|
139
|
-
"""
|
|
140
|
-
Get the sequence for the specified identifier.
|
|
141
|
-
|
|
142
|
-
DEPRECATED: Use :meth:`get_seq_string()` or
|
|
143
|
-
:func:`get_sequence()` instead.
|
|
144
135
|
|
|
145
|
-
Parameters
|
|
146
|
-
----------
|
|
147
|
-
identifier : str
|
|
148
|
-
The identifier of the sequence.
|
|
149
|
-
|
|
150
|
-
Returns
|
|
151
|
-
-------
|
|
152
|
-
sequence : NucleotideSequence
|
|
153
|
-
The sequence corresponding to the identifier.
|
|
154
|
-
"""
|
|
155
|
-
warnings.warn(
|
|
156
|
-
"'get_sequence()' is deprecated, use the 'get_seq_string()'"
|
|
157
|
-
"method or 'fasta.get_sequence()' function instead",
|
|
158
|
-
DeprecationWarning
|
|
159
|
-
)
|
|
160
|
-
return NucleotideSequence(self.get_seq_string(identifier))
|
|
161
|
-
|
|
162
136
|
def get_seq_string(self, identifier):
|
|
163
137
|
"""
|
|
164
138
|
Get the string representing the sequence for the specified
|
|
@@ -168,22 +142,19 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
168
142
|
----------
|
|
169
143
|
identifier : str
|
|
170
144
|
The identifier of the sequence.
|
|
171
|
-
|
|
145
|
+
|
|
172
146
|
Returns
|
|
173
147
|
-------
|
|
174
148
|
sequence : str
|
|
175
149
|
The sequence corresponding to the identifier.
|
|
176
150
|
"""
|
|
177
151
|
if not isinstance(identifier, str):
|
|
178
|
-
raise IndexError(
|
|
179
|
-
|
|
180
|
-
)
|
|
181
|
-
seq_start, seq_stop, score_start, score_stop \
|
|
182
|
-
= self._entries[identifier]
|
|
152
|
+
raise IndexError("'FastqFile' only supports identifier strings as keys")
|
|
153
|
+
seq_start, seq_stop, score_start, score_stop = self._entries[identifier]
|
|
183
154
|
# Concatenate sequence string from the sequence lines
|
|
184
|
-
seq_str = "".join(self.lines[seq_start
|
|
155
|
+
seq_str = "".join(self.lines[seq_start:seq_stop])
|
|
185
156
|
return seq_str
|
|
186
|
-
|
|
157
|
+
|
|
187
158
|
def get_quality(self, identifier):
|
|
188
159
|
"""
|
|
189
160
|
Get the quality scores for the specified identifier.
|
|
@@ -192,24 +163,20 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
192
163
|
----------
|
|
193
164
|
identifier : str
|
|
194
165
|
The identifier of the quality scores.
|
|
195
|
-
|
|
166
|
+
|
|
196
167
|
Returns
|
|
197
168
|
-------
|
|
198
169
|
scores : ndarray, dtype=int
|
|
199
170
|
The quality scores corresponding to the identifier.
|
|
200
171
|
"""
|
|
201
172
|
if not isinstance(identifier, str):
|
|
202
|
-
raise IndexError(
|
|
203
|
-
|
|
204
|
-
)
|
|
205
|
-
seq_start, seq_stop, score_start, score_stop \
|
|
206
|
-
= self._entries[identifier]
|
|
173
|
+
raise IndexError("'FastqFile' only supports identifier strings as keys")
|
|
174
|
+
seq_start, seq_stop, score_start, score_stop = self._entries[identifier]
|
|
207
175
|
# Concatenate sequence string from the score lines
|
|
208
176
|
return _score_str_to_scores(
|
|
209
|
-
"".join(self.lines[score_start
|
|
210
|
-
self._offset
|
|
177
|
+
"".join(self.lines[score_start:score_stop]), self._offset
|
|
211
178
|
)
|
|
212
|
-
|
|
179
|
+
|
|
213
180
|
def __setitem__(self, identifier, item):
|
|
214
181
|
sequence, scores = item
|
|
215
182
|
if len(sequence) != len(scores):
|
|
@@ -218,24 +185,22 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
218
185
|
f"but score length is {len(scores)}"
|
|
219
186
|
)
|
|
220
187
|
if not isinstance(identifier, str):
|
|
221
|
-
raise IndexError(
|
|
222
|
-
"'FastqFile' only supports strings as identifier"
|
|
223
|
-
)
|
|
188
|
+
raise IndexError("'FastqFile' only supports strings as identifier")
|
|
224
189
|
# Delete lines of entry corresponding to the identifier,
|
|
225
190
|
# if already existing
|
|
226
191
|
if identifier in self:
|
|
227
192
|
del self[identifier]
|
|
228
|
-
|
|
193
|
+
|
|
229
194
|
# Create new lines
|
|
230
195
|
# Start with identifier line
|
|
231
|
-
new_lines = ["@" + identifier.replace("\n","").strip()]
|
|
196
|
+
new_lines = ["@" + identifier.replace("\n", "").strip()]
|
|
232
197
|
# Append new lines with sequence string (with line breaks)
|
|
233
198
|
seq_start_i = len(new_lines)
|
|
234
199
|
if self._chars_per_line is None:
|
|
235
200
|
new_lines.append(str(sequence))
|
|
236
201
|
else:
|
|
237
202
|
new_lines += wrap_string(sequence, width=self._chars_per_line)
|
|
238
|
-
seq_stop_i =len(new_lines)
|
|
203
|
+
seq_stop_i = len(new_lines)
|
|
239
204
|
# Append sequence-score separator
|
|
240
205
|
new_lines += ["+"]
|
|
241
206
|
# Append scores
|
|
@@ -261,29 +226,28 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
261
226
|
len(self.lines) + seq_start_i,
|
|
262
227
|
len(self.lines) + seq_stop_i,
|
|
263
228
|
len(self.lines) + score_start_i,
|
|
264
|
-
len(self.lines) + score_stop_i
|
|
229
|
+
len(self.lines) + score_stop_i,
|
|
265
230
|
)
|
|
266
231
|
self.lines += new_lines
|
|
267
|
-
|
|
232
|
+
|
|
268
233
|
def __getitem__(self, identifier):
|
|
269
234
|
return self.get_seq_string(identifier), self.get_quality(identifier)
|
|
270
|
-
|
|
235
|
+
|
|
271
236
|
def __delitem__(self, identifier):
|
|
272
|
-
seq_start, seq_stop, score_start, score_stop
|
|
273
|
-
|
|
274
|
-
del self.lines[seq_start-1 : score_stop]
|
|
237
|
+
seq_start, seq_stop, score_start, score_stop = self._entries[identifier]
|
|
238
|
+
del self.lines[seq_start - 1 : score_stop]
|
|
275
239
|
del self._entries[identifier]
|
|
276
240
|
self._find_entries()
|
|
277
|
-
|
|
241
|
+
|
|
278
242
|
def __len__(self):
|
|
279
243
|
return len(self._entries)
|
|
280
|
-
|
|
244
|
+
|
|
281
245
|
def __iter__(self):
|
|
282
246
|
return self._entries.__iter__()
|
|
283
|
-
|
|
247
|
+
|
|
284
248
|
def __contains__(self, identifer):
|
|
285
249
|
return identifer in self._entries
|
|
286
|
-
|
|
250
|
+
|
|
287
251
|
def _find_entries(self):
|
|
288
252
|
self._entries = OrderedDict()
|
|
289
253
|
in_sequence = False
|
|
@@ -302,7 +266,7 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
302
266
|
if not in_scores and not in_sequence and line[0] == "@":
|
|
303
267
|
# Identifier line
|
|
304
268
|
identifier = line[1:]
|
|
305
|
-
seq_start_i = i+1
|
|
269
|
+
seq_start_i = i + 1
|
|
306
270
|
# Next line is sequence
|
|
307
271
|
in_sequence = True
|
|
308
272
|
# Reset
|
|
@@ -314,7 +278,7 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
314
278
|
in_sequence = False
|
|
315
279
|
in_scores = True
|
|
316
280
|
seq_stop_i = i
|
|
317
|
-
score_start_i = i+1
|
|
281
|
+
score_start_i = i + 1
|
|
318
282
|
else:
|
|
319
283
|
# Still in sequence
|
|
320
284
|
seq_len += len(line)
|
|
@@ -330,9 +294,12 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
330
294
|
in_scores = False
|
|
331
295
|
# Record this entry
|
|
332
296
|
self._entries[identifier] = (
|
|
333
|
-
seq_start_i,
|
|
297
|
+
seq_start_i,
|
|
298
|
+
seq_stop_i,
|
|
299
|
+
score_start_i,
|
|
300
|
+
score_stop_i,
|
|
334
301
|
)
|
|
335
|
-
else:
|
|
302
|
+
else: # score_len > seq_len
|
|
336
303
|
raise InvalidFileError(
|
|
337
304
|
f"The amount of scores is not equal to the sequence "
|
|
338
305
|
f"length for the sequence in line {seq_start_i+1} "
|
|
@@ -343,14 +310,13 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
343
310
|
# must have properly ended
|
|
344
311
|
if in_sequence or in_scores:
|
|
345
312
|
raise InvalidFileError("The last entry in the file is incomplete")
|
|
346
|
-
|
|
347
313
|
|
|
348
314
|
@staticmethod
|
|
349
315
|
def read_iter(file, offset):
|
|
350
316
|
"""
|
|
351
317
|
Create an iterator over each sequence (and corresponding scores)
|
|
352
318
|
of the given FASTQ file.
|
|
353
|
-
|
|
319
|
+
|
|
354
320
|
Parameters
|
|
355
321
|
----------
|
|
356
322
|
file : file-like object or str
|
|
@@ -361,7 +327,7 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
361
327
|
ASCII code.
|
|
362
328
|
Can either be directly the value, or a string that indicates
|
|
363
329
|
the score format.
|
|
364
|
-
|
|
330
|
+
|
|
365
331
|
Yields
|
|
366
332
|
------
|
|
367
333
|
identifier : str
|
|
@@ -369,7 +335,7 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
369
335
|
sequence : tuple(str, ndarray)
|
|
370
336
|
The current sequence as string and its corresponding quality
|
|
371
337
|
scores as :class:`ndarray`.
|
|
372
|
-
|
|
338
|
+
|
|
373
339
|
Notes
|
|
374
340
|
-----
|
|
375
341
|
This approach gives the same results as
|
|
@@ -377,7 +343,7 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
377
343
|
and much more memory efficient.
|
|
378
344
|
"""
|
|
379
345
|
offset = _convert_offset(offset)
|
|
380
|
-
|
|
346
|
+
|
|
381
347
|
identifier = None
|
|
382
348
|
seq_str_list = []
|
|
383
349
|
score_str_list = []
|
|
@@ -391,7 +357,7 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
391
357
|
# Ignore empty lines
|
|
392
358
|
if len(line) == 0:
|
|
393
359
|
continue
|
|
394
|
-
|
|
360
|
+
|
|
395
361
|
if not in_scores and not in_sequence and line[0] == "@":
|
|
396
362
|
# Track new entry
|
|
397
363
|
identifier = line[1:]
|
|
@@ -401,7 +367,7 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
401
367
|
score_len = 0
|
|
402
368
|
seq_str_list = []
|
|
403
369
|
score_str_list = []
|
|
404
|
-
|
|
370
|
+
|
|
405
371
|
elif in_sequence:
|
|
406
372
|
if line[0] == "+":
|
|
407
373
|
# End of sequence start of scores
|
|
@@ -411,7 +377,7 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
411
377
|
# Still in sequence
|
|
412
378
|
seq_len += len(line)
|
|
413
379
|
seq_str_list.append(line)
|
|
414
|
-
|
|
380
|
+
|
|
415
381
|
elif in_scores:
|
|
416
382
|
score_len += len(line)
|
|
417
383
|
score_str_list.append(line)
|
|
@@ -422,20 +388,15 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
422
388
|
# -> End of entry
|
|
423
389
|
in_scores = False
|
|
424
390
|
# yield this entry
|
|
425
|
-
scores = _score_str_to_scores(
|
|
426
|
-
"".join(score_str_list),
|
|
427
|
-
offset
|
|
428
|
-
)
|
|
391
|
+
scores = _score_str_to_scores("".join(score_str_list), offset)
|
|
429
392
|
yield identifier, ("".join(seq_str_list), scores)
|
|
430
|
-
else:
|
|
393
|
+
else: # score_len > seq_len
|
|
431
394
|
raise InvalidFileError(
|
|
432
|
-
|
|
433
|
-
f"length"
|
|
395
|
+
"The amount of scores is not equal to the sequence " "length"
|
|
434
396
|
)
|
|
435
|
-
|
|
397
|
+
|
|
436
398
|
else:
|
|
437
|
-
raise InvalidFileError(
|
|
438
|
-
|
|
399
|
+
raise InvalidFileError("FASTQ file is invalid")
|
|
439
400
|
|
|
440
401
|
@staticmethod
|
|
441
402
|
def write_iter(file, items, offset, chars_per_line=None):
|
|
@@ -449,7 +410,7 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
449
410
|
Hence, this static method may save a large amount of memory if
|
|
450
411
|
a large file should be written, especially if the `items`
|
|
451
412
|
are provided as generator.
|
|
452
|
-
|
|
413
|
+
|
|
453
414
|
Parameters
|
|
454
415
|
----------
|
|
455
416
|
file : file-like object or str
|
|
@@ -487,12 +448,10 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
487
448
|
f"but score length is {len(scores)}"
|
|
488
449
|
)
|
|
489
450
|
if not isinstance(identifier, str):
|
|
490
|
-
raise IndexError(
|
|
491
|
-
|
|
492
|
-
)
|
|
493
|
-
|
|
451
|
+
raise IndexError("'FastqFile' only supports strings as identifier")
|
|
452
|
+
|
|
494
453
|
# Yield identifier line
|
|
495
|
-
yield "@" + identifier.replace("\n","").strip()
|
|
454
|
+
yield "@" + identifier.replace("\n", "").strip()
|
|
496
455
|
|
|
497
456
|
# Yield sequence line(s)
|
|
498
457
|
if chars_per_line is None:
|
|
@@ -500,10 +459,10 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
500
459
|
else:
|
|
501
460
|
for line in wrap_string(sequence, width=chars_per_line):
|
|
502
461
|
yield line
|
|
503
|
-
|
|
462
|
+
|
|
504
463
|
# Yield separator
|
|
505
464
|
yield "+"
|
|
506
|
-
|
|
465
|
+
|
|
507
466
|
# Yield scores
|
|
508
467
|
score_chars = _scores_to_score_str(scores, offset)
|
|
509
468
|
if chars_per_line is None:
|
|
@@ -511,7 +470,7 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
511
470
|
else:
|
|
512
471
|
for line in wrap_string(score_chars, width=chars_per_line):
|
|
513
472
|
yield line
|
|
514
|
-
|
|
473
|
+
|
|
515
474
|
TextFile.write_iter(file, line_generator())
|
|
516
475
|
|
|
517
476
|
|
|
@@ -519,15 +478,11 @@ def _score_str_to_scores(score_str, offset):
|
|
|
519
478
|
"""
|
|
520
479
|
Convert an ASCII string into actual score values.
|
|
521
480
|
"""
|
|
522
|
-
scores = np.frombuffer(
|
|
523
|
-
bytearray(
|
|
524
|
-
score_str, encoding="ascii"
|
|
525
|
-
),
|
|
526
|
-
dtype=np.int8
|
|
527
|
-
)
|
|
481
|
+
scores = np.frombuffer(bytearray(score_str, encoding="ascii"), dtype=np.int8)
|
|
528
482
|
scores -= offset
|
|
529
483
|
return scores
|
|
530
484
|
|
|
485
|
+
|
|
531
486
|
def _scores_to_score_str(scores, offset):
|
|
532
487
|
"""
|
|
533
488
|
Convert score values into an ASCII string.
|
|
@@ -535,6 +490,7 @@ def _scores_to_score_str(scores, offset):
|
|
|
535
490
|
scores = np.asarray(scores) + offset
|
|
536
491
|
return scores.astype(np.int8, copy=False).tobytes().decode("ascii")
|
|
537
492
|
|
|
493
|
+
|
|
538
494
|
def _convert_offset(offset_val_or_string):
|
|
539
495
|
"""
|
|
540
496
|
If the given offset is a string return the corresponding numerical
|
|
@@ -543,9 +499,9 @@ def _convert_offset(offset_val_or_string):
|
|
|
543
499
|
if isinstance(offset_val_or_string, Integral):
|
|
544
500
|
return offset_val_or_string
|
|
545
501
|
elif isinstance(offset_val_or_string, str):
|
|
546
|
-
|
|
502
|
+
return _OFFSETS[offset_val_or_string]
|
|
547
503
|
else:
|
|
548
504
|
raise TypeError(
|
|
549
505
|
f"The offset must be either an integer or a string "
|
|
550
506
|
f"indicating the format, not {type(offset_val_or_string).__name__}"
|
|
551
|
-
)
|
|
507
|
+
)
|
|
@@ -11,7 +11,7 @@ and *GenPept* format.
|
|
|
11
11
|
__name__ = "biotite.sequence.io.genbank"
|
|
12
12
|
__author__ = "Patrick Kunzmann"
|
|
13
13
|
|
|
14
|
-
from .file import *
|
|
15
14
|
from .annotation import *
|
|
15
|
+
from .file import *
|
|
16
|
+
from .metadata import *
|
|
16
17
|
from .sequence import *
|
|
17
|
-
from .metadata import *
|
|
@@ -12,10 +12,8 @@ __all__ = ["get_annotation", "set_annotation"]
|
|
|
12
12
|
|
|
13
13
|
import re
|
|
14
14
|
import warnings
|
|
15
|
-
from
|
|
16
|
-
from
|
|
17
|
-
from .file import GenBankFile
|
|
18
|
-
|
|
15
|
+
from biotite.file import InvalidFileError
|
|
16
|
+
from biotite.sequence.annotation import Annotation, Feature, Location
|
|
19
17
|
|
|
20
18
|
_KEY_START = 5
|
|
21
19
|
_QUAL_START = 21
|
|
@@ -46,7 +44,6 @@ def get_annotation(gb_file, include_only=None):
|
|
|
46
44
|
raise InvalidFileError("File has multiple 'FEATURES' fields")
|
|
47
45
|
lines, _ = fields[0]
|
|
48
46
|
|
|
49
|
-
|
|
50
47
|
### Parse all lines to create an index of features,
|
|
51
48
|
# i.e. pairs of the feature key
|
|
52
49
|
# and the text belonging to the respective feature
|
|
@@ -60,13 +57,12 @@ def get_annotation(gb_file, include_only=None):
|
|
|
60
57
|
# Store old feature key and value
|
|
61
58
|
feature_list.append((feature_key, feature_value))
|
|
62
59
|
# Track new key
|
|
63
|
-
feature_key = line[_KEY_START : _QUAL_START-1].strip()
|
|
60
|
+
feature_key = line[_KEY_START : _QUAL_START - 1].strip()
|
|
64
61
|
feature_value = ""
|
|
65
62
|
feature_value += line[_QUAL_START:] + " "
|
|
66
63
|
# Store last feature key and value (loop already exited)
|
|
67
64
|
feature_list.append((feature_key, feature_value))
|
|
68
65
|
|
|
69
|
-
|
|
70
66
|
### Process only relevant features and put them into an Annotation
|
|
71
67
|
annotation = Annotation()
|
|
72
68
|
# Regex to separate qualifiers from each other
|
|
@@ -92,7 +88,7 @@ def get_annotation(gb_file, include_only=None):
|
|
|
92
88
|
loc_string = qualifier_parts.pop(0).strip()
|
|
93
89
|
try:
|
|
94
90
|
locs = _parse_locs(loc_string)
|
|
95
|
-
except:
|
|
91
|
+
except Exception:
|
|
96
92
|
warnings.warn(
|
|
97
93
|
f"'{loc_string}' is an unsupported location identifier, "
|
|
98
94
|
f"skipping feature"
|
|
@@ -114,7 +110,7 @@ def get_annotation(gb_file, include_only=None):
|
|
|
114
110
|
# -> split at whitespaces,
|
|
115
111
|
# as keys do not contain whitespaces
|
|
116
112
|
for subpart in part.split():
|
|
117
|
-
if
|
|
113
|
+
if "=" not in subpart:
|
|
118
114
|
# Qualifier without value, e.g. '/pseudo'
|
|
119
115
|
# -> store immediately
|
|
120
116
|
# Remove "/" -> subpart[1:]
|
|
@@ -147,11 +143,11 @@ def get_annotation(gb_file, include_only=None):
|
|
|
147
143
|
def _parse_locs(loc_str):
|
|
148
144
|
locs = []
|
|
149
145
|
if loc_str.startswith(("join", "order")):
|
|
150
|
-
str_list = loc_str[loc_str.index("(")+1:loc_str.rindex(")")].split(",")
|
|
146
|
+
str_list = loc_str[loc_str.index("(") + 1 : loc_str.rindex(")")].split(",")
|
|
151
147
|
for s in str_list:
|
|
152
148
|
locs.extend(_parse_locs(s.strip()))
|
|
153
149
|
elif loc_str.startswith("complement"):
|
|
154
|
-
compl_str = loc_str[loc_str.index("(")+1:loc_str.rindex(")")]
|
|
150
|
+
compl_str = loc_str[loc_str.index("(") + 1 : loc_str.rindex(")")]
|
|
155
151
|
compl_locs = [
|
|
156
152
|
Location(loc.first, loc.last, Location.Strand.REVERSE, loc.defect)
|
|
157
153
|
for loc in _parse_locs(compl_str)
|
|
@@ -214,8 +210,6 @@ def _set_qual(qual_dict, key, val):
|
|
|
214
210
|
qual_dict[key] = val
|
|
215
211
|
|
|
216
212
|
|
|
217
|
-
|
|
218
|
-
|
|
219
213
|
def set_annotation(gb_file, annotation):
|
|
220
214
|
"""
|
|
221
215
|
Set the *FEATURES* field of a GenBank file with an annotation.
|
|
@@ -236,12 +230,12 @@ def set_annotation(gb_file, annotation):
|
|
|
236
230
|
for key, values in feature.qual.items():
|
|
237
231
|
if values is None:
|
|
238
232
|
line = " " * _QUAL_START
|
|
239
|
-
line +=
|
|
233
|
+
line += f"/{key}"
|
|
240
234
|
lines.append(line)
|
|
241
235
|
else:
|
|
242
236
|
for val in values.split("\n"):
|
|
243
237
|
line = " " * _QUAL_START
|
|
244
|
-
line +=
|
|
238
|
+
line += f'/{key}="{val}"'
|
|
245
239
|
lines.append(line)
|
|
246
240
|
gb_file.set_field("FEATURES", lines)
|
|
247
241
|
|
|
@@ -254,11 +248,11 @@ def _convert_to_loc_string(locs):
|
|
|
254
248
|
if len(locs) == 1:
|
|
255
249
|
loc = list(locs)[0]
|
|
256
250
|
loc_first_str = str(loc.first)
|
|
257
|
-
loc_last_str
|
|
251
|
+
loc_last_str = str(loc.last)
|
|
258
252
|
if loc.defect & Location.Defect.BEYOND_LEFT:
|
|
259
253
|
loc_first_str = "<" + loc_first_str
|
|
260
254
|
if loc.defect & Location.Defect.BEYOND_RIGHT:
|
|
261
|
-
loc_last_str
|
|
255
|
+
loc_last_str = ">" + loc_last_str
|
|
262
256
|
if loc.first == loc.last:
|
|
263
257
|
loc_string = loc_first_str
|
|
264
258
|
elif loc.defect & Location.Defect.UNK_LOC:
|
|
@@ -270,8 +264,6 @@ def _convert_to_loc_string(locs):
|
|
|
270
264
|
if loc.strand == Location.Strand.REVERSE:
|
|
271
265
|
loc_string = f"complement({loc_string})"
|
|
272
266
|
else:
|
|
273
|
-
loc_string = ",".join(
|
|
274
|
-
[_convert_to_loc_string([loc]) for loc in locs]
|
|
275
|
-
)
|
|
267
|
+
loc_string = ",".join([_convert_to_loc_string([loc]) for loc in locs])
|
|
276
268
|
loc_string = f"join({loc_string})"
|
|
277
269
|
return loc_string
|