biotite 0.38.0__cp311-cp311-win_amd64.whl → 0.40.0__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +3 -3
- biotite/application/application.py +33 -28
- biotite/application/dssp/app.py +18 -18
- biotite/application/sra/__init__.py +5 -0
- biotite/application/sra/app.py +337 -55
- biotite/database/entrez/__init__.py +2 -1
- biotite/database/entrez/check.py +14 -3
- biotite/database/entrez/download.py +20 -13
- biotite/database/entrez/key.py +44 -0
- biotite/database/entrez/query.py +38 -34
- biotite/database/pubchem/query.py +44 -44
- biotite/database/rcsb/download.py +19 -14
- biotite/database/rcsb/query.py +46 -46
- biotite/sequence/align/__init__.py +5 -1
- biotite/sequence/align/banded.c +1408 -1025
- biotite/sequence/align/banded.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/buckets.py +69 -0
- biotite/sequence/align/cigar.py +389 -0
- biotite/sequence/align/kmeralphabet.c +3220 -2850
- biotite/sequence/align/kmeralphabet.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmersimilarity.c +713 -663
- biotite/sequence/align/kmersimilarity.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.cpp +68398 -0
- biotite/sequence/align/localgapped.c +1507 -1074
- biotite/sequence/align/localgapped.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/localungapped.c +1143 -833
- biotite/sequence/align/localungapped.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/multiple.c +1569 -1092
- biotite/sequence/align/multiple.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/pairwise.c +1612 -1212
- biotite/sequence/align/pairwise.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.c +33259 -0
- biotite/sequence/align/permutation.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/primes.txt +821 -0
- biotite/sequence/align/{kmertable.c → selector.c} +9129 -16497
- biotite/sequence/align/selector.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/tracetable.c +685 -646
- biotite/sequence/align/tracetable.cp311-win_amd64.pyd +0 -0
- biotite/sequence/codec.c +1159 -841
- biotite/sequence/codec.cp311-win_amd64.pyd +0 -0
- biotite/sequence/graphics/alignment.py +212 -2
- biotite/sequence/io/genbank/annotation.py +11 -11
- biotite/sequence/phylo/nj.c +684 -636
- biotite/sequence/phylo/nj.cp311-win_amd64.pyd +0 -0
- biotite/sequence/phylo/tree.c +970 -673
- biotite/sequence/phylo/tree.cp311-win_amd64.pyd +0 -0
- biotite/sequence/phylo/upgma.c +672 -626
- biotite/sequence/phylo/upgma.cp311-win_amd64.pyd +0 -0
- biotite/structure/__init__.py +1 -1
- biotite/structure/atoms.py +1 -1
- biotite/structure/basepairs.py +7 -12
- biotite/structure/bonds.c +3861 -3749
- biotite/structure/bonds.cp311-win_amd64.pyd +0 -0
- biotite/structure/celllist.c +727 -707
- biotite/structure/celllist.cp311-win_amd64.pyd +0 -0
- biotite/structure/charges.c +1561 -1560
- biotite/structure/charges.cp311-win_amd64.pyd +0 -0
- biotite/structure/filter.py +30 -37
- biotite/structure/info/__init__.py +5 -8
- biotite/structure/info/atoms.py +25 -67
- biotite/structure/info/bonds.py +46 -100
- biotite/structure/info/ccd/README.rst +8 -0
- biotite/structure/info/ccd/amino_acids.txt +1646 -0
- biotite/structure/info/ccd/carbohydrates.txt +1133 -0
- biotite/structure/info/ccd/components.bcif +0 -0
- biotite/structure/info/ccd/nucleotides.txt +797 -0
- biotite/structure/info/ccd.py +95 -0
- biotite/structure/info/groups.py +90 -0
- biotite/structure/info/masses.py +21 -20
- biotite/structure/info/misc.py +11 -22
- biotite/structure/info/standardize.py +17 -12
- biotite/structure/io/__init__.py +2 -4
- biotite/structure/io/ctab.py +1 -1
- biotite/structure/io/general.py +37 -43
- biotite/structure/io/mmtf/__init__.py +3 -0
- biotite/structure/io/mmtf/convertarray.c +528 -365
- biotite/structure/io/mmtf/convertarray.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/convertfile.c +725 -676
- biotite/structure/io/mmtf/convertfile.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/decode.c +1070 -754
- biotite/structure/io/mmtf/decode.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/encode.c +727 -677
- biotite/structure/io/mmtf/encode.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/file.py +34 -26
- biotite/structure/io/npz/__init__.py +3 -0
- biotite/structure/io/npz/file.py +21 -18
- biotite/structure/io/pdb/__init__.py +3 -3
- biotite/structure/io/pdb/file.py +72 -70
- biotite/structure/io/pdb/hybrid36.c +540 -478
- biotite/structure/io/pdb/hybrid36.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/pdbqt/file.py +82 -68
- biotite/structure/io/pdbx/__init__.py +13 -6
- biotite/structure/io/pdbx/bcif.py +649 -0
- biotite/structure/io/pdbx/cif.py +1028 -0
- biotite/structure/io/pdbx/component.py +243 -0
- biotite/structure/io/pdbx/convert.py +707 -359
- biotite/structure/io/pdbx/encoding.c +112813 -0
- biotite/structure/io/pdbx/encoding.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/pdbx/error.py +14 -0
- biotite/structure/io/pdbx/legacy.py +267 -0
- biotite/structure/molecules.py +151 -151
- biotite/structure/residues.py +40 -40
- biotite/structure/sasa.c +713 -644
- biotite/structure/sasa.cp311-win_amd64.pyd +0 -0
- biotite/structure/superimpose.py +158 -115
- biotite/visualize.py +9 -11
- {biotite-0.38.0.dist-info → biotite-0.40.0.dist-info}/METADATA +2 -2
- {biotite-0.38.0.dist-info → biotite-0.40.0.dist-info}/RECORD +112 -102
- {biotite-0.38.0.dist-info → biotite-0.40.0.dist-info}/WHEEL +1 -1
- biotite/structure/info/amino_acids.json +0 -1556
- biotite/structure/info/amino_acids.py +0 -42
- biotite/structure/info/carbohydrates.json +0 -1122
- biotite/structure/info/carbohydrates.py +0 -39
- biotite/structure/info/intra_bonds.msgpack +0 -0
- biotite/structure/info/link_types.msgpack +0 -1
- biotite/structure/info/nucleotides.json +0 -772
- biotite/structure/info/nucleotides.py +0 -39
- biotite/structure/info/residue_masses.msgpack +0 -0
- biotite/structure/info/residue_names.msgpack +0 -3
- biotite/structure/info/residues.msgpack +0 -0
- biotite/structure/io/pdbx/file.py +0 -652
- {biotite-0.38.0.dist-info → biotite-0.40.0.dist-info}/LICENSE.rst +0 -0
- {biotite-0.38.0.dist-info → biotite-0.40.0.dist-info}/top_level.txt +0 -0
biotite/application/sra/app.py
CHANGED
|
@@ -4,19 +4,26 @@
|
|
|
4
4
|
|
|
5
5
|
__name__ = "biotite.application.sra"
|
|
6
6
|
__author__ = "Patrick Kunzmann"
|
|
7
|
-
__all__ = ["FastqDumpApp"]
|
|
7
|
+
__all__ = ["FastaDumpApp", "FastqDumpApp"]
|
|
8
8
|
|
|
9
|
+
import abc
|
|
10
|
+
from os.path import join
|
|
11
|
+
from subprocess import Popen, SubprocessError, PIPE, TimeoutExpired
|
|
9
12
|
import glob
|
|
10
|
-
from tempfile import
|
|
11
|
-
from ..
|
|
12
|
-
|
|
13
|
+
from tempfile import TemporaryDirectory
|
|
14
|
+
from ..application import Application, AppState, AppStateError, \
|
|
15
|
+
requires_state
|
|
16
|
+
from ...sequence.seqtypes import NucleotideSequence
|
|
13
17
|
from ...sequence.io.fastq.file import FastqFile
|
|
14
|
-
from ...sequence.io.
|
|
18
|
+
from ...sequence.io.fasta.file import FastaFile
|
|
19
|
+
from ...sequence.io.fastq.convert import get_sequences as get_sequences_and_scores
|
|
20
|
+
from ...sequence.io.fasta.convert import get_sequences
|
|
15
21
|
|
|
16
22
|
|
|
17
|
-
|
|
23
|
+
# Do not use LocalApp, as two programs are executed
|
|
24
|
+
class _DumpApp(Application, metaclass=abc.ABCMeta):
|
|
18
25
|
"""
|
|
19
|
-
Fetch sequencing data
|
|
26
|
+
Fetch sequencing data from the *NCBI sequence read archive*
|
|
20
27
|
(SRA) using *sra-tools*.
|
|
21
28
|
|
|
22
29
|
Parameters
|
|
@@ -31,85 +38,212 @@ class FastqDumpApp(LocalApp):
|
|
|
31
38
|
multiple reads per spot.
|
|
32
39
|
By default, the files are created in a temporary directory and
|
|
33
40
|
deleted after the files have been read.
|
|
34
|
-
|
|
35
|
-
Path to the ``fasterq-dump`` binary
|
|
41
|
+
prefetch_path, fasterq_dump_path : str, optional
|
|
42
|
+
Path to the ``prefetch`` and ``fasterq-dump`` binary,
|
|
43
|
+
respectively.
|
|
36
44
|
offset : int or {'Sanger', 'Solexa', 'Illumina-1.3', 'Illumina-1.5', 'Illumina-1.8'}, optional
|
|
37
45
|
This value is subtracted from the FASTQ ASCII code to obtain the
|
|
38
46
|
quality score.
|
|
39
47
|
Can either be directly the value, or a string that indicates
|
|
40
48
|
the score format.
|
|
41
49
|
"""
|
|
42
|
-
|
|
43
|
-
def __init__(self, uid, output_path_prefix=None,
|
|
44
|
-
|
|
45
|
-
super().__init__(
|
|
50
|
+
|
|
51
|
+
def __init__(self, uid, output_path_prefix=None,
|
|
52
|
+
prefetch_path="prefetch", fasterq_dump_path="fasterq-dump"):
|
|
53
|
+
super().__init__()
|
|
54
|
+
self._prefetch_path = prefetch_path
|
|
55
|
+
self._fasterq_dump_path = fasterq_dump_path
|
|
46
56
|
self._uid = uid
|
|
47
|
-
self.
|
|
57
|
+
self._sra_dir = TemporaryDirectory(suffix="_sra")
|
|
48
58
|
if output_path_prefix is None:
|
|
49
|
-
|
|
50
|
-
# for FASTQ files
|
|
51
|
-
self._out_file = NamedTemporaryFile("r")
|
|
52
|
-
self._prefix = self._out_file.name
|
|
59
|
+
self._prefix = join(self._sra_dir.name, self._uid)
|
|
53
60
|
else:
|
|
54
|
-
self._out_file = None
|
|
55
61
|
self._prefix = output_path_prefix
|
|
62
|
+
self._prefetch_process = None
|
|
63
|
+
self._fasterq_dump_process = None
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@requires_state(AppState.RUNNING | AppState.FINISHED)
|
|
67
|
+
def join(self, timeout=None):
|
|
68
|
+
# Override method as repetitive calls of 'is_finished()'
|
|
69
|
+
# are not necessary as 'communicate()' already waits for the
|
|
70
|
+
# finished application
|
|
71
|
+
try:
|
|
72
|
+
_, self._stderr = self._process.communicate(
|
|
73
|
+
timeout=timeout
|
|
74
|
+
)
|
|
75
|
+
except TimeoutExpired:
|
|
76
|
+
self.cancel()
|
|
77
|
+
raise TimeoutError(
|
|
78
|
+
f"The application expired its timeout ({timeout:.1f} s)"
|
|
79
|
+
)
|
|
80
|
+
self._state = AppState.FINISHED
|
|
81
|
+
|
|
82
|
+
try:
|
|
83
|
+
self.evaluate()
|
|
84
|
+
except AppStateError:
|
|
85
|
+
raise
|
|
86
|
+
except:
|
|
87
|
+
self._state = AppState.CANCELLED
|
|
88
|
+
raise
|
|
89
|
+
else:
|
|
90
|
+
self._state = AppState.JOINED
|
|
91
|
+
self.clean_up()
|
|
92
|
+
|
|
56
93
|
|
|
57
94
|
def run(self):
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
self.
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
95
|
+
# Prefetch into a temp directory with file name equaling UID
|
|
96
|
+
# This ensures that the ID in the header is not the temp prefix
|
|
97
|
+
sra_file_name = join(self._sra_dir.name, self._uid)
|
|
98
|
+
command = (
|
|
99
|
+
f"{self._prefetch_path} -q -O {self._sra_dir.name} "
|
|
100
|
+
f"{self.get_prefetch_options()} {self._uid}; "
|
|
101
|
+
f"{self._fasterq_dump_path} -q -o {self._prefix}.fastq "
|
|
102
|
+
f"{self.get_fastq_dump_options()} {sra_file_name}"
|
|
103
|
+
)
|
|
104
|
+
self._process = Popen(
|
|
105
|
+
command, stdout=PIPE, stderr=PIPE, shell=True, encoding="UTF-8"
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def is_finished(self):
|
|
110
|
+
code = self._process.poll()
|
|
111
|
+
if code == None:
|
|
112
|
+
return False
|
|
113
|
+
else:
|
|
114
|
+
_, self._stderr = self._process.communicate()
|
|
115
|
+
return True
|
|
116
|
+
|
|
117
|
+
|
|
66
118
|
def evaluate(self):
|
|
67
119
|
super().evaluate()
|
|
120
|
+
# Check if application terminated correctly
|
|
121
|
+
exit_code = self._process.returncode
|
|
122
|
+
if exit_code != 0:
|
|
123
|
+
err_msg = self._stderr.replace("\n", " ")
|
|
124
|
+
raise SubprocessError(
|
|
125
|
+
f"'prefetch' or 'fasterq-dump' returned with exit code "
|
|
126
|
+
f"{exit_code}: {err_msg}"
|
|
127
|
+
)
|
|
128
|
+
|
|
68
129
|
self._file_names = (
|
|
69
130
|
# For entries with one read per spot
|
|
70
|
-
glob.glob(self._prefix + ".fastq") +
|
|
131
|
+
glob.glob(self._prefix + ".fastq") +
|
|
71
132
|
# For entries with multiple reads per spot
|
|
72
133
|
glob.glob(self._prefix + "_*.fastq")
|
|
73
134
|
)
|
|
74
135
|
# Only load FASTQ files into memory when needed
|
|
75
136
|
self._fastq_files = None
|
|
76
|
-
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def wait_interval(self):
|
|
140
|
+
# Not used in this implementation of 'join()'
|
|
141
|
+
raise NotImplementedError()
|
|
142
|
+
|
|
143
|
+
|
|
77
144
|
def clean_up(self):
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
145
|
+
if self.get_app_state() == AppState.CANCELLED:
|
|
146
|
+
self._process.kill()
|
|
147
|
+
# Directory with temp files does not need to be deleted,
|
|
148
|
+
# as temp dir is automatically deleted upon object destruction
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
@requires_state(AppState.CREATED)
|
|
152
|
+
def get_prefetch_options(self):
|
|
153
|
+
"""
|
|
154
|
+
Get additional options for the `prefetch` call.
|
|
155
|
+
|
|
156
|
+
PROTECTED: Override when inheriting.
|
|
157
|
+
|
|
158
|
+
Returns
|
|
159
|
+
-------
|
|
160
|
+
options: str
|
|
161
|
+
The additional options.
|
|
162
|
+
"""
|
|
163
|
+
return ""
|
|
164
|
+
|
|
165
|
+
@requires_state(AppState.CREATED)
|
|
166
|
+
def get_fastq_dump_options(self):
|
|
167
|
+
"""
|
|
168
|
+
Get additional options for the `fasterq-dump` call.
|
|
169
|
+
|
|
170
|
+
PROTECTED: Override when inheriting.
|
|
171
|
+
|
|
172
|
+
Returns
|
|
173
|
+
-------
|
|
174
|
+
options: str
|
|
175
|
+
The additional options.
|
|
176
|
+
"""
|
|
177
|
+
return ""
|
|
178
|
+
|
|
179
|
+
|
|
84
180
|
@requires_state(AppState.JOINED)
|
|
85
181
|
def get_file_paths(self):
|
|
86
182
|
"""
|
|
87
|
-
Get the file paths to the downloaded
|
|
88
|
-
|
|
183
|
+
Get the file paths to the downloaded files.
|
|
184
|
+
|
|
89
185
|
Returns
|
|
90
186
|
-------
|
|
91
187
|
paths : list of str
|
|
92
188
|
The file paths to the downloaded files.
|
|
93
189
|
"""
|
|
94
190
|
return self._file_names
|
|
95
|
-
|
|
191
|
+
|
|
192
|
+
|
|
96
193
|
@requires_state(AppState.JOINED)
|
|
194
|
+
@abc.abstractmethod
|
|
97
195
|
def get_sequences(self):
|
|
98
196
|
"""
|
|
99
|
-
Get the sequences
|
|
100
|
-
|
|
197
|
+
Get the sequences from the downloaded file(s).
|
|
198
|
+
|
|
101
199
|
Returns
|
|
102
200
|
-------
|
|
103
|
-
|
|
201
|
+
sequences : list of dict (str -> NucleotideSequence)
|
|
104
202
|
This list contains the reads for each spot:
|
|
105
203
|
The first item contains the first read for each spot, the
|
|
106
204
|
second item contains the second read for each spot (if existing),
|
|
107
205
|
etc.
|
|
108
206
|
Each item in the list is a dictionary mapping identifiers to its
|
|
109
|
-
corresponding sequence
|
|
207
|
+
corresponding sequence.
|
|
110
208
|
"""
|
|
111
|
-
|
|
112
|
-
|
|
209
|
+
pass
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
class FastqDumpApp(_DumpApp):
|
|
213
|
+
"""
|
|
214
|
+
Fetch sequencing data from the *NCBI sequence read archive*
|
|
215
|
+
(SRA) using *sra-tools*.
|
|
216
|
+
|
|
217
|
+
Parameters
|
|
218
|
+
----------
|
|
219
|
+
uid : str
|
|
220
|
+
A *unique identifier* (UID) of the file to be downloaded.
|
|
221
|
+
output_path_prefix : str, optional
|
|
222
|
+
The prefix of the path to store the downloaded FASTQ file.
|
|
223
|
+
``.fastq`` is appended to this prefix if the run contains
|
|
224
|
+
a single read per spot.
|
|
225
|
+
``_1.fastq``, ``_2.fastq``, etc. is appended if it contains
|
|
226
|
+
multiple reads per spot.
|
|
227
|
+
By default, the files are created in a temporary directory and
|
|
228
|
+
deleted after the files have been read.
|
|
229
|
+
prefetch_path, fasterq_dump_path : str, optional
|
|
230
|
+
Path to the ``prefetch`` and ``fasterq-dump`` binary,
|
|
231
|
+
respectively.
|
|
232
|
+
offset : int or {'Sanger', 'Solexa', 'Illumina-1.3', 'Illumina-1.5', 'Illumina-1.8'}, optional
|
|
233
|
+
This value is subtracted from the FASTQ ASCII code to obtain the
|
|
234
|
+
quality score.
|
|
235
|
+
Can either be directly the value, or a string that indicates
|
|
236
|
+
the score format.
|
|
237
|
+
"""
|
|
238
|
+
|
|
239
|
+
def __init__(self, uid, output_path_prefix=None, prefetch_path="prefetch",
|
|
240
|
+
fasterq_dump_path="fasterq-dump", offset="Sanger"):
|
|
241
|
+
super().__init__(
|
|
242
|
+
uid, output_path_prefix, prefetch_path, fasterq_dump_path
|
|
243
|
+
)
|
|
244
|
+
self._offset = offset
|
|
245
|
+
self._fastq_files = None
|
|
246
|
+
|
|
113
247
|
|
|
114
248
|
@requires_state(AppState.JOINED)
|
|
115
249
|
def get_fastq(self):
|
|
@@ -130,12 +264,47 @@ class FastqDumpApp(LocalApp):
|
|
|
130
264
|
for file_name in self.get_file_paths()
|
|
131
265
|
]
|
|
132
266
|
return self._fastq_files
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
@requires_state(AppState.JOINED)
|
|
270
|
+
def get_sequences(self):
|
|
271
|
+
return [
|
|
272
|
+
{
|
|
273
|
+
header: NucleotideSequence(
|
|
274
|
+
seq_str.replace("U","T").replace("X","N")
|
|
275
|
+
)
|
|
276
|
+
for header, (seq_str, _) in fastq_file.items()
|
|
277
|
+
}
|
|
278
|
+
for fastq_file in self.get_fastq()
|
|
279
|
+
]
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
@requires_state(AppState.JOINED)
|
|
283
|
+
def get_sequences_and_scores(self):
|
|
284
|
+
"""
|
|
285
|
+
Get the sequences and score values from the downloaded file(s).
|
|
286
|
+
|
|
287
|
+
Returns
|
|
288
|
+
-------
|
|
289
|
+
sequences_and_scores : list of dict (str -> (NucleotideSequence, ndarray))
|
|
290
|
+
This list contains the reads for each spot:
|
|
291
|
+
The first item contains the first read for each spot, the
|
|
292
|
+
second item contains the second read for each spot (if existing),
|
|
293
|
+
etc.
|
|
294
|
+
Each item in the list is a dictionary mapping identifiers to its
|
|
295
|
+
corresponding sequence and score values.
|
|
137
296
|
"""
|
|
138
|
-
|
|
297
|
+
return [
|
|
298
|
+
get_sequences_and_scores(fastq_file)
|
|
299
|
+
for fastq_file in self.get_fastq()
|
|
300
|
+
]
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
@classmethod
|
|
304
|
+
def fetch(cls, uid, output_path_prefix=None, prefetch_path="prefetch",
|
|
305
|
+
fasterq_dump_path="fasterq-dump", offset="Sanger"):
|
|
306
|
+
"""
|
|
307
|
+
Get the sequences belonging to the UID from the
|
|
139
308
|
*NCBI sequence read archive* (SRA).
|
|
140
309
|
|
|
141
310
|
Parameters
|
|
@@ -150,25 +319,138 @@ class FastqDumpApp(LocalApp):
|
|
|
150
319
|
multiple reads per spot.
|
|
151
320
|
By default, the files are created in a temporary directory and
|
|
152
321
|
deleted after the files have been read.
|
|
153
|
-
|
|
154
|
-
Path to the ``fasterq-dump`` binary
|
|
322
|
+
prefetch_path, fasterq_dump_path : str, optional
|
|
323
|
+
Path to the ``prefetch`` and ``fasterq-dump`` binary,
|
|
324
|
+
respectively.
|
|
155
325
|
offset : int or {'Sanger', 'Solexa', 'Illumina-1.3', 'Illumina-1.5', 'Illumina-1.8'}, optional
|
|
156
326
|
This value is subtracted from the FASTQ ASCII code to obtain the
|
|
157
327
|
quality score.
|
|
158
328
|
Can either be directly the value, or a string that indicates
|
|
159
329
|
the score format.
|
|
160
|
-
|
|
330
|
+
|
|
161
331
|
Returns
|
|
162
332
|
-------
|
|
163
|
-
|
|
333
|
+
sequences : list of dict (str -> NucleotideSequence)
|
|
164
334
|
This list contains the reads for each spot:
|
|
165
335
|
The first item contains the first read for each spot, the
|
|
166
336
|
second item contains the second read for each spot (if existing),
|
|
167
337
|
etc.
|
|
168
338
|
Each item in the list is a dictionary mapping identifiers to its
|
|
169
|
-
corresponding sequence
|
|
339
|
+
corresponding sequence.
|
|
170
340
|
"""
|
|
171
|
-
app =
|
|
341
|
+
app = cls(
|
|
342
|
+
uid, output_path_prefix, prefetch_path, fasterq_dump_path, offset
|
|
343
|
+
)
|
|
172
344
|
app.start()
|
|
173
345
|
app.join()
|
|
174
346
|
return app.get_sequences()
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
class FastaDumpApp(_DumpApp):
|
|
350
|
+
"""
|
|
351
|
+
Fetch sequencing data from the *NCBI sequence read archive*
|
|
352
|
+
(SRA) using *sra-tools*.
|
|
353
|
+
|
|
354
|
+
Parameters
|
|
355
|
+
----------
|
|
356
|
+
uid : str
|
|
357
|
+
A *unique identifier* (UID) of the file to be downloaded.
|
|
358
|
+
output_path_prefix : str, optional
|
|
359
|
+
The prefix of the path to store the downloaded FASTQ file.
|
|
360
|
+
``.fastq`` is appended to this prefix if the run contains
|
|
361
|
+
a single read per spot.
|
|
362
|
+
``_1.fastq``, ``_2.fastq``, etc. is appended if it contains
|
|
363
|
+
multiple reads per spot.
|
|
364
|
+
By default, the files are created in a temporary directory and
|
|
365
|
+
deleted after the files have been read.
|
|
366
|
+
prefetch_path, fasterq_dump_path : str, optional
|
|
367
|
+
Path to the ``prefetch`` and ``fasterq-dump`` binary,
|
|
368
|
+
respectively.
|
|
369
|
+
"""
|
|
370
|
+
|
|
371
|
+
def __init__(self, uid, output_path_prefix=None, prefetch_path="prefetch",
|
|
372
|
+
fasterq_dump_path="fasterq-dump"):
|
|
373
|
+
super().__init__(
|
|
374
|
+
uid, output_path_prefix, prefetch_path, fasterq_dump_path
|
|
375
|
+
)
|
|
376
|
+
self._fasta_files = None
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
@requires_state(AppState.CREATED)
|
|
380
|
+
def get_prefetch_options(self):
|
|
381
|
+
return
|
|
382
|
+
# TODO: Use '--eliminate-quals'
|
|
383
|
+
# when https://github.com/ncbi/sra-tools/issues/883 is resolved
|
|
384
|
+
# return "--eliminate-quals"
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
@requires_state(AppState.CREATED)
|
|
388
|
+
def get_fastq_dump_options(self):
|
|
389
|
+
return "--fasta"
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
@requires_state(AppState.JOINED)
|
|
393
|
+
def get_fasta(self):
|
|
394
|
+
"""
|
|
395
|
+
Get the `FastaFile` objects from the downloaded file(s).
|
|
396
|
+
|
|
397
|
+
Returns
|
|
398
|
+
-------
|
|
399
|
+
fasta_files : list of FastaFile
|
|
400
|
+
This list contains the reads for each spot:
|
|
401
|
+
The first item contains the first read for each spot, the
|
|
402
|
+
second item contains the second read for each spot (if existing),
|
|
403
|
+
etc.
|
|
404
|
+
"""
|
|
405
|
+
if self._fasta_files is None:
|
|
406
|
+
self._fasta_files = [
|
|
407
|
+
FastaFile.read(file_name)
|
|
408
|
+
for file_name in self.get_file_paths()
|
|
409
|
+
]
|
|
410
|
+
return self._fasta_files
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
@requires_state(AppState.JOINED)
|
|
414
|
+
def get_sequences(self):
|
|
415
|
+
return [get_sequences(fasta_file) for fasta_file in self.get_fasta()]
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
@classmethod
|
|
419
|
+
def fetch(cls, uid, output_path_prefix=None, prefetch_path="prefetch",
|
|
420
|
+
fasterq_dump_path="fasterq-dump"):
|
|
421
|
+
"""
|
|
422
|
+
Get the sequences belonging to the UID from the
|
|
423
|
+
*NCBI sequence read archive* (SRA).
|
|
424
|
+
|
|
425
|
+
Parameters
|
|
426
|
+
----------
|
|
427
|
+
uid : str
|
|
428
|
+
A *unique identifier* (UID) of the file to be downloaded.
|
|
429
|
+
output_path_prefix : str, optional
|
|
430
|
+
The prefix of the path to store the downloaded FASTQ file.
|
|
431
|
+
``.fastq`` is appended to this prefix if the run contains
|
|
432
|
+
a single read per spot.
|
|
433
|
+
``_1.fastq``, ``_2.fastq``, etc. is appended if it contains
|
|
434
|
+
multiple reads per spot.
|
|
435
|
+
By default, the files are created in a temporary directory and
|
|
436
|
+
deleted after the files have been read.
|
|
437
|
+
prefetch_path, fasterq_dump_path : str, optional
|
|
438
|
+
Path to the ``prefetch`` and ``fasterq-dump`` binary,
|
|
439
|
+
respectively.
|
|
440
|
+
|
|
441
|
+
Returns
|
|
442
|
+
-------
|
|
443
|
+
sequences : list of dict (str -> NucleotideSequence)
|
|
444
|
+
This list contains the reads for each spot:
|
|
445
|
+
The first item contains the first read for each spot, the
|
|
446
|
+
second item contains the second read for each spot (if existing),
|
|
447
|
+
etc.
|
|
448
|
+
Each item in the list is a dictionary mapping identifiers to its
|
|
449
|
+
corresponding sequence.
|
|
450
|
+
"""
|
|
451
|
+
app = cls(
|
|
452
|
+
uid, output_path_prefix, prefetch_path, fasterq_dump_path
|
|
453
|
+
)
|
|
454
|
+
app.start()
|
|
455
|
+
app.join()
|
|
456
|
+
return app.get_sequences()
|
biotite/database/entrez/check.py
CHANGED
|
@@ -6,6 +6,7 @@ __name__ = "biotite.database.entrez"
|
|
|
6
6
|
__author__ = "Patrick Kunzmann, Maximilian Dombrowsky"
|
|
7
7
|
__all__ = ["check_for_errors"]
|
|
8
8
|
|
|
9
|
+
import json
|
|
9
10
|
from ..error import RequestError
|
|
10
11
|
|
|
11
12
|
|
|
@@ -29,17 +30,27 @@ _error_messages = [
|
|
|
29
30
|
def check_for_errors(message):
|
|
30
31
|
"""
|
|
31
32
|
Check for common error messages in NCBI Entrez database responses.
|
|
32
|
-
|
|
33
|
+
|
|
33
34
|
Parameters
|
|
34
35
|
----------
|
|
35
36
|
message : str
|
|
36
|
-
The message received from NCBI Entrez.
|
|
37
|
-
|
|
37
|
+
The message received from NCBI Entrez.
|
|
38
|
+
|
|
38
39
|
Raises
|
|
39
40
|
------
|
|
40
41
|
RequestError
|
|
41
42
|
If the message contains an error message.
|
|
42
43
|
"""
|
|
44
|
+
# Server can respond with short JSON error messages
|
|
45
|
+
if len(message) < 500:
|
|
46
|
+
try:
|
|
47
|
+
message_json = json.loads(message)
|
|
48
|
+
if "error" in message_json:
|
|
49
|
+
raise RequestError(message_json["error"])
|
|
50
|
+
except json.decoder.JSONDecodeError:
|
|
51
|
+
# It is not a JSON message
|
|
52
|
+
pass
|
|
53
|
+
|
|
43
54
|
# Errors always appear at the end of the message
|
|
44
55
|
message_end = message[-200:]
|
|
45
56
|
# Seemingly arbitrary '+' characters are in NCBI error messages
|
|
@@ -13,6 +13,7 @@ import io
|
|
|
13
13
|
import requests
|
|
14
14
|
from .check import check_for_errors
|
|
15
15
|
from .dbnames import sanitize_database_name
|
|
16
|
+
from .key import get_api_key
|
|
16
17
|
from ..error import RequestError
|
|
17
18
|
|
|
18
19
|
|
|
@@ -23,15 +24,15 @@ def fetch(uids, target_path, suffix, db_name, ret_type,
|
|
|
23
24
|
ret_mode="text", overwrite=False, verbose=False):
|
|
24
25
|
"""
|
|
25
26
|
Download files from the NCBI Entrez database in various formats.
|
|
26
|
-
|
|
27
|
+
|
|
27
28
|
The data for each UID will be fetched into a separate file.
|
|
28
|
-
|
|
29
|
+
|
|
29
30
|
A list of valid database, retrieval type and mode combinations can
|
|
30
31
|
be found under
|
|
31
32
|
`<https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/?report=objectonly>`_
|
|
32
|
-
|
|
33
|
+
|
|
33
34
|
This function requires an internet connection.
|
|
34
|
-
|
|
35
|
+
|
|
35
36
|
Parameters
|
|
36
37
|
----------
|
|
37
38
|
uids : str or iterable object of str
|
|
@@ -58,7 +59,7 @@ def fetch(uids, target_path, suffix, db_name, ret_type,
|
|
|
58
59
|
verbose: bool, optional
|
|
59
60
|
If true, the function will output the download progress.
|
|
60
61
|
(Default: False)
|
|
61
|
-
|
|
62
|
+
|
|
62
63
|
Returns
|
|
63
64
|
-------
|
|
64
65
|
files : str or StringIO or BytesIO or list of (str or StringIO or BytesIO)
|
|
@@ -68,7 +69,7 @@ def fetch(uids, target_path, suffix, db_name, ret_type,
|
|
|
68
69
|
object) was given, a list of strings is returned.
|
|
69
70
|
If `target_path` is ``None``, the file contents are stored in
|
|
70
71
|
either `StringIO` or `BytesIO` objects.
|
|
71
|
-
|
|
72
|
+
|
|
72
73
|
Warnings
|
|
73
74
|
--------
|
|
74
75
|
Even if you give valid input to this function, in rare cases the
|
|
@@ -76,14 +77,14 @@ def fetch(uids, target_path, suffix, db_name, ret_type,
|
|
|
76
77
|
In these cases the request should be retried.
|
|
77
78
|
When the issue occurs repeatedly, the error is probably in your
|
|
78
79
|
input.
|
|
79
|
-
|
|
80
|
+
|
|
80
81
|
See also
|
|
81
82
|
--------
|
|
82
83
|
fetch_single_file
|
|
83
|
-
|
|
84
|
+
|
|
84
85
|
Examples
|
|
85
86
|
--------
|
|
86
|
-
|
|
87
|
+
|
|
87
88
|
>>> import os.path
|
|
88
89
|
>>> files = fetch(["1L2Y_A","3O5R_A"], path_to_directory, suffix="fa",
|
|
89
90
|
... db_name="protein", ret_type="fasta")
|
|
@@ -122,6 +123,9 @@ def fetch(uids, target_path, suffix, db_name, ret_type,
|
|
|
122
123
|
"tool" : "Biotite",
|
|
123
124
|
"mail" : "padix.key@gmail.com"
|
|
124
125
|
}
|
|
126
|
+
api_key = get_api_key()
|
|
127
|
+
if api_key is not None:
|
|
128
|
+
param_dict["api_key"] = api_key
|
|
125
129
|
r = requests.get(_fetch_url, params=param_dict)
|
|
126
130
|
content = r.text
|
|
127
131
|
check_for_errors(content)
|
|
@@ -147,7 +151,7 @@ def fetch_single_file(uids, file_name, db_name, ret_type, ret_mode="text",
|
|
|
147
151
|
"""
|
|
148
152
|
Almost the same as :func:`fetch()`, but the data for the given UIDs
|
|
149
153
|
will be stored in a single file.
|
|
150
|
-
|
|
154
|
+
|
|
151
155
|
Parameters
|
|
152
156
|
----------
|
|
153
157
|
uids : iterable object of str
|
|
@@ -164,14 +168,14 @@ def fetch_single_file(uids, file_name, db_name, ret_type, ret_mode="text",
|
|
|
164
168
|
overwrite : bool, optional
|
|
165
169
|
If false, the file is only downloaded, if no file with the same
|
|
166
170
|
name already exists.
|
|
167
|
-
|
|
171
|
+
|
|
168
172
|
Returns
|
|
169
173
|
-------
|
|
170
174
|
file : str or StringIO or BytesIO
|
|
171
175
|
The file name of the downloaded file.
|
|
172
176
|
If `file_name` is ``None``, the file content is stored in
|
|
173
177
|
either a `StringIO` or a `BytesIO` object.
|
|
174
|
-
|
|
178
|
+
|
|
175
179
|
Warnings
|
|
176
180
|
--------
|
|
177
181
|
Even if you give valid input to this function, in rare cases the
|
|
@@ -179,7 +183,7 @@ def fetch_single_file(uids, file_name, db_name, ret_type, ret_mode="text",
|
|
|
179
183
|
In these cases the request should be retried.
|
|
180
184
|
When the issue occurs repeatedly, the error is probably in your
|
|
181
185
|
input.
|
|
182
|
-
|
|
186
|
+
|
|
183
187
|
See also
|
|
184
188
|
--------
|
|
185
189
|
fetch
|
|
@@ -203,6 +207,9 @@ def fetch_single_file(uids, file_name, db_name, ret_type, ret_mode="text",
|
|
|
203
207
|
"tool" : "Biotite",
|
|
204
208
|
"mail" : "padix.key@gmail.com"
|
|
205
209
|
}
|
|
210
|
+
api_key = get_api_key()
|
|
211
|
+
if api_key is not None:
|
|
212
|
+
param_dict["api_key"] = api_key
|
|
206
213
|
r = requests.get(_fetch_url, params=param_dict)
|
|
207
214
|
content = r.text
|
|
208
215
|
check_for_errors(content)
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
__name__ = "biotite.database.entrez"
|
|
6
|
+
__author__ = "Patrick Kunzmann"
|
|
7
|
+
__all__ = ["set_api_key", "get_api_key"]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
_API_KEY = None
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def get_api_key():
|
|
14
|
+
"""
|
|
15
|
+
Get the
|
|
16
|
+
`NCBI API key <https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/>`_.
|
|
17
|
+
|
|
18
|
+
Returns
|
|
19
|
+
-------
|
|
20
|
+
api_key : str or None
|
|
21
|
+
The API key, if it was already set before, ``None`` otherwise.
|
|
22
|
+
"""
|
|
23
|
+
global _API_KEY
|
|
24
|
+
return _API_KEY
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def set_api_key(key):
|
|
28
|
+
"""
|
|
29
|
+
Set the
|
|
30
|
+
`NCBI API key <https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/>`_.
|
|
31
|
+
|
|
32
|
+
Using an API key increases the request limit on the NCBI servers
|
|
33
|
+
and is automatically used by functions in
|
|
34
|
+
:mod:`biotite.database.entrez`.
|
|
35
|
+
This key is kept only in memory and hence removed at the end of the
|
|
36
|
+
Python session.
|
|
37
|
+
|
|
38
|
+
Parameters
|
|
39
|
+
----------
|
|
40
|
+
key : str
|
|
41
|
+
The API key.
|
|
42
|
+
"""
|
|
43
|
+
global _API_KEY
|
|
44
|
+
_API_KEY = key
|