biotite 0.38.0__cp311-cp311-macosx_11_0_arm64.whl → 0.40.0__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (124)
  1. biotite/__init__.py +3 -3
  2. biotite/application/application.py +33 -28
  3. biotite/application/dssp/app.py +18 -18
  4. biotite/application/sra/__init__.py +5 -0
  5. biotite/application/sra/app.py +337 -55
  6. biotite/database/entrez/__init__.py +2 -1
  7. biotite/database/entrez/check.py +14 -3
  8. biotite/database/entrez/download.py +20 -13
  9. biotite/database/entrez/key.py +44 -0
  10. biotite/database/entrez/query.py +38 -34
  11. biotite/database/pubchem/query.py +44 -44
  12. biotite/database/rcsb/download.py +19 -14
  13. biotite/database/rcsb/query.py +46 -46
  14. biotite/sequence/align/__init__.py +5 -1
  15. biotite/sequence/align/banded.c +1408 -1025
  16. biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
  17. biotite/sequence/align/buckets.py +69 -0
  18. biotite/sequence/align/cigar.py +389 -0
  19. biotite/sequence/align/kmeralphabet.c +3220 -2850
  20. biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
  21. biotite/sequence/align/kmersimilarity.c +713 -663
  22. biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
  23. biotite/sequence/align/kmertable.cpp +68398 -0
  24. biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
  25. biotite/sequence/align/localgapped.c +1507 -1074
  26. biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
  27. biotite/sequence/align/localungapped.c +1143 -833
  28. biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
  29. biotite/sequence/align/multiple.c +1569 -1092
  30. biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
  31. biotite/sequence/align/pairwise.c +1612 -1212
  32. biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
  33. biotite/sequence/align/permutation.c +33259 -0
  34. biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
  35. biotite/sequence/align/primes.txt +821 -0
  36. biotite/sequence/align/{kmertable.c → selector.c} +9129 -16497
  37. biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
  38. biotite/sequence/align/tracetable.c +685 -646
  39. biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
  40. biotite/sequence/codec.c +1159 -841
  41. biotite/sequence/codec.cpython-311-darwin.so +0 -0
  42. biotite/sequence/graphics/alignment.py +212 -2
  43. biotite/sequence/io/genbank/annotation.py +11 -11
  44. biotite/sequence/phylo/nj.c +684 -636
  45. biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
  46. biotite/sequence/phylo/tree.c +970 -673
  47. biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
  48. biotite/sequence/phylo/upgma.c +672 -626
  49. biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
  50. biotite/structure/__init__.py +1 -1
  51. biotite/structure/atoms.py +1 -1
  52. biotite/structure/basepairs.py +7 -12
  53. biotite/structure/bonds.c +3861 -3749
  54. biotite/structure/bonds.cpython-311-darwin.so +0 -0
  55. biotite/structure/celllist.c +727 -707
  56. biotite/structure/celllist.cpython-311-darwin.so +0 -0
  57. biotite/structure/charges.c +1561 -1560
  58. biotite/structure/charges.cpython-311-darwin.so +0 -0
  59. biotite/structure/filter.py +30 -37
  60. biotite/structure/info/__init__.py +5 -8
  61. biotite/structure/info/atoms.py +25 -67
  62. biotite/structure/info/bonds.py +46 -100
  63. biotite/structure/info/ccd/README.rst +8 -0
  64. biotite/structure/info/ccd/amino_acids.txt +1646 -0
  65. biotite/structure/info/ccd/carbohydrates.txt +1133 -0
  66. biotite/structure/info/ccd/components.bcif +0 -0
  67. biotite/structure/info/ccd/nucleotides.txt +797 -0
  68. biotite/structure/info/ccd.py +95 -0
  69. biotite/structure/info/groups.py +90 -0
  70. biotite/structure/info/masses.py +21 -20
  71. biotite/structure/info/misc.py +11 -22
  72. biotite/structure/info/standardize.py +17 -12
  73. biotite/structure/io/__init__.py +2 -4
  74. biotite/structure/io/ctab.py +1 -1
  75. biotite/structure/io/general.py +37 -43
  76. biotite/structure/io/mmtf/__init__.py +3 -0
  77. biotite/structure/io/mmtf/convertarray.c +528 -365
  78. biotite/structure/io/mmtf/convertarray.cpython-311-darwin.so +0 -0
  79. biotite/structure/io/mmtf/convertfile.c +725 -676
  80. biotite/structure/io/mmtf/convertfile.cpython-311-darwin.so +0 -0
  81. biotite/structure/io/mmtf/decode.c +1070 -754
  82. biotite/structure/io/mmtf/decode.cpython-311-darwin.so +0 -0
  83. biotite/structure/io/mmtf/encode.c +727 -677
  84. biotite/structure/io/mmtf/encode.cpython-311-darwin.so +0 -0
  85. biotite/structure/io/mmtf/file.py +34 -26
  86. biotite/structure/io/npz/__init__.py +3 -0
  87. biotite/structure/io/npz/file.py +21 -18
  88. biotite/structure/io/pdb/__init__.py +3 -3
  89. biotite/structure/io/pdb/file.py +72 -70
  90. biotite/structure/io/pdb/hybrid36.c +540 -478
  91. biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
  92. biotite/structure/io/pdbqt/file.py +82 -68
  93. biotite/structure/io/pdbx/__init__.py +13 -6
  94. biotite/structure/io/pdbx/bcif.py +649 -0
  95. biotite/structure/io/pdbx/cif.py +1028 -0
  96. biotite/structure/io/pdbx/component.py +243 -0
  97. biotite/structure/io/pdbx/convert.py +707 -359
  98. biotite/structure/io/pdbx/encoding.c +112813 -0
  99. biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
  100. biotite/structure/io/pdbx/error.py +14 -0
  101. biotite/structure/io/pdbx/legacy.py +267 -0
  102. biotite/structure/molecules.py +151 -151
  103. biotite/structure/residues.py +40 -40
  104. biotite/structure/sasa.c +713 -644
  105. biotite/structure/sasa.cpython-311-darwin.so +0 -0
  106. biotite/structure/superimpose.py +158 -115
  107. biotite/visualize.py +9 -11
  108. {biotite-0.38.0.dist-info → biotite-0.40.0.dist-info}/METADATA +2 -2
  109. {biotite-0.38.0.dist-info → biotite-0.40.0.dist-info}/RECORD +112 -102
  110. {biotite-0.38.0.dist-info → biotite-0.40.0.dist-info}/WHEEL +1 -1
  111. biotite/structure/info/amino_acids.json +0 -1556
  112. biotite/structure/info/amino_acids.py +0 -42
  113. biotite/structure/info/carbohydrates.json +0 -1122
  114. biotite/structure/info/carbohydrates.py +0 -39
  115. biotite/structure/info/intra_bonds.msgpack +0 -0
  116. biotite/structure/info/link_types.msgpack +0 -1
  117. biotite/structure/info/nucleotides.json +0 -772
  118. biotite/structure/info/nucleotides.py +0 -39
  119. biotite/structure/info/residue_masses.msgpack +0 -0
  120. biotite/structure/info/residue_names.msgpack +0 -3
  121. biotite/structure/info/residues.msgpack +0 -0
  122. biotite/structure/io/pdbx/file.py +0 -652
  123. {biotite-0.38.0.dist-info → biotite-0.40.0.dist-info}/LICENSE.rst +0 -0
  124. {biotite-0.38.0.dist-info → biotite-0.40.0.dist-info}/top_level.txt +0 -0
@@ -4,19 +4,26 @@
4
4
 
5
5
  __name__ = "biotite.application.sra"
6
6
  __author__ = "Patrick Kunzmann"
7
- __all__ = ["FastqDumpApp"]
7
+ __all__ = ["FastaDumpApp", "FastqDumpApp"]
8
8
 
9
+ import abc
10
+ from os.path import join
11
+ from subprocess import Popen, SubprocessError, PIPE, TimeoutExpired
9
12
  import glob
10
- from tempfile import NamedTemporaryFile, gettempdir
11
- from ..localapp import LocalApp, cleanup_tempfile
12
- from ..application import AppState, requires_state
13
+ from tempfile import TemporaryDirectory
14
+ from ..application import Application, AppState, AppStateError, \
15
+ requires_state
16
+ from ...sequence.seqtypes import NucleotideSequence
13
17
  from ...sequence.io.fastq.file import FastqFile
14
- from ...sequence.io.fastq.convert import get_sequences
18
+ from ...sequence.io.fasta.file import FastaFile
19
+ from ...sequence.io.fastq.convert import get_sequences as get_sequences_and_scores
20
+ from ...sequence.io.fasta.convert import get_sequences
15
21
 
16
22
 
17
- class FastqDumpApp(LocalApp):
23
+ # Do not use LocalApp, as two programs are executed
24
+ class _DumpApp(Application, metaclass=abc.ABCMeta):
18
25
  """
19
- Fetch sequencing data as FASTQ from the *NCBI sequence read archive*
26
+ Fetch sequencing data from the *NCBI sequence read archive*
20
27
  (SRA) using *sra-tools*.
21
28
 
22
29
  Parameters
@@ -31,85 +38,212 @@ class FastqDumpApp(LocalApp):
31
38
  multiple reads per spot.
32
39
  By default, the files are created in a temporary directory and
33
40
  deleted after the files have been read.
34
- bin_path : str, optional
35
- Path to the ``fasterq-dump`` binary.
41
+ prefetch_path, fasterq_dump_path : str, optional
42
+ Path to the ``prefetch`` and ``fasterq-dump`` binaries,
43
+ respectively.
36
44
  offset : int or {'Sanger', 'Solexa', 'Illumina-1.3', 'Illumina-1.5', 'Illumina-1.8'}, optional
37
45
  This value is subtracted from the FASTQ ASCII code to obtain the
38
46
  quality score.
39
47
  Can either be directly the value, or a string that indicates
40
48
  the score format.
41
49
  """
42
-
43
- def __init__(self, uid, output_path_prefix=None, bin_path="fasterq-dump",
44
- offset="Sanger"):
45
- super().__init__(bin_path)
50
+
51
+ def __init__(self, uid, output_path_prefix=None,
52
+ prefetch_path="prefetch", fasterq_dump_path="fasterq-dump"):
53
+ super().__init__()
54
+ self._prefetch_path = prefetch_path
55
+ self._fasterq_dump_path = fasterq_dump_path
46
56
  self._uid = uid
47
- self._offset = offset
57
+ self._sra_dir = TemporaryDirectory(suffix="_sra")
48
58
  if output_path_prefix is None:
49
- # NamedTemporaryFile is only created to obtain prefix
50
- # for FASTQ files
51
- self._out_file = NamedTemporaryFile("r")
52
- self._prefix = self._out_file.name
59
+ self._prefix = join(self._sra_dir.name, self._uid)
53
60
  else:
54
- self._out_file = None
55
61
  self._prefix = output_path_prefix
62
+ self._prefetch_process = None
63
+ self._fasterq_dump_process = None
64
+
65
+
66
+ @requires_state(AppState.RUNNING | AppState.FINISHED)
67
+ def join(self, timeout=None):
68
+ # Override method as repetitive calls of 'is_finished()'
69
+ # are not necessary as 'communicate()' already waits for the
70
+ # finished application
71
+ try:
72
+ _, self._stderr = self._process.communicate(
73
+ timeout=timeout
74
+ )
75
+ except TimeoutExpired:
76
+ self.cancel()
77
+ raise TimeoutError(
78
+ f"The application expired its timeout ({timeout:.1f} s)"
79
+ )
80
+ self._state = AppState.FINISHED
81
+
82
+ try:
83
+ self.evaluate()
84
+ except AppStateError:
85
+ raise
86
+ except:
87
+ self._state = AppState.CANCELLED
88
+ raise
89
+ else:
90
+ self._state = AppState.JOINED
91
+ self.clean_up()
92
+
56
93
 
57
94
  def run(self):
58
- self.set_arguments([
59
- "-o", self._prefix + ".fastq",
60
- "-t", gettempdir(),
61
- "-f",
62
- self._uid
63
- ])
64
- super().run()
65
-
95
+ # Prefetch into a temp directory with file name equaling UID
96
+ # This ensures that the ID in the header is not the temp prefix
97
+ sra_file_name = join(self._sra_dir.name, self._uid)
98
+ command = (
99
+ f"{self._prefetch_path} -q -O {self._sra_dir.name} "
100
+ f"{self.get_prefetch_options()} {self._uid}; "
101
+ f"{self._fasterq_dump_path} -q -o {self._prefix}.fastq "
102
+ f"{self.get_fastq_dump_options()} {sra_file_name}"
103
+ )
104
+ self._process = Popen(
105
+ command, stdout=PIPE, stderr=PIPE, shell=True, encoding="UTF-8"
106
+ )
107
+
108
+
109
+ def is_finished(self):
110
+ code = self._process.poll()
111
+ if code == None:
112
+ return False
113
+ else:
114
+ _, self._stderr = self._process.communicate()
115
+ return True
116
+
117
+
66
118
  def evaluate(self):
67
119
  super().evaluate()
120
+ # Check if application terminated correctly
121
+ exit_code = self._process.returncode
122
+ if exit_code != 0:
123
+ err_msg = self._stderr.replace("\n", " ")
124
+ raise SubprocessError(
125
+ f"'prefetch' or 'fasterq-dump' returned with exit code "
126
+ f"{exit_code}: {err_msg}"
127
+ )
128
+
68
129
  self._file_names = (
69
130
  # For entries with one read per spot
70
- glob.glob(self._prefix + ".fastq") +
131
+ glob.glob(self._prefix + ".fastq") +
71
132
  # For entries with multiple reads per spot
72
133
  glob.glob(self._prefix + "_*.fastq")
73
134
  )
74
135
  # Only load FASTQ files into memory when needed
75
136
  self._fastq_files = None
76
-
137
+
138
+
139
+ def wait_interval(self):
140
+ # Not used in this implementation of 'join()'
141
+ raise NotImplementedError()
142
+
143
+
77
144
  def clean_up(self):
78
- super().clean_up()
79
- if self._out_file is not None:
80
- # This file was only created to reserve a unique file name
81
- # Now it is not needed anymore
82
- self._out_file.close()
83
-
145
+ if self.get_app_state() == AppState.CANCELLED:
146
+ self._process.kill()
147
+ # Directory with temp files does not need to be deleted,
148
+ # as temp dir is automatically deleted upon object destruction
149
+
150
+
151
+ @requires_state(AppState.CREATED)
152
+ def get_prefetch_options(self):
153
+ """
154
+ Get additional options for the `prefetch` call.
155
+
156
+ PROTECTED: Override when inheriting.
157
+
158
+ Returns
159
+ -------
160
+ options: str
161
+ The additional options.
162
+ """
163
+ return ""
164
+
165
+ @requires_state(AppState.CREATED)
166
+ def get_fastq_dump_options(self):
167
+ """
168
+ Get additional options for the `fasterq-dump` call.
169
+
170
+ PROTECTED: Override when inheriting.
171
+
172
+ Returns
173
+ -------
174
+ options: str
175
+ The additional options.
176
+ """
177
+ return ""
178
+
179
+
84
180
  @requires_state(AppState.JOINED)
85
181
  def get_file_paths(self):
86
182
  """
87
- Get the file paths to the downloaded FASTQ files.
88
-
183
+ Get the file paths to the downloaded files.
184
+
89
185
  Returns
90
186
  -------
91
187
  paths : list of str
92
188
  The file paths to the downloaded files.
93
189
  """
94
190
  return self._file_names
95
-
191
+
192
+
96
193
  @requires_state(AppState.JOINED)
194
+ @abc.abstractmethod
97
195
  def get_sequences(self):
98
196
  """
99
- Get the sequences and score values from the downloaded file(s).
100
-
197
+ Get the sequences from the downloaded file(s).
198
+
101
199
  Returns
102
200
  -------
103
- sequences_and_scores : list of dict (str -> (NucleotideSequence, ndarray))
201
+ sequences : list of dict (str -> NucleotideSequence)
104
202
  This list contains the reads for each spot:
105
203
  The first item contains the first read for each spot, the
106
204
  second item contains the second read for each spot (if existing),
107
205
  etc.
108
206
  Each item in the list is a dictionary mapping identifiers to its
109
- corresponding sequence and score values.
207
+ corresponding sequence.
110
208
  """
111
- fastq_files = self.get_fastq()
112
- return [get_sequences(fastq_file) for fastq_file in fastq_files]
209
+ pass
210
+
211
+
212
+ class FastqDumpApp(_DumpApp):
213
+ """
214
+ Fetch sequencing data from the *NCBI sequence read archive*
215
+ (SRA) using *sra-tools*.
216
+
217
+ Parameters
218
+ ----------
219
+ uid : str
220
+ A *unique identifier* (UID) of the file to be downloaded.
221
+ output_path_prefix : str, optional
222
+ The prefix of the path to store the downloaded FASTQ file.
223
+ ``.fastq`` is appended to this prefix if the run contains
224
+ a single read per spot.
225
+ ``_1.fastq``, ``_2.fastq``, etc. is appended if it contains
226
+ multiple reads per spot.
227
+ By default, the files are created in a temporary directory and
228
+ deleted after the files have been read.
229
+ prefetch_path, fasterq_dump_path : str, optional
230
+ Path to the ``prefetch`` and ``fasterq-dump`` binaries,
231
+ respectively.
232
+ offset : int or {'Sanger', 'Solexa', 'Illumina-1.3', 'Illumina-1.5', 'Illumina-1.8'}, optional
233
+ This value is subtracted from the FASTQ ASCII code to obtain the
234
+ quality score.
235
+ Can either be directly the value, or a string that indicates
236
+ the score format.
237
+ """
238
+
239
+ def __init__(self, uid, output_path_prefix=None, prefetch_path="prefetch",
240
+ fasterq_dump_path="fasterq-dump", offset="Sanger"):
241
+ super().__init__(
242
+ uid, output_path_prefix, prefetch_path, fasterq_dump_path
243
+ )
244
+ self._offset = offset
245
+ self._fastq_files = None
246
+
113
247
 
114
248
  @requires_state(AppState.JOINED)
115
249
  def get_fastq(self):
@@ -130,12 +264,47 @@ class FastqDumpApp(LocalApp):
130
264
  for file_name in self.get_file_paths()
131
265
  ]
132
266
  return self._fastq_files
133
-
134
- @staticmethod
135
- def fetch(uid, output_path_prefix=None, bin_path="fasterq-dump",
136
- offset="Sanger"):
267
+
268
+
269
+ @requires_state(AppState.JOINED)
270
+ def get_sequences(self):
271
+ return [
272
+ {
273
+ header: NucleotideSequence(
274
+ seq_str.replace("U","T").replace("X","N")
275
+ )
276
+ for header, (seq_str, _) in fastq_file.items()
277
+ }
278
+ for fastq_file in self.get_fastq()
279
+ ]
280
+
281
+
282
+ @requires_state(AppState.JOINED)
283
+ def get_sequences_and_scores(self):
284
+ """
285
+ Get the sequences and score values from the downloaded file(s).
286
+
287
+ Returns
288
+ -------
289
+ sequences_and_scores : list of dict (str -> (NucleotideSequence, ndarray))
290
+ This list contains the reads for each spot:
291
+ The first item contains the first read for each spot, the
292
+ second item contains the second read for each spot (if existing),
293
+ etc.
294
+ Each item in the list is a dictionary mapping identifiers to its
295
+ corresponding sequence and score values.
137
296
  """
138
- Get the sequences and score values belonging to the UID from the
297
+ return [
298
+ get_sequences_and_scores(fastq_file)
299
+ for fastq_file in self.get_fastq()
300
+ ]
301
+
302
+
303
+ @classmethod
304
+ def fetch(cls, uid, output_path_prefix=None, prefetch_path="prefetch",
305
+ fasterq_dump_path="fasterq-dump", offset="Sanger"):
306
+ """
307
+ Get the sequences belonging to the UID from the
139
308
  *NCBI sequence read archive* (SRA).
140
309
 
141
310
  Parameters
@@ -150,25 +319,138 @@ class FastqDumpApp(LocalApp):
150
319
  multiple reads per spot.
151
320
  By default, the files are created in a temporary directory and
152
321
  deleted after the files have been read.
153
- bin_path : str, optional
154
- Path to the ``fasterq-dump`` binary.
322
+ prefetch_path, fasterq_dump_path : str, optional
323
+ Path to the ``prefetch`` and ``fasterq-dump`` binaries,
324
+ respectively.
155
325
  offset : int or {'Sanger', 'Solexa', 'Illumina-1.3', 'Illumina-1.5', 'Illumina-1.8'}, optional
156
326
  This value is subtracted from the FASTQ ASCII code to obtain the
157
327
  quality score.
158
328
  Can either be directly the value, or a string that indicates
159
329
  the score format.
160
-
330
+
161
331
  Returns
162
332
  -------
163
- sequences_and_scores : list of dict (str -> (NucleotideSequence, ndarray))
333
+ sequences : list of dict (str -> NucleotideSequence)
164
334
  This list contains the reads for each spot:
165
335
  The first item contains the first read for each spot, the
166
336
  second item contains the second read for each spot (if existing),
167
337
  etc.
168
338
  Each item in the list is a dictionary mapping identifiers to its
169
- corresponding sequence and score values.
339
+ corresponding sequence.
170
340
  """
171
- app = FastqDumpApp(uid, output_path_prefix, bin_path, offset)
341
+ app = cls(
342
+ uid, output_path_prefix, prefetch_path, fasterq_dump_path, offset
343
+ )
172
344
  app.start()
173
345
  app.join()
174
346
  return app.get_sequences()
347
+
348
+
349
+ class FastaDumpApp(_DumpApp):
350
+ """
351
+ Fetch sequencing data from the *NCBI sequence read archive*
352
+ (SRA) using *sra-tools*.
353
+
354
+ Parameters
355
+ ----------
356
+ uid : str
357
+ A *unique identifier* (UID) of the file to be downloaded.
358
+ output_path_prefix : str, optional
359
+ The prefix of the path to store the downloaded FASTQ file.
360
+ ``.fastq`` is appended to this prefix if the run contains
361
+ a single read per spot.
362
+ ``_1.fastq``, ``_2.fastq``, etc. is appended if it contains
363
+ multiple reads per spot.
364
+ By default, the files are created in a temporary directory and
365
+ deleted after the files have been read.
366
+ prefetch_path, fasterq_dump_path : str, optional
367
+ Path to the ``prefetch`` and ``fasterq-dump`` binaries,
368
+ respectively.
369
+ """
370
+
371
+ def __init__(self, uid, output_path_prefix=None, prefetch_path="prefetch",
372
+ fasterq_dump_path="fasterq-dump"):
373
+ super().__init__(
374
+ uid, output_path_prefix, prefetch_path, fasterq_dump_path
375
+ )
376
+ self._fasta_files = None
377
+
378
+
379
+ @requires_state(AppState.CREATED)
380
+ def get_prefetch_options(self):
381
+ return
382
+ # TODO: Use '--eliminate-quals'
383
+ # when https://github.com/ncbi/sra-tools/issues/883 is resolved
384
+ # return "--eliminate-quals"
385
+
386
+
387
+ @requires_state(AppState.CREATED)
388
+ def get_fastq_dump_options(self):
389
+ return "--fasta"
390
+
391
+
392
+ @requires_state(AppState.JOINED)
393
+ def get_fasta(self):
394
+ """
395
+ Get the `FastaFile` objects from the downloaded file(s).
396
+
397
+ Returns
398
+ -------
399
+ fasta_files : list of FastaFile
400
+ This list contains the reads for each spot:
401
+ The first item contains the first read for each spot, the
402
+ second item contains the second read for each spot (if existing),
403
+ etc.
404
+ """
405
+ if self._fasta_files is None:
406
+ self._fasta_files = [
407
+ FastaFile.read(file_name)
408
+ for file_name in self.get_file_paths()
409
+ ]
410
+ return self._fasta_files
411
+
412
+
413
+ @requires_state(AppState.JOINED)
414
+ def get_sequences(self):
415
+ return [get_sequences(fasta_file) for fasta_file in self.get_fasta()]
416
+
417
+
418
+ @classmethod
419
+ def fetch(cls, uid, output_path_prefix=None, prefetch_path="prefetch",
420
+ fasterq_dump_path="fasterq-dump"):
421
+ """
422
+ Get the sequences belonging to the UID from the
423
+ *NCBI sequence read archive* (SRA).
424
+
425
+ Parameters
426
+ ----------
427
+ uid : str
428
+ A *unique identifier* (UID) of the file to be downloaded.
429
+ output_path_prefix : str, optional
430
+ The prefix of the path to store the downloaded FASTQ file.
431
+ ``.fastq`` is appended to this prefix if the run contains
432
+ a single read per spot.
433
+ ``_1.fastq``, ``_2.fastq``, etc. is appended if it contains
434
+ multiple reads per spot.
435
+ By default, the files are created in a temporary directory and
436
+ deleted after the files have been read.
437
+ prefetch_path, fasterq_dump_path : str, optional
438
+ Path to the ``prefetch`` and ``fasterq-dump`` binaries,
439
+ respectively.
440
+
441
+ Returns
442
+ -------
443
+ sequences : list of dict (str -> NucleotideSequence)
444
+ This list contains the reads for each spot:
445
+ The first item contains the first read for each spot, the
446
+ second item contains the second read for each spot (if existing),
447
+ etc.
448
+ Each item in the list is a dictionary mapping identifiers to its
449
+ corresponding sequence.
450
+ """
451
+ app = cls(
452
+ uid, output_path_prefix, prefetch_path, fasterq_dump_path
453
+ )
454
+ app.start()
455
+ app.join()
456
+ return app.get_sequences()
@@ -11,4 +11,5 @@ __author__ = "Patrick Kunzmann"
11
11
 
12
12
  from .dbnames import *
13
13
  from .download import *
14
- from .query import *
14
+ from .query import *
15
+ from .key import *
@@ -6,6 +6,7 @@ __name__ = "biotite.database.entrez"
6
6
  __author__ = "Patrick Kunzmann, Maximilian Dombrowsky"
7
7
  __all__ = ["check_for_errors"]
8
8
 
9
+ import json
9
10
  from ..error import RequestError
10
11
 
11
12
 
@@ -29,17 +30,27 @@ _error_messages = [
29
30
  def check_for_errors(message):
30
31
  """
31
32
  Check for common error messages in NCBI Entrez database responses.
32
-
33
+
33
34
  Parameters
34
35
  ----------
35
36
  message : str
36
- The message received from NCBI Entrez.
37
-
37
+ The message received from NCBI Entrez.
38
+
38
39
  Raises
39
40
  ------
40
41
  RequestError
41
42
  If the message contains an error message.
42
43
  """
44
+ # Server can respond with short JSON error messages
45
+ if len(message) < 500:
46
+ try:
47
+ message_json = json.loads(message)
48
+ if "error" in message_json:
49
+ raise RequestError(message_json["error"])
50
+ except json.decoder.JSONDecodeError:
51
+ # It is not a JSON message
52
+ pass
53
+
43
54
  # Errors always appear at the end of the message
44
55
  message_end = message[-200:]
45
56
  # Seemingly arbitrary '+' characters are in NCBI error messages
@@ -13,6 +13,7 @@ import io
13
13
  import requests
14
14
  from .check import check_for_errors
15
15
  from .dbnames import sanitize_database_name
16
+ from .key import get_api_key
16
17
  from ..error import RequestError
17
18
 
18
19
 
@@ -23,15 +24,15 @@ def fetch(uids, target_path, suffix, db_name, ret_type,
23
24
  ret_mode="text", overwrite=False, verbose=False):
24
25
  """
25
26
  Download files from the NCBI Entrez database in various formats.
26
-
27
+
27
28
  The data for each UID will be fetched into a separate file.
28
-
29
+
29
30
  A list of valid database, retrieval type and mode combinations can
30
31
  be found under
31
32
  `<https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/?report=objectonly>`_
32
-
33
+
33
34
  This function requires an internet connection.
34
-
35
+
35
36
  Parameters
36
37
  ----------
37
38
  uids : str or iterable object of str
@@ -58,7 +59,7 @@ def fetch(uids, target_path, suffix, db_name, ret_type,
58
59
  verbose: bool, optional
59
60
  If true, the function will output the download progress.
60
61
  (Default: False)
61
-
62
+
62
63
  Returns
63
64
  -------
64
65
  files : str or StringIO or BytesIO or list of (str or StringIO or BytesIO)
@@ -68,7 +69,7 @@ def fetch(uids, target_path, suffix, db_name, ret_type,
68
69
  object) was given, a list of strings is returned.
69
70
  If `target_path` is ``None``, the file contents are stored in
70
71
  either `StringIO` or `BytesIO` objects.
71
-
72
+
72
73
  Warnings
73
74
  --------
74
75
  Even if you give valid input to this function, in rare cases the
@@ -76,14 +77,14 @@ def fetch(uids, target_path, suffix, db_name, ret_type,
76
77
  In these cases the request should be retried.
77
78
  When the issue occurs repeatedly, the error is probably in your
78
79
  input.
79
-
80
+
80
81
  See also
81
82
  --------
82
83
  fetch_single_file
83
-
84
+
84
85
  Examples
85
86
  --------
86
-
87
+
87
88
  >>> import os.path
88
89
  >>> files = fetch(["1L2Y_A","3O5R_A"], path_to_directory, suffix="fa",
89
90
  ... db_name="protein", ret_type="fasta")
@@ -122,6 +123,9 @@ def fetch(uids, target_path, suffix, db_name, ret_type,
122
123
  "tool" : "Biotite",
123
124
  "mail" : "padix.key@gmail.com"
124
125
  }
126
+ api_key = get_api_key()
127
+ if api_key is not None:
128
+ param_dict["api_key"] = api_key
125
129
  r = requests.get(_fetch_url, params=param_dict)
126
130
  content = r.text
127
131
  check_for_errors(content)
@@ -147,7 +151,7 @@ def fetch_single_file(uids, file_name, db_name, ret_type, ret_mode="text",
147
151
  """
148
152
  Almost the same as :func:`fetch()`, but the data for the given UIDs
149
153
  will be stored in a single file.
150
-
154
+
151
155
  Parameters
152
156
  ----------
153
157
  uids : iterable object of str
@@ -164,14 +168,14 @@ def fetch_single_file(uids, file_name, db_name, ret_type, ret_mode="text",
164
168
  overwrite : bool, optional
165
169
  If false, the file is only downloaded, if no file with the same
166
170
  name already exists.
167
-
171
+
168
172
  Returns
169
173
  -------
170
174
  file : str or StringIO or BytesIO
171
175
  The file name of the downloaded file.
172
176
  If `file_name` is ``None``, the file content is stored in
173
177
  either a `StringIO` or a `BytesIO` object.
174
-
178
+
175
179
  Warnings
176
180
  --------
177
181
  Even if you give valid input to this function, in rare cases the
@@ -179,7 +183,7 @@ def fetch_single_file(uids, file_name, db_name, ret_type, ret_mode="text",
179
183
  In these cases the request should be retried.
180
184
  When the issue occurs repeatedly, the error is probably in your
181
185
  input.
182
-
186
+
183
187
  See also
184
188
  --------
185
189
  fetch
@@ -203,6 +207,9 @@ def fetch_single_file(uids, file_name, db_name, ret_type, ret_mode="text",
203
207
  "tool" : "Biotite",
204
208
  "mail" : "padix.key@gmail.com"
205
209
  }
210
+ api_key = get_api_key()
211
+ if api_key is not None:
212
+ param_dict["api_key"] = api_key
206
213
  r = requests.get(_fetch_url, params=param_dict)
207
214
  content = r.text
208
215
  check_for_errors(content)
@@ -0,0 +1,44 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.database.entrez"
6
+ __author__ = "Patrick Kunzmann"
7
+ __all__ = ["set_api_key", "get_api_key"]
8
+
9
+
10
+ _API_KEY = None
11
+
12
+
13
+ def get_api_key():
14
+ """
15
+ Get the
16
+ `NCBI API key <https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/>`_.
17
+
18
+ Returns
19
+ -------
20
+ api_key : str or None
21
+ The API key, if it was already set before, ``None`` otherwise.
22
+ """
23
+ global _API_KEY
24
+ return _API_KEY
25
+
26
+
27
+ def set_api_key(key):
28
+ """
29
+ Set the
30
+ `NCBI API key <https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/>`_.
31
+
32
+ Using an API key increases the request limit on the NCBI servers
33
+ and is automatically used by functions in
34
+ :mod:`biotite.database.entrez`.
35
+ This key is kept only in memory and hence removed in the end of the
36
+ Python session.
37
+
38
+ Parameters
39
+ ----------
40
+ key : str
41
+ The API key.
42
+ """
43
+ global _API_KEY
44
+ _API_KEY = key