biotite 0.41.2__cp310-cp310-macosx_11_0_arm64.whl → 1.0.1__cp310-cp310-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +2 -3
- biotite/application/__init__.py +1 -1
- biotite/application/application.py +20 -10
- biotite/application/autodock/__init__.py +1 -1
- biotite/application/autodock/app.py +74 -79
- biotite/application/blast/__init__.py +1 -1
- biotite/application/blast/alignment.py +19 -10
- biotite/application/blast/webapp.py +92 -85
- biotite/application/clustalo/__init__.py +1 -1
- biotite/application/clustalo/app.py +46 -61
- biotite/application/dssp/__init__.py +1 -1
- biotite/application/dssp/app.py +8 -11
- biotite/application/localapp.py +62 -60
- biotite/application/mafft/__init__.py +1 -1
- biotite/application/mafft/app.py +16 -22
- biotite/application/msaapp.py +78 -89
- biotite/application/muscle/__init__.py +1 -1
- biotite/application/muscle/app3.py +50 -64
- biotite/application/muscle/app5.py +23 -31
- biotite/application/sra/__init__.py +1 -1
- biotite/application/sra/app.py +64 -68
- biotite/application/tantan/__init__.py +1 -1
- biotite/application/tantan/app.py +22 -45
- biotite/application/util.py +7 -9
- biotite/application/viennarna/rnaalifold.py +34 -28
- biotite/application/viennarna/rnafold.py +24 -39
- biotite/application/viennarna/rnaplot.py +36 -21
- biotite/application/viennarna/util.py +17 -12
- biotite/application/webapp.py +13 -14
- biotite/copyable.py +13 -13
- biotite/database/__init__.py +1 -1
- biotite/database/entrez/__init__.py +1 -1
- biotite/database/entrez/check.py +2 -3
- biotite/database/entrez/dbnames.py +7 -5
- biotite/database/entrez/download.py +55 -49
- biotite/database/entrez/key.py +1 -1
- biotite/database/entrez/query.py +62 -23
- biotite/database/error.py +2 -1
- biotite/database/pubchem/__init__.py +1 -1
- biotite/database/pubchem/download.py +43 -45
- biotite/database/pubchem/error.py +2 -2
- biotite/database/pubchem/query.py +34 -31
- biotite/database/pubchem/throttle.py +3 -4
- biotite/database/rcsb/__init__.py +1 -1
- biotite/database/rcsb/download.py +44 -52
- biotite/database/rcsb/query.py +85 -80
- biotite/database/uniprot/check.py +6 -3
- biotite/database/uniprot/download.py +6 -11
- biotite/database/uniprot/query.py +115 -31
- biotite/file.py +12 -31
- biotite/sequence/__init__.py +3 -3
- biotite/sequence/align/__init__.py +2 -2
- biotite/sequence/align/alignment.py +99 -90
- biotite/sequence/align/banded.cpython-310-darwin.so +0 -0
- biotite/sequence/align/buckets.py +12 -10
- biotite/sequence/align/cigar.py +43 -52
- biotite/sequence/align/kmeralphabet.cpython-310-darwin.so +0 -0
- biotite/sequence/align/kmeralphabet.pyx +55 -51
- biotite/sequence/align/kmersimilarity.cpython-310-darwin.so +0 -0
- biotite/sequence/align/kmertable.cpython-310-darwin.so +0 -0
- biotite/sequence/align/kmertable.pyx +3 -2
- biotite/sequence/align/localgapped.cpython-310-darwin.so +0 -0
- biotite/sequence/align/localungapped.cpython-310-darwin.so +0 -0
- biotite/sequence/align/matrix.py +81 -82
- biotite/sequence/align/multiple.cpython-310-darwin.so +0 -0
- biotite/sequence/align/multiple.pyx +1 -1
- biotite/sequence/align/pairwise.cpython-310-darwin.so +0 -0
- biotite/sequence/align/permutation.cpython-310-darwin.so +0 -0
- biotite/sequence/align/permutation.pyx +12 -4
- biotite/sequence/align/selector.cpython-310-darwin.so +0 -0
- biotite/sequence/align/selector.pyx +52 -54
- biotite/sequence/align/statistics.py +32 -33
- biotite/sequence/align/tracetable.cpython-310-darwin.so +0 -0
- biotite/sequence/alphabet.py +51 -65
- biotite/sequence/annotation.py +78 -77
- biotite/sequence/codec.cpython-310-darwin.so +0 -0
- biotite/sequence/codon.py +90 -79
- biotite/sequence/graphics/__init__.py +1 -1
- biotite/sequence/graphics/alignment.py +184 -103
- biotite/sequence/graphics/colorschemes.py +10 -12
- biotite/sequence/graphics/dendrogram.py +79 -34
- biotite/sequence/graphics/features.py +133 -99
- biotite/sequence/graphics/logo.py +22 -28
- biotite/sequence/graphics/plasmid.py +229 -178
- biotite/sequence/io/fasta/__init__.py +1 -1
- biotite/sequence/io/fasta/convert.py +44 -33
- biotite/sequence/io/fasta/file.py +42 -55
- biotite/sequence/io/fastq/__init__.py +1 -1
- biotite/sequence/io/fastq/convert.py +11 -14
- biotite/sequence/io/fastq/file.py +68 -112
- biotite/sequence/io/genbank/__init__.py +2 -2
- biotite/sequence/io/genbank/annotation.py +12 -20
- biotite/sequence/io/genbank/file.py +74 -76
- biotite/sequence/io/genbank/metadata.py +74 -62
- biotite/sequence/io/genbank/sequence.py +13 -14
- biotite/sequence/io/general.py +39 -30
- biotite/sequence/io/gff/__init__.py +2 -2
- biotite/sequence/io/gff/convert.py +10 -15
- biotite/sequence/io/gff/file.py +81 -65
- biotite/sequence/phylo/__init__.py +1 -1
- biotite/sequence/phylo/nj.cpython-310-darwin.so +0 -0
- biotite/sequence/phylo/tree.cpython-310-darwin.so +0 -0
- biotite/sequence/phylo/upgma.cpython-310-darwin.so +0 -0
- biotite/sequence/profile.py +57 -28
- biotite/sequence/search.py +17 -15
- biotite/sequence/seqtypes.py +200 -164
- biotite/sequence/sequence.py +15 -17
- biotite/structure/__init__.py +3 -3
- biotite/structure/atoms.py +246 -236
- biotite/structure/basepairs.py +260 -271
- biotite/structure/bonds.cpython-310-darwin.so +0 -0
- biotite/structure/bonds.pyx +29 -32
- biotite/structure/box.py +67 -71
- biotite/structure/celllist.cpython-310-darwin.so +0 -0
- biotite/structure/chains.py +55 -39
- biotite/structure/charges.cpython-310-darwin.so +0 -0
- biotite/structure/compare.py +32 -32
- biotite/structure/density.py +13 -18
- biotite/structure/dotbracket.py +20 -22
- biotite/structure/error.py +10 -2
- biotite/structure/filter.py +83 -78
- biotite/structure/geometry.py +130 -119
- biotite/structure/graphics/atoms.py +60 -43
- biotite/structure/graphics/rna.py +81 -68
- biotite/structure/hbond.py +112 -93
- biotite/structure/info/__init__.py +0 -2
- biotite/structure/info/atoms.py +10 -11
- biotite/structure/info/bonds.py +41 -43
- biotite/structure/info/ccd.py +4 -5
- biotite/structure/info/groups.py +1 -3
- biotite/structure/info/masses.py +5 -10
- biotite/structure/info/misc.py +1 -1
- biotite/structure/info/radii.py +20 -20
- biotite/structure/info/standardize.py +15 -26
- biotite/structure/integrity.py +18 -71
- biotite/structure/io/__init__.py +3 -4
- biotite/structure/io/dcd/__init__.py +1 -1
- biotite/structure/io/dcd/file.py +22 -20
- biotite/structure/io/general.py +47 -61
- biotite/structure/io/gro/__init__.py +1 -1
- biotite/structure/io/gro/file.py +73 -72
- biotite/structure/io/mol/__init__.py +1 -1
- biotite/structure/io/mol/convert.py +8 -11
- biotite/structure/io/mol/ctab.py +37 -36
- biotite/structure/io/mol/header.py +14 -10
- biotite/structure/io/mol/mol.py +9 -53
- biotite/structure/io/mol/sdf.py +47 -50
- biotite/structure/io/netcdf/__init__.py +1 -1
- biotite/structure/io/netcdf/file.py +24 -23
- biotite/structure/io/pdb/__init__.py +1 -1
- biotite/structure/io/pdb/convert.py +32 -20
- biotite/structure/io/pdb/file.py +151 -172
- biotite/structure/io/pdb/hybrid36.cpython-310-darwin.so +0 -0
- biotite/structure/io/pdbqt/__init__.py +1 -1
- biotite/structure/io/pdbqt/convert.py +17 -11
- biotite/structure/io/pdbqt/file.py +128 -80
- biotite/structure/io/pdbx/__init__.py +1 -2
- biotite/structure/io/pdbx/bcif.py +36 -44
- biotite/structure/io/pdbx/cif.py +140 -110
- biotite/structure/io/pdbx/component.py +10 -16
- biotite/structure/io/pdbx/convert.py +260 -258
- biotite/structure/io/pdbx/encoding.cpython-310-darwin.so +0 -0
- biotite/structure/io/trajfile.py +90 -107
- biotite/structure/io/trr/__init__.py +1 -1
- biotite/structure/io/trr/file.py +12 -15
- biotite/structure/io/xtc/__init__.py +1 -1
- biotite/structure/io/xtc/file.py +11 -14
- biotite/structure/mechanics.py +9 -11
- biotite/structure/molecules.py +3 -4
- biotite/structure/pseudoknots.py +53 -67
- biotite/structure/rdf.py +23 -21
- biotite/structure/repair.py +137 -86
- biotite/structure/residues.py +26 -16
- biotite/structure/sasa.cpython-310-darwin.so +0 -0
- biotite/structure/{resutil.py → segments.py} +24 -23
- biotite/structure/sequence.py +10 -11
- biotite/structure/sse.py +100 -119
- biotite/structure/superimpose.py +39 -77
- biotite/structure/transform.py +97 -71
- biotite/structure/util.py +11 -13
- biotite/version.py +2 -2
- biotite/visualize.py +69 -55
- {biotite-0.41.2.dist-info → biotite-1.0.1.dist-info}/METADATA +6 -5
- biotite-1.0.1.dist-info/RECORD +322 -0
- biotite/structure/io/ctab.py +0 -72
- biotite/structure/io/mmtf/__init__.py +0 -21
- biotite/structure/io/mmtf/assembly.py +0 -214
- biotite/structure/io/mmtf/convertarray.cpython-310-darwin.so +0 -0
- biotite/structure/io/mmtf/convertarray.pyx +0 -341
- biotite/structure/io/mmtf/convertfile.cpython-310-darwin.so +0 -0
- biotite/structure/io/mmtf/convertfile.pyx +0 -501
- biotite/structure/io/mmtf/decode.cpython-310-darwin.so +0 -0
- biotite/structure/io/mmtf/decode.pyx +0 -152
- biotite/structure/io/mmtf/encode.cpython-310-darwin.so +0 -0
- biotite/structure/io/mmtf/encode.pyx +0 -183
- biotite/structure/io/mmtf/file.py +0 -233
- biotite/structure/io/npz/__init__.py +0 -20
- biotite/structure/io/npz/file.py +0 -152
- biotite/structure/io/pdbx/legacy.py +0 -267
- biotite/structure/io/tng/__init__.py +0 -13
- biotite/structure/io/tng/file.py +0 -46
- biotite/temp.py +0 -86
- biotite-0.41.2.dist-info/RECORD +0 -340
- {biotite-0.41.2.dist-info → biotite-1.0.1.dist-info}/WHEEL +0 -0
- {biotite-0.41.2.dist-info → biotite-1.0.1.dist-info}/licenses/LICENSE.rst +0 -0
|
@@ -6,14 +6,16 @@ __name__ = "biotite.sequence.io.genbank"
|
|
|
6
6
|
__author__ = "Patrick Kunzmann"
|
|
7
7
|
__all__ = ["GenBankFile", "MultiFile"]
|
|
8
8
|
|
|
9
|
-
#import textwrap
|
|
9
|
+
# import textwrap
|
|
10
10
|
import copy
|
|
11
|
-
|
|
11
|
+
|
|
12
|
+
# import re
|
|
12
13
|
import io
|
|
13
|
-
from ....file import TextFile, InvalidFileError
|
|
14
14
|
from collections import OrderedDict
|
|
15
|
-
|
|
16
|
-
|
|
15
|
+
from biotite.file import InvalidFileError, TextFile
|
|
16
|
+
|
|
17
|
+
# from ...annotation import Location, Feature, Annotation, AnnotatedSequence
|
|
18
|
+
# from ...seqtypes import NucleotideSequence, ProteinSequence
|
|
17
19
|
|
|
18
20
|
|
|
19
21
|
class GenBankFile(TextFile):
|
|
@@ -33,7 +35,7 @@ class GenBankFile(TextFile):
|
|
|
33
35
|
Some fields may occur multiple times, e.g. the *REFERENCE* field.
|
|
34
36
|
A sample GenBank file can be viewed at
|
|
35
37
|
`<https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html>`_.
|
|
36
|
-
|
|
38
|
+
|
|
37
39
|
This class provides a low-level interface for parsing, editing and
|
|
38
40
|
writing GenBank files.
|
|
39
41
|
It works like a list of field entries, where a field consists of the
|
|
@@ -47,7 +49,7 @@ class GenBankFile(TextFile):
|
|
|
47
49
|
The subfields are represented by a dictionary, with subfield names
|
|
48
50
|
being keys and the corresponding lines being values.
|
|
49
51
|
The *FEATURES* and *ORIGIN* fields have no subfields.
|
|
50
|
-
|
|
52
|
+
|
|
51
53
|
Every entry can be obtained, set and deleted via the index operator.
|
|
52
54
|
|
|
53
55
|
Notes
|
|
@@ -55,7 +57,7 @@ class GenBankFile(TextFile):
|
|
|
55
57
|
This class does not support location identifiers with references
|
|
56
58
|
to other Entrez database entries, e.g.
|
|
57
59
|
``join(1..100,J00194.1:100..202)``.
|
|
58
|
-
|
|
60
|
+
|
|
59
61
|
Examples
|
|
60
62
|
--------
|
|
61
63
|
Create a GenBank file from scratch:
|
|
@@ -79,9 +81,9 @@ class GenBankFile(TextFile):
|
|
|
79
81
|
['One line', 'A second line']
|
|
80
82
|
>>> print(subfields)
|
|
81
83
|
OrderedDict([('SUBFIELD1', ['Single Line']), ('SUBFIELD2', ['Two', 'lines'])])
|
|
82
|
-
|
|
84
|
+
|
|
83
85
|
Adding an additional field:
|
|
84
|
-
|
|
86
|
+
|
|
85
87
|
>>> file.insert(0, "OTHERFIELD", ["Another line"])
|
|
86
88
|
>>> print(len(file))
|
|
87
89
|
2
|
|
@@ -174,18 +176,18 @@ class GenBankFile(TextFile):
|
|
|
174
176
|
# and names of categories
|
|
175
177
|
self._field_pos = []
|
|
176
178
|
self._find_field_indices()
|
|
177
|
-
|
|
179
|
+
|
|
178
180
|
@classmethod
|
|
179
181
|
def read(cls, file):
|
|
180
182
|
"""
|
|
181
183
|
Read a GenBank file.
|
|
182
|
-
|
|
184
|
+
|
|
183
185
|
Parameters
|
|
184
186
|
----------
|
|
185
187
|
file : file-like object or str
|
|
186
188
|
The file to be read.
|
|
187
189
|
Alternatively a file path can be supplied.
|
|
188
|
-
|
|
190
|
+
|
|
189
191
|
Returns
|
|
190
192
|
-------
|
|
191
193
|
file_object : GenBankFile
|
|
@@ -194,16 +196,16 @@ class GenBankFile(TextFile):
|
|
|
194
196
|
file = super().read(file)
|
|
195
197
|
file._find_field_indices()
|
|
196
198
|
return file
|
|
197
|
-
|
|
199
|
+
|
|
198
200
|
def get_fields(self, name):
|
|
199
201
|
"""
|
|
200
202
|
Get all *GenBank* fields associated with a given field name.
|
|
201
|
-
|
|
203
|
+
|
|
202
204
|
Parameters
|
|
203
205
|
----------
|
|
204
206
|
name : str
|
|
205
207
|
The field name.
|
|
206
|
-
|
|
208
|
+
|
|
207
209
|
Returns
|
|
208
210
|
-------
|
|
209
211
|
fields : list of (list of str, OrderedDict of str -> str)
|
|
@@ -218,17 +220,17 @@ class GenBankFile(TextFile):
|
|
|
218
220
|
indices = self.get_indices(name)
|
|
219
221
|
# Omit the field name
|
|
220
222
|
return [self[i][1:] for i in indices]
|
|
221
|
-
|
|
223
|
+
|
|
222
224
|
def get_indices(self, name):
|
|
223
225
|
"""
|
|
224
226
|
Get the indices to all *GenBank* fields associated with a given
|
|
225
227
|
field name.
|
|
226
|
-
|
|
228
|
+
|
|
227
229
|
Parameters
|
|
228
230
|
----------
|
|
229
231
|
name : str
|
|
230
232
|
The field name.
|
|
231
|
-
|
|
233
|
+
|
|
232
234
|
Returns
|
|
233
235
|
-------
|
|
234
236
|
fields : list of int
|
|
@@ -242,7 +244,7 @@ class GenBankFile(TextFile):
|
|
|
242
244
|
if fname == name:
|
|
243
245
|
indices.append(i)
|
|
244
246
|
return indices
|
|
245
|
-
|
|
247
|
+
|
|
246
248
|
def set_field(self, name, content, subfield_dict=None):
|
|
247
249
|
"""
|
|
248
250
|
Set a *GenBank* field with the given content.
|
|
@@ -250,7 +252,7 @@ class GenBankFile(TextFile):
|
|
|
250
252
|
If the field already exists in the file, the field is
|
|
251
253
|
overwritten, otherwise a new field is created at the end of
|
|
252
254
|
the file.
|
|
253
|
-
|
|
255
|
+
|
|
254
256
|
Parameters
|
|
255
257
|
----------
|
|
256
258
|
name : str
|
|
@@ -261,7 +263,7 @@ class GenBankFile(TextFile):
|
|
|
261
263
|
The subfields of the field.
|
|
262
264
|
The dictionary maps subfield names to the content lines of
|
|
263
265
|
the respective subfield.
|
|
264
|
-
|
|
266
|
+
|
|
265
267
|
Raises
|
|
266
268
|
------
|
|
267
269
|
InvalidFileError
|
|
@@ -283,13 +285,13 @@ class GenBankFile(TextFile):
|
|
|
283
285
|
def __getitem__(self, index):
|
|
284
286
|
index = self._translate_idx(index)
|
|
285
287
|
start, stop, name = self._field_pos[index]
|
|
286
|
-
|
|
288
|
+
|
|
287
289
|
if name in ["FEATURES", "ORIGIN"]:
|
|
288
290
|
# For those two fields return the complete lines,
|
|
289
291
|
# beginning with the line after the field name
|
|
290
|
-
content = self._get_field_content(start+1, stop, indent=0)
|
|
292
|
+
content = self._get_field_content(start + 1, stop, indent=0)
|
|
291
293
|
subfield_dict = OrderedDict()
|
|
292
|
-
|
|
294
|
+
|
|
293
295
|
else:
|
|
294
296
|
# For all metadata fields use the
|
|
295
297
|
# standard GenBank indentation (=12)
|
|
@@ -297,11 +299,11 @@ class GenBankFile(TextFile):
|
|
|
297
299
|
subfield_dict = OrderedDict()
|
|
298
300
|
subfield_start = None
|
|
299
301
|
first_subfield_start = None
|
|
300
|
-
|
|
302
|
+
header = None
|
|
303
|
+
for i in range(start + 1, stop):
|
|
301
304
|
line = self.lines[i]
|
|
302
|
-
# Check if line contains a new subfield
|
|
303
|
-
# (Header beginning from first column)
|
|
304
305
|
if len(line) != 0 and line[:12].strip() != "":
|
|
306
|
+
# New header -> new subfield
|
|
305
307
|
if first_subfield_start is None:
|
|
306
308
|
first_subfield_start = i
|
|
307
309
|
# Store previous subfield
|
|
@@ -320,12 +322,10 @@ class GenBankFile(TextFile):
|
|
|
320
322
|
# that are not part of a subfield
|
|
321
323
|
if first_subfield_start is not None:
|
|
322
324
|
stop = first_subfield_start
|
|
323
|
-
content = self._get_field_content(
|
|
324
|
-
|
|
325
|
-
)
|
|
326
|
-
|
|
325
|
+
content = self._get_field_content(start, stop, indent=12)
|
|
326
|
+
|
|
327
327
|
return name, content, subfield_dict
|
|
328
|
-
|
|
328
|
+
|
|
329
329
|
def __setitem__(self, index, item):
|
|
330
330
|
index = self._translate_idx(index)
|
|
331
331
|
if not isinstance(item, tuple):
|
|
@@ -342,7 +342,7 @@ class GenBankFile(TextFile):
|
|
|
342
342
|
"Expected a tuple of name, content and optionally subfields"
|
|
343
343
|
)
|
|
344
344
|
inserted_lines = self._to_lines(name, content, subfields)
|
|
345
|
-
|
|
345
|
+
|
|
346
346
|
# Stop of field to be replaced is start of new field
|
|
347
347
|
start, old_stop, _ = self._field_pos[index]
|
|
348
348
|
# If not the last element is set,
|
|
@@ -355,12 +355,12 @@ class GenBankFile(TextFile):
|
|
|
355
355
|
# Shift the start/stop indices of the following fields
|
|
356
356
|
# by the amount of created fields
|
|
357
357
|
shift = len(inserted_lines) - (old_stop - start)
|
|
358
|
-
for i in range(index+1, len(self._field_pos)):
|
|
358
|
+
for i in range(index + 1, len(self._field_pos)):
|
|
359
359
|
old_start, old_stop, fname = self._field_pos[i]
|
|
360
|
-
self._field_pos[i] = old_start+shift, old_stop+shift, fname
|
|
360
|
+
self._field_pos[i] = old_start + shift, old_stop + shift, fname
|
|
361
361
|
# Add new entry
|
|
362
|
-
self._field_pos[index] = start, start+len(inserted_lines), name.upper()
|
|
363
|
-
|
|
362
|
+
self._field_pos[index] = start, start + len(inserted_lines), name.upper()
|
|
363
|
+
|
|
364
364
|
def __delitem__(self, index):
|
|
365
365
|
index = self._translate_idx(index)
|
|
366
366
|
start, stop, _ = self._field_pos[index]
|
|
@@ -369,17 +369,17 @@ class GenBankFile(TextFile):
|
|
|
369
369
|
shift = stop - start
|
|
370
370
|
for i in range(index, len(self._field_pos)):
|
|
371
371
|
old_start, old_stop, name = self._field_pos[i]
|
|
372
|
-
self._field_pos[i] = old_start-shift, old_stop-shift, name
|
|
373
|
-
del self.lines[start
|
|
372
|
+
self._field_pos[i] = old_start - shift, old_stop - shift, name
|
|
373
|
+
del self.lines[start:stop]
|
|
374
374
|
del self._field_pos[index]
|
|
375
|
-
|
|
375
|
+
|
|
376
376
|
def __len__(self):
|
|
377
377
|
return len(self._field_pos)
|
|
378
378
|
|
|
379
379
|
def insert(self, index, name, content, subfields=None):
|
|
380
380
|
"""
|
|
381
381
|
Insert a *GenBank* field at the given position.
|
|
382
|
-
|
|
382
|
+
|
|
383
383
|
Parameters
|
|
384
384
|
----------
|
|
385
385
|
index : int
|
|
@@ -398,12 +398,12 @@ class GenBankFile(TextFile):
|
|
|
398
398
|
"""
|
|
399
399
|
index = self._translate_idx(index, length_exclusive=False)
|
|
400
400
|
inserted_lines = self._to_lines(name, content, subfields)
|
|
401
|
-
|
|
401
|
+
|
|
402
402
|
# Stop of previous field is start of new field
|
|
403
403
|
if index == 0:
|
|
404
404
|
start = 0
|
|
405
405
|
else:
|
|
406
|
-
_, start, _ = self._field_pos[index-1]
|
|
406
|
+
_, start, _ = self._field_pos[index - 1]
|
|
407
407
|
# If the new lines are not inserted at the end,
|
|
408
408
|
# the following lines need to be added, too
|
|
409
409
|
if start is not len(self.lines):
|
|
@@ -416,17 +416,16 @@ class GenBankFile(TextFile):
|
|
|
416
416
|
shift = len(inserted_lines)
|
|
417
417
|
for i in range(index, len(self._field_pos)):
|
|
418
418
|
old_start, old_stop, fname = self._field_pos[i]
|
|
419
|
-
self._field_pos[i] = old_start+shift, old_stop+shift, fname
|
|
419
|
+
self._field_pos[i] = old_start + shift, old_stop + shift, fname
|
|
420
420
|
# Add new entry
|
|
421
421
|
self._field_pos.insert(
|
|
422
|
-
index,
|
|
423
|
-
(start, start+len(inserted_lines), name.upper())
|
|
422
|
+
index, (start, start + len(inserted_lines), name.upper())
|
|
424
423
|
)
|
|
425
|
-
|
|
424
|
+
|
|
426
425
|
def append(self, name, content, subfields=None):
|
|
427
426
|
"""
|
|
428
427
|
Create a new *GenBank* field at the end of the file.
|
|
429
|
-
|
|
428
|
+
|
|
430
429
|
Parameters
|
|
431
430
|
----------
|
|
432
431
|
name : str
|
|
@@ -440,7 +439,6 @@ class GenBankFile(TextFile):
|
|
|
440
439
|
"""
|
|
441
440
|
self.insert(len(self), name, content, subfields)
|
|
442
441
|
|
|
443
|
-
|
|
444
442
|
def _find_field_indices(self):
|
|
445
443
|
"""
|
|
446
444
|
Identify the start and exclusive stop indices of lines
|
|
@@ -469,10 +467,10 @@ class GenBankFile(TextFile):
|
|
|
469
467
|
|
|
470
468
|
def _get_field_content(self, start, stop, indent):
|
|
471
469
|
if indent == 0:
|
|
472
|
-
return self.lines[start
|
|
470
|
+
return self.lines[start:stop]
|
|
473
471
|
else:
|
|
474
|
-
return [line[12:] for line in self.lines[start
|
|
475
|
-
|
|
472
|
+
return [line[12:] for line in self.lines[start:stop]]
|
|
473
|
+
|
|
476
474
|
def _to_lines(self, name, content, subfields):
|
|
477
475
|
"""
|
|
478
476
|
Convert the field name, field content und subfield dictionary
|
|
@@ -480,22 +478,22 @@ class GenBankFile(TextFile):
|
|
|
480
478
|
"""
|
|
481
479
|
if subfields is None:
|
|
482
480
|
subfields = {}
|
|
483
|
-
|
|
481
|
+
|
|
484
482
|
name = name.strip().upper()
|
|
485
483
|
if len(name) == 0:
|
|
486
|
-
raise ValueError(
|
|
487
|
-
subfields = OrderedDict(
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
484
|
+
raise ValueError("Must give a non emtpy name")
|
|
485
|
+
subfields = OrderedDict(
|
|
486
|
+
{
|
|
487
|
+
subfield_name.upper().strip(): subfield_lines
|
|
488
|
+
for subfield_name, subfield_lines in subfields.items()
|
|
489
|
+
}
|
|
490
|
+
)
|
|
491
|
+
|
|
492
492
|
# Create lines for new field
|
|
493
493
|
if name == "FEATURES":
|
|
494
494
|
# Header line plus all actual feature lines
|
|
495
495
|
lines = copy.copy(content)
|
|
496
|
-
lines.insert(
|
|
497
|
-
0, "FEATURES" + " "*13 + "Location/Qualifiers"
|
|
498
|
-
)
|
|
496
|
+
lines.insert(0, "FEATURES" + " " * 13 + "Location/Qualifiers")
|
|
499
497
|
elif name == "ORIGIN":
|
|
500
498
|
# Header line plus all actual sequence lines
|
|
501
499
|
lines = copy.copy(content)
|
|
@@ -504,19 +502,19 @@ class GenBankFile(TextFile):
|
|
|
504
502
|
name_column = []
|
|
505
503
|
content_column = []
|
|
506
504
|
# Create a line for the field name and empty lines
|
|
507
|
-
# for each additional line required by the content
|
|
508
|
-
name_column += [name] + [""] * (len(content)-1)
|
|
505
|
+
# for each additional line required by the content
|
|
506
|
+
name_column += [name] + [""] * (len(content) - 1)
|
|
509
507
|
content_column += content
|
|
510
508
|
for subfield_name, subfield_lines in subfields.items():
|
|
511
|
-
name_column += [" " + subfield_name]
|
|
512
|
-
+ [""] * (len(subfield_lines)-1)
|
|
509
|
+
name_column += [" " + subfield_name] + [""] * (len(subfield_lines) - 1)
|
|
513
510
|
content_column += subfield_lines
|
|
514
|
-
lines = [
|
|
515
|
-
|
|
516
|
-
|
|
511
|
+
lines = [
|
|
512
|
+
f"{n_col:12}{c_col}"
|
|
513
|
+
for n_col, c_col in zip(name_column, content_column)
|
|
514
|
+
]
|
|
515
|
+
|
|
517
516
|
return lines
|
|
518
517
|
|
|
519
|
-
|
|
520
518
|
def _translate_idx(self, index, length_exclusive=True):
|
|
521
519
|
"""
|
|
522
520
|
Check index boundaries and convert negative index to positive
|
|
@@ -539,15 +537,15 @@ class MultiFile(TextFile):
|
|
|
539
537
|
"""
|
|
540
538
|
This class represents a file in *GenBank* or *GenPept* format,
|
|
541
539
|
that contains multiple entries, for more than one UID.
|
|
542
|
-
|
|
540
|
+
|
|
543
541
|
The information for each UID are appended to each other in such a
|
|
544
542
|
file.
|
|
545
543
|
Objects of this class can be iterated to obtain a
|
|
546
544
|
:class:`GenBankFile` for each entry in the file.
|
|
547
|
-
|
|
545
|
+
|
|
548
546
|
Examples
|
|
549
547
|
--------
|
|
550
|
-
|
|
548
|
+
|
|
551
549
|
>>> import os.path
|
|
552
550
|
>>> file_name = fetch_single_file(
|
|
553
551
|
... ["1L2Y_A", "3O5R_A", "5UGO_A"],
|
|
@@ -568,8 +566,8 @@ class MultiFile(TextFile):
|
|
|
568
566
|
line = self.lines[i]
|
|
569
567
|
if line.strip() == "//":
|
|
570
568
|
# Create file with lines corresponding to that file
|
|
571
|
-
file_content = "\n".join(self.lines[start_i : i+1])
|
|
569
|
+
file_content = "\n".join(self.lines[start_i : i + 1])
|
|
572
570
|
file = GenBankFile.read(io.StringIO(file_content))
|
|
573
571
|
# Reset file start index
|
|
574
572
|
start_i = i
|
|
575
|
-
yield file
|
|
573
|
+
yield file
|
|
@@ -8,17 +8,24 @@ Functions for obtaining metadata fields of a GenBank file.
|
|
|
8
8
|
|
|
9
9
|
__name__ = "biotite.sequence.io.genbank"
|
|
10
10
|
__author__ = "Patrick Kunzmann, Natasha Jaffe"
|
|
11
|
-
__all__ = [
|
|
12
|
-
|
|
13
|
-
|
|
11
|
+
__all__ = [
|
|
12
|
+
"get_locus",
|
|
13
|
+
"get_definition",
|
|
14
|
+
"get_accession",
|
|
15
|
+
"get_version",
|
|
16
|
+
"get_gi",
|
|
17
|
+
"get_db_link",
|
|
18
|
+
"get_source",
|
|
19
|
+
"set_locus",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
from biotite.file import InvalidFileError
|
|
14
23
|
|
|
15
|
-
from ....file import InvalidFileError
|
|
16
|
-
from .file import GenBankFile
|
|
17
24
|
|
|
18
25
|
def get_locus(gb_file):
|
|
19
26
|
"""
|
|
20
27
|
Parse the *LOCUS* field of a GenBank or GenPept file.
|
|
21
|
-
|
|
28
|
+
|
|
22
29
|
Parameters
|
|
23
30
|
----------
|
|
24
31
|
gb_file : GenBankFile
|
|
@@ -39,10 +46,10 @@ def get_locus(gb_file):
|
|
|
39
46
|
The GenBank division to which the file belongs.
|
|
40
47
|
date : str, optional
|
|
41
48
|
The date of last modification.
|
|
42
|
-
|
|
49
|
+
|
|
43
50
|
Examples
|
|
44
51
|
--------
|
|
45
|
-
|
|
52
|
+
|
|
46
53
|
>>> import os.path
|
|
47
54
|
>>> file = GenBankFile.read(os.path.join(path_to_sequences, "ec_bl21.gb"))
|
|
48
55
|
>>> name, length, mol_type, is_circular, division, date = get_locus(file)
|
|
@@ -68,59 +75,57 @@ def get_locus(gb_file):
|
|
|
68
75
|
# The first field will always be the ID
|
|
69
76
|
name = fields[0]
|
|
70
77
|
|
|
71
|
-
# The second field will always be the length followed
|
|
78
|
+
# The second field will always be the length followed
|
|
72
79
|
# by units (eg 1224 aa)
|
|
73
80
|
length = int(fields[1])
|
|
74
81
|
|
|
75
|
-
# The third field *should* be the molecular type
|
|
82
|
+
# The third field *should* be the molecular type
|
|
76
83
|
# but sometimes this is missing. This gets tricky
|
|
77
84
|
# because sometimes the next field, circular/linear,
|
|
78
85
|
# is missing, too. The field after that, division,
|
|
79
86
|
# is a 3 letter all caps token. Unfortunately, mol_type
|
|
80
|
-
# is also often a 3 letter all caps token (eg DNA)!
|
|
87
|
+
# is also often a 3 letter all caps token (eg DNA)!
|
|
81
88
|
# Fortunately, GenBank publishes the set list of divisions
|
|
82
89
|
# here: https://www.ncbi.nlm.nih.gov/genbank/samplerecord ,
|
|
83
90
|
# so we can check against that set when determining whether
|
|
84
91
|
# the current token represents the molecular type.
|
|
85
92
|
divisions = (
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
93
|
+
"PRI", # primate sequences
|
|
94
|
+
"ROD", # rodent sequences
|
|
95
|
+
"MAM", # other mammalian sequences
|
|
96
|
+
"VRT", # other vertebrate sequences
|
|
97
|
+
"INV", # invertebrate sequences
|
|
98
|
+
"PLN", # plant, fungal, and algal sequences
|
|
99
|
+
"BCT", # bacterial sequences
|
|
100
|
+
"VRL", # viral sequences
|
|
101
|
+
"PHG", # bacteriophage sequences
|
|
102
|
+
"SYN", # synthetic sequences
|
|
103
|
+
"UNA", # unannotated sequences
|
|
104
|
+
"EST", # EST sequences (expressed sequence tags)
|
|
105
|
+
"PAT", # patent sequences
|
|
106
|
+
"STS", # STS sequences (sequence tagged sites)
|
|
107
|
+
"GSS", # GSS sequences (genome survey sequences)
|
|
108
|
+
"HTG", # HTG sequences (high-throughput genomic sequences)
|
|
109
|
+
"HTC", # unfinished high-throughput cDNA sequencing
|
|
110
|
+
"ENV", # environmental sampling sequences
|
|
111
|
+
"CON",
|
|
105
112
|
)
|
|
106
113
|
|
|
107
|
-
# NOTE: Remember that fields[2] is the unit for length,
|
|
114
|
+
# NOTE: Remember that fields[2] is the unit for length,
|
|
108
115
|
# eg bp or aa, so we move to fields[3] here.
|
|
109
|
-
if fields[3] not in (
|
|
110
|
-
and fields[3] not in divisions:
|
|
116
|
+
if fields[3] not in ("linear", "circular") and fields[3] not in divisions:
|
|
111
117
|
mol_type = fields[3]
|
|
112
118
|
next_idx = 4
|
|
113
119
|
else:
|
|
114
120
|
mol_type = None
|
|
115
121
|
next_idx = 3
|
|
116
122
|
|
|
117
|
-
|
|
118
|
-
# The next field should be the token 'linear' or 'circular',
|
|
123
|
+
# The next field should be the token 'linear' or 'circular',
|
|
119
124
|
# but sometimes this is missing
|
|
120
|
-
if
|
|
125
|
+
if "linear" == fields[next_idx]:
|
|
121
126
|
is_circular = False
|
|
122
127
|
next_idx += 1
|
|
123
|
-
elif
|
|
128
|
+
elif "circular" == fields[next_idx]:
|
|
124
129
|
is_circular = True
|
|
125
130
|
next_idx += 1
|
|
126
131
|
else:
|
|
@@ -136,23 +141,24 @@ def get_locus(gb_file):
|
|
|
136
141
|
|
|
137
142
|
return name, length, mol_type, is_circular, division, date
|
|
138
143
|
|
|
144
|
+
|
|
139
145
|
def get_definition(gb_file):
|
|
140
146
|
"""
|
|
141
147
|
Parse the *DEFINITION* field of a GenBank or GenPept file.
|
|
142
|
-
|
|
148
|
+
|
|
143
149
|
Parameters
|
|
144
150
|
----------
|
|
145
151
|
gb_file : GenBankFile
|
|
146
152
|
The GenBank file to read the *DEFINITION* field from.
|
|
147
|
-
|
|
153
|
+
|
|
148
154
|
Returns
|
|
149
155
|
-------
|
|
150
156
|
definition : str
|
|
151
157
|
Content of the *DEFINITION* field.
|
|
152
|
-
|
|
158
|
+
|
|
153
159
|
Examples
|
|
154
160
|
--------
|
|
155
|
-
|
|
161
|
+
|
|
156
162
|
>>> import os.path
|
|
157
163
|
>>> file = GenBankFile.read(os.path.join(path_to_sequences, "ec_bl21.gb"))
|
|
158
164
|
>>> print(get_definition(file))
|
|
@@ -161,23 +167,24 @@ def get_definition(gb_file):
|
|
|
161
167
|
lines, _ = _expect_single_field(gb_file, "DEFINITION")
|
|
162
168
|
return " ".join([line.strip() for line in lines])
|
|
163
169
|
|
|
170
|
+
|
|
164
171
|
def get_accession(gb_file):
|
|
165
172
|
"""
|
|
166
173
|
Parse the *ACCESSION* field of a GenBank or GenPept file.
|
|
167
|
-
|
|
174
|
+
|
|
168
175
|
Parameters
|
|
169
176
|
----------
|
|
170
177
|
gb_file : GenBankFile
|
|
171
178
|
The GenBank file to read the *ACCESSION* field from.
|
|
172
|
-
|
|
179
|
+
|
|
173
180
|
Returns
|
|
174
181
|
-------
|
|
175
182
|
accession : str
|
|
176
183
|
The accession ID of the file.
|
|
177
|
-
|
|
184
|
+
|
|
178
185
|
Examples
|
|
179
186
|
--------
|
|
180
|
-
|
|
187
|
+
|
|
181
188
|
>>> import os.path
|
|
182
189
|
>>> file = GenBankFile.read(os.path.join(path_to_sequences, "ec_bl21.gb"))
|
|
183
190
|
>>> print(get_accession(file))
|
|
@@ -187,16 +194,17 @@ def get_accession(gb_file):
|
|
|
187
194
|
# 'ACCESSION' field has only one line
|
|
188
195
|
return lines[0]
|
|
189
196
|
|
|
197
|
+
|
|
190
198
|
def get_version(gb_file):
|
|
191
199
|
"""
|
|
192
200
|
Parse the version from the *VERSION* field of a GenBank or GenPept
|
|
193
201
|
file.
|
|
194
|
-
|
|
202
|
+
|
|
195
203
|
Parameters
|
|
196
204
|
----------
|
|
197
205
|
gb_file : GenBankFile
|
|
198
206
|
The GenBank file to read the *VERSION* field from.
|
|
199
|
-
|
|
207
|
+
|
|
200
208
|
Returns
|
|
201
209
|
-------
|
|
202
210
|
version : str
|
|
@@ -206,16 +214,17 @@ def get_version(gb_file):
|
|
|
206
214
|
# 'VERSION' field has only one line
|
|
207
215
|
return lines[0].split()[0]
|
|
208
216
|
|
|
217
|
+
|
|
209
218
|
def get_gi(gb_file):
|
|
210
219
|
"""
|
|
211
220
|
Parse the GI from the *VERSION* field of a GenBank or GenPept
|
|
212
221
|
file.
|
|
213
|
-
|
|
222
|
+
|
|
214
223
|
Parameters
|
|
215
224
|
----------
|
|
216
225
|
gb_file : GenBankFile
|
|
217
226
|
The GenBank file to read the *VERSION* field from.
|
|
218
|
-
|
|
227
|
+
|
|
219
228
|
Returns
|
|
220
229
|
-------
|
|
221
230
|
gi : str
|
|
@@ -229,24 +238,25 @@ def get_gi(gb_file):
|
|
|
229
238
|
# Truncate GI
|
|
230
239
|
return int(version_info[1][3:])
|
|
231
240
|
|
|
241
|
+
|
|
232
242
|
def get_db_link(gb_file):
|
|
233
243
|
"""
|
|
234
244
|
Parse the *DBLINK* field of a GenBank or GenPept file.
|
|
235
|
-
|
|
245
|
+
|
|
236
246
|
Parameters
|
|
237
247
|
----------
|
|
238
248
|
gb_file : GenBankFile
|
|
239
249
|
The GenBank file to read the *DBLINK* field from.
|
|
240
|
-
|
|
250
|
+
|
|
241
251
|
Returns
|
|
242
252
|
-------
|
|
243
253
|
link_dict : dict
|
|
244
254
|
A dictionary storing the database links, with the database
|
|
245
255
|
name as key, and the corresponding ID as value.
|
|
246
|
-
|
|
256
|
+
|
|
247
257
|
Examples
|
|
248
258
|
--------
|
|
249
|
-
|
|
259
|
+
|
|
250
260
|
>>> import os.path
|
|
251
261
|
>>> file = GenBankFile.read(os.path.join(path_to_sequences, "ec_bl21.gb"))
|
|
252
262
|
>>> for key, val in get_db_link(file).items():
|
|
@@ -265,12 +275,12 @@ def get_db_link(gb_file):
|
|
|
265
275
|
def get_source(gb_file):
|
|
266
276
|
"""
|
|
267
277
|
Parse the *SOURCE* field of a GenBank or GenPept file.
|
|
268
|
-
|
|
278
|
+
|
|
269
279
|
Parameters
|
|
270
280
|
----------
|
|
271
281
|
gb_file : GenBankFile
|
|
272
282
|
The GenBank file to read the *SOURCE* field from.
|
|
273
|
-
|
|
283
|
+
|
|
274
284
|
Returns
|
|
275
285
|
-------
|
|
276
286
|
accession : str
|
|
@@ -290,12 +300,12 @@ def _expect_single_field(gb_file, name):
|
|
|
290
300
|
return fields[0]
|
|
291
301
|
|
|
292
302
|
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
303
|
+
def set_locus(
|
|
304
|
+
gb_file, name, length, mol_type=None, is_circular=False, division=None, date=None
|
|
305
|
+
):
|
|
296
306
|
"""
|
|
297
307
|
Set the *LOCUS* field of a GenBank file.
|
|
298
|
-
|
|
308
|
+
|
|
299
309
|
Parameters
|
|
300
310
|
----------
|
|
301
311
|
gb_file : GenBankFile
|
|
@@ -319,6 +329,8 @@ def set_locus(gb_file, name, length, mol_type=None, is_circular=False,
|
|
|
319
329
|
circularity = "circular" if is_circular else "linear"
|
|
320
330
|
division = "" if division is None else division
|
|
321
331
|
date = "" if date is None else date
|
|
322
|
-
line =
|
|
323
|
-
|
|
324
|
-
|
|
332
|
+
line = (
|
|
333
|
+
f"{name:18} {length:>9} {restype_abbr} {mol_type:^10} "
|
|
334
|
+
f"{circularity:8} {division:3} {date:11}"
|
|
335
|
+
)
|
|
336
|
+
gb_file.set_field("LOCUS", [line])
|