biotite 0.41.2__cp310-cp310-macosx_11_0_arm64.whl → 1.0.1__cp310-cp310-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +2 -3
- biotite/application/__init__.py +1 -1
- biotite/application/application.py +20 -10
- biotite/application/autodock/__init__.py +1 -1
- biotite/application/autodock/app.py +74 -79
- biotite/application/blast/__init__.py +1 -1
- biotite/application/blast/alignment.py +19 -10
- biotite/application/blast/webapp.py +92 -85
- biotite/application/clustalo/__init__.py +1 -1
- biotite/application/clustalo/app.py +46 -61
- biotite/application/dssp/__init__.py +1 -1
- biotite/application/dssp/app.py +8 -11
- biotite/application/localapp.py +62 -60
- biotite/application/mafft/__init__.py +1 -1
- biotite/application/mafft/app.py +16 -22
- biotite/application/msaapp.py +78 -89
- biotite/application/muscle/__init__.py +1 -1
- biotite/application/muscle/app3.py +50 -64
- biotite/application/muscle/app5.py +23 -31
- biotite/application/sra/__init__.py +1 -1
- biotite/application/sra/app.py +64 -68
- biotite/application/tantan/__init__.py +1 -1
- biotite/application/tantan/app.py +22 -45
- biotite/application/util.py +7 -9
- biotite/application/viennarna/rnaalifold.py +34 -28
- biotite/application/viennarna/rnafold.py +24 -39
- biotite/application/viennarna/rnaplot.py +36 -21
- biotite/application/viennarna/util.py +17 -12
- biotite/application/webapp.py +13 -14
- biotite/copyable.py +13 -13
- biotite/database/__init__.py +1 -1
- biotite/database/entrez/__init__.py +1 -1
- biotite/database/entrez/check.py +2 -3
- biotite/database/entrez/dbnames.py +7 -5
- biotite/database/entrez/download.py +55 -49
- biotite/database/entrez/key.py +1 -1
- biotite/database/entrez/query.py +62 -23
- biotite/database/error.py +2 -1
- biotite/database/pubchem/__init__.py +1 -1
- biotite/database/pubchem/download.py +43 -45
- biotite/database/pubchem/error.py +2 -2
- biotite/database/pubchem/query.py +34 -31
- biotite/database/pubchem/throttle.py +3 -4
- biotite/database/rcsb/__init__.py +1 -1
- biotite/database/rcsb/download.py +44 -52
- biotite/database/rcsb/query.py +85 -80
- biotite/database/uniprot/check.py +6 -3
- biotite/database/uniprot/download.py +6 -11
- biotite/database/uniprot/query.py +115 -31
- biotite/file.py +12 -31
- biotite/sequence/__init__.py +3 -3
- biotite/sequence/align/__init__.py +2 -2
- biotite/sequence/align/alignment.py +99 -90
- biotite/sequence/align/banded.cpython-310-darwin.so +0 -0
- biotite/sequence/align/buckets.py +12 -10
- biotite/sequence/align/cigar.py +43 -52
- biotite/sequence/align/kmeralphabet.cpython-310-darwin.so +0 -0
- biotite/sequence/align/kmeralphabet.pyx +55 -51
- biotite/sequence/align/kmersimilarity.cpython-310-darwin.so +0 -0
- biotite/sequence/align/kmertable.cpython-310-darwin.so +0 -0
- biotite/sequence/align/kmertable.pyx +3 -2
- biotite/sequence/align/localgapped.cpython-310-darwin.so +0 -0
- biotite/sequence/align/localungapped.cpython-310-darwin.so +0 -0
- biotite/sequence/align/matrix.py +81 -82
- biotite/sequence/align/multiple.cpython-310-darwin.so +0 -0
- biotite/sequence/align/multiple.pyx +1 -1
- biotite/sequence/align/pairwise.cpython-310-darwin.so +0 -0
- biotite/sequence/align/permutation.cpython-310-darwin.so +0 -0
- biotite/sequence/align/permutation.pyx +12 -4
- biotite/sequence/align/selector.cpython-310-darwin.so +0 -0
- biotite/sequence/align/selector.pyx +52 -54
- biotite/sequence/align/statistics.py +32 -33
- biotite/sequence/align/tracetable.cpython-310-darwin.so +0 -0
- biotite/sequence/alphabet.py +51 -65
- biotite/sequence/annotation.py +78 -77
- biotite/sequence/codec.cpython-310-darwin.so +0 -0
- biotite/sequence/codon.py +90 -79
- biotite/sequence/graphics/__init__.py +1 -1
- biotite/sequence/graphics/alignment.py +184 -103
- biotite/sequence/graphics/colorschemes.py +10 -12
- biotite/sequence/graphics/dendrogram.py +79 -34
- biotite/sequence/graphics/features.py +133 -99
- biotite/sequence/graphics/logo.py +22 -28
- biotite/sequence/graphics/plasmid.py +229 -178
- biotite/sequence/io/fasta/__init__.py +1 -1
- biotite/sequence/io/fasta/convert.py +44 -33
- biotite/sequence/io/fasta/file.py +42 -55
- biotite/sequence/io/fastq/__init__.py +1 -1
- biotite/sequence/io/fastq/convert.py +11 -14
- biotite/sequence/io/fastq/file.py +68 -112
- biotite/sequence/io/genbank/__init__.py +2 -2
- biotite/sequence/io/genbank/annotation.py +12 -20
- biotite/sequence/io/genbank/file.py +74 -76
- biotite/sequence/io/genbank/metadata.py +74 -62
- biotite/sequence/io/genbank/sequence.py +13 -14
- biotite/sequence/io/general.py +39 -30
- biotite/sequence/io/gff/__init__.py +2 -2
- biotite/sequence/io/gff/convert.py +10 -15
- biotite/sequence/io/gff/file.py +81 -65
- biotite/sequence/phylo/__init__.py +1 -1
- biotite/sequence/phylo/nj.cpython-310-darwin.so +0 -0
- biotite/sequence/phylo/tree.cpython-310-darwin.so +0 -0
- biotite/sequence/phylo/upgma.cpython-310-darwin.so +0 -0
- biotite/sequence/profile.py +57 -28
- biotite/sequence/search.py +17 -15
- biotite/sequence/seqtypes.py +200 -164
- biotite/sequence/sequence.py +15 -17
- biotite/structure/__init__.py +3 -3
- biotite/structure/atoms.py +246 -236
- biotite/structure/basepairs.py +260 -271
- biotite/structure/bonds.cpython-310-darwin.so +0 -0
- biotite/structure/bonds.pyx +29 -32
- biotite/structure/box.py +67 -71
- biotite/structure/celllist.cpython-310-darwin.so +0 -0
- biotite/structure/chains.py +55 -39
- biotite/structure/charges.cpython-310-darwin.so +0 -0
- biotite/structure/compare.py +32 -32
- biotite/structure/density.py +13 -18
- biotite/structure/dotbracket.py +20 -22
- biotite/structure/error.py +10 -2
- biotite/structure/filter.py +83 -78
- biotite/structure/geometry.py +130 -119
- biotite/structure/graphics/atoms.py +60 -43
- biotite/structure/graphics/rna.py +81 -68
- biotite/structure/hbond.py +112 -93
- biotite/structure/info/__init__.py +0 -2
- biotite/structure/info/atoms.py +10 -11
- biotite/structure/info/bonds.py +41 -43
- biotite/structure/info/ccd.py +4 -5
- biotite/structure/info/groups.py +1 -3
- biotite/structure/info/masses.py +5 -10
- biotite/structure/info/misc.py +1 -1
- biotite/structure/info/radii.py +20 -20
- biotite/structure/info/standardize.py +15 -26
- biotite/structure/integrity.py +18 -71
- biotite/structure/io/__init__.py +3 -4
- biotite/structure/io/dcd/__init__.py +1 -1
- biotite/structure/io/dcd/file.py +22 -20
- biotite/structure/io/general.py +47 -61
- biotite/structure/io/gro/__init__.py +1 -1
- biotite/structure/io/gro/file.py +73 -72
- biotite/structure/io/mol/__init__.py +1 -1
- biotite/structure/io/mol/convert.py +8 -11
- biotite/structure/io/mol/ctab.py +37 -36
- biotite/structure/io/mol/header.py +14 -10
- biotite/structure/io/mol/mol.py +9 -53
- biotite/structure/io/mol/sdf.py +47 -50
- biotite/structure/io/netcdf/__init__.py +1 -1
- biotite/structure/io/netcdf/file.py +24 -23
- biotite/structure/io/pdb/__init__.py +1 -1
- biotite/structure/io/pdb/convert.py +32 -20
- biotite/structure/io/pdb/file.py +151 -172
- biotite/structure/io/pdb/hybrid36.cpython-310-darwin.so +0 -0
- biotite/structure/io/pdbqt/__init__.py +1 -1
- biotite/structure/io/pdbqt/convert.py +17 -11
- biotite/structure/io/pdbqt/file.py +128 -80
- biotite/structure/io/pdbx/__init__.py +1 -2
- biotite/structure/io/pdbx/bcif.py +36 -44
- biotite/structure/io/pdbx/cif.py +140 -110
- biotite/structure/io/pdbx/component.py +10 -16
- biotite/structure/io/pdbx/convert.py +260 -258
- biotite/structure/io/pdbx/encoding.cpython-310-darwin.so +0 -0
- biotite/structure/io/trajfile.py +90 -107
- biotite/structure/io/trr/__init__.py +1 -1
- biotite/structure/io/trr/file.py +12 -15
- biotite/structure/io/xtc/__init__.py +1 -1
- biotite/structure/io/xtc/file.py +11 -14
- biotite/structure/mechanics.py +9 -11
- biotite/structure/molecules.py +3 -4
- biotite/structure/pseudoknots.py +53 -67
- biotite/structure/rdf.py +23 -21
- biotite/structure/repair.py +137 -86
- biotite/structure/residues.py +26 -16
- biotite/structure/sasa.cpython-310-darwin.so +0 -0
- biotite/structure/{resutil.py → segments.py} +24 -23
- biotite/structure/sequence.py +10 -11
- biotite/structure/sse.py +100 -119
- biotite/structure/superimpose.py +39 -77
- biotite/structure/transform.py +97 -71
- biotite/structure/util.py +11 -13
- biotite/version.py +2 -2
- biotite/visualize.py +69 -55
- {biotite-0.41.2.dist-info → biotite-1.0.1.dist-info}/METADATA +6 -5
- biotite-1.0.1.dist-info/RECORD +322 -0
- biotite/structure/io/ctab.py +0 -72
- biotite/structure/io/mmtf/__init__.py +0 -21
- biotite/structure/io/mmtf/assembly.py +0 -214
- biotite/structure/io/mmtf/convertarray.cpython-310-darwin.so +0 -0
- biotite/structure/io/mmtf/convertarray.pyx +0 -341
- biotite/structure/io/mmtf/convertfile.cpython-310-darwin.so +0 -0
- biotite/structure/io/mmtf/convertfile.pyx +0 -501
- biotite/structure/io/mmtf/decode.cpython-310-darwin.so +0 -0
- biotite/structure/io/mmtf/decode.pyx +0 -152
- biotite/structure/io/mmtf/encode.cpython-310-darwin.so +0 -0
- biotite/structure/io/mmtf/encode.pyx +0 -183
- biotite/structure/io/mmtf/file.py +0 -233
- biotite/structure/io/npz/__init__.py +0 -20
- biotite/structure/io/npz/file.py +0 -152
- biotite/structure/io/pdbx/legacy.py +0 -267
- biotite/structure/io/tng/__init__.py +0 -13
- biotite/structure/io/tng/file.py +0 -46
- biotite/temp.py +0 -86
- biotite-0.41.2.dist-info/RECORD +0 -340
- {biotite-0.41.2.dist-info → biotite-1.0.1.dist-info}/WHEEL +0 -0
- {biotite-0.41.2.dist-info → biotite-1.0.1.dist-info}/licenses/LICENSE.rst +0 -0
|
@@ -8,16 +8,19 @@ Functions for converting a sequence from/to a GenBank file.
|
|
|
8
8
|
|
|
9
9
|
__name__ = "biotite.sequence.io.genbank"
|
|
10
10
|
__author__ = "Patrick Kunzmann"
|
|
11
|
-
__all__ = [
|
|
12
|
-
|
|
11
|
+
__all__ = [
|
|
12
|
+
"get_raw_sequence",
|
|
13
|
+
"get_sequence",
|
|
14
|
+
"get_annotated_sequence",
|
|
15
|
+
"set_sequence",
|
|
16
|
+
"set_annotated_sequence",
|
|
17
|
+
]
|
|
13
18
|
|
|
14
19
|
import re
|
|
15
|
-
from
|
|
16
|
-
from
|
|
17
|
-
from
|
|
18
|
-
from .
|
|
19
|
-
from .annotation import get_annotation, set_annotation
|
|
20
|
-
|
|
20
|
+
from biotite.file import InvalidFileError
|
|
21
|
+
from biotite.sequence.annotation import AnnotatedSequence
|
|
22
|
+
from biotite.sequence.io.genbank.annotation import get_annotation, set_annotation
|
|
23
|
+
from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
|
|
21
24
|
|
|
22
25
|
_SYMBOLS_PER_CHUNK = 10
|
|
23
26
|
_SEQ_CHUNKS_PER_LINE = 6
|
|
@@ -112,7 +115,7 @@ def _convert_seq_str(seq_str, format):
|
|
|
112
115
|
if len(seq_str) == 0:
|
|
113
116
|
raise InvalidFileError("The file's 'ORIGIN' field is empty")
|
|
114
117
|
if format == "gb":
|
|
115
|
-
return NucleotideSequence(seq_str.replace("U","T").replace("X","N"))
|
|
118
|
+
return NucleotideSequence(seq_str.replace("U", "T").replace("X", "N"))
|
|
116
119
|
elif format == "gp":
|
|
117
120
|
return ProteinSequence(seq_str.replace("U", "C").replace("O", "K"))
|
|
118
121
|
else:
|
|
@@ -125,8 +128,6 @@ def _get_seq_start(origin_content):
|
|
|
125
128
|
return int(origin_content[0].split()[0])
|
|
126
129
|
|
|
127
130
|
|
|
128
|
-
|
|
129
|
-
|
|
130
131
|
def set_sequence(gb_file, sequence, sequence_start=1):
|
|
131
132
|
"""
|
|
132
133
|
Set the *ORIGIN* field of a GenBank file with a sequence.
|
|
@@ -167,6 +168,4 @@ def set_annotated_sequence(gb_file, annot_sequence):
|
|
|
167
168
|
The annotated sequence that is put into the GenBank file.
|
|
168
169
|
"""
|
|
169
170
|
set_annotation(gb_file, annot_sequence.annotation)
|
|
170
|
-
set_sequence(
|
|
171
|
-
gb_file, annot_sequence.sequence, annot_sequence.sequence_start
|
|
172
|
-
)
|
|
171
|
+
set_sequence(gb_file, annot_sequence.sequence, annot_sequence.sequence_start)
|
biotite/sequence/io/general.py
CHANGED
|
@@ -9,31 +9,27 @@ general sequence files.
|
|
|
9
9
|
|
|
10
10
|
__name__ = "biotite.sequence.io"
|
|
11
11
|
__author__ = "Patrick Kunzmann"
|
|
12
|
-
__all__ = ["load_sequence", "save_sequence",
|
|
13
|
-
"load_sequences", "save_sequences"]
|
|
12
|
+
__all__ = ["load_sequence", "save_sequence", "load_sequences", "save_sequences"]
|
|
14
13
|
|
|
15
|
-
import itertools
|
|
16
14
|
import os.path
|
|
17
|
-
import io
|
|
18
15
|
from collections import OrderedDict
|
|
19
16
|
import numpy as np
|
|
20
|
-
from
|
|
21
|
-
from ..alphabet import Alphabet
|
|
17
|
+
from biotite.sequence.seqtypes import NucleotideSequence
|
|
22
18
|
|
|
23
19
|
|
|
24
20
|
def load_sequence(file_path):
|
|
25
21
|
"""
|
|
26
22
|
Load a sequence from a sequence file without the need
|
|
27
23
|
to manually instantiate a :class:`File` object.
|
|
28
|
-
|
|
24
|
+
|
|
29
25
|
Internally this function uses a :class:`File` object, based on the
|
|
30
26
|
file extension.
|
|
31
|
-
|
|
27
|
+
|
|
32
28
|
Parameters
|
|
33
29
|
----------
|
|
34
30
|
file_path : str
|
|
35
31
|
The path to the sequence file.
|
|
36
|
-
|
|
32
|
+
|
|
37
33
|
Returns
|
|
38
34
|
-------
|
|
39
35
|
sequence : Sequence
|
|
@@ -42,11 +38,13 @@ def load_sequence(file_path):
|
|
|
42
38
|
# We only need the suffix here
|
|
43
39
|
filename, suffix = os.path.splitext(file_path)
|
|
44
40
|
if suffix in [".fasta", ".fa", ".mpfa", ".fna", ".fsa"]:
|
|
45
|
-
from .fasta import FastaFile, get_sequence
|
|
41
|
+
from biotite.sequence.io.fasta import FastaFile, get_sequence
|
|
42
|
+
|
|
46
43
|
file = FastaFile.read(file_path)
|
|
47
44
|
return get_sequence(file)
|
|
48
45
|
elif suffix in [".fastq", ".fq"]:
|
|
49
|
-
from .fastq import FastqFile
|
|
46
|
+
from biotite.sequence.io.fastq import FastqFile
|
|
47
|
+
|
|
50
48
|
# Quality scores are irrelevant for this function
|
|
51
49
|
# -> Offset is irrelevant
|
|
52
50
|
file = FastqFile.read(file_path, offset="Sanger")
|
|
@@ -56,7 +54,8 @@ def load_sequence(file_path):
|
|
|
56
54
|
break
|
|
57
55
|
return sequence
|
|
58
56
|
elif suffix in [".gb", ".gbk", ".gp"]:
|
|
59
|
-
from .genbank import GenBankFile, get_sequence
|
|
57
|
+
from biotite.sequence.io.genbank import GenBankFile, get_sequence
|
|
58
|
+
|
|
60
59
|
format = "gp" if suffix == ".gp" else "gb"
|
|
61
60
|
file = GenBankFile.read(file_path)
|
|
62
61
|
return get_sequence(file, format)
|
|
@@ -68,10 +67,10 @@ def save_sequence(file_path, sequence):
|
|
|
68
67
|
"""
|
|
69
68
|
Save a sequence into a sequence file without the need
|
|
70
69
|
to manually instantiate a :class:`File` object.
|
|
71
|
-
|
|
70
|
+
|
|
72
71
|
Internally this function uses a :class:`File` object, based on the
|
|
73
72
|
given file extension.
|
|
74
|
-
|
|
73
|
+
|
|
75
74
|
Parameters
|
|
76
75
|
----------
|
|
77
76
|
file_path : str
|
|
@@ -82,12 +81,14 @@ def save_sequence(file_path, sequence):
|
|
|
82
81
|
# We only need the suffix here
|
|
83
82
|
filename, suffix = os.path.splitext(file_path)
|
|
84
83
|
if suffix in [".fasta", ".fa", ".mpfa", ".fna", ".fsa"]:
|
|
85
|
-
from .fasta import FastaFile, set_sequence
|
|
84
|
+
from biotite.sequence.io.fasta import FastaFile, set_sequence
|
|
85
|
+
|
|
86
86
|
file = FastaFile()
|
|
87
87
|
set_sequence(file, sequence)
|
|
88
88
|
file.write(file_path)
|
|
89
89
|
elif suffix in [".fastq", ".fq"]:
|
|
90
|
-
from .fastq import FastqFile
|
|
90
|
+
from biotite.sequence.io.fastq import FastqFile
|
|
91
|
+
|
|
91
92
|
# Quality scores are irrelevant for this function
|
|
92
93
|
# -> Offset is irrelevant
|
|
93
94
|
file = FastqFile(offset="Sanger")
|
|
@@ -96,7 +97,8 @@ def save_sequence(file_path, sequence):
|
|
|
96
97
|
file["sequence"] = str(sequence), scores
|
|
97
98
|
file.write(file_path)
|
|
98
99
|
elif suffix in [".gb", ".gbk", ".gp"]:
|
|
99
|
-
from .genbank import GenBankFile, set_locus, set_sequence
|
|
100
|
+
from biotite.sequence.io.genbank import GenBankFile, set_locus, set_sequence
|
|
101
|
+
|
|
100
102
|
file = GenBankFile()
|
|
101
103
|
set_locus(file, "sequence", len(sequence))
|
|
102
104
|
set_sequence(file, sequence)
|
|
@@ -109,37 +111,42 @@ def load_sequences(file_path):
|
|
|
109
111
|
"""
|
|
110
112
|
Load multiple sequences from a sequence file without the need
|
|
111
113
|
to manually instantiate a :class:`File` object.
|
|
112
|
-
|
|
114
|
+
|
|
113
115
|
Internally this function uses a :class:`File` object, based on the
|
|
114
116
|
file extension.
|
|
115
|
-
|
|
117
|
+
|
|
116
118
|
Parameters
|
|
117
119
|
----------
|
|
118
120
|
file_path : str
|
|
119
121
|
The path to the sequence file.
|
|
120
|
-
|
|
122
|
+
|
|
121
123
|
Returns
|
|
122
124
|
-------
|
|
123
125
|
sequences : dict of (str, Sequence)
|
|
124
126
|
The sequences in the file.
|
|
125
127
|
This dictionary maps each header name to
|
|
126
|
-
the respective sequence.
|
|
128
|
+
the respective sequence.
|
|
127
129
|
"""
|
|
128
130
|
# We only need the suffix here
|
|
129
131
|
filename, suffix = os.path.splitext(file_path)
|
|
130
132
|
if suffix in [".fasta", ".fa", ".mpfa", ".fna", ".fsa"]:
|
|
131
|
-
from .fasta import FastaFile, get_sequences
|
|
133
|
+
from biotite.sequence.io.fasta import FastaFile, get_sequences
|
|
134
|
+
|
|
132
135
|
file = FastaFile.read(file_path)
|
|
133
136
|
return get_sequences(file)
|
|
134
137
|
elif suffix in [".fastq", ".fq"]:
|
|
135
|
-
from .fastq import FastqFile
|
|
138
|
+
from biotite.sequence.io.fastq import FastqFile
|
|
139
|
+
|
|
136
140
|
# Quality scores are irrelevant for this function
|
|
137
141
|
# -> Offset is irrelevant
|
|
138
142
|
file = FastqFile.read(file_path, offset="Sanger")
|
|
139
|
-
return {
|
|
140
|
-
|
|
143
|
+
return {
|
|
144
|
+
identifier: NucleotideSequence(seq_str)
|
|
145
|
+
for identifier, (seq_str, scores) in file.items()
|
|
146
|
+
}
|
|
141
147
|
elif suffix in [".gb", ".gbk", ".gp"]:
|
|
142
|
-
from .genbank import MultiFile, get_definition, get_sequence
|
|
148
|
+
from biotite.sequence.io.genbank import MultiFile, get_definition, get_sequence
|
|
149
|
+
|
|
143
150
|
file = MultiFile.read(file_path)
|
|
144
151
|
format = "gp" if suffix == ".gp" else "gb"
|
|
145
152
|
sequences = OrderedDict()
|
|
@@ -154,10 +161,10 @@ def save_sequences(file_path, sequences):
|
|
|
154
161
|
"""
|
|
155
162
|
Save multiple sequences into a sequence file without the need
|
|
156
163
|
to manually instantiate a :class:`File` object.
|
|
157
|
-
|
|
164
|
+
|
|
158
165
|
Internally this function uses a :class:`File` object, based on the
|
|
159
166
|
given file extension.
|
|
160
|
-
|
|
167
|
+
|
|
161
168
|
Parameters
|
|
162
169
|
----------
|
|
163
170
|
file_path : str
|
|
@@ -169,12 +176,14 @@ def save_sequences(file_path, sequences):
|
|
|
169
176
|
# We only need the suffix here
|
|
170
177
|
filename, suffix = os.path.splitext(file_path)
|
|
171
178
|
if suffix in [".fasta", ".fa", ".mpfa", ".fna", ".fsa"]:
|
|
172
|
-
from .fasta import FastaFile, set_sequences
|
|
179
|
+
from biotite.sequence.io.fasta import FastaFile, set_sequences
|
|
180
|
+
|
|
173
181
|
file = FastaFile()
|
|
174
182
|
set_sequences(file, sequences)
|
|
175
183
|
file.write(file_path)
|
|
176
184
|
elif suffix in [".fastq", ".fq"]:
|
|
177
|
-
from .fastq import FastqFile
|
|
185
|
+
from biotite.sequence.io.fastq import FastqFile
|
|
186
|
+
|
|
178
187
|
# Quality scores are irrelevant for this function
|
|
179
188
|
# -> Offset is irrelevant
|
|
180
189
|
file = FastqFile(offset="Sanger")
|
|
@@ -14,7 +14,7 @@ interface to this format, and high-level functions for extracting
|
|
|
14
14
|
GFF 3 files. This means that you cannot directly access the
|
|
15
15
|
parent or child of a feature.
|
|
16
16
|
However, the ``Id`` and ``Name`` attributes are stored in the
|
|
17
|
-
qualifiers of the created :class:`Feature` objects.
|
|
17
|
+
qualifiers of the created :class:`Feature` objects.
|
|
18
18
|
Hence, it is possible to implement such a data structure from this
|
|
19
19
|
information.
|
|
20
20
|
"""
|
|
@@ -22,5 +22,5 @@ interface to this format, and high-level functions for extracting
|
|
|
22
22
|
__name__ = "biotite.sequence.io.gff"
|
|
23
23
|
__author__ = "Patrick Kunzmann"
|
|
24
24
|
|
|
25
|
+
from .convert import *
|
|
25
26
|
from .file import *
|
|
26
|
-
from .convert import *
|
|
@@ -6,7 +6,7 @@ __name__ = "biotite.sequence.io.gff"
|
|
|
6
6
|
__author__ = "Patrick Kunzmann"
|
|
7
7
|
__all__ = ["get_annotation", "set_annotation"]
|
|
8
8
|
|
|
9
|
-
from
|
|
9
|
+
from biotite.sequence.annotation import Annotation, Feature, Location
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
def get_annotation(gff_file):
|
|
@@ -22,12 +22,12 @@ def get_annotation(gff_file):
|
|
|
22
22
|
Thus, for entries with the same ``ID``, the *type* and *attributes*
|
|
23
23
|
are only parsed once and the locations are aggregated from each
|
|
24
24
|
entry.
|
|
25
|
-
|
|
25
|
+
|
|
26
26
|
Parameters
|
|
27
27
|
----------
|
|
28
28
|
gff_file : GFFFile
|
|
29
29
|
The file to extract the :class:`Annotation` object from.
|
|
30
|
-
|
|
30
|
+
|
|
31
31
|
Returns
|
|
32
32
|
-------
|
|
33
33
|
annotation : Annotation
|
|
@@ -45,9 +45,7 @@ def get_annotation(gff_file):
|
|
|
45
45
|
# (beginning of the file)
|
|
46
46
|
if current_key is not None:
|
|
47
47
|
# Beginning of new feature -> Save previous feature
|
|
48
|
-
annot.add_feature(
|
|
49
|
-
Feature(current_key, current_locs, current_qual)
|
|
50
|
-
)
|
|
48
|
+
annot.add_feature(Feature(current_key, current_locs, current_qual))
|
|
51
49
|
# Track new feature
|
|
52
50
|
current_key = type
|
|
53
51
|
current_locs = [Location(start, end, strand)]
|
|
@@ -61,15 +59,14 @@ def get_annotation(gff_file):
|
|
|
61
59
|
return annot
|
|
62
60
|
|
|
63
61
|
|
|
64
|
-
def set_annotation(gff_file, annotation,
|
|
65
|
-
seqid=None, source=None, is_stranded=True):
|
|
62
|
+
def set_annotation(gff_file, annotation, seqid=None, source=None, is_stranded=True):
|
|
66
63
|
"""
|
|
67
64
|
Write an :class:`Annotation` object into a GFF3 file.
|
|
68
65
|
|
|
69
66
|
Each feature will get one entry for each location it has.
|
|
70
67
|
:class:`Feature` objects with multiple locations require the ``ID``
|
|
71
68
|
qualifier in its :attr:`Feature.qual` attribute.
|
|
72
|
-
|
|
69
|
+
|
|
73
70
|
Parameters
|
|
74
71
|
----------
|
|
75
72
|
gff_file : GFFFile
|
|
@@ -87,14 +84,13 @@ def set_annotation(gff_file, annotation,
|
|
|
87
84
|
for feature in sorted(annotation):
|
|
88
85
|
if len(feature.locs) > 1 and "ID" not in feature.qual:
|
|
89
86
|
raise ValueError(
|
|
90
|
-
"The 'Id' qualifier is required "
|
|
91
|
-
"for features with multiple locations"
|
|
87
|
+
"The 'Id' qualifier is required " "for features with multiple locations"
|
|
92
88
|
)
|
|
93
89
|
## seqid ##
|
|
94
90
|
if seqid is not None and " " in seqid:
|
|
95
91
|
raise ValueError("The 'seqid' must not contain whitespaces")
|
|
96
92
|
## source ##
|
|
97
|
-
#Nothing to be done
|
|
93
|
+
# Nothing to be done
|
|
98
94
|
## type ##
|
|
99
95
|
type = feature.key
|
|
100
96
|
## strand ##
|
|
@@ -128,6 +124,5 @@ def set_annotation(gff_file, annotation,
|
|
|
128
124
|
else:
|
|
129
125
|
phase = None
|
|
130
126
|
gff_file.append(
|
|
131
|
-
seqid, source, type, start, end,
|
|
132
|
-
|
|
133
|
-
)
|
|
127
|
+
seqid, source, type, start, end, score, strand, phase, attributes
|
|
128
|
+
)
|
biotite/sequence/io/gff/file.py
CHANGED
|
@@ -6,19 +6,17 @@ __name__ = "biotite.sequence.io.gff"
|
|
|
6
6
|
__author__ = "Patrick Kunzmann"
|
|
7
7
|
__all__ = ["GFFFile"]
|
|
8
8
|
|
|
9
|
-
import copy
|
|
10
9
|
import string
|
|
11
|
-
from urllib.parse import quote, unquote
|
|
12
10
|
import warnings
|
|
13
|
-
from
|
|
14
|
-
from
|
|
15
|
-
|
|
11
|
+
from urllib.parse import quote, unquote
|
|
12
|
+
from biotite.file import InvalidFileError, TextFile
|
|
13
|
+
from biotite.sequence.annotation import Location
|
|
16
14
|
|
|
17
15
|
# All punctuation characters except
|
|
18
16
|
# percent, semicolon, equals, ampersand, comma
|
|
19
|
-
_NOT_QUOTED =
|
|
20
|
-
[char for char in string.punctuation if char not in "%;=&,"]
|
|
21
|
-
)
|
|
17
|
+
_NOT_QUOTED = (
|
|
18
|
+
"".join([char for char in string.punctuation if char not in "%;=&,"]) + " "
|
|
19
|
+
)
|
|
22
20
|
|
|
23
21
|
|
|
24
22
|
class GFFFile(TextFile):
|
|
@@ -61,7 +59,7 @@ class GFFFile(TextFile):
|
|
|
61
59
|
The content after the ``##FASTA`` directive is simply ignored.
|
|
62
60
|
Please provide the sequence via a separate file or read the FASTA
|
|
63
61
|
data directly via the :attr:`lines` attribute:
|
|
64
|
-
|
|
62
|
+
|
|
65
63
|
>>> import os.path
|
|
66
64
|
>>> from io import StringIO
|
|
67
65
|
>>> gff_file = GFFFile.read(os.path.join(path_to_sequences, "indexing_test.gff3"))
|
|
@@ -121,7 +119,7 @@ class GFFFile(TextFile):
|
|
|
121
119
|
##Example directive param1 param2
|
|
122
120
|
SomeSeqID Biotite CDS 1 99 . + 0 ID=FeatureID;product=A protein
|
|
123
121
|
"""
|
|
124
|
-
|
|
122
|
+
|
|
125
123
|
def __init__(self):
|
|
126
124
|
super().__init__()
|
|
127
125
|
# Maps entry indices to line indices
|
|
@@ -132,18 +130,18 @@ class GFFFile(TextFile):
|
|
|
132
130
|
self._has_fasta = None
|
|
133
131
|
self._index_entries()
|
|
134
132
|
self.append_directive("gff-version", "3")
|
|
135
|
-
|
|
133
|
+
|
|
136
134
|
@classmethod
|
|
137
135
|
def read(cls, file):
|
|
138
136
|
"""
|
|
139
137
|
Read a GFF3 file.
|
|
140
|
-
|
|
138
|
+
|
|
141
139
|
Parameters
|
|
142
140
|
----------
|
|
143
141
|
file : file-like object or str
|
|
144
142
|
The file to be read.
|
|
145
143
|
Alternatively a file path can be supplied.
|
|
146
|
-
|
|
144
|
+
|
|
147
145
|
Returns
|
|
148
146
|
-------
|
|
149
147
|
file_object : GFFFile
|
|
@@ -152,18 +150,29 @@ class GFFFile(TextFile):
|
|
|
152
150
|
file = super().read(file)
|
|
153
151
|
file._index_entries()
|
|
154
152
|
return file
|
|
155
|
-
|
|
156
|
-
def insert(
|
|
157
|
-
|
|
153
|
+
|
|
154
|
+
def insert(
|
|
155
|
+
self,
|
|
156
|
+
index,
|
|
157
|
+
seqid,
|
|
158
|
+
source,
|
|
159
|
+
type,
|
|
160
|
+
start,
|
|
161
|
+
end,
|
|
162
|
+
score,
|
|
163
|
+
strand,
|
|
164
|
+
phase,
|
|
165
|
+
attributes=None,
|
|
166
|
+
):
|
|
158
167
|
"""
|
|
159
168
|
Insert an entry at the given index.
|
|
160
|
-
|
|
169
|
+
|
|
161
170
|
Parameters
|
|
162
171
|
----------
|
|
163
172
|
index : int
|
|
164
173
|
Index where the entry is inserted.
|
|
165
174
|
If the index is equal to the length of the file, the entry
|
|
166
|
-
is appended at the end of the file.
|
|
175
|
+
is appended at the end of the file.
|
|
167
176
|
seqid : str
|
|
168
177
|
The ID of the reference sequence.
|
|
169
178
|
source : str
|
|
@@ -184,22 +193,23 @@ class GFFFile(TextFile):
|
|
|
184
193
|
Additional properties of the feature.
|
|
185
194
|
"""
|
|
186
195
|
if index == len(self):
|
|
187
|
-
self.append(
|
|
188
|
-
|
|
196
|
+
self.append(
|
|
197
|
+
seqid, source, type, start, end, score, strand, phase, attributes
|
|
198
|
+
)
|
|
189
199
|
else:
|
|
190
200
|
line_index = self._entries[index]
|
|
191
201
|
line = GFFFile._create_line(
|
|
192
|
-
seqid, source, type, start, end,
|
|
193
|
-
score, strand, phase, attributes
|
|
202
|
+
seqid, source, type, start, end, score, strand, phase, attributes
|
|
194
203
|
)
|
|
195
204
|
self.lines.insert(line_index, line)
|
|
196
205
|
self._index_entries()
|
|
197
|
-
|
|
198
|
-
def append(
|
|
199
|
-
|
|
206
|
+
|
|
207
|
+
def append(
|
|
208
|
+
self, seqid, source, type, start, end, score, strand, phase, attributes=None
|
|
209
|
+
):
|
|
200
210
|
"""
|
|
201
211
|
Append an entry to the end of the file.
|
|
202
|
-
|
|
212
|
+
|
|
203
213
|
Parameters
|
|
204
214
|
----------
|
|
205
215
|
seqid : str
|
|
@@ -232,11 +242,11 @@ class GFFFile(TextFile):
|
|
|
232
242
|
self.lines.append(line)
|
|
233
243
|
# Fast update of entry index by adding last line
|
|
234
244
|
self._entries.append(len(self.lines) - 1)
|
|
235
|
-
|
|
245
|
+
|
|
236
246
|
def append_directive(self, directive, *args):
|
|
237
247
|
"""
|
|
238
248
|
Append a directive line to the end of the file.
|
|
239
|
-
|
|
249
|
+
|
|
240
250
|
Parameters
|
|
241
251
|
----------
|
|
242
252
|
directive : str
|
|
@@ -245,13 +255,13 @@ class GFFFile(TextFile):
|
|
|
245
255
|
Optional parameters for the directive.
|
|
246
256
|
Each argument is simply appended to the directive, separated
|
|
247
257
|
by a single space character.
|
|
248
|
-
|
|
258
|
+
|
|
249
259
|
Raises
|
|
250
260
|
------
|
|
251
261
|
NotImplementedError
|
|
252
262
|
If the ``##FASTA`` directive is used, which is not
|
|
253
263
|
supported.
|
|
254
|
-
|
|
264
|
+
|
|
255
265
|
Examples
|
|
256
266
|
--------
|
|
257
267
|
|
|
@@ -262,17 +272,15 @@ class GFFFile(TextFile):
|
|
|
262
272
|
##Example directive param1 param2
|
|
263
273
|
"""
|
|
264
274
|
if directive.startswith("FASTA"):
|
|
265
|
-
raise NotImplementedError(
|
|
266
|
-
"Adding FASTA information is not supported"
|
|
267
|
-
)
|
|
275
|
+
raise NotImplementedError("Adding FASTA information is not supported")
|
|
268
276
|
directive_line = "##" + directive + " " + " ".join(args)
|
|
269
277
|
self._directives.append((directive_line[2:], len(self.lines)))
|
|
270
278
|
self.lines.append(directive_line)
|
|
271
|
-
|
|
279
|
+
|
|
272
280
|
def directives(self):
|
|
273
281
|
"""
|
|
274
282
|
Get the directives in the file.
|
|
275
|
-
|
|
283
|
+
|
|
276
284
|
Returns
|
|
277
285
|
-------
|
|
278
286
|
directives : list of tuple(str, int)
|
|
@@ -283,7 +291,7 @@ class GFFFile(TextFile):
|
|
|
283
291
|
"""
|
|
284
292
|
# Sort in line order
|
|
285
293
|
return sorted(self._directives, key=lambda directive: directive[1])
|
|
286
|
-
|
|
294
|
+
|
|
287
295
|
def __setitem__(self, index, item):
|
|
288
296
|
seqid, source, type, start, end, score, strand, phase, attrib = item
|
|
289
297
|
line = GFFFile._create_line(
|
|
@@ -292,15 +300,13 @@ class GFFFile(TextFile):
|
|
|
292
300
|
line_index = self._entries[index]
|
|
293
301
|
self.lines[line_index] = line
|
|
294
302
|
|
|
295
|
-
|
|
296
303
|
def __getitem__(self, index):
|
|
297
|
-
if (index >= 0 and
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
+
if (index >= 0 and index >= len(self)) or (index < 0 and -index > len(self)):
|
|
305
|
+
raise IndexError(
|
|
306
|
+
f"Index {index} is out of range for GFFFile with "
|
|
307
|
+
f"{len(self)} entries"
|
|
308
|
+
)
|
|
309
|
+
|
|
304
310
|
line_index = self._entries[index]
|
|
305
311
|
# Columns are tab separated
|
|
306
312
|
s = self.lines[line_index].strip().split("\t")
|
|
@@ -324,15 +330,15 @@ class GFFFile(TextFile):
|
|
|
324
330
|
attrib = GFFFile._parse_attributes(attrib)
|
|
325
331
|
|
|
326
332
|
return seqid, source, type, start, end, score, strand, phase, attrib
|
|
327
|
-
|
|
333
|
+
|
|
328
334
|
def __delitem__(self, index):
|
|
329
335
|
line_index = self._entries[index]
|
|
330
336
|
del self.lines[line_index]
|
|
331
337
|
self._index_entries()
|
|
332
|
-
|
|
338
|
+
|
|
333
339
|
def __len__(self):
|
|
334
340
|
return len(self._entries)
|
|
335
|
-
|
|
341
|
+
|
|
336
342
|
def _index_entries(self):
|
|
337
343
|
"""
|
|
338
344
|
Parse the file for comment and directive lines.
|
|
@@ -374,15 +380,12 @@ class GFFFile(TextFile):
|
|
|
374
380
|
self._entries = self._entries[:entry_counter]
|
|
375
381
|
|
|
376
382
|
@staticmethod
|
|
377
|
-
def _create_line(seqid, source, type, start, end,
|
|
378
|
-
score, strand, phase, attributes):
|
|
383
|
+
def _create_line(seqid, source, type, start, end, score, strand, phase, attributes):
|
|
379
384
|
"""
|
|
380
385
|
Create a line for a newly created entry.
|
|
381
386
|
"""
|
|
382
|
-
seqid = quote(seqid.strip(), safe=_NOT_QUOTED)
|
|
383
|
-
|
|
384
|
-
source = quote(source.strip(), safe=_NOT_QUOTED) \
|
|
385
|
-
if source is not None else "."
|
|
387
|
+
seqid = quote(seqid.strip(), safe=_NOT_QUOTED) if seqid is not None else "."
|
|
388
|
+
source = quote(source.strip(), safe=_NOT_QUOTED) if source is not None else "."
|
|
386
389
|
type = type.strip()
|
|
387
390
|
|
|
388
391
|
# Perform checks
|
|
@@ -394,7 +397,7 @@ class GFFFile(TextFile):
|
|
|
394
397
|
raise ValueError("'type' must not be empty")
|
|
395
398
|
if seqid[0] == ">":
|
|
396
399
|
raise ValueError("'seqid' must not start with '>'")
|
|
397
|
-
|
|
400
|
+
|
|
398
401
|
score = str(score) if score is not None else "."
|
|
399
402
|
if strand == Location.Strand.FORWARD:
|
|
400
403
|
strand = "+"
|
|
@@ -403,16 +406,31 @@ class GFFFile(TextFile):
|
|
|
403
406
|
else:
|
|
404
407
|
strand = "."
|
|
405
408
|
phase = str(phase) if phase is not None else "."
|
|
406
|
-
attributes =
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
409
|
+
attributes = (
|
|
410
|
+
";".join(
|
|
411
|
+
[
|
|
412
|
+
quote(key, safe=_NOT_QUOTED) + "=" + quote(val, safe=_NOT_QUOTED)
|
|
413
|
+
for key, val in attributes.items()
|
|
414
|
+
]
|
|
415
|
+
)
|
|
416
|
+
if attributes is not None and len(attributes) > 0
|
|
417
|
+
else "."
|
|
418
|
+
)
|
|
410
419
|
|
|
411
420
|
return "\t".join(
|
|
412
|
-
[
|
|
413
|
-
|
|
421
|
+
[
|
|
422
|
+
seqid,
|
|
423
|
+
source,
|
|
424
|
+
type,
|
|
425
|
+
str(start),
|
|
426
|
+
str(end),
|
|
427
|
+
str(score),
|
|
428
|
+
strand,
|
|
429
|
+
phase,
|
|
430
|
+
attributes,
|
|
431
|
+
]
|
|
414
432
|
)
|
|
415
|
-
|
|
433
|
+
|
|
416
434
|
@staticmethod
|
|
417
435
|
def _parse_attributes(attributes):
|
|
418
436
|
"""
|
|
@@ -426,9 +444,7 @@ class GFFFile(TextFile):
|
|
|
426
444
|
for entry in attrib_entries:
|
|
427
445
|
compounds = entry.split("=")
|
|
428
446
|
if len(compounds) != 2:
|
|
429
|
-
raise InvalidFileError(
|
|
430
|
-
f"Attribute entry '{entry}' is invalid"
|
|
431
|
-
)
|
|
447
|
+
raise InvalidFileError(f"Attribute entry '{entry}' is invalid")
|
|
432
448
|
key, val = compounds
|
|
433
449
|
attrib_dict[unquote(key)] = unquote(val)
|
|
434
|
-
return attrib_dict
|
|
450
|
+
return attrib_dict
|
|
Binary file
|
|
Binary file
|
|
Binary file
|