biotite 0.41.2__cp310-cp310-win_amd64.whl → 1.0.0__cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +2 -3
- biotite/application/__init__.py +1 -1
- biotite/application/application.py +20 -10
- biotite/application/autodock/__init__.py +1 -1
- biotite/application/autodock/app.py +74 -79
- biotite/application/blast/__init__.py +1 -1
- biotite/application/blast/alignment.py +19 -10
- biotite/application/blast/webapp.py +92 -85
- biotite/application/clustalo/__init__.py +1 -1
- biotite/application/clustalo/app.py +46 -61
- biotite/application/dssp/__init__.py +1 -1
- biotite/application/dssp/app.py +8 -11
- biotite/application/localapp.py +62 -60
- biotite/application/mafft/__init__.py +1 -1
- biotite/application/mafft/app.py +16 -22
- biotite/application/msaapp.py +78 -89
- biotite/application/muscle/__init__.py +1 -1
- biotite/application/muscle/app3.py +50 -64
- biotite/application/muscle/app5.py +23 -31
- biotite/application/sra/__init__.py +1 -1
- biotite/application/sra/app.py +64 -68
- biotite/application/tantan/__init__.py +1 -1
- biotite/application/tantan/app.py +22 -45
- biotite/application/util.py +7 -9
- biotite/application/viennarna/rnaalifold.py +34 -28
- biotite/application/viennarna/rnafold.py +24 -39
- biotite/application/viennarna/rnaplot.py +36 -21
- biotite/application/viennarna/util.py +17 -12
- biotite/application/webapp.py +13 -14
- biotite/copyable.py +13 -13
- biotite/database/__init__.py +1 -1
- biotite/database/entrez/__init__.py +1 -1
- biotite/database/entrez/check.py +2 -3
- biotite/database/entrez/dbnames.py +7 -5
- biotite/database/entrez/download.py +55 -49
- biotite/database/entrez/key.py +1 -1
- biotite/database/entrez/query.py +62 -23
- biotite/database/error.py +2 -1
- biotite/database/pubchem/__init__.py +1 -1
- biotite/database/pubchem/download.py +43 -45
- biotite/database/pubchem/error.py +2 -2
- biotite/database/pubchem/query.py +34 -31
- biotite/database/pubchem/throttle.py +3 -4
- biotite/database/rcsb/__init__.py +1 -1
- biotite/database/rcsb/download.py +44 -52
- biotite/database/rcsb/query.py +85 -80
- biotite/database/uniprot/check.py +6 -3
- biotite/database/uniprot/download.py +6 -11
- biotite/database/uniprot/query.py +115 -31
- biotite/file.py +12 -31
- biotite/sequence/__init__.py +3 -3
- biotite/sequence/align/__init__.py +2 -2
- biotite/sequence/align/alignment.py +99 -90
- biotite/sequence/align/banded.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/buckets.py +12 -10
- biotite/sequence/align/cigar.py +43 -52
- biotite/sequence/align/kmeralphabet.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/kmeralphabet.pyx +55 -51
- biotite/sequence/align/kmersimilarity.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.pyx +3 -2
- biotite/sequence/align/localgapped.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/localungapped.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/matrix.py +81 -82
- biotite/sequence/align/multiple.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/multiple.pyx +1 -1
- biotite/sequence/align/pairwise.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.pyx +12 -4
- biotite/sequence/align/selector.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.pyx +52 -54
- biotite/sequence/align/statistics.py +32 -33
- biotite/sequence/align/tracetable.cp310-win_amd64.pyd +0 -0
- biotite/sequence/alphabet.py +51 -65
- biotite/sequence/annotation.py +78 -77
- biotite/sequence/codec.cp310-win_amd64.pyd +0 -0
- biotite/sequence/codon.py +90 -79
- biotite/sequence/graphics/__init__.py +1 -1
- biotite/sequence/graphics/alignment.py +184 -103
- biotite/sequence/graphics/colorschemes.py +10 -12
- biotite/sequence/graphics/dendrogram.py +79 -34
- biotite/sequence/graphics/features.py +133 -99
- biotite/sequence/graphics/logo.py +22 -28
- biotite/sequence/graphics/plasmid.py +229 -178
- biotite/sequence/io/fasta/__init__.py +1 -1
- biotite/sequence/io/fasta/convert.py +44 -33
- biotite/sequence/io/fasta/file.py +42 -55
- biotite/sequence/io/fastq/__init__.py +1 -1
- biotite/sequence/io/fastq/convert.py +11 -14
- biotite/sequence/io/fastq/file.py +68 -112
- biotite/sequence/io/genbank/__init__.py +2 -2
- biotite/sequence/io/genbank/annotation.py +12 -20
- biotite/sequence/io/genbank/file.py +74 -76
- biotite/sequence/io/genbank/metadata.py +74 -62
- biotite/sequence/io/genbank/sequence.py +13 -14
- biotite/sequence/io/general.py +39 -30
- biotite/sequence/io/gff/__init__.py +2 -2
- biotite/sequence/io/gff/convert.py +10 -15
- biotite/sequence/io/gff/file.py +81 -65
- biotite/sequence/phylo/__init__.py +1 -1
- biotite/sequence/phylo/nj.cp310-win_amd64.pyd +0 -0
- biotite/sequence/phylo/tree.cp310-win_amd64.pyd +0 -0
- biotite/sequence/phylo/upgma.cp310-win_amd64.pyd +0 -0
- biotite/sequence/profile.py +57 -28
- biotite/sequence/search.py +17 -15
- biotite/sequence/seqtypes.py +200 -164
- biotite/sequence/sequence.py +15 -17
- biotite/structure/__init__.py +3 -3
- biotite/structure/atoms.py +221 -235
- biotite/structure/basepairs.py +260 -271
- biotite/structure/bonds.cp310-win_amd64.pyd +0 -0
- biotite/structure/bonds.pyx +29 -32
- biotite/structure/box.py +67 -71
- biotite/structure/celllist.cp310-win_amd64.pyd +0 -0
- biotite/structure/chains.py +55 -39
- biotite/structure/charges.cp310-win_amd64.pyd +0 -0
- biotite/structure/compare.py +32 -32
- biotite/structure/density.py +13 -18
- biotite/structure/dotbracket.py +20 -22
- biotite/structure/error.py +10 -2
- biotite/structure/filter.py +82 -77
- biotite/structure/geometry.py +130 -119
- biotite/structure/graphics/atoms.py +60 -43
- biotite/structure/graphics/rna.py +81 -68
- biotite/structure/hbond.py +112 -93
- biotite/structure/info/__init__.py +0 -2
- biotite/structure/info/atoms.py +10 -11
- biotite/structure/info/bonds.py +41 -43
- biotite/structure/info/ccd.py +4 -5
- biotite/structure/info/groups.py +1 -3
- biotite/structure/info/masses.py +5 -10
- biotite/structure/info/misc.py +1 -1
- biotite/structure/info/radii.py +20 -20
- biotite/structure/info/standardize.py +15 -26
- biotite/structure/integrity.py +18 -71
- biotite/structure/io/__init__.py +3 -4
- biotite/structure/io/dcd/__init__.py +1 -1
- biotite/structure/io/dcd/file.py +22 -20
- biotite/structure/io/general.py +47 -61
- biotite/structure/io/gro/__init__.py +1 -1
- biotite/structure/io/gro/file.py +73 -72
- biotite/structure/io/mol/__init__.py +1 -1
- biotite/structure/io/mol/convert.py +8 -11
- biotite/structure/io/mol/ctab.py +37 -36
- biotite/structure/io/mol/header.py +14 -10
- biotite/structure/io/mol/mol.py +9 -53
- biotite/structure/io/mol/sdf.py +47 -50
- biotite/structure/io/netcdf/__init__.py +1 -1
- biotite/structure/io/netcdf/file.py +24 -23
- biotite/structure/io/pdb/__init__.py +1 -1
- biotite/structure/io/pdb/convert.py +32 -20
- biotite/structure/io/pdb/file.py +151 -172
- biotite/structure/io/pdb/hybrid36.cp310-win_amd64.pyd +0 -0
- biotite/structure/io/pdbqt/__init__.py +1 -1
- biotite/structure/io/pdbqt/convert.py +17 -11
- biotite/structure/io/pdbqt/file.py +128 -80
- biotite/structure/io/pdbx/__init__.py +1 -2
- biotite/structure/io/pdbx/bcif.py +36 -44
- biotite/structure/io/pdbx/cif.py +64 -62
- biotite/structure/io/pdbx/component.py +10 -16
- biotite/structure/io/pdbx/convert.py +235 -246
- biotite/structure/io/pdbx/encoding.cp310-win_amd64.pyd +0 -0
- biotite/structure/io/trajfile.py +76 -93
- biotite/structure/io/trr/__init__.py +1 -1
- biotite/structure/io/trr/file.py +12 -15
- biotite/structure/io/xtc/__init__.py +1 -1
- biotite/structure/io/xtc/file.py +11 -14
- biotite/structure/mechanics.py +9 -11
- biotite/structure/molecules.py +3 -4
- biotite/structure/pseudoknots.py +53 -67
- biotite/structure/rdf.py +23 -21
- biotite/structure/repair.py +137 -86
- biotite/structure/residues.py +26 -16
- biotite/structure/sasa.cp310-win_amd64.pyd +0 -0
- biotite/structure/{resutil.py → segments.py} +24 -23
- biotite/structure/sequence.py +10 -11
- biotite/structure/sse.py +100 -119
- biotite/structure/superimpose.py +39 -77
- biotite/structure/transform.py +97 -71
- biotite/structure/util.py +11 -13
- biotite/version.py +2 -2
- biotite/visualize.py +69 -55
- {biotite-0.41.2.dist-info → biotite-1.0.0.dist-info}/METADATA +5 -5
- biotite-1.0.0.dist-info/RECORD +322 -0
- biotite/structure/io/ctab.py +0 -72
- biotite/structure/io/mmtf/__init__.py +0 -21
- biotite/structure/io/mmtf/assembly.py +0 -214
- biotite/structure/io/mmtf/convertarray.cp310-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/convertarray.pyx +0 -341
- biotite/structure/io/mmtf/convertfile.cp310-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/convertfile.pyx +0 -501
- biotite/structure/io/mmtf/decode.cp310-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/decode.pyx +0 -152
- biotite/structure/io/mmtf/encode.cp310-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/encode.pyx +0 -183
- biotite/structure/io/mmtf/file.py +0 -233
- biotite/structure/io/npz/__init__.py +0 -20
- biotite/structure/io/npz/file.py +0 -152
- biotite/structure/io/pdbx/legacy.py +0 -267
- biotite/structure/io/tng/__init__.py +0 -13
- biotite/structure/io/tng/file.py +0 -46
- biotite/temp.py +0 -86
- biotite-0.41.2.dist-info/RECORD +0 -340
- {biotite-0.41.2.dist-info → biotite-1.0.0.dist-info}/WHEEL +0 -0
- {biotite-0.41.2.dist-info → biotite-1.0.0.dist-info}/licenses/LICENSE.rst +0 -0
biotite/sequence/seqtypes.py
CHANGED
|
@@ -6,17 +6,16 @@ __name__ = "biotite.sequence"
|
|
|
6
6
|
__author__ = "Patrick Kunzmann", "Thomas Nevolianis"
|
|
7
7
|
__all__ = ["GeneralSequence", "NucleotideSequence", "ProteinSequence"]
|
|
8
8
|
|
|
9
|
-
from .sequence import Sequence
|
|
10
|
-
from .alphabet import LetterAlphabet, AlphabetError, AlphabetMapper
|
|
11
9
|
import numpy as np
|
|
12
|
-
import
|
|
10
|
+
from biotite.sequence.alphabet import AlphabetError, AlphabetMapper, LetterAlphabet
|
|
11
|
+
from biotite.sequence.sequence import Sequence
|
|
13
12
|
|
|
14
13
|
|
|
15
14
|
class GeneralSequence(Sequence):
|
|
16
15
|
"""
|
|
17
16
|
This class allows the creation of a sequence with custom
|
|
18
17
|
:class:`Alphabet` without the need to subclass :class:`Sequence`.
|
|
19
|
-
|
|
18
|
+
|
|
20
19
|
Parameters
|
|
21
20
|
----------
|
|
22
21
|
alphabet : Alphabet
|
|
@@ -27,22 +26,24 @@ class GeneralSequence(Sequence):
|
|
|
27
26
|
may also be a :class:`str` object.
|
|
28
27
|
By default the sequence is empty.
|
|
29
28
|
"""
|
|
30
|
-
|
|
29
|
+
|
|
31
30
|
def __init__(self, alphabet, sequence=()):
|
|
32
31
|
self._alphabet = alphabet
|
|
33
32
|
super().__init__(sequence)
|
|
34
33
|
|
|
35
34
|
def __repr__(self):
|
|
36
35
|
"""Represent GeneralSequence as a string for debugging."""
|
|
37
|
-
return
|
|
38
|
-
|
|
36
|
+
return (
|
|
37
|
+
f"GeneralSequence(Alphabet({self._alphabet}), "
|
|
38
|
+
f"[{', '.join([repr(symbol) for symbol in self.symbols])}])"
|
|
39
|
+
)
|
|
39
40
|
|
|
40
41
|
def __copy_create__(self):
|
|
41
42
|
return GeneralSequence(self._alphabet)
|
|
42
|
-
|
|
43
|
+
|
|
43
44
|
def get_alphabet(self):
|
|
44
45
|
return self._alphabet
|
|
45
|
-
|
|
46
|
+
|
|
46
47
|
def as_type(self, sequence):
|
|
47
48
|
"""
|
|
48
49
|
Convert the :class:`GeneralSequence` into a sequence of another
|
|
@@ -58,12 +59,12 @@ class GeneralSequence(Sequence):
|
|
|
58
59
|
of this object.
|
|
59
60
|
The alphabet must equal or extend the alphabet of this
|
|
60
61
|
object.
|
|
61
|
-
|
|
62
|
+
|
|
62
63
|
Returns
|
|
63
64
|
-------
|
|
64
65
|
sequence : Sequence
|
|
65
66
|
The input `sequence` with replaced sequence code.
|
|
66
|
-
|
|
67
|
+
|
|
67
68
|
Raises
|
|
68
69
|
------
|
|
69
70
|
AlphabetError
|
|
@@ -78,16 +79,17 @@ class GeneralSequence(Sequence):
|
|
|
78
79
|
sequence.code = self.code
|
|
79
80
|
return sequence
|
|
80
81
|
|
|
82
|
+
|
|
81
83
|
class NucleotideSequence(Sequence):
|
|
82
84
|
"""
|
|
83
85
|
Representation of a nucleotide sequence (DNA or RNA).
|
|
84
|
-
|
|
86
|
+
|
|
85
87
|
This class may have one of two different alphabets:
|
|
86
88
|
:attr:`unambiguous_alphabet()` contains only the unambiguous DNA
|
|
87
89
|
letters 'A', 'C', 'G' and 'T'.
|
|
88
|
-
:attr:`ambiguous_alphabet()` uses an extended alphabet for ambiguous
|
|
90
|
+
:attr:`ambiguous_alphabet()` uses an extended alphabet for ambiguous
|
|
89
91
|
letters.
|
|
90
|
-
|
|
92
|
+
|
|
91
93
|
Parameters
|
|
92
94
|
----------
|
|
93
95
|
sequence : iterable object, optional
|
|
@@ -100,35 +102,36 @@ class NucleotideSequence(Sequence):
|
|
|
100
102
|
ambiguous letters in the sequence, the ambiguous alphabet
|
|
101
103
|
is used.
|
|
102
104
|
"""
|
|
103
|
-
|
|
104
|
-
alphabet_unamb = LetterAlphabet(["A","C","G","T"])
|
|
105
|
-
alphabet_amb
|
|
106
|
-
["A","C","G","T","R","Y","W","S",
|
|
107
|
-
"M","K","H","B","V","D","N"]
|
|
105
|
+
|
|
106
|
+
alphabet_unamb = LetterAlphabet(["A", "C", "G", "T"])
|
|
107
|
+
alphabet_amb = LetterAlphabet(
|
|
108
|
+
["A", "C", "G", "T", "R", "Y", "W", "S", "M", "K", "H", "B", "V", "D", "N"]
|
|
108
109
|
)
|
|
109
|
-
|
|
110
|
-
compl_symbol_dict = {
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
110
|
+
|
|
111
|
+
compl_symbol_dict = {
|
|
112
|
+
"A": "T",
|
|
113
|
+
"C": "G",
|
|
114
|
+
"G": "C",
|
|
115
|
+
"T": "A",
|
|
116
|
+
"M": "K",
|
|
117
|
+
"R": "Y",
|
|
118
|
+
"W": "W",
|
|
119
|
+
"S": "S",
|
|
120
|
+
"Y": "R",
|
|
121
|
+
"K": "M",
|
|
122
|
+
"V": "B",
|
|
123
|
+
"H": "D",
|
|
124
|
+
"D": "H",
|
|
125
|
+
"B": "V",
|
|
126
|
+
"N": "N",
|
|
127
|
+
}
|
|
125
128
|
# List comprehension does not work in this scope
|
|
126
129
|
_compl_symbols = []
|
|
127
130
|
for _symbol in alphabet_amb.get_symbols():
|
|
128
131
|
_compl_symbols.append(compl_symbol_dict[_symbol])
|
|
129
132
|
_compl_alphabet_unamb = LetterAlphabet(_compl_symbols)
|
|
130
133
|
_compl_mapper = AlphabetMapper(_compl_alphabet_unamb, alphabet_amb)
|
|
131
|
-
|
|
134
|
+
|
|
132
135
|
def __init__(self, sequence=[], ambiguous=None):
|
|
133
136
|
if isinstance(sequence, str):
|
|
134
137
|
sequence = sequence.upper()
|
|
@@ -164,28 +167,28 @@ class NucleotideSequence(Sequence):
|
|
|
164
167
|
else:
|
|
165
168
|
seq_copy = NucleotideSequence(ambiguous=False)
|
|
166
169
|
return seq_copy
|
|
167
|
-
|
|
170
|
+
|
|
168
171
|
def get_alphabet(self):
|
|
169
172
|
return self._alphabet
|
|
170
|
-
|
|
173
|
+
|
|
171
174
|
def complement(self):
|
|
172
175
|
"""
|
|
173
176
|
Get the complement nucleotide sequence.
|
|
174
|
-
|
|
177
|
+
|
|
175
178
|
Returns
|
|
176
179
|
-------
|
|
177
180
|
complement : NucleotideSequence
|
|
178
181
|
The complement sequence.
|
|
179
|
-
|
|
182
|
+
|
|
180
183
|
Examples
|
|
181
184
|
--------
|
|
182
|
-
|
|
185
|
+
|
|
183
186
|
>>> dna_seq = NucleotideSequence("ACGCTT")
|
|
184
187
|
>>> print(dna_seq.complement())
|
|
185
188
|
TGCGAA
|
|
186
189
|
>>> print(dna_seq.reverse().complement())
|
|
187
190
|
AAGCGT
|
|
188
|
-
|
|
191
|
+
|
|
189
192
|
"""
|
|
190
193
|
# Interpreting the sequence code of this object in the
|
|
191
194
|
# complementary alphabet gives the complementary symbols
|
|
@@ -194,18 +197,18 @@ class NucleotideSequence(Sequence):
|
|
|
194
197
|
# alphabet into the original alphabet
|
|
195
198
|
compl_code = NucleotideSequence._compl_mapper[self.code]
|
|
196
199
|
return self.copy(compl_code)
|
|
197
|
-
|
|
200
|
+
|
|
198
201
|
def translate(self, complete=False, codon_table=None, met_start=False):
|
|
199
202
|
"""
|
|
200
203
|
Translate the nucleotide sequence into a protein sequence.
|
|
201
|
-
|
|
204
|
+
|
|
202
205
|
If `complete` is true, the entire sequence is translated,
|
|
203
206
|
beginning with the first codon and ending with the last codon,
|
|
204
207
|
even if stop codons occur during the translation.
|
|
205
|
-
|
|
208
|
+
|
|
206
209
|
Otherwise this method returns possible ORFs in the
|
|
207
210
|
sequence, even if not stop codon occurs in an ORF.
|
|
208
|
-
|
|
211
|
+
|
|
209
212
|
Parameters
|
|
210
213
|
----------
|
|
211
214
|
complete : bool, optional
|
|
@@ -222,7 +225,7 @@ class NucleotideSequence(Sequence):
|
|
|
222
225
|
Otherwise the translation starts with the amino acid
|
|
223
226
|
the codon codes for. Only applies, if `complete` is false.
|
|
224
227
|
(Default: False)
|
|
225
|
-
|
|
228
|
+
|
|
226
229
|
Returns
|
|
227
230
|
-------
|
|
228
231
|
protein : ProteinSequence or list of ProteinSequence
|
|
@@ -233,15 +236,15 @@ class NucleotideSequence(Sequence):
|
|
|
233
236
|
pos : list of tuple (int, int)
|
|
234
237
|
Is only returned if `complete` is false. The list contains
|
|
235
238
|
a tuple for each ORF.
|
|
236
|
-
The first element of the tuple is the index of the
|
|
239
|
+
The first element of the tuple is the index of the
|
|
237
240
|
:class:`NucleotideSequence`, where the translation starts.
|
|
238
241
|
The second element is the exclusive stop index, it
|
|
239
242
|
represents the first nucleotide in the
|
|
240
243
|
:class:`NucleotideSequence` after a stop codon.
|
|
241
|
-
|
|
244
|
+
|
|
242
245
|
Examples
|
|
243
246
|
--------
|
|
244
|
-
|
|
247
|
+
|
|
245
248
|
>>> dna_seq = NucleotideSequence("AATGATGCTATAGAT")
|
|
246
249
|
>>> prot_seq = dna_seq.translate(complete=True)
|
|
247
250
|
>>> print(prot_seq)
|
|
@@ -251,29 +254,32 @@ class NucleotideSequence(Sequence):
|
|
|
251
254
|
... print(seq)
|
|
252
255
|
MML*
|
|
253
256
|
ML*
|
|
254
|
-
|
|
257
|
+
|
|
255
258
|
"""
|
|
256
259
|
if self._alphabet != NucleotideSequence.alphabet_unamb:
|
|
257
260
|
raise AlphabetError("Translation requires unambiguous alphabet")
|
|
258
261
|
# Determine codon_table
|
|
259
262
|
if codon_table is None:
|
|
260
263
|
# Import at this position to avoid circular import
|
|
261
|
-
from .codon import CodonTable
|
|
264
|
+
from biotite.sequence.codon import CodonTable
|
|
265
|
+
|
|
262
266
|
codon_table = CodonTable.default_table()
|
|
263
|
-
|
|
267
|
+
|
|
264
268
|
if complete:
|
|
265
269
|
if len(self) % 3 != 0:
|
|
266
|
-
raise ValueError(
|
|
267
|
-
|
|
270
|
+
raise ValueError(
|
|
271
|
+
"Sequence length needs to be a multiple of 3 "
|
|
272
|
+
"for complete translation"
|
|
273
|
+
)
|
|
268
274
|
# Reshape code into (n,3), with n being the amount of codons
|
|
269
275
|
codons = self.code.reshape(-1, 3)
|
|
270
276
|
protein_seq = ProteinSequence()
|
|
271
277
|
protein_seq.code = codon_table.map_codon_codes(codons)
|
|
272
278
|
return protein_seq
|
|
273
|
-
|
|
279
|
+
|
|
274
280
|
else:
|
|
275
281
|
stop_code = ProteinSequence.alphabet.encode("*")
|
|
276
|
-
met_code
|
|
282
|
+
met_code = ProteinSequence.alphabet.encode("M")
|
|
277
283
|
protein_seqs = []
|
|
278
284
|
pos = []
|
|
279
285
|
code = self.code
|
|
@@ -282,7 +288,7 @@ class NucleotideSequence(Sequence):
|
|
|
282
288
|
# The frame length is always a multiple of 3
|
|
283
289
|
# If there is a trailing partial codon, remove it
|
|
284
290
|
frame_length = ((len(code) - shift) // 3) * 3
|
|
285
|
-
frame = code[shift : shift+frame_length]
|
|
291
|
+
frame = code[shift : shift + frame_length]
|
|
286
292
|
# Reshape frame into (n,3), with n being the amount of codons
|
|
287
293
|
frame_codons = frame.reshape(-1, 3)
|
|
288
294
|
# At first, translate frame completely
|
|
@@ -297,8 +303,7 @@ class NucleotideSequence(Sequence):
|
|
|
297
303
|
stops = np.where(code_from_start == stop_code)[0]
|
|
298
304
|
# Find first stop codon after start codon
|
|
299
305
|
# Include stop -> stops[0] + 1
|
|
300
|
-
stop_i = stops[0] + 1 if len(stops) > 0
|
|
301
|
-
else len(code_from_start)
|
|
306
|
+
stop_i = stops[0] + 1 if len(stops) > 0 else len(code_from_start)
|
|
302
307
|
code_from_start_to_stop = code_from_start[:stop_i]
|
|
303
308
|
prot_seq = ProteinSequence()
|
|
304
309
|
if met_start:
|
|
@@ -310,13 +315,13 @@ class NucleotideSequence(Sequence):
|
|
|
310
315
|
protein_seqs.append(prot_seq)
|
|
311
316
|
# Codon indices are transformed
|
|
312
317
|
# to nucleotide sequence indices
|
|
313
|
-
pos.append((shift + start_i*3, shift + (start_i+stop_i)*3))
|
|
318
|
+
pos.append((shift + start_i * 3, shift + (start_i + stop_i) * 3))
|
|
314
319
|
# Sort by start position
|
|
315
320
|
order = np.argsort([start for start, stop in pos])
|
|
316
321
|
pos = [pos[i] for i in order]
|
|
317
322
|
protein_seqs = [protein_seqs[i] for i in order]
|
|
318
323
|
return protein_seqs, pos
|
|
319
|
-
|
|
324
|
+
|
|
320
325
|
@staticmethod
|
|
321
326
|
def unambiguous_alphabet():
|
|
322
327
|
"""
|
|
@@ -329,7 +334,7 @@ class NucleotideSequence(Sequence):
|
|
|
329
334
|
The unambiguous nucleotide alphabet.
|
|
330
335
|
"""
|
|
331
336
|
return NucleotideSequence.alphabet_unamb
|
|
332
|
-
|
|
337
|
+
|
|
333
338
|
@staticmethod
|
|
334
339
|
def ambiguous_alphabet():
|
|
335
340
|
"""
|
|
@@ -348,10 +353,10 @@ class NucleotideSequence(Sequence):
|
|
|
348
353
|
class ProteinSequence(Sequence):
|
|
349
354
|
"""
|
|
350
355
|
Representation of a protein sequence.
|
|
351
|
-
|
|
356
|
+
|
|
352
357
|
Furthermore this class offers a conversion of amino acids from
|
|
353
358
|
3-letter code into 1-letter code and vice versa.
|
|
354
|
-
|
|
359
|
+
|
|
355
360
|
Parameters
|
|
356
361
|
----------
|
|
357
362
|
sequence : iterable object, optional
|
|
@@ -359,7 +364,7 @@ class ProteinSequence(Sequence):
|
|
|
359
364
|
string. May take upper or lower case letters. If a list is
|
|
360
365
|
given, the list elements can be 1-letter or 3-letter amino acid
|
|
361
366
|
representations. By default the sequence is empty.
|
|
362
|
-
|
|
367
|
+
|
|
363
368
|
Notes
|
|
364
369
|
-----
|
|
365
370
|
The :class:`Alphabet` of this :class:`Sequence` class does not
|
|
@@ -370,106 +375,138 @@ class ProteinSequence(Sequence):
|
|
|
370
375
|
"""
|
|
371
376
|
|
|
372
377
|
_codon_table = None
|
|
373
|
-
|
|
374
|
-
alphabet = LetterAlphabet(
|
|
375
|
-
|
|
376
|
-
|
|
378
|
+
|
|
379
|
+
alphabet = LetterAlphabet(
|
|
380
|
+
[
|
|
381
|
+
"A",
|
|
382
|
+
"C",
|
|
383
|
+
"D",
|
|
384
|
+
"E",
|
|
385
|
+
"F",
|
|
386
|
+
"G",
|
|
387
|
+
"H",
|
|
388
|
+
"I",
|
|
389
|
+
"K",
|
|
390
|
+
"L",
|
|
391
|
+
"M",
|
|
392
|
+
"N",
|
|
393
|
+
"P",
|
|
394
|
+
"Q",
|
|
395
|
+
"R",
|
|
396
|
+
"S",
|
|
397
|
+
"T",
|
|
398
|
+
"V",
|
|
399
|
+
"W",
|
|
400
|
+
"Y",
|
|
401
|
+
"B",
|
|
402
|
+
"Z",
|
|
403
|
+
"X",
|
|
404
|
+
"*",
|
|
405
|
+
]
|
|
406
|
+
)
|
|
377
407
|
|
|
378
408
|
# Masses are taken from
|
|
379
409
|
# https://web.expasy.org/findmod/findmod_masses.html#AA
|
|
380
410
|
|
|
381
|
-
_mol_weight_average = np.array(
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
411
|
+
_mol_weight_average = np.array(
|
|
412
|
+
[
|
|
413
|
+
71.0788, # A
|
|
414
|
+
103.1388, # C
|
|
415
|
+
115.0886, # D
|
|
416
|
+
129.1155, # E
|
|
417
|
+
147.1766, # F
|
|
418
|
+
57.0519, # G
|
|
419
|
+
137.1411, # H
|
|
420
|
+
113.1594, # I
|
|
421
|
+
128.1741, # K
|
|
422
|
+
113.1594, # L
|
|
423
|
+
131.1926, # M
|
|
424
|
+
114.1038, # N
|
|
425
|
+
97.1167, # P
|
|
426
|
+
128.1307, # Q
|
|
427
|
+
156.1875, # R
|
|
428
|
+
87.0782, # S
|
|
429
|
+
101.1051, # T
|
|
430
|
+
99.1326, # V
|
|
431
|
+
186.2132, # W
|
|
432
|
+
163.1760, # Y
|
|
433
|
+
np.nan, # B
|
|
434
|
+
np.nan, # Z
|
|
435
|
+
np.nan, # X
|
|
436
|
+
np.nan, # *
|
|
437
|
+
]
|
|
438
|
+
)
|
|
439
|
+
|
|
440
|
+
_mol_weight_monoisotopic = np.array(
|
|
441
|
+
[
|
|
442
|
+
71.03711, # A
|
|
443
|
+
103.00919, # C
|
|
444
|
+
115.02694, # D
|
|
445
|
+
129.04259, # E
|
|
446
|
+
147.06841, # F
|
|
447
|
+
57.02146, # G
|
|
448
|
+
137.05891, # H
|
|
449
|
+
113.08406, # I
|
|
450
|
+
128.09496, # K
|
|
451
|
+
113.08406, # L
|
|
452
|
+
131.04049, # M
|
|
453
|
+
114.04293, # N
|
|
454
|
+
97.05276, # P
|
|
455
|
+
128.05858, # Q
|
|
456
|
+
156.10111, # R
|
|
457
|
+
87.03203, # S
|
|
458
|
+
101.04768, # T
|
|
459
|
+
99.06841, # V
|
|
460
|
+
186.07931, # W
|
|
461
|
+
163.06333, # Y
|
|
462
|
+
np.nan, # B
|
|
463
|
+
np.nan, # Z
|
|
464
|
+
np.nan, # X
|
|
465
|
+
np.nan, # *
|
|
466
|
+
]
|
|
467
|
+
)
|
|
468
|
+
|
|
469
|
+
_dict_1to3 = {
|
|
470
|
+
"A": "ALA",
|
|
471
|
+
"C": "CYS",
|
|
472
|
+
"D": "ASP",
|
|
473
|
+
"E": "GLU",
|
|
474
|
+
"F": "PHE",
|
|
475
|
+
"G": "GLY",
|
|
476
|
+
"H": "HIS",
|
|
477
|
+
"I": "ILE",
|
|
478
|
+
"K": "LYS",
|
|
479
|
+
"L": "LEU",
|
|
480
|
+
"M": "MET",
|
|
481
|
+
"N": "ASN",
|
|
482
|
+
"P": "PRO",
|
|
483
|
+
"Q": "GLN",
|
|
484
|
+
"R": "ARG",
|
|
485
|
+
"S": "SER",
|
|
486
|
+
"T": "THR",
|
|
487
|
+
"V": "VAL",
|
|
488
|
+
"W": "TRP",
|
|
489
|
+
"Y": "TYR",
|
|
490
|
+
"B": "ASX",
|
|
491
|
+
"Z": "GLX",
|
|
492
|
+
"X": "UNK",
|
|
493
|
+
"*": " * ",
|
|
494
|
+
}
|
|
495
|
+
|
|
460
496
|
_dict_3to1 = {}
|
|
461
497
|
for _key, _value in _dict_1to3.items():
|
|
462
498
|
_dict_3to1[_value] = _key
|
|
463
499
|
_dict_3to1["SEC"] = "C"
|
|
464
500
|
_dict_3to1["MSE"] = "M"
|
|
465
|
-
|
|
501
|
+
|
|
466
502
|
def __init__(self, sequence=()):
|
|
467
503
|
dict_3to1 = ProteinSequence._dict_3to1
|
|
468
|
-
alph = ProteinSequence.alphabet
|
|
469
504
|
# Convert 3-letter codes to single letter codes,
|
|
470
505
|
# if list contains 3-letter codes
|
|
471
|
-
sequence = [
|
|
472
|
-
|
|
506
|
+
sequence = [
|
|
507
|
+
dict_3to1[symbol.upper()] if len(symbol) == 3 else symbol.upper()
|
|
508
|
+
for symbol in sequence
|
|
509
|
+
]
|
|
473
510
|
super().__init__(sequence)
|
|
474
511
|
|
|
475
512
|
def __repr__(self):
|
|
@@ -478,11 +515,11 @@ class ProteinSequence(Sequence):
|
|
|
478
515
|
|
|
479
516
|
def get_alphabet(self):
|
|
480
517
|
return ProteinSequence.alphabet
|
|
481
|
-
|
|
518
|
+
|
|
482
519
|
def remove_stops(self):
|
|
483
520
|
"""
|
|
484
521
|
Remove *stop signals* from the sequence.
|
|
485
|
-
|
|
522
|
+
|
|
486
523
|
Returns
|
|
487
524
|
-------
|
|
488
525
|
no_stop : ProteinSequence
|
|
@@ -493,34 +530,34 @@ class ProteinSequence(Sequence):
|
|
|
493
530
|
seq_code = no_stop.code
|
|
494
531
|
no_stop.code = seq_code[seq_code != stop_code]
|
|
495
532
|
return no_stop
|
|
496
|
-
|
|
533
|
+
|
|
497
534
|
@staticmethod
|
|
498
535
|
def convert_letter_3to1(symbol):
|
|
499
536
|
"""
|
|
500
537
|
Convert a 3-letter to a 1-letter amino acid representation.
|
|
501
|
-
|
|
538
|
+
|
|
502
539
|
Parameters
|
|
503
540
|
----------
|
|
504
541
|
symbol : string
|
|
505
542
|
3-letter amino acid representation.
|
|
506
|
-
|
|
543
|
+
|
|
507
544
|
Returns
|
|
508
545
|
-------
|
|
509
546
|
convert : string
|
|
510
547
|
1-letter amino acid representation.
|
|
511
548
|
"""
|
|
512
549
|
return ProteinSequence._dict_3to1[symbol.upper()]
|
|
513
|
-
|
|
550
|
+
|
|
514
551
|
@staticmethod
|
|
515
552
|
def convert_letter_1to3(symbol):
|
|
516
553
|
"""
|
|
517
554
|
Convert a 1-letter to a 3-letter amino acid representation.
|
|
518
|
-
|
|
555
|
+
|
|
519
556
|
Parameters
|
|
520
557
|
----------
|
|
521
558
|
symbol : string
|
|
522
559
|
1-letter amino acid representation.
|
|
523
|
-
|
|
560
|
+
|
|
524
561
|
Returns
|
|
525
562
|
-------
|
|
526
563
|
convert : string
|
|
@@ -531,7 +568,7 @@ class ProteinSequence(Sequence):
|
|
|
531
568
|
def get_molecular_weight(self, monoisotopic=False):
|
|
532
569
|
"""
|
|
533
570
|
Calculate the molecular weight of this protein.
|
|
534
|
-
|
|
571
|
+
|
|
535
572
|
Average protein molecular weight is calculated by the addition
|
|
536
573
|
of average isotopic masses of the amino acids
|
|
537
574
|
in the protein and the average isotopic mass of one water
|
|
@@ -550,7 +587,6 @@ class ProteinSequence(Sequence):
|
|
|
550
587
|
|
|
551
588
|
if np.isnan(weight):
|
|
552
589
|
raise ValueError(
|
|
553
|
-
"Sequence contains ambiguous amino acids, "
|
|
554
|
-
"cannot calculate weight"
|
|
590
|
+
"Sequence contains ambiguous amino acids, " "cannot calculate weight"
|
|
555
591
|
)
|
|
556
592
|
return weight
|