biotite 0.41.1__cp311-cp311-win_amd64.whl → 1.0.0__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +2 -3
- biotite/application/__init__.py +36 -10
- biotite/application/application.py +22 -11
- biotite/application/autodock/__init__.py +1 -1
- biotite/application/autodock/app.py +74 -79
- biotite/application/blast/__init__.py +1 -1
- biotite/application/blast/alignment.py +19 -10
- biotite/application/blast/webapp.py +92 -85
- biotite/application/clustalo/__init__.py +1 -1
- biotite/application/clustalo/app.py +46 -61
- biotite/application/dssp/__init__.py +1 -1
- biotite/application/dssp/app.py +8 -11
- biotite/application/localapp.py +62 -60
- biotite/application/mafft/__init__.py +1 -1
- biotite/application/mafft/app.py +16 -22
- biotite/application/msaapp.py +78 -89
- biotite/application/muscle/__init__.py +1 -1
- biotite/application/muscle/app3.py +50 -64
- biotite/application/muscle/app5.py +23 -31
- biotite/application/sra/__init__.py +1 -1
- biotite/application/sra/app.py +64 -68
- biotite/application/tantan/__init__.py +1 -1
- biotite/application/tantan/app.py +22 -45
- biotite/application/util.py +7 -9
- biotite/application/viennarna/rnaalifold.py +34 -28
- biotite/application/viennarna/rnafold.py +24 -39
- biotite/application/viennarna/rnaplot.py +36 -21
- biotite/application/viennarna/util.py +17 -12
- biotite/application/webapp.py +13 -14
- biotite/copyable.py +13 -13
- biotite/database/__init__.py +1 -1
- biotite/database/entrez/__init__.py +1 -1
- biotite/database/entrez/check.py +2 -3
- biotite/database/entrez/dbnames.py +7 -5
- biotite/database/entrez/download.py +55 -49
- biotite/database/entrez/key.py +1 -1
- biotite/database/entrez/query.py +62 -23
- biotite/database/error.py +2 -1
- biotite/database/pubchem/__init__.py +1 -1
- biotite/database/pubchem/download.py +43 -45
- biotite/database/pubchem/error.py +2 -2
- biotite/database/pubchem/query.py +34 -31
- biotite/database/pubchem/throttle.py +3 -4
- biotite/database/rcsb/__init__.py +1 -1
- biotite/database/rcsb/download.py +44 -52
- biotite/database/rcsb/query.py +85 -80
- biotite/database/uniprot/check.py +6 -3
- biotite/database/uniprot/download.py +6 -11
- biotite/database/uniprot/query.py +115 -31
- biotite/file.py +12 -31
- biotite/sequence/__init__.py +16 -5
- biotite/sequence/align/__init__.py +160 -6
- biotite/sequence/align/alignment.py +99 -90
- biotite/sequence/align/banded.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/buckets.py +12 -10
- biotite/sequence/align/cigar.py +43 -52
- biotite/sequence/align/kmeralphabet.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmeralphabet.pyx +55 -51
- biotite/sequence/align/kmersimilarity.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.pyx +3 -2
- biotite/sequence/align/localgapped.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/localungapped.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/matrix.py +81 -82
- biotite/sequence/align/multiple.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/multiple.pyx +35 -35
- biotite/sequence/align/pairwise.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.pyx +12 -4
- biotite/sequence/align/selector.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.pyx +52 -54
- biotite/sequence/align/statistics.py +32 -33
- biotite/sequence/align/tracetable.cp311-win_amd64.pyd +0 -0
- biotite/sequence/alphabet.py +112 -126
- biotite/sequence/annotation.py +78 -77
- biotite/sequence/codec.cp311-win_amd64.pyd +0 -0
- biotite/sequence/codon.py +90 -79
- biotite/sequence/graphics/__init__.py +1 -1
- biotite/sequence/graphics/alignment.py +184 -103
- biotite/sequence/graphics/colorschemes.py +10 -12
- biotite/sequence/graphics/dendrogram.py +79 -34
- biotite/sequence/graphics/features.py +133 -99
- biotite/sequence/graphics/logo.py +22 -28
- biotite/sequence/graphics/plasmid.py +229 -178
- biotite/sequence/io/fasta/__init__.py +1 -1
- biotite/sequence/io/fasta/convert.py +44 -33
- biotite/sequence/io/fasta/file.py +42 -55
- biotite/sequence/io/fastq/__init__.py +1 -1
- biotite/sequence/io/fastq/convert.py +11 -14
- biotite/sequence/io/fastq/file.py +68 -112
- biotite/sequence/io/genbank/__init__.py +2 -2
- biotite/sequence/io/genbank/annotation.py +12 -20
- biotite/sequence/io/genbank/file.py +74 -76
- biotite/sequence/io/genbank/metadata.py +74 -62
- biotite/sequence/io/genbank/sequence.py +13 -14
- biotite/sequence/io/general.py +39 -30
- biotite/sequence/io/gff/__init__.py +2 -2
- biotite/sequence/io/gff/convert.py +10 -15
- biotite/sequence/io/gff/file.py +81 -65
- biotite/sequence/phylo/__init__.py +1 -1
- biotite/sequence/phylo/nj.cp311-win_amd64.pyd +0 -0
- biotite/sequence/phylo/tree.cp311-win_amd64.pyd +0 -0
- biotite/sequence/phylo/upgma.cp311-win_amd64.pyd +0 -0
- biotite/sequence/profile.py +57 -28
- biotite/sequence/search.py +17 -15
- biotite/sequence/seqtypes.py +200 -164
- biotite/sequence/sequence.py +64 -64
- biotite/structure/__init__.py +3 -3
- biotite/structure/atoms.py +226 -240
- biotite/structure/basepairs.py +260 -271
- biotite/structure/bonds.cp311-win_amd64.pyd +0 -0
- biotite/structure/bonds.pyx +88 -100
- biotite/structure/box.py +67 -71
- biotite/structure/celllist.cp311-win_amd64.pyd +0 -0
- biotite/structure/chains.py +55 -39
- biotite/structure/charges.cp311-win_amd64.pyd +0 -0
- biotite/structure/compare.py +32 -32
- biotite/structure/density.py +13 -18
- biotite/structure/dotbracket.py +20 -22
- biotite/structure/error.py +10 -2
- biotite/structure/filter.py +82 -77
- biotite/structure/geometry.py +130 -119
- biotite/structure/graphics/atoms.py +60 -43
- biotite/structure/graphics/rna.py +81 -68
- biotite/structure/hbond.py +112 -93
- biotite/structure/info/__init__.py +0 -2
- biotite/structure/info/atoms.py +10 -11
- biotite/structure/info/bonds.py +41 -43
- biotite/structure/info/ccd.py +21 -7
- biotite/structure/info/groups.py +10 -15
- biotite/structure/info/masses.py +5 -10
- biotite/structure/info/misc.py +1 -1
- biotite/structure/info/radii.py +20 -20
- biotite/structure/info/standardize.py +15 -26
- biotite/structure/integrity.py +18 -71
- biotite/structure/io/__init__.py +3 -4
- biotite/structure/io/dcd/__init__.py +1 -1
- biotite/structure/io/dcd/file.py +22 -20
- biotite/structure/io/general.py +47 -61
- biotite/structure/io/gro/__init__.py +1 -1
- biotite/structure/io/gro/file.py +73 -72
- biotite/structure/io/mol/__init__.py +1 -1
- biotite/structure/io/mol/convert.py +8 -11
- biotite/structure/io/mol/ctab.py +37 -36
- biotite/structure/io/mol/header.py +14 -10
- biotite/structure/io/mol/mol.py +9 -53
- biotite/structure/io/mol/sdf.py +47 -50
- biotite/structure/io/netcdf/__init__.py +1 -1
- biotite/structure/io/netcdf/file.py +24 -23
- biotite/structure/io/pdb/__init__.py +1 -1
- biotite/structure/io/pdb/convert.py +32 -20
- biotite/structure/io/pdb/file.py +151 -172
- biotite/structure/io/pdb/hybrid36.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/pdbqt/__init__.py +1 -1
- biotite/structure/io/pdbqt/convert.py +17 -11
- biotite/structure/io/pdbqt/file.py +128 -80
- biotite/structure/io/pdbx/__init__.py +1 -2
- biotite/structure/io/pdbx/bcif.py +36 -52
- biotite/structure/io/pdbx/cif.py +64 -62
- biotite/structure/io/pdbx/component.py +10 -16
- biotite/structure/io/pdbx/convert.py +235 -246
- biotite/structure/io/pdbx/encoding.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/trajfile.py +76 -93
- biotite/structure/io/trr/__init__.py +1 -1
- biotite/structure/io/trr/file.py +12 -15
- biotite/structure/io/xtc/__init__.py +1 -1
- biotite/structure/io/xtc/file.py +11 -14
- biotite/structure/mechanics.py +9 -11
- biotite/structure/molecules.py +3 -4
- biotite/structure/pseudoknots.py +53 -67
- biotite/structure/rdf.py +23 -21
- biotite/structure/repair.py +137 -86
- biotite/structure/residues.py +26 -16
- biotite/structure/sasa.cp311-win_amd64.pyd +0 -0
- biotite/structure/{resutil.py → segments.py} +24 -23
- biotite/structure/sequence.py +10 -11
- biotite/structure/sse.py +100 -119
- biotite/structure/superimpose.py +39 -77
- biotite/structure/transform.py +97 -71
- biotite/structure/util.py +11 -13
- biotite/version.py +2 -2
- biotite/visualize.py +69 -55
- {biotite-0.41.1.dist-info → biotite-1.0.0.dist-info}/METADATA +6 -6
- biotite-1.0.0.dist-info/RECORD +322 -0
- {biotite-0.41.1.dist-info → biotite-1.0.0.dist-info}/WHEEL +1 -1
- biotite/structure/io/ctab.py +0 -72
- biotite/structure/io/mmtf/__init__.py +0 -21
- biotite/structure/io/mmtf/assembly.py +0 -214
- biotite/structure/io/mmtf/convertarray.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/convertarray.pyx +0 -341
- biotite/structure/io/mmtf/convertfile.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/convertfile.pyx +0 -501
- biotite/structure/io/mmtf/decode.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/decode.pyx +0 -152
- biotite/structure/io/mmtf/encode.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/encode.pyx +0 -183
- biotite/structure/io/mmtf/file.py +0 -233
- biotite/structure/io/npz/__init__.py +0 -20
- biotite/structure/io/npz/file.py +0 -152
- biotite/structure/io/pdbx/legacy.py +0 -267
- biotite/structure/io/tng/__init__.py +0 -13
- biotite/structure/io/tng/file.py +0 -46
- biotite/temp.py +0 -86
- biotite-0.41.1.dist-info/RECORD +0 -340
- {biotite-0.41.1.dist-info → biotite-1.0.0.dist-info}/licenses/LICENSE.rst +0 -0
biotite/sequence/alphabet.py
CHANGED
|
@@ -4,58 +4,62 @@
|
|
|
4
4
|
|
|
5
5
|
__name__ = "biotite.sequence"
|
|
6
6
|
__author__ = "Patrick Kunzmann"
|
|
7
|
-
__all__ = [
|
|
8
|
-
|
|
7
|
+
__all__ = [
|
|
8
|
+
"Alphabet",
|
|
9
|
+
"LetterAlphabet",
|
|
10
|
+
"AlphabetMapper",
|
|
11
|
+
"AlphabetError",
|
|
12
|
+
"common_alphabet",
|
|
13
|
+
]
|
|
9
14
|
|
|
10
|
-
import copy
|
|
11
|
-
from numbers import Integral
|
|
12
15
|
import string
|
|
16
|
+
from numbers import Integral
|
|
13
17
|
import numpy as np
|
|
14
|
-
from .codec import
|
|
18
|
+
from biotite.sequence.codec import decode_to_chars, encode_chars, map_sequence_code
|
|
15
19
|
|
|
16
20
|
|
|
17
21
|
class Alphabet(object):
|
|
18
22
|
"""
|
|
19
23
|
This class defines the allowed symbols for a :class:`Sequence` and
|
|
20
24
|
handles the encoding/decoding between symbols and symbol codes.
|
|
21
|
-
|
|
25
|
+
|
|
22
26
|
An :class:`Alphabet` is created with the list of symbols, that can
|
|
23
27
|
be used in this context.
|
|
24
28
|
In most cases a symbol will be simply a letter, hence a string of
|
|
25
29
|
length 1. But in principle every hashable Python object can serve
|
|
26
30
|
as symbol.
|
|
27
|
-
|
|
31
|
+
|
|
28
32
|
The encoding of a symbol into a symbol code is
|
|
29
33
|
done in the following way: Find the first index in the symbol list,
|
|
30
34
|
where the list element equals the symbol. This index is the
|
|
31
35
|
symbol code. If the symbol is not found in the list, an
|
|
32
36
|
:class:`AlphabetError` is raised.
|
|
33
|
-
|
|
37
|
+
|
|
34
38
|
Internally, a dictionary is used for encoding, with symbols as keys
|
|
35
39
|
and symbol codes as values. Therefore, every symbol must be
|
|
36
40
|
hashable. For decoding the symbol list is indexed with the symbol
|
|
37
41
|
code.
|
|
38
|
-
|
|
42
|
+
|
|
39
43
|
If an alphabet *1* contains the same symbols and the same
|
|
40
44
|
symbol-code-mappings like another alphabet *2*, but alphabet *1*
|
|
41
45
|
introduces also new symbols, then alphabet *1* *extends* alphabet
|
|
42
46
|
*2*.
|
|
43
47
|
Per definition, every alphabet also extends itself.
|
|
44
|
-
|
|
48
|
+
|
|
45
49
|
Objects of this class are immutable.
|
|
46
|
-
|
|
50
|
+
|
|
47
51
|
Parameters
|
|
48
52
|
----------
|
|
49
53
|
symbols : iterable object
|
|
50
54
|
The symbols, that are allowed in this alphabet. The
|
|
51
55
|
corresponding code for a symbol, is the index of that symbol
|
|
52
56
|
in this list.
|
|
53
|
-
|
|
57
|
+
|
|
54
58
|
Examples
|
|
55
59
|
--------
|
|
56
60
|
Create an Alphabet containing DNA letters and encode/decode a
|
|
57
61
|
letter/code:
|
|
58
|
-
|
|
62
|
+
|
|
59
63
|
>>> alph = Alphabet(["A","C","G","T"])
|
|
60
64
|
>>> print(alph.encode("G"))
|
|
61
65
|
2
|
|
@@ -66,9 +70,9 @@ class Alphabet(object):
|
|
|
66
70
|
... except Exception as e:
|
|
67
71
|
... print(e)
|
|
68
72
|
Symbol 'foo' is not in the alphabet
|
|
69
|
-
|
|
73
|
+
|
|
70
74
|
Create an Alphabet of arbitrary objects:
|
|
71
|
-
|
|
75
|
+
|
|
72
76
|
>>> alph = Alphabet(["foo", 42, (1,2,3), 5, 3.141])
|
|
73
77
|
>>> print(alph.encode((1,2,3)))
|
|
74
78
|
2
|
|
@@ -77,53 +81,53 @@ class Alphabet(object):
|
|
|
77
81
|
|
|
78
82
|
On the subject of alphabet extension:
|
|
79
83
|
An alphabet always extends itself.
|
|
80
|
-
|
|
84
|
+
|
|
81
85
|
>>> Alphabet(["A","C","G","T"]).extends(Alphabet(["A","C","G","T"]))
|
|
82
86
|
True
|
|
83
87
|
|
|
84
88
|
An alphabet extends an alphabet when it contains additional symbols...
|
|
85
|
-
|
|
89
|
+
|
|
86
90
|
>>> Alphabet(["A","C","G","T","U"]).extends(Alphabet(["A","C","G","T"]))
|
|
87
91
|
True
|
|
88
|
-
|
|
92
|
+
|
|
89
93
|
...but not vice versa
|
|
90
|
-
|
|
94
|
+
|
|
91
95
|
>>> Alphabet(["A","C","G","T"]).extends(Alphabet(["A","C","G","T","U"]))
|
|
92
96
|
False
|
|
93
|
-
|
|
97
|
+
|
|
94
98
|
Two alphabets with same symbols but different symbol-code-mappings
|
|
95
|
-
|
|
96
|
-
>>> Alphabet(["A","C","G","T"]).extends(Alphabet(["A","C","T","G"]))
|
|
99
|
+
|
|
100
|
+
>>> Alphabet(["A","C","G","T"]).extends(Alphabet(["A","C","T","G"]))
|
|
97
101
|
False
|
|
98
102
|
"""
|
|
99
|
-
|
|
103
|
+
|
|
100
104
|
def __init__(self, symbols):
|
|
101
105
|
if len(symbols) == 0:
|
|
102
106
|
raise ValueError("Symbol list is empty")
|
|
103
|
-
self._symbols =
|
|
107
|
+
self._symbols = tuple(symbols)
|
|
104
108
|
self._symbol_dict = {}
|
|
105
109
|
for i, symbol in enumerate(symbols):
|
|
106
110
|
self._symbol_dict[symbol] = i
|
|
107
111
|
|
|
108
112
|
def __repr__(self):
|
|
109
113
|
"""Represent Alphabet as a string for debugging."""
|
|
110
|
-
return f
|
|
114
|
+
return f"Alphabet({self._symbols})"
|
|
111
115
|
|
|
112
116
|
def get_symbols(self):
|
|
113
117
|
"""
|
|
114
118
|
Get the symbols in the alphabet.
|
|
115
|
-
|
|
119
|
+
|
|
116
120
|
Returns
|
|
117
121
|
-------
|
|
118
|
-
symbols :
|
|
119
|
-
|
|
122
|
+
symbols : tuple
|
|
123
|
+
The symbols.
|
|
120
124
|
"""
|
|
121
|
-
return
|
|
122
|
-
|
|
125
|
+
return self._symbols
|
|
126
|
+
|
|
123
127
|
def extends(self, alphabet):
|
|
124
128
|
"""
|
|
125
129
|
Check, if this alphabet extends another alphabet.
|
|
126
|
-
|
|
130
|
+
|
|
127
131
|
Parameters
|
|
128
132
|
----------
|
|
129
133
|
alphabet : Alphabet
|
|
@@ -139,23 +143,22 @@ class Alphabet(object):
|
|
|
139
143
|
elif len(alphabet) > len(self):
|
|
140
144
|
return False
|
|
141
145
|
else:
|
|
142
|
-
return alphabet.get_symbols()
|
|
143
|
-
|
|
144
|
-
|
|
146
|
+
return alphabet.get_symbols() == self.get_symbols()[: len(alphabet)]
|
|
147
|
+
|
|
145
148
|
def encode(self, symbol):
|
|
146
149
|
"""
|
|
147
150
|
Use the alphabet to encode a symbol.
|
|
148
|
-
|
|
151
|
+
|
|
149
152
|
Parameters
|
|
150
153
|
----------
|
|
151
154
|
symbol : object
|
|
152
155
|
The object to encode into a symbol code.
|
|
153
|
-
|
|
156
|
+
|
|
154
157
|
Returns
|
|
155
158
|
-------
|
|
156
159
|
code : int
|
|
157
160
|
The symbol code of `symbol`.
|
|
158
|
-
|
|
161
|
+
|
|
159
162
|
Raises
|
|
160
163
|
------
|
|
161
164
|
AlphabetError
|
|
@@ -164,24 +167,22 @@ class Alphabet(object):
|
|
|
164
167
|
try:
|
|
165
168
|
return self._symbol_dict[symbol]
|
|
166
169
|
except KeyError:
|
|
167
|
-
raise AlphabetError(
|
|
168
|
-
|
|
169
|
-
)
|
|
170
|
-
|
|
170
|
+
raise AlphabetError(f"Symbol {repr(symbol)} is not in the alphabet")
|
|
171
|
+
|
|
171
172
|
def decode(self, code):
|
|
172
173
|
"""
|
|
173
174
|
Use the alphabet to decode a symbol code.
|
|
174
|
-
|
|
175
|
+
|
|
175
176
|
Parameters
|
|
176
177
|
----------
|
|
177
178
|
code : int
|
|
178
179
|
The symbol code to be decoded.
|
|
179
|
-
|
|
180
|
+
|
|
180
181
|
Returns
|
|
181
182
|
-------
|
|
182
183
|
symbol : object
|
|
183
184
|
The symbol corresponding to `code`.
|
|
184
|
-
|
|
185
|
+
|
|
185
186
|
Raises
|
|
186
187
|
------
|
|
187
188
|
AlphabetError
|
|
@@ -190,41 +191,41 @@ class Alphabet(object):
|
|
|
190
191
|
if code < 0 or code >= len(self._symbols):
|
|
191
192
|
raise AlphabetError(f"'{code:d}' is not a valid code")
|
|
192
193
|
return self._symbols[code]
|
|
193
|
-
|
|
194
|
+
|
|
194
195
|
def encode_multiple(self, symbols, dtype=np.int64):
|
|
195
196
|
"""
|
|
196
197
|
Encode a list of symbols.
|
|
197
|
-
|
|
198
|
+
|
|
198
199
|
Parameters
|
|
199
200
|
----------
|
|
200
201
|
symbols : array-like
|
|
201
202
|
The symbols to encode.
|
|
202
203
|
dtype : dtype, optional
|
|
203
204
|
The dtype of the output ndarray. (Default: `int64`)
|
|
204
|
-
|
|
205
|
+
|
|
205
206
|
Returns
|
|
206
207
|
-------
|
|
207
208
|
code : ndarray
|
|
208
209
|
The sequence code.
|
|
209
210
|
"""
|
|
210
211
|
return np.array([self.encode(e) for e in symbols], dtype=dtype)
|
|
211
|
-
|
|
212
|
+
|
|
212
213
|
def decode_multiple(self, code):
|
|
213
214
|
"""
|
|
214
215
|
Decode a sequence code into a list of symbols.
|
|
215
|
-
|
|
216
|
+
|
|
216
217
|
Parameters
|
|
217
218
|
----------
|
|
218
219
|
code : ndarray
|
|
219
220
|
The sequence code to decode.
|
|
220
|
-
|
|
221
|
+
|
|
221
222
|
Returns
|
|
222
223
|
-------
|
|
223
224
|
symbols : list
|
|
224
225
|
The decoded list of symbols.
|
|
225
226
|
"""
|
|
226
227
|
return [self.decode(c) for c in code]
|
|
227
|
-
|
|
228
|
+
|
|
228
229
|
def is_letter_alphabet(self):
|
|
229
230
|
"""
|
|
230
231
|
Check whether the symbols in this alphabet are single printable
|
|
@@ -238,30 +239,33 @@ class Alphabet(object):
|
|
|
238
239
|
have length 1 and are printable.
|
|
239
240
|
"""
|
|
240
241
|
for symbol in self:
|
|
241
|
-
if not isinstance(symbol, (str, bytes))
|
|
242
|
-
|
|
243
|
-
return False
|
|
242
|
+
if not isinstance(symbol, (str, bytes)) or len(symbol) > 1:
|
|
243
|
+
return False
|
|
244
244
|
if isinstance(symbol, str):
|
|
245
245
|
symbol = symbol.encode("ASCII")
|
|
246
|
-
if symbol not in LetterAlphabet.
|
|
246
|
+
if symbol not in LetterAlphabet.PRINTABLES:
|
|
247
247
|
return False
|
|
248
248
|
return True
|
|
249
|
-
|
|
249
|
+
|
|
250
250
|
def __str__(self):
|
|
251
251
|
return str(self.get_symbols())
|
|
252
|
-
|
|
252
|
+
|
|
253
253
|
def __len__(self):
|
|
254
254
|
return len(self.get_symbols())
|
|
255
|
-
|
|
255
|
+
|
|
256
256
|
def __iter__(self):
|
|
257
257
|
return self.get_symbols().__iter__()
|
|
258
|
-
|
|
258
|
+
|
|
259
259
|
def __contains__(self, symbol):
|
|
260
260
|
return symbol in self.get_symbols()
|
|
261
|
-
|
|
261
|
+
|
|
262
262
|
def __hash__(self):
|
|
263
|
-
|
|
264
|
-
|
|
263
|
+
symbols = self.get_symbols()
|
|
264
|
+
if isinstance(symbols, tuple):
|
|
265
|
+
return hash(symbols)
|
|
266
|
+
else:
|
|
267
|
+
return hash(tuple(symbols))
|
|
268
|
+
|
|
265
269
|
def __eq__(self, item):
|
|
266
270
|
if item is self:
|
|
267
271
|
return True
|
|
@@ -291,9 +295,10 @@ class LetterAlphabet(Alphabet):
|
|
|
291
295
|
corresponding code for a symbol, is the index of that symbol
|
|
292
296
|
in this list.
|
|
293
297
|
"""
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
298
|
+
|
|
299
|
+
PRINTABLES = (string.digits + string.ascii_letters + string.punctuation).encode(
|
|
300
|
+
"ASCII"
|
|
301
|
+
)
|
|
297
302
|
|
|
298
303
|
def __init__(self, symbols):
|
|
299
304
|
if len(symbols) == 0:
|
|
@@ -304,7 +309,7 @@ class LetterAlphabet(Alphabet):
|
|
|
304
309
|
raise ValueError(f"Symbol '{symbol}' is not a single letter")
|
|
305
310
|
if isinstance(symbol, str):
|
|
306
311
|
symbol = symbol.encode("ASCII")
|
|
307
|
-
if symbol not in LetterAlphabet.
|
|
312
|
+
if symbol not in LetterAlphabet.PRINTABLES:
|
|
308
313
|
raise ValueError(
|
|
309
314
|
f"Symbol {repr(symbol)} is not printable or whitespace"
|
|
310
315
|
)
|
|
@@ -312,57 +317,43 @@ class LetterAlphabet(Alphabet):
|
|
|
312
317
|
# Direct 'astype' conversion is not allowed by numpy
|
|
313
318
|
# -> frombuffer()
|
|
314
319
|
self._symbols = np.frombuffer(
|
|
315
|
-
np.array(self._symbols, dtype="|S1"),
|
|
316
|
-
dtype=np.ubyte
|
|
320
|
+
np.array(self._symbols, dtype="|S1"), dtype=np.ubyte
|
|
317
321
|
)
|
|
318
322
|
|
|
319
323
|
def __repr__(self):
|
|
320
324
|
"""Represent LetterAlphabet as a string for debugging."""
|
|
321
|
-
return f
|
|
322
|
-
|
|
325
|
+
return f"LetterAlphabet({self.get_symbols()})"
|
|
326
|
+
|
|
323
327
|
def extends(self, alphabet):
|
|
324
328
|
if alphabet is self:
|
|
325
329
|
return True
|
|
326
|
-
elif
|
|
330
|
+
elif isinstance(alphabet, LetterAlphabet):
|
|
327
331
|
if len(alphabet._symbols) > len(self._symbols):
|
|
328
332
|
return False
|
|
329
|
-
return np.all(
|
|
330
|
-
alphabet._symbols == self._symbols[:len(alphabet._symbols)]
|
|
331
|
-
)
|
|
333
|
+
return np.all(alphabet._symbols == self._symbols[: len(alphabet._symbols)])
|
|
332
334
|
else:
|
|
333
335
|
return super().extends(alphabet)
|
|
334
336
|
|
|
335
337
|
def get_symbols(self):
|
|
336
|
-
""
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
Returns
|
|
340
|
-
-------
|
|
341
|
-
symbols : list
|
|
342
|
-
Copy of the internal list of symbols.
|
|
343
|
-
"""
|
|
344
|
-
return [symbol.decode("ASCII") for symbol
|
|
345
|
-
in self._symbols_as_bytes()]
|
|
346
|
-
|
|
338
|
+
return tuple([symbol.decode("ASCII") for symbol in self._symbols_as_bytes()])
|
|
339
|
+
|
|
347
340
|
def encode(self, symbol):
|
|
348
341
|
if not isinstance(symbol, (str, bytes)) or len(symbol) > 1:
|
|
349
342
|
raise AlphabetError(f"Symbol '{symbol}' is not a single letter")
|
|
350
343
|
indices = np.where(self._symbols == ord(symbol))[0]
|
|
351
344
|
if len(indices) == 0:
|
|
352
|
-
raise AlphabetError(
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
return indices[0]
|
|
356
|
-
|
|
345
|
+
raise AlphabetError(f"Symbol {repr(symbol)} is not in the alphabet")
|
|
346
|
+
return indices[0].item()
|
|
347
|
+
|
|
357
348
|
def decode(self, code, as_bytes=False):
|
|
358
349
|
if code < 0 or code >= len(self._symbols):
|
|
359
350
|
raise AlphabetError(f"'{code:d}' is not a valid code")
|
|
360
351
|
return chr(self._symbols[code])
|
|
361
|
-
|
|
352
|
+
|
|
362
353
|
def encode_multiple(self, symbols, dtype=None):
|
|
363
354
|
"""
|
|
364
355
|
Encode multiple symbols.
|
|
365
|
-
|
|
356
|
+
|
|
366
357
|
Parameters
|
|
367
358
|
----------
|
|
368
359
|
symbols : iterable object or str or bytes
|
|
@@ -371,7 +362,7 @@ class LetterAlphabet(Alphabet):
|
|
|
371
362
|
containing the symbols is provided, instead of e.g. a list.
|
|
372
363
|
dtype : dtype, optional
|
|
373
364
|
For compatibility with superclass. The value is ignored
|
|
374
|
-
|
|
365
|
+
|
|
375
366
|
Returns
|
|
376
367
|
-------
|
|
377
368
|
code : ndarray
|
|
@@ -382,20 +373,17 @@ class LetterAlphabet(Alphabet):
|
|
|
382
373
|
elif isinstance(symbols, bytes):
|
|
383
374
|
symbols = np.frombuffer(symbols, dtype=np.ubyte)
|
|
384
375
|
elif isinstance(symbols, np.ndarray):
|
|
385
|
-
symbols = np.frombuffer(
|
|
386
|
-
symbols.astype(dtype="|S1"), dtype=np.ubyte
|
|
387
|
-
)
|
|
376
|
+
symbols = np.frombuffer(symbols.astype(dtype="|S1"), dtype=np.ubyte)
|
|
388
377
|
else:
|
|
389
378
|
symbols = np.frombuffer(
|
|
390
|
-
np.array(list(symbols), dtype="|S1"),
|
|
391
|
-
dtype=np.ubyte
|
|
379
|
+
np.array(list(symbols), dtype="|S1"), dtype=np.ubyte
|
|
392
380
|
)
|
|
393
381
|
return encode_chars(alphabet=self._symbols, symbols=symbols)
|
|
394
|
-
|
|
382
|
+
|
|
395
383
|
def decode_multiple(self, code, as_bytes=False):
|
|
396
384
|
"""
|
|
397
385
|
Decode a sequence code into a list of symbols.
|
|
398
|
-
|
|
386
|
+
|
|
399
387
|
Parameters
|
|
400
388
|
----------
|
|
401
389
|
code : ndarray, dtype=uint8
|
|
@@ -421,20 +409,19 @@ class LetterAlphabet(Alphabet):
|
|
|
421
409
|
if not as_bytes:
|
|
422
410
|
symbols = symbols.astype("U1")
|
|
423
411
|
return symbols
|
|
424
|
-
|
|
412
|
+
|
|
425
413
|
def __contains__(self, symbol):
|
|
426
414
|
if not isinstance(symbol, (str, bytes)):
|
|
427
415
|
return False
|
|
428
416
|
return ord(symbol) in self._symbols
|
|
429
|
-
|
|
417
|
+
|
|
430
418
|
def __len__(self):
|
|
431
419
|
return len(self._symbols)
|
|
432
|
-
|
|
420
|
+
|
|
433
421
|
def _symbols_as_bytes(self):
|
|
434
422
|
"Properly convert from dtype 'np.ubyte' to '|S1'"
|
|
435
423
|
return np.frombuffer(self._symbols, dtype="|S1")
|
|
436
424
|
|
|
437
|
-
|
|
438
425
|
|
|
439
426
|
class AlphabetMapper(object):
|
|
440
427
|
"""
|
|
@@ -445,7 +432,7 @@ class AlphabetMapper(object):
|
|
|
445
432
|
alphabet so that the symbol itself is preserved.
|
|
446
433
|
This class works for single symbol codes or an entire sequence code
|
|
447
434
|
likewise.
|
|
448
|
-
|
|
435
|
+
|
|
449
436
|
Parameters
|
|
450
437
|
----------
|
|
451
438
|
source_alphabet, target_alphabet : Alphabet
|
|
@@ -454,7 +441,7 @@ class AlphabetMapper(object):
|
|
|
454
441
|
The target alphabet must contain at least all symbols of the
|
|
455
442
|
source alphabet, but it is not required that the shared symbols
|
|
456
443
|
are in the same order.
|
|
457
|
-
|
|
444
|
+
|
|
458
445
|
Examples
|
|
459
446
|
--------
|
|
460
447
|
|
|
@@ -470,56 +457,54 @@ class AlphabetMapper(object):
|
|
|
470
457
|
>>> in_sequence = GeneralSequence(source_alph, "GCCTAT")
|
|
471
458
|
>>> print(in_sequence.code)
|
|
472
459
|
[2 1 1 3 0 3]
|
|
473
|
-
>>> print(in_sequence)
|
|
460
|
+
>>> print("".join(in_sequence.symbols))
|
|
474
461
|
GCCTAT
|
|
475
462
|
>>> out_sequence = GeneralSequence(target_alph)
|
|
476
463
|
>>> out_sequence.code = mapper[in_sequence.code]
|
|
477
464
|
>>> print(out_sequence.code)
|
|
478
465
|
[3 4 4 0 2 0]
|
|
479
|
-
>>> print(out_sequence)
|
|
466
|
+
>>> print("".join(out_sequence.symbols))
|
|
480
467
|
GCCTAT
|
|
481
468
|
"""
|
|
482
|
-
|
|
469
|
+
|
|
483
470
|
def __init__(self, source_alphabet, target_alphabet):
|
|
484
471
|
if target_alphabet.extends(source_alphabet):
|
|
485
472
|
self._necessary_mapping = False
|
|
486
473
|
else:
|
|
487
474
|
self._necessary_mapping = True
|
|
488
475
|
self._mapper = np.zeros(
|
|
489
|
-
len(source_alphabet),
|
|
490
|
-
dtype=AlphabetMapper._dtype(len(target_alphabet))
|
|
476
|
+
len(source_alphabet), dtype=AlphabetMapper._dtype(len(target_alphabet))
|
|
491
477
|
)
|
|
492
478
|
for old_code in range(len(source_alphabet)):
|
|
493
479
|
symbol = source_alphabet.decode(old_code)
|
|
494
480
|
new_code = target_alphabet.encode(symbol)
|
|
495
481
|
self._mapper[old_code] = new_code
|
|
496
|
-
|
|
482
|
+
|
|
497
483
|
def __getitem__(self, code):
|
|
498
484
|
if isinstance(code, Integral):
|
|
499
485
|
if self._necessary_mapping:
|
|
500
486
|
return self._mapper[code]
|
|
501
487
|
else:
|
|
502
488
|
return code
|
|
503
|
-
if not isinstance(code, np.ndarray)
|
|
504
|
-
|
|
505
|
-
|
|
489
|
+
if not isinstance(code, np.ndarray) or code.dtype not in (
|
|
490
|
+
np.uint8,
|
|
491
|
+
np.uint16,
|
|
492
|
+
np.uint32,
|
|
493
|
+
np.uint64,
|
|
494
|
+
):
|
|
495
|
+
code = np.array(code, dtype=np.uint64)
|
|
506
496
|
if self._necessary_mapping:
|
|
507
497
|
mapped_code = np.empty(len(code), dtype=self._mapper.dtype)
|
|
508
|
-
map_sequence_code(
|
|
509
|
-
self._mapper,
|
|
510
|
-
code,
|
|
511
|
-
mapped_code
|
|
512
|
-
)
|
|
498
|
+
map_sequence_code(self._mapper, code, mapped_code)
|
|
513
499
|
return mapped_code
|
|
514
500
|
else:
|
|
515
501
|
return code
|
|
516
502
|
|
|
517
|
-
|
|
518
503
|
@staticmethod
|
|
519
504
|
def _dtype(alphabet_size):
|
|
520
|
-
_size_uint8
|
|
521
|
-
_size_uint16 = np.iinfo(np.uint16).max +1
|
|
522
|
-
_size_uint32 = np.iinfo(np.uint32).max +1
|
|
505
|
+
_size_uint8 = np.iinfo(np.uint8).max + 1
|
|
506
|
+
_size_uint16 = np.iinfo(np.uint16).max + 1
|
|
507
|
+
_size_uint32 = np.iinfo(np.uint32).max + 1
|
|
523
508
|
if alphabet_size <= _size_uint8:
|
|
524
509
|
return np.uint8
|
|
525
510
|
elif alphabet_size <= _size_uint16:
|
|
@@ -535,6 +520,7 @@ class AlphabetError(Exception):
|
|
|
535
520
|
This exception is raised, when a code or a symbol is not in an
|
|
536
521
|
:class:`Alphabet`.
|
|
537
522
|
"""
|
|
523
|
+
|
|
538
524
|
pass
|
|
539
525
|
|
|
540
526
|
|
|
@@ -552,7 +538,7 @@ def common_alphabet(alphabets):
|
|
|
552
538
|
-------
|
|
553
539
|
common_alphabet : Alphabet or None
|
|
554
540
|
The alphabet from `alphabets` that extends all alphabets.
|
|
555
|
-
``None`` if no such common alphabet exists.
|
|
541
|
+
``None`` if no such common alphabet exists.
|
|
556
542
|
"""
|
|
557
543
|
common_alphabet = None
|
|
558
544
|
for alphabet in alphabets:
|
|
@@ -563,4 +549,4 @@ def common_alphabet(alphabets):
|
|
|
563
549
|
common_alphabet = alphabet
|
|
564
550
|
else:
|
|
565
551
|
return None
|
|
566
|
-
return common_alphabet
|
|
552
|
+
return common_alphabet
|