biotite 1.0.0__cp311-cp311-macosx_11_0_arm64.whl → 1.1.0__cp311-cp311-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/application/dssp/app.py +13 -3
- biotite/application/localapp.py +34 -0
- biotite/application/muscle/app3.py +2 -15
- biotite/application/muscle/app5.py +2 -2
- biotite/application/util.py +1 -1
- biotite/application/viennarna/rnaplot.py +6 -2
- biotite/database/rcsb/query.py +6 -6
- biotite/database/uniprot/check.py +20 -15
- biotite/database/uniprot/download.py +1 -1
- biotite/database/uniprot/query.py +1 -1
- biotite/sequence/align/alignment.py +16 -3
- biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
- biotite/sequence/align/banded.pyx +5 -5
- biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
- biotite/sequence/align/kmeralphabet.pyx +17 -0
- biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
- biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
- biotite/sequence/align/kmertable.pyx +52 -42
- biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
- biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
- biotite/sequence/align/matrix.py +273 -55
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
- biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
- biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
- biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
- biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
- biotite/sequence/alphabet.py +3 -0
- biotite/sequence/codec.cpython-311-darwin.so +0 -0
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +2 -1
- biotite/sequence/graphics/colorschemes.py +44 -11
- biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
- biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
- biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
- biotite/sequence/profile.py +86 -4
- biotite/sequence/seqtypes.py +124 -3
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +4 -3
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +110 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +171 -0
- biotite/structure/alphabet/unkerasify.py +122 -0
- biotite/structure/atoms.py +156 -43
- biotite/structure/bonds.cpython-311-darwin.so +0 -0
- biotite/structure/bonds.pyx +72 -21
- biotite/structure/celllist.cpython-311-darwin.so +0 -0
- biotite/structure/charges.cpython-311-darwin.so +0 -0
- biotite/structure/filter.py +1 -1
- biotite/structure/geometry.py +60 -113
- biotite/structure/info/__init__.py +1 -0
- biotite/structure/info/atoms.py +13 -13
- biotite/structure/info/bonds.py +12 -6
- biotite/structure/info/ccd.py +125 -32
- biotite/structure/info/{ccd/components.bcif → components.bcif} +0 -0
- biotite/structure/info/groups.py +63 -17
- biotite/structure/info/masses.py +9 -6
- biotite/structure/info/misc.py +15 -21
- biotite/structure/info/standardize.py +3 -2
- biotite/structure/io/mol/sdf.py +41 -40
- biotite/structure/io/pdb/convert.py +2 -0
- biotite/structure/io/pdb/file.py +74 -3
- biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
- biotite/structure/io/pdbqt/file.py +32 -32
- biotite/structure/io/pdbx/__init__.py +1 -0
- biotite/structure/io/pdbx/bcif.py +32 -8
- biotite/structure/io/pdbx/cif.py +148 -107
- biotite/structure/io/pdbx/component.py +9 -4
- biotite/structure/io/pdbx/compress.py +321 -0
- biotite/structure/io/pdbx/convert.py +227 -68
- biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
- biotite/structure/io/pdbx/encoding.pyx +98 -17
- biotite/structure/io/trajfile.py +16 -16
- biotite/structure/molecules.py +141 -141
- biotite/structure/sasa.cpython-311-darwin.so +0 -0
- biotite/structure/segments.py +1 -2
- biotite/structure/util.py +73 -1
- biotite/version.py +2 -2
- {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/METADATA +4 -1
- {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/RECORD +88 -78
- biotite/structure/info/ccd/README.rst +0 -8
- biotite/structure/info/ccd/amino_acids.txt +0 -1663
- biotite/structure/info/ccd/carbohydrates.txt +0 -1135
- biotite/structure/info/ccd/nucleotides.txt +0 -798
- {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/WHEEL +0 -0
- {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/licenses/LICENSE.rst +0 -0
biotite/sequence/align/matrix.py
CHANGED
|
@@ -2,14 +2,21 @@
|
|
|
2
2
|
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
3
|
# information.
|
|
4
4
|
|
|
5
|
+
__all__ = ["SubstitutionMatrix"]
|
|
5
6
|
__name__ = "biotite.sequence.align"
|
|
6
7
|
__author__ = "Patrick Kunzmann"
|
|
7
8
|
|
|
8
|
-
import
|
|
9
|
+
import functools
|
|
10
|
+
from pathlib import Path
|
|
9
11
|
import numpy as np
|
|
10
|
-
from biotite.sequence.seqtypes import
|
|
12
|
+
from biotite.sequence.seqtypes import (
|
|
13
|
+
NucleotideSequence,
|
|
14
|
+
PositionalSequence,
|
|
15
|
+
ProteinSequence,
|
|
16
|
+
)
|
|
11
17
|
|
|
12
|
-
|
|
18
|
+
# Directory of matrix files
|
|
19
|
+
_DB_DIR = Path(__file__).parent / "matrix_data"
|
|
13
20
|
|
|
14
21
|
|
|
15
22
|
class SubstitutionMatrix(object):
|
|
@@ -59,6 +66,11 @@ class SubstitutionMatrix(object):
|
|
|
59
66
|
- **RBLOSUM<n>_<BLOCKS>**
|
|
60
67
|
- **CorBLOSUM<n>_<BLOCKS>**
|
|
61
68
|
|
|
69
|
+
- Structural alphabet substitution matrices
|
|
70
|
+
|
|
71
|
+
- **3Di** - For 3Di alphabet from ``foldseek`` :footcite:`VanKempen2024`
|
|
72
|
+
- **PB** - For Protein Blocks alphabet from *PBxplore* :footcite:`Barnoud2017`
|
|
73
|
+
|
|
62
74
|
A list of all available matrix names is returned by
|
|
63
75
|
:meth:`list_db()`.
|
|
64
76
|
|
|
@@ -78,6 +90,11 @@ class SubstitutionMatrix(object):
|
|
|
78
90
|
or a dictionary mapping the symbol pairing to scores,
|
|
79
91
|
or a string referencing a matrix in the internal database.
|
|
80
92
|
|
|
93
|
+
Attributes
|
|
94
|
+
----------
|
|
95
|
+
shape : tuple
|
|
96
|
+
The shape of the substitution matrix.
|
|
97
|
+
|
|
81
98
|
Raises
|
|
82
99
|
------
|
|
83
100
|
KeyError
|
|
@@ -110,7 +127,7 @@ class SubstitutionMatrix(object):
|
|
|
110
127
|
Creating an identity substitution matrix via the score matrix:
|
|
111
128
|
|
|
112
129
|
>>> alph = NucleotideSequence.alphabet_unamb
|
|
113
|
-
>>> matrix = SubstitutionMatrix(alph, alph, np.identity(len(alph)))
|
|
130
|
+
>>> matrix = SubstitutionMatrix(alph, alph, np.identity(len(alph), dtype=int))
|
|
114
131
|
>>> print(matrix)
|
|
115
132
|
A C G T
|
|
116
133
|
A 1 0 0 0
|
|
@@ -124,9 +141,6 @@ class SubstitutionMatrix(object):
|
|
|
124
141
|
>>> matrix = SubstitutionMatrix(alph, alph, "BLOSUM50")
|
|
125
142
|
"""
|
|
126
143
|
|
|
127
|
-
# Directory of matrix files
|
|
128
|
-
_db_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "matrix_data")
|
|
129
|
-
|
|
130
144
|
def __init__(self, alphabet1, alphabet2, score_matrix):
|
|
131
145
|
self._alph1 = alphabet1
|
|
132
146
|
self._alph2 = alphabet2
|
|
@@ -139,7 +153,21 @@ class SubstitutionMatrix(object):
|
|
|
139
153
|
f"Matrix has shape {score_matrix.shape}, "
|
|
140
154
|
f"but {alph_shape} is required"
|
|
141
155
|
)
|
|
156
|
+
if not np.issubdtype(score_matrix.dtype, np.integer):
|
|
157
|
+
raise TypeError("Score matrix must be an integer ndarray")
|
|
142
158
|
self._matrix = score_matrix.astype(np.int32)
|
|
159
|
+
# If the score matrix was converted from a float matrix,
|
|
160
|
+
# inf values would be converted to 2**31,
|
|
161
|
+
# which is probably undesired and gives overflow issues in the alignment
|
|
162
|
+
# functions
|
|
163
|
+
if (
|
|
164
|
+
np.any(self._matrix == np.iinfo(np.int32).max) or
|
|
165
|
+
np.any(self._matrix == np.iinfo(np.int32).min)
|
|
166
|
+
): # fmt: skip
|
|
167
|
+
raise ValueError(
|
|
168
|
+
"Score values are too large. "
|
|
169
|
+
"Maybe it was converted from a float matrix containing inf values?"
|
|
170
|
+
)
|
|
143
171
|
elif isinstance(score_matrix, str):
|
|
144
172
|
matrix_dict = SubstitutionMatrix.dict_from_db(score_matrix)
|
|
145
173
|
self._fill_with_matrix_dict(matrix_dict)
|
|
@@ -151,34 +179,18 @@ class SubstitutionMatrix(object):
|
|
|
151
179
|
# score matrix -> make the score matrix read-only
|
|
152
180
|
self._matrix.setflags(write=False)
|
|
153
181
|
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
)
|
|
160
|
-
|
|
161
|
-
def __eq__(self, item):
|
|
162
|
-
if not isinstance(item, SubstitutionMatrix):
|
|
163
|
-
return False
|
|
164
|
-
if self._alph1 != item.get_alphabet1():
|
|
165
|
-
return False
|
|
166
|
-
if self._alph2 != item.get_alphabet2():
|
|
167
|
-
return False
|
|
168
|
-
if not np.array_equal(self.score_matrix(), item.score_matrix()):
|
|
169
|
-
return False
|
|
170
|
-
return True
|
|
171
|
-
|
|
172
|
-
def __ne__(self, item):
|
|
173
|
-
return not self == item
|
|
182
|
+
@property
|
|
183
|
+
def shape(self):
|
|
184
|
+
"""
|
|
185
|
+
Get the shape (i.e. the length of both alphabets)
|
|
186
|
+
of the substitution matrix.
|
|
174
187
|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
self._matrix[i, j] = int(matrix_dict[sym1, sym2])
|
|
188
|
+
Returns
|
|
189
|
+
-------
|
|
190
|
+
shape : tuple
|
|
191
|
+
Matrix shape.
|
|
192
|
+
"""
|
|
193
|
+
return (len(self._alph1), len(self._alph2))
|
|
182
194
|
|
|
183
195
|
def get_alphabet1(self):
|
|
184
196
|
"""
|
|
@@ -280,26 +292,155 @@ class SubstitutionMatrix(object):
|
|
|
280
292
|
code2 = self._alph2.encode(symbol2)
|
|
281
293
|
return self._matrix[code1, code2]
|
|
282
294
|
|
|
283
|
-
def
|
|
295
|
+
def as_positional(self, sequence1, sequence2):
|
|
284
296
|
"""
|
|
285
|
-
|
|
286
|
-
|
|
297
|
+
Transform this substitution matrix and two sequences into positional
|
|
298
|
+
equivalents.
|
|
299
|
+
|
|
300
|
+
This means the new substitution matrix is position-specific: It has the lengths
|
|
301
|
+
of the sequences instead of the lengths of their alphabets.
|
|
302
|
+
Its scores represent the same scores as the original matrix, but now mapped
|
|
303
|
+
onto the positions of the sequences.
|
|
304
|
+
|
|
305
|
+
Parameters
|
|
306
|
+
----------
|
|
307
|
+
sequence1, sequence2 : seq.Sequence, length=n
|
|
308
|
+
The sequences to create the positional equivalents from.
|
|
287
309
|
|
|
288
310
|
Returns
|
|
289
311
|
-------
|
|
290
|
-
|
|
291
|
-
|
|
312
|
+
pos_matrix : align.SubstitutionMatrix, shape=(len(sequence1), len(sequence2))
|
|
313
|
+
The position-specific substitution matrix.
|
|
314
|
+
pos_sequence1, pos_sequence2 : PositionalSequence, length=n
|
|
315
|
+
The positional sequences.
|
|
316
|
+
|
|
317
|
+
Notes
|
|
318
|
+
-----
|
|
319
|
+
After the transformation the substitution scores remain the same, i.e.
|
|
320
|
+
`substitution_matrix.get_score(sequence1[i], sequence2[j])` is equal to
|
|
321
|
+
`pos_matrix.get_score(pos_sequence1[i], pos_sequence2[j])`.
|
|
322
|
+
|
|
323
|
+
Examples
|
|
324
|
+
--------
|
|
325
|
+
|
|
326
|
+
Run an alignment with the usual substitution matrix:
|
|
327
|
+
|
|
328
|
+
>>> seq1 = ProteinSequence("BIQTITE")
|
|
329
|
+
>>> seq2 = ProteinSequence("IQLITE")
|
|
330
|
+
>>> matrix = SubstitutionMatrix.std_protein_matrix()
|
|
331
|
+
>>> print(matrix)
|
|
332
|
+
A C D E F G H I K L M N P Q R S T V W Y B Z X *
|
|
333
|
+
A 4 0 -2 -1 -2 0 -2 -1 -1 -1 -1 -2 -1 -1 -1 1 0 0 -3 -2 -2 -1 0 -4
|
|
334
|
+
C 0 9 -3 -4 -2 -3 -3 -1 -3 -1 -1 -3 -3 -3 -3 -1 -1 -1 -2 -2 -3 -3 -2 -4
|
|
335
|
+
D -2 -3 6 2 -3 -1 -1 -3 -1 -4 -3 1 -1 0 -2 0 -1 -3 -4 -3 4 1 -1 -4
|
|
336
|
+
E -1 -4 2 5 -3 -2 0 -3 1 -3 -2 0 -1 2 0 0 -1 -2 -3 -2 1 4 -1 -4
|
|
337
|
+
F -2 -2 -3 -3 6 -3 -1 0 -3 0 0 -3 -4 -3 -3 -2 -2 -1 1 3 -3 -3 -1 -4
|
|
338
|
+
G 0 -3 -1 -2 -3 6 -2 -4 -2 -4 -3 0 -2 -2 -2 0 -2 -3 -2 -3 -1 -2 -1 -4
|
|
339
|
+
H -2 -3 -1 0 -1 -2 8 -3 -1 -3 -2 1 -2 0 0 -1 -2 -3 -2 2 0 0 -1 -4
|
|
340
|
+
I -1 -1 -3 -3 0 -4 -3 4 -3 2 1 -3 -3 -3 -3 -2 -1 3 -3 -1 -3 -3 -1 -4
|
|
341
|
+
K -1 -3 -1 1 -3 -2 -1 -3 5 -2 -1 0 -1 1 2 0 -1 -2 -3 -2 0 1 -1 -4
|
|
342
|
+
L -1 -1 -4 -3 0 -4 -3 2 -2 4 2 -3 -3 -2 -2 -2 -1 1 -2 -1 -4 -3 -1 -4
|
|
343
|
+
M -1 -1 -3 -2 0 -3 -2 1 -1 2 5 -2 -2 0 -1 -1 -1 1 -1 -1 -3 -1 -1 -4
|
|
344
|
+
N -2 -3 1 0 -3 0 1 -3 0 -3 -2 6 -2 0 0 1 0 -3 -4 -2 3 0 -1 -4
|
|
345
|
+
P -1 -3 -1 -1 -4 -2 -2 -3 -1 -3 -2 -2 7 -1 -2 -1 -1 -2 -4 -3 -2 -1 -2 -4
|
|
346
|
+
Q -1 -3 0 2 -3 -2 0 -3 1 -2 0 0 -1 5 1 0 -1 -2 -2 -1 0 3 -1 -4
|
|
347
|
+
R -1 -3 -2 0 -3 -2 0 -3 2 -2 -1 0 -2 1 5 -1 -1 -3 -3 -2 -1 0 -1 -4
|
|
348
|
+
S 1 -1 0 0 -2 0 -1 -2 0 -2 -1 1 -1 0 -1 4 1 -2 -3 -2 0 0 0 -4
|
|
349
|
+
T 0 -1 -1 -1 -2 -2 -2 -1 -1 -1 -1 0 -1 -1 -1 1 5 0 -2 -2 -1 -1 0 -4
|
|
350
|
+
V 0 -1 -3 -2 -1 -3 -3 3 -2 1 1 -3 -2 -2 -3 -2 0 4 -3 -1 -3 -2 -1 -4
|
|
351
|
+
W -3 -2 -4 -3 1 -2 -2 -3 -3 -2 -1 -4 -4 -2 -3 -3 -2 -3 11 2 -4 -3 -2 -4
|
|
352
|
+
Y -2 -2 -3 -2 3 -3 2 -1 -2 -1 -1 -2 -3 -1 -2 -2 -2 -1 2 7 -3 -2 -1 -4
|
|
353
|
+
B -2 -3 4 1 -3 -1 0 -3 0 -4 -3 3 -2 0 -1 0 -1 -3 -4 -3 4 1 -1 -4
|
|
354
|
+
Z -1 -3 1 4 -3 -2 0 -3 1 -3 -1 0 -1 3 0 0 -1 -2 -3 -2 1 4 -1 -4
|
|
355
|
+
X 0 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 -1 -1 0 0 -1 -2 -1 -1 -1 -1 -4
|
|
356
|
+
* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
|
|
357
|
+
>>> alignment = align_optimal(seq1, seq2, matrix, gap_penalty=-10)[0]
|
|
358
|
+
>>> print(alignment)
|
|
359
|
+
BIQTITE
|
|
360
|
+
-IQLITE
|
|
361
|
+
|
|
362
|
+
Running the alignment with positional equivalents gives the same result:
|
|
363
|
+
|
|
364
|
+
>>> pos_matrix, pos_seq1, pos_seq2 = matrix.as_positional(seq1, seq2)
|
|
365
|
+
>>> print(pos_matrix)
|
|
366
|
+
I Q L I T E
|
|
367
|
+
B -3 0 -4 -3 -1 1
|
|
368
|
+
I 4 -3 2 4 -1 -3
|
|
369
|
+
Q -3 5 -2 -3 -1 2
|
|
370
|
+
T -1 -1 -1 -1 5 -1
|
|
371
|
+
I 4 -3 2 4 -1 -3
|
|
372
|
+
T -1 -1 -1 -1 5 -1
|
|
373
|
+
E -3 2 -3 -3 -1 5
|
|
374
|
+
>>> pos_alignment = align_optimal(pos_seq1, pos_seq2, pos_matrix, gap_penalty=-10)[0]
|
|
375
|
+
>>> print(pos_alignment)
|
|
376
|
+
BIQTITE
|
|
377
|
+
-IQLITE
|
|
378
|
+
|
|
379
|
+
Increase the substitution score for the first symbols in both sequences to align
|
|
380
|
+
to each other:
|
|
381
|
+
|
|
382
|
+
>>> score_matrix = pos_matrix.score_matrix().copy()
|
|
383
|
+
>>> score_matrix[0, 0] = 100
|
|
384
|
+
>>> biased_matrix = SubstitutionMatrix(
|
|
385
|
+
... pos_matrix.get_alphabet1(), pos_matrix.get_alphabet2(), score_matrix
|
|
386
|
+
... )
|
|
387
|
+
>>> print(biased_matrix)
|
|
388
|
+
I Q L I T E
|
|
389
|
+
B 100 0 -4 -3 -1 1
|
|
390
|
+
I 4 -3 2 4 -1 -3
|
|
391
|
+
Q -3 5 -2 -3 -1 2
|
|
392
|
+
T -1 -1 -1 -1 5 -1
|
|
393
|
+
I 4 -3 2 4 -1 -3
|
|
394
|
+
T -1 -1 -1 -1 5 -1
|
|
395
|
+
E -3 2 -3 -3 -1 5
|
|
396
|
+
>>> biased_alignment = align_optimal(pos_seq1, pos_seq2, biased_matrix, gap_penalty=-10)[0]
|
|
397
|
+
>>> print(biased_alignment)
|
|
398
|
+
BIQTITE
|
|
399
|
+
I-QLITE
|
|
292
400
|
"""
|
|
293
|
-
|
|
401
|
+
pos_sequence1 = PositionalSequence(sequence1)
|
|
402
|
+
pos_sequence2 = PositionalSequence(sequence2)
|
|
403
|
+
|
|
404
|
+
pos_score_matrix = self._matrix[
|
|
405
|
+
tuple(_cartesian_product(sequence1.code, sequence2.code).T)
|
|
406
|
+
].reshape(len(sequence1), len(sequence2))
|
|
407
|
+
pos_matrix = SubstitutionMatrix(
|
|
408
|
+
pos_sequence1.get_alphabet(),
|
|
409
|
+
pos_sequence2.get_alphabet(),
|
|
410
|
+
pos_score_matrix,
|
|
411
|
+
)
|
|
412
|
+
|
|
413
|
+
return pos_matrix, pos_sequence1, pos_sequence2
|
|
414
|
+
|
|
415
|
+
def __repr__(self):
|
|
416
|
+
"""Represent SubstitutionMatrix as a string for debugging."""
|
|
417
|
+
return (
|
|
418
|
+
f"SubstitutionMatrix({self._alph1.__repr__()}, {self._alph2.__repr__()}, "
|
|
419
|
+
f"np.{np.array_repr(self._matrix)})"
|
|
420
|
+
)
|
|
421
|
+
|
|
422
|
+
def __eq__(self, item):
|
|
423
|
+
if not isinstance(item, SubstitutionMatrix):
|
|
424
|
+
return False
|
|
425
|
+
if self._alph1 != item.get_alphabet1():
|
|
426
|
+
return False
|
|
427
|
+
if self._alph2 != item.get_alphabet2():
|
|
428
|
+
return False
|
|
429
|
+
if not np.array_equal(self.score_matrix(), item.score_matrix()):
|
|
430
|
+
return False
|
|
431
|
+
return True
|
|
432
|
+
|
|
433
|
+
def __ne__(self, item):
|
|
434
|
+
return not self == item
|
|
294
435
|
|
|
295
436
|
def __str__(self):
|
|
296
437
|
# Create matrix in NCBI format
|
|
297
438
|
string = " "
|
|
298
439
|
for symbol in self._alph2:
|
|
299
|
-
string += f" {symbol:>3}"
|
|
440
|
+
string += f" {str(symbol):>3}"
|
|
300
441
|
string += "\n"
|
|
301
442
|
for i, symbol in enumerate(self._alph1):
|
|
302
|
-
string += f"{symbol:>1}"
|
|
443
|
+
string += f"{str(symbol):>1}"
|
|
303
444
|
for j in range(len(self._alph2)):
|
|
304
445
|
string += f" {int(self._matrix[i,j]):>3d}"
|
|
305
446
|
string += "\n"
|
|
@@ -350,7 +491,7 @@ class SubstitutionMatrix(object):
|
|
|
350
491
|
matrix_dict : dict
|
|
351
492
|
A dictionary representing the substitution matrix.
|
|
352
493
|
"""
|
|
353
|
-
filename =
|
|
494
|
+
filename = _DB_DIR / f"{matrix_name}.mat"
|
|
354
495
|
with open(filename, "r") as f:
|
|
355
496
|
return SubstitutionMatrix.dict_from_str(f.read())
|
|
356
497
|
|
|
@@ -364,11 +505,10 @@ class SubstitutionMatrix(object):
|
|
|
364
505
|
db_list : list
|
|
365
506
|
List of matrix names in the internal database.
|
|
366
507
|
"""
|
|
367
|
-
|
|
368
|
-
# Remove '.mat' from files
|
|
369
|
-
return [file[:-4] for file in sorted(files)]
|
|
508
|
+
return [path.stem for path in _DB_DIR.glob("*.mat")]
|
|
370
509
|
|
|
371
510
|
@staticmethod
|
|
511
|
+
@functools.cache
|
|
372
512
|
def std_protein_matrix():
|
|
373
513
|
"""
|
|
374
514
|
Get the default :class:`SubstitutionMatrix` for protein sequence
|
|
@@ -379,9 +519,12 @@ class SubstitutionMatrix(object):
|
|
|
379
519
|
matrix : SubstitutionMatrix
|
|
380
520
|
Default matrix.
|
|
381
521
|
"""
|
|
382
|
-
return
|
|
522
|
+
return SubstitutionMatrix(
|
|
523
|
+
ProteinSequence.alphabet, ProteinSequence.alphabet, "BLOSUM62"
|
|
524
|
+
)
|
|
383
525
|
|
|
384
526
|
@staticmethod
|
|
527
|
+
@functools.cache
|
|
385
528
|
def std_nucleotide_matrix():
|
|
386
529
|
"""
|
|
387
530
|
Get the default :class:`SubstitutionMatrix` for DNA sequence
|
|
@@ -392,13 +535,88 @@ class SubstitutionMatrix(object):
|
|
|
392
535
|
matrix : SubstitutionMatrix
|
|
393
536
|
Default matrix.
|
|
394
537
|
"""
|
|
395
|
-
return
|
|
538
|
+
return SubstitutionMatrix(
|
|
539
|
+
NucleotideSequence.alphabet_amb, NucleotideSequence.alphabet_amb, "NUC"
|
|
540
|
+
)
|
|
396
541
|
|
|
542
|
+
@staticmethod
|
|
543
|
+
@functools.cache
|
|
544
|
+
def std_3di_matrix():
|
|
545
|
+
"""
|
|
546
|
+
Get the default :class:`SubstitutionMatrix` for 3Di sequence
|
|
547
|
+
alignments.
|
|
548
|
+
:footcite:`VanKempen2024`
|
|
397
549
|
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
550
|
+
Returns
|
|
551
|
+
-------
|
|
552
|
+
matrix : SubstitutionMatrix
|
|
553
|
+
Default matrix.
|
|
554
|
+
"""
|
|
555
|
+
# Import inside function to avoid circular import
|
|
556
|
+
from biotite.structure.alphabet.i3d import I3DSequence
|
|
557
|
+
|
|
558
|
+
return SubstitutionMatrix(I3DSequence.alphabet, I3DSequence.alphabet, "3Di")
|
|
559
|
+
|
|
560
|
+
@staticmethod
|
|
561
|
+
@functools.cache
|
|
562
|
+
def std_protein_blocks_matrix(undefined_match=200, undefined_mismatch=-200):
|
|
563
|
+
"""
|
|
564
|
+
Get the default :class:`SubstitutionMatrix` for Protein Blocks sequences.
|
|
565
|
+
|
|
566
|
+
The matrix is adapted from *PBxplore* :footcite:`Barnoud2017`.
|
|
567
|
+
|
|
568
|
+
Parameters
|
|
569
|
+
----------
|
|
570
|
+
undefined_match, undefined_mismatch : int, optional
|
|
571
|
+
The match and mismatch score for undefined symbols.
|
|
572
|
+
The default values were chosen arbitrarily, but are in the order of
|
|
573
|
+
magnitude of the other score values.
|
|
574
|
+
|
|
575
|
+
Returns
|
|
576
|
+
-------
|
|
577
|
+
matrix : SubstitutionMatrix
|
|
578
|
+
Default matrix.
|
|
579
|
+
|
|
580
|
+
References
|
|
581
|
+
----------
|
|
582
|
+
|
|
583
|
+
.. footbibliography::
|
|
584
|
+
|
|
585
|
+
"""
|
|
586
|
+
from biotite.structure.alphabet.pb import ProteinBlocksSequence
|
|
587
|
+
|
|
588
|
+
alphabet = ProteinBlocksSequence.alphabet
|
|
589
|
+
undefined_symbol = ProteinBlocksSequence.undefined_symbol
|
|
590
|
+
matrix_dict = SubstitutionMatrix.dict_from_db("PB")
|
|
591
|
+
# Add match/mismatch scores for the undefined symbol
|
|
592
|
+
for symbol in alphabet:
|
|
593
|
+
if symbol == undefined_symbol:
|
|
594
|
+
continue
|
|
595
|
+
matrix_dict[symbol, undefined_symbol] = undefined_mismatch
|
|
596
|
+
matrix_dict[undefined_symbol, symbol] = undefined_mismatch
|
|
597
|
+
matrix_dict[undefined_symbol, undefined_symbol] = undefined_match
|
|
598
|
+
return SubstitutionMatrix(
|
|
599
|
+
alphabet,
|
|
600
|
+
alphabet,
|
|
601
|
+
matrix_dict,
|
|
602
|
+
)
|
|
603
|
+
|
|
604
|
+
def _fill_with_matrix_dict(self, matrix_dict):
|
|
605
|
+
self._matrix = np.zeros((len(self._alph1), len(self._alph2)), dtype=np.int32)
|
|
606
|
+
for i in range(len(self._alph1)):
|
|
607
|
+
for j in range(len(self._alph2)):
|
|
608
|
+
sym1 = self._alph1.decode(i)
|
|
609
|
+
sym2 = self._alph2.decode(j)
|
|
610
|
+
self._matrix[i, j] = int(matrix_dict[sym1, sym2])
|
|
611
|
+
|
|
612
|
+
|
|
613
|
+
def _cartesian_product(array1, array2):
|
|
614
|
+
"""
|
|
615
|
+
Create all combinations of elements from two arrays.
|
|
616
|
+
"""
|
|
617
|
+
return np.transpose(
|
|
618
|
+
[
|
|
619
|
+
np.repeat(array1, len(array2)),
|
|
620
|
+
np.tile(array2, len(array1)),
|
|
621
|
+
]
|
|
622
|
+
)
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# 3Di bit/2
|
|
2
|
+
# Background (precomputed optional): 0.0489372 0.0306991 0.101049 0.0329671 0.0276149 0.0416262 0.0452521 0.030876 0.0297251 0.0607036 0.0150238 0.0215826 0.0783843 0.0512926 0.0264886 0.0610702 0.0201311 0.215998 0.0310265 0.0295417 0.00001
|
|
3
|
+
# Lambda (precomputed optional): 0.351568
|
|
4
|
+
a c d e f g h i k l m n p q r s t v w y
|
|
5
|
+
a 6 -3 1 2 3 -2 -2 -7 -3 -3 -10 -5 -1 1 -4 -7 -5 -6 0 -2
|
|
6
|
+
c -3 6 -2 -8 -5 -4 -4 -12 -13 1 -14 0 0 1 -1 0 -8 1 -7 -9
|
|
7
|
+
d 1 -2 4 -3 0 1 1 -3 -5 -4 -5 -2 1 -1 -1 -4 -2 -3 -2 -2
|
|
8
|
+
e 2 -8 -3 9 -2 -7 -4 -12 -10 -7 -17 -8 -6 -3 -8 -10 -10 -13 -6 -3
|
|
9
|
+
f 3 -5 0 -2 7 -3 -3 -5 1 -3 -9 -5 -2 2 -5 -8 -3 -7 4 -4
|
|
10
|
+
g -2 -4 1 -7 -3 6 3 0 -7 -7 -1 -2 -2 -4 3 -3 4 -6 -4 -2
|
|
11
|
+
h -2 -4 1 -4 -3 3 6 -4 -7 -6 -6 0 -1 -3 1 -3 -1 -5 -5 3
|
|
12
|
+
i -7 -12 -3 -12 -5 0 -4 8 -5 -11 7 -7 -6 -6 -3 -9 6 -12 -5 -8
|
|
13
|
+
k -3 -13 -5 -10 1 -7 -7 -5 9 -11 -8 -12 -6 -5 -9 -14 -5 -15 5 -8
|
|
14
|
+
l -3 1 -4 -7 -3 -7 -6 -11 -11 6 -16 -3 -2 2 -4 -4 -9 0 -8 -9
|
|
15
|
+
m -10 -14 -5 -17 -9 -1 -6 7 -8 -16 10 -9 -9 -10 -5 -10 3 -16 -6 -9
|
|
16
|
+
n -5 0 -2 -8 -5 -2 0 -7 -12 -3 -9 7 0 -2 2 3 -4 0 -8 -5
|
|
17
|
+
p -1 0 1 -6 -2 -2 -1 -6 -6 -2 -9 0 4 0 0 -2 -4 0 -4 -5
|
|
18
|
+
q 1 1 -1 -3 2 -4 -3 -6 -5 2 -10 -2 0 5 -2 -4 -5 -1 -2 -5
|
|
19
|
+
r -4 -1 -1 -8 -5 3 1 -3 -9 -4 -5 2 0 -2 6 2 0 -1 -6 -3
|
|
20
|
+
s -7 0 -4 -10 -8 -3 -3 -9 -14 -4 -10 3 -2 -4 2 6 -6 0 -11 -9
|
|
21
|
+
t -5 -8 -2 -10 -3 4 -1 6 -5 -9 3 -4 -4 -5 0 -6 8 -9 -5 -5
|
|
22
|
+
v -6 1 -3 -13 -7 -6 -5 -12 -15 0 -16 0 0 -1 -1 0 -9 3 -10 -11
|
|
23
|
+
w 0 -7 -2 -6 4 -4 -5 -5 5 -8 -6 -8 -4 -2 -6 -11 -5 -10 8 -6
|
|
24
|
+
y -2 -9 -2 -3 -4 -2 3 -8 -8 -9 -9 -5 -5 -5 -3 -9 -5 -11 -6 9
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2013 Poulain, A. G. de Brevern
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# PB substitution matrix, adapted from PBxplore
|
|
2
|
+
a b c d e f g h i j k l m n o p
|
|
3
|
+
a 516 -59 113 -105 -411 -177 -27 -361 47 -103 -644 -259 -599 -372 -124 -83
|
|
4
|
+
b -59 541 -146 -210 -155 -310 -97 90 182 -128 -30 29 -745 -242 -165 22
|
|
5
|
+
c 113 -146 360 -14 -333 -240 49 -438 -269 -282 -688 -682 -608 -455 -147 6
|
|
6
|
+
d -105 -210 -14 221 5 -131 -349 -278 -253 -173 -585 -670 -1573 -1048 -691 -497
|
|
7
|
+
e -411 -155 -333 5 520 185 186 138 -378 -70 -112 -514 -1136 -469 -617 -632
|
|
8
|
+
f -177 -310 -240 -131 185 459 -99 -45 -445 83 -214 -88 -547 -629 -406 -552
|
|
9
|
+
g -27 -97 49 -349 186 -99 665 -99 -89 -118 -409 -138 -124 172 128 254
|
|
10
|
+
h -361 90 -438 -278 138 -45 -99 632 -205 316 192 -108 -712 -359 95 -399
|
|
11
|
+
i 47 182 -269 -253 -378 -445 -89 -205 696 186 8 15 -709 -269 -169 226
|
|
12
|
+
j -103 -128 -282 -173 -70 83 -118 316 186 768 196 5 -398 -340 -117 -104
|
|
13
|
+
k -644 -30 -688 -585 -112 -214 -409 192 8 196 568 -65 -270 -231 -471 -382
|
|
14
|
+
l -259 29 -682 -670 -514 -88 -138 -108 15 5 -65 533 -131 8 -11 -316
|
|
15
|
+
m -599 -745 -608 -1573 -1136 -547 -124 -712 -709 -398 -270 -131 241 -4 -190 -155
|
|
16
|
+
n -372 -242 -455 -1048 -469 -629 172 -359 -269 -340 -231 8 -4 703 88 146
|
|
17
|
+
o -124 -165 -147 -691 -617 -406 128 95 -169 -117 -471 -11 -190 88 716 58
|
|
18
|
+
p -83 22 6 -497 -632 -552 254 -399 226 -104 -382 -316 -155 146 58 609
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
biotite/sequence/alphabet.py
CHANGED
|
Binary file
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
{
|
|
2
|
+
"comment": "Generated with 'gecos --matrix 3Di --name flower --lmin 60 --lmax 80 -f 3di_flower.json'",
|
|
3
|
+
"name": "flower",
|
|
4
|
+
"alphabet": [
|
|
5
|
+
"a",
|
|
6
|
+
"c",
|
|
7
|
+
"d",
|
|
8
|
+
"e",
|
|
9
|
+
"f",
|
|
10
|
+
"g",
|
|
11
|
+
"h",
|
|
12
|
+
"i",
|
|
13
|
+
"k",
|
|
14
|
+
"l",
|
|
15
|
+
"m",
|
|
16
|
+
"n",
|
|
17
|
+
"p",
|
|
18
|
+
"q",
|
|
19
|
+
"r",
|
|
20
|
+
"s",
|
|
21
|
+
"t",
|
|
22
|
+
"v",
|
|
23
|
+
"w",
|
|
24
|
+
"y"
|
|
25
|
+
],
|
|
26
|
+
"colors": {
|
|
27
|
+
"a": "#a189a1",
|
|
28
|
+
"c": "#ff5806",
|
|
29
|
+
"d": "#ab9a93",
|
|
30
|
+
"e": "#e754d5",
|
|
31
|
+
"f": "#8191b5",
|
|
32
|
+
"g": "#cbc7ae",
|
|
33
|
+
"h": "#dac1bc",
|
|
34
|
+
"i": "#5eaf6e",
|
|
35
|
+
"k": "#04c1fd",
|
|
36
|
+
"l": "#ff544b",
|
|
37
|
+
"m": "#07e560",
|
|
38
|
+
"n": "#f28d05",
|
|
39
|
+
"p": "#b68767",
|
|
40
|
+
"q": "#bc8277",
|
|
41
|
+
"r": "#eebe86",
|
|
42
|
+
"s": "#ffa103",
|
|
43
|
+
"t": "#a4c49a",
|
|
44
|
+
"v": "#ed6903",
|
|
45
|
+
"w": "#3a97d8",
|
|
46
|
+
"y": "#f7adfd"
|
|
47
|
+
}
|
|
48
|
+
}
|
|
@@ -94,27 +94,32 @@ def get_color_scheme(name, alphabet, default="#FFFFFF"):
|
|
|
94
94
|
>>> print(color_scheme)
|
|
95
95
|
['#3737f5', '#37f537', '#f5f537', '#f53737']
|
|
96
96
|
"""
|
|
97
|
+
# Try exact alphabet match first
|
|
98
|
+
for scheme in _color_schemes:
|
|
99
|
+
if scheme["name"] == name and scheme["alphabet"] == alphabet:
|
|
100
|
+
return _fit_color_scheme(alphabet, scheme, default)
|
|
101
|
+
# If no exact match was found, try to find a scheme for an alphabet
|
|
102
|
+
# that extends the given alphabet
|
|
97
103
|
for scheme in _color_schemes:
|
|
98
104
|
if scheme["name"] == name and scheme["alphabet"].extends(alphabet):
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
colors = [color if color is not None else default for color in colors]
|
|
102
|
-
# Only return colors that are in scope of this alphabet
|
|
103
|
-
# and not the extended alphabet
|
|
104
|
-
return colors[: len(alphabet)]
|
|
105
|
+
return _fit_color_scheme(alphabet, scheme, default)
|
|
106
|
+
|
|
105
107
|
raise ValueError(f"Unkown scheme '{name}' for given alphabet")
|
|
106
108
|
|
|
107
109
|
|
|
108
|
-
def list_color_scheme_names(alphabet):
|
|
110
|
+
def list_color_scheme_names(alphabet, strict=False):
|
|
109
111
|
"""
|
|
110
112
|
Get a list of available color scheme names for a given alphabet.
|
|
111
113
|
|
|
112
114
|
Parameters
|
|
113
115
|
----------
|
|
114
116
|
alphabet : Alphabet
|
|
115
|
-
The
|
|
116
|
-
|
|
117
|
-
to
|
|
117
|
+
The alphabet to get the color scheme names for.
|
|
118
|
+
strict : bool, optional
|
|
119
|
+
If set to true, only schemes with an exact match to the given
|
|
120
|
+
alphabet are included in the list.
|
|
121
|
+
If set to false, schemes with an alphabet that extends the given
|
|
122
|
+
alphabet are also included.
|
|
118
123
|
|
|
119
124
|
Returns
|
|
120
125
|
-------
|
|
@@ -123,7 +128,9 @@ def list_color_scheme_names(alphabet):
|
|
|
123
128
|
"""
|
|
124
129
|
scheme_list = []
|
|
125
130
|
for scheme in _color_schemes:
|
|
126
|
-
if scheme["alphabet"]
|
|
131
|
+
if strict and scheme["alphabet"] == alphabet:
|
|
132
|
+
scheme_list.append(scheme["name"])
|
|
133
|
+
if not strict and scheme["alphabet"].extends(alphabet):
|
|
127
134
|
scheme_list.append(scheme["name"])
|
|
128
135
|
return scheme_list
|
|
129
136
|
|
|
@@ -135,3 +142,29 @@ _color_schemes = []
|
|
|
135
142
|
for file_name in glob.glob(_scheme_dir + os.sep + "*.json"):
|
|
136
143
|
scheme = load_color_scheme(file_name)
|
|
137
144
|
_color_schemes.append(scheme)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _fit_color_scheme(alphabet, color_scheme, default_color):
|
|
148
|
+
"""
|
|
149
|
+
Fit a color scheme to the given alphabet.
|
|
150
|
+
|
|
151
|
+
Parameters
|
|
152
|
+
----------
|
|
153
|
+
alphabet : Alphabet
|
|
154
|
+
The alphabet to get the color scheme for.
|
|
155
|
+
color_scheme : dict
|
|
156
|
+
The color scheme.
|
|
157
|
+
default_color : str or tuple
|
|
158
|
+
The default color.
|
|
159
|
+
|
|
160
|
+
Returns
|
|
161
|
+
-------
|
|
162
|
+
scheme : list of str
|
|
163
|
+
The colors from the scheme.
|
|
164
|
+
"""
|
|
165
|
+
colors = color_scheme["colors"]
|
|
166
|
+
# Replace None values with default color
|
|
167
|
+
colors = [color if color is not None else default_color for color in colors]
|
|
168
|
+
# Only return colors that are in scope of this alphabet
|
|
169
|
+
# and not the extended alphabet
|
|
170
|
+
return colors[: len(alphabet)]
|
|
Binary file
|
|
Binary file
|
|
Binary file
|