biotite 1.0.1__cp311-cp311-win_amd64.whl → 1.2.0__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/application/application.py +3 -3
- biotite/application/autodock/app.py +1 -1
- biotite/application/blast/webapp.py +1 -1
- biotite/application/clustalo/app.py +1 -1
- biotite/application/dssp/app.py +13 -3
- biotite/application/localapp.py +36 -2
- biotite/application/msaapp.py +10 -10
- biotite/application/muscle/app3.py +5 -18
- biotite/application/muscle/app5.py +5 -5
- biotite/application/sra/app.py +0 -5
- biotite/application/util.py +22 -2
- biotite/application/viennarna/rnaalifold.py +8 -8
- biotite/application/viennarna/rnaplot.py +9 -3
- biotite/application/viennarna/util.py +1 -1
- biotite/application/webapp.py +1 -1
- biotite/database/afdb/__init__.py +12 -0
- biotite/database/afdb/download.py +191 -0
- biotite/database/entrez/dbnames.py +10 -0
- biotite/database/entrez/download.py +9 -10
- biotite/database/entrez/key.py +1 -1
- biotite/database/entrez/query.py +5 -4
- biotite/database/pubchem/download.py +6 -6
- biotite/database/pubchem/error.py +10 -0
- biotite/database/pubchem/query.py +12 -23
- biotite/database/rcsb/download.py +3 -2
- biotite/database/rcsb/query.py +8 -9
- biotite/database/uniprot/check.py +22 -17
- biotite/database/uniprot/download.py +3 -6
- biotite/database/uniprot/query.py +4 -5
- biotite/file.py +14 -2
- biotite/interface/__init__.py +19 -0
- biotite/interface/openmm/__init__.py +16 -0
- biotite/interface/openmm/state.py +93 -0
- biotite/interface/openmm/system.py +227 -0
- biotite/interface/pymol/__init__.py +198 -0
- biotite/interface/pymol/cgo.py +346 -0
- biotite/interface/pymol/convert.py +185 -0
- biotite/interface/pymol/display.py +267 -0
- biotite/interface/pymol/object.py +1226 -0
- biotite/interface/pymol/shapes.py +178 -0
- biotite/interface/pymol/startup.py +169 -0
- biotite/interface/rdkit/__init__.py +15 -0
- biotite/interface/rdkit/mol.py +490 -0
- biotite/interface/version.py +71 -0
- biotite/interface/warning.py +19 -0
- biotite/sequence/align/__init__.py +0 -4
- biotite/sequence/align/alignment.py +49 -14
- biotite/sequence/align/banded.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/banded.pyx +26 -26
- biotite/sequence/align/cigar.py +2 -2
- biotite/sequence/align/kmeralphabet.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmeralphabet.pyx +19 -2
- biotite/sequence/align/kmersimilarity.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.pyx +58 -48
- biotite/sequence/align/localgapped.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/localgapped.pyx +47 -47
- biotite/sequence/align/localungapped.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/localungapped.pyx +10 -10
- biotite/sequence/align/matrix.py +284 -57
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/multiple.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/pairwise.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/pairwise.pyx +35 -35
- biotite/sequence/align/permutation.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.pyx +2 -2
- biotite/sequence/align/statistics.py +1 -1
- biotite/sequence/align/tracetable.cp311-win_amd64.pyd +0 -0
- biotite/sequence/alphabet.py +5 -2
- biotite/sequence/annotation.py +19 -13
- biotite/sequence/codec.cp311-win_amd64.pyd +0 -0
- biotite/sequence/codon.py +1 -2
- biotite/sequence/graphics/alignment.py +25 -39
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +2 -1
- biotite/sequence/graphics/colorschemes.py +44 -11
- biotite/sequence/graphics/dendrogram.py +4 -2
- biotite/sequence/graphics/features.py +2 -2
- biotite/sequence/graphics/logo.py +10 -12
- biotite/sequence/io/fasta/convert.py +1 -2
- biotite/sequence/io/fasta/file.py +1 -1
- biotite/sequence/io/fastq/file.py +3 -3
- biotite/sequence/io/genbank/file.py +3 -3
- biotite/sequence/io/genbank/sequence.py +2 -0
- biotite/sequence/io/gff/convert.py +1 -1
- biotite/sequence/io/gff/file.py +1 -2
- biotite/sequence/phylo/nj.cp311-win_amd64.pyd +0 -0
- biotite/sequence/phylo/tree.cp311-win_amd64.pyd +0 -0
- biotite/sequence/phylo/upgma.cp311-win_amd64.pyd +0 -0
- biotite/sequence/profile.py +105 -29
- biotite/sequence/search.py +0 -1
- biotite/sequence/seqtypes.py +136 -8
- biotite/sequence/sequence.py +1 -2
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +6 -3
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +109 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +170 -0
- biotite/structure/alphabet/unkerasify.py +128 -0
- biotite/structure/atoms.py +163 -66
- biotite/structure/basepairs.py +26 -26
- biotite/structure/bonds.cp311-win_amd64.pyd +0 -0
- biotite/structure/bonds.pyx +79 -25
- biotite/structure/box.py +19 -21
- biotite/structure/celllist.cp311-win_amd64.pyd +0 -0
- biotite/structure/celllist.pyx +83 -67
- biotite/structure/chains.py +5 -37
- biotite/structure/charges.cp311-win_amd64.pyd +0 -0
- biotite/structure/compare.py +420 -13
- biotite/structure/density.py +1 -1
- biotite/structure/dotbracket.py +27 -28
- biotite/structure/filter.py +8 -8
- biotite/structure/geometry.py +74 -127
- biotite/structure/hbond.py +17 -19
- biotite/structure/info/__init__.py +1 -0
- biotite/structure/info/atoms.py +24 -15
- biotite/structure/info/bonds.py +12 -6
- biotite/structure/info/ccd.py +125 -34
- biotite/structure/info/{ccd/components.bcif → components.bcif} +0 -0
- biotite/structure/info/groups.py +62 -19
- biotite/structure/info/masses.py +9 -6
- biotite/structure/info/misc.py +15 -22
- biotite/structure/info/radii.py +92 -22
- biotite/structure/info/standardize.py +4 -4
- biotite/structure/integrity.py +4 -6
- biotite/structure/io/general.py +2 -2
- biotite/structure/io/gro/file.py +8 -9
- biotite/structure/io/mol/convert.py +1 -1
- biotite/structure/io/mol/ctab.py +33 -28
- biotite/structure/io/mol/mol.py +1 -1
- biotite/structure/io/mol/sdf.py +80 -53
- biotite/structure/io/pdb/convert.py +4 -3
- biotite/structure/io/pdb/file.py +85 -25
- biotite/structure/io/pdb/hybrid36.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/pdbqt/file.py +36 -36
- biotite/structure/io/pdbx/__init__.py +1 -0
- biotite/structure/io/pdbx/bcif.py +54 -15
- biotite/structure/io/pdbx/cif.py +92 -66
- biotite/structure/io/pdbx/component.py +15 -4
- biotite/structure/io/pdbx/compress.py +321 -0
- biotite/structure/io/pdbx/convert.py +410 -75
- biotite/structure/io/pdbx/encoding.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/pdbx/encoding.pyx +98 -17
- biotite/structure/io/trajfile.py +9 -6
- biotite/structure/io/util.py +38 -0
- biotite/structure/mechanics.py +0 -1
- biotite/structure/molecules.py +141 -156
- biotite/structure/pseudoknots.py +7 -13
- biotite/structure/repair.py +2 -4
- biotite/structure/residues.py +13 -24
- biotite/structure/rings.py +335 -0
- biotite/structure/sasa.cp311-win_amd64.pyd +0 -0
- biotite/structure/sasa.pyx +2 -1
- biotite/structure/segments.py +69 -11
- biotite/structure/sequence.py +0 -1
- biotite/structure/sse.py +0 -2
- biotite/structure/superimpose.py +74 -62
- biotite/structure/tm.py +581 -0
- biotite/structure/transform.py +12 -25
- biotite/structure/util.py +76 -4
- biotite/version.py +9 -4
- biotite/visualize.py +111 -1
- {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/METADATA +6 -2
- {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/RECORD +173 -143
- biotite/structure/info/ccd/README.rst +0 -8
- biotite/structure/info/ccd/amino_acids.txt +0 -1663
- biotite/structure/info/ccd/carbohydrates.txt +0 -1135
- biotite/structure/info/ccd/nucleotides.txt +0 -798
- {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/WHEEL +0 -0
- {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/licenses/LICENSE.rst +0 -0
biotite/sequence/align/matrix.py
CHANGED
|
@@ -2,14 +2,21 @@
|
|
|
2
2
|
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
3
|
# information.
|
|
4
4
|
|
|
5
|
+
__all__ = ["SubstitutionMatrix"]
|
|
5
6
|
__name__ = "biotite.sequence.align"
|
|
6
7
|
__author__ = "Patrick Kunzmann"
|
|
7
8
|
|
|
8
|
-
import
|
|
9
|
+
import functools
|
|
10
|
+
from pathlib import Path
|
|
9
11
|
import numpy as np
|
|
10
|
-
from biotite.sequence.seqtypes import
|
|
12
|
+
from biotite.sequence.seqtypes import (
|
|
13
|
+
NucleotideSequence,
|
|
14
|
+
PositionalSequence,
|
|
15
|
+
ProteinSequence,
|
|
16
|
+
)
|
|
11
17
|
|
|
12
|
-
|
|
18
|
+
# Directory of matrix files
|
|
19
|
+
_DB_DIR = Path(__file__).parent / "matrix_data"
|
|
13
20
|
|
|
14
21
|
|
|
15
22
|
class SubstitutionMatrix(object):
|
|
@@ -59,6 +66,11 @@ class SubstitutionMatrix(object):
|
|
|
59
66
|
- **RBLOSUM<n>_<BLOCKS>**
|
|
60
67
|
- **CorBLOSUM<n>_<BLOCKS>**
|
|
61
68
|
|
|
69
|
+
- Structural alphabet substitution matrices
|
|
70
|
+
|
|
71
|
+
- **3Di** - For 3Di alphabet from ``foldseek`` :footcite:`VanKempen2024`
|
|
72
|
+
- **PB** - For Protein Blocks alphabet from *PBxplore* :footcite:`Barnoud2017`
|
|
73
|
+
|
|
62
74
|
A list of all available matrix names is returned by
|
|
63
75
|
:meth:`list_db()`.
|
|
64
76
|
|
|
@@ -78,6 +90,11 @@ class SubstitutionMatrix(object):
|
|
|
78
90
|
or a dictionary mapping the symbol pairing to scores,
|
|
79
91
|
or a string referencing a matrix in the internal database.
|
|
80
92
|
|
|
93
|
+
Attributes
|
|
94
|
+
----------
|
|
95
|
+
shape : tuple
|
|
96
|
+
The shape of the substitution matrix.
|
|
97
|
+
|
|
81
98
|
Raises
|
|
82
99
|
------
|
|
83
100
|
KeyError
|
|
@@ -110,7 +127,7 @@ class SubstitutionMatrix(object):
|
|
|
110
127
|
Creating an identity substitution matrix via the score matrix:
|
|
111
128
|
|
|
112
129
|
>>> alph = NucleotideSequence.alphabet_unamb
|
|
113
|
-
>>> matrix = SubstitutionMatrix(alph, alph, np.identity(len(alph)))
|
|
130
|
+
>>> matrix = SubstitutionMatrix(alph, alph, np.identity(len(alph), dtype=int))
|
|
114
131
|
>>> print(matrix)
|
|
115
132
|
A C G T
|
|
116
133
|
A 1 0 0 0
|
|
@@ -124,9 +141,6 @@ class SubstitutionMatrix(object):
|
|
|
124
141
|
>>> matrix = SubstitutionMatrix(alph, alph, "BLOSUM50")
|
|
125
142
|
"""
|
|
126
143
|
|
|
127
|
-
# Directory of matrix files
|
|
128
|
-
_db_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "matrix_data")
|
|
129
|
-
|
|
130
144
|
def __init__(self, alphabet1, alphabet2, score_matrix):
|
|
131
145
|
self._alph1 = alphabet1
|
|
132
146
|
self._alph2 = alphabet2
|
|
@@ -139,46 +153,44 @@ class SubstitutionMatrix(object):
|
|
|
139
153
|
f"Matrix has shape {score_matrix.shape}, "
|
|
140
154
|
f"but {alph_shape} is required"
|
|
141
155
|
)
|
|
156
|
+
if not np.issubdtype(score_matrix.dtype, np.integer):
|
|
157
|
+
raise TypeError("Score matrix must be an integer ndarray")
|
|
142
158
|
self._matrix = score_matrix.astype(np.int32)
|
|
159
|
+
# If the score matrix was converted from a float matrix,
|
|
160
|
+
# inf values would be converted to 2**31,
|
|
161
|
+
# which is probably undesired and gives overflow issues in the alignment
|
|
162
|
+
# functions
|
|
163
|
+
if (
|
|
164
|
+
np.any(self._matrix == np.iinfo(np.int32).max) or
|
|
165
|
+
np.any(self._matrix == np.iinfo(np.int32).min)
|
|
166
|
+
): # fmt: skip
|
|
167
|
+
raise ValueError(
|
|
168
|
+
"Score values are too large. "
|
|
169
|
+
"Maybe it was converted from a float matrix containing inf values?"
|
|
170
|
+
)
|
|
143
171
|
elif isinstance(score_matrix, str):
|
|
144
172
|
matrix_dict = SubstitutionMatrix.dict_from_db(score_matrix)
|
|
145
173
|
self._fill_with_matrix_dict(matrix_dict)
|
|
146
174
|
else:
|
|
147
175
|
raise TypeError(
|
|
148
|
-
"Matrix must be either a dictionary,
|
|
176
|
+
"Matrix must be either a dictionary, an 2-D ndarray or a string"
|
|
149
177
|
)
|
|
150
178
|
# This class is immutable and has a getter function for the
|
|
151
179
|
# score matrix -> make the score matrix read-only
|
|
152
180
|
self._matrix.setflags(write=False)
|
|
153
181
|
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
)
|
|
160
|
-
|
|
161
|
-
def __eq__(self, item):
|
|
162
|
-
if not isinstance(item, SubstitutionMatrix):
|
|
163
|
-
return False
|
|
164
|
-
if self._alph1 != item.get_alphabet1():
|
|
165
|
-
return False
|
|
166
|
-
if self._alph2 != item.get_alphabet2():
|
|
167
|
-
return False
|
|
168
|
-
if not np.array_equal(self.score_matrix(), item.score_matrix()):
|
|
169
|
-
return False
|
|
170
|
-
return True
|
|
171
|
-
|
|
172
|
-
def __ne__(self, item):
|
|
173
|
-
return not self == item
|
|
182
|
+
@property
|
|
183
|
+
def shape(self):
|
|
184
|
+
"""
|
|
185
|
+
Get the shape (i.e. the length of both alphabets)
|
|
186
|
+
of the substitution matrix.
|
|
174
187
|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
self._matrix[i, j] = int(matrix_dict[sym1, sym2])
|
|
188
|
+
Returns
|
|
189
|
+
-------
|
|
190
|
+
shape : tuple
|
|
191
|
+
Matrix shape.
|
|
192
|
+
"""
|
|
193
|
+
return (len(self._alph1), len(self._alph2))
|
|
182
194
|
|
|
183
195
|
def get_alphabet1(self):
|
|
184
196
|
"""
|
|
@@ -280,28 +292,157 @@ class SubstitutionMatrix(object):
|
|
|
280
292
|
code2 = self._alph2.encode(symbol2)
|
|
281
293
|
return self._matrix[code1, code2]
|
|
282
294
|
|
|
283
|
-
def
|
|
295
|
+
def as_positional(self, sequence1, sequence2):
|
|
284
296
|
"""
|
|
285
|
-
|
|
286
|
-
|
|
297
|
+
Transform this substitution matrix and two sequences into positional
|
|
298
|
+
equivalents.
|
|
299
|
+
|
|
300
|
+
This means the new substitution matrix is position-specific: It has the lengths
|
|
301
|
+
of the sequences instead of the lengths of their alphabets.
|
|
302
|
+
Its scores represent the same scores as the original matrix, but now mapped
|
|
303
|
+
onto the positions of the sequences.
|
|
304
|
+
|
|
305
|
+
Parameters
|
|
306
|
+
----------
|
|
307
|
+
sequence1, sequence2 : seq.Sequence, length=n
|
|
308
|
+
The sequences to create the positional equivalents from.
|
|
287
309
|
|
|
288
310
|
Returns
|
|
289
311
|
-------
|
|
290
|
-
|
|
291
|
-
|
|
312
|
+
pos_matrix : align.SubstitutionMatrix, shape=(n, n)
|
|
313
|
+
The position-specific substitution matrix.
|
|
314
|
+
pos_sequence1, pos_sequence2 : PositionalSequence, length=n
|
|
315
|
+
The positional sequences.
|
|
316
|
+
|
|
317
|
+
Notes
|
|
318
|
+
-----
|
|
319
|
+
After the transformation the substitution scores remain the same, i.e.
|
|
320
|
+
`substitution_matrix.get_score(sequence1[i], sequence2[j])` is equal to
|
|
321
|
+
`pos_matrix.get_score(pos_sequence1[i], pos_sequence2[j])`.
|
|
322
|
+
|
|
323
|
+
Examples
|
|
324
|
+
--------
|
|
325
|
+
|
|
326
|
+
Run an alignment with the usual substitution matrix:
|
|
327
|
+
|
|
328
|
+
>>> seq1 = ProteinSequence("BIQTITE")
|
|
329
|
+
>>> seq2 = ProteinSequence("IQLITE")
|
|
330
|
+
>>> matrix = SubstitutionMatrix.std_protein_matrix()
|
|
331
|
+
>>> print(matrix)
|
|
332
|
+
A C D E F G H I K L M N P Q R S T V W Y B Z X *
|
|
333
|
+
A 4 0 -2 -1 -2 0 -2 -1 -1 -1 -1 -2 -1 -1 -1 1 0 0 -3 -2 -2 -1 0 -4
|
|
334
|
+
C 0 9 -3 -4 -2 -3 -3 -1 -3 -1 -1 -3 -3 -3 -3 -1 -1 -1 -2 -2 -3 -3 -2 -4
|
|
335
|
+
D -2 -3 6 2 -3 -1 -1 -3 -1 -4 -3 1 -1 0 -2 0 -1 -3 -4 -3 4 1 -1 -4
|
|
336
|
+
E -1 -4 2 5 -3 -2 0 -3 1 -3 -2 0 -1 2 0 0 -1 -2 -3 -2 1 4 -1 -4
|
|
337
|
+
F -2 -2 -3 -3 6 -3 -1 0 -3 0 0 -3 -4 -3 -3 -2 -2 -1 1 3 -3 -3 -1 -4
|
|
338
|
+
G 0 -3 -1 -2 -3 6 -2 -4 -2 -4 -3 0 -2 -2 -2 0 -2 -3 -2 -3 -1 -2 -1 -4
|
|
339
|
+
H -2 -3 -1 0 -1 -2 8 -3 -1 -3 -2 1 -2 0 0 -1 -2 -3 -2 2 0 0 -1 -4
|
|
340
|
+
I -1 -1 -3 -3 0 -4 -3 4 -3 2 1 -3 -3 -3 -3 -2 -1 3 -3 -1 -3 -3 -1 -4
|
|
341
|
+
K -1 -3 -1 1 -3 -2 -1 -3 5 -2 -1 0 -1 1 2 0 -1 -2 -3 -2 0 1 -1 -4
|
|
342
|
+
L -1 -1 -4 -3 0 -4 -3 2 -2 4 2 -3 -3 -2 -2 -2 -1 1 -2 -1 -4 -3 -1 -4
|
|
343
|
+
M -1 -1 -3 -2 0 -3 -2 1 -1 2 5 -2 -2 0 -1 -1 -1 1 -1 -1 -3 -1 -1 -4
|
|
344
|
+
N -2 -3 1 0 -3 0 1 -3 0 -3 -2 6 -2 0 0 1 0 -3 -4 -2 3 0 -1 -4
|
|
345
|
+
P -1 -3 -1 -1 -4 -2 -2 -3 -1 -3 -2 -2 7 -1 -2 -1 -1 -2 -4 -3 -2 -1 -2 -4
|
|
346
|
+
Q -1 -3 0 2 -3 -2 0 -3 1 -2 0 0 -1 5 1 0 -1 -2 -2 -1 0 3 -1 -4
|
|
347
|
+
R -1 -3 -2 0 -3 -2 0 -3 2 -2 -1 0 -2 1 5 -1 -1 -3 -3 -2 -1 0 -1 -4
|
|
348
|
+
S 1 -1 0 0 -2 0 -1 -2 0 -2 -1 1 -1 0 -1 4 1 -2 -3 -2 0 0 0 -4
|
|
349
|
+
T 0 -1 -1 -1 -2 -2 -2 -1 -1 -1 -1 0 -1 -1 -1 1 5 0 -2 -2 -1 -1 0 -4
|
|
350
|
+
V 0 -1 -3 -2 -1 -3 -3 3 -2 1 1 -3 -2 -2 -3 -2 0 4 -3 -1 -3 -2 -1 -4
|
|
351
|
+
W -3 -2 -4 -3 1 -2 -2 -3 -3 -2 -1 -4 -4 -2 -3 -3 -2 -3 11 2 -4 -3 -2 -4
|
|
352
|
+
Y -2 -2 -3 -2 3 -3 2 -1 -2 -1 -1 -2 -3 -1 -2 -2 -2 -1 2 7 -3 -2 -1 -4
|
|
353
|
+
B -2 -3 4 1 -3 -1 0 -3 0 -4 -3 3 -2 0 -1 0 -1 -3 -4 -3 4 1 -1 -4
|
|
354
|
+
Z -1 -3 1 4 -3 -2 0 -3 1 -3 -1 0 -1 3 0 0 -1 -2 -3 -2 1 4 -1 -4
|
|
355
|
+
X 0 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 -1 -1 0 0 -1 -2 -1 -1 -1 -1 -4
|
|
356
|
+
* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
|
|
357
|
+
>>> alignment = align_optimal(seq1, seq2, matrix, gap_penalty=-10)[0]
|
|
358
|
+
>>> print(alignment)
|
|
359
|
+
BIQTITE
|
|
360
|
+
-IQLITE
|
|
361
|
+
|
|
362
|
+
Running the alignment with positional equivalents gives the same result:
|
|
363
|
+
|
|
364
|
+
>>> pos_matrix, pos_seq1, pos_seq2 = matrix.as_positional(seq1, seq2)
|
|
365
|
+
>>> print(pos_matrix)
|
|
366
|
+
I Q L I T E
|
|
367
|
+
B -3 0 -4 -3 -1 1
|
|
368
|
+
I 4 -3 2 4 -1 -3
|
|
369
|
+
Q -3 5 -2 -3 -1 2
|
|
370
|
+
T -1 -1 -1 -1 5 -1
|
|
371
|
+
I 4 -3 2 4 -1 -3
|
|
372
|
+
T -1 -1 -1 -1 5 -1
|
|
373
|
+
E -3 2 -3 -3 -1 5
|
|
374
|
+
>>> pos_alignment = align_optimal(pos_seq1, pos_seq2, pos_matrix, gap_penalty=-10)[0]
|
|
375
|
+
>>> print(pos_alignment)
|
|
376
|
+
BIQTITE
|
|
377
|
+
-IQLITE
|
|
378
|
+
|
|
379
|
+
Increase the substitution score for the first symbols in both sequences to align
|
|
380
|
+
to each other:
|
|
381
|
+
|
|
382
|
+
>>> score_matrix = pos_matrix.score_matrix().copy()
|
|
383
|
+
>>> score_matrix[0, 0] = 100
|
|
384
|
+
>>> biased_matrix = SubstitutionMatrix(
|
|
385
|
+
... pos_matrix.get_alphabet1(), pos_matrix.get_alphabet2(), score_matrix
|
|
386
|
+
... )
|
|
387
|
+
>>> print(biased_matrix)
|
|
388
|
+
I Q L I T E
|
|
389
|
+
B 100 0 -4 -3 -1 1
|
|
390
|
+
I 4 -3 2 4 -1 -3
|
|
391
|
+
Q -3 5 -2 -3 -1 2
|
|
392
|
+
T -1 -1 -1 -1 5 -1
|
|
393
|
+
I 4 -3 2 4 -1 -3
|
|
394
|
+
T -1 -1 -1 -1 5 -1
|
|
395
|
+
E -3 2 -3 -3 -1 5
|
|
396
|
+
>>> biased_alignment = align_optimal(pos_seq1, pos_seq2, biased_matrix, gap_penalty=-10)[0]
|
|
397
|
+
>>> print(biased_alignment)
|
|
398
|
+
BIQTITE
|
|
399
|
+
I-QLITE
|
|
292
400
|
"""
|
|
293
|
-
|
|
401
|
+
pos_sequence1 = PositionalSequence(sequence1)
|
|
402
|
+
pos_sequence2 = PositionalSequence(sequence2)
|
|
403
|
+
|
|
404
|
+
pos_score_matrix = self._matrix[
|
|
405
|
+
tuple(_cartesian_product(sequence1.code, sequence2.code).T)
|
|
406
|
+
].reshape(len(sequence1), len(sequence2))
|
|
407
|
+
pos_matrix = SubstitutionMatrix(
|
|
408
|
+
pos_sequence1.get_alphabet(),
|
|
409
|
+
pos_sequence2.get_alphabet(),
|
|
410
|
+
pos_score_matrix,
|
|
411
|
+
)
|
|
412
|
+
|
|
413
|
+
return pos_matrix, pos_sequence1, pos_sequence2
|
|
414
|
+
|
|
415
|
+
def __repr__(self):
|
|
416
|
+
"""Represent SubstitutionMatrix as a string for debugging."""
|
|
417
|
+
return (
|
|
418
|
+
f"SubstitutionMatrix({self._alph1.__repr__()}, {self._alph2.__repr__()}, "
|
|
419
|
+
f"np.{np.array_repr(self._matrix)})"
|
|
420
|
+
)
|
|
421
|
+
|
|
422
|
+
def __eq__(self, item):
|
|
423
|
+
if not isinstance(item, SubstitutionMatrix):
|
|
424
|
+
return False
|
|
425
|
+
if self._alph1 != item.get_alphabet1():
|
|
426
|
+
return False
|
|
427
|
+
if self._alph2 != item.get_alphabet2():
|
|
428
|
+
return False
|
|
429
|
+
if not np.array_equal(self.score_matrix(), item.score_matrix()):
|
|
430
|
+
return False
|
|
431
|
+
return True
|
|
432
|
+
|
|
433
|
+
def __ne__(self, item):
|
|
434
|
+
return not self == item
|
|
294
435
|
|
|
295
436
|
def __str__(self):
|
|
296
437
|
# Create matrix in NCBI format
|
|
297
438
|
string = " "
|
|
298
439
|
for symbol in self._alph2:
|
|
299
|
-
string += f" {symbol:>3}"
|
|
440
|
+
string += f" {str(symbol):>3}"
|
|
300
441
|
string += "\n"
|
|
301
442
|
for i, symbol in enumerate(self._alph1):
|
|
302
|
-
string += f"{symbol:>1}"
|
|
443
|
+
string += f"{str(symbol):>1}"
|
|
303
444
|
for j in range(len(self._alph2)):
|
|
304
|
-
string += f" {int(self._matrix[i,j]):>3d}"
|
|
445
|
+
string += f" {int(self._matrix[i, j]):>3d}"
|
|
305
446
|
string += "\n"
|
|
306
447
|
# Remove terminal line break
|
|
307
448
|
string = string[:-1]
|
|
@@ -318,6 +459,11 @@ class SubstitutionMatrix(object):
|
|
|
318
459
|
The keys of the dictionary consist of tuples containing the
|
|
319
460
|
aligned symbols and the values are the corresponding scores.
|
|
320
461
|
|
|
462
|
+
Parameters
|
|
463
|
+
----------
|
|
464
|
+
string : str
|
|
465
|
+
The string containing the substitution matrix in NCBI format.
|
|
466
|
+
|
|
321
467
|
Returns
|
|
322
468
|
-------
|
|
323
469
|
matrix_dict : dict
|
|
@@ -345,12 +491,17 @@ class SubstitutionMatrix(object):
|
|
|
345
491
|
The keys of the dictionary consist of tuples containing the
|
|
346
492
|
aligned symbols and the values are the corresponding scores.
|
|
347
493
|
|
|
494
|
+
Parameters
|
|
495
|
+
----------
|
|
496
|
+
matrix_name : str
|
|
497
|
+
The name of the matrix in the internal database.
|
|
498
|
+
|
|
348
499
|
Returns
|
|
349
500
|
-------
|
|
350
501
|
matrix_dict : dict
|
|
351
502
|
A dictionary representing the substitution matrix.
|
|
352
503
|
"""
|
|
353
|
-
filename =
|
|
504
|
+
filename = _DB_DIR / f"{matrix_name}.mat"
|
|
354
505
|
with open(filename, "r") as f:
|
|
355
506
|
return SubstitutionMatrix.dict_from_str(f.read())
|
|
356
507
|
|
|
@@ -364,11 +515,10 @@ class SubstitutionMatrix(object):
|
|
|
364
515
|
db_list : list
|
|
365
516
|
List of matrix names in the internal database.
|
|
366
517
|
"""
|
|
367
|
-
|
|
368
|
-
# Remove '.mat' from files
|
|
369
|
-
return [file[:-4] for file in sorted(files)]
|
|
518
|
+
return [path.stem for path in _DB_DIR.glob("*.mat")]
|
|
370
519
|
|
|
371
520
|
@staticmethod
|
|
521
|
+
@functools.cache
|
|
372
522
|
def std_protein_matrix():
|
|
373
523
|
"""
|
|
374
524
|
Get the default :class:`SubstitutionMatrix` for protein sequence
|
|
@@ -379,9 +529,12 @@ class SubstitutionMatrix(object):
|
|
|
379
529
|
matrix : SubstitutionMatrix
|
|
380
530
|
Default matrix.
|
|
381
531
|
"""
|
|
382
|
-
return
|
|
532
|
+
return SubstitutionMatrix(
|
|
533
|
+
ProteinSequence.alphabet, ProteinSequence.alphabet, "BLOSUM62"
|
|
534
|
+
)
|
|
383
535
|
|
|
384
536
|
@staticmethod
|
|
537
|
+
@functools.cache
|
|
385
538
|
def std_nucleotide_matrix():
|
|
386
539
|
"""
|
|
387
540
|
Get the default :class:`SubstitutionMatrix` for DNA sequence
|
|
@@ -392,13 +545,87 @@ class SubstitutionMatrix(object):
|
|
|
392
545
|
matrix : SubstitutionMatrix
|
|
393
546
|
Default matrix.
|
|
394
547
|
"""
|
|
395
|
-
return
|
|
548
|
+
return SubstitutionMatrix(
|
|
549
|
+
NucleotideSequence.alphabet_amb, NucleotideSequence.alphabet_amb, "NUC"
|
|
550
|
+
)
|
|
396
551
|
|
|
552
|
+
@staticmethod
|
|
553
|
+
@functools.cache
|
|
554
|
+
def std_3di_matrix():
|
|
555
|
+
"""
|
|
556
|
+
Get the default :class:`SubstitutionMatrix` for 3Di sequence
|
|
557
|
+
alignments.
|
|
558
|
+
:footcite:`VanKempen2024`
|
|
397
559
|
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
560
|
+
Returns
|
|
561
|
+
-------
|
|
562
|
+
matrix : SubstitutionMatrix
|
|
563
|
+
Default matrix.
|
|
564
|
+
"""
|
|
565
|
+
# Import inside function to avoid circular import
|
|
566
|
+
from biotite.structure.alphabet.i3d import I3DSequence
|
|
567
|
+
|
|
568
|
+
return SubstitutionMatrix(I3DSequence.alphabet, I3DSequence.alphabet, "3Di")
|
|
569
|
+
|
|
570
|
+
@staticmethod
|
|
571
|
+
@functools.cache
|
|
572
|
+
def std_protein_blocks_matrix(undefined_match=200, undefined_mismatch=-200):
|
|
573
|
+
"""
|
|
574
|
+
Get the default :class:`SubstitutionMatrix` for Protein Blocks sequences.
|
|
575
|
+
|
|
576
|
+
The matrix is adapted from *PBxplore* :footcite:`Barnoud2017`.
|
|
577
|
+
|
|
578
|
+
Parameters
|
|
579
|
+
----------
|
|
580
|
+
undefined_match, undefined_mismatch : int, optional
|
|
581
|
+
The match and mismatch score for undefined symbols.
|
|
582
|
+
The default values were chosen arbitrarily, but are in the order of
|
|
583
|
+
magnitude of the other score values.
|
|
584
|
+
|
|
585
|
+
Returns
|
|
586
|
+
-------
|
|
587
|
+
matrix : SubstitutionMatrix
|
|
588
|
+
Default matrix.
|
|
589
|
+
|
|
590
|
+
References
|
|
591
|
+
----------
|
|
592
|
+
|
|
593
|
+
.. footbibliography::
|
|
594
|
+
"""
|
|
595
|
+
from biotite.structure.alphabet.pb import ProteinBlocksSequence
|
|
596
|
+
|
|
597
|
+
alphabet = ProteinBlocksSequence.alphabet
|
|
598
|
+
undefined_symbol = ProteinBlocksSequence.undefined_symbol
|
|
599
|
+
matrix_dict = SubstitutionMatrix.dict_from_db("PB")
|
|
600
|
+
# Add match/mismatch scores for the undefined symbol
|
|
601
|
+
for symbol in alphabet:
|
|
602
|
+
if symbol == undefined_symbol:
|
|
603
|
+
continue
|
|
604
|
+
matrix_dict[symbol, undefined_symbol] = undefined_mismatch
|
|
605
|
+
matrix_dict[undefined_symbol, symbol] = undefined_mismatch
|
|
606
|
+
matrix_dict[undefined_symbol, undefined_symbol] = undefined_match
|
|
607
|
+
return SubstitutionMatrix(
|
|
608
|
+
alphabet,
|
|
609
|
+
alphabet,
|
|
610
|
+
matrix_dict,
|
|
611
|
+
)
|
|
612
|
+
|
|
613
|
+
def _fill_with_matrix_dict(self, matrix_dict):
|
|
614
|
+
self._matrix = np.zeros((len(self._alph1), len(self._alph2)), dtype=np.int32)
|
|
615
|
+
for i in range(len(self._alph1)):
|
|
616
|
+
for j in range(len(self._alph2)):
|
|
617
|
+
sym1 = self._alph1.decode(i)
|
|
618
|
+
sym2 = self._alph2.decode(j)
|
|
619
|
+
self._matrix[i, j] = int(matrix_dict[sym1, sym2])
|
|
620
|
+
|
|
621
|
+
|
|
622
|
+
def _cartesian_product(array1, array2):
|
|
623
|
+
"""
|
|
624
|
+
Create all combinations of elements from two arrays.
|
|
625
|
+
"""
|
|
626
|
+
return np.transpose(
|
|
627
|
+
[
|
|
628
|
+
np.repeat(array1, len(array2)),
|
|
629
|
+
np.tile(array2, len(array1)),
|
|
630
|
+
]
|
|
631
|
+
)
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# 3Di bit/2
|
|
2
|
+
# Background (precomputed optional): 0.0489372 0.0306991 0.101049 0.0329671 0.0276149 0.0416262 0.0452521 0.030876 0.0297251 0.0607036 0.0150238 0.0215826 0.0783843 0.0512926 0.0264886 0.0610702 0.0201311 0.215998 0.0310265 0.0295417 0.00001
|
|
3
|
+
# Lambda (precomputed optional): 0.351568
|
|
4
|
+
a c d e f g h i k l m n p q r s t v w y
|
|
5
|
+
a 6 -3 1 2 3 -2 -2 -7 -3 -3 -10 -5 -1 1 -4 -7 -5 -6 0 -2
|
|
6
|
+
c -3 6 -2 -8 -5 -4 -4 -12 -13 1 -14 0 0 1 -1 0 -8 1 -7 -9
|
|
7
|
+
d 1 -2 4 -3 0 1 1 -3 -5 -4 -5 -2 1 -1 -1 -4 -2 -3 -2 -2
|
|
8
|
+
e 2 -8 -3 9 -2 -7 -4 -12 -10 -7 -17 -8 -6 -3 -8 -10 -10 -13 -6 -3
|
|
9
|
+
f 3 -5 0 -2 7 -3 -3 -5 1 -3 -9 -5 -2 2 -5 -8 -3 -7 4 -4
|
|
10
|
+
g -2 -4 1 -7 -3 6 3 0 -7 -7 -1 -2 -2 -4 3 -3 4 -6 -4 -2
|
|
11
|
+
h -2 -4 1 -4 -3 3 6 -4 -7 -6 -6 0 -1 -3 1 -3 -1 -5 -5 3
|
|
12
|
+
i -7 -12 -3 -12 -5 0 -4 8 -5 -11 7 -7 -6 -6 -3 -9 6 -12 -5 -8
|
|
13
|
+
k -3 -13 -5 -10 1 -7 -7 -5 9 -11 -8 -12 -6 -5 -9 -14 -5 -15 5 -8
|
|
14
|
+
l -3 1 -4 -7 -3 -7 -6 -11 -11 6 -16 -3 -2 2 -4 -4 -9 0 -8 -9
|
|
15
|
+
m -10 -14 -5 -17 -9 -1 -6 7 -8 -16 10 -9 -9 -10 -5 -10 3 -16 -6 -9
|
|
16
|
+
n -5 0 -2 -8 -5 -2 0 -7 -12 -3 -9 7 0 -2 2 3 -4 0 -8 -5
|
|
17
|
+
p -1 0 1 -6 -2 -2 -1 -6 -6 -2 -9 0 4 0 0 -2 -4 0 -4 -5
|
|
18
|
+
q 1 1 -1 -3 2 -4 -3 -6 -5 2 -10 -2 0 5 -2 -4 -5 -1 -2 -5
|
|
19
|
+
r -4 -1 -1 -8 -5 3 1 -3 -9 -4 -5 2 0 -2 6 2 0 -1 -6 -3
|
|
20
|
+
s -7 0 -4 -10 -8 -3 -3 -9 -14 -4 -10 3 -2 -4 2 6 -6 0 -11 -9
|
|
21
|
+
t -5 -8 -2 -10 -3 4 -1 6 -5 -9 3 -4 -4 -5 0 -6 8 -9 -5 -5
|
|
22
|
+
v -6 1 -3 -13 -7 -6 -5 -12 -15 0 -16 0 0 -1 -1 0 -9 3 -10 -11
|
|
23
|
+
w 0 -7 -2 -6 4 -4 -5 -5 5 -8 -6 -8 -4 -2 -6 -11 -5 -10 8 -6
|
|
24
|
+
y -2 -9 -2 -3 -4 -2 3 -8 -8 -9 -9 -5 -5 -5 -3 -9 -5 -11 -6 9
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2013 Poulain, A. G. de Brevern
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# PB substitution matrix, adapted from PBxplore
|
|
2
|
+
a b c d e f g h i j k l m n o p
|
|
3
|
+
a 516 -59 113 -105 -411 -177 -27 -361 47 -103 -644 -259 -599 -372 -124 -83
|
|
4
|
+
b -59 541 -146 -210 -155 -310 -97 90 182 -128 -30 29 -745 -242 -165 22
|
|
5
|
+
c 113 -146 360 -14 -333 -240 49 -438 -269 -282 -688 -682 -608 -455 -147 6
|
|
6
|
+
d -105 -210 -14 221 5 -131 -349 -278 -253 -173 -585 -670 -1573 -1048 -691 -497
|
|
7
|
+
e -411 -155 -333 5 520 185 186 138 -378 -70 -112 -514 -1136 -469 -617 -632
|
|
8
|
+
f -177 -310 -240 -131 185 459 -99 -45 -445 83 -214 -88 -547 -629 -406 -552
|
|
9
|
+
g -27 -97 49 -349 186 -99 665 -99 -89 -118 -409 -138 -124 172 128 254
|
|
10
|
+
h -361 90 -438 -278 138 -45 -99 632 -205 316 192 -108 -712 -359 95 -399
|
|
11
|
+
i 47 182 -269 -253 -378 -445 -89 -205 696 186 8 15 -709 -269 -169 226
|
|
12
|
+
j -103 -128 -282 -173 -70 83 -118 316 186 768 196 5 -398 -340 -117 -104
|
|
13
|
+
k -644 -30 -688 -585 -112 -214 -409 192 8 196 568 -65 -270 -231 -471 -382
|
|
14
|
+
l -259 29 -682 -670 -514 -88 -138 -108 15 5 -65 533 -131 8 -11 -316
|
|
15
|
+
m -599 -745 -608 -1573 -1136 -547 -124 -712 -709 -398 -270 -131 241 -4 -190 -155
|
|
16
|
+
n -372 -242 -455 -1048 -469 -629 172 -359 -269 -340 -231 8 -4 703 88 146
|
|
17
|
+
o -124 -165 -147 -691 -617 -406 128 95 -169 -117 -471 -11 -190 88 716 58
|
|
18
|
+
p -83 22 6 -497 -632 -552 254 -399 226 -104 -382 -316 -155 146 58 609
|
|
Binary file
|
|
Binary file
|