biotite 0.41.1__cp312-cp312-win_amd64.whl → 1.0.0__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +2 -3
- biotite/application/__init__.py +36 -10
- biotite/application/application.py +22 -11
- biotite/application/autodock/__init__.py +1 -1
- biotite/application/autodock/app.py +74 -79
- biotite/application/blast/__init__.py +1 -1
- biotite/application/blast/alignment.py +19 -10
- biotite/application/blast/webapp.py +92 -85
- biotite/application/clustalo/__init__.py +1 -1
- biotite/application/clustalo/app.py +46 -61
- biotite/application/dssp/__init__.py +1 -1
- biotite/application/dssp/app.py +8 -11
- biotite/application/localapp.py +62 -60
- biotite/application/mafft/__init__.py +1 -1
- biotite/application/mafft/app.py +16 -22
- biotite/application/msaapp.py +78 -89
- biotite/application/muscle/__init__.py +1 -1
- biotite/application/muscle/app3.py +50 -64
- biotite/application/muscle/app5.py +23 -31
- biotite/application/sra/__init__.py +1 -1
- biotite/application/sra/app.py +64 -68
- biotite/application/tantan/__init__.py +1 -1
- biotite/application/tantan/app.py +22 -45
- biotite/application/util.py +7 -9
- biotite/application/viennarna/rnaalifold.py +34 -28
- biotite/application/viennarna/rnafold.py +24 -39
- biotite/application/viennarna/rnaplot.py +36 -21
- biotite/application/viennarna/util.py +17 -12
- biotite/application/webapp.py +13 -14
- biotite/copyable.py +13 -13
- biotite/database/__init__.py +1 -1
- biotite/database/entrez/__init__.py +1 -1
- biotite/database/entrez/check.py +2 -3
- biotite/database/entrez/dbnames.py +7 -5
- biotite/database/entrez/download.py +55 -49
- biotite/database/entrez/key.py +1 -1
- biotite/database/entrez/query.py +62 -23
- biotite/database/error.py +2 -1
- biotite/database/pubchem/__init__.py +1 -1
- biotite/database/pubchem/download.py +43 -45
- biotite/database/pubchem/error.py +2 -2
- biotite/database/pubchem/query.py +34 -31
- biotite/database/pubchem/throttle.py +3 -4
- biotite/database/rcsb/__init__.py +1 -1
- biotite/database/rcsb/download.py +44 -52
- biotite/database/rcsb/query.py +85 -80
- biotite/database/uniprot/check.py +6 -3
- biotite/database/uniprot/download.py +6 -11
- biotite/database/uniprot/query.py +115 -31
- biotite/file.py +12 -31
- biotite/sequence/__init__.py +16 -5
- biotite/sequence/align/__init__.py +160 -6
- biotite/sequence/align/alignment.py +99 -90
- biotite/sequence/align/banded.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/buckets.py +12 -10
- biotite/sequence/align/cigar.py +43 -52
- biotite/sequence/align/kmeralphabet.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/kmeralphabet.pyx +55 -51
- biotite/sequence/align/kmersimilarity.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.pyx +3 -2
- biotite/sequence/align/localgapped.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/localungapped.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/matrix.py +81 -82
- biotite/sequence/align/multiple.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/multiple.pyx +35 -35
- biotite/sequence/align/pairwise.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.pyx +12 -4
- biotite/sequence/align/selector.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.pyx +52 -54
- biotite/sequence/align/statistics.py +32 -33
- biotite/sequence/align/tracetable.cp312-win_amd64.pyd +0 -0
- biotite/sequence/alphabet.py +112 -126
- biotite/sequence/annotation.py +78 -77
- biotite/sequence/codec.cp312-win_amd64.pyd +0 -0
- biotite/sequence/codon.py +90 -79
- biotite/sequence/graphics/__init__.py +1 -1
- biotite/sequence/graphics/alignment.py +184 -103
- biotite/sequence/graphics/colorschemes.py +10 -12
- biotite/sequence/graphics/dendrogram.py +79 -34
- biotite/sequence/graphics/features.py +133 -99
- biotite/sequence/graphics/logo.py +22 -28
- biotite/sequence/graphics/plasmid.py +229 -178
- biotite/sequence/io/fasta/__init__.py +1 -1
- biotite/sequence/io/fasta/convert.py +44 -33
- biotite/sequence/io/fasta/file.py +42 -55
- biotite/sequence/io/fastq/__init__.py +1 -1
- biotite/sequence/io/fastq/convert.py +11 -14
- biotite/sequence/io/fastq/file.py +68 -112
- biotite/sequence/io/genbank/__init__.py +2 -2
- biotite/sequence/io/genbank/annotation.py +12 -20
- biotite/sequence/io/genbank/file.py +74 -76
- biotite/sequence/io/genbank/metadata.py +74 -62
- biotite/sequence/io/genbank/sequence.py +13 -14
- biotite/sequence/io/general.py +39 -30
- biotite/sequence/io/gff/__init__.py +2 -2
- biotite/sequence/io/gff/convert.py +10 -15
- biotite/sequence/io/gff/file.py +81 -65
- biotite/sequence/phylo/__init__.py +1 -1
- biotite/sequence/phylo/nj.cp312-win_amd64.pyd +0 -0
- biotite/sequence/phylo/tree.cp312-win_amd64.pyd +0 -0
- biotite/sequence/phylo/upgma.cp312-win_amd64.pyd +0 -0
- biotite/sequence/profile.py +57 -28
- biotite/sequence/search.py +17 -15
- biotite/sequence/seqtypes.py +200 -164
- biotite/sequence/sequence.py +64 -64
- biotite/structure/__init__.py +3 -3
- biotite/structure/atoms.py +226 -240
- biotite/structure/basepairs.py +260 -271
- biotite/structure/bonds.cp312-win_amd64.pyd +0 -0
- biotite/structure/bonds.pyx +88 -100
- biotite/structure/box.py +67 -71
- biotite/structure/celllist.cp312-win_amd64.pyd +0 -0
- biotite/structure/chains.py +55 -39
- biotite/structure/charges.cp312-win_amd64.pyd +0 -0
- biotite/structure/compare.py +32 -32
- biotite/structure/density.py +13 -18
- biotite/structure/dotbracket.py +20 -22
- biotite/structure/error.py +10 -2
- biotite/structure/filter.py +82 -77
- biotite/structure/geometry.py +130 -119
- biotite/structure/graphics/atoms.py +60 -43
- biotite/structure/graphics/rna.py +81 -68
- biotite/structure/hbond.py +112 -93
- biotite/structure/info/__init__.py +0 -2
- biotite/structure/info/atoms.py +10 -11
- biotite/structure/info/bonds.py +41 -43
- biotite/structure/info/ccd.py +21 -7
- biotite/structure/info/groups.py +10 -15
- biotite/structure/info/masses.py +5 -10
- biotite/structure/info/misc.py +1 -1
- biotite/structure/info/radii.py +20 -20
- biotite/structure/info/standardize.py +15 -26
- biotite/structure/integrity.py +18 -71
- biotite/structure/io/__init__.py +3 -4
- biotite/structure/io/dcd/__init__.py +1 -1
- biotite/structure/io/dcd/file.py +22 -20
- biotite/structure/io/general.py +47 -61
- biotite/structure/io/gro/__init__.py +1 -1
- biotite/structure/io/gro/file.py +73 -72
- biotite/structure/io/mol/__init__.py +1 -1
- biotite/structure/io/mol/convert.py +8 -11
- biotite/structure/io/mol/ctab.py +37 -36
- biotite/structure/io/mol/header.py +14 -10
- biotite/structure/io/mol/mol.py +9 -53
- biotite/structure/io/mol/sdf.py +47 -50
- biotite/structure/io/netcdf/__init__.py +1 -1
- biotite/structure/io/netcdf/file.py +24 -23
- biotite/structure/io/pdb/__init__.py +1 -1
- biotite/structure/io/pdb/convert.py +32 -20
- biotite/structure/io/pdb/file.py +151 -172
- biotite/structure/io/pdb/hybrid36.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/pdbqt/__init__.py +1 -1
- biotite/structure/io/pdbqt/convert.py +17 -11
- biotite/structure/io/pdbqt/file.py +128 -80
- biotite/structure/io/pdbx/__init__.py +1 -2
- biotite/structure/io/pdbx/bcif.py +36 -52
- biotite/structure/io/pdbx/cif.py +64 -62
- biotite/structure/io/pdbx/component.py +10 -16
- biotite/structure/io/pdbx/convert.py +235 -246
- biotite/structure/io/pdbx/encoding.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/trajfile.py +76 -93
- biotite/structure/io/trr/__init__.py +1 -1
- biotite/structure/io/trr/file.py +12 -15
- biotite/structure/io/xtc/__init__.py +1 -1
- biotite/structure/io/xtc/file.py +11 -14
- biotite/structure/mechanics.py +9 -11
- biotite/structure/molecules.py +3 -4
- biotite/structure/pseudoknots.py +53 -67
- biotite/structure/rdf.py +23 -21
- biotite/structure/repair.py +137 -86
- biotite/structure/residues.py +26 -16
- biotite/structure/sasa.cp312-win_amd64.pyd +0 -0
- biotite/structure/{resutil.py → segments.py} +24 -23
- biotite/structure/sequence.py +10 -11
- biotite/structure/sse.py +100 -119
- biotite/structure/superimpose.py +39 -77
- biotite/structure/transform.py +97 -71
- biotite/structure/util.py +11 -13
- biotite/version.py +2 -2
- biotite/visualize.py +69 -55
- {biotite-0.41.1.dist-info → biotite-1.0.0.dist-info}/METADATA +6 -6
- biotite-1.0.0.dist-info/RECORD +322 -0
- {biotite-0.41.1.dist-info → biotite-1.0.0.dist-info}/WHEEL +1 -1
- biotite/structure/io/ctab.py +0 -72
- biotite/structure/io/mmtf/__init__.py +0 -21
- biotite/structure/io/mmtf/assembly.py +0 -214
- biotite/structure/io/mmtf/convertarray.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/convertarray.pyx +0 -341
- biotite/structure/io/mmtf/convertfile.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/convertfile.pyx +0 -501
- biotite/structure/io/mmtf/decode.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/decode.pyx +0 -152
- biotite/structure/io/mmtf/encode.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/encode.pyx +0 -183
- biotite/structure/io/mmtf/file.py +0 -233
- biotite/structure/io/npz/__init__.py +0 -20
- biotite/structure/io/npz/file.py +0 -152
- biotite/structure/io/pdbx/legacy.py +0 -267
- biotite/structure/io/tng/__init__.py +0 -13
- biotite/structure/io/tng/file.py +0 -46
- biotite/temp.py +0 -86
- biotite-0.41.1.dist-info/RECORD +0 -340
- {biotite-0.41.1.dist-info → biotite-1.0.0.dist-info}/licenses/LICENSE.rst +0 -0
biotite/sequence/align/matrix.py
CHANGED
|
@@ -5,11 +5,9 @@
|
|
|
5
5
|
__name__ = "biotite.sequence.align"
|
|
6
6
|
__author__ = "Patrick Kunzmann"
|
|
7
7
|
|
|
8
|
-
from ..sequence import Sequence
|
|
9
|
-
from ..seqtypes import NucleotideSequence, ProteinSequence
|
|
10
|
-
from ..alphabet import Alphabet
|
|
11
|
-
import numpy as np
|
|
12
8
|
import os
|
|
9
|
+
import numpy as np
|
|
10
|
+
from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
|
|
13
11
|
|
|
14
12
|
__all__ = ["SubstitutionMatrix"]
|
|
15
13
|
|
|
@@ -21,54 +19,54 @@ class SubstitutionMatrix(object):
|
|
|
21
19
|
A :class:`SubstitutionMatrix` maps each possible pairing of a symbol
|
|
22
20
|
of a first alphabet with a symbol of a second alphabet to a score
|
|
23
21
|
(integer).
|
|
24
|
-
|
|
22
|
+
|
|
25
23
|
The class uses a 2-D (m x n) :class:`ndarray`
|
|
26
24
|
(dtype=:attr:`numpy.int32`),
|
|
27
25
|
where each element stores the score for a symbol pairing, indexed
|
|
28
26
|
by the symbol codes of the respective symbols in an *m*-length
|
|
29
27
|
alphabet 1 and an *n*-length alphabet 2.
|
|
30
|
-
|
|
28
|
+
|
|
31
29
|
There are 3 ways to create instances:
|
|
32
|
-
|
|
30
|
+
|
|
33
31
|
At first a 2-D :class:`ndarray` containing the scores can be
|
|
34
32
|
directly provided.
|
|
35
|
-
|
|
33
|
+
|
|
36
34
|
Secondly a dictionary can be provided, where the keys are pairing
|
|
37
35
|
tuples and values are the corresponding scores.
|
|
38
36
|
The pairing tuples consist of a symbol of alphabet 1 as first
|
|
39
37
|
element and a symbol of alphabet 2 as second element. Pairings have
|
|
40
38
|
to be provided for each possible combination.
|
|
41
|
-
|
|
39
|
+
|
|
42
40
|
At last a valid matrix name can be given, which is loaded from the
|
|
43
41
|
internal matrix database. The following matrices are available:
|
|
44
|
-
|
|
42
|
+
|
|
45
43
|
- Nucleotide substitution matrices from NCBI database
|
|
46
44
|
- **NUC** - Also usable with ambiguous alphabet
|
|
47
|
-
|
|
45
|
+
|
|
48
46
|
- Protein substitution matrices from NCBI database
|
|
49
|
-
|
|
47
|
+
|
|
50
48
|
- **PAM<n>**
|
|
51
49
|
- **BLOSUM<n>**
|
|
52
50
|
- **MATCH** - Only differentiates between match and mismatch
|
|
53
51
|
- **IDENTITY** - Strongly penalizes mismatches
|
|
54
52
|
- **GONNET** - Not usable with default protein alphabet
|
|
55
53
|
- **DAYHOFF**
|
|
56
|
-
|
|
54
|
+
|
|
57
55
|
- Corrected protein substitution matrices :footcite:`Hess2016`,
|
|
58
56
|
**<BLOCKS>** is the BLOCKS version, the matrix is based on
|
|
59
|
-
|
|
57
|
+
|
|
60
58
|
- **BLOSUM<n>_<BLOCKS>**
|
|
61
59
|
- **RBLOSUM<n>_<BLOCKS>**
|
|
62
60
|
- **CorBLOSUM<n>_<BLOCKS>**
|
|
63
|
-
|
|
61
|
+
|
|
64
62
|
A list of all available matrix names is returned by
|
|
65
63
|
:meth:`list_db()`.
|
|
66
|
-
|
|
64
|
+
|
|
67
65
|
Since this class can handle two different alphabets, it is possible
|
|
68
66
|
to align two different types of sequences.
|
|
69
|
-
|
|
67
|
+
|
|
70
68
|
Objects of this class are immutable.
|
|
71
|
-
|
|
69
|
+
|
|
72
70
|
Parameters
|
|
73
71
|
----------
|
|
74
72
|
alphabet1 : Alphabet, length=m
|
|
@@ -79,23 +77,23 @@ class SubstitutionMatrix(object):
|
|
|
79
77
|
Either a symbol code indexed :class:`ndarray` containing the scores,
|
|
80
78
|
or a dictionary mapping the symbol pairing to scores,
|
|
81
79
|
or a string referencing a matrix in the internal database.
|
|
82
|
-
|
|
80
|
+
|
|
83
81
|
Raises
|
|
84
82
|
------
|
|
85
83
|
KeyError
|
|
86
84
|
If the matrix dictionary misses a symbol given in the alphabet.
|
|
87
|
-
|
|
85
|
+
|
|
88
86
|
References
|
|
89
87
|
----------
|
|
90
|
-
|
|
88
|
+
|
|
91
89
|
.. footbibliography::
|
|
92
|
-
|
|
90
|
+
|
|
93
91
|
Examples
|
|
94
92
|
--------
|
|
95
|
-
|
|
93
|
+
|
|
96
94
|
Creating a matrix for two different (nonsense) alphabets
|
|
97
95
|
via a matrix dictionary:
|
|
98
|
-
|
|
96
|
+
|
|
99
97
|
>>> alph1 = Alphabet(["foo","bar"])
|
|
100
98
|
>>> alph2 = Alphabet([1,2,3])
|
|
101
99
|
>>> matrix_dict = {("foo",1):5, ("foo",2):10, ("foo",3):15,
|
|
@@ -119,17 +117,16 @@ class SubstitutionMatrix(object):
|
|
|
119
117
|
C 0 1 0 0
|
|
120
118
|
G 0 0 1 0
|
|
121
119
|
T 0 0 0 1
|
|
122
|
-
|
|
120
|
+
|
|
123
121
|
Creating a matrix via database name:
|
|
124
|
-
|
|
122
|
+
|
|
125
123
|
>>> alph = ProteinSequence.alphabet
|
|
126
124
|
>>> matrix = SubstitutionMatrix(alph, alph, "BLOSUM50")
|
|
127
125
|
"""
|
|
128
|
-
|
|
126
|
+
|
|
129
127
|
# Directory of matrix files
|
|
130
|
-
_db_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
|
|
131
|
-
|
|
132
|
-
|
|
128
|
+
_db_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "matrix_data")
|
|
129
|
+
|
|
133
130
|
def __init__(self, alphabet1, alphabet2, score_matrix):
|
|
134
131
|
self._alph1 = alphabet1
|
|
135
132
|
self._alph2 = alphabet2
|
|
@@ -147,16 +144,19 @@ class SubstitutionMatrix(object):
|
|
|
147
144
|
matrix_dict = SubstitutionMatrix.dict_from_db(score_matrix)
|
|
148
145
|
self._fill_with_matrix_dict(matrix_dict)
|
|
149
146
|
else:
|
|
150
|
-
raise TypeError(
|
|
151
|
-
|
|
147
|
+
raise TypeError(
|
|
148
|
+
"Matrix must be either a dictionary, " "an 2-D ndarray or a string"
|
|
149
|
+
)
|
|
152
150
|
# This class is immutable and has a getter function for the
|
|
153
151
|
# score matrix -> make the score matrix read-only
|
|
154
152
|
self._matrix.setflags(write=False)
|
|
155
153
|
|
|
156
154
|
def __repr__(self):
|
|
157
155
|
"""Represent SubstitutionMatrix as a string for debugging."""
|
|
158
|
-
return
|
|
159
|
-
|
|
156
|
+
return (
|
|
157
|
+
f"SubstitutionMatrix({self._alph1.__repr__()}, {self._alph2.__repr__()}, "
|
|
158
|
+
f"np.{np.array_repr(self._matrix)})"
|
|
159
|
+
)
|
|
160
160
|
|
|
161
161
|
def __eq__(self, item):
|
|
162
162
|
if not isinstance(item, SubstitutionMatrix):
|
|
@@ -173,40 +173,39 @@ class SubstitutionMatrix(object):
|
|
|
173
173
|
return not self == item
|
|
174
174
|
|
|
175
175
|
def _fill_with_matrix_dict(self, matrix_dict):
|
|
176
|
-
self._matrix = np.zeros((
|
|
177
|
-
dtype=np.int32)
|
|
176
|
+
self._matrix = np.zeros((len(self._alph1), len(self._alph2)), dtype=np.int32)
|
|
178
177
|
for i in range(len(self._alph1)):
|
|
179
178
|
for j in range(len(self._alph2)):
|
|
180
179
|
sym1 = self._alph1.decode(i)
|
|
181
180
|
sym2 = self._alph2.decode(j)
|
|
182
|
-
self._matrix[i,j] = int(matrix_dict[sym1, sym2])
|
|
183
|
-
|
|
181
|
+
self._matrix[i, j] = int(matrix_dict[sym1, sym2])
|
|
182
|
+
|
|
184
183
|
def get_alphabet1(self):
|
|
185
184
|
"""
|
|
186
|
-
Get the first alphabet.
|
|
187
|
-
|
|
185
|
+
Get the first alphabet.
|
|
186
|
+
|
|
188
187
|
Returns
|
|
189
188
|
-------
|
|
190
189
|
alphabet : Alphabet
|
|
191
190
|
The first alphabet.
|
|
192
191
|
"""
|
|
193
192
|
return self._alph1
|
|
194
|
-
|
|
193
|
+
|
|
195
194
|
def get_alphabet2(self):
|
|
196
195
|
"""
|
|
197
|
-
Get the second alphabet.
|
|
198
|
-
|
|
196
|
+
Get the second alphabet.
|
|
197
|
+
|
|
199
198
|
Returns
|
|
200
199
|
-------
|
|
201
200
|
alphabet : Alphabet
|
|
202
201
|
The second alphabet.
|
|
203
202
|
"""
|
|
204
203
|
return self._alph2
|
|
205
|
-
|
|
204
|
+
|
|
206
205
|
def score_matrix(self):
|
|
207
206
|
"""
|
|
208
207
|
Get the 2-D :class:`ndarray` containing the score values.
|
|
209
|
-
|
|
208
|
+
|
|
210
209
|
Returns
|
|
211
210
|
-------
|
|
212
211
|
matrix : ndarray, shape=(m,n), dtype=np.int32
|
|
@@ -214,12 +213,12 @@ class SubstitutionMatrix(object):
|
|
|
214
213
|
The array is read-only.
|
|
215
214
|
"""
|
|
216
215
|
return self._matrix
|
|
217
|
-
|
|
216
|
+
|
|
218
217
|
def transpose(self):
|
|
219
218
|
"""
|
|
220
219
|
Get a copy of this instance, where the alphabets are
|
|
221
220
|
interchanged.
|
|
222
|
-
|
|
221
|
+
|
|
223
222
|
Returns
|
|
224
223
|
-------
|
|
225
224
|
transposed : SubstitutionMatrix
|
|
@@ -229,7 +228,7 @@ class SubstitutionMatrix(object):
|
|
|
229
228
|
new_alph2 = self._alph1
|
|
230
229
|
new_matrix = np.transpose(self._matrix)
|
|
231
230
|
return SubstitutionMatrix(new_alph1, new_alph2, new_matrix)
|
|
232
|
-
|
|
231
|
+
|
|
233
232
|
def is_symmetric(self):
|
|
234
233
|
"""
|
|
235
234
|
Check whether the substitution matrix is symmetric,
|
|
@@ -242,35 +241,36 @@ class SubstitutionMatrix(object):
|
|
|
242
241
|
True, if both alphabets are identical and the score matrix
|
|
243
242
|
is symmetric, false otherwise.
|
|
244
243
|
"""
|
|
245
|
-
return
|
|
246
|
-
|
|
247
|
-
|
|
244
|
+
return self._alph1 == self._alph2 and np.array_equal(
|
|
245
|
+
self._matrix, np.transpose(self._matrix)
|
|
246
|
+
)
|
|
247
|
+
|
|
248
248
|
def get_score_by_code(self, code1, code2):
|
|
249
249
|
"""
|
|
250
250
|
Get the substitution score of two symbols,
|
|
251
251
|
represented by their code.
|
|
252
|
-
|
|
252
|
+
|
|
253
253
|
Parameters
|
|
254
254
|
----------
|
|
255
255
|
code1, code2 : int
|
|
256
256
|
Symbol codes of the two symbols to be aligned.
|
|
257
|
-
|
|
257
|
+
|
|
258
258
|
Returns
|
|
259
259
|
-------
|
|
260
260
|
score : int
|
|
261
261
|
The substitution / alignment score.
|
|
262
262
|
"""
|
|
263
263
|
return self._matrix[code1, code2]
|
|
264
|
-
|
|
264
|
+
|
|
265
265
|
def get_score(self, symbol1, symbol2):
|
|
266
266
|
"""
|
|
267
267
|
Get the substitution score of two symbols.
|
|
268
|
-
|
|
268
|
+
|
|
269
269
|
Parameters
|
|
270
270
|
----------
|
|
271
271
|
symbol1, symbol2 : object
|
|
272
272
|
Symbols to be aligned.
|
|
273
|
-
|
|
273
|
+
|
|
274
274
|
Returns
|
|
275
275
|
-------
|
|
276
276
|
score : int
|
|
@@ -279,19 +279,19 @@ class SubstitutionMatrix(object):
|
|
|
279
279
|
code1 = self._alph1.encode(symbol1)
|
|
280
280
|
code2 = self._alph2.encode(symbol2)
|
|
281
281
|
return self._matrix[code1, code2]
|
|
282
|
-
|
|
282
|
+
|
|
283
283
|
def shape(self):
|
|
284
284
|
"""
|
|
285
285
|
Get the shape (i.e. the length of both alphabets)
|
|
286
286
|
of the substitution matrix.
|
|
287
|
-
|
|
287
|
+
|
|
288
288
|
Returns
|
|
289
289
|
-------
|
|
290
290
|
shape : tuple
|
|
291
291
|
Matrix shape.
|
|
292
292
|
"""
|
|
293
293
|
return (len(self._alph1), len(self._alph2))
|
|
294
|
-
|
|
294
|
+
|
|
295
295
|
def __str__(self):
|
|
296
296
|
# Create matrix in NCBI format
|
|
297
297
|
string = " "
|
|
@@ -306,18 +306,18 @@ class SubstitutionMatrix(object):
|
|
|
306
306
|
# Remove terminal line break
|
|
307
307
|
string = string[:-1]
|
|
308
308
|
return string
|
|
309
|
-
|
|
309
|
+
|
|
310
310
|
@staticmethod
|
|
311
311
|
def dict_from_str(string):
|
|
312
312
|
"""
|
|
313
313
|
Create a matrix dictionary from a string in NCBI matrix format.
|
|
314
|
-
|
|
314
|
+
|
|
315
315
|
Symbols of the first alphabet are taken from the left column,
|
|
316
316
|
symbols of the second alphabet are taken from the top row.
|
|
317
|
-
|
|
317
|
+
|
|
318
318
|
The keys of the dictionary consist of tuples containing the
|
|
319
319
|
aligned symbols and the values are the corresponding scores.
|
|
320
|
-
|
|
320
|
+
|
|
321
321
|
Returns
|
|
322
322
|
-------
|
|
323
323
|
matrix_dict : dict
|
|
@@ -329,22 +329,22 @@ class SubstitutionMatrix(object):
|
|
|
329
329
|
symbols2 = [e for e in lines[0].split()]
|
|
330
330
|
scores = np.array([line.split()[1:] for line in lines[1:]]).astype(int)
|
|
331
331
|
scores = np.transpose(scores)
|
|
332
|
-
|
|
332
|
+
|
|
333
333
|
matrix_dict = {}
|
|
334
334
|
for i in range(len(symbols1)):
|
|
335
335
|
for j in range(len(symbols2)):
|
|
336
|
-
matrix_dict[(symbols1[i], symbols2[j])] = scores[i,j]
|
|
336
|
+
matrix_dict[(symbols1[i], symbols2[j])] = scores[i, j]
|
|
337
337
|
return matrix_dict
|
|
338
|
-
|
|
338
|
+
|
|
339
339
|
@staticmethod
|
|
340
340
|
def dict_from_db(matrix_name):
|
|
341
341
|
"""
|
|
342
342
|
Create a matrix dictionary from a valid matrix name in the
|
|
343
343
|
internal matrix database.
|
|
344
|
-
|
|
344
|
+
|
|
345
345
|
The keys of the dictionary consist of tuples containing the
|
|
346
346
|
aligned symbols and the values are the corresponding scores.
|
|
347
|
-
|
|
347
|
+
|
|
348
348
|
Returns
|
|
349
349
|
-------
|
|
350
350
|
matrix_dict : dict
|
|
@@ -353,12 +353,12 @@ class SubstitutionMatrix(object):
|
|
|
353
353
|
filename = SubstitutionMatrix._db_dir + os.sep + matrix_name + ".mat"
|
|
354
354
|
with open(filename, "r") as f:
|
|
355
355
|
return SubstitutionMatrix.dict_from_str(f.read())
|
|
356
|
-
|
|
356
|
+
|
|
357
357
|
@staticmethod
|
|
358
358
|
def list_db():
|
|
359
359
|
"""
|
|
360
360
|
List all matrix names in the internal database.
|
|
361
|
-
|
|
361
|
+
|
|
362
362
|
Returns
|
|
363
363
|
-------
|
|
364
364
|
db_list : list
|
|
@@ -367,27 +367,26 @@ class SubstitutionMatrix(object):
|
|
|
367
367
|
files = os.listdir(SubstitutionMatrix._db_dir)
|
|
368
368
|
# Remove '.mat' from files
|
|
369
369
|
return [file[:-4] for file in sorted(files)]
|
|
370
|
-
|
|
371
|
-
|
|
370
|
+
|
|
372
371
|
@staticmethod
|
|
373
372
|
def std_protein_matrix():
|
|
374
373
|
"""
|
|
375
374
|
Get the default :class:`SubstitutionMatrix` for protein sequence
|
|
376
375
|
alignments, which is BLOSUM62.
|
|
377
|
-
|
|
376
|
+
|
|
378
377
|
Returns
|
|
379
378
|
-------
|
|
380
379
|
matrix : SubstitutionMatrix
|
|
381
380
|
Default matrix.
|
|
382
381
|
"""
|
|
383
382
|
return _matrix_blosum62
|
|
384
|
-
|
|
383
|
+
|
|
385
384
|
@staticmethod
|
|
386
385
|
def std_nucleotide_matrix():
|
|
387
386
|
"""
|
|
388
387
|
Get the default :class:`SubstitutionMatrix` for DNA sequence
|
|
389
388
|
alignments.
|
|
390
|
-
|
|
389
|
+
|
|
391
390
|
Returns
|
|
392
391
|
-------
|
|
393
392
|
matrix : SubstitutionMatrix
|
|
@@ -395,11 +394,11 @@ class SubstitutionMatrix(object):
|
|
|
395
394
|
"""
|
|
396
395
|
return _matrix_nuc
|
|
397
396
|
|
|
398
|
-
# Preformatted BLOSUM62 and NUC substitution matrix from NCBI
|
|
399
|
-
_matrix_blosum62 = SubstitutionMatrix(ProteinSequence.alphabet,
|
|
400
|
-
ProteinSequence.alphabet,
|
|
401
|
-
"BLOSUM62")
|
|
402
|
-
_matrix_nuc = SubstitutionMatrix(NucleotideSequence.alphabet_amb,
|
|
403
|
-
NucleotideSequence.alphabet_amb,
|
|
404
|
-
"NUC")
|
|
405
397
|
|
|
398
|
+
# Preformatted BLOSUM62 and NUC substitution matrix from NCBI
|
|
399
|
+
_matrix_blosum62 = SubstitutionMatrix(
|
|
400
|
+
ProteinSequence.alphabet, ProteinSequence.alphabet, "BLOSUM62"
|
|
401
|
+
)
|
|
402
|
+
_matrix_nuc = SubstitutionMatrix(
|
|
403
|
+
NucleotideSequence.alphabet_amb, NucleotideSequence.alphabet_amb, "NUC"
|
|
404
|
+
)
|
|
Binary file
|
|
@@ -39,9 +39,9 @@ cdef float32 MAX_FLOAT = np.finfo(np.float32).max
|
|
|
39
39
|
|
|
40
40
|
|
|
41
41
|
class GapSymbol:
|
|
42
|
-
|
|
42
|
+
|
|
43
43
|
_instance = None
|
|
44
|
-
|
|
44
|
+
|
|
45
45
|
def __init__(self):
|
|
46
46
|
if GapSymbol._instance is not None:
|
|
47
47
|
raise ValueError(
|
|
@@ -49,16 +49,16 @@ class GapSymbol:
|
|
|
49
49
|
)
|
|
50
50
|
else:
|
|
51
51
|
GapSymbol._instance = self
|
|
52
|
-
|
|
52
|
+
|
|
53
53
|
@staticmethod
|
|
54
54
|
def instance():
|
|
55
55
|
if GapSymbol._instance is None:
|
|
56
56
|
GapSymbol._instance = GapSymbol()
|
|
57
57
|
return GapSymbol._instance
|
|
58
|
-
|
|
58
|
+
|
|
59
59
|
def __str__(self):
|
|
60
60
|
return "-"
|
|
61
|
-
|
|
61
|
+
|
|
62
62
|
def __hash__(self):
|
|
63
63
|
return 0
|
|
64
64
|
|
|
@@ -69,13 +69,13 @@ def align_multiple(sequences, matrix, gap_penalty=-10, terminal_penalty=True,
|
|
|
69
69
|
align_multiple(sequences, matrix, gap_penalty=-10,
|
|
70
70
|
terminal_penalty=True, distances=None,
|
|
71
71
|
guide_tree=None)
|
|
72
|
-
|
|
72
|
+
|
|
73
73
|
Perform a multiple sequence alignment using a progressive
|
|
74
74
|
alignment algorithm. :footcite:`Feng1987`
|
|
75
75
|
|
|
76
76
|
Based on pairwise sequence distances a guide tree is constructed.
|
|
77
77
|
The sequences are progressively aligned according to the tree,
|
|
78
|
-
following the rule 'Once a gap, always a gap'.
|
|
78
|
+
following the rule 'Once a gap, always a gap'.
|
|
79
79
|
|
|
80
80
|
Parameters
|
|
81
81
|
----------
|
|
@@ -124,7 +124,7 @@ def align_multiple(sequences, matrix, gap_penalty=-10, terminal_penalty=True,
|
|
|
124
124
|
distance_matrix : ndarray, shape=(n,n), dtype=float32
|
|
125
125
|
The pairwise distance matrix used to construct the guide tree.
|
|
126
126
|
Equal to `distances` if provided.
|
|
127
|
-
|
|
127
|
+
|
|
128
128
|
Notes
|
|
129
129
|
-----
|
|
130
130
|
The similarity to distance conversion is performed according to the
|
|
@@ -137,14 +137,14 @@ def align_multiple(sequences, matrix, gap_penalty=-10, terminal_penalty=True,
|
|
|
137
137
|
\right)
|
|
138
138
|
|
|
139
139
|
.. math:: S_{a,b}^{max} = \frac{ S_{a,a} + S_{b,b} }{ 2 }
|
|
140
|
-
|
|
140
|
+
|
|
141
141
|
.. math:: S_{a,b}^{rand} = \frac{1}{L_{a,b}}
|
|
142
142
|
\left(
|
|
143
143
|
\sum_{x \in \Omega} \sum_{y \in \Omega}
|
|
144
144
|
s_{x,y} \cdot N_a(x) \cdot N_b(y)
|
|
145
145
|
\right)
|
|
146
146
|
+ N_{a,b}^{open} \cdot p^{open} + N_{a,b}^{ext} \cdot p^{ext}
|
|
147
|
-
|
|
147
|
+
|
|
148
148
|
:math:`D_{a,b}` - The distance between the sequences *a* and *b*.
|
|
149
149
|
|
|
150
150
|
:math:`S_{a,b}` - The similarity score between the sequences *a* and *b*.
|
|
@@ -164,17 +164,17 @@ def align_multiple(sequences, matrix, gap_penalty=-10, terminal_penalty=True,
|
|
|
164
164
|
|
|
165
165
|
In rare cases of extremely unrelated sequences, :math:`S_{a,b}`
|
|
166
166
|
can be lower than :math:`S_{a,b}^{rand}`.
|
|
167
|
-
In this case the
|
|
167
|
+
In this case the logarithm cannot be calculated and a
|
|
168
168
|
:class:`ValueError` is raised.
|
|
169
169
|
|
|
170
170
|
References
|
|
171
171
|
----------
|
|
172
|
-
|
|
172
|
+
|
|
173
173
|
.. footbibliography::
|
|
174
174
|
|
|
175
175
|
Examples
|
|
176
176
|
--------
|
|
177
|
-
|
|
177
|
+
|
|
178
178
|
>>> seq1 = ProteinSequence("BIQTITE")
|
|
179
179
|
>>> seq2 = ProteinSequence("TITANITE")
|
|
180
180
|
>>> seq3 = ProteinSequence("BISMITE")
|
|
@@ -232,11 +232,11 @@ def align_multiple(sequences, matrix, gap_penalty=-10, terminal_penalty=True,
|
|
|
232
232
|
else:
|
|
233
233
|
# Assure that every node in the guide tree is binary
|
|
234
234
|
guide_tree = as_binary(guide_tree)
|
|
235
|
-
|
|
235
|
+
|
|
236
236
|
# Create new matrix with neutral gap symbol
|
|
237
237
|
gap_symbol = GapSymbol.instance()
|
|
238
238
|
new_alphabet = Alphabet(
|
|
239
|
-
matrix.get_alphabet1().get_symbols() +
|
|
239
|
+
matrix.get_alphabet1().get_symbols() + (gap_symbol,)
|
|
240
240
|
)
|
|
241
241
|
new_score_matrix = np.zeros(
|
|
242
242
|
(len(new_alphabet), len(new_alphabet)), dtype=np.int32
|
|
@@ -275,7 +275,7 @@ def align_multiple(sequences, matrix, gap_penalty=-10, terminal_penalty=True,
|
|
|
275
275
|
]
|
|
276
276
|
for i in range(len(aligned_seqs)):
|
|
277
277
|
aligned_seqs[i].code = aligned_seq_codes[i]
|
|
278
|
-
|
|
278
|
+
|
|
279
279
|
# Reorder alignments into original alignment
|
|
280
280
|
new_order = np.argsort(order)
|
|
281
281
|
aligned_seqs = [aligned_seqs[pos] for pos in new_order]
|
|
@@ -290,7 +290,7 @@ def _get_distance_matrix(CodeType[:] _T, sequences, matrix,
|
|
|
290
290
|
Create all pairwise alignments for the given sequences and use the
|
|
291
291
|
method proposed by Feng & Doolittle to calculate the pairwise
|
|
292
292
|
distance matrix
|
|
293
|
-
|
|
293
|
+
|
|
294
294
|
Parameters
|
|
295
295
|
----------
|
|
296
296
|
_T : ndarray, dtype=VARIABLE
|
|
@@ -306,7 +306,7 @@ def _get_distance_matrix(CodeType[:] _T, sequences, matrix,
|
|
|
306
306
|
terminal_penalty : bool
|
|
307
307
|
Whether or not to count terminal gap penalties for the
|
|
308
308
|
alignments.
|
|
309
|
-
|
|
309
|
+
|
|
310
310
|
Returns
|
|
311
311
|
-------
|
|
312
312
|
distances : ndarray, shape=(n,n), dtype=float32
|
|
@@ -332,7 +332,7 @@ def _get_distance_matrix(CodeType[:] _T, sequences, matrix,
|
|
|
332
332
|
)[0]
|
|
333
333
|
scores[i,j] = alignment.score
|
|
334
334
|
alignments[i,j] = alignment
|
|
335
|
-
|
|
335
|
+
|
|
336
336
|
### Distance calculation from similarity scores ###
|
|
337
337
|
# Calculate the occurrences of each symbol code in each sequence
|
|
338
338
|
# This is used later for the random score
|
|
@@ -364,7 +364,7 @@ def _get_distance_matrix(CodeType[:] _T, sequences, matrix,
|
|
|
364
364
|
cdef CodeType[:] seq_code1, seq_code2
|
|
365
365
|
cdef CodeType code1, code2
|
|
366
366
|
cdef float32 score_rand, score_max
|
|
367
|
-
|
|
367
|
+
|
|
368
368
|
# Calculate distance
|
|
369
369
|
# i and j are indicating the alignment between the sequences i and j
|
|
370
370
|
for i in range(scores_v.shape[0]):
|
|
@@ -405,14 +405,14 @@ def _count_gaps(int64[:,:] trace_v, bint terminal_penalty):
|
|
|
405
405
|
"""
|
|
406
406
|
Count the number of gap openings and gap extensions in an alignment
|
|
407
407
|
trace.
|
|
408
|
-
|
|
408
|
+
|
|
409
409
|
Parameters
|
|
410
410
|
----------
|
|
411
411
|
trace_v : ndarray, shape=(n,2), dtype=int
|
|
412
412
|
The alignment trace.
|
|
413
413
|
terminal_penalty : bool
|
|
414
414
|
Whether or not to count terminal gap penalties.
|
|
415
|
-
|
|
415
|
+
|
|
416
416
|
Returns
|
|
417
417
|
-------
|
|
418
418
|
gap_open_count, gap_ext_count: int
|
|
@@ -440,7 +440,7 @@ def _count_gaps(int64[:,:] trace_v, bint terminal_penalty):
|
|
|
440
440
|
if start_index == -1 or stop_index == -1:
|
|
441
441
|
return 0, 0
|
|
442
442
|
trace_v = trace_v[start_index : stop_index]
|
|
443
|
-
|
|
443
|
+
|
|
444
444
|
if trace_v[0,0] == -1:
|
|
445
445
|
gap_open_count += 1
|
|
446
446
|
if trace_v[0,1] == -1:
|
|
@@ -471,7 +471,7 @@ def _progressive_align(CodeType[:] _T, sequences, tree_node,
|
|
|
471
471
|
The gaps inserted in this pairwise alignment are also inserted
|
|
472
472
|
into all other sequences in the respective sub-MSA at the same
|
|
473
473
|
position.
|
|
474
|
-
|
|
474
|
+
|
|
475
475
|
Parameters
|
|
476
476
|
----------
|
|
477
477
|
_T : ndarray, dtype=VARIABLE
|
|
@@ -490,13 +490,13 @@ def _progressive_align(CodeType[:] _T, sequences, tree_node,
|
|
|
490
490
|
matrix : SubstitutionMatrix
|
|
491
491
|
The substitution matrix used for the alignments.
|
|
492
492
|
gap_symbol_code : int
|
|
493
|
-
The symbol code for the gap symbol.
|
|
493
|
+
The symbol code for the gap symbol.
|
|
494
494
|
gap_penalty : int or tuple(int, int)
|
|
495
495
|
A linear or affine gap penalty for the alignments.
|
|
496
496
|
terminal_penalty : bool
|
|
497
497
|
Whether or not to count terminal gap penalties for the
|
|
498
498
|
alignments.
|
|
499
|
-
|
|
499
|
+
|
|
500
500
|
Returns
|
|
501
501
|
-------
|
|
502
502
|
order : ndarray, shape=(m,), dtype=int
|
|
@@ -515,7 +515,7 @@ def _progressive_align(CodeType[:] _T, sequences, tree_node,
|
|
|
515
515
|
cdef int32[:] indices1_v, indices2_v
|
|
516
516
|
cdef np.ndarray incides1, incides2
|
|
517
517
|
cdef list aligned_seqs1, aligned_seqs2
|
|
518
|
-
|
|
518
|
+
|
|
519
519
|
if tree_node.is_leaf():
|
|
520
520
|
# Child node -> Cannot do an alignment
|
|
521
521
|
# -> Just return the sequence corresponding to the leaf node
|
|
@@ -523,7 +523,7 @@ def _progressive_align(CodeType[:] _T, sequences, tree_node,
|
|
|
523
523
|
# when neutral gap character is inserted
|
|
524
524
|
return np.array([tree_node.index], dtype=np.int32), \
|
|
525
525
|
[sequences[tree_node.index].copy()]
|
|
526
|
-
|
|
526
|
+
|
|
527
527
|
else:
|
|
528
528
|
# Multiple alignment of sequences corresponding to both child nodes
|
|
529
529
|
child1, child2 = tree_node.children
|
|
@@ -537,7 +537,7 @@ def _progressive_align(CodeType[:] _T, sequences, tree_node,
|
|
|
537
537
|
gap_symbol_code, gap_penalty, terminal_penalty
|
|
538
538
|
)
|
|
539
539
|
indices2_v = incides2
|
|
540
|
-
|
|
540
|
+
|
|
541
541
|
# Find sequence pair with lowest distance
|
|
542
542
|
dist_min = MAX_FLOAT
|
|
543
543
|
for i in range(indices1_v.shape[0]):
|
|
@@ -554,7 +554,7 @@ def _progressive_align(CodeType[:] _T, sequences, tree_node,
|
|
|
554
554
|
gap_penalty, terminal_penalty, max_number=1
|
|
555
555
|
)[0]
|
|
556
556
|
# Place neutral gap symbol for position of new gaps
|
|
557
|
-
# in both sequence groups
|
|
557
|
+
# in both sequence groups
|
|
558
558
|
for i in range(len(aligned_seqs1)):
|
|
559
559
|
seq = aligned_seqs1[i]
|
|
560
560
|
seq.code = _replace_gaps(
|
|
@@ -580,7 +580,7 @@ def _replace_gaps(CodeType[:] _T,
|
|
|
580
580
|
|
|
581
581
|
The replacement is required by the progressive alignment algorithm
|
|
582
582
|
to be able to align gapped sequences with each other.
|
|
583
|
-
|
|
583
|
+
|
|
584
584
|
Parameters
|
|
585
585
|
----------
|
|
586
586
|
_T : ndarray, dtype=VARIABLE
|
|
@@ -592,8 +592,8 @@ def _replace_gaps(CodeType[:] _T,
|
|
|
592
592
|
seq_code : ndarray, shape=(n,)
|
|
593
593
|
The sequence code representing the given sequence.
|
|
594
594
|
gap_symbol_code : int
|
|
595
|
-
The symbol code for the gap symbol.
|
|
596
|
-
|
|
595
|
+
The symbol code for the gap symbol.
|
|
596
|
+
|
|
597
597
|
Returns
|
|
598
598
|
-------
|
|
599
599
|
new_seq_code : ndarray, shape=(m,)
|
|
@@ -609,12 +609,12 @@ def _replace_gaps(CodeType[:] _T,
|
|
|
609
609
|
partial_trace_v.shape[0], dtype=seq_code.dtype
|
|
610
610
|
)
|
|
611
611
|
cdef CodeType[:] new_seq_code_v = new_seq_code
|
|
612
|
-
|
|
612
|
+
|
|
613
613
|
for i in range(partial_trace_v.shape[0]):
|
|
614
614
|
index = partial_trace_v[i]
|
|
615
615
|
if index == -1:
|
|
616
616
|
new_seq_code_v[i] = gap_symbol_code
|
|
617
617
|
else:
|
|
618
618
|
new_seq_code_v[i] = seq_code[index]
|
|
619
|
-
|
|
619
|
+
|
|
620
620
|
return new_seq_code
|
|
Binary file
|
|
Binary file
|