biotite 0.41.2__cp311-cp311-win_amd64.whl → 1.0.1__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +2 -3
- biotite/application/__init__.py +1 -1
- biotite/application/application.py +20 -10
- biotite/application/autodock/__init__.py +1 -1
- biotite/application/autodock/app.py +74 -79
- biotite/application/blast/__init__.py +1 -1
- biotite/application/blast/alignment.py +19 -10
- biotite/application/blast/webapp.py +92 -85
- biotite/application/clustalo/__init__.py +1 -1
- biotite/application/clustalo/app.py +46 -61
- biotite/application/dssp/__init__.py +1 -1
- biotite/application/dssp/app.py +8 -11
- biotite/application/localapp.py +62 -60
- biotite/application/mafft/__init__.py +1 -1
- biotite/application/mafft/app.py +16 -22
- biotite/application/msaapp.py +78 -89
- biotite/application/muscle/__init__.py +1 -1
- biotite/application/muscle/app3.py +50 -64
- biotite/application/muscle/app5.py +23 -31
- biotite/application/sra/__init__.py +1 -1
- biotite/application/sra/app.py +64 -68
- biotite/application/tantan/__init__.py +1 -1
- biotite/application/tantan/app.py +22 -45
- biotite/application/util.py +7 -9
- biotite/application/viennarna/rnaalifold.py +34 -28
- biotite/application/viennarna/rnafold.py +24 -39
- biotite/application/viennarna/rnaplot.py +36 -21
- biotite/application/viennarna/util.py +17 -12
- biotite/application/webapp.py +13 -14
- biotite/copyable.py +13 -13
- biotite/database/__init__.py +1 -1
- biotite/database/entrez/__init__.py +1 -1
- biotite/database/entrez/check.py +2 -3
- biotite/database/entrez/dbnames.py +7 -5
- biotite/database/entrez/download.py +55 -49
- biotite/database/entrez/key.py +1 -1
- biotite/database/entrez/query.py +62 -23
- biotite/database/error.py +2 -1
- biotite/database/pubchem/__init__.py +1 -1
- biotite/database/pubchem/download.py +43 -45
- biotite/database/pubchem/error.py +2 -2
- biotite/database/pubchem/query.py +34 -31
- biotite/database/pubchem/throttle.py +3 -4
- biotite/database/rcsb/__init__.py +1 -1
- biotite/database/rcsb/download.py +44 -52
- biotite/database/rcsb/query.py +85 -80
- biotite/database/uniprot/check.py +6 -3
- biotite/database/uniprot/download.py +6 -11
- biotite/database/uniprot/query.py +115 -31
- biotite/file.py +12 -31
- biotite/sequence/__init__.py +3 -3
- biotite/sequence/align/__init__.py +2 -2
- biotite/sequence/align/alignment.py +99 -90
- biotite/sequence/align/banded.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/buckets.py +12 -10
- biotite/sequence/align/cigar.py +43 -52
- biotite/sequence/align/kmeralphabet.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmeralphabet.pyx +55 -51
- biotite/sequence/align/kmersimilarity.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.pyx +3 -2
- biotite/sequence/align/localgapped.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/localungapped.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/matrix.py +81 -82
- biotite/sequence/align/multiple.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/multiple.pyx +1 -1
- biotite/sequence/align/pairwise.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.pyx +12 -4
- biotite/sequence/align/selector.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.pyx +52 -54
- biotite/sequence/align/statistics.py +32 -33
- biotite/sequence/align/tracetable.cp311-win_amd64.pyd +0 -0
- biotite/sequence/alphabet.py +51 -65
- biotite/sequence/annotation.py +78 -77
- biotite/sequence/codec.cp311-win_amd64.pyd +0 -0
- biotite/sequence/codon.py +90 -79
- biotite/sequence/graphics/__init__.py +1 -1
- biotite/sequence/graphics/alignment.py +184 -103
- biotite/sequence/graphics/colorschemes.py +10 -12
- biotite/sequence/graphics/dendrogram.py +79 -34
- biotite/sequence/graphics/features.py +133 -99
- biotite/sequence/graphics/logo.py +22 -28
- biotite/sequence/graphics/plasmid.py +229 -178
- biotite/sequence/io/fasta/__init__.py +1 -1
- biotite/sequence/io/fasta/convert.py +44 -33
- biotite/sequence/io/fasta/file.py +42 -55
- biotite/sequence/io/fastq/__init__.py +1 -1
- biotite/sequence/io/fastq/convert.py +11 -14
- biotite/sequence/io/fastq/file.py +68 -112
- biotite/sequence/io/genbank/__init__.py +2 -2
- biotite/sequence/io/genbank/annotation.py +12 -20
- biotite/sequence/io/genbank/file.py +74 -76
- biotite/sequence/io/genbank/metadata.py +74 -62
- biotite/sequence/io/genbank/sequence.py +13 -14
- biotite/sequence/io/general.py +39 -30
- biotite/sequence/io/gff/__init__.py +2 -2
- biotite/sequence/io/gff/convert.py +10 -15
- biotite/sequence/io/gff/file.py +81 -65
- biotite/sequence/phylo/__init__.py +1 -1
- biotite/sequence/phylo/nj.cp311-win_amd64.pyd +0 -0
- biotite/sequence/phylo/tree.cp311-win_amd64.pyd +0 -0
- biotite/sequence/phylo/upgma.cp311-win_amd64.pyd +0 -0
- biotite/sequence/profile.py +57 -28
- biotite/sequence/search.py +17 -15
- biotite/sequence/seqtypes.py +200 -164
- biotite/sequence/sequence.py +15 -17
- biotite/structure/__init__.py +3 -3
- biotite/structure/atoms.py +246 -236
- biotite/structure/basepairs.py +260 -271
- biotite/structure/bonds.cp311-win_amd64.pyd +0 -0
- biotite/structure/bonds.pyx +29 -32
- biotite/structure/box.py +67 -71
- biotite/structure/celllist.cp311-win_amd64.pyd +0 -0
- biotite/structure/chains.py +55 -39
- biotite/structure/charges.cp311-win_amd64.pyd +0 -0
- biotite/structure/compare.py +32 -32
- biotite/structure/density.py +13 -18
- biotite/structure/dotbracket.py +20 -22
- biotite/structure/error.py +10 -2
- biotite/structure/filter.py +83 -78
- biotite/structure/geometry.py +130 -119
- biotite/structure/graphics/atoms.py +60 -43
- biotite/structure/graphics/rna.py +81 -68
- biotite/structure/hbond.py +112 -93
- biotite/structure/info/__init__.py +0 -2
- biotite/structure/info/atoms.py +10 -11
- biotite/structure/info/bonds.py +41 -43
- biotite/structure/info/ccd.py +4 -5
- biotite/structure/info/groups.py +1 -3
- biotite/structure/info/masses.py +5 -10
- biotite/structure/info/misc.py +1 -1
- biotite/structure/info/radii.py +20 -20
- biotite/structure/info/standardize.py +15 -26
- biotite/structure/integrity.py +18 -71
- biotite/structure/io/__init__.py +3 -4
- biotite/structure/io/dcd/__init__.py +1 -1
- biotite/structure/io/dcd/file.py +22 -20
- biotite/structure/io/general.py +47 -61
- biotite/structure/io/gro/__init__.py +1 -1
- biotite/structure/io/gro/file.py +73 -72
- biotite/structure/io/mol/__init__.py +1 -1
- biotite/structure/io/mol/convert.py +8 -11
- biotite/structure/io/mol/ctab.py +37 -36
- biotite/structure/io/mol/header.py +14 -10
- biotite/structure/io/mol/mol.py +9 -53
- biotite/structure/io/mol/sdf.py +47 -50
- biotite/structure/io/netcdf/__init__.py +1 -1
- biotite/structure/io/netcdf/file.py +24 -23
- biotite/structure/io/pdb/__init__.py +1 -1
- biotite/structure/io/pdb/convert.py +32 -20
- biotite/structure/io/pdb/file.py +151 -172
- biotite/structure/io/pdb/hybrid36.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/pdbqt/__init__.py +1 -1
- biotite/structure/io/pdbqt/convert.py +17 -11
- biotite/structure/io/pdbqt/file.py +128 -80
- biotite/structure/io/pdbx/__init__.py +1 -2
- biotite/structure/io/pdbx/bcif.py +36 -44
- biotite/structure/io/pdbx/cif.py +140 -110
- biotite/structure/io/pdbx/component.py +10 -16
- biotite/structure/io/pdbx/convert.py +260 -258
- biotite/structure/io/pdbx/encoding.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/trajfile.py +90 -107
- biotite/structure/io/trr/__init__.py +1 -1
- biotite/structure/io/trr/file.py +12 -15
- biotite/structure/io/xtc/__init__.py +1 -1
- biotite/structure/io/xtc/file.py +11 -14
- biotite/structure/mechanics.py +9 -11
- biotite/structure/molecules.py +3 -4
- biotite/structure/pseudoknots.py +53 -67
- biotite/structure/rdf.py +23 -21
- biotite/structure/repair.py +137 -86
- biotite/structure/residues.py +26 -16
- biotite/structure/sasa.cp311-win_amd64.pyd +0 -0
- biotite/structure/{resutil.py → segments.py} +24 -23
- biotite/structure/sequence.py +10 -11
- biotite/structure/sse.py +100 -119
- biotite/structure/superimpose.py +39 -77
- biotite/structure/transform.py +97 -71
- biotite/structure/util.py +11 -13
- biotite/version.py +2 -2
- biotite/visualize.py +69 -55
- {biotite-0.41.2.dist-info → biotite-1.0.1.dist-info}/METADATA +6 -5
- biotite-1.0.1.dist-info/RECORD +322 -0
- biotite/structure/io/ctab.py +0 -72
- biotite/structure/io/mmtf/__init__.py +0 -21
- biotite/structure/io/mmtf/assembly.py +0 -214
- biotite/structure/io/mmtf/convertarray.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/convertarray.pyx +0 -341
- biotite/structure/io/mmtf/convertfile.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/convertfile.pyx +0 -501
- biotite/structure/io/mmtf/decode.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/decode.pyx +0 -152
- biotite/structure/io/mmtf/encode.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/encode.pyx +0 -183
- biotite/structure/io/mmtf/file.py +0 -233
- biotite/structure/io/npz/__init__.py +0 -20
- biotite/structure/io/npz/file.py +0 -152
- biotite/structure/io/pdbx/legacy.py +0 -267
- biotite/structure/io/tng/__init__.py +0 -13
- biotite/structure/io/tng/file.py +0 -46
- biotite/temp.py +0 -86
- biotite-0.41.2.dist-info/RECORD +0 -340
- {biotite-0.41.2.dist-info → biotite-1.0.1.dist-info}/WHEEL +0 -0
- {biotite-0.41.2.dist-info → biotite-1.0.1.dist-info}/licenses/LICENSE.rst +0 -0
|
@@ -5,16 +5,22 @@
|
|
|
5
5
|
__name__ = "biotite.sequence.align"
|
|
6
6
|
__author__ = "Patrick Kunzmann"
|
|
7
7
|
|
|
8
|
-
import numpy as np
|
|
9
8
|
import numbers
|
|
10
|
-
import copy
|
|
11
9
|
import textwrap
|
|
12
|
-
from
|
|
13
|
-
|
|
10
|
+
from collections.abc import Sequence
|
|
11
|
+
import numpy as np
|
|
12
|
+
from biotite.sequence.alphabet import LetterAlphabet
|
|
14
13
|
|
|
15
|
-
__all__ = [
|
|
16
|
-
|
|
17
|
-
|
|
14
|
+
__all__ = [
|
|
15
|
+
"Alignment",
|
|
16
|
+
"get_codes",
|
|
17
|
+
"get_symbols",
|
|
18
|
+
"get_sequence_identity",
|
|
19
|
+
"get_pairwise_sequence_identity",
|
|
20
|
+
"score",
|
|
21
|
+
"find_terminal_gaps",
|
|
22
|
+
"remove_terminal_gaps",
|
|
23
|
+
]
|
|
18
24
|
|
|
19
25
|
|
|
20
26
|
class Alignment(object):
|
|
@@ -22,7 +28,7 @@ class Alignment(object):
|
|
|
22
28
|
An :class:`Alignment` object stores information about which symbols
|
|
23
29
|
of *n* sequences are aligned to each other and it stores the
|
|
24
30
|
corresponding alignment score.
|
|
25
|
-
|
|
31
|
+
|
|
26
32
|
Instead of saving a list of aligned symbols, this class saves the
|
|
27
33
|
original *n* sequences, that were aligned, and a so called *trace*,
|
|
28
34
|
which indicates the aligned symbols of these sequences.
|
|
@@ -31,16 +37,16 @@ class Alignment(object):
|
|
|
31
37
|
Each element of the trace is the index in the corresponding
|
|
32
38
|
sequence.
|
|
33
39
|
A gap is represented by the value -1.
|
|
34
|
-
|
|
40
|
+
|
|
35
41
|
Furthermore this class provides multiple utility functions for
|
|
36
42
|
conversion into strings in order to make the alignment human
|
|
37
43
|
readable.
|
|
38
|
-
|
|
44
|
+
|
|
39
45
|
Unless an :class:`Alignment` object is the result of a multiple
|
|
40
46
|
sequence alignment, the object will contain only two sequences.
|
|
41
|
-
|
|
47
|
+
|
|
42
48
|
All attributes of this class are publicly accessible.
|
|
43
|
-
|
|
49
|
+
|
|
44
50
|
Parameters
|
|
45
51
|
----------
|
|
46
52
|
sequences : list
|
|
@@ -49,7 +55,7 @@ class Alignment(object):
|
|
|
49
55
|
The alignment trace.
|
|
50
56
|
score : int, optional
|
|
51
57
|
Alignment score.
|
|
52
|
-
|
|
58
|
+
|
|
53
59
|
Attributes
|
|
54
60
|
----------
|
|
55
61
|
sequences : list
|
|
@@ -58,10 +64,10 @@ class Alignment(object):
|
|
|
58
64
|
The alignment trace.
|
|
59
65
|
score : int
|
|
60
66
|
Alignment score.
|
|
61
|
-
|
|
67
|
+
|
|
62
68
|
Examples
|
|
63
69
|
--------
|
|
64
|
-
|
|
70
|
+
|
|
65
71
|
>>> seq1 = NucleotideSequence("CGTCAT")
|
|
66
72
|
>>> seq2 = NucleotideSequence("TCATGC")
|
|
67
73
|
>>> matrix = SubstitutionMatrix.std_nucleotide_matrix()
|
|
@@ -95,8 +101,10 @@ class Alignment(object):
|
|
|
95
101
|
|
|
96
102
|
def __repr__(self):
|
|
97
103
|
"""Represent Alignment a string for debugging."""
|
|
98
|
-
return
|
|
99
|
-
|
|
104
|
+
return (
|
|
105
|
+
f"Alignment([{', '.join([seq.__repr__() for seq in self.sequences])}], "
|
|
106
|
+
f"np.{np.array_repr(self.trace)}, score={self.score})"
|
|
107
|
+
)
|
|
100
108
|
|
|
101
109
|
def _gapped_str(self, seq_index):
|
|
102
110
|
seq_str = ""
|
|
@@ -107,11 +115,11 @@ class Alignment(object):
|
|
|
107
115
|
else:
|
|
108
116
|
seq_str += "-"
|
|
109
117
|
return seq_str
|
|
110
|
-
|
|
118
|
+
|
|
111
119
|
def get_gapped_sequences(self):
|
|
112
120
|
"""
|
|
113
121
|
Get the string representation of the gapped sequences.
|
|
114
|
-
|
|
122
|
+
|
|
115
123
|
Returns
|
|
116
124
|
-------
|
|
117
125
|
sequences : list of str
|
|
@@ -119,7 +127,7 @@ class Alignment(object):
|
|
|
119
127
|
as in `Alignment.sequences`.
|
|
120
128
|
"""
|
|
121
129
|
return [self._gapped_str(i) for i in range(len(self.sequences))]
|
|
122
|
-
|
|
130
|
+
|
|
123
131
|
def __str__(self):
|
|
124
132
|
# Check if any of the sequences
|
|
125
133
|
# has a non-single letter alphabet
|
|
@@ -143,32 +151,33 @@ class Alignment(object):
|
|
|
143
151
|
return ali_str[:-2]
|
|
144
152
|
else:
|
|
145
153
|
return super().__str__()
|
|
146
|
-
|
|
154
|
+
|
|
147
155
|
def __getitem__(self, index):
|
|
148
156
|
if isinstance(index, tuple):
|
|
149
157
|
if len(index) > 2:
|
|
150
158
|
raise IndexError("Only 1D or 2D indices are allowed")
|
|
151
|
-
if isinstance(index[0], numbers.Integral) or
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
159
|
+
if isinstance(index[0], numbers.Integral) or isinstance(
|
|
160
|
+
index[0], numbers.Integral
|
|
161
|
+
):
|
|
162
|
+
raise IndexError(
|
|
163
|
+
"Integers are invalid indices for alignments, "
|
|
164
|
+
"a single sequence or alignment column cannot be "
|
|
165
|
+
"selected"
|
|
166
|
+
)
|
|
158
167
|
return Alignment(
|
|
159
168
|
Alignment._index_sequences(self.sequences, index[1]),
|
|
160
169
|
self.trace[index],
|
|
161
|
-
self.score
|
|
170
|
+
self.score,
|
|
162
171
|
)
|
|
163
172
|
else:
|
|
164
173
|
return Alignment(self.sequences, self.trace[index], self.score)
|
|
165
|
-
|
|
174
|
+
|
|
166
175
|
def __iter__(self):
|
|
167
176
|
raise TypeError("'Alignment' object is not iterable")
|
|
168
|
-
|
|
177
|
+
|
|
169
178
|
def __len__(self):
|
|
170
179
|
return len(self.trace)
|
|
171
|
-
|
|
180
|
+
|
|
172
181
|
def __eq__(self, item):
|
|
173
182
|
if not isinstance(item, Alignment):
|
|
174
183
|
return False
|
|
@@ -179,45 +188,41 @@ class Alignment(object):
|
|
|
179
188
|
if self.score != item.score:
|
|
180
189
|
return False
|
|
181
190
|
return True
|
|
182
|
-
|
|
191
|
+
|
|
183
192
|
@staticmethod
|
|
184
193
|
def _index_sequences(sequences, index):
|
|
185
|
-
if isinstance(index, (list, tuple)) or
|
|
186
|
-
|
|
187
|
-
|
|
194
|
+
if isinstance(index, (list, tuple)) or (
|
|
195
|
+
isinstance(index, np.ndarray) and index.dtype != bool
|
|
196
|
+
):
|
|
197
|
+
return [sequences[i] for i in index]
|
|
188
198
|
elif isinstance(index, np.ndarray) and index.dtype == bool:
|
|
189
199
|
return [seq for seq, mask in zip(sequences, index) if mask]
|
|
190
200
|
if isinstance(index, slice):
|
|
191
201
|
return sequences[index]
|
|
192
202
|
else:
|
|
193
|
-
raise IndexError(
|
|
194
|
-
|
|
195
|
-
)
|
|
196
|
-
|
|
203
|
+
raise IndexError(f"Invalid alignment index type '{type(index).__name__}'")
|
|
204
|
+
|
|
197
205
|
@staticmethod
|
|
198
206
|
def trace_from_strings(seq_str_list):
|
|
199
207
|
"""
|
|
200
208
|
Create a trace from strings that represent aligned sequences.
|
|
201
|
-
|
|
209
|
+
|
|
202
210
|
Parameters
|
|
203
211
|
----------
|
|
204
212
|
seq_str_list : list of str
|
|
205
213
|
The strings, where each one represents a sequence
|
|
206
214
|
(with gaps) in an alignment.
|
|
207
215
|
A ``-`` is interpreted as gap.
|
|
208
|
-
|
|
216
|
+
|
|
209
217
|
Returns
|
|
210
218
|
-------
|
|
211
219
|
trace : ndarray, dtype=int, shape=(n,2)
|
|
212
220
|
The created trace.
|
|
213
221
|
"""
|
|
214
222
|
if len(seq_str_list) < 2:
|
|
215
|
-
raise ValueError(
|
|
216
|
-
"An alignment must contain at least two sequences"
|
|
217
|
-
)
|
|
223
|
+
raise ValueError("An alignment must contain at least two sequences")
|
|
218
224
|
seq_i = np.zeros(len(seq_str_list))
|
|
219
|
-
trace = np.full((
|
|
220
|
-
-1, dtype=int)
|
|
225
|
+
trace = np.full((len(seq_str_list[0]), len(seq_str_list)), -1, dtype=int)
|
|
221
226
|
# Get length of string (same length for all strings)
|
|
222
227
|
# rather than length of list
|
|
223
228
|
for pos_i in range(len(seq_str_list[0])):
|
|
@@ -238,22 +243,22 @@ def get_codes(alignment):
|
|
|
238
243
|
Instead of the indices of the aligned symbols (trace), the return
|
|
239
244
|
value contains the corresponding symbol codes for each index.
|
|
240
245
|
Gaps are still represented by *-1*.
|
|
241
|
-
|
|
246
|
+
|
|
242
247
|
Parameters
|
|
243
248
|
----------
|
|
244
249
|
alignment : Alignment
|
|
245
250
|
The alignment to get the sequence codes for.
|
|
246
|
-
|
|
251
|
+
|
|
247
252
|
Returns
|
|
248
253
|
-------
|
|
249
254
|
codes : ndarray, dtype=int, shape=(n,m)
|
|
250
255
|
The sequence codes for the alignment.
|
|
251
256
|
The shape is *(n,m)* for *n* sequences and *m* alignment columns.
|
|
252
257
|
The array uses *-1* values for gaps.
|
|
253
|
-
|
|
258
|
+
|
|
254
259
|
Examples
|
|
255
260
|
--------
|
|
256
|
-
|
|
261
|
+
|
|
257
262
|
>>> seq1 = NucleotideSequence("CGTCAT")
|
|
258
263
|
>>> seq2 = NucleotideSequence("TCATGC")
|
|
259
264
|
>>> matrix = SubstitutionMatrix.std_nucleotide_matrix()
|
|
@@ -267,14 +272,17 @@ def get_codes(alignment):
|
|
|
267
272
|
"""
|
|
268
273
|
trace = alignment.trace
|
|
269
274
|
sequences = alignment.sequences
|
|
270
|
-
|
|
275
|
+
|
|
271
276
|
# The number of sequences is the first dimension
|
|
272
|
-
codes = np.zeros((trace.shape[1], trace.shape[0]), dtype=
|
|
277
|
+
codes = np.zeros((trace.shape[1], trace.shape[0]), dtype=np.int64)
|
|
273
278
|
for i in range(len(sequences)):
|
|
279
|
+
# Mark -1 explicitly as int64 to avoid that the unsigned dtype
|
|
280
|
+
# of the sequence code is used
|
|
281
|
+
# (https://numpy.org/neps/nep-0050-scalar-promotion.html)
|
|
274
282
|
codes[i] = np.where(
|
|
275
|
-
trace[:,i] != -1, sequences[i].code[trace[:,i]], -1
|
|
283
|
+
trace[:, i] != -1, sequences[i].code[trace[:, i]], np.int64(-1)
|
|
276
284
|
)
|
|
277
|
-
|
|
285
|
+
|
|
278
286
|
return np.stack(codes)
|
|
279
287
|
|
|
280
288
|
|
|
@@ -283,24 +291,24 @@ def get_symbols(alignment):
|
|
|
283
291
|
Similar to :func:`get_codes()`, but contains the decoded symbols
|
|
284
292
|
instead of codes.
|
|
285
293
|
Gaps are still represented by *None* values.
|
|
286
|
-
|
|
294
|
+
|
|
287
295
|
Parameters
|
|
288
296
|
----------
|
|
289
297
|
alignment : Alignment
|
|
290
298
|
The alignment to get the symbols for.
|
|
291
|
-
|
|
299
|
+
|
|
292
300
|
Returns
|
|
293
301
|
-------
|
|
294
302
|
symbols : list of list
|
|
295
303
|
The nested list of symbols.
|
|
296
|
-
|
|
304
|
+
|
|
297
305
|
See Also
|
|
298
306
|
--------
|
|
299
307
|
get_codes
|
|
300
308
|
|
|
301
309
|
Examples
|
|
302
310
|
--------
|
|
303
|
-
|
|
311
|
+
|
|
304
312
|
>>> seq1 = NucleotideSequence("CGTCAT")
|
|
305
313
|
>>> seq2 = NucleotideSequence("TCATGC")
|
|
306
314
|
>>> matrix = SubstitutionMatrix.std_nucleotide_matrix()
|
|
@@ -317,8 +325,8 @@ def get_symbols(alignment):
|
|
|
317
325
|
alphabet = alignment.sequences[i].get_alphabet()
|
|
318
326
|
codes_wo_gaps = codes[i, codes[i] != -1]
|
|
319
327
|
symbols_wo_gaps = alphabet.decode_multiple(codes_wo_gaps)
|
|
320
|
-
if
|
|
321
|
-
symbols_wo_gaps =
|
|
328
|
+
if isinstance(symbols_wo_gaps, np.ndarray):
|
|
329
|
+
symbols_wo_gaps = symbols_wo_gaps.tolist()
|
|
322
330
|
symbols_for_seq = np.full(len(codes[i]), None, dtype=object)
|
|
323
331
|
symbols_for_seq[codes[i] != -1] = symbols_wo_gaps
|
|
324
332
|
symbols[i] = symbols_for_seq.tolist()
|
|
@@ -331,7 +339,7 @@ def get_sequence_identity(alignment, mode="not_terminal"):
|
|
|
331
339
|
|
|
332
340
|
The identity is equal to the matches divided by a measure for the
|
|
333
341
|
length of the alignment that depends on the `mode` parameter.
|
|
334
|
-
|
|
342
|
+
|
|
335
343
|
Parameters
|
|
336
344
|
----------
|
|
337
345
|
alignment : Alignment
|
|
@@ -348,12 +356,12 @@ def get_sequence_identity(alignment, mode="not_terminal"):
|
|
|
348
356
|
length of the shortest sequence.
|
|
349
357
|
|
|
350
358
|
Default is *not_terminal*.
|
|
351
|
-
|
|
359
|
+
|
|
352
360
|
Returns
|
|
353
361
|
-------
|
|
354
362
|
identity : float
|
|
355
363
|
The sequence identity, ranging between 0 and 1.
|
|
356
|
-
|
|
364
|
+
|
|
357
365
|
See Also
|
|
358
366
|
--------
|
|
359
367
|
get_pairwise_sequence_identity
|
|
@@ -363,12 +371,12 @@ def get_sequence_identity(alignment, mode="not_terminal"):
|
|
|
363
371
|
# Count matches
|
|
364
372
|
matches = 0
|
|
365
373
|
for i in range(codes.shape[1]):
|
|
366
|
-
column = codes[:,i]
|
|
374
|
+
column = codes[:, i]
|
|
367
375
|
# One unique value -> all symbols match
|
|
368
376
|
unique_symbols = np.unique(column)
|
|
369
377
|
if len(unique_symbols) == 1 and unique_symbols[0] != -1:
|
|
370
378
|
matches += 1
|
|
371
|
-
|
|
379
|
+
|
|
372
380
|
# Calculate length
|
|
373
381
|
if mode == "all":
|
|
374
382
|
length = len(alignment)
|
|
@@ -394,7 +402,7 @@ def get_pairwise_sequence_identity(alignment, mode="not_terminal"):
|
|
|
394
402
|
|
|
395
403
|
The identity is equal to the matches divided by a measure for the
|
|
396
404
|
length of the alignment that depends on the `mode` parameter.
|
|
397
|
-
|
|
405
|
+
|
|
398
406
|
Parameters
|
|
399
407
|
----------
|
|
400
408
|
alignment : Alignment, length=n
|
|
@@ -411,12 +419,12 @@ def get_pairwise_sequence_identity(alignment, mode="not_terminal"):
|
|
|
411
419
|
length of the shortest one of the two sequences.
|
|
412
420
|
|
|
413
421
|
Default is *not_terminal*.
|
|
414
|
-
|
|
422
|
+
|
|
415
423
|
Returns
|
|
416
424
|
-------
|
|
417
425
|
identity : ndarray, dtype=float, shape=(n,n)
|
|
418
426
|
The pairwise sequence identity, ranging between 0 and 1.
|
|
419
|
-
|
|
427
|
+
|
|
420
428
|
See Also
|
|
421
429
|
--------
|
|
422
430
|
get_sequence_identity
|
|
@@ -427,9 +435,11 @@ def get_pairwise_sequence_identity(alignment, mode="not_terminal"):
|
|
|
427
435
|
# Count matches
|
|
428
436
|
# Calculate at which positions the sequences are identical
|
|
429
437
|
# and are not gaps
|
|
430
|
-
equality_matrix = (
|
|
431
|
-
|
|
432
|
-
|
|
438
|
+
equality_matrix = (
|
|
439
|
+
(codes[:, np.newaxis, :] == codes[np.newaxis, :, :])
|
|
440
|
+
& (codes[:, np.newaxis, :] != -1)
|
|
441
|
+
& (codes[np.newaxis, :, :] != -1)
|
|
442
|
+
)
|
|
433
443
|
# Sum these positions up
|
|
434
444
|
matches = np.count_nonzero(equality_matrix, axis=-1)
|
|
435
445
|
|
|
@@ -441,24 +451,23 @@ def get_pairwise_sequence_identity(alignment, mode="not_terminal"):
|
|
|
441
451
|
for i in range(n_seq):
|
|
442
452
|
for j in range(n_seq):
|
|
443
453
|
# Find latest start and earliest stop of all sequences
|
|
444
|
-
start, stop = find_terminal_gaps(alignment[:, [i,j]])
|
|
454
|
+
start, stop = find_terminal_gaps(alignment[:, [i, j]])
|
|
445
455
|
if stop <= start:
|
|
446
456
|
raise ValueError(
|
|
447
457
|
"Cannot calculate non-terminal identity, "
|
|
448
458
|
"as the two sequences have no overlap"
|
|
449
459
|
)
|
|
450
|
-
length[i,j] = stop - start
|
|
460
|
+
length[i, j] = stop - start
|
|
451
461
|
elif mode == "shortest":
|
|
452
462
|
length = np.zeros((n_seq, n_seq))
|
|
453
463
|
for i in range(n_seq):
|
|
454
464
|
for j in range(n_seq):
|
|
455
|
-
length[i,j] = min(
|
|
456
|
-
len(alignment.sequences[i]),
|
|
457
|
-
|
|
458
|
-
])
|
|
465
|
+
length[i, j] = min(
|
|
466
|
+
[len(alignment.sequences[i]), len(alignment.sequences[j])]
|
|
467
|
+
)
|
|
459
468
|
else:
|
|
460
469
|
raise ValueError(f"'{mode}' is an invalid calculation mode")
|
|
461
|
-
|
|
470
|
+
|
|
462
471
|
return matches / length
|
|
463
472
|
|
|
464
473
|
|
|
@@ -468,7 +477,7 @@ def score(alignment, matrix, gap_penalty=-10, terminal_penalty=True):
|
|
|
468
477
|
|
|
469
478
|
If the alignment contains more than two sequences,
|
|
470
479
|
all pairwise scores are counted.
|
|
471
|
-
|
|
480
|
+
|
|
472
481
|
Parameters
|
|
473
482
|
----------
|
|
474
483
|
alignment : Alignment
|
|
@@ -485,7 +494,7 @@ def score(alignment, matrix, gap_penalty=-10, terminal_penalty=True):
|
|
|
485
494
|
terminal_penalty : bool, optional
|
|
486
495
|
If true, gap penalties are applied to terminal gaps.
|
|
487
496
|
(Default: True)
|
|
488
|
-
|
|
497
|
+
|
|
489
498
|
Returns
|
|
490
499
|
-------
|
|
491
500
|
score : int
|
|
@@ -503,18 +512,18 @@ def score(alignment, matrix, gap_penalty=-10, terminal_penalty=True):
|
|
|
503
512
|
# Do not count self-similarity
|
|
504
513
|
# and do not count similarity twice (not S(i,j) and S(j,i))
|
|
505
514
|
for i in range(codes.shape[0]):
|
|
506
|
-
for j in range(i+1, codes.shape[0]):
|
|
515
|
+
for j in range(i + 1, codes.shape[0]):
|
|
507
516
|
code_i = column[i]
|
|
508
517
|
code_j = column[j]
|
|
509
518
|
# Ignore gaps
|
|
510
519
|
if code_i != -1 and code_j != -1:
|
|
511
520
|
score += matrix[code_i, code_j]
|
|
512
|
-
|
|
521
|
+
|
|
513
522
|
# Sum gap penalties
|
|
514
|
-
if
|
|
523
|
+
if isinstance(gap_penalty, numbers.Real):
|
|
515
524
|
gap_open = gap_penalty
|
|
516
525
|
gap_ext = gap_penalty
|
|
517
|
-
elif
|
|
526
|
+
elif isinstance(gap_penalty, Sequence):
|
|
518
527
|
gap_open = gap_penalty[0]
|
|
519
528
|
gap_ext = gap_penalty[1]
|
|
520
529
|
else:
|
|
@@ -590,15 +599,15 @@ def find_terminal_gaps(alignment):
|
|
|
590
599
|
"""
|
|
591
600
|
trace = alignment.trace
|
|
592
601
|
# Find for each sequence the positions of non-gap symbols
|
|
593
|
-
no_gap_pos = [np.where(trace[:,i] != -1)[0] for i in range(trace.shape[1])]
|
|
602
|
+
no_gap_pos = [np.where(trace[:, i] != -1)[0] for i in range(trace.shape[1])]
|
|
594
603
|
# Find for each sequence the positions of the sequence start and end
|
|
595
604
|
# in the alignment
|
|
596
|
-
firsts = [no_gap_pos[i][0
|
|
597
|
-
lasts
|
|
605
|
+
firsts = [no_gap_pos[i][0] for i in range(trace.shape[1])]
|
|
606
|
+
lasts = [no_gap_pos[i][-1] for i in range(trace.shape[1])]
|
|
598
607
|
# The terminal gaps are before all sequences start and after any
|
|
599
608
|
# sequence ends
|
|
600
609
|
# Use exclusive stop -> +1
|
|
601
|
-
return np.max(firsts), np.min(lasts) + 1
|
|
610
|
+
return np.max(firsts).item(), np.min(lasts).item() + 1
|
|
602
611
|
|
|
603
612
|
|
|
604
613
|
def remove_terminal_gaps(alignment):
|
|
@@ -655,4 +664,4 @@ def remove_terminal_gaps(alignment):
|
|
|
655
664
|
"Cannot remove terminal gaps, since at least two sequences have "
|
|
656
665
|
"no overlap and the resulting alignment would be empty"
|
|
657
666
|
)
|
|
658
|
-
return alignment[start
|
|
667
|
+
return alignment[start:stop]
|
|
Binary file
|
|
@@ -6,11 +6,12 @@ __name__ = "biotite.sequence.align"
|
|
|
6
6
|
__author__ = "Patrick Kunzmann"
|
|
7
7
|
__all__ = ["bucket_number"]
|
|
8
8
|
|
|
9
|
-
from os.path import
|
|
9
|
+
from os.path import dirname, join, realpath
|
|
10
10
|
import numpy as np
|
|
11
11
|
|
|
12
|
-
|
|
13
12
|
_primes = None
|
|
13
|
+
|
|
14
|
+
|
|
14
15
|
def bucket_number(n_kmers, load_factor=0.8):
|
|
15
16
|
"""
|
|
16
17
|
Find an appropriate number of buckets for a :class:`BucketKmerTable`
|
|
@@ -54,16 +55,17 @@ def bucket_number(n_kmers, load_factor=0.8):
|
|
|
54
55
|
"""
|
|
55
56
|
global _primes
|
|
56
57
|
if _primes is None:
|
|
57
|
-
with open(
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
58
|
+
with open(join(dirname(realpath(__file__)), "primes.txt")) as file:
|
|
59
|
+
_primes = np.array(
|
|
60
|
+
[
|
|
61
|
+
int(line)
|
|
62
|
+
for line in file.read().splitlines()
|
|
63
|
+
if len(line) != 0 and line[0] != "#"
|
|
64
|
+
]
|
|
65
|
+
)
|
|
64
66
|
|
|
65
67
|
number = int(n_kmers / load_factor)
|
|
66
68
|
index = np.searchsorted(_primes, number, side="left")
|
|
67
69
|
if index == len(_primes):
|
|
68
70
|
raise ValueError("Number of buckets too large")
|
|
69
|
-
return _primes[index]
|
|
71
|
+
return _primes[index]
|