biotite 0.41.2__cp312-cp312-win_amd64.whl → 1.0.0__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +2 -3
- biotite/application/__init__.py +1 -1
- biotite/application/application.py +20 -10
- biotite/application/autodock/__init__.py +1 -1
- biotite/application/autodock/app.py +74 -79
- biotite/application/blast/__init__.py +1 -1
- biotite/application/blast/alignment.py +19 -10
- biotite/application/blast/webapp.py +92 -85
- biotite/application/clustalo/__init__.py +1 -1
- biotite/application/clustalo/app.py +46 -61
- biotite/application/dssp/__init__.py +1 -1
- biotite/application/dssp/app.py +8 -11
- biotite/application/localapp.py +62 -60
- biotite/application/mafft/__init__.py +1 -1
- biotite/application/mafft/app.py +16 -22
- biotite/application/msaapp.py +78 -89
- biotite/application/muscle/__init__.py +1 -1
- biotite/application/muscle/app3.py +50 -64
- biotite/application/muscle/app5.py +23 -31
- biotite/application/sra/__init__.py +1 -1
- biotite/application/sra/app.py +64 -68
- biotite/application/tantan/__init__.py +1 -1
- biotite/application/tantan/app.py +22 -45
- biotite/application/util.py +7 -9
- biotite/application/viennarna/rnaalifold.py +34 -28
- biotite/application/viennarna/rnafold.py +24 -39
- biotite/application/viennarna/rnaplot.py +36 -21
- biotite/application/viennarna/util.py +17 -12
- biotite/application/webapp.py +13 -14
- biotite/copyable.py +13 -13
- biotite/database/__init__.py +1 -1
- biotite/database/entrez/__init__.py +1 -1
- biotite/database/entrez/check.py +2 -3
- biotite/database/entrez/dbnames.py +7 -5
- biotite/database/entrez/download.py +55 -49
- biotite/database/entrez/key.py +1 -1
- biotite/database/entrez/query.py +62 -23
- biotite/database/error.py +2 -1
- biotite/database/pubchem/__init__.py +1 -1
- biotite/database/pubchem/download.py +43 -45
- biotite/database/pubchem/error.py +2 -2
- biotite/database/pubchem/query.py +34 -31
- biotite/database/pubchem/throttle.py +3 -4
- biotite/database/rcsb/__init__.py +1 -1
- biotite/database/rcsb/download.py +44 -52
- biotite/database/rcsb/query.py +85 -80
- biotite/database/uniprot/check.py +6 -3
- biotite/database/uniprot/download.py +6 -11
- biotite/database/uniprot/query.py +115 -31
- biotite/file.py +12 -31
- biotite/sequence/__init__.py +3 -3
- biotite/sequence/align/__init__.py +2 -2
- biotite/sequence/align/alignment.py +99 -90
- biotite/sequence/align/banded.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/buckets.py +12 -10
- biotite/sequence/align/cigar.py +43 -52
- biotite/sequence/align/kmeralphabet.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/kmeralphabet.pyx +55 -51
- biotite/sequence/align/kmersimilarity.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.pyx +3 -2
- biotite/sequence/align/localgapped.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/localungapped.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/matrix.py +81 -82
- biotite/sequence/align/multiple.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/multiple.pyx +1 -1
- biotite/sequence/align/pairwise.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.pyx +12 -4
- biotite/sequence/align/selector.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.pyx +52 -54
- biotite/sequence/align/statistics.py +32 -33
- biotite/sequence/align/tracetable.cp312-win_amd64.pyd +0 -0
- biotite/sequence/alphabet.py +51 -65
- biotite/sequence/annotation.py +78 -77
- biotite/sequence/codec.cp312-win_amd64.pyd +0 -0
- biotite/sequence/codon.py +90 -79
- biotite/sequence/graphics/__init__.py +1 -1
- biotite/sequence/graphics/alignment.py +184 -103
- biotite/sequence/graphics/colorschemes.py +10 -12
- biotite/sequence/graphics/dendrogram.py +79 -34
- biotite/sequence/graphics/features.py +133 -99
- biotite/sequence/graphics/logo.py +22 -28
- biotite/sequence/graphics/plasmid.py +229 -178
- biotite/sequence/io/fasta/__init__.py +1 -1
- biotite/sequence/io/fasta/convert.py +44 -33
- biotite/sequence/io/fasta/file.py +42 -55
- biotite/sequence/io/fastq/__init__.py +1 -1
- biotite/sequence/io/fastq/convert.py +11 -14
- biotite/sequence/io/fastq/file.py +68 -112
- biotite/sequence/io/genbank/__init__.py +2 -2
- biotite/sequence/io/genbank/annotation.py +12 -20
- biotite/sequence/io/genbank/file.py +74 -76
- biotite/sequence/io/genbank/metadata.py +74 -62
- biotite/sequence/io/genbank/sequence.py +13 -14
- biotite/sequence/io/general.py +39 -30
- biotite/sequence/io/gff/__init__.py +2 -2
- biotite/sequence/io/gff/convert.py +10 -15
- biotite/sequence/io/gff/file.py +81 -65
- biotite/sequence/phylo/__init__.py +1 -1
- biotite/sequence/phylo/nj.cp312-win_amd64.pyd +0 -0
- biotite/sequence/phylo/tree.cp312-win_amd64.pyd +0 -0
- biotite/sequence/phylo/upgma.cp312-win_amd64.pyd +0 -0
- biotite/sequence/profile.py +57 -28
- biotite/sequence/search.py +17 -15
- biotite/sequence/seqtypes.py +200 -164
- biotite/sequence/sequence.py +15 -17
- biotite/structure/__init__.py +3 -3
- biotite/structure/atoms.py +221 -235
- biotite/structure/basepairs.py +260 -271
- biotite/structure/bonds.cp312-win_amd64.pyd +0 -0
- biotite/structure/bonds.pyx +29 -32
- biotite/structure/box.py +67 -71
- biotite/structure/celllist.cp312-win_amd64.pyd +0 -0
- biotite/structure/chains.py +55 -39
- biotite/structure/charges.cp312-win_amd64.pyd +0 -0
- biotite/structure/compare.py +32 -32
- biotite/structure/density.py +13 -18
- biotite/structure/dotbracket.py +20 -22
- biotite/structure/error.py +10 -2
- biotite/structure/filter.py +82 -77
- biotite/structure/geometry.py +130 -119
- biotite/structure/graphics/atoms.py +60 -43
- biotite/structure/graphics/rna.py +81 -68
- biotite/structure/hbond.py +112 -93
- biotite/structure/info/__init__.py +0 -2
- biotite/structure/info/atoms.py +10 -11
- biotite/structure/info/bonds.py +41 -43
- biotite/structure/info/ccd.py +4 -5
- biotite/structure/info/groups.py +1 -3
- biotite/structure/info/masses.py +5 -10
- biotite/structure/info/misc.py +1 -1
- biotite/structure/info/radii.py +20 -20
- biotite/structure/info/standardize.py +15 -26
- biotite/structure/integrity.py +18 -71
- biotite/structure/io/__init__.py +3 -4
- biotite/structure/io/dcd/__init__.py +1 -1
- biotite/structure/io/dcd/file.py +22 -20
- biotite/structure/io/general.py +47 -61
- biotite/structure/io/gro/__init__.py +1 -1
- biotite/structure/io/gro/file.py +73 -72
- biotite/structure/io/mol/__init__.py +1 -1
- biotite/structure/io/mol/convert.py +8 -11
- biotite/structure/io/mol/ctab.py +37 -36
- biotite/structure/io/mol/header.py +14 -10
- biotite/structure/io/mol/mol.py +9 -53
- biotite/structure/io/mol/sdf.py +47 -50
- biotite/structure/io/netcdf/__init__.py +1 -1
- biotite/structure/io/netcdf/file.py +24 -23
- biotite/structure/io/pdb/__init__.py +1 -1
- biotite/structure/io/pdb/convert.py +32 -20
- biotite/structure/io/pdb/file.py +151 -172
- biotite/structure/io/pdb/hybrid36.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/pdbqt/__init__.py +1 -1
- biotite/structure/io/pdbqt/convert.py +17 -11
- biotite/structure/io/pdbqt/file.py +128 -80
- biotite/structure/io/pdbx/__init__.py +1 -2
- biotite/structure/io/pdbx/bcif.py +36 -44
- biotite/structure/io/pdbx/cif.py +64 -62
- biotite/structure/io/pdbx/component.py +10 -16
- biotite/structure/io/pdbx/convert.py +235 -246
- biotite/structure/io/pdbx/encoding.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/trajfile.py +76 -93
- biotite/structure/io/trr/__init__.py +1 -1
- biotite/structure/io/trr/file.py +12 -15
- biotite/structure/io/xtc/__init__.py +1 -1
- biotite/structure/io/xtc/file.py +11 -14
- biotite/structure/mechanics.py +9 -11
- biotite/structure/molecules.py +3 -4
- biotite/structure/pseudoknots.py +53 -67
- biotite/structure/rdf.py +23 -21
- biotite/structure/repair.py +137 -86
- biotite/structure/residues.py +26 -16
- biotite/structure/sasa.cp312-win_amd64.pyd +0 -0
- biotite/structure/{resutil.py → segments.py} +24 -23
- biotite/structure/sequence.py +10 -11
- biotite/structure/sse.py +100 -119
- biotite/structure/superimpose.py +39 -77
- biotite/structure/transform.py +97 -71
- biotite/structure/util.py +11 -13
- biotite/version.py +2 -2
- biotite/visualize.py +69 -55
- {biotite-0.41.2.dist-info → biotite-1.0.0.dist-info}/METADATA +5 -5
- biotite-1.0.0.dist-info/RECORD +322 -0
- biotite/structure/io/ctab.py +0 -72
- biotite/structure/io/mmtf/__init__.py +0 -21
- biotite/structure/io/mmtf/assembly.py +0 -214
- biotite/structure/io/mmtf/convertarray.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/convertarray.pyx +0 -341
- biotite/structure/io/mmtf/convertfile.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/convertfile.pyx +0 -501
- biotite/structure/io/mmtf/decode.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/decode.pyx +0 -152
- biotite/structure/io/mmtf/encode.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/encode.pyx +0 -183
- biotite/structure/io/mmtf/file.py +0 -233
- biotite/structure/io/npz/__init__.py +0 -20
- biotite/structure/io/npz/file.py +0 -152
- biotite/structure/io/pdbx/legacy.py +0 -267
- biotite/structure/io/tng/__init__.py +0 -13
- biotite/structure/io/tng/file.py +0 -46
- biotite/temp.py +0 -86
- biotite-0.41.2.dist-info/RECORD +0 -340
- {biotite-0.41.2.dist-info → biotite-1.0.0.dist-info}/WHEEL +0 -0
- {biotite-0.41.2.dist-info → biotite-1.0.0.dist-info}/licenses/LICENSE.rst +0 -0
|
@@ -10,10 +10,8 @@ __all__ = ["MinimizerSelector", "SyncmerSelector", "CachedSyncmerSelector",
|
|
|
10
10
|
cimport cython
|
|
11
11
|
cimport numpy as np
|
|
12
12
|
|
|
13
|
-
from numbers import Integral
|
|
14
13
|
import numpy as np
|
|
15
14
|
from .kmeralphabet import KmerAlphabet
|
|
16
|
-
from ..alphabet import AlphabetError
|
|
17
15
|
|
|
18
16
|
|
|
19
17
|
ctypedef np.int64_t int64
|
|
@@ -21,7 +19,7 @@ ctypedef np.uint32_t uint32
|
|
|
21
19
|
|
|
22
20
|
|
|
23
21
|
# Obtained from 'np.iinfo(np.int64).max'
|
|
24
|
-
|
|
22
|
+
cdef int64 MAX_INT_64 = 9223372036854775807
|
|
25
23
|
|
|
26
24
|
|
|
27
25
|
class MinimizerSelector:
|
|
@@ -54,7 +52,7 @@ class MinimizerSelector:
|
|
|
54
52
|
This standard order is often the lexicographical order, which is
|
|
55
53
|
known to yield suboptimal *density* in many cases
|
|
56
54
|
:footcite:`Roberts2004`.
|
|
57
|
-
|
|
55
|
+
|
|
58
56
|
Attributes
|
|
59
57
|
----------
|
|
60
58
|
kmer_alphabet : KmerAlphabet
|
|
@@ -73,7 +71,7 @@ class MinimizerSelector:
|
|
|
73
71
|
|
|
74
72
|
References
|
|
75
73
|
----------
|
|
76
|
-
|
|
74
|
+
|
|
77
75
|
.. footbibliography::
|
|
78
76
|
|
|
79
77
|
Examples
|
|
@@ -122,12 +120,12 @@ class MinimizerSelector:
|
|
|
122
120
|
self._window = window
|
|
123
121
|
self._kmer_alph = kmer_alphabet
|
|
124
122
|
self._permutation = permutation
|
|
125
|
-
|
|
123
|
+
|
|
126
124
|
|
|
127
125
|
@property
|
|
128
126
|
def kmer_alphabet(self):
|
|
129
127
|
return self._kmer_alph
|
|
130
|
-
|
|
128
|
+
|
|
131
129
|
@property
|
|
132
130
|
def window(self):
|
|
133
131
|
return self._window
|
|
@@ -135,7 +133,7 @@ class MinimizerSelector:
|
|
|
135
133
|
@property
|
|
136
134
|
def permutation(self):
|
|
137
135
|
return self._permutation
|
|
138
|
-
|
|
136
|
+
|
|
139
137
|
|
|
140
138
|
def select(self, sequence, bint alphabet_check=True):
|
|
141
139
|
"""
|
|
@@ -154,7 +152,7 @@ class MinimizerSelector:
|
|
|
154
152
|
of the sequence and the alphabet of the
|
|
155
153
|
:class:`MinimizerSelector`
|
|
156
154
|
is not checked to gain additional performance.
|
|
157
|
-
|
|
155
|
+
|
|
158
156
|
Returns
|
|
159
157
|
-------
|
|
160
158
|
minimizer_indices : ndarray, dtype=np.uint32
|
|
@@ -162,7 +160,7 @@ class MinimizerSelector:
|
|
|
162
160
|
minimizers : ndarray, dtype=np.int64
|
|
163
161
|
The *k-mers* that are the selected minimizers, returned as
|
|
164
162
|
*k-mer* code.
|
|
165
|
-
|
|
163
|
+
|
|
166
164
|
Notes
|
|
167
165
|
-----
|
|
168
166
|
Duplicate minimizers are omitted, i.e. if two windows have the
|
|
@@ -176,7 +174,7 @@ class MinimizerSelector:
|
|
|
176
174
|
)
|
|
177
175
|
kmers = self._kmer_alph.create_kmers(sequence.code)
|
|
178
176
|
return self.select_from_kmers(kmers)
|
|
179
|
-
|
|
177
|
+
|
|
180
178
|
|
|
181
179
|
def select_from_kmers(self, kmers):
|
|
182
180
|
"""
|
|
@@ -191,7 +189,7 @@ class MinimizerSelector:
|
|
|
191
189
|
minimizers in.
|
|
192
190
|
The *k-mer* codes correspond to the *k-mers* encoded by the
|
|
193
191
|
given `kmer_alphabet`.
|
|
194
|
-
|
|
192
|
+
|
|
195
193
|
Returns
|
|
196
194
|
-------
|
|
197
195
|
minimizer_indices : ndarray, dtype=np.uint32
|
|
@@ -199,7 +197,7 @@ class MinimizerSelector:
|
|
|
199
197
|
appears.
|
|
200
198
|
minimizers : ndarray, dtype=np.int64
|
|
201
199
|
The corresponding *k-mers* codes of the minimizers.
|
|
202
|
-
|
|
200
|
+
|
|
203
201
|
Notes
|
|
204
202
|
-----
|
|
205
203
|
Duplicate minimizers are omitted, i.e. if two windows have the
|
|
@@ -267,7 +265,7 @@ class SyncmerSelector:
|
|
|
267
265
|
*k-mer*.
|
|
268
266
|
By default, the minimum position needs to be at the start of the
|
|
269
267
|
*k-mer*, which is termed *open syncmer*.
|
|
270
|
-
|
|
268
|
+
|
|
271
269
|
Attributes
|
|
272
270
|
----------
|
|
273
271
|
alphabet : Alphabet
|
|
@@ -276,7 +274,7 @@ class SyncmerSelector:
|
|
|
276
274
|
The :class:`KmerAlphabet` for *k* and *s*, respectively.
|
|
277
275
|
permutation : Permutation
|
|
278
276
|
The permutation.
|
|
279
|
-
|
|
277
|
+
|
|
280
278
|
See also
|
|
281
279
|
--------
|
|
282
280
|
CachedSyncmerSelector
|
|
@@ -291,7 +289,7 @@ class SyncmerSelector:
|
|
|
291
289
|
|
|
292
290
|
References
|
|
293
291
|
----------
|
|
294
|
-
|
|
292
|
+
|
|
295
293
|
.. footbibliography::
|
|
296
294
|
|
|
297
295
|
Examples
|
|
@@ -337,7 +335,7 @@ class SyncmerSelector:
|
|
|
337
335
|
self._alphabet = alphabet
|
|
338
336
|
self._kmer_alph = KmerAlphabet(alphabet, k)
|
|
339
337
|
self._smer_alph = KmerAlphabet(alphabet, s)
|
|
340
|
-
|
|
338
|
+
|
|
341
339
|
self._permutation = permutation
|
|
342
340
|
|
|
343
341
|
self._offset = np.asarray(offset, dtype=np.int64)
|
|
@@ -353,7 +351,7 @@ class SyncmerSelector:
|
|
|
353
351
|
)
|
|
354
352
|
if len(np.unique(self._offset)) != len(self._offset):
|
|
355
353
|
raise ValueError("Offset must contain unique values")
|
|
356
|
-
|
|
354
|
+
|
|
357
355
|
|
|
358
356
|
@property
|
|
359
357
|
def alphabet(self):
|
|
@@ -362,7 +360,7 @@ class SyncmerSelector:
|
|
|
362
360
|
@property
|
|
363
361
|
def kmer_alphabet(self):
|
|
364
362
|
return self._kmer_alph
|
|
365
|
-
|
|
363
|
+
|
|
366
364
|
@property
|
|
367
365
|
def smer_alphabet(self):
|
|
368
366
|
return self._smer_alph
|
|
@@ -370,7 +368,7 @@ class SyncmerSelector:
|
|
|
370
368
|
@property
|
|
371
369
|
def permutation(self):
|
|
372
370
|
return self._permutation
|
|
373
|
-
|
|
371
|
+
|
|
374
372
|
|
|
375
373
|
def select(self, sequence, bint alphabet_check=True):
|
|
376
374
|
"""
|
|
@@ -389,7 +387,7 @@ class SyncmerSelector:
|
|
|
389
387
|
of the sequence and the alphabet of the
|
|
390
388
|
:class:`SyncmerSelector`
|
|
391
389
|
is not checked to gain additional performance.
|
|
392
|
-
|
|
390
|
+
|
|
393
391
|
Returns
|
|
394
392
|
-------
|
|
395
393
|
syncmer_indices : ndarray, dtype=np.uint32
|
|
@@ -428,7 +426,7 @@ class SyncmerSelector:
|
|
|
428
426
|
relative_min_pos = min_pos - np.arange(len(kmers))
|
|
429
427
|
syncmer_pos = self._filter_syncmer_pos(relative_min_pos)
|
|
430
428
|
return syncmer_pos, kmers[syncmer_pos]
|
|
431
|
-
|
|
429
|
+
|
|
432
430
|
|
|
433
431
|
def select_from_kmers(self, kmers):
|
|
434
432
|
"""
|
|
@@ -442,7 +440,7 @@ class SyncmerSelector:
|
|
|
442
440
|
----------
|
|
443
441
|
kmers : ndarray, dtype=np.int64
|
|
444
442
|
The *k-mer* codes to select the syncmers from.
|
|
445
|
-
|
|
443
|
+
|
|
446
444
|
Returns
|
|
447
445
|
-------
|
|
448
446
|
syncmer_indices : ndarray, dtype=np.uint32
|
|
@@ -459,9 +457,9 @@ class SyncmerSelector:
|
|
|
459
457
|
:class:`Sequence` objects.
|
|
460
458
|
"""
|
|
461
459
|
cdef int64 i
|
|
462
|
-
|
|
460
|
+
|
|
463
461
|
symbol_codes_for_each_kmer = self._kmer_alph.split(kmers)
|
|
464
|
-
|
|
462
|
+
|
|
465
463
|
cdef int64[:] min_pos = np.zeros(
|
|
466
464
|
len(symbol_codes_for_each_kmer), dtype=np.int64
|
|
467
465
|
)
|
|
@@ -477,10 +475,10 @@ class SyncmerSelector:
|
|
|
477
475
|
f"sort keys for {len(smers)} s-mers"
|
|
478
476
|
)
|
|
479
477
|
min_pos[i] = np.argmin(ordering)
|
|
480
|
-
|
|
478
|
+
|
|
481
479
|
syncmer_pos = self._filter_syncmer_pos(min_pos)
|
|
482
480
|
return syncmer_pos, kmers[syncmer_pos]
|
|
483
|
-
|
|
481
|
+
|
|
484
482
|
|
|
485
483
|
def _filter_syncmer_pos(self, min_pos):
|
|
486
484
|
"""
|
|
@@ -538,7 +536,7 @@ class CachedSyncmerSelector(SyncmerSelector):
|
|
|
538
536
|
*k-mer*.
|
|
539
537
|
By default, the minimum position needs to be at the start of the
|
|
540
538
|
*k-mer*, which is termed *open syncmer*.
|
|
541
|
-
|
|
539
|
+
|
|
542
540
|
Attributes
|
|
543
541
|
----------
|
|
544
542
|
alphabet : Alphabet
|
|
@@ -547,7 +545,7 @@ class CachedSyncmerSelector(SyncmerSelector):
|
|
|
547
545
|
The :class:`KmerAlphabet` for *k* and *s*, respectively.
|
|
548
546
|
permutation : Permutation
|
|
549
547
|
The permutation.
|
|
550
|
-
|
|
548
|
+
|
|
551
549
|
See also
|
|
552
550
|
--------
|
|
553
551
|
SyncmerSelector
|
|
@@ -562,7 +560,7 @@ class CachedSyncmerSelector(SyncmerSelector):
|
|
|
562
560
|
|
|
563
561
|
References
|
|
564
562
|
----------
|
|
565
|
-
|
|
563
|
+
|
|
566
564
|
.. footbibliography::
|
|
567
565
|
|
|
568
566
|
Examples
|
|
@@ -584,7 +582,7 @@ class CachedSyncmerSelector(SyncmerSelector):
|
|
|
584
582
|
>>> print(["".join(kmer_alph.decode(kmer)) for kmer in syncmers])
|
|
585
583
|
['GGCAA', 'AAGTG', 'AGTGA', 'GTGAC']
|
|
586
584
|
"""
|
|
587
|
-
|
|
585
|
+
|
|
588
586
|
def __init__(self, alphabet, k, s, permutation=None, offset=(0,)):
|
|
589
587
|
super().__init__(alphabet, k, s, permutation, offset)
|
|
590
588
|
# Check for all possible *k-mers*, whether they are syncmers
|
|
@@ -593,7 +591,7 @@ class CachedSyncmerSelector(SyncmerSelector):
|
|
|
593
591
|
# Convert the index array into a boolean mask
|
|
594
592
|
self._syncmer_mask = np.zeros(len(self.kmer_alphabet), dtype=bool)
|
|
595
593
|
self._syncmer_mask[syncmer_indices] = True
|
|
596
|
-
|
|
594
|
+
|
|
597
595
|
|
|
598
596
|
def select(self, sequence, bint alphabet_check=True):
|
|
599
597
|
"""
|
|
@@ -612,7 +610,7 @@ class CachedSyncmerSelector(SyncmerSelector):
|
|
|
612
610
|
of the sequence and the alphabet of the
|
|
613
611
|
:class:`CachedSyncmerSelector`
|
|
614
612
|
is not checked to gain additional performance.
|
|
615
|
-
|
|
613
|
+
|
|
616
614
|
Returns
|
|
617
615
|
-------
|
|
618
616
|
syncmer_indices : ndarray, dtype=np.uint32
|
|
@@ -628,7 +626,7 @@ class CachedSyncmerSelector(SyncmerSelector):
|
|
|
628
626
|
)
|
|
629
627
|
kmers = self.kmer_alphabet.create_kmers(sequence.code)
|
|
630
628
|
return self.select_from_kmers(kmers)
|
|
631
|
-
|
|
629
|
+
|
|
632
630
|
|
|
633
631
|
def select_from_kmers(self, kmers):
|
|
634
632
|
"""
|
|
@@ -642,7 +640,7 @@ class CachedSyncmerSelector(SyncmerSelector):
|
|
|
642
640
|
----------
|
|
643
641
|
kmers : ndarray, dtype=np.int64
|
|
644
642
|
The *k-mer* codes to select the syncmers from.
|
|
645
|
-
|
|
643
|
+
|
|
646
644
|
Returns
|
|
647
645
|
-------
|
|
648
646
|
syncmer_indices : ndarray, dtype=np.uint32
|
|
@@ -660,7 +658,7 @@ class MincodeSelector:
|
|
|
660
658
|
|
|
661
659
|
Selects the :math:`1/\text{compression}` *smallest* *k-mers* from
|
|
662
660
|
:class:`KmerAlphabet`. :footcite:`Edgar2021`
|
|
663
|
-
|
|
661
|
+
|
|
664
662
|
'*Small*' refers to the lexicographical order, or alternatively a
|
|
665
663
|
custom order if `permutation` is given.
|
|
666
664
|
The *Mincode* approach tries to reduce the number of *k-mers* from a
|
|
@@ -682,7 +680,7 @@ class MincodeSelector:
|
|
|
682
680
|
By default, the standard order of the :class:`KmerAlphabet` is
|
|
683
681
|
used.
|
|
684
682
|
This standard order is often the lexicographical order.
|
|
685
|
-
|
|
683
|
+
|
|
686
684
|
Attributes
|
|
687
685
|
----------
|
|
688
686
|
kmer_alphabet : KmerAlphabet
|
|
@@ -695,10 +693,10 @@ class MincodeSelector:
|
|
|
695
693
|
All *k-mers*, that are smaller than this value are selected.
|
|
696
694
|
permutation : Permutation
|
|
697
695
|
The permutation.
|
|
698
|
-
|
|
696
|
+
|
|
699
697
|
References
|
|
700
698
|
----------
|
|
701
|
-
|
|
699
|
+
|
|
702
700
|
.. footbibliography::
|
|
703
701
|
|
|
704
702
|
Examples
|
|
@@ -735,12 +733,12 @@ class MincodeSelector:
|
|
|
735
733
|
permutation_offset = permutation.min
|
|
736
734
|
permutation_range = permutation.max - permutation.min + 1
|
|
737
735
|
self._threshold = permutation_offset + permutation_range / compression
|
|
738
|
-
|
|
736
|
+
|
|
739
737
|
|
|
740
738
|
@property
|
|
741
739
|
def kmer_alphabet(self):
|
|
742
740
|
return self._kmer_alph
|
|
743
|
-
|
|
741
|
+
|
|
744
742
|
@property
|
|
745
743
|
def compression(self):
|
|
746
744
|
return self._compression
|
|
@@ -752,7 +750,7 @@ class MincodeSelector:
|
|
|
752
750
|
@property
|
|
753
751
|
def permutation(self):
|
|
754
752
|
return self._permutation
|
|
755
|
-
|
|
753
|
+
|
|
756
754
|
|
|
757
755
|
def select(self, sequence, bint alphabet_check=True):
|
|
758
756
|
"""
|
|
@@ -771,7 +769,7 @@ class MincodeSelector:
|
|
|
771
769
|
of the sequence and the alphabet of the
|
|
772
770
|
:class:`MincodeSelector`
|
|
773
771
|
is not checked to gain additional performance.
|
|
774
|
-
|
|
772
|
+
|
|
775
773
|
Returns
|
|
776
774
|
-------
|
|
777
775
|
mincode_indices : ndarray, dtype=np.uint32
|
|
@@ -786,7 +784,7 @@ class MincodeSelector:
|
|
|
786
784
|
)
|
|
787
785
|
kmers = self._kmer_alph.create_kmers(sequence.code)
|
|
788
786
|
return self.select_from_kmers(kmers)
|
|
789
|
-
|
|
787
|
+
|
|
790
788
|
|
|
791
789
|
def select_from_kmers(self, kmers):
|
|
792
790
|
"""
|
|
@@ -800,7 +798,7 @@ class MincodeSelector:
|
|
|
800
798
|
----------
|
|
801
799
|
kmers : ndarray, dtype=np.int64
|
|
802
800
|
The *k-mer* codes to select the *Mincode k-mers* from.
|
|
803
|
-
|
|
801
|
+
|
|
804
802
|
Returns
|
|
805
803
|
-------
|
|
806
804
|
mincode_indices : ndarray, dtype=np.uint32
|
|
@@ -820,7 +818,7 @@ class MincodeSelector:
|
|
|
820
818
|
|
|
821
819
|
mincode_pos = ordering < self._threshold
|
|
822
820
|
return mincode_pos, kmers[mincode_pos]
|
|
823
|
-
|
|
821
|
+
|
|
824
822
|
|
|
825
823
|
@cython.boundscheck(False)
|
|
826
824
|
@cython.wraparound(False)
|
|
@@ -835,7 +833,7 @@ def _minimize(int64[:] kmers, int64[:] ordering, uint32 window,
|
|
|
835
833
|
instead of 'x - (window-1)/2' to 'x + (window-1)/2'.
|
|
836
834
|
"""
|
|
837
835
|
cdef uint32 seq_i
|
|
838
|
-
|
|
836
|
+
|
|
839
837
|
cdef uint32 n_windows = kmers.shape[0] - (window - 1)
|
|
840
838
|
# Pessimistic array allocation size
|
|
841
839
|
# -> Expect that every window has a new minimizer
|
|
@@ -865,14 +863,14 @@ def _minimize(int64[:] kmers, int64[:] ordering, uint32 window,
|
|
|
865
863
|
reverse_argcummin = reverse_argcummins[seq_i]
|
|
866
864
|
forward_cummin = ordering[forward_argcummin]
|
|
867
865
|
reverse_cummin = ordering[reverse_argcummin]
|
|
868
|
-
|
|
866
|
+
|
|
869
867
|
# At ties the leftmost position is taken,
|
|
870
868
|
# which stems from the reverse pass
|
|
871
869
|
if forward_cummin < reverse_cummin:
|
|
872
870
|
combined_argcummin = forward_argcummin
|
|
873
871
|
else:
|
|
874
872
|
combined_argcummin = reverse_argcummin
|
|
875
|
-
|
|
873
|
+
|
|
876
874
|
# If the same minimizer position was observed before, the
|
|
877
875
|
# duplicate is simply ignored, if 'include_duplicates' is false
|
|
878
876
|
if include_duplicates or combined_argcummin != prev_argcummin:
|
|
@@ -899,7 +897,7 @@ cdef _chunk_wise_forward_argcummin(int64[:] values, uint32 chunk_size):
|
|
|
899
897
|
cdef uint32 current_min_i = 0
|
|
900
898
|
cdef int64 current_min, current_val
|
|
901
899
|
cdef uint32[:] min_pos = np.empty(values.shape[0], dtype=np.uint32)
|
|
902
|
-
|
|
900
|
+
|
|
903
901
|
# Any actual value will be smaller than this placeholder
|
|
904
902
|
current_min = MAX_INT_64
|
|
905
903
|
for seq_i in range(values.shape[0]):
|
|
@@ -911,7 +909,7 @@ cdef _chunk_wise_forward_argcummin(int64[:] values, uint32 chunk_size):
|
|
|
911
909
|
current_min_i = seq_i
|
|
912
910
|
current_min = current_val
|
|
913
911
|
min_pos[seq_i] = current_min_i
|
|
914
|
-
|
|
912
|
+
|
|
915
913
|
return min_pos
|
|
916
914
|
|
|
917
915
|
@cython.boundscheck(False)
|
|
@@ -930,7 +928,7 @@ cdef _chunk_wise_reverse_argcummin(int64[:] values, uint32 chunk_size):
|
|
|
930
928
|
- There are issues in selecting the leftmost argument
|
|
931
929
|
- An offset is necessary to ensure alignment of chunks with forward
|
|
932
930
|
pass
|
|
933
|
-
|
|
931
|
+
|
|
934
932
|
Hence, a separate 'reverse' variant of the function was implemented.
|
|
935
933
|
"""
|
|
936
934
|
cdef uint32 seq_i
|
|
@@ -938,7 +936,7 @@ cdef _chunk_wise_reverse_argcummin(int64[:] values, uint32 chunk_size):
|
|
|
938
936
|
cdef uint32 current_min_i = 0
|
|
939
937
|
cdef int64 current_min, current_val
|
|
940
938
|
cdef uint32[:] min_pos = np.empty(values.shape[0], dtype=np.uint32)
|
|
941
|
-
|
|
939
|
+
|
|
942
940
|
current_min = MAX_INT_64
|
|
943
941
|
for seq_i in reversed(range(values.shape[0])):
|
|
944
942
|
# The chunk beginning is a small difference to forward
|
|
@@ -952,5 +950,5 @@ cdef _chunk_wise_reverse_argcummin(int64[:] values, uint32 chunk_size):
|
|
|
952
950
|
current_min_i = seq_i
|
|
953
951
|
current_min = current_val
|
|
954
952
|
min_pos[seq_i] = current_min_i
|
|
955
|
-
|
|
953
|
+
|
|
956
954
|
return min_pos
|
|
@@ -7,8 +7,8 @@ __author__ = "Patrick Kunzmann"
|
|
|
7
7
|
__all__ = ["EValueEstimator"]
|
|
8
8
|
|
|
9
9
|
import numpy as np
|
|
10
|
-
from
|
|
11
|
-
from .
|
|
10
|
+
from biotite.sequence.align.pairwise import align_optimal
|
|
11
|
+
from biotite.sequence.seqtypes import GeneralSequence
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
class EValueEstimator:
|
|
@@ -29,7 +29,7 @@ class EValueEstimator:
|
|
|
29
29
|
of random sequence alignments in :meth:`from_samples()`
|
|
30
30
|
:footcite:`Altschul1986`, which may be time consuming.
|
|
31
31
|
If these parameters are known, the constructor can be used instead.
|
|
32
|
-
|
|
32
|
+
|
|
33
33
|
Based on the sampled parameters, the decadic logarithm of the
|
|
34
34
|
E-value can be quickly calculated via :meth:`log_evalue()`.
|
|
35
35
|
|
|
@@ -39,7 +39,7 @@ class EValueEstimator:
|
|
|
39
39
|
The :math:`\lambda` parameter.
|
|
40
40
|
k : float
|
|
41
41
|
The :math:`K` parameter.
|
|
42
|
-
|
|
42
|
+
|
|
43
43
|
Notes
|
|
44
44
|
-----
|
|
45
45
|
The calculated E-value is a rough estimation that gets more
|
|
@@ -102,8 +102,9 @@ class EValueEstimator:
|
|
|
102
102
|
self._k = k
|
|
103
103
|
|
|
104
104
|
@staticmethod
|
|
105
|
-
def from_samples(
|
|
106
|
-
|
|
105
|
+
def from_samples(
|
|
106
|
+
alphabet, matrix, gap_penalty, frequencies, sample_length=1000, sample_size=1000
|
|
107
|
+
):
|
|
107
108
|
r"""
|
|
108
109
|
Create an :class:`EValueEstimator` with :math:`\lambda` and
|
|
109
110
|
:math:`K` estimated via sampling alignments of random sequences
|
|
@@ -137,13 +138,13 @@ class EValueEstimator:
|
|
|
137
138
|
The number of sampled sequences.
|
|
138
139
|
The accuracy of the estimated parameters and E-values,
|
|
139
140
|
but also the runtime increases with the sample size.
|
|
140
|
-
|
|
141
|
+
|
|
141
142
|
Returns
|
|
142
143
|
-------
|
|
143
144
|
estimator : EValueEstimator
|
|
144
145
|
A :class:`EValueEstimator` with sampled :math:`\lambda` and
|
|
145
146
|
:math:`K` parameters.
|
|
146
|
-
|
|
147
|
+
|
|
147
148
|
Notes
|
|
148
149
|
-----
|
|
149
150
|
The sampling process generates random sequences based on
|
|
@@ -167,15 +168,15 @@ class EValueEstimator:
|
|
|
167
168
|
raise ValueError("A symmetric substitution matrix is required")
|
|
168
169
|
if not matrix.get_alphabet1().extends(alphabet):
|
|
169
170
|
raise ValueError(
|
|
170
|
-
"The substitution matrix is not compatible "
|
|
171
|
-
"with the given alphabet"
|
|
171
|
+
"The substitution matrix is not compatible " "with the given alphabet"
|
|
172
172
|
)
|
|
173
|
-
score_matrix = matrix.score_matrix()[:len(alphabet), :len(alphabet)]
|
|
174
|
-
if
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
173
|
+
score_matrix = matrix.score_matrix()[: len(alphabet), : len(alphabet)]
|
|
174
|
+
if (
|
|
175
|
+
np.sum(
|
|
176
|
+
score_matrix * frequencies[np.newaxis, :] * frequencies[:, np.newaxis]
|
|
177
|
+
)
|
|
178
|
+
>= 0
|
|
179
|
+
):
|
|
179
180
|
raise ValueError(
|
|
180
181
|
"Invalid substitution matrix, the expected similarity "
|
|
181
182
|
"score between two random symbols is not negative"
|
|
@@ -183,9 +184,7 @@ class EValueEstimator:
|
|
|
183
184
|
|
|
184
185
|
# Generate the sequence code for the random sequences
|
|
185
186
|
random_sequence_code = np.random.choice(
|
|
186
|
-
len(alphabet),
|
|
187
|
-
size=(sample_size, 2, sample_length),
|
|
188
|
-
p=frequencies
|
|
187
|
+
len(alphabet), size=(sample_size, 2, sample_length), p=frequencies
|
|
189
188
|
)
|
|
190
189
|
|
|
191
190
|
# Sample the alignments of random sequences
|
|
@@ -193,28 +192,27 @@ class EValueEstimator:
|
|
|
193
192
|
for i in range(sample_size):
|
|
194
193
|
seq1 = GeneralSequence(alphabet)
|
|
195
194
|
seq2 = GeneralSequence(alphabet)
|
|
196
|
-
seq1.code = random_sequence_code[i,0]
|
|
197
|
-
seq2.code = random_sequence_code[i,1]
|
|
195
|
+
seq1.code = random_sequence_code[i, 0]
|
|
196
|
+
seq2.code = random_sequence_code[i, 1]
|
|
198
197
|
sample_scores[i] = align_optimal(
|
|
199
|
-
seq1, seq2, matrix,
|
|
200
|
-
local=True, gap_penalty=gap_penalty, max_number=1
|
|
198
|
+
seq1, seq2, matrix, local=True, gap_penalty=gap_penalty, max_number=1
|
|
201
199
|
)[0].score
|
|
202
|
-
|
|
200
|
+
|
|
203
201
|
# Use method of moments to estimate parameters
|
|
204
202
|
lam = np.pi / np.sqrt(6 * np.var(sample_scores))
|
|
205
203
|
u = np.mean(sample_scores) - np.euler_gamma / lam
|
|
206
204
|
k = np.exp(lam * u) / sample_length**2
|
|
207
|
-
|
|
205
|
+
|
|
208
206
|
return EValueEstimator(lam, k)
|
|
209
207
|
|
|
210
208
|
@property
|
|
211
209
|
def lam(self):
|
|
212
210
|
return self._lam
|
|
213
|
-
|
|
211
|
+
|
|
214
212
|
@property
|
|
215
213
|
def k(self):
|
|
216
214
|
return self._k
|
|
217
|
-
|
|
215
|
+
|
|
218
216
|
def log_evalue(self, score, seq1_length, seq2_length):
|
|
219
217
|
r"""
|
|
220
218
|
Calculate the decadic logarithm of the E-value for a given
|
|
@@ -223,11 +221,11 @@ class EValueEstimator:
|
|
|
223
221
|
The E-value and the logarithm of the E-value is calculated as
|
|
224
222
|
|
|
225
223
|
.. math::
|
|
226
|
-
|
|
224
|
+
|
|
227
225
|
E = Kmn e^{-\lambda s}
|
|
228
226
|
|
|
229
227
|
\log_{10} E = (\log_{10} Kmn) - \frac{\lambda s}{\ln 10},
|
|
230
|
-
|
|
228
|
+
|
|
231
229
|
where :math:`s` is the similarity score and :math:`m` and
|
|
232
230
|
:math:`n` are the lengths of the aligned sequences.
|
|
233
231
|
|
|
@@ -245,12 +243,12 @@ class EValueEstimator:
|
|
|
245
243
|
this is usually either the combined length of all sequences
|
|
246
244
|
in the database or the length of the hit sequence multiplied
|
|
247
245
|
by the number of sequences in the database.
|
|
248
|
-
|
|
246
|
+
|
|
249
247
|
Returns
|
|
250
248
|
-------
|
|
251
249
|
log_e : float
|
|
252
250
|
The decadic logarithm of the E-value.
|
|
253
|
-
|
|
251
|
+
|
|
254
252
|
Notes
|
|
255
253
|
-----
|
|
256
254
|
This method returns the logarithm of the E-value instead of
|
|
@@ -261,5 +259,6 @@ class EValueEstimator:
|
|
|
261
259
|
seq1_length = np.asarray(seq1_length)
|
|
262
260
|
seq2_length = np.asarray(seq2_length)
|
|
263
261
|
|
|
264
|
-
return np.log10(
|
|
265
|
-
|
|
262
|
+
return np.log10(
|
|
263
|
+
self._k * seq1_length * seq2_length
|
|
264
|
+
) - self._lam * score / np.log(10)
|
|
Binary file
|