biotite 0.41.1__cp312-cp312-win_amd64.whl → 1.0.0__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +2 -3
- biotite/application/__init__.py +36 -10
- biotite/application/application.py +22 -11
- biotite/application/autodock/__init__.py +1 -1
- biotite/application/autodock/app.py +74 -79
- biotite/application/blast/__init__.py +1 -1
- biotite/application/blast/alignment.py +19 -10
- biotite/application/blast/webapp.py +92 -85
- biotite/application/clustalo/__init__.py +1 -1
- biotite/application/clustalo/app.py +46 -61
- biotite/application/dssp/__init__.py +1 -1
- biotite/application/dssp/app.py +8 -11
- biotite/application/localapp.py +62 -60
- biotite/application/mafft/__init__.py +1 -1
- biotite/application/mafft/app.py +16 -22
- biotite/application/msaapp.py +78 -89
- biotite/application/muscle/__init__.py +1 -1
- biotite/application/muscle/app3.py +50 -64
- biotite/application/muscle/app5.py +23 -31
- biotite/application/sra/__init__.py +1 -1
- biotite/application/sra/app.py +64 -68
- biotite/application/tantan/__init__.py +1 -1
- biotite/application/tantan/app.py +22 -45
- biotite/application/util.py +7 -9
- biotite/application/viennarna/rnaalifold.py +34 -28
- biotite/application/viennarna/rnafold.py +24 -39
- biotite/application/viennarna/rnaplot.py +36 -21
- biotite/application/viennarna/util.py +17 -12
- biotite/application/webapp.py +13 -14
- biotite/copyable.py +13 -13
- biotite/database/__init__.py +1 -1
- biotite/database/entrez/__init__.py +1 -1
- biotite/database/entrez/check.py +2 -3
- biotite/database/entrez/dbnames.py +7 -5
- biotite/database/entrez/download.py +55 -49
- biotite/database/entrez/key.py +1 -1
- biotite/database/entrez/query.py +62 -23
- biotite/database/error.py +2 -1
- biotite/database/pubchem/__init__.py +1 -1
- biotite/database/pubchem/download.py +43 -45
- biotite/database/pubchem/error.py +2 -2
- biotite/database/pubchem/query.py +34 -31
- biotite/database/pubchem/throttle.py +3 -4
- biotite/database/rcsb/__init__.py +1 -1
- biotite/database/rcsb/download.py +44 -52
- biotite/database/rcsb/query.py +85 -80
- biotite/database/uniprot/check.py +6 -3
- biotite/database/uniprot/download.py +6 -11
- biotite/database/uniprot/query.py +115 -31
- biotite/file.py +12 -31
- biotite/sequence/__init__.py +16 -5
- biotite/sequence/align/__init__.py +160 -6
- biotite/sequence/align/alignment.py +99 -90
- biotite/sequence/align/banded.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/buckets.py +12 -10
- biotite/sequence/align/cigar.py +43 -52
- biotite/sequence/align/kmeralphabet.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/kmeralphabet.pyx +55 -51
- biotite/sequence/align/kmersimilarity.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.pyx +3 -2
- biotite/sequence/align/localgapped.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/localungapped.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/matrix.py +81 -82
- biotite/sequence/align/multiple.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/multiple.pyx +35 -35
- biotite/sequence/align/pairwise.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.pyx +12 -4
- biotite/sequence/align/selector.cp312-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.pyx +52 -54
- biotite/sequence/align/statistics.py +32 -33
- biotite/sequence/align/tracetable.cp312-win_amd64.pyd +0 -0
- biotite/sequence/alphabet.py +112 -126
- biotite/sequence/annotation.py +78 -77
- biotite/sequence/codec.cp312-win_amd64.pyd +0 -0
- biotite/sequence/codon.py +90 -79
- biotite/sequence/graphics/__init__.py +1 -1
- biotite/sequence/graphics/alignment.py +184 -103
- biotite/sequence/graphics/colorschemes.py +10 -12
- biotite/sequence/graphics/dendrogram.py +79 -34
- biotite/sequence/graphics/features.py +133 -99
- biotite/sequence/graphics/logo.py +22 -28
- biotite/sequence/graphics/plasmid.py +229 -178
- biotite/sequence/io/fasta/__init__.py +1 -1
- biotite/sequence/io/fasta/convert.py +44 -33
- biotite/sequence/io/fasta/file.py +42 -55
- biotite/sequence/io/fastq/__init__.py +1 -1
- biotite/sequence/io/fastq/convert.py +11 -14
- biotite/sequence/io/fastq/file.py +68 -112
- biotite/sequence/io/genbank/__init__.py +2 -2
- biotite/sequence/io/genbank/annotation.py +12 -20
- biotite/sequence/io/genbank/file.py +74 -76
- biotite/sequence/io/genbank/metadata.py +74 -62
- biotite/sequence/io/genbank/sequence.py +13 -14
- biotite/sequence/io/general.py +39 -30
- biotite/sequence/io/gff/__init__.py +2 -2
- biotite/sequence/io/gff/convert.py +10 -15
- biotite/sequence/io/gff/file.py +81 -65
- biotite/sequence/phylo/__init__.py +1 -1
- biotite/sequence/phylo/nj.cp312-win_amd64.pyd +0 -0
- biotite/sequence/phylo/tree.cp312-win_amd64.pyd +0 -0
- biotite/sequence/phylo/upgma.cp312-win_amd64.pyd +0 -0
- biotite/sequence/profile.py +57 -28
- biotite/sequence/search.py +17 -15
- biotite/sequence/seqtypes.py +200 -164
- biotite/sequence/sequence.py +64 -64
- biotite/structure/__init__.py +3 -3
- biotite/structure/atoms.py +226 -240
- biotite/structure/basepairs.py +260 -271
- biotite/structure/bonds.cp312-win_amd64.pyd +0 -0
- biotite/structure/bonds.pyx +88 -100
- biotite/structure/box.py +67 -71
- biotite/structure/celllist.cp312-win_amd64.pyd +0 -0
- biotite/structure/chains.py +55 -39
- biotite/structure/charges.cp312-win_amd64.pyd +0 -0
- biotite/structure/compare.py +32 -32
- biotite/structure/density.py +13 -18
- biotite/structure/dotbracket.py +20 -22
- biotite/structure/error.py +10 -2
- biotite/structure/filter.py +82 -77
- biotite/structure/geometry.py +130 -119
- biotite/structure/graphics/atoms.py +60 -43
- biotite/structure/graphics/rna.py +81 -68
- biotite/structure/hbond.py +112 -93
- biotite/structure/info/__init__.py +0 -2
- biotite/structure/info/atoms.py +10 -11
- biotite/structure/info/bonds.py +41 -43
- biotite/structure/info/ccd.py +21 -7
- biotite/structure/info/groups.py +10 -15
- biotite/structure/info/masses.py +5 -10
- biotite/structure/info/misc.py +1 -1
- biotite/structure/info/radii.py +20 -20
- biotite/structure/info/standardize.py +15 -26
- biotite/structure/integrity.py +18 -71
- biotite/structure/io/__init__.py +3 -4
- biotite/structure/io/dcd/__init__.py +1 -1
- biotite/structure/io/dcd/file.py +22 -20
- biotite/structure/io/general.py +47 -61
- biotite/structure/io/gro/__init__.py +1 -1
- biotite/structure/io/gro/file.py +73 -72
- biotite/structure/io/mol/__init__.py +1 -1
- biotite/structure/io/mol/convert.py +8 -11
- biotite/structure/io/mol/ctab.py +37 -36
- biotite/structure/io/mol/header.py +14 -10
- biotite/structure/io/mol/mol.py +9 -53
- biotite/structure/io/mol/sdf.py +47 -50
- biotite/structure/io/netcdf/__init__.py +1 -1
- biotite/structure/io/netcdf/file.py +24 -23
- biotite/structure/io/pdb/__init__.py +1 -1
- biotite/structure/io/pdb/convert.py +32 -20
- biotite/structure/io/pdb/file.py +151 -172
- biotite/structure/io/pdb/hybrid36.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/pdbqt/__init__.py +1 -1
- biotite/structure/io/pdbqt/convert.py +17 -11
- biotite/structure/io/pdbqt/file.py +128 -80
- biotite/structure/io/pdbx/__init__.py +1 -2
- biotite/structure/io/pdbx/bcif.py +36 -52
- biotite/structure/io/pdbx/cif.py +64 -62
- biotite/structure/io/pdbx/component.py +10 -16
- biotite/structure/io/pdbx/convert.py +235 -246
- biotite/structure/io/pdbx/encoding.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/trajfile.py +76 -93
- biotite/structure/io/trr/__init__.py +1 -1
- biotite/structure/io/trr/file.py +12 -15
- biotite/structure/io/xtc/__init__.py +1 -1
- biotite/structure/io/xtc/file.py +11 -14
- biotite/structure/mechanics.py +9 -11
- biotite/structure/molecules.py +3 -4
- biotite/structure/pseudoknots.py +53 -67
- biotite/structure/rdf.py +23 -21
- biotite/structure/repair.py +137 -86
- biotite/structure/residues.py +26 -16
- biotite/structure/sasa.cp312-win_amd64.pyd +0 -0
- biotite/structure/{resutil.py → segments.py} +24 -23
- biotite/structure/sequence.py +10 -11
- biotite/structure/sse.py +100 -119
- biotite/structure/superimpose.py +39 -77
- biotite/structure/transform.py +97 -71
- biotite/structure/util.py +11 -13
- biotite/version.py +2 -2
- biotite/visualize.py +69 -55
- {biotite-0.41.1.dist-info → biotite-1.0.0.dist-info}/METADATA +6 -6
- biotite-1.0.0.dist-info/RECORD +322 -0
- {biotite-0.41.1.dist-info → biotite-1.0.0.dist-info}/WHEEL +1 -1
- biotite/structure/io/ctab.py +0 -72
- biotite/structure/io/mmtf/__init__.py +0 -21
- biotite/structure/io/mmtf/assembly.py +0 -214
- biotite/structure/io/mmtf/convertarray.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/convertarray.pyx +0 -341
- biotite/structure/io/mmtf/convertfile.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/convertfile.pyx +0 -501
- biotite/structure/io/mmtf/decode.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/decode.pyx +0 -152
- biotite/structure/io/mmtf/encode.cp312-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/encode.pyx +0 -183
- biotite/structure/io/mmtf/file.py +0 -233
- biotite/structure/io/npz/__init__.py +0 -20
- biotite/structure/io/npz/file.py +0 -152
- biotite/structure/io/pdbx/legacy.py +0 -267
- biotite/structure/io/tng/__init__.py +0 -13
- biotite/structure/io/tng/file.py +0 -46
- biotite/temp.py +0 -86
- biotite-0.41.1.dist-info/RECORD +0 -340
- {biotite-0.41.1.dist-info → biotite-1.0.0.dist-info}/licenses/LICENSE.rst +0 -0
biotite/sequence/align/cigar.py
CHANGED
|
@@ -8,13 +8,14 @@ __all__ = ["CigarOp", "read_alignment_from_cigar", "write_alignment_to_cigar"]
|
|
|
8
8
|
|
|
9
9
|
import enum
|
|
10
10
|
import numpy as np
|
|
11
|
-
from .alignment import Alignment, get_codes
|
|
11
|
+
from biotite.sequence.align.alignment import Alignment, get_codes
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
class CigarOp(enum.IntEnum):
|
|
15
15
|
"""
|
|
16
16
|
An enum for the different CIGAR operations.
|
|
17
17
|
"""
|
|
18
|
+
|
|
18
19
|
MATCH = 0
|
|
19
20
|
INSERTION = 1
|
|
20
21
|
DELETION = 2
|
|
@@ -46,23 +47,23 @@ class CigarOp(enum.IntEnum):
|
|
|
46
47
|
def to_cigar_symbol(self):
|
|
47
48
|
return _op_to_str[self]
|
|
48
49
|
|
|
50
|
+
|
|
49
51
|
_str_to_op = {
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
52
|
+
"M": CigarOp.MATCH,
|
|
53
|
+
"I": CigarOp.INSERTION,
|
|
54
|
+
"D": CigarOp.DELETION,
|
|
55
|
+
"N": CigarOp.INTRON,
|
|
56
|
+
"S": CigarOp.SOFT_CLIP,
|
|
57
|
+
"H": CigarOp.HARD_CLIP,
|
|
58
|
+
"P": CigarOp.PADDING,
|
|
59
|
+
"=": CigarOp.EQUAL,
|
|
60
|
+
"X": CigarOp.DIFFERENT,
|
|
61
|
+
"B": CigarOp.BACK,
|
|
62
|
+
}
|
|
61
63
|
_op_to_str = {v: k for k, v in _str_to_op.items()}
|
|
62
64
|
|
|
63
65
|
|
|
64
|
-
def read_alignment_from_cigar(cigar, position,
|
|
65
|
-
reference_sequence, segment_sequence):
|
|
66
|
+
def read_alignment_from_cigar(cigar, position, reference_sequence, segment_sequence):
|
|
66
67
|
"""
|
|
67
68
|
Create an :class:`Alignment` from a CIGAR string.
|
|
68
69
|
|
|
@@ -147,20 +148,16 @@ def read_alignment_from_cigar(cigar, position,
|
|
|
147
148
|
else:
|
|
148
149
|
operations = np.asarray(cigar, dtype=int)
|
|
149
150
|
if operations.ndim != 2:
|
|
150
|
-
raise ValueError(
|
|
151
|
-
"Expected array with shape (n,2)"
|
|
152
|
-
)
|
|
151
|
+
raise ValueError("Expected array with shape (n,2)")
|
|
153
152
|
if operations.shape[1] != 2:
|
|
154
|
-
raise ValueError(
|
|
155
|
-
"Expected (operation, length) pairs"
|
|
156
|
-
)
|
|
153
|
+
raise ValueError("Expected (operation, length) pairs")
|
|
157
154
|
|
|
158
155
|
if len(operations) == 0:
|
|
159
156
|
return Alignment(
|
|
160
157
|
[reference_sequence, segment_sequence], np.zeros((0, 2), dtype=int)
|
|
161
158
|
)
|
|
162
159
|
|
|
163
|
-
trace = np.zeros((np.sum(operations[:,1]), 2), dtype=int)
|
|
160
|
+
trace = np.zeros((np.sum(operations[:, 1]), 2), dtype=int)
|
|
164
161
|
clip_mask = np.ones(trace.shape[0], dtype=bool)
|
|
165
162
|
|
|
166
163
|
i = 0
|
|
@@ -187,19 +184,23 @@ def read_alignment_from_cigar(cigar, position,
|
|
|
187
184
|
elif op == CigarOp.HARD_CLIP:
|
|
188
185
|
clip_mask[i : i + length] = False
|
|
189
186
|
else:
|
|
190
|
-
raise ValueError(
|
|
191
|
-
f"CIGAR operation {op} is not implemented"
|
|
192
|
-
)
|
|
187
|
+
raise ValueError(f"CIGAR operation {op} is not implemented")
|
|
193
188
|
i += length
|
|
194
189
|
# Remove clipped positions
|
|
195
190
|
trace = trace[clip_mask]
|
|
196
191
|
return Alignment([reference_sequence, segment_sequence], trace)
|
|
197
192
|
|
|
198
193
|
|
|
199
|
-
def write_alignment_to_cigar(
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
194
|
+
def write_alignment_to_cigar(
|
|
195
|
+
alignment,
|
|
196
|
+
reference_index=0,
|
|
197
|
+
segment_index=1,
|
|
198
|
+
introns=(),
|
|
199
|
+
distinguish_matches=False,
|
|
200
|
+
hard_clip=False,
|
|
201
|
+
include_terminal_gaps=False,
|
|
202
|
+
as_string=True,
|
|
203
|
+
):
|
|
203
204
|
"""
|
|
204
205
|
Convert an :class:`Alignment` into a CIGAR string.
|
|
205
206
|
|
|
@@ -293,10 +294,10 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
|
|
|
293
294
|
|
|
294
295
|
>>> op_tuples = write_alignment_to_cigar(semiglobal_alignment, as_string=False)
|
|
295
296
|
>>> for op, length in op_tuples:
|
|
296
|
-
... print(CigarOp(op), length)
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
297
|
+
... print(CigarOp(op).name, length)
|
|
298
|
+
MATCH 9
|
|
299
|
+
DELETION 2
|
|
300
|
+
MATCH 12
|
|
300
301
|
"""
|
|
301
302
|
if not include_terminal_gaps:
|
|
302
303
|
alignment = _remove_terminal_segment_gaps(alignment, segment_index)
|
|
@@ -305,8 +306,8 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
|
|
|
305
306
|
seg_trace = alignment.trace[:, segment_index]
|
|
306
307
|
operations = np.full(alignment.trace.shape[0], CigarOp.MATCH, dtype=int)
|
|
307
308
|
|
|
308
|
-
insertion_mask =
|
|
309
|
-
deletion_mask =
|
|
309
|
+
insertion_mask = ref_trace == -1
|
|
310
|
+
deletion_mask = seg_trace == -1
|
|
310
311
|
if np.any(insertion_mask & deletion_mask):
|
|
311
312
|
raise ValueError(
|
|
312
313
|
"Alignment contains insertion and deletion at the same position"
|
|
@@ -318,35 +319,27 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
|
|
|
318
319
|
intron_mask = np.zeros(operations.shape[0], dtype=bool)
|
|
319
320
|
for start, stop in introns:
|
|
320
321
|
if start >= stop:
|
|
321
|
-
raise ValueError(
|
|
322
|
-
"Intron start must be smaller than intron stop"
|
|
323
|
-
)
|
|
322
|
+
raise ValueError("Intron start must be smaller than intron stop")
|
|
324
323
|
if start < 0:
|
|
325
|
-
raise ValueError(
|
|
326
|
-
"Intron start must not be negative"
|
|
327
|
-
)
|
|
324
|
+
raise ValueError("Intron start must not be negative")
|
|
328
325
|
intron_mask[(ref_trace >= start) & (ref_trace < stop)] = True
|
|
329
326
|
if np.any(intron_mask & ~deletion_mask):
|
|
330
|
-
raise ValueError(
|
|
331
|
-
"Introns must be within gaps in the reference sequence"
|
|
332
|
-
)
|
|
327
|
+
raise ValueError("Introns must be within gaps in the reference sequence")
|
|
333
328
|
operations[intron_mask] = CigarOp.INTRON
|
|
334
329
|
|
|
335
330
|
if distinguish_matches:
|
|
336
331
|
symbol_codes = get_codes(alignment)
|
|
337
332
|
ref_codes = symbol_codes[reference_index, :]
|
|
338
333
|
seg_codes = symbol_codes[segment_index, :]
|
|
339
|
-
equal_mask =
|
|
340
|
-
match_mask =
|
|
334
|
+
equal_mask = ref_codes == seg_codes
|
|
335
|
+
match_mask = operations == CigarOp.MATCH
|
|
341
336
|
operations[equal_mask & match_mask] = CigarOp.EQUAL
|
|
342
337
|
operations[~equal_mask & match_mask] = CigarOp.DIFFERENT
|
|
343
338
|
|
|
344
339
|
op_tuples = _aggregate_consecutive(operations)
|
|
345
340
|
|
|
346
341
|
clip_op = CigarOp.HARD_CLIP if hard_clip else CigarOp.SOFT_CLIP
|
|
347
|
-
start_clip_length, end_clip_length = _find_clipped_bases(
|
|
348
|
-
alignment, segment_index
|
|
349
|
-
)
|
|
342
|
+
start_clip_length, end_clip_length = _find_clipped_bases(alignment, segment_index)
|
|
350
343
|
if start_clip_length != 0:
|
|
351
344
|
start_clip = [(clip_op, start_clip_length)]
|
|
352
345
|
else:
|
|
@@ -386,9 +379,7 @@ def _find_clipped_bases(alignment, segment_index):
|
|
|
386
379
|
# all previous bases are clipped...
|
|
387
380
|
start_clip_length = seg_trace[0]
|
|
388
381
|
# ...and the same applies for the last base
|
|
389
|
-
end_clip_length = (
|
|
390
|
-
len(alignment.sequences[segment_index]) - seg_trace[-1] - 1
|
|
391
|
-
)
|
|
382
|
+
end_clip_length = len(alignment.sequences[segment_index]) - seg_trace[-1] - 1
|
|
392
383
|
return start_clip_length, end_clip_length
|
|
393
384
|
|
|
394
385
|
|
|
@@ -431,4 +422,4 @@ def _op_tuples_from_cigar(cigar):
|
|
|
431
422
|
op = CigarOp.from_cigar_symbol(char)
|
|
432
423
|
op_tuples.append((op, count))
|
|
433
424
|
count = ""
|
|
434
|
-
return np.array(op_tuples, dtype=int)
|
|
425
|
+
return np.array(op_tuples, dtype=int)
|
|
Binary file
|
|
@@ -33,7 +33,7 @@ class KmerAlphabet(Alphabet):
|
|
|
33
33
|
|
|
34
34
|
This type of alphabet uses *k-mers* as symbols, i.e. all
|
|
35
35
|
combinations of *k* symbols from its *base alphabet*.
|
|
36
|
-
|
|
36
|
+
|
|
37
37
|
Its primary use is its :meth:`create_kmers()` method, that iterates
|
|
38
38
|
over all overlapping *k-mers* in a :class:`Sequence` and encodes
|
|
39
39
|
each one into its corresponding *k-mer* symbol code
|
|
@@ -68,7 +68,7 @@ class KmerAlphabet(Alphabet):
|
|
|
68
68
|
integers, that indicate the *informative* positions.
|
|
69
69
|
For a continuous *k-mer* the `spacing` would be
|
|
70
70
|
``[0, 1, 2,...]``.
|
|
71
|
-
|
|
71
|
+
|
|
72
72
|
Attributes
|
|
73
73
|
----------
|
|
74
74
|
base_alphabet : Alphabet
|
|
@@ -79,7 +79,7 @@ class KmerAlphabet(Alphabet):
|
|
|
79
79
|
spacing : None or ndarray, dtype=int
|
|
80
80
|
The *k-mer* model in array form, if spaced *k-mers* are used,
|
|
81
81
|
``None`` otherwise.
|
|
82
|
-
|
|
82
|
+
|
|
83
83
|
Notes
|
|
84
84
|
-----
|
|
85
85
|
The symbol code for a *k-mer* :math:`s` calculates as
|
|
@@ -94,7 +94,7 @@ class KmerAlphabet(Alphabet):
|
|
|
94
94
|
|
|
95
95
|
References
|
|
96
96
|
----------
|
|
97
|
-
|
|
97
|
+
|
|
98
98
|
.. footbibliography::
|
|
99
99
|
|
|
100
100
|
Examples
|
|
@@ -103,11 +103,11 @@ class KmerAlphabet(Alphabet):
|
|
|
103
103
|
|
|
104
104
|
>>> base_alphabet = NucleotideSequence.unambiguous_alphabet()
|
|
105
105
|
>>> print(base_alphabet.get_symbols())
|
|
106
|
-
|
|
106
|
+
('A', 'C', 'G', 'T')
|
|
107
107
|
>>> kmer_alphabet = KmerAlphabet(base_alphabet, 2)
|
|
108
108
|
>>> print(kmer_alphabet.get_symbols())
|
|
109
|
-
|
|
110
|
-
|
|
109
|
+
('AA', 'AC', 'AG', 'AT', 'CA', 'CC', 'CG', 'CT', 'GA', 'GC', 'GG', 'GT', 'TA', 'TC', 'TG', 'TT')
|
|
110
|
+
|
|
111
111
|
Encode and decode *k-mers*:
|
|
112
112
|
|
|
113
113
|
>>> print(kmer_alphabet.encode("TC"))
|
|
@@ -127,7 +127,7 @@ class KmerAlphabet(Alphabet):
|
|
|
127
127
|
[3 1]
|
|
128
128
|
|
|
129
129
|
Encode all overlapping continuous k-mers of a sequence:
|
|
130
|
-
|
|
130
|
+
|
|
131
131
|
>>> sequence = NucleotideSequence("ATTGCT")
|
|
132
132
|
>>> kmer_codes = kmer_alphabet.create_kmers(sequence.code)
|
|
133
133
|
>>> print(kmer_codes)
|
|
@@ -146,7 +146,7 @@ class KmerAlphabet(Alphabet):
|
|
|
146
146
|
>>> print([s[0] + s[1] + "_" + s[2] for s in strings])
|
|
147
147
|
['BI_T', 'IQ_I', 'QT_T', 'TI_E']
|
|
148
148
|
"""
|
|
149
|
-
|
|
149
|
+
|
|
150
150
|
def __init__(self, base_alphabet, k, spacing=None):
|
|
151
151
|
if not isinstance(base_alphabet, Alphabet):
|
|
152
152
|
raise TypeError(
|
|
@@ -157,7 +157,7 @@ class KmerAlphabet(Alphabet):
|
|
|
157
157
|
raise ValueError("k must be at least 2")
|
|
158
158
|
self._base_alph = base_alphabet
|
|
159
159
|
self._k = k
|
|
160
|
-
|
|
160
|
+
|
|
161
161
|
base_alph_len = len(self._base_alph)
|
|
162
162
|
self._radix_multiplier = np.array(
|
|
163
163
|
[base_alph_len**n for n in reversed(range(0, self._k))],
|
|
@@ -166,10 +166,10 @@ class KmerAlphabet(Alphabet):
|
|
|
166
166
|
|
|
167
167
|
if spacing is None:
|
|
168
168
|
self._spacing = None
|
|
169
|
-
|
|
169
|
+
|
|
170
170
|
elif isinstance(spacing, str):
|
|
171
171
|
self._spacing = _to_array_form(spacing)
|
|
172
|
-
|
|
172
|
+
|
|
173
173
|
else:
|
|
174
174
|
self._spacing = np.array(spacing, dtype=np.int64)
|
|
175
175
|
self._spacing.sort()
|
|
@@ -181,13 +181,13 @@ class KmerAlphabet(Alphabet):
|
|
|
181
181
|
raise ValueError(
|
|
182
182
|
"Spacing model contains duplicate values"
|
|
183
183
|
)
|
|
184
|
-
|
|
184
|
+
|
|
185
185
|
if spacing is not None and len(self._spacing) != self._k:
|
|
186
186
|
raise ValueError(
|
|
187
187
|
f"Expected {self._k} informative positions, "
|
|
188
188
|
f"but got {len(self._spacing)} positions in spacing"
|
|
189
189
|
)
|
|
190
|
-
|
|
190
|
+
|
|
191
191
|
|
|
192
192
|
@property
|
|
193
193
|
def base_alphabet(self):
|
|
@@ -196,11 +196,11 @@ class KmerAlphabet(Alphabet):
|
|
|
196
196
|
@property
|
|
197
197
|
def k(self):
|
|
198
198
|
return self._k
|
|
199
|
-
|
|
199
|
+
|
|
200
200
|
@property
|
|
201
201
|
def spacing(self):
|
|
202
202
|
return None if self._spacing is None else self._spacing.copy()
|
|
203
|
-
|
|
203
|
+
|
|
204
204
|
|
|
205
205
|
def get_symbols(self):
|
|
206
206
|
"""
|
|
@@ -210,10 +210,10 @@ class KmerAlphabet(Alphabet):
|
|
|
210
210
|
|
|
211
211
|
Returns
|
|
212
212
|
-------
|
|
213
|
-
symbols :
|
|
214
|
-
A
|
|
213
|
+
symbols : tuple
|
|
214
|
+
A tuple of all *k-mer* symbols, i.e. all possible
|
|
215
215
|
combinations of *k* symbols from its *base alphabet*.
|
|
216
|
-
|
|
216
|
+
|
|
217
217
|
Notes
|
|
218
218
|
-----
|
|
219
219
|
In contrast to the base :class:`Alphabet` and
|
|
@@ -224,10 +224,10 @@ class KmerAlphabet(Alphabet):
|
|
|
224
224
|
to be created first.
|
|
225
225
|
"""
|
|
226
226
|
if isinstance(self._base_alph, LetterAlphabet):
|
|
227
|
-
return ["".join(self.decode(code)) for code in range(len(self))]
|
|
227
|
+
return tuple(["".join(self.decode(code)) for code in range(len(self))])
|
|
228
228
|
else:
|
|
229
|
-
return [list(self.decode(code)) for code in range(len(self))]
|
|
230
|
-
|
|
229
|
+
return tuple([list(self.decode(code)) for code in range(len(self))])
|
|
230
|
+
|
|
231
231
|
|
|
232
232
|
def extends(self, alphabet):
|
|
233
233
|
# A KmerAlphabet cannot really extend another KmerAlphabet:
|
|
@@ -237,15 +237,15 @@ class KmerAlphabet(Alphabet):
|
|
|
237
237
|
# A KmerAlphabet can only 'extend' another KmerAlphabet,
|
|
238
238
|
# if the two alphabets are equal
|
|
239
239
|
return alphabet == self
|
|
240
|
-
|
|
240
|
+
|
|
241
241
|
|
|
242
242
|
def encode(self, symbol):
|
|
243
243
|
return self.fuse(self._base_alph.encode_multiple(symbol))
|
|
244
|
-
|
|
244
|
+
|
|
245
245
|
|
|
246
246
|
def decode(self, code):
|
|
247
247
|
return self._base_alph.decode_multiple(self.split(code))
|
|
248
|
-
|
|
248
|
+
|
|
249
249
|
|
|
250
250
|
def fuse(self, codes):
|
|
251
251
|
"""
|
|
@@ -261,7 +261,7 @@ class KmerAlphabet(Alphabet):
|
|
|
261
261
|
----------
|
|
262
262
|
codes : ndarray, dtype=int, shape=(k,) or shape=(n,k)
|
|
263
263
|
The symbol codes from the base alphabet to be fused.
|
|
264
|
-
|
|
264
|
+
|
|
265
265
|
Returns
|
|
266
266
|
-------
|
|
267
267
|
kmer_codes : int or ndarray, dtype=np.int64, shape=(n,)
|
|
@@ -292,13 +292,13 @@ class KmerAlphabet(Alphabet):
|
|
|
292
292
|
)
|
|
293
293
|
if np.any(codes > len(self._base_alph)):
|
|
294
294
|
raise AlphabetError("Given k-mer(s) contains invalid symbol code")
|
|
295
|
-
|
|
295
|
+
|
|
296
296
|
orig_shape = codes.shape
|
|
297
297
|
codes = np.atleast_2d(codes)
|
|
298
298
|
kmer_code = np.sum(self._radix_multiplier * codes, axis=-1)
|
|
299
299
|
# The last dimension is removed since it collapsed in np.sum
|
|
300
300
|
return kmer_code.reshape(orig_shape[:-1])
|
|
301
|
-
|
|
301
|
+
|
|
302
302
|
def split(self, kmer_code):
|
|
303
303
|
"""
|
|
304
304
|
split(kmer_code)
|
|
@@ -313,7 +313,7 @@ class KmerAlphabet(Alphabet):
|
|
|
313
313
|
----------
|
|
314
314
|
kmer_code : int or ndarray, dtype=int, shape=(n,)
|
|
315
315
|
The *k-mer* code(s).
|
|
316
|
-
|
|
316
|
+
|
|
317
317
|
Returns
|
|
318
318
|
-------
|
|
319
319
|
codes : ndarray, dtype=np.uint64, shape=(k,) or shape=(n,k)
|
|
@@ -341,13 +341,13 @@ class KmerAlphabet(Alphabet):
|
|
|
341
341
|
raise AlphabetError(
|
|
342
342
|
f"Given k-mer symbol code is invalid for this alphabet"
|
|
343
343
|
)
|
|
344
|
-
|
|
344
|
+
|
|
345
345
|
orig_shape = np.shape(kmer_code)
|
|
346
346
|
split_codes = self._split(
|
|
347
347
|
np.atleast_1d(kmer_code).astype(np.int64, copy=False)
|
|
348
348
|
)
|
|
349
349
|
return split_codes.reshape(orig_shape + (self._k,))
|
|
350
|
-
|
|
350
|
+
|
|
351
351
|
@cython.boundscheck(False)
|
|
352
352
|
@cython.wraparound(False)
|
|
353
353
|
@cython.cdivision(True)
|
|
@@ -360,7 +360,7 @@ class KmerAlphabet(Alphabet):
|
|
|
360
360
|
cdef uint64[:,:] split_codes = np.empty(
|
|
361
361
|
(codes.shape[0], self._k), dtype=np.uint64
|
|
362
362
|
)
|
|
363
|
-
|
|
363
|
+
|
|
364
364
|
cdef int k = self._k
|
|
365
365
|
for i in range(codes.shape[0]):
|
|
366
366
|
code = codes[i]
|
|
@@ -369,9 +369,9 @@ class KmerAlphabet(Alphabet):
|
|
|
369
369
|
symbol_code = code // val
|
|
370
370
|
split_codes[i,n] = symbol_code
|
|
371
371
|
code -= symbol_code * val
|
|
372
|
-
|
|
372
|
+
|
|
373
373
|
return np.asarray(split_codes)
|
|
374
|
-
|
|
374
|
+
|
|
375
375
|
|
|
376
376
|
def kmer_array_length(self, int64 length):
|
|
377
377
|
"""
|
|
@@ -385,7 +385,7 @@ class KmerAlphabet(Alphabet):
|
|
|
385
385
|
----------
|
|
386
386
|
length : int
|
|
387
387
|
The length of the hypothetical sequence
|
|
388
|
-
|
|
388
|
+
|
|
389
389
|
Returns
|
|
390
390
|
-------
|
|
391
391
|
kmer_length : int
|
|
@@ -400,7 +400,7 @@ class KmerAlphabet(Alphabet):
|
|
|
400
400
|
spacing = self._spacing
|
|
401
401
|
max_offset = self._spacing[len(spacing)-1] + 1
|
|
402
402
|
return length - max_offset + 1
|
|
403
|
-
|
|
403
|
+
|
|
404
404
|
|
|
405
405
|
def create_kmers(self, seq_code):
|
|
406
406
|
"""
|
|
@@ -418,7 +418,7 @@ class KmerAlphabet(Alphabet):
|
|
|
418
418
|
-------
|
|
419
419
|
kmer_codes : ndarray, dtype=int64
|
|
420
420
|
The symbol codes for the *k-mers*.
|
|
421
|
-
|
|
421
|
+
|
|
422
422
|
Examples
|
|
423
423
|
--------
|
|
424
424
|
|
|
@@ -435,7 +435,7 @@ class KmerAlphabet(Alphabet):
|
|
|
435
435
|
return self._create_continuous_kmers(seq_code)
|
|
436
436
|
else:
|
|
437
437
|
return self._create_spaced_kmers(seq_code)
|
|
438
|
-
|
|
438
|
+
|
|
439
439
|
@cython.boundscheck(False)
|
|
440
440
|
@cython.wraparound(False)
|
|
441
441
|
def _create_continuous_kmers(self, CodeType[:] seq_code not None):
|
|
@@ -460,7 +460,7 @@ class KmerAlphabet(Alphabet):
|
|
|
460
460
|
cdef int64[:] kmers = np.empty(
|
|
461
461
|
self.kmer_array_length(len(seq_code)), dtype=np.int64
|
|
462
462
|
)
|
|
463
|
-
|
|
463
|
+
|
|
464
464
|
cdef CodeType code
|
|
465
465
|
cdef int64 kmer, prev_kmer
|
|
466
466
|
# Compute first k-mer using naive approach
|
|
@@ -471,7 +471,7 @@ class KmerAlphabet(Alphabet):
|
|
|
471
471
|
raise AlphabetError(f"Symbol code {code} is out of range")
|
|
472
472
|
kmer += radix_multiplier[i] * code
|
|
473
473
|
kmers[0] = kmer
|
|
474
|
-
|
|
474
|
+
|
|
475
475
|
# Compute all following k-mers from the previous one
|
|
476
476
|
prev_kmer = kmer
|
|
477
477
|
for i in range(1, kmers.shape[0]):
|
|
@@ -481,7 +481,7 @@ class KmerAlphabet(Alphabet):
|
|
|
481
481
|
kmer = (
|
|
482
482
|
(
|
|
483
483
|
# Remove first symbol
|
|
484
|
-
(prev_kmer - seq_code[i - 1] * end_radix_multiplier)
|
|
484
|
+
(prev_kmer - seq_code[i - 1] * end_radix_multiplier)
|
|
485
485
|
# Shift k-mer to left
|
|
486
486
|
* alphabet_length
|
|
487
487
|
)
|
|
@@ -490,9 +490,9 @@ class KmerAlphabet(Alphabet):
|
|
|
490
490
|
)
|
|
491
491
|
kmers[i] = kmer
|
|
492
492
|
prev_kmer = kmer
|
|
493
|
-
|
|
493
|
+
|
|
494
494
|
return np.asarray(kmers)
|
|
495
|
-
|
|
495
|
+
|
|
496
496
|
@cython.boundscheck(False)
|
|
497
497
|
@cython.wraparound(False)
|
|
498
498
|
def _create_spaced_kmers(self, CodeType[:] seq_code not None):
|
|
@@ -515,7 +515,7 @@ class KmerAlphabet(Alphabet):
|
|
|
515
515
|
cdef int64[:] kmers = np.empty(
|
|
516
516
|
self.kmer_array_length(len(seq_code)), dtype=np.int64
|
|
517
517
|
)
|
|
518
|
-
|
|
518
|
+
|
|
519
519
|
cdef CodeType code
|
|
520
520
|
cdef int64 kmer
|
|
521
521
|
cdef int64 offset
|
|
@@ -528,18 +528,18 @@ class KmerAlphabet(Alphabet):
|
|
|
528
528
|
raise AlphabetError(f"Symbol code {code} is out of range")
|
|
529
529
|
kmer += radix_multiplier[j] * code
|
|
530
530
|
kmers[i] = kmer
|
|
531
|
-
|
|
531
|
+
|
|
532
532
|
return np.asarray(kmers)
|
|
533
|
-
|
|
533
|
+
|
|
534
534
|
|
|
535
535
|
def __str__(self):
|
|
536
536
|
return str(self.get_symbols())
|
|
537
|
-
|
|
537
|
+
|
|
538
538
|
|
|
539
539
|
def __repr__(self):
|
|
540
540
|
return f"KmerAlphabet({repr(self._base_alph)}, " \
|
|
541
541
|
f"{self._k}, {repr(self._spacing)})"
|
|
542
|
-
|
|
542
|
+
|
|
543
543
|
|
|
544
544
|
def __eq__(self, item):
|
|
545
545
|
if item is self:
|
|
@@ -550,15 +550,19 @@ class KmerAlphabet(Alphabet):
|
|
|
550
550
|
return False
|
|
551
551
|
if self._k != item._k:
|
|
552
552
|
return False
|
|
553
|
-
|
|
553
|
+
|
|
554
554
|
if self._spacing is None:
|
|
555
555
|
if item._spacing is not None:
|
|
556
556
|
return False
|
|
557
557
|
elif np.any(self._spacing != item._spacing):
|
|
558
558
|
return False
|
|
559
|
-
|
|
559
|
+
|
|
560
560
|
return True
|
|
561
|
-
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
def __hash__(self):
|
|
564
|
+
return hash((self._base_alph, self._k, tuple(self._spacing.tolist())))
|
|
565
|
+
|
|
562
566
|
|
|
563
567
|
def __len__(self):
|
|
564
568
|
return int(len(self._base_alph) ** self._k)
|
|
Binary file
|
|
Binary file
|
|
@@ -1352,7 +1352,8 @@ cdef class KmerTable:
|
|
|
1352
1352
|
|
|
1353
1353
|
|
|
1354
1354
|
def __iter__(self):
|
|
1355
|
-
|
|
1355
|
+
for kmer in self.get_kmers():
|
|
1356
|
+
yield kmer.item()
|
|
1356
1357
|
|
|
1357
1358
|
|
|
1358
1359
|
def __reversed__(self):
|
|
@@ -3394,7 +3395,7 @@ def _to_string(table):
|
|
|
3394
3395
|
else:
|
|
3395
3396
|
symbols = str(tuple(symbols))
|
|
3396
3397
|
line = symbols + ": " + ", ".join(
|
|
3397
|
-
[str(
|
|
3398
|
+
[str((ref_id.item(), pos.item())) for ref_id, pos in table[kmer]]
|
|
3398
3399
|
)
|
|
3399
3400
|
lines.append(line)
|
|
3400
3401
|
return "\n".join(lines)
|
|
Binary file
|
|
Binary file
|