panxpress 0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- panxpress/cuckoo_filter_utils.py +207 -0
- panxpress/dnaencode.py +277 -0
- panxpress/dnaencode_fast.py +156 -0
- panxpress/fastcash_info.py +156 -0
- panxpress/fastcash_main.py +271 -0
- panxpress/fastcash_weak_ptr.py +489 -0
- panxpress/hash_new.py +742 -0
- panxpress/hashfunctions.py +221 -0
- panxpress/io/binaryio.py +62 -0
- panxpress/io/fastaio.py +461 -0
- panxpress/io/fastqio.py +596 -0
- panxpress/io/filterio.py +112 -0
- panxpress/io/generaldsio.py +46 -0
- panxpress/io/generalio.py +252 -0
- panxpress/io/hashio.py +514 -0
- panxpress/io/seqio.py +200 -0
- panxpress/io/textio.py +21 -0
- panxpress/io/xorio.py +94 -0
- panxpress/kmers.py +474 -0
- panxpress/lowlevel/aligned_arrays.py +42 -0
- panxpress/lowlevel/bitarray.py +228 -0
- panxpress/lowlevel/conpro.py +504 -0
- panxpress/lowlevel/debug.py +97 -0
- panxpress/lowlevel/intbitarray.py +252 -0
- panxpress/lowlevel/libc.py +174 -0
- panxpress/lowlevel/llvm.py +638 -0
- panxpress/lowlevel/lowlevelfunctions.txt +1 -0
- panxpress/lowlevel/numbautils.py +25 -0
- panxpress/lowlevel/packedarray.py +186 -0
- panxpress/mask.py +69 -0
- panxpress/mathutils.py +296 -0
- panxpress/panxpress/config/index.yaml +7 -0
- panxpress/panxpress/panxpress_build_reference.py +475 -0
- panxpress/panxpress/panxpress_correct_gff.py +1342 -0
- panxpress/panxpress/panxpress_index.py +286 -0
- panxpress/panxpress/panxpress_main.py +308 -0
- panxpress/panxpress/panxpress_map_parallel.py +480 -0
- panxpress/parameters.py +63 -0
- panxpress/srhash.py +594 -0
- panxpress/subtable_hashfunctions.py +395 -0
- panxpress/values/panxpress.py +87 -0
- panxpress-0.2.dist-info/METADATA +374 -0
- panxpress-0.2.dist-info/RECORD +47 -0
- panxpress-0.2.dist-info/WHEEL +5 -0
- panxpress-0.2.dist-info/entry_points.txt +3 -0
- panxpress-0.2.dist-info/licenses/LICENSE +21 -0
- panxpress-0.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Utilities, pointer and bit magic for cuckoo filters
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from math import log2
|
|
6
|
+
from numba import njit, uint8, uint16, uint32, uint64
|
|
7
|
+
import numba as nb
|
|
8
|
+
import numpy as np
|
|
9
|
+
from llvmlite import ir
|
|
10
|
+
from .mathutils import bitsfor
|
|
11
|
+
from .lowlevel.llvm import compile_cttz
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def compute_masks(bits_per_slot, slots):
    """
    Build the two repeating bit patterns used for slot-wise zero tests.

    The pattern of one slot is `bits_per_slot` bits wide and is repeated
    `slots` times.

    Returns a pair (low, high) of uint64 values:
    `low` has the lowest bit of every slot set,
    `high` has the highest bit of every slot set.
    """
    low_pattern = '1'.rjust(bits_per_slot, '0')   # 0...01
    high_pattern = '1'.ljust(bits_per_slot, '0')  # 10...0
    low = int(low_pattern * slots, 2)
    high = int(high_pattern * slots, 2)
    return uint64(low), uint64(high)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def compute_fp_mask(bits_per_slot, fingerprint_bits, slots):
    """
    Return a uint64 mask that carries, for every slot index s in
    1 .. slots-1, the value s shifted left by
    s * bits_per_slot + fingerprint_bits.

    Slot 0 contributes nothing (its index is 0).
    """
    mask = uint64(0)
    for slot in range(1, slots):
        shift = slot * bits_per_slot + fingerprint_bits
        mask |= (slot << shift)
    return uint64(mask)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def compute_choice_fp_mask(bits_per_slot, fingerprint_bits, slots):
    """
    Return a uint64 mask covering 2 * slots slots.

    For every slot index s in 0 .. slots-1 the mask contains
    - s at bit offset s * bits_per_slot + fingerprint_bits,
    - a single 1 bit at that offset plus bitsfor(slots),
    - s at bit offset (slots + s) * bits_per_slot + fingerprint_bits
      (the corresponding slot of the second half, without the extra bit).
    """
    window_bits = bitsfor(slots)
    mask = uint64(0)
    for slot in range(0, slots):
        first_offset = slot * bits_per_slot + fingerprint_bits
        mask |= (slot << first_offset)
        mask |= 1 << (first_offset + window_bits)
    for slot in range(0, slots):
        second_offset = (slots + slot) * bits_per_slot + fingerprint_bits
        mask |= (slot << second_offset)
    return uint64(mask)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def compute_choice_mask(bits_per_slot, fingerprint_bits, slots):
    """
    Return a uint64 mask with one bit set per slot s in 0 .. slots-1,
    located at bit offset s * bits_per_slot + fingerprint_bits.
    """
    mask = uint64(0)
    for slot in range(0, slots):
        bit_position = slot * bits_per_slot + fingerprint_bits
        mask |= 1 << bit_position
    return uint64(mask)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def compile_lookup(bits_per_slot, slots, windowed=True, choice=False):
    """
    Compile and return hasvalue(window, fp), a jitted test whether the
    fingerprint `fp` occurs in one of the slots packed into `window`.

    bits_per_slot: width of one slot in bits.
    slots: number of slots per bucket.
    windowed: if True, int(log2(slots)) + 1 bits of each slot are not
        fingerprint bits and an index mask is ORed into the comparison
        pattern; if False only 1 bit per slot is reserved.
    choice: if True, the window holds 2 * slots slots and a choice-aware
        mask is used.

    Raises AssertionError if the slots do not fit into 64 bits.
    """
    # number of bits of a slot that hold the fingerprint itself
    fingerprint_bits = uint64(bits_per_slot - int(log2(slots)) - 1 if windowed else bits_per_slot - 1)
    if choice:
        # two buckets are compared at once, so both must fit into one word
        assert bits_per_slot * slots * 2 <= 64
        m1, m2 = compute_masks(bits_per_slot, 2 * slots)
        if windowed:
            fp_mask = compute_choice_fp_mask(bits_per_slot, fingerprint_bits, slots)
        else:
            fp_mask = compute_choice_mask(bits_per_slot, fingerprint_bits, slots)
    else:
        assert bits_per_slot * slots <= 64
        m1, m2 = compute_masks(bits_per_slot, slots)
        fp_mask = compute_fp_mask(bits_per_slot, fingerprint_bits, slots) if windowed else uint64(0)

    @njit(nogil=True)
    def haszero(x):
        # "has a zero field" bit trick: the result is non-zero
        # if some slot of x is entirely zero
        return uint64(((x) - m1) & (~(x)) & m2)

    @njit(nogil=True)
    def hasvalue(window, fp):
        # m1 * fp copies fp into every slot; after the XOR a slot that
        # equals (fp | its part of fp_mask) becomes zero
        x = haszero(window ^ ((m1 * fp) | fp_mask))
        return x != 0

    return hasvalue
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def compile_get_empty_slot(bits_per_slot, slots):
    """
    Compile and return get_empty_slot(window) for windows that pack
    `slots` slots of `bits_per_slot` bits each.

    get_empty_slot returns, as uint64, the number of trailing zero bits
    of haszero(window) integer-divided by bits_per_slot.

    Raises AssertionError if the slots do not fit into 64 bits.
    """
    assert bits_per_slot * slots <= 64
    count_trailing_zeros = compile_cttz('uint64')
    m1, m2 = compute_masks(bits_per_slot, slots)

    @njit(nogil=True)
    def haszero(x):
        # "has a zero field" bit trick: flags the top bit of zero slots;
        # the lowest flagged slot is a true zero slot, higher flags may be
        # caused by borrow propagation of the subtraction
        return (((x) - m1) & (~(x)) & m2)

    @njit(nogil=True, locals=dict(x=uint64, slot=uint64))
    def get_empty_slot(window):
        x = haszero(window)
        # NOTE(review): if no slot is zero, x is 0 and the result depends on
        # what count_trailing_zeros returns for 0 -- confirm with callers
        slot = count_trailing_zeros(x) // bits_per_slot
        return slot

    return get_empty_slot
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def compile_load_value(nbits):
    """
    Compile and return a jitted function get_value(array, pos) that reads
    an nbits-wide unsigned value from the memory of `array`.

    `pos` is a bit position; the byte address is array.ctypes.data + (pos >> 3).

    - nbits in {8, 16, 32, 64}: a single load of that width at the byte
      address; the low three bits of `pos` are ignored.
    - any other 0 < nbits <= 64: a 64- or 128-bit load at the byte address,
      shifted right by (pos & 7) and masked to nbits bits; result is uint64.
    - otherwise: returns None instead of a function.
    """
    # fast version for multiples of 8 (cast byte counter)
    if nbits in [8, 16, 32, 64]:
        signatures = {8: uint8, 16: uint16, 32: uint32, 64: uint64}
        pointer = ir.IntType(nbits).as_pointer()
        int_type = signatures[nbits]

        @nb.extending.intrinsic
        def address_to_value(typingctx, src):
            """ returns the value stored at a given memory address """
            sig = int_type(src)

            def codegen(cgctx, builder, sig, args):
                # reinterpret the integer address as a pointer and load from it
                ptr = builder.inttoptr(args[0], pointer)
                return builder.load(ptr)
            return sig, codegen

        @njit(nogil=True, locals=dict(address=uint64))
        def get_value(fltr, start):
            address = fltr.ctypes.data + (uint64(start) >> 3)
            return address_to_value(address)

        return get_value

    # slow version if nbits is not a multiple of 8
    elif 0 < nbits <= 64:
        # nbits rounded up to whole bytes, plus one byte for a bit offset of up to 7
        padded_nbits = ((int(nbits) + 7) & (-8)) + 8
        int_type = uint64
        mask = ir.Constant(ir.IntType(64), int(nbits * '1', 2))
        finalcast = ir.IntType(64)
        if padded_nbits <= 64:
            cast = ir.IntType(64)
            pointer = ir.IntType(64).as_pointer()
        else:
            # value plus offset may span more than 8 bytes: load 128 bits
            cast = ir.IntType(128)
            pointer = ir.IntType(128).as_pointer()

        @nb.extending.intrinsic
        def address_to_value(typingctx, address, offset):
            """ returns the value stored at a given memory address """
            sig = int_type(address, offset)

            def codegen(cgctx, builder, sig, args):
                # NOTE(review): this always loads 8 or 16 bytes from the
                # address, regardless of nbits; presumably the array is
                # padded so that the load stays in bounds -- confirm
                ptr = builder.inttoptr(args[0], pointer)
                value = builder.load(ptr)
                shift = builder.zext(args[1], cast)
                value = builder.lshr(value, shift)
                value = builder.trunc(value, finalcast)
                return builder.and_(value, mask)
            return sig, codegen

        @njit(nogil=True, locals=dict(address=uint64, offset=uint64))
        def get_value(array, pos):
            address = array.ctypes.data + (uint64(pos) >> 3)
            return address_to_value(address, (uint64(pos) & 7))

        return get_value

    # unsupported width: no function is produced
    return None
|
|
146
|
+
|
|
147
|
+
def compile_store_value(nbits):
    """
    Compile and return a jitted function store_value(array, pos, value)
    that writes an nbits-wide unsigned value into the memory of `array`.

    `pos` is a bit position; the byte address is array.ctypes.data + (pos >> 3).

    - nbits in {8, 16, 32, 64}: a single store of that width at the byte
      address; the low three bits of `pos` are ignored.
    - any other 0 < nbits <= 64: read-modify-write of a 64- or 128-bit word
      at the byte address; the nbits bits starting at bit offset (pos & 7)
      are cleared and replaced by `value`. `value` is not masked here, so
      the caller must make sure it fits into nbits bits.

    Raises NotImplementedError for any other nbits.
    """
    # fast version for multiples of 8 (cast byte counter)
    if nbits in [8, 16, 32, 64]:
        pointer = ir.IntType(nbits).as_pointer()
        signatures = {8: uint8, 16: uint16, 32: uint32, 64: uint64}
        int_type = signatures[nbits]

        @nb.extending.intrinsic
        def store_value_at_address(typingctx, address, value):
            """stores the given value at a given memory address"""
            sig = nb.void(nb.types.uintp, int_type)

            def codegen(cgctx, builder, sig, args):
                ptr = builder.inttoptr(args[0], pointer)
                builder.store(args[1], ptr)
            return sig, codegen

        @njit(nogil=True, locals=dict(pos=uint64))
        def store_value(array, pos, value):
            pos = array.ctypes.data + (uint64(pos) >> 3)
            store_value_at_address(pos, value)

        return store_value

    # slow version if nbits is not a multiple of 8
    elif 0 < nbits <= 64:
        # nbits rounded up to whole bytes, plus one byte for a bit offset of up to 7
        padded_nbits = ((int(nbits) + 7) & (-8)) + 8
        if padded_nbits <= 64:
            cast = ir.IntType(64)
            pointer = ir.IntType(64).as_pointer()
            ones = ir.Constant(ir.IntType(64), int(2**64 - 1))
            mask = ir.Constant(ir.IntType(64), 2**int(nbits) - 1)
        elif padded_nbits <= 128:
            cast = ir.IntType(128)
            pointer = ir.IntType(128).as_pointer()
            ones = ir.Constant(ir.IntType(128), 2**128 - 1)
            mask = ir.Constant(ir.IntType(128), 2**int(nbits) - 1)

        @nb.extending.intrinsic
        def store_value_at_address(typingctx, address, value, shift):
            """stores the given value, shifted by `shift` bits, at a given memory address"""
            # one type per intrinsic argument: (address, value, shift)
            sig = nb.void(nb.types.uintp, value, shift)

            def codegen(cgctx, builder, sig, args):
                ptr = builder.inttoptr(args[0], pointer)
                word = builder.load(ptr)
                wide_shift = builder.zext(args[2], cast)
                # clear the nbits target bits of the loaded word
                zero_mask = builder.shl(mask, wide_shift)
                zero_mask = builder.xor(zero_mask, ones)
                word = builder.and_(word, zero_mask)
                # widen BEFORE shifting: shifting in 64 bits would drop the
                # top bits of the value when nbits + shift exceeds 64
                insert = builder.shl(builder.zext(args[1], cast), wide_shift)
                word = builder.or_(word, insert)
                builder.store(word, ptr)
            return sig, codegen

        @njit(nogil=True, locals=dict(address=uint64, shift=uint64, value=uint64))
        def store_value(array, pos, value):
            address = array.ctypes.data + (uint64(pos) >> 3)
            shift = uint64(pos) & 7
            store_value_at_address(address, value, shift)

        return store_value

    raise NotImplementedError("Only storing values <= 64 bits supported")
|
panxpress/dnaencode.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
|
|
2
|
+
import numpy as np
|
|
3
|
+
from numba import njit, uint8, int64, uint64, prange
|
|
4
|
+
|
|
5
|
+
from .mask import Mask
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
U64_MINUSONE = uint64(np.iinfo(np.uint64).max)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# encoding DNA ###############################
|
|
12
|
+
|
|
13
|
+
def _get_table_dna_to_2bits(default=4):
|
|
14
|
+
b = np.full(256, default, dtype=np.uint8)
|
|
15
|
+
b[97] = 0 # a
|
|
16
|
+
b[65] = 0 # A
|
|
17
|
+
b[99] = 1 # c
|
|
18
|
+
b[67] = 1 # C
|
|
19
|
+
b[103] = 2 # g
|
|
20
|
+
b[71] = 2 # G
|
|
21
|
+
b[116] = 3 # t
|
|
22
|
+
b[84] = 3 # T
|
|
23
|
+
b[117] = 3 # u
|
|
24
|
+
b[85] = 3 # U
|
|
25
|
+
return b
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
_TABLE_DNA_TO_2BITS = _get_table_dna_to_2bits()
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@njit(nogil=True, locals=dict(pos=int64))
def _dna_to_2bits(x, table):
    """Replace every element of x, in place, by table[element]."""
    n = x.size
    for pos in range(n):
        byte = x[pos]
        x[pos] = table[byte]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# this is the one to use!
|
|
38
|
+
@njit(nogil=True)
def quick_dna_to_2bits(x):
    """Translate x in place through the module table _TABLE_DNA_TO_2BITS."""
    n = len(x)
    for pos in range(n):
        byte = x[pos]
        x[pos] = _TABLE_DNA_TO_2BITS[byte]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@njit(nogil=True, parallel=True)
def parallel_dna_to_2bits(x):
    """
    Translate x in place through the module table _TABLE_DNA_TO_2BITS,
    using a numba prange loop over all elements.
    """
    n = x.size
    for pos in prange(n):
        byte = x[pos]
        x[pos] = _TABLE_DNA_TO_2BITS[byte]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# numba compile error
|
|
51
|
+
# @njit(nogil=True, locals=dict(seq=uint8[:], table=uint8[:]))
|
|
52
|
+
def dna_to_2bits(seq, table=_TABLE_DNA_TO_2BITS):
    """
    Return a uint8 array with the codes of `seq` according to `table`.

    A `bytes` object is first copied into a bytearray; any other input is
    handed to np.frombuffer directly, so the returned array is a view on
    the buffer of `seq` and the translation overwrites that buffer.
    """
    source = bytearray(seq) if isinstance(seq, bytes) else seq
    codes = np.frombuffer(source, dtype=np.uint8)
    _dna_to_2bits(codes, table)
    return codes
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
_TABLE_BITS_TO_DNASTR = ["A", "C", "G", "T"]


def qcode_to_dnastr(qcode, q, table=_TABLE_BITS_TO_DNASTR):
    """
    Decode the integer q-gram code `qcode` into a string of length q.

    The code is read two bits at a time, most significant pair first,
    and each pair is used as an index into `table`.
    """
    value = int(qcode)
    letters = []
    for shift in range(2 * (q - 1), -1, -2):
        letters.append(table[(value >> shift) & 3])
    return "".join(letters)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@njit(nogil=True, locals=dict(base=uint64))
def write_qcode_to_buffer(qcode, q, buf, start):
    """
    Write the q two-bit digits of `qcode` into buf[start : start + q],
    most significant digit first, one uint8 per digit.
    """
    for offset in range(q):
        shift = 2 * (q - offset - 1)
        base = (qcode >> shift) & 3
        buf[start + offset] = uint8(base)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# no need to njit!
|
|
79
|
+
def _get_table_2bits_to_dna(default=4):
|
|
80
|
+
b = np.full(256, 35, dtype=np.uint8) # fill with b'#'
|
|
81
|
+
b[0] = 65
|
|
82
|
+
b[1] = 67
|
|
83
|
+
b[2] = 71
|
|
84
|
+
b[3] = 84
|
|
85
|
+
b[default] = 78
|
|
86
|
+
return b
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
_TABLE_2BITS_TO_DNA = _get_table_2bits_to_dna()
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
@njit(nogil=True)
def twobits_to_dna_inplace(buf, start=0, end=0):
    """
    Translate buf[start:end] in place through _TABLE_2BITS_TO_DNA.

    start: first index to translate.
    end: one past the last index to translate. A value <= 0 is taken
        relative to the end of the buffer: 0 means "up to len(buf)",
        -k stops k elements before the end.
    """
    if end <= 0:
        # was `len(buf) - end`, which ran past the buffer for negative end
        end = len(buf) + end
    for i in range(start, end):
        buf[i] = _TABLE_2BITS_TO_DNA[buf[i]]
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
# ########## reverse complements and canonical representation ##############
|
|
101
|
+
|
|
102
|
+
@njit(nogil=True,
      locals=dict(left=uint8, right=uint8, n=int64, three=uint8))
def revcomp_inplace(seq):
    """
    Reverse-complement the encoded sequence `seq` in place.

    Elements < 4 are replaced by 3 - element; elements >= 4 are only moved
    to their mirrored position and keep their value.
    """
    n = seq.size
    three = 3
    half = (n + 1) // 2  # includes the middle element for odd n
    for i in range(half):
        j = n - 1 - i
        left = seq[i]
        right = seq[j]
        seq[j] = three - left if left < 4 else left
        seq[i] = three - right if right < 4 else right
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@njit(nogil=True, locals=dict(
    base=uint8, n=int64, three=uint8, rc=uint8[:]))
def revcomp_to_buffer(seq, rc):
    """
    Write the reverse complement of the encoded sequence `seq` into the
    first seq.size elements of the buffer `rc`; `seq` is not modified.

    Elements < 4 are complemented as 3 - element, elements >= 4 are copied.
    """
    n = seq.size
    three = 3
    last = n - 1
    for i in range(n):
        base = seq[last - i]
        rc[i] = three - base if base < 4 else base
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
@njit(nogil=True, locals=dict(out=uint8[:]))
def revcomp(seq):
    """
    Return the reverse complement of the encoded sequence `seq`
    as a newly allocated uint8 array; `seq` is not modified.
    """
    out = np.empty_like(seq, dtype=np.uint8)
    revcomp_to_buffer(seq, out)
    return out
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
@njit(nogil=True, locals=dict(
    code=uint64, three=uint64, result=uint64, comp=uint64))
def revcomp_code(code, q):
    """
    Return the reverse complement of the q-gram code `code` as uint64.

    The q lowest two-bit digits of `code` are consumed from the least
    significant end; each digit d is appended to the result as 3 - d.
    """
    # only works for 0 <= q <= 31 !
    # when using uints, due to a potential bug in numpy/numba,
    # we would have to re-declare code as uint64 locally.
    three = uint64(3)
    result = 0
    for _ in range(q):
        comp = three - (code & three)
        result = (result << 2) | comp
        code >>= 2
    return result
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
@njit(nogil=True, locals=dict(byte=uint64))
def _get_rctable():
    """
    Return a uint64 array of length 256 whose entry b is the reverse
    complement of b interpreted as a 4-gram code (revcomp_code(b, 4)).
    """
    table = np.zeros(256, dtype=np.uint64)
    for byte in range(256):
        table[byte] = revcomp_code(byte, 4)
    return table


_RCTABLE = _get_rctable()
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
@njit(nogil=True, locals=dict(
    code=uint64, result=uint64, part=uint64))
def revcomp_code_table(code, q):
    """
    Return the reverse complement of the q-gram code `code` as uint64,
    using _RCTABLE for four digits (one byte) at a time and single-digit
    steps for the remaining q % 4 digits.
    """
    result = 0
    remaining = q
    # whole bytes: four two-bit digits per table lookup
    while remaining >= 4:
        part = _RCTABLE[code & 255]
        result = (result << 8) | part
        code >>= 8
        remaining -= 4
    # leftover digits, one at a time
    for _ in range(remaining):
        part = 3 - (code & 3)
        result = (result << 2) | part
        code >>= 2
    return result
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
@njit(nogil=True, locals=dict(
    code=int64, rc=int64))
def canonical_code(code, q):
    """
    Return the smaller of `code` and its reverse complement
    revcomp_code(code, q); both are compared as int64 values.
    """
    rc = revcomp_code(code, q)
    return min(code, rc)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def compile_revcomp_and_canonical_code(q, rcmode):
    """
    return pair of functions (revcomp_code_q, canonical_code_q)
    specialized for q-gram codes for the given value of q.
    It is expected that LLVM optimization does loop unrolling.

    rcmode selects what canonical_code_q(code) returns:
    "min": the smaller of code and its reverse complement,
    "max": the larger of the two,
    "r": always the reverse complement,
    anything else: the code itself, unchanged.
    """
    @njit(nogil=True, locals=dict(
        code=uint64, rc=uint64, c=uint64))
    def _rc(code):
        rc = 0
        # q // 4 whole bytes: four digits per lookup in _RCTABLE
        t = q // 4
        for i in range(t):
            c = _RCTABLE[code & 255]
            rc = (rc << 8) | c
            code >>= 8
        # remaining q % 4 digits: complement each digit d as 3 - d
        r = q % 4
        for i in range(r):
            c = 3 - (code & 3)
            rc = (rc << 2) | c
            code >>= 2
        return rc

    if rcmode == "min":
        @njit(nogil=True, locals=dict(
            code=uint64, rc=uint64))
        def _cc(code):
            rc = _rc(code)
            return code if code <= rc else rc
    elif rcmode == "max":
        @njit(nogil=True, locals=dict(
            code=uint64, rc=uint64))
        def _cc(code):
            rc = _rc(code)
            return code if code >= rc else rc
    elif rcmode == "r":
        @njit(nogil=True, locals=dict(
            code=uint64, rc=uint64))
        def _cc(code):
            rc = _rc(code)
            return rc
    else:  # 'f', 'both', ...
        @njit(nogil=True, locals=dict(
            code=uint64, rc=uint64))
        def _cc(code):
            return code

    return _rc, _cc
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
# translation of a DNA buffer into codes
|
|
232
|
+
def compile_twobit_to_codes(tmask, rcmode, invalid=U64_MINUSONE):
    """
    Compile and return the pair (twobit_to_codes, twobit_to_code) for the
    given mask and reverse-complement mode.

    tmask: an int k (contiguous k-mer, positions 0 .. k-1), a Mask object
        (its attributes k, w and tuple are used) or a tuple of positions.
    rcmode: passed to compile_revcomp_and_canonical_code.
    invalid: value reported for a window that contains an element >= 4.

    Raises ValueError for any other type of tmask.
    """
    # tmask should be a mask tuple, but might be an int in some cases
    if isinstance(tmask, int):
        k = w = tmask  # be safe here
        tmask = tuple(range(k))
    elif isinstance(tmask, Mask):
        k = tmask.k
        w = tmask.w
        tmask = tmask.tuple
    elif type(tmask) is tuple:
        # k: number of positions; w: window width, last position + 1
        k, w = len(tmask), tmask[-1] + 1
    else:
        raise ValueError(f"mask type {type(tmask)} is not supported.")
    _, ccc = compile_revcomp_and_canonical_code(k, rcmode)

    @njit(nogil=True, locals=dict(code=uint64))
    def twobit_to_codes(seq, out, start=0, n=-1):
        """write n (or all) canonical k-mer codes from seq[start...] into out buffer"""
        if n == -1:
            # all windows of width w that start at or after `start`
            n = len(seq) - w + 1 - start
        for i in range(start, start + n):
            code = 0
            for j in tmask:
                c = seq[i + j]
                if c >= 4:
                    # not a valid two-bit code: mark this window as invalid
                    out[i - start] = uint64(invalid)
                    break
                code = (code << 2) | c
            else:
                # only reached when all positions of the window were valid
                code = ccc(code)
                out[i - start] = code

    @njit(nogil=True, locals=dict(code=uint64))
    def twobit_to_code(seq, start=0):
        """return a single canonical code at seq[start...]"""
        code = 0
        for j in tmask:
            c = seq[start + j]
            if c >= 4:
                return uint64(invalid)
            code = (code << 2) | c
        else:
            code = ccc(code)
            return code

    return twobit_to_codes, twobit_to_code
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Fast DNA encoding and computation of the reverse complement A=00, C=01, T=10, G=11, U=10
|
|
3
|
+
Other characters are not treated correctly, except N=7>3 and n=7>3
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from numba import njit, uint64
|
|
7
|
+
import numpy as np
|
|
8
|
+
from .mask import Mask
|
|
9
|
+
|
|
10
|
+
U64_MINUSONE = uint64(np.iinfo(np.uint64).max)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@njit(nogil=True)
def quick_dna_to_2bits(x):
    """
    Encode x in place: every element e is replaced by (e >> 1) & 7,
    i.e. by bits 1..3 of its ASCII code.
    """
    n = len(x)
    for pos in range(n):
        byte = x[pos]
        x[pos] = (byte >> 1) & 7
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@njit(nogil=True)
def dna_to_2bits(x, y):
    """
    Encode x into y: y[i] receives (x[i] >> 1) & 7 for every index of x;
    x is not modified.
    """
    n = len(x)
    for pos in range(n):
        byte = x[pos]
        y[pos] = (byte >> 1) & 7
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# no need to njit!
|
|
26
|
+
def _get_table_2bits_to_dna():
|
|
27
|
+
b = np.full(256, 35, dtype=np.uint8) # fill with b'#'
|
|
28
|
+
b[0] = 65
|
|
29
|
+
b[1] = 67
|
|
30
|
+
b[2] = 84
|
|
31
|
+
b[3] = 71
|
|
32
|
+
b[6] = 78
|
|
33
|
+
b[7] = 78
|
|
34
|
+
return b
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
_TABLE_2BITS_TO_DNA = _get_table_2bits_to_dna()
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@njit(nogil=True)
def twobits_to_dna_inplace(buf, start=0, end=0):
    """
    Translate buf[start:end] in place through _TABLE_2BITS_TO_DNA.

    start: first index to translate.
    end: one past the last index to translate. A value <= 0 is taken
        relative to the end of the buffer: 0 means "up to len(buf)",
        -k stops k elements before the end.
    """
    if end <= 0:
        # was `len(buf) - end`, which ran past the buffer for negative end
        end = len(buf) + end
    for i in range(start, end):
        buf[i] = _TABLE_2BITS_TO_DNA[buf[i]]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
_TABLE_BITS_TO_DNASTR = ["A", "C", "T", "G"]


def qcode_to_dnastr(qcode, q, table=_TABLE_BITS_TO_DNASTR):
    """
    Decode the integer q-gram code `qcode` into a string of length q.

    The code is read two bits at a time, most significant pair first,
    and each pair is used as an index into `table`.
    """
    value = int(qcode)
    letters = []
    for shift in range(2 * (q - 1), -1, -2):
        letters.append(table[(value >> shift) & 3])
    return "".join(letters)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def compile_revcomp_and_canonical_code(q, rcmode):
    """
    return pair of functions (revcomp_code_q, canonical_code_q)
    specialized for q-gram codes for the given value of q.
    It is expected that LLVM optimization does loop unrolling.

    rcmode selects what canonical_code_q(code) returns:
    "min": the smaller of code and its reverse complement,
    "max": the larger of the two,
    "r": always the reverse complement,
    anything else: the code itself, unchanged.
    """

    # bit pattern '10' repeated q times: XOR with it flips the high bit of
    # every two-bit digit (0 <-> 2, 1 <-> 3)
    mask = uint64(int(q * '10', 2))
    # the reversed digits end up in the top 2*q bits of the 64-bit word
    shift = uint64(64 - 2 * q)

    @njit(nogil=True, locals=dict(value=uint64))
    def reverse_in_pairs(value):
        # reverse the order of the 32 two-bit digits of a 64-bit word by
        # swapping neighbouring groups of 2, 4, 8, 16 and 32 bits
        value = ((value & 0x3333333333333333) << 2) | ((value & 0xCCCCCCCCCCCCCCCC) >> 2)
        value = ((value & 0x0F0F0F0F0F0F0F0F) << 4) | ((value >> 4) & 0x0F0F0F0F0F0F0F0F)
        value = ((value & 0x00FF00FF00FF00FF) << 8) | ((value >> 8) & 0x00FF00FF00FF00FF)
        value = ((value & 0x0000FFFF0000FFFF) << 16) | ((value >> 16) & 0x0000FFFF0000FFFF)
        value = (value << 32) | (value >> 32)
        return value

    @njit(nogil=True, locals=dict(value=uint64))
    def _rc(value):
        value ^= mask  # complement the q digits
        value = reverse_in_pairs(value)
        return value >> shift  # move the q digits back to the low bits

    if rcmode == "min":
        @njit(nogil=True, locals=dict(
            code=uint64, rc=uint64))
        def _cc(code):
            rc = _rc(code)
            return code if code <= rc else rc
    elif rcmode == "max":
        @njit(nogil=True, locals=dict(
            code=uint64, rc=uint64))
        def _cc(code):
            rc = _rc(code)
            return code if code >= rc else rc
    elif rcmode == "r":
        @njit(nogil=True, locals=dict(
            code=uint64, rc=uint64))
        def _cc(code):
            rc = _rc(code)
            return rc
    else:  # 'f', 'both', ...
        @njit(nogil=True, locals=dict(
            code=uint64, rc=uint64))
        def _cc(code):
            return code

    return _rc, _cc
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# translation of a DNA buffer into codes
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def compile_twobit_to_codes(tmask, rcmode, invalid=U64_MINUSONE):
    """
    Compile and return the pair (twobit_to_codes, twobit_to_code) for the
    given mask and reverse-complement mode.

    tmask: an int k (contiguous k-mer, positions 0 .. k-1), a Mask object
        (its attributes k, w and tuple are used) or a tuple of positions.
    rcmode: passed to compile_revcomp_and_canonical_code.
    invalid: value reported for a window that contains an element >= 4.

    Raises ValueError for any other type of tmask.
    """
    # tmask should be a mask tuple, but might be an int in some cases
    if isinstance(tmask, int):
        k = w = tmask  # be safe here
        tmask = tuple(range(k))
    elif isinstance(tmask, Mask):
        k = tmask.k
        w = tmask.w
        tmask = tmask.tuple
    elif type(tmask) is tuple:
        # k: number of positions; w: window width, last position + 1
        k, w = len(tmask), tmask[-1] + 1
    else:
        raise ValueError(f"mask type {type(tmask)} is not supported.")
    _, ccc = compile_revcomp_and_canonical_code(k, rcmode)

    @njit(nogil=True, locals=dict(code=uint64))
    def twobit_to_codes(seq, out, start=0, n=-1):
        """write n (or all) canonical k-mer codes from seq[start...] into out buffer"""
        if n == -1:
            # all windows of width w that start at or after `start`
            n = len(seq) - w + 1 - start
        for i in range(start, start + n):
            code = 0
            for j in tmask:
                c = seq[i + j]
                if c >= 4:
                    # not a valid two-bit code: mark this window as invalid
                    out[i - start] = uint64(invalid)
                    break
                code = (code << 2) | c
            else:
                # only reached when all positions of the window were valid
                code = ccc(code)
                out[i - start] = code

    @njit(nogil=True, locals=dict(code=uint64))
    def twobit_to_code(seq, start=0):
        """return a single canonical code at seq[start...]"""
        code = 0
        for j in tmask:
            c = seq[start + j]
            if c >= 4:
                return uint64(invalid)
            code = (code << 2) | c
        else:
            code = ccc(code)
            return code

    return twobit_to_codes, twobit_to_code
|