panxpress 0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. panxpress/cuckoo_filter_utils.py +207 -0
  2. panxpress/dnaencode.py +277 -0
  3. panxpress/dnaencode_fast.py +156 -0
  4. panxpress/fastcash_info.py +156 -0
  5. panxpress/fastcash_main.py +271 -0
  6. panxpress/fastcash_weak_ptr.py +489 -0
  7. panxpress/hash_new.py +742 -0
  8. panxpress/hashfunctions.py +221 -0
  9. panxpress/io/binaryio.py +62 -0
  10. panxpress/io/fastaio.py +461 -0
  11. panxpress/io/fastqio.py +596 -0
  12. panxpress/io/filterio.py +112 -0
  13. panxpress/io/generaldsio.py +46 -0
  14. panxpress/io/generalio.py +252 -0
  15. panxpress/io/hashio.py +514 -0
  16. panxpress/io/seqio.py +200 -0
  17. panxpress/io/textio.py +21 -0
  18. panxpress/io/xorio.py +94 -0
  19. panxpress/kmers.py +474 -0
  20. panxpress/lowlevel/aligned_arrays.py +42 -0
  21. panxpress/lowlevel/bitarray.py +228 -0
  22. panxpress/lowlevel/conpro.py +504 -0
  23. panxpress/lowlevel/debug.py +97 -0
  24. panxpress/lowlevel/intbitarray.py +252 -0
  25. panxpress/lowlevel/libc.py +174 -0
  26. panxpress/lowlevel/llvm.py +638 -0
  27. panxpress/lowlevel/lowlevelfunctions.txt +1 -0
  28. panxpress/lowlevel/numbautils.py +25 -0
  29. panxpress/lowlevel/packedarray.py +186 -0
  30. panxpress/mask.py +69 -0
  31. panxpress/mathutils.py +296 -0
  32. panxpress/panxpress/config/index.yaml +7 -0
  33. panxpress/panxpress/panxpress_build_reference.py +475 -0
  34. panxpress/panxpress/panxpress_correct_gff.py +1342 -0
  35. panxpress/panxpress/panxpress_index.py +286 -0
  36. panxpress/panxpress/panxpress_main.py +308 -0
  37. panxpress/panxpress/panxpress_map_parallel.py +480 -0
  38. panxpress/parameters.py +63 -0
  39. panxpress/srhash.py +594 -0
  40. panxpress/subtable_hashfunctions.py +395 -0
  41. panxpress/values/panxpress.py +87 -0
  42. panxpress-0.2.dist-info/METADATA +374 -0
  43. panxpress-0.2.dist-info/RECORD +47 -0
  44. panxpress-0.2.dist-info/WHEEL +5 -0
  45. panxpress-0.2.dist-info/entry_points.txt +3 -0
  46. panxpress-0.2.dist-info/licenses/LICENSE +21 -0
  47. panxpress-0.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,207 @@
1
+ """
2
+ Utilities, pointer and bit magic for cuckoo filters
3
+ """
4
+
5
+ from math import log2
6
+ from numba import njit, uint8, uint16, uint32, uint64
7
+ import numba as nb
8
+ import numpy as np
9
+ from llvmlite import ir
10
+ from .mathutils import bitsfor
11
+ from .lowlevel.llvm import compile_cttz
12
+
13
+
14
def compute_masks(bits_per_slot, slots):
    """
    Return two uint64 masks covering `slots` fields of `bits_per_slot` bits:
    the first has the lowest bit of every field set,
    the second has the highest bit of every field set.
    """
    padding = '0' * (bits_per_slot - 1)
    low_pattern = (padding + '1') * slots
    high_pattern = ('1' + padding) * slots
    low_bits = uint64(int(low_pattern, 2))
    high_bits = uint64(int(high_pattern, 2))
    return low_bits, high_bits
18
+
19
+
20
+ def compute_fp_mask(bits_per_slot, fingerprint_bits, slots):
21
+ fp_mask = uint64(0)
22
+ for s in range(1, slots):
23
+ fp_mask |= (s << (s * bits_per_slot + fingerprint_bits))
24
+ return uint64(fp_mask)
25
+
26
+
27
def compute_choice_fp_mask(bits_per_slot, fingerprint_bits, slots):
    """
    Return a uint64 mask spanning 2 * slots fields.

    In each of the first `slots` fields, the slot index is written above the
    fingerprint bits and one extra bit is set `bitsfor(slots)` positions
    higher. In each of the following `slots` fields only the slot index is
    written.
    """
    window_bits = bitsfor(slots)
    combined = uint64(0)
    # first half: slot index plus the extra bit above the index
    for slot in range(slots):
        base = slot * bits_per_slot + fingerprint_bits
        combined |= (slot << base)
        combined |= 1 << (base + window_bits)
    # second half: slot index only
    for slot in range(slots):
        combined |= (slot << ((slots + slot) * bits_per_slot + fingerprint_bits))
    return uint64(combined)
36
+
37
+
38
+ def compute_choice_mask(bits_per_slot, fingerprint_bits, slots):
39
+ fp_mask = uint64(0)
40
+ for s in range(0, slots):
41
+ fp_mask |= 1 << (s * bits_per_slot + fingerprint_bits)
42
+ return uint64(fp_mask)
43
+
44
+
45
def compile_lookup(bits_per_slot, slots, windowed=True, choice=False):
    """
    Compile and return a jitted predicate hasvalue(window, fp).

    The predicate replicates `fp` into every field of a 64-bit word
    (multiplication by the low-bit mask), ORs in a constant per-field mask,
    XORs the result with `window` and reports whether any field became zero,
    i.e. whether some field of `window` equals the expected pattern.

    bits_per_slot: width of one field in bits
    slots: number of fields (doubled internally if `choice` is true)
    windowed: if true, int(log2(slots)) bits per field are reserved in
        addition to the one bit that is always reserved
    choice: if true, use the masks for 2 * slots fields
    """
    if windowed:
        fingerprint_bits = uint64(bits_per_slot - int(log2(slots)) - 1)
    else:
        fingerprint_bits = uint64(bits_per_slot - 1)

    if not choice:
        assert bits_per_slot * slots <= 64
        m1, m2 = compute_masks(bits_per_slot, slots)
        if windowed:
            fp_mask = compute_fp_mask(bits_per_slot, fingerprint_bits, slots)
        else:
            fp_mask = uint64(0)
    else:
        # both halves together must still fit into one 64-bit word
        assert bits_per_slot * slots * 2 <= 64
        m1, m2 = compute_masks(bits_per_slot, 2 * slots)
        if windowed:
            fp_mask = compute_choice_fp_mask(bits_per_slot, fingerprint_bits, slots)
        else:
            fp_mask = compute_choice_mask(bits_per_slot, fingerprint_bits, slots)

    @njit(nogil=True)
    def zero_field_flags(word):
        # non-zero iff at least one field of `word` is all zeros
        return uint64(((word) - m1) & (~(word)) & m2)

    @njit(nogil=True)
    def hasvalue(window, fp):
        flags = zero_field_flags(window ^ ((m1 * fp) | fp_mask))
        return flags != 0

    return hasvalue
69
+
70
+
71
def compile_get_empty_slot(bits_per_slot, slots):
    """
    Compile and return a jitted function get_empty_slot(window).

    The returned function marks the all-zero fields of `window`, counts the
    trailing zeros of that marker word and divides by the field width, so it
    yields the index of the lowest empty field.
    NOTE(review): the result for a window without any empty field depends on
    what the compiled cttz returns for 0 -- confirm with callers.
    """
    assert bits_per_slot * slots <= 64
    count_trailing_zeros = compile_cttz('uint64')
    m1, m2 = compute_masks(bits_per_slot, slots)

    @njit(nogil=True)
    def zero_field_flags(word):
        # high bit of a field is set in the result if that field is zero
        return (((word) - m1) & (~(word)) & m2)

    @njit(nogil=True, locals=dict(flags=uint64, slot=uint64))
    def get_empty_slot(window):
        flags = zero_field_flags(window)
        slot = count_trailing_zeros(flags) // bits_per_slot
        return slot

    return get_empty_slot
87
+
88
+
89
def compile_load_value(nbits):
    """
    Compile and return a jitted get_value(array, pos) that reads an
    `nbits`-wide unsigned value starting at bit position `pos` of the
    memory behind `array`.

    For nbits in {8, 16, 32, 64} the value is loaded directly from byte
    address pos >> 3 (the low three bits of pos are ignored).
    For other widths 0 < nbits <= 64 a 64-bit or 128-bit word is loaded from
    that byte address, shifted right by pos & 7, and masked to nbits.
    Returns None for any other nbits.
    """
    # fast version: the value is a whole machine integer
    if nbits in (8, 16, 32, 64):
        int_type = {8: uint8, 16: uint16, 32: uint32, 64: uint64}[nbits]
        pointer = ir.IntType(nbits).as_pointer()

        @nb.extending.intrinsic
        def address_to_value(typingctx, src):
            """ returns the value stored at a given memory address """
            sig = int_type(src)

            def codegen(cgctx, builder, sig, args):
                typed_ptr = builder.inttoptr(args[0], pointer)
                return builder.load(typed_ptr)
            return sig, codegen

        @njit(nogil=True, locals=dict(address=uint64))
        def get_value(fltr, start):
            address = fltr.ctypes.data + (uint64(start) >> 3)
            return address_to_value(address)

        return get_value

    if not (0 < nbits <= 64):
        return None

    # slow version: the value may start at any bit inside a byte
    # round up to whole bytes and add one byte for a bit offset of up to 7
    padded_nbits = ((int(nbits) + 7) & (-8)) + 8
    int_type = uint64
    finalcast = ir.IntType(64)
    mask = ir.Constant(finalcast, int(nbits * '1', 2))
    # load 128 bits if value plus offset may not fit into 64 bits
    load_width = 64 if padded_nbits <= 64 else 128
    cast = ir.IntType(load_width)
    pointer = cast.as_pointer()

    @nb.extending.intrinsic
    def address_to_value(typingctx, address, offset):
        """ returns the value stored at a given memory address """
        sig = int_type(address, offset)

        def codegen(cgctx, builder, sig, args):
            typed_ptr = builder.inttoptr(args[0], pointer)
            word = builder.load(typed_ptr)
            amount = builder.zext(args[1], cast)
            word = builder.lshr(word, amount)
            word = builder.trunc(word, finalcast)
            return builder.and_(word, mask)
        return sig, codegen

    @njit(nogil=True, locals=dict(address=uint64, offset=uint64))
    def get_value(array, pos):
        address = array.ctypes.data + (uint64(pos) >> 3)
        return address_to_value(address, (uint64(pos) & 7))

    return get_value
146
+
147
def compile_store_value(nbits):
    """
    Compile and return a jitted store_value(array, pos, value) that writes
    an `nbits`-wide unsigned value at bit position `pos` of the memory
    behind `array`.

    For nbits in {8, 16, 32, 64} the value is stored directly at byte
    address pos >> 3 (the low three bits of pos are ignored).
    For other widths 0 < nbits <= 64 a 64-bit or 128-bit word is loaded from
    that byte address, the nbits starting at bit pos & 7 are cleared and
    replaced by `value`, and the word is written back.

    Raises NotImplementedError for any other nbits.
    """
    # fast version for multiples of 8 (cast byte counter)
    if nbits in [8, 16, 32, 64]:
        pointer = ir.IntType(nbits).as_pointer()
        signatures = {8: uint8, 16: uint16, 32: uint32, 64: uint64}
        int_type = signatures[nbits]

        @nb.extending.intrinsic
        def store_value_at_address(typingctx, address, value):
            """stores the given value at a given memory address"""
            sig = nb.void(nb.types.uintp, int_type)

            def codegen(cgctx, builder, sig, args):
                ptr = builder.inttoptr(args[0], pointer)
                builder.store(args[1], ptr)
            return sig, codegen

        @njit(nogil=True, locals=dict(pos=uint64))
        def store_value(array, pos, value):
            pos = array.ctypes.data + (uint64(pos) >> 3)
            store_value_at_address(pos, value)

        return store_value

    # slow version if nbits is not a multiple of 8
    elif 0 < nbits <= 64:
        # round up to whole bytes and add one byte for a bit offset of up to 7
        padded_nbits = ((int(nbits) + 7) & (-8)) + 8
        # read-modify-write on 128 bits if value plus offset may exceed 64 bits
        # NOTE(review): the full word (8 or 16 bytes) is read and rewritten,
        # so the array presumably carries padding at its end -- confirm.
        word_bits = 64 if padded_nbits <= 64 else 128
        cast = ir.IntType(word_bits)
        pointer = cast.as_pointer()
        ones = ir.Constant(cast, 2**word_bits - 1)
        mask = ir.Constant(cast, 2**int(nbits) - 1)

        @nb.extending.intrinsic
        def store_value_at_address(typingctx, address, value, shift):
            """stores the given value, shifted left by `shift` bits,
            into the word at a given memory address"""
            # one type per intrinsic argument: (address, value, shift)
            sig = nb.void(nb.types.uintp, uint64, uint64)

            def codegen(cgctx, builder, sig, args):
                ptr = builder.inttoptr(args[0], pointer)
                word = builder.load(ptr)
                wide_shift = builder.zext(args[2], cast)
                # clear the target bits of the loaded word
                zero_mask = builder.shl(mask, wide_shift)
                zero_mask = builder.xor(zero_mask, ones)
                word = builder.and_(word, zero_mask)
                # widen BEFORE shifting so that no high bits of the value
                # are lost in the 128-bit case; drop bits beyond nbits so
                # that neighbouring fields are never touched
                insert = builder.and_(builder.zext(args[1], cast), mask)
                insert = builder.shl(insert, wide_shift)
                word = builder.or_(word, insert)
                builder.store(word, ptr)
            return sig, codegen

        @njit(nogil=True, locals=dict(address=uint64, shift=uint64, value=uint64))
        def store_value(array, pos, value):
            address = array.ctypes.data + (uint64(pos) >> 3)
            shift = uint64(pos) & 7
            store_value_at_address(address, value, shift)

        return store_value

    raise NotImplementedError("Only storing values <= 64 bits supported")
panxpress/dnaencode.py ADDED
@@ -0,0 +1,277 @@
1
+
2
+ import numpy as np
3
+ from numba import njit, uint8, int64, uint64, prange
4
+
5
+ from .mask import Mask
6
+
7
+
8
+ U64_MINUSONE = uint64(np.iinfo(np.uint64).max)
9
+
10
+
11
+ # encoding DNA ###############################
12
+
13
+ def _get_table_dna_to_2bits(default=4):
14
+ b = np.full(256, default, dtype=np.uint8)
15
+ b[97] = 0 # a
16
+ b[65] = 0 # A
17
+ b[99] = 1 # c
18
+ b[67] = 1 # C
19
+ b[103] = 2 # g
20
+ b[71] = 2 # G
21
+ b[116] = 3 # t
22
+ b[84] = 3 # T
23
+ b[117] = 3 # u
24
+ b[85] = 3 # U
25
+ return b
26
+
27
+
28
+ _TABLE_DNA_TO_2BITS = _get_table_dna_to_2bits()
29
+
30
+
31
@njit(nogil=True, locals=dict(pos=int64))
def _dna_to_2bits(x, table):
    """Replace every element of x in place by its entry in `table`."""
    length = x.size
    for pos in range(length):
        x[pos] = table[x[pos]]
35
+
36
+
37
+ # this is the one to use!
38
@njit(nogil=True)
def quick_dna_to_2bits(x):
    """Translate the byte buffer x in place using the module's DNA-to-2-bit table."""
    length = len(x)
    for pos in range(length):
        symbol = x[pos]
        x[pos] = _TABLE_DNA_TO_2BITS[symbol]
42
+
43
+
44
@njit(nogil=True, parallel=True)
def parallel_dna_to_2bits(x):
    """Same translation as quick_dna_to_2bits, with the loop written as a prange."""
    length = x.size
    for pos in prange(length):
        x[pos] = _TABLE_DNA_TO_2BITS[x[pos]]
48
+
49
+
50
+ # numba compile error
51
+ # @njit(nogil=True, locals=dict(seq=uint8[:], table=uint8[:]))
52
def dna_to_2bits(seq, table=_TABLE_DNA_TO_2BITS):
    """
    Return a uint8 array holding the codes of `seq` according to `table`.

    A bytes object is copied into a bytearray first; any other buffer is
    wrapped without copying and is therefore translated in place.
    """
    source = bytearray(seq) if isinstance(seq, bytes) else seq
    encoded = np.frombuffer(source, dtype=np.uint8)
    _dna_to_2bits(encoded, table)
    return encoded
61
+
62
+
63
_TABLE_BITS_TO_DNASTR = ["A", "C", "G", "T"]


def qcode_to_dnastr(qcode, q, table=_TABLE_BITS_TO_DNASTR):
    """
    Decode the integer `qcode` into a string of q letters.
    Two bits are used per letter; the most significant pair comes first.
    """
    value = int(qcode)
    letters = []
    # walk from the highest 2-bit group down to the lowest
    for shift in range(2 * (q - 1), -1, -2):
        letters.append(table[(value >> shift) & 3])
    return "".join(letters)
69
+
70
+
71
@njit(nogil=True, locals=dict(base=uint64))
def write_qcode_to_buffer(qcode, q, buf, start):
    """
    Write the q two-bit symbols of `qcode` into buf[start : start + q],
    most significant symbol first, one symbol per element.
    """
    for offset in range(q):
        shift = 2 * (q - offset - 1)
        base = (qcode >> shift) & 3
        buf[start + offset] = uint8(base)
76
+
77
+
78
+ # no need to njit!
79
+ def _get_table_2bits_to_dna(default=4):
80
+ b = np.full(256, 35, dtype=np.uint8) # fill with b'#'
81
+ b[0] = 65
82
+ b[1] = 67
83
+ b[2] = 71
84
+ b[3] = 84
85
+ b[default] = 78
86
+ return b
87
+
88
+
89
+ _TABLE_2BITS_TO_DNA = _get_table_2bits_to_dna()
90
+
91
+
92
@njit(nogil=True)
def twobits_to_dna_inplace(buf, start=0, end=0):
    """
    Translate buf[start:end] in place from 2-bit codes to ASCII letters
    using the module's code-to-DNA table.

    end == 0 means "up to the end of buf"; a negative end counts from the
    end of the buffer (like a Python slice bound).
    """
    if end <= 0:
        # was len(buf) - end, which moved a negative end PAST the buffer
        end = len(buf) + end
    for i in range(start, end):
        buf[i] = _TABLE_2BITS_TO_DNA[buf[i]]
98
+
99
+
100
+ # ########## reverse complements and canonical representation ##############
101
+
102
@njit(nogil=True,
      locals=dict(front=uint8, back=uint8, n=int64, three=uint8))
def revcomp_inplace(seq):
    """
    Reverse-complement the code array seq in place.
    Codes below 4 are complemented as 3 - code; other codes are only moved.
    """
    n = seq.size
    three = 3
    half = (n + 1) // 2  # includes the middle element when n is odd
    for i in range(half):
        j = n - 1 - i
        # read both ends before writing either of them
        front = seq[i]
        back = seq[j]
        if front < 4:
            seq[j] = three - front
        else:
            seq[j] = front
        if back < 4:
            seq[i] = three - back
        else:
            seq[i] = back
113
+
114
+
115
@njit(nogil=True, locals=dict(
    symbol=uint8, n=int64, three=uint8, rc=uint8[:]))
def revcomp_to_buffer(seq, rc):
    """
    Write the reverse complement of the code array seq into rc.
    Codes below 4 are complemented as 3 - code; other codes are copied.
    """
    n = seq.size
    three = 3
    last = n - 1
    for i in range(n):
        symbol = seq[last - i]
        if symbol < 4:
            rc[i] = three - symbol
        else:
            rc[i] = symbol
123
+
124
+
125
@njit(nogil=True, locals=dict(result=uint8[:]))
def revcomp(seq):
    """Return a new uint8 array holding the reverse complement of seq."""
    result = np.empty_like(seq, dtype=np.uint8)
    revcomp_to_buffer(seq, result)
    return result
130
+
131
+
132
@njit(nogil=True, locals=dict(
    code=uint64, three=uint64, result=uint64, comp=uint64))
def revcomp_code(code, q):
    """
    Return the reverse complement of the q-gram code `code`
    (2 bits per symbol, complement of a symbol is 3 - symbol).
    """
    # only works for 0 <= q <= 31 !
    # when using uints, due to a potential bug in numpy/numba,
    # we would have to re-declare code as uint64 locally.
    three = uint64(3)
    result = 0
    for _ in range(q):
        # take the lowest symbol, complement it, append it to the result
        comp = three - (code & three)
        result = (result << 2) | comp
        code >>= 2
    return result
145
+
146
+
147
@njit(nogil=True, locals=dict(
    byte=uint64))
def _get_rctable():
    """Return a uint64 table with the reverse complement of every 4-gram code (one byte)."""
    table = np.zeros(256, dtype=np.uint64)
    for byte in range(256):
        table[byte] = revcomp_code(byte, 4)
    return table
154
+
155
+
156
+ _RCTABLE = _get_rctable()
157
+
158
+
159
@njit(nogil=True, locals=dict(
    code=uint64, acc=uint64, piece=uint64))
def revcomp_code_table(code, q):
    """
    Return the reverse complement of the q-gram code `code`,
    handling four symbols at a time via the byte table _RCTABLE
    and the remaining symbols one by one.
    """
    acc = 0
    # whole bytes (4 symbols each) via table lookup
    while q >= 4:
        piece = _RCTABLE[code & 255]
        acc = (acc << 8) | piece
        code >>= 8
        q -= 4
    # up to three leftover symbols
    for _ in range(q):
        piece = 3 - (code & 3)
        acc = (acc << 2) | piece
        code >>= 2
    return acc
173
+
174
+
175
@njit(nogil=True, locals=dict(
    code=int64, rc=int64))
def canonical_code(code, q):
    """
    Return the smaller of `code` and its reverse complement.
    NOTE(review): the comparison is done on int64 values, while
    revcomp_code works on uint64 -- fine as long as q <= 31; confirm.
    """
    rc = revcomp_code(code, q)
    if rc < code:
        return rc
    return code
180
+
181
+
182
def compile_revcomp_and_canonical_code(q, rcmode):
    """
    Return a pair of jitted functions (revcomp_code_q, canonical_code_q)
    specialized for q-gram codes of the given length q.

    rcmode selects what the second function returns:
      "min": the smaller of code and reverse complement,
      "max": the larger of the two,
      "r":   always the reverse complement,
      anything else ('f', 'both', ...): the code itself.
    LLVM is expected to unroll the loops, since q is a constant here.
    """
    whole_bytes = q // 4  # groups of four symbols, handled via _RCTABLE
    leftover = q % 4      # remaining symbols, handled one by one

    @njit(nogil=True, locals=dict(
        code=uint64, rc=uint64, c=uint64))
    def _rc(code):
        rc = 0
        for _ in range(whole_bytes):
            c = _RCTABLE[code & 255]
            rc = (rc << 8) | c
            code >>= 8
        for _ in range(leftover):
            c = 3 - (code & 3)
            rc = (rc << 2) | c
            code >>= 2
        return rc

    if rcmode == "min":
        @njit(nogil=True, locals=dict(
            code=uint64, rc=uint64))
        def _cc(code):
            rc = _rc(code)
            if rc < code:
                return rc
            return code
    elif rcmode == "max":
        @njit(nogil=True, locals=dict(
            code=uint64, rc=uint64))
        def _cc(code):
            rc = _rc(code)
            if rc > code:
                return rc
            return code
    elif rcmode == "r":
        @njit(nogil=True, locals=dict(
            code=uint64, rc=uint64))
        def _cc(code):
            rc = _rc(code)
            return rc
    else:  # 'f', 'both', ...
        @njit(nogil=True, locals=dict(
            code=uint64, rc=uint64))
        def _cc(code):
            return code

    return _rc, _cc
229
+
230
+
231
+ # translation of a DNA buffer into codes
232
def compile_twobit_to_codes(tmask, rcmode, invalid=U64_MINUSONE):
    """
    Return a pair of jitted functions (twobit_to_codes, twobit_to_code)
    that turn a 2-bit encoded sequence into k-mer codes for the given mask.

    tmask: an int k (contiguous k-mer), a Mask object, or a tuple of positions
    rcmode: passed on to compile_revcomp_and_canonical_code
    invalid: code written/returned if a window contains a symbol >= 4

    Raises ValueError for any other type of tmask.
    """
    # tmask should be a mask tuple, but might be an int in some cases
    if isinstance(tmask, int):
        k = w = tmask  # be safe here
        tmask = tuple(range(k))
    elif isinstance(tmask, Mask):
        k, w, tmask = tmask.k, tmask.w, tmask.tuple
    elif type(tmask) is tuple:
        k = len(tmask)
        w = tmask[-1] + 1
    else:
        raise ValueError(f"mask type {type(tmask)} is not supported.")
    _, ccc = compile_revcomp_and_canonical_code(k, rcmode)

    @njit(nogil=True, locals=dict(code=uint64))
    def twobit_to_codes(seq, out, start=0, n=-1):
        """write n (or all) canonical k-mer codes from seq[start...] into out buffer"""
        if n == -1:
            n = len(seq) - w + 1 - start
        for i in range(start, start + n):
            code = 0
            valid = True
            for j in tmask:
                c = seq[i + j]
                if c >= 4:
                    valid = False
                    break
                code = (code << 2) | c
            if valid:
                code = ccc(code)
                out[i - start] = code
            else:
                out[i - start] = uint64(invalid)

    @njit(nogil=True, locals=dict(code=uint64))
    def twobit_to_code(seq, start=0):
        """return a single canonical code at seq[start...]"""
        code = 0
        for j in tmask:
            c = seq[start + j]
            if c >= 4:
                return uint64(invalid)
            code = (code << 2) | c
        # reached only if every symbol of the window was valid
        code = ccc(code)
        return code

    return twobit_to_codes, twobit_to_code
@@ -0,0 +1,156 @@
1
+ """
2
+ Fast DNA encoding and computation of the reverse complement A=00, C=01, T=10, G=11, U=10
3
+ Other characters are not treated correctly, except N and n, which both map to 7 > 3
4
+ """
5
+
6
+ from numba import njit, uint64
7
+ import numpy as np
8
+ from .mask import Mask
9
+
10
+ U64_MINUSONE = uint64(np.iinfo(np.uint64).max)
11
+
12
+
13
@njit(nogil=True)
def quick_dna_to_2bits(x):
    """Encode the byte buffer x in place: each byte b becomes (b >> 1) & 7."""
    length = len(x)
    for pos in range(length):
        symbol = x[pos]
        x[pos] = (symbol >> 1) & 7
17
+
18
+
19
@njit(nogil=True)
def dna_to_2bits(x, y):
    """Encode the byte buffer x into y: y[i] = (x[i] >> 1) & 7; x is left unchanged."""
    length = len(x)
    for pos in range(length):
        symbol = x[pos]
        y[pos] = (symbol >> 1) & 7
23
+
24
+
25
+ # no need to njit!
26
+ def _get_table_2bits_to_dna():
27
+ b = np.full(256, 35, dtype=np.uint8) # fill with b'#'
28
+ b[0] = 65
29
+ b[1] = 67
30
+ b[2] = 84
31
+ b[3] = 71
32
+ b[6] = 78
33
+ b[7] = 78
34
+ return b
35
+
36
+
37
+ _TABLE_2BITS_TO_DNA = _get_table_2bits_to_dna()
38
+
39
+
40
@njit(nogil=True)
def twobits_to_dna_inplace(buf, start=0, end=0):
    """
    Translate buf[start:end] in place from codes to ASCII letters
    using the module's code-to-DNA table.

    end == 0 means "up to the end of buf"; a negative end counts from the
    end of the buffer (like a Python slice bound).
    """
    if end <= 0:
        # was len(buf) - end, which moved a negative end PAST the buffer
        end = len(buf) + end
    for i in range(start, end):
        buf[i] = _TABLE_2BITS_TO_DNA[buf[i]]
46
+
47
+
48
_TABLE_BITS_TO_DNASTR = ["A", "C", "T", "G"]


def qcode_to_dnastr(qcode, q, table=_TABLE_BITS_TO_DNASTR):
    """
    Decode the integer `qcode` into a string of q letters.
    Two bits are used per letter; the most significant pair comes first.
    """
    value = int(qcode)
    letters = []
    # walk from the highest 2-bit group down to the lowest
    for shift in range(2 * (q - 1), -1, -2):
        letters.append(table[(value >> shift) & 3])
    return "".join(letters)
54
+
55
+
56
def compile_revcomp_and_canonical_code(q, rcmode):
    """
    Return a pair of jitted functions (revcomp_code_q, canonical_code_q)
    specialized for q-gram codes of the given length q.

    The reverse complement is computed without a loop: XOR with the
    pattern '10' repeated q times complements every symbol, then the order
    of all 2-bit groups in the 64-bit word is reversed, and finally the
    result is shifted down by 64 - 2*q bits.

    rcmode selects what the second function returns:
      "min": the smaller of code and reverse complement,
      "max": the larger of the two,
      "r":   always the reverse complement,
      anything else ('f', 'both', ...): the code itself.
    """
    complement_bits = uint64(int(q * '10', 2))
    unused_bits = uint64(64 - 2 * q)

    @njit(nogil=True, locals=dict(value=uint64))
    def reverse_in_pairs(value):
        # swap neighbours at growing distances: 2, 4, 8, 16, 32 bits
        value = ((value & 0x3333333333333333) << 2) | ((value & 0xCCCCCCCCCCCCCCCC) >> 2)
        value = ((value & 0x0F0F0F0F0F0F0F0F) << 4) | ((value >> 4) & 0x0F0F0F0F0F0F0F0F)
        value = ((value & 0x00FF00FF00FF00FF) << 8) | ((value >> 8) & 0x00FF00FF00FF00FF)
        value = ((value & 0x0000FFFF0000FFFF) << 16) | ((value >> 16) & 0x0000FFFF0000FFFF)
        value = (value << 32) | (value >> 32)
        return value

    @njit(nogil=True, locals=dict(value=uint64))
    def _rc(value):
        value ^= complement_bits
        value = reverse_in_pairs(value)
        # the q symbols now sit in the top bits; move them down
        return value >> unused_bits

    if rcmode == "min":
        @njit(nogil=True, locals=dict(
            code=uint64, rc=uint64))
        def _cc(code):
            rc = _rc(code)
            if rc < code:
                return rc
            return code
    elif rcmode == "max":
        @njit(nogil=True, locals=dict(
            code=uint64, rc=uint64))
        def _cc(code):
            rc = _rc(code)
            if rc > code:
                return rc
            return code
    elif rcmode == "r":
        @njit(nogil=True, locals=dict(
            code=uint64, rc=uint64))
        def _cc(code):
            rc = _rc(code)
            return rc
    else:  # 'f', 'both', ...
        @njit(nogil=True, locals=dict(
            code=uint64, rc=uint64))
        def _cc(code):
            return code

    return _rc, _cc
106
+
107
+
108
+ # translation of a DNA buffer into codes
109
+
110
+
111
def compile_twobit_to_codes(tmask, rcmode, invalid=U64_MINUSONE):
    """
    Return a pair of jitted functions (twobit_to_codes, twobit_to_code)
    that turn an encoded sequence into k-mer codes for the given mask.

    tmask: an int k (contiguous k-mer), a Mask object, or a tuple of positions
    rcmode: passed on to compile_revcomp_and_canonical_code
    invalid: code written/returned if a window contains a symbol >= 4

    Raises ValueError for any other type of tmask.
    """
    # tmask should be a mask tuple, but might be an int in some cases
    if isinstance(tmask, int):
        k = w = tmask  # be safe here
        tmask = tuple(range(k))
    elif isinstance(tmask, Mask):
        k, w, tmask = tmask.k, tmask.w, tmask.tuple
    elif type(tmask) is tuple:
        k = len(tmask)
        w = tmask[-1] + 1
    else:
        raise ValueError(f"mask type {type(tmask)} is not supported.")
    _, ccc = compile_revcomp_and_canonical_code(k, rcmode)

    @njit(nogil=True, locals=dict(code=uint64))
    def twobit_to_codes(seq, out, start=0, n=-1):
        """write n (or all) canonical k-mer codes from seq[start...] into out buffer"""
        if n == -1:
            n = len(seq) - w + 1 - start
        for i in range(start, start + n):
            code = 0
            valid = True
            for j in tmask:
                c = seq[i + j]
                if c >= 4:
                    valid = False
                    break
                code = (code << 2) | c
            if valid:
                code = ccc(code)
                out[i - start] = code
            else:
                out[i - start] = uint64(invalid)

    @njit(nogil=True, locals=dict(code=uint64))
    def twobit_to_code(seq, start=0):
        """return a single canonical code at seq[start...]"""
        code = 0
        for j in tmask:
            c = seq[start + j]
            if c >= 4:
                return uint64(invalid)
            code = (code << 2) | c
        # reached only if every symbol of the window was valid
        code = ccc(code)
        return code

    return twobit_to_codes, twobit_to_code