minimap2 0.2.22.0 → 0.2.24.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +60 -76
- data/ext/Rakefile +55 -0
- data/ext/cmappy/cmappy.c +129 -0
- data/ext/cmappy/cmappy.h +44 -0
- data/ext/minimap2/FAQ.md +46 -0
- data/ext/minimap2/LICENSE.txt +24 -0
- data/ext/minimap2/MANIFEST.in +10 -0
- data/ext/minimap2/Makefile +132 -0
- data/ext/minimap2/Makefile.simde +97 -0
- data/ext/minimap2/NEWS.md +821 -0
- data/ext/minimap2/README.md +403 -0
- data/ext/minimap2/align.c +1020 -0
- data/ext/minimap2/bseq.c +169 -0
- data/ext/minimap2/bseq.h +64 -0
- data/ext/minimap2/code_of_conduct.md +30 -0
- data/ext/minimap2/cookbook.md +243 -0
- data/ext/minimap2/esterr.c +64 -0
- data/ext/minimap2/example.c +63 -0
- data/ext/minimap2/format.c +559 -0
- data/ext/minimap2/hit.c +466 -0
- data/ext/minimap2/index.c +775 -0
- data/ext/minimap2/kalloc.c +205 -0
- data/ext/minimap2/kalloc.h +76 -0
- data/ext/minimap2/kdq.h +132 -0
- data/ext/minimap2/ketopt.h +120 -0
- data/ext/minimap2/khash.h +615 -0
- data/ext/minimap2/krmq.h +474 -0
- data/ext/minimap2/kseq.h +256 -0
- data/ext/minimap2/ksort.h +153 -0
- data/ext/minimap2/ksw2.h +184 -0
- data/ext/minimap2/ksw2_dispatch.c +96 -0
- data/ext/minimap2/ksw2_extd2_sse.c +402 -0
- data/ext/minimap2/ksw2_exts2_sse.c +416 -0
- data/ext/minimap2/ksw2_extz2_sse.c +313 -0
- data/ext/minimap2/ksw2_ll_sse.c +152 -0
- data/ext/minimap2/kthread.c +159 -0
- data/ext/minimap2/kthread.h +15 -0
- data/ext/minimap2/kvec.h +105 -0
- data/ext/minimap2/lchain.c +369 -0
- data/ext/minimap2/main.c +459 -0
- data/ext/minimap2/map.c +714 -0
- data/ext/minimap2/minimap.h +410 -0
- data/ext/minimap2/minimap2.1 +725 -0
- data/ext/minimap2/misc/README.md +179 -0
- data/ext/minimap2/misc/mmphase.js +335 -0
- data/ext/minimap2/misc/paftools.js +3149 -0
- data/ext/minimap2/misc.c +162 -0
- data/ext/minimap2/mmpriv.h +132 -0
- data/ext/minimap2/options.c +234 -0
- data/ext/minimap2/pe.c +177 -0
- data/ext/minimap2/python/README.rst +196 -0
- data/ext/minimap2/python/cmappy.h +152 -0
- data/ext/minimap2/python/cmappy.pxd +153 -0
- data/ext/minimap2/python/mappy.pyx +273 -0
- data/ext/minimap2/python/minimap2.py +39 -0
- data/ext/minimap2/sdust.c +213 -0
- data/ext/minimap2/sdust.h +25 -0
- data/ext/minimap2/seed.c +131 -0
- data/ext/minimap2/setup.py +55 -0
- data/ext/minimap2/sketch.c +143 -0
- data/ext/minimap2/splitidx.c +84 -0
- data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
- data/ext/minimap2/test/MT-human.fa +278 -0
- data/ext/minimap2/test/MT-orang.fa +276 -0
- data/ext/minimap2/test/q-inv.fa +4 -0
- data/ext/minimap2/test/q2.fa +2 -0
- data/ext/minimap2/test/t-inv.fa +127 -0
- data/ext/minimap2/test/t2.fa +2 -0
- data/ext/minimap2/tex/Makefile +21 -0
- data/ext/minimap2/tex/bioinfo.cls +930 -0
- data/ext/minimap2/tex/blasr-mc.eval +17 -0
- data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
- data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
- data/ext/minimap2/tex/bwa.eval +55 -0
- data/ext/minimap2/tex/eval2roc.pl +33 -0
- data/ext/minimap2/tex/graphmap.eval +4 -0
- data/ext/minimap2/tex/hs38-simu.sh +10 -0
- data/ext/minimap2/tex/minialign.eval +49 -0
- data/ext/minimap2/tex/minimap2.bib +460 -0
- data/ext/minimap2/tex/minimap2.tex +724 -0
- data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
- data/ext/minimap2/tex/mm2-update.tex +240 -0
- data/ext/minimap2/tex/mm2.approx.eval +12 -0
- data/ext/minimap2/tex/mm2.eval +13 -0
- data/ext/minimap2/tex/natbib.bst +1288 -0
- data/ext/minimap2/tex/natbib.sty +803 -0
- data/ext/minimap2/tex/ngmlr.eval +38 -0
- data/ext/minimap2/tex/roc.gp +60 -0
- data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
- data/ext/minimap2.patch +19 -0
- data/lib/minimap2/aligner.rb +4 -4
- data/lib/minimap2/alignment.rb +11 -11
- data/lib/minimap2/ffi/constants.rb +20 -16
- data/lib/minimap2/ffi/functions.rb +5 -0
- data/lib/minimap2/ffi.rb +4 -5
- data/lib/minimap2/version.rb +2 -2
- data/lib/minimap2.rb +51 -15
- metadata +97 -79
- data/lib/minimap2/ffi_helper.rb +0 -53
- data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,273 @@
|
|
1
|
+
from libc.stdint cimport uint8_t, int8_t
|
2
|
+
from libc.stdlib cimport free
|
3
|
+
cimport cmappy
|
4
|
+
import sys
|
5
|
+
|
6
|
+
__version__ = '2.24'
|
7
|
+
|
8
|
+
cmappy.mm_reset_timer()
|
9
|
+
|
10
|
+
cdef class Alignment:
    """One alignment record: target/query coordinates, strand, scores, CIGAR and tags.

    Instances are read-only from Python; every stored field is exposed
    through a property of the same name (without the leading underscore).
    """
    cdef int _ctg_len, _r_st, _r_en
    cdef int _q_st, _q_en
    cdef int _NM, _mlen, _blen
    cdef int8_t _strand, _trans_strand
    cdef uint8_t _mapq, _is_primary
    cdef int _seg_id
    cdef _ctg, _cigar, _cs, _MD # these are python objects

    def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, mlen, blen, NM, trans_strand, seg_id, cs_str, MD_str):
        # The contig name may arrive as bytes; store it as text.
        if isinstance(ctg, str):
            self._ctg = ctg
        else:
            self._ctg = ctg.decode()
        # target (reference) side
        self._ctg_len = cl
        self._r_st = cs
        self._r_en = ce
        # query side
        self._strand = strand
        self._q_st = qs
        self._q_en = qe
        # alignment statistics
        self._NM = NM
        self._mlen = mlen
        self._blen = blen
        self._mapq = mapq
        self._cigar = cigar
        self._is_primary = is_primary
        self._trans_strand = trans_strand
        self._seg_id = seg_id
        # optional tag strings
        self._cs = cs_str
        self._MD = MD_str

    @property
    def ctg(self):
        return self._ctg

    @property
    def ctg_len(self):
        return self._ctg_len

    @property
    def r_st(self):
        return self._r_st

    @property
    def r_en(self):
        return self._r_en

    @property
    def strand(self):
        return self._strand

    @property
    def trans_strand(self):
        return self._trans_strand

    @property
    def blen(self):
        return self._blen

    @property
    def mlen(self):
        return self._mlen

    @property
    def NM(self):
        return self._NM

    @property
    def is_primary(self):
        # stored as an integer flag; exposed as a bool
        return (self._is_primary != 0)

    @property
    def q_st(self):
        return self._q_st

    @property
    def q_en(self):
        return self._q_en

    @property
    def mapq(self):
        return self._mapq

    @property
    def cigar(self):
        return self._cigar

    @property
    def read_num(self):
        # segment ids are 0-based; read numbers are 1-based
        return self._seg_id + 1

    @property
    def cs(self):
        return self._cs

    @property
    def MD(self):
        return self._MD

    @property
    def cigar_str(self):
        # each CIGAR entry is a (length, operation-code) pair
        pieces = [str(op[0]) + 'MIDNSHP=XB'[op[1]] for op in self._cigar]
        return "".join(pieces)

    def __str__(self):
        # Tab-separated text form of the record, followed by tp/ts/cg tags
        # and, when present, the cs tag.
        strand = '+' if self._strand > 0 else ('-' if self._strand < 0 else '?')
        tp = 'tp:A:P' if self._is_primary != 0 else 'tp:A:S'
        if self._trans_strand > 0:
            ts = 'ts:A:+'
        elif self._trans_strand < 0:
            ts = 'ts:A:-'
        else:
            ts = 'ts:A:.'
        fields = [
            str(self._q_st), str(self._q_en), strand,
            self._ctg, str(self._ctg_len), str(self._r_st), str(self._r_en),
            str(self._mlen), str(self._blen), str(self._mapq),
            tp, ts, "cg:Z:" + self.cigar_str,
        ]
        if self._cs != "":
            fields.append("cs:Z:" + self._cs)
        return "\t".join(fields)
|
100
|
+
|
101
|
+
cdef class ThreadBuffer:
    """Owns one mm_tbuf_t: created with the object, destroyed with it."""
    cdef cmappy.mm_tbuf_t *_b

    def __cinit__(self):
        # acquire the C-side buffer as soon as the object exists
        self._b = cmappy.mm_tbuf_init()

    def __dealloc__(self):
        # release the C-side buffer together with the Python object
        cmappy.mm_tbuf_destroy(self._b)
|
109
|
+
|
110
|
+
cdef class Aligner:
    """A minimap2 index together with the indexing and mapping options used with it.

    The index is either read/built from the file ``fn_idx_in`` (optionally
    dumped to ``fn_idx_out``) or built in memory from the single sequence
    ``seq``.  The object evaluates to False when no index is available.
    """
    cdef cmappy.mm_idx_t *_idx
    cdef cmappy.mm_idxopt_t idx_opt
    cdef cmappy.mm_mapopt_t map_opt

    def __cinit__(self, fn_idx_in=None, preset=None, k=None, w=None, min_cnt=None, min_chain_score=None, min_dp_score=None, bw=None, best_n=None, n_threads=3, fn_idx_out=None, max_frag_len=None, extra_flags=None, seq=None, scoring=None):
        cdef cmappy.mm_idx_reader_t *r

        self._idx = NULL
        cmappy.mm_set_opt(NULL, &self.idx_opt, &self.map_opt) # set the default options
        if preset is not None:
            cmappy.mm_set_opt(str.encode(preset), &self.idx_opt, &self.map_opt) # apply preset
        self.map_opt.flag |= 4 # always perform alignment
        self.idx_opt.batch_size = 0x7fffffffffffffffL # always build a uni-part index

        # explicit arguments override the defaults/preset
        if k is not None: self.idx_opt.k = k
        if w is not None: self.idx_opt.w = w
        if min_cnt is not None: self.map_opt.min_cnt = min_cnt
        if min_chain_score is not None: self.map_opt.min_chain_score = min_chain_score
        if min_dp_score is not None: self.map_opt.min_dp_max = min_dp_score
        if bw is not None: self.map_opt.bw = bw
        if best_n is not None: self.map_opt.best_n = best_n
        if max_frag_len is not None: self.map_opt.max_frag_len = max_frag_len
        if extra_flags is not None: self.map_opt.flag |= extra_flags

        # scoring = (a, b, q, e[, q2, e2[, sc_ambi]]); q2/e2 default to q/e
        if scoring is not None and len(scoring) >= 4:
            self.map_opt.a, self.map_opt.b = scoring[0], scoring[1]
            self.map_opt.q, self.map_opt.e = scoring[2], scoring[3]
            self.map_opt.q2, self.map_opt.e2 = self.map_opt.q, self.map_opt.e
            if len(scoring) >= 6:
                self.map_opt.q2, self.map_opt.e2 = scoring[4], scoring[5]
                if len(scoring) >= 7:
                    self.map_opt.sc_ambi = scoring[6]

        if seq is None:
            if fn_idx_out is None:
                r = cmappy.mm_idx_reader_open(str.encode(fn_idx_in), &self.idx_opt, NULL)
            else:
                r = cmappy.mm_idx_reader_open(str.encode(fn_idx_in), &self.idx_opt, str.encode(fn_idx_out))
            if r is not NULL:
                self._idx = cmappy.mm_idx_reader_read(r, n_threads) # NB: ONLY read the first part
                cmappy.mm_idx_reader_close(r)
                # only touch the index if reading actually produced one
                if self._idx is not NULL:
                    cmappy.mm_mapopt_update(&self.map_opt, self._idx)
                    cmappy.mm_idx_index_name(self._idx)
        else:
            self._idx = cmappy.mappy_idx_seq(self.idx_opt.w, self.idx_opt.k, self.idx_opt.flag&1, self.idx_opt.bucket_bits, str.encode(seq), len(seq))
            cmappy.mm_mapopt_update(&self.map_opt, self._idx)
            self.map_opt.mid_occ = 1000 # don't filter high-occ seeds

    def __dealloc__(self):
        if self._idx is not NULL:
            cmappy.mm_idx_destroy(self._idx)

    def __bool__(self):
        return (self._idx != NULL)

    def map(self, seq, seq2=None, buf=None, cs=False, MD=False, max_frag_len=None, extra_flags=None):
        """Generator yielding one Alignment per hit of ``seq`` (and ``seq2``, if given).

        ``buf`` is an optional ThreadBuffer to reuse; a fresh one is created
        otherwise.  ``cs``/``MD`` request the corresponding tag strings.
        ``max_frag_len`` and ``extra_flags`` adjust a per-call copy of the
        mapping options.  Nothing is yielded when there is no index.
        """
        cdef cmappy.mm_reg1_t *regs
        cdef cmappy.mm_hitpy_t h
        cdef ThreadBuffer b
        cdef int n_regs
        cdef char *cs_str = NULL
        cdef int l_cs_str, m_cs_str = 0
        cdef void *km
        cdef cmappy.mm_mapopt_t map_opt

        if self._idx == NULL: return
        map_opt = self.map_opt # work on a copy so per-call overrides do not stick
        if max_frag_len is not None: map_opt.max_frag_len = max_frag_len
        if extra_flags is not None: map_opt.flag |= extra_flags

        if buf is None: b = ThreadBuffer()
        else: b = buf
        km = cmappy.mm_tbuf_get_km(b._b)

        _seq = seq if isinstance(seq, bytes) else seq.encode()
        if seq2 is None:
            regs = cmappy.mm_map_aux(self._idx, _seq, NULL, &n_regs, b._b, &map_opt)
        else:
            _seq2 = seq2 if isinstance(seq2, bytes) else seq2.encode()
            regs = cmappy.mm_map_aux(self._idx, _seq, _seq2, &n_regs, b._b, &map_opt)

        try:
            i = 0
            while i < n_regs:
                cmappy.mm_reg2hitpy(self._idx, &regs[i], &h)
                cigar, _cs, _MD = [], '', ''
                for k in range(h.n_cigar32): # convert the 32-bit CIGAR encoding to Python array
                    c = h.cigar32[k]
                    cigar.append([c>>4, c&0xf])
                if cs or MD: # generate the cs and/or the MD tag, if requested
                    if cs:
                        l_cs_str = cmappy.mm_gen_cs(km, &cs_str, &m_cs_str, self._idx, &regs[i], _seq, 1)
                        _cs = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode()
                    if MD:
                        l_cs_str = cmappy.mm_gen_MD(km, &cs_str, &m_cs_str, self._idx, &regs[i], _seq)
                        _MD = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode()
                yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.mlen, h.blen, h.NM, h.trans_strand, h.seg_id, _cs, _MD)
                cmappy.mm_free_reg1(&regs[i])
                i += 1
        finally:
            # reached on exhaustion, on error and when the generator is closed
            # early: release the hits not yet consumed, then the arrays
            while i < n_regs:
                cmappy.mm_free_reg1(&regs[i])
                i += 1
            free(regs)
            free(cs_str)

    def seq(self, str name, int start=0, int end=0x7fffffff):
        """Return the text of ``name[start:end]`` from the index, or None if nothing was fetched."""
        cdef int l
        cdef char *s
        if self._idx == NULL: return
        s = cmappy.mappy_fetch_seq(self._idx, name.encode(), start, end, &l)
        if l == 0: return None
        r = s[:l] if isinstance(s, str) else s[:l].decode()
        free(s)
        return r

    @property
    def k(self):
        # None when no index is loaded (avoids dereferencing a NULL index)
        if self._idx == NULL: return None
        return self._idx.k

    @property
    def w(self):
        if self._idx == NULL: return None
        return self._idx.w

    @property
    def n_seq(self):
        if self._idx == NULL: return None
        return self._idx.n_seq

    @property
    def seq_names(self):
        cdef char *p
        if self._idx == NULL: return
        sn = []
        for i in range(self._idx.n_seq):
            p = self._idx.seq[i].name
            s = p if isinstance(p, str) else p.decode()
            sn.append(s)
        return sn
|
245
|
+
|
246
|
+
def fastx_read(fn, read_comment=False):
    """Generator over the records of the sequence file ``fn``.

    Yields ``(name, seq, qual)`` tuples, or ``(name, seq, qual, comment)``
    when ``read_comment`` is true.  ``qual`` (and ``comment``) are None when
    the record has none.  Yields nothing if the file cannot be opened.
    """
    cdef cmappy.kseq_t *ks
    ks = cmappy.mm_fastx_open(str.encode(fn))
    if ks is NULL: return None
    try:
        while cmappy.kseq_read(ks) >= 0:
            if ks.qual.l > 0: qual = ks.qual.s if isinstance(ks.qual.s, str) else ks.qual.s.decode()
            else: qual = None
            name = ks.name.s if isinstance(ks.name.s, str) else ks.name.s.decode()
            seq = ks.seq.s if isinstance(ks.seq.s, str) else ks.seq.s.decode()
            if read_comment:
                if ks.comment.l > 0: comment = ks.comment.s if isinstance(ks.comment.s, str) else ks.comment.s.decode()
                else: comment = None
                yield name, seq, qual, comment
            else:
                yield name, seq, qual
    finally:
        # close the reader even if the consumer stops iterating early or a
        # decode fails; otherwise the handle would leak
        cmappy.mm_fastx_close(ks)
|
262
|
+
|
263
|
+
def revcomp(seq):
    """Return the reverse complement of ``seq`` (str or bytes) as text."""
    n = len(seq)
    if isinstance(seq, bytes):
        encoded = seq
    else:
        encoded = seq.encode()
    # the C helper returns a buffer that must be released with free()
    cdef char *rc = cmappy.mappy_revcomp(n, encoded)
    if isinstance(rc, str):
        out = rc[:n]
    else:
        out = rc[:n].decode()
    free(rc)
    return out
|
270
|
+
|
271
|
+
def verbose(v=None):
    """Forward ``v`` to mm_verbose_level(); -1 is passed when ``v`` is None."""
    level = -1 if v is None else v
    return cmappy.mm_verbose_level(level)
|
@@ -0,0 +1,39 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
|
3
|
+
import sys
|
4
|
+
import getopt
|
5
|
+
import mappy as mp
|
6
|
+
|
7
|
+
def main(argv):
    """Command-line driver: map every read of the query file against the reference.

    argv is the full argument vector (program name first).  Prints a usage
    message and exits with status 1 when fewer than two positional
    arguments are given; otherwise prints one line per hit.
    """
    opts, args = getopt.getopt(argv[1:], "x:n:m:k:w:r:c")
    if len(args) < 2:
        print("Usage: minimap2.py [options] <ref.fa>|<ref.mmi> <query.fq>")
        print("Options:")
        print(" -x STR preset: sr, map-pb, map-ont, asm5, asm10 or splice")
        print(" -n INT mininum number of minimizers")
        print(" -m INT mininum chaining score")
        print(" -k INT k-mer length")
        print(" -w INT minimizer window length")
        print(" -r INT band width")
        print(" -c output the cs tag")
        sys.exit(1)

    preset = min_cnt = min_sc = k = w = bw = None
    out_cs = False
    for opt, arg in opts:
        if opt == '-x': preset = arg
        elif opt == '-n': min_cnt = int(arg)
        # store into min_sc, the variable actually handed to the Aligner below
        elif opt == '-m': min_sc = int(arg)
        elif opt == '-r': bw = int(arg)
        elif opt == '-k': k = int(arg)
        elif opt == '-w': w = int(arg)
        elif opt == '-c': out_cs = True

    a = mp.Aligner(args[0], preset=preset, min_cnt=min_cnt, min_chain_score=min_sc, k=k, w=w, bw=bw)
    if not a: raise Exception("ERROR: failed to load/build index file '{}'".format(args[0]))
    for name, seq, qual in mp.fastx_read(args[1]): # read one sequence
        for h in a.map(seq, cs=out_cs): # traverse hits
            print('{}\t{}\t{}'.format(name, len(seq), h))
|
37
|
+
|
38
|
+
if __name__ == "__main__":
|
39
|
+
main(sys.argv)
|
@@ -0,0 +1,213 @@
|
|
1
|
+
#include <string.h>
|
2
|
+
#include <stdint.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#include "kalloc.h"
|
5
|
+
#include "kdq.h"
|
6
|
+
#include "kvec.h"
|
7
|
+
#include "sdust.h"
|
8
|
+
|
9
|
+
#define SD_WLEN 3
|
10
|
+
#define SD_WTOT (1<<(SD_WLEN<<1))
|
11
|
+
#define SD_WMSK (SD_WTOT - 1)
|
12
|
+
|
13
|
+
typedef struct {
|
14
|
+
int start, finish;
|
15
|
+
int r, l;
|
16
|
+
} perf_intv_t;
|
17
|
+
|
18
|
+
typedef kvec_t(perf_intv_t) perf_intv_v;
|
19
|
+
typedef kvec_t(uint64_t) uint64_v;
|
20
|
+
|
21
|
+
KDQ_INIT(int)
|
22
|
+
|
23
|
+
#if defined(_NO_NT4_TBL) || defined(_SDUST_MAIN)
|
24
|
+
unsigned char seq_nt4_table[256] = {
|
25
|
+
0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
26
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
27
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
28
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
29
|
+
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
|
30
|
+
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
31
|
+
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
|
32
|
+
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
33
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
34
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
35
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
36
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
37
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
38
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
39
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
40
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
|
41
|
+
};
|
42
|
+
#else
|
43
|
+
extern unsigned char seq_nt4_table[256];
|
44
|
+
#endif
|
45
|
+
|
46
|
+
struct sdust_buf_s {
|
47
|
+
kdq_t(int) *w;
|
48
|
+
perf_intv_v P; // the list of perfect intervals for the current window, sorted by descending start and then by ascending finish
|
49
|
+
uint64_v res; // the result
|
50
|
+
void *km; // memory pool
|
51
|
+
};
|
52
|
+
|
53
|
+
sdust_buf_t *sdust_buf_init(void *km)
|
54
|
+
{
|
55
|
+
sdust_buf_t *buf;
|
56
|
+
buf = (sdust_buf_t*)kcalloc(km, 1, sizeof(sdust_buf_t));
|
57
|
+
buf->km = km;
|
58
|
+
buf->w = kdq_init(int, buf->km);
|
59
|
+
kdq_resize(int, buf->w, 8);
|
60
|
+
return buf;
|
61
|
+
}
|
62
|
+
|
63
|
+
/*
 * Free a buffer created by sdust_buf_init(): the word queue, the
 * perfect-interval list, the result array and the structure itself.
 * A null buf is accepted and ignored.
 */
void sdust_buf_destroy(sdust_buf_t *buf)
{
	void *km;
	if (!buf) return;
	km = buf->km; // read the pool before the structure itself is freed
	kdq_destroy(int, buf->w);
	kfree(km, buf->P.a);
	kfree(km, buf->res.a);
	kfree(km, buf);
}
|
69
|
+
|
70
|
+
/*
 * Append word t to the window queue w and update the running statistics.
 *
 *   cw/rw: per-word counts and running score over the whole window
 *   cv/rv: per-word counts and running score over the last *L words
 *
 * When the queue already holds W - SD_WLEN + 1 words the oldest one is
 * dropped first.  After the push, if the count of t in the suffix
 * exceeds the threshold (cv[t] * 10 > 2 * T), the suffix is shortened
 * from its left end until one occurrence of t has been removed.
 */
static inline void shift_window(int t, kdq_t(int) *w, int T, int W, int *L, int *rw, int *rv, int *cw, int *cv)
{
	int word;
	if ((int)kdq_size(w) >= W - SD_WLEN + 1) { // TODO: is this right for SD_WLEN!=3?
		word = *kdq_shift(int, w);
		cw[word] -= 1;
		*rw -= cw[word];
		if (*L > (int)kdq_size(w)) { // the dropped word was also part of the suffix
			*L -= 1;
			cv[word] -= 1;
			*rv -= cv[word];
		}
	}
	kdq_push(int, w, t);
	*L += 1;
	*rw += cw[t];
	cw[t] += 1;
	*rv += cv[t];
	cv[t] += 1;
	if (cv[t] * 10 > (T << 1)) {
		do {
			word = kdq_at(w, kdq_size(w) - *L); // leftmost word of the suffix
			cv[word] -= 1;
			*rv -= cv[word];
			*L -= 1;
		} while (word != t);
	}
}
|
91
|
+
|
92
|
+
/*
 * If the last entry of P starts before `start`, record it in res and
 * then drop every trailing entry of P that starts before `start`.
 *
 * Intervals are stored in res as start<<32 | finish.  When the recorded
 * interval begins at or before the end of the last stored one, the two
 * are merged instead of appending a new element.
 */
static inline void save_masked_regions(void *km, uint64_v *res, perf_intv_v *P, int start)
{
	perf_intv_t *last;
	int i, merged = 0;
	if (P->n == 0) return;
	last = &P->a[P->n - 1];
	if (last->start >= start) return; // still inside the window: nothing to save yet
	if (res->n > 0) {
		uint64_t prev = res->a[res->n - 1];
		int prev_st = prev >> 32, prev_en = (uint32_t)prev;
		if (last->start <= prev_en) { // overlapping with or adjacent to the previous interval
			int en = prev_en > last->finish ? prev_en : last->finish;
			res->a[res->n - 1] = (uint64_t)prev_st << 32 | en;
			merged = 1;
		}
	}
	if (!merged) kv_push(uint64_t, km, *res, (uint64_t)last->start<<32|last->finish);
	// remove perfect intervals that have fallen out of the window
	i = P->n - 1;
	while (i >= 0 && P->a[i].start < start) --i;
	P->n = i + 1;
}
|
107
|
+
|
108
|
+
/*
 * Extend the current suffix (the last L words of w) leftwards one word
 * at a time and insert qualifying extensions into P.
 *
 * A candidate with score r and length l is considered when
 * r * 10 > T * l.  It is inserted when its r/l ratio is not lower than
 * the best ratio seen so far (among the entries of P passed over while
 * searching for the insertion point, and earlier insertions of this
 * call).  P is kept ordered by descending start; `start` is the
 * sequence offset of the first word in w.
 */
static void find_perfect(void *km, perf_intv_v *P, const kdq_t(int) *w, int T, int start, int L, int rv, const int *cv)
{
	int cnt[SD_WTOT]; // private copy of the suffix counts, grown as we extend
	int score = rv, best_r = 0, best_l = 0, i;
	memcpy(cnt, cv, sizeof(cnt));
	for (i = (long)kdq_size(w) - L - 1; i >= 0; --i) {
		int j, word = kdq_at(w, i), cand_r, cand_l;
		score += cnt[word];
		++cnt[word];
		cand_r = score;
		cand_l = kdq_size(w) - i - 1;
		if (cand_r * 10 <= T * cand_l) continue; // below the threshold
		for (j = 0; j < (int)P->n && P->a[j].start >= i + start; ++j) { // find insertion position
			const perf_intv_t *p = &P->a[j];
			if (best_r == 0 || p->r * best_l > best_r * p->l) {
				best_r = p->r;
				best_l = p->l;
			}
		}
		if (best_r != 0 && cand_r * best_l < best_r * cand_l) continue; // a better interval exists
		best_r = cand_r;
		best_l = cand_l;
		if (P->n == P->m) kv_resize(perf_intv_t, km, *P, P->n + 1);
		memmove(&P->a[j+1], &P->a[j], (P->n - j) * sizeof(perf_intv_t)); // make room
		++P->n;
		P->a[j].start = i + start;
		P->a[j].finish = kdq_size(w) + (SD_WLEN - 1) + start;
		P->a[j].r = cand_r;
		P->a[j].l = cand_l;
	}
}
|
133
|
+
|
134
|
+
/*
 * Run the masking pass over seq using the caller-supplied buffer.
 *
 * seq    input bases; translated through seq_nt4_table
 * l_seq  sequence length, or a negative value to use strlen(seq)
 * T, W   score threshold and window length handed to the helpers
 * n      receives the number of intervals found
 * buf    working buffer from sdust_buf_init(); its previous results are discarded
 *
 * Returns buf's result array; each element is start<<32 | finish.  The
 * array belongs to buf.
 */
const uint64_t *sdust_core(const uint8_t *seq, int l_seq, int T, int W, int *n, sdust_buf_t *buf)
{
	int cv[SD_WTOT] = {0}, cw[SD_WTOT] = {0};
	int rv = 0, rw = 0, L = 0;
	int pos, win_st; // win_st: start of the current window
	int run = 0;     // length of the current contiguous A/C/G/T stretch
	unsigned word = 0; // current word

	// reset the reusable buffer
	buf->P.n = 0;
	buf->res.n = 0;
	buf->w->count = 0;
	buf->w->front = 0;
	if (l_seq < 0) l_seq = strlen((const char*)seq);
	// one extra iteration (pos == l_seq) acts as a terminating non-base
	for (pos = 0; pos <= l_seq; ++pos) {
		int code = 4;
		if (pos < l_seq) code = seq_nt4_table[seq[pos]];
		if (code >= 4) { // N or the end of sequence; N effectively breaks input into pieces of independent sequences
			win_st = (run - W + 1 > 0? run - W + 1 : 0) + (pos + 1 - run);
			while (buf->P.n) save_masked_regions(buf->km, &buf->res, &buf->P, win_st++); // clear up unsaved perfect intervals
			run = 0;
			word = 0;
			continue;
		}
		// an A/C/G/T base
		++run;
		word = (word<<2 | code) & SD_WMSK;
		if (run < SD_WLEN) continue; // not a full word yet
		win_st = (run - W > 0? run - W : 0) + (pos + 1 - run); // set the start of the current window
		save_masked_regions(buf->km, &buf->res, &buf->P, win_st); // save intervals falling out of the current window?
		shift_window(word, buf->w, T, W, &L, &rw, &rv, cw, cv);
		if (rw * 10 > L * T)
			find_perfect(buf->km, &buf->P, buf->w, T, win_st, L, rv, cv);
	}
	*n = buf->res.n;
	return buf->res.a;
}
|
165
|
+
|
166
|
+
uint64_t *sdust(void *km, const uint8_t *seq, int l_seq, int T, int W, int *n)
|
167
|
+
{
|
168
|
+
uint64_t *ret;
|
169
|
+
sdust_buf_t *buf;
|
170
|
+
buf = sdust_buf_init(km);
|
171
|
+
ret = (uint64_t*)sdust_core(seq, l_seq, T, W, n, buf);
|
172
|
+
buf->res.a = 0;
|
173
|
+
sdust_buf_destroy(buf);
|
174
|
+
return ret;
|
175
|
+
}
|
176
|
+
|
177
|
+
#ifdef _SDUST_MAIN
|
178
|
+
#include <zlib.h>
|
179
|
+
#include <stdio.h>
|
180
|
+
#include "ketopt.h"
|
181
|
+
#include "kseq.h"
|
182
|
+
KSEQ_INIT(gzFile, gzread)
|
183
|
+
|
184
|
+
int main(int argc, char *argv[])
|
185
|
+
{
|
186
|
+
gzFile fp;
|
187
|
+
kseq_t *ks;
|
188
|
+
int W = 64, T = 20, c;
|
189
|
+
ketopt_t o = KETOPT_INIT;
|
190
|
+
|
191
|
+
while ((c = ketopt(&o, argc, argv, 1, "w:t:", 0)) >= 0) {
|
192
|
+
if (c == 'w') W = atoi(o.arg);
|
193
|
+
else if (c == 't') T = atoi(o.arg);
|
194
|
+
}
|
195
|
+
if (o.ind == argc) {
|
196
|
+
fprintf(stderr, "Usage: sdust [-w %d] [-t %d] <in.fa>\n", W, T);
|
197
|
+
return 1;
|
198
|
+
}
|
199
|
+
fp = strcmp(argv[o.ind], "-")? gzopen(argv[o.ind], "r") : gzdopen(fileno(stdin), "r");
|
200
|
+
ks = kseq_init(fp);
|
201
|
+
while (kseq_read(ks) >= 0) {
|
202
|
+
uint64_t *r;
|
203
|
+
int i, n;
|
204
|
+
r = sdust(0, (uint8_t*)ks->seq.s, -1, T, W, &n);
|
205
|
+
for (i = 0; i < n; ++i)
|
206
|
+
printf("%s\t%d\t%d\n", ks->name.s, (int)(r[i]>>32), (int)r[i]);
|
207
|
+
free(r);
|
208
|
+
}
|
209
|
+
kseq_destroy(ks);
|
210
|
+
gzclose(fp);
|
211
|
+
return 0;
|
212
|
+
}
|
213
|
+
#endif
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#ifndef SDUST_H
|
2
|
+
#define SDUST_H
|
3
|
+
|
4
|
+
#include <stdint.h>
|
5
|
+
|
6
|
+
#ifdef __cplusplus
|
7
|
+
extern "C" {
|
8
|
+
#endif
|
9
|
+
|
10
|
+
struct sdust_buf_s;
|
11
|
+
typedef struct sdust_buf_s sdust_buf_t;
|
12
|
+
|
13
|
+
// the simple interface
|
14
|
+
uint64_t *sdust(void *km, const uint8_t *seq, int l_seq, int T, int W, int *n);
|
15
|
+
|
16
|
+
// the following interface dramatically reduces heap allocations when sdust is called frequently.
|
17
|
+
sdust_buf_t *sdust_buf_init(void *km);
|
18
|
+
void sdust_buf_destroy(sdust_buf_t *buf);
|
19
|
+
const uint64_t *sdust_core(const uint8_t *seq, int l_seq, int T, int W, int *n, sdust_buf_t *buf);
|
20
|
+
|
21
|
+
#ifdef __cplusplus
|
22
|
+
}
|
23
|
+
#endif
|
24
|
+
|
25
|
+
#endif
|
data/ext/minimap2/seed.c
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
#include "mmpriv.h"
|
2
|
+
#include "kalloc.h"
|
3
|
+
#include "ksort.h"
|
4
|
+
|
5
|
+
/*
 * Remove over-represented minimizers from mv, in place.
 *
 * Entries of mv are grouped by identical x value.  A group is dropped
 * when its size exceeds both q_occ_max and q_occ_frac * mv->n.  The
 * relative order of the surviving entries is preserved.  Nothing is
 * done when mv->n <= q_occ_max, or when q_occ_max or q_occ_frac is not
 * positive.
 */
void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac)
{
	mm128_t *sorted;
	size_t i, kept, run_st;
	if (q_occ_max <= 0 || q_occ_frac <= 0.0f || mv->n <= q_occ_max) return;

	// sort a copy of the keys, remembering each entry's original index in y
	KMALLOC(km, sorted, mv->n);
	for (i = 0; i < mv->n; ++i) {
		sorted[i].x = mv->a[i].x;
		sorted[i].y = i;
	}
	radix_sort_128x(sorted, sorted + mv->n);

	// walk runs of equal keys; mark members of oversized runs by zeroing x
	run_st = 0;
	for (i = 1; i <= mv->n; ++i) {
		int32_t cnt;
		size_t j;
		if (i != mv->n && sorted[i].x == sorted[run_st].x) continue; // run continues
		cnt = i - run_st;
		if (cnt > q_occ_max && cnt > mv->n * q_occ_frac) {
			for (j = run_st; j < i; ++j)
				mv->a[sorted[j].y].x = 0;
		}
		run_st = i;
	}
	kfree(km, sorted);

	// compact: keep only entries whose x was not zeroed
	kept = 0;
	for (i = 0; i < mv->n; ++i) {
		if (mv->a[i].x == 0) continue;
		mv->a[kept++] = mv->a[i];
	}
	mv->n = kept;
}
|
29
|
+
|
30
|
+
mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv, int32_t *n_m_)
|
31
|
+
{
|
32
|
+
mm_seed_t *m;
|
33
|
+
size_t i;
|
34
|
+
int32_t k;
|
35
|
+
m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t));
|
36
|
+
for (i = k = 0; i < mv->n; ++i) {
|
37
|
+
const uint64_t *cr;
|
38
|
+
mm_seed_t *q;
|
39
|
+
mm128_t *p = &mv->a[i];
|
40
|
+
uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
|
41
|
+
int t;
|
42
|
+
cr = mm_idx_get(mi, p->x>>8, &t);
|
43
|
+
if (t == 0) continue;
|
44
|
+
q = &m[k++];
|
45
|
+
q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
|
46
|
+
q->is_tandem = q->flt = 0;
|
47
|
+
if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1;
|
48
|
+
if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1;
|
49
|
+
}
|
50
|
+
*n_m_ = k;
|
51
|
+
return m;
|
52
|
+
}
|
53
|
+
|
54
|
+
#define MAX_MAX_HIGH_OCC 128
|
55
|
+
|
56
|
+
/*
 * Decide which high-occurrence seeds to filter (flt = 1).
 *
 * A "streak" is a maximal run of consecutive seeds with n > max_occ.
 * Within each streak up to max_high_occ seeds with the smallest
 * occurrence counts are kept, where max_high_occ is derived from the
 * query distance spanned by the streak divided by dist (capped at
 * MAX_MAX_HIGH_OCC); the others are filtered.  Seeds with
 * n > max_max_occ are always filtered.  Seeds outside streaks are left
 * untouched.
 */
void mm_seed_select(int32_t n, mm_seed_t *a, int len, int max_occ, int max_max_occ, int dist)
{
	extern void ks_heapdown_uint64_t(size_t i, size_t n, uint64_t*);
	extern void ks_heapmake_uint64_t(size_t n, uint64_t*);
	uint64_t heap[MAX_MAX_HIGH_OCC]; // fixed-size array: this is to avoid a heap allocation
	int32_t i, prev_low, n_high = 0;

	if (n == 0 || n == 1) return;
	for (i = 0; i < n; ++i)
		if (a[i].n > max_occ) ++n_high;
	if (n_high == 0) return; // no high-frequency k-mers; do nothing

	// prev_low: index of the last seed with n <= max_occ (-1 before the first)
	prev_low = -1;
	for (i = 0; i <= n; ++i) {
		if (i < n && a[i].n > max_occ) continue; // still inside a streak
		if (i - prev_low > 1) { // seeds prev_low+1 .. i-1 form a streak
			int32_t ps = prev_low < 0? 0 : (uint32_t)a[prev_low].q_pos>>1;
			int32_t pe = i == n? len : (uint32_t)a[i].q_pos>>1;
			int32_t st = prev_low + 1, en = i, j, k;
			int32_t max_high_occ = (int32_t)((double)(pe - ps) / dist + .499);
			if (max_high_occ > 0) {
				if (max_high_occ > MAX_MAX_HIGH_OCC)
					max_high_occ = MAX_MAX_HIGH_OCC;
				// seed the heap with the first max_high_occ entries: count<<32 | index
				k = 0;
				for (j = st; j < en && k < max_high_occ; ++j, ++k)
					heap[k] = (uint64_t)a[j].n<<32 | j;
				ks_heapmake_uint64_t(k, heap);
				// if there are more, replace the top whenever a smaller count shows up
				for (; j < en; ++j) {
					if (a[j].n < (int32_t)(heap[0]>>32)) {
						heap[0] = (uint64_t)a[j].n<<32 | j;
						ks_heapdown_uint64_t(0, k, heap);
					}
				}
				// temporarily mark the chosen seeds ...
				for (j = 0; j < k; ++j) a[(uint32_t)heap[j]].flt = 1;
			}
			// ... then invert the marks so that chosen seeds are kept and the
			// rest filtered; seeds above max_max_occ are filtered regardless
			for (j = st; j < en; ++j) {
				a[j].flt ^= 1;
				if (a[j].n > max_max_occ)
					a[j].flt = 1;
			}
		}
		prev_low = i;
	}
}
|
97
|
+
|
98
|
+
mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos)
|
99
|
+
{
|
100
|
+
int rep_st = 0, rep_en = 0, n_m, n_m0;
|
101
|
+
size_t i;
|
102
|
+
mm_seed_t *m;
|
103
|
+
*n_mini_pos = 0;
|
104
|
+
*mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t));
|
105
|
+
m = mm_seed_collect_all(km, mi, mv, &n_m0);
|
106
|
+
if (dist > 0 && max_max_occ > max_occ) {
|
107
|
+
mm_seed_select(n_m0, m, qlen, max_occ, max_max_occ, dist);
|
108
|
+
} else {
|
109
|
+
for (i = 0; i < n_m0; ++i)
|
110
|
+
if (m[i].n > max_occ)
|
111
|
+
m[i].flt = 1;
|
112
|
+
}
|
113
|
+
for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < n_m0; ++i) {
|
114
|
+
mm_seed_t *q = &m[i];
|
115
|
+
//fprintf(stderr, "X\t%d\t%d\t%d\n", q->q_pos>>1, q->n, q->flt);
|
116
|
+
if (q->flt) {
|
117
|
+
int en = (q->q_pos >> 1) + 1, st = en - q->q_span;
|
118
|
+
if (st > rep_en) {
|
119
|
+
*rep_len += rep_en - rep_st;
|
120
|
+
rep_st = st, rep_en = en;
|
121
|
+
} else rep_en = en;
|
122
|
+
} else {
|
123
|
+
*n_a += q->n;
|
124
|
+
(*mini_pos)[(*n_mini_pos)++] = (uint64_t)q->q_span<<32 | q->q_pos>>1;
|
125
|
+
m[n_m++] = *q;
|
126
|
+
}
|
127
|
+
}
|
128
|
+
*rep_len += rep_en - rep_st;
|
129
|
+
*_n_m = n_m;
|
130
|
+
return m;
|
131
|
+
}
|