minimap2 0.2.22.0 → 0.2.24.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +60 -76
- data/ext/Rakefile +55 -0
- data/ext/cmappy/cmappy.c +129 -0
- data/ext/cmappy/cmappy.h +44 -0
- data/ext/minimap2/FAQ.md +46 -0
- data/ext/minimap2/LICENSE.txt +24 -0
- data/ext/minimap2/MANIFEST.in +10 -0
- data/ext/minimap2/Makefile +132 -0
- data/ext/minimap2/Makefile.simde +97 -0
- data/ext/minimap2/NEWS.md +821 -0
- data/ext/minimap2/README.md +403 -0
- data/ext/minimap2/align.c +1020 -0
- data/ext/minimap2/bseq.c +169 -0
- data/ext/minimap2/bseq.h +64 -0
- data/ext/minimap2/code_of_conduct.md +30 -0
- data/ext/minimap2/cookbook.md +243 -0
- data/ext/minimap2/esterr.c +64 -0
- data/ext/minimap2/example.c +63 -0
- data/ext/minimap2/format.c +559 -0
- data/ext/minimap2/hit.c +466 -0
- data/ext/minimap2/index.c +775 -0
- data/ext/minimap2/kalloc.c +205 -0
- data/ext/minimap2/kalloc.h +76 -0
- data/ext/minimap2/kdq.h +132 -0
- data/ext/minimap2/ketopt.h +120 -0
- data/ext/minimap2/khash.h +615 -0
- data/ext/minimap2/krmq.h +474 -0
- data/ext/minimap2/kseq.h +256 -0
- data/ext/minimap2/ksort.h +153 -0
- data/ext/minimap2/ksw2.h +184 -0
- data/ext/minimap2/ksw2_dispatch.c +96 -0
- data/ext/minimap2/ksw2_extd2_sse.c +402 -0
- data/ext/minimap2/ksw2_exts2_sse.c +416 -0
- data/ext/minimap2/ksw2_extz2_sse.c +313 -0
- data/ext/minimap2/ksw2_ll_sse.c +152 -0
- data/ext/minimap2/kthread.c +159 -0
- data/ext/minimap2/kthread.h +15 -0
- data/ext/minimap2/kvec.h +105 -0
- data/ext/minimap2/lchain.c +369 -0
- data/ext/minimap2/main.c +459 -0
- data/ext/minimap2/map.c +714 -0
- data/ext/minimap2/minimap.h +410 -0
- data/ext/minimap2/minimap2.1 +725 -0
- data/ext/minimap2/misc/README.md +179 -0
- data/ext/minimap2/misc/mmphase.js +335 -0
- data/ext/minimap2/misc/paftools.js +3149 -0
- data/ext/minimap2/misc.c +162 -0
- data/ext/minimap2/mmpriv.h +132 -0
- data/ext/minimap2/options.c +234 -0
- data/ext/minimap2/pe.c +177 -0
- data/ext/minimap2/python/README.rst +196 -0
- data/ext/minimap2/python/cmappy.h +152 -0
- data/ext/minimap2/python/cmappy.pxd +153 -0
- data/ext/minimap2/python/mappy.pyx +273 -0
- data/ext/minimap2/python/minimap2.py +39 -0
- data/ext/minimap2/sdust.c +213 -0
- data/ext/minimap2/sdust.h +25 -0
- data/ext/minimap2/seed.c +131 -0
- data/ext/minimap2/setup.py +55 -0
- data/ext/minimap2/sketch.c +143 -0
- data/ext/minimap2/splitidx.c +84 -0
- data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
- data/ext/minimap2/test/MT-human.fa +278 -0
- data/ext/minimap2/test/MT-orang.fa +276 -0
- data/ext/minimap2/test/q-inv.fa +4 -0
- data/ext/minimap2/test/q2.fa +2 -0
- data/ext/minimap2/test/t-inv.fa +127 -0
- data/ext/minimap2/test/t2.fa +2 -0
- data/ext/minimap2/tex/Makefile +21 -0
- data/ext/minimap2/tex/bioinfo.cls +930 -0
- data/ext/minimap2/tex/blasr-mc.eval +17 -0
- data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
- data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
- data/ext/minimap2/tex/bwa.eval +55 -0
- data/ext/minimap2/tex/eval2roc.pl +33 -0
- data/ext/minimap2/tex/graphmap.eval +4 -0
- data/ext/minimap2/tex/hs38-simu.sh +10 -0
- data/ext/minimap2/tex/minialign.eval +49 -0
- data/ext/minimap2/tex/minimap2.bib +460 -0
- data/ext/minimap2/tex/minimap2.tex +724 -0
- data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
- data/ext/minimap2/tex/mm2-update.tex +240 -0
- data/ext/minimap2/tex/mm2.approx.eval +12 -0
- data/ext/minimap2/tex/mm2.eval +13 -0
- data/ext/minimap2/tex/natbib.bst +1288 -0
- data/ext/minimap2/tex/natbib.sty +803 -0
- data/ext/minimap2/tex/ngmlr.eval +38 -0
- data/ext/minimap2/tex/roc.gp +60 -0
- data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
- data/ext/minimap2.patch +19 -0
- data/lib/minimap2/aligner.rb +4 -4
- data/lib/minimap2/alignment.rb +11 -11
- data/lib/minimap2/ffi/constants.rb +20 -16
- data/lib/minimap2/ffi/functions.rb +5 -0
- data/lib/minimap2/ffi.rb +4 -5
- data/lib/minimap2/version.rb +2 -2
- data/lib/minimap2.rb +51 -15
- metadata +97 -79
- data/lib/minimap2/ffi_helper.rb +0 -53
- data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,273 @@
|
|
1
|
+
from libc.stdint cimport uint8_t, int8_t
|
2
|
+
from libc.stdlib cimport free
|
3
|
+
cimport cmappy
|
4
|
+
import sys
|
5
|
+
|
6
|
+
__version__ = '2.24'
|
7
|
+
|
8
|
+
cmappy.mm_reset_timer()
|
9
|
+
|
10
|
+
cdef class Alignment:
    """Read-only record describing one alignment hit.

    All values are supplied to the constructor and exposed through
    properties; __str__ renders them as one tab-separated line.
    """
    cdef int _ctg_len, _r_st, _r_en
    cdef int _q_st, _q_en
    cdef int _NM, _mlen, _blen
    cdef int8_t _strand, _trans_strand
    cdef uint8_t _mapq, _is_primary
    cdef int _seg_id
    cdef _ctg, _cigar, _cs, _MD # these are python objects

    def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, mlen, blen, NM, trans_strand, seg_id, cs_str, MD_str):
        # The contig name may arrive as bytes; store it as str.
        if isinstance(ctg, str):
            self._ctg = ctg
        else:
            self._ctg = ctg.decode()
        self._ctg_len = cl
        self._r_st = cs
        self._r_en = ce
        self._strand = strand
        self._q_st = qs
        self._q_en = qe
        self._NM = NM
        self._mlen = mlen
        self._blen = blen
        self._mapq = mapq
        self._cigar = cigar
        self._is_primary = is_primary
        self._trans_strand = trans_strand
        self._seg_id = seg_id
        self._cs = cs_str
        self._MD = MD_str

    @property
    def ctg(self):
        return self._ctg

    @property
    def ctg_len(self):
        return self._ctg_len

    @property
    def r_st(self):
        return self._r_st

    @property
    def r_en(self):
        return self._r_en

    @property
    def strand(self):
        return self._strand

    @property
    def trans_strand(self):
        return self._trans_strand

    @property
    def blen(self):
        return self._blen

    @property
    def mlen(self):
        return self._mlen

    @property
    def NM(self):
        return self._NM

    @property
    def is_primary(self):
        return (self._is_primary != 0)

    @property
    def q_st(self):
        return self._q_st

    @property
    def q_en(self):
        return self._q_en

    @property
    def mapq(self):
        return self._mapq

    @property
    def cigar(self):
        return self._cigar

    @property
    def read_num(self):
        # seg_id is stored 0-based; the public value is 1-based.
        return self._seg_id + 1

    @property
    def cs(self):
        return self._cs

    @property
    def MD(self):
        return self._MD

    @property
    def cigar_str(self):
        # Each CIGAR entry is indexed as (length, op code); the op code
        # selects one character of the operator alphabet.
        return "".join(str(op[0]) + 'MIDNSHP=XB'[op[1]] for op in self._cigar)

    def __str__(self):
        strand = '+' if self._strand > 0 else ('-' if self._strand < 0 else '?')
        tp = 'tp:A:P' if self._is_primary != 0 else 'tp:A:S'
        ts = 'ts:A:+' if self._trans_strand > 0 else ('ts:A:-' if self._trans_strand < 0 else 'ts:A:.')
        fields = [
            str(self._q_st), str(self._q_en), strand, self._ctg,
            str(self._ctg_len), str(self._r_st), str(self._r_en),
            str(self._mlen), str(self._blen), str(self._mapq),
            tp, ts, "cg:Z:" + self.cigar_str,
        ]
        # The cs tag is only emitted when one was generated.
        if self._cs != "":
            fields.append("cs:Z:" + self._cs)
        return "\t".join(fields)
|
100
|
+
|
101
|
+
cdef class ThreadBuffer:
    """Owns one mm_tbuf_t: created on construction, destroyed on deallocation."""
    cdef cmappy.mm_tbuf_t *_b

    def __cinit__(self):
        # Acquire the native buffer as soon as the object exists.
        self._b = cmappy.mm_tbuf_init()

    def __dealloc__(self):
        # Release the native buffer together with the Python wrapper.
        cmappy.mm_tbuf_destroy(self._b)
|
109
|
+
|
110
|
+
cdef class Aligner:
    """Wrapper around a minimap2 index plus the indexing/mapping options.

    The index is either read from (or built from) the file `fn_idx_in`, or
    built in memory from the single sequence `seq`. If loading fails the
    object is still constructed but evaluates to False (see __bool__).
    """
    cdef cmappy.mm_idx_t *_idx
    cdef cmappy.mm_idxopt_t idx_opt
    cdef cmappy.mm_mapopt_t map_opt

    def __cinit__(self, fn_idx_in=None, preset=None, k=None, w=None, min_cnt=None, min_chain_score=None, min_dp_score=None, bw=None, best_n=None, n_threads=3, fn_idx_out=None, max_frag_len=None, extra_flags=None, seq=None, scoring=None):
        cdef cmappy.mm_idx_reader_t *r

        self._idx = NULL
        cmappy.mm_set_opt(NULL, &self.idx_opt, &self.map_opt) # set the default options
        if preset is not None:
            cmappy.mm_set_opt(str.encode(preset), &self.idx_opt, &self.map_opt) # apply preset
        self.map_opt.flag |= 4 # always perform alignment
        self.idx_opt.batch_size = 0x7fffffffffffffffL # always build a uni-part index
        # Explicit keyword arguments override the defaults/preset.
        if k is not None: self.idx_opt.k = k
        if w is not None: self.idx_opt.w = w
        if min_cnt is not None: self.map_opt.min_cnt = min_cnt
        if min_chain_score is not None: self.map_opt.min_chain_score = min_chain_score
        if min_dp_score is not None: self.map_opt.min_dp_max = min_dp_score
        if bw is not None: self.map_opt.bw = bw
        if best_n is not None: self.map_opt.best_n = best_n
        if max_frag_len is not None: self.map_opt.max_frag_len = max_frag_len
        if extra_flags is not None: self.map_opt.flag |= extra_flags
        # scoring = (a, b, q, e[, q2, e2[, sc_ambi]]); fewer than 4 values are ignored.
        if scoring is not None and len(scoring) >= 4:
            self.map_opt.a, self.map_opt.b = scoring[0], scoring[1]
            self.map_opt.q, self.map_opt.e = scoring[2], scoring[3]
            self.map_opt.q2, self.map_opt.e2 = self.map_opt.q, self.map_opt.e
            if len(scoring) >= 6:
                self.map_opt.q2, self.map_opt.e2 = scoring[4], scoring[5]
                if len(scoring) >= 7:
                    self.map_opt.sc_ambi = scoring[6]

        if seq is None:
            if fn_idx_out is None:
                r = cmappy.mm_idx_reader_open(str.encode(fn_idx_in), &self.idx_opt, NULL)
            else:
                r = cmappy.mm_idx_reader_open(str.encode(fn_idx_in), &self.idx_opt, str.encode(fn_idx_out))
            if r is not NULL:
                self._idx = cmappy.mm_idx_reader_read(r, n_threads) # NB: ONLY read the first part
                cmappy.mm_idx_reader_close(r)
                # Only touch the index when the read actually produced one;
                # the helpers below are handed the pointer directly.
                if self._idx is not NULL:
                    cmappy.mm_mapopt_update(&self.map_opt, self._idx)
                    cmappy.mm_idx_index_name(self._idx)
        else:
            self._idx = cmappy.mappy_idx_seq(self.idx_opt.w, self.idx_opt.k, self.idx_opt.flag&1, self.idx_opt.bucket_bits, str.encode(seq), len(seq))
            cmappy.mm_mapopt_update(&self.map_opt, self._idx)
            self.map_opt.mid_occ = 1000 # don't filter high-occ seeds

    def __dealloc__(self):
        if self._idx is not NULL:
            cmappy.mm_idx_destroy(self._idx)

    def __bool__(self):
        return (self._idx != NULL)

    def map(self, seq, seq2=None, buf=None, cs=False, MD=False, max_frag_len=None, extra_flags=None):
        """Generator yielding one Alignment per hit of `seq` (and optionally `seq2`).

        `buf` may be a ThreadBuffer to reuse; otherwise a fresh one is made.
        `max_frag_len` and `extra_flags` override the stored mapping options
        for this call only. Yields nothing when no index is loaded.
        """
        cdef cmappy.mm_reg1_t *regs
        cdef cmappy.mm_hitpy_t h
        cdef ThreadBuffer b
        cdef int n_regs
        cdef char *cs_str = NULL
        cdef int l_cs_str, m_cs_str = 0
        cdef void *km
        cdef cmappy.mm_mapopt_t map_opt

        if self._idx == NULL: return
        # Work on a copy so per-call overrides do not alter the aligner.
        map_opt = self.map_opt
        if max_frag_len is not None: map_opt.max_frag_len = max_frag_len
        if extra_flags is not None: map_opt.flag |= extra_flags

        if buf is None: b = ThreadBuffer()
        else: b = buf
        km = cmappy.mm_tbuf_get_km(b._b)

        _seq = seq if isinstance(seq, bytes) else seq.encode()
        if seq2 is None:
            regs = cmappy.mm_map_aux(self._idx, _seq, NULL, &n_regs, b._b, &map_opt)
        else:
            _seq2 = seq2 if isinstance(seq2, bytes) else seq2.encode()
            regs = cmappy.mm_map_aux(self._idx, _seq, _seq2, &n_regs, b._b, &map_opt)

        try:
            i = 0
            while i < n_regs:
                cmappy.mm_reg2hitpy(self._idx, &regs[i], &h)
                cigar, _cs, _MD = [], '', ''
                for k in range(h.n_cigar32): # convert the 32-bit CIGAR encoding to Python array
                    c = h.cigar32[k]
                    cigar.append([c>>4, c&0xf])
                if cs or MD: # generate the cs and/or the MD tag, if requested
                    if cs:
                        l_cs_str = cmappy.mm_gen_cs(km, &cs_str, &m_cs_str, self._idx, &regs[i], _seq, 1)
                        _cs = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode()
                    if MD:
                        l_cs_str = cmappy.mm_gen_MD(km, &cs_str, &m_cs_str, self._idx, &regs[i], _seq)
                        _MD = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode()
                yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.mlen, h.blen, h.NM, h.trans_strand, h.seg_id, _cs, _MD)
                cmappy.mm_free_reg1(&regs[i])
                i += 1
        finally:
            # Runs on exhaustion and on early close: release every hit that
            # was not freed in the loop, then the arrays themselves.
            while i < n_regs:
                cmappy.mm_free_reg1(&regs[i])
                i += 1
            free(regs)
            free(cs_str)

    def seq(self, str name, int start=0, int end=0x7fffffff):
        """Return the [start, end) slice of sequence `name` as str, or None."""
        cdef int l
        cdef char *s
        if self._idx == NULL: return
        s = cmappy.mappy_fetch_seq(self._idx, name.encode(), start, end, &l)
        if l == 0: return None
        r = s[:l] if isinstance(s, str) else s[:l].decode()
        free(s)
        return r

    @property
    def k(self):
        # Same guard as seq()/seq_names: no index means no value.
        if self._idx == NULL: return
        return self._idx.k

    @property
    def w(self):
        if self._idx == NULL: return
        return self._idx.w

    @property
    def n_seq(self):
        if self._idx == NULL: return
        return self._idx.n_seq

    @property
    def seq_names(self):
        cdef char *p
        if self._idx == NULL: return
        sn = []
        for i in range(self._idx.n_seq):
            p = self._idx.seq[i].name
            s = p if isinstance(p, str) else p.decode()
            sn.append(s)
        return sn
|
245
|
+
|
246
|
+
def fastx_read(fn, read_comment=False):
    """Generator over the records of the FASTA/FASTQ file `fn`.

    Yields (name, seq, qual), or (name, seq, qual, comment) when
    `read_comment` is true. `qual` and `comment` are None when the record
    has none. Yields nothing if the file cannot be opened.
    """
    cdef cmappy.kseq_t *ks
    ks = cmappy.mm_fastx_open(str.encode(fn))
    if ks is NULL: return None
    try:
        while cmappy.kseq_read(ks) >= 0:
            if ks.qual.l > 0: qual = ks.qual.s if isinstance(ks.qual.s, str) else ks.qual.s.decode()
            else: qual = None
            name = ks.name.s if isinstance(ks.name.s, str) else ks.name.s.decode()
            seq = ks.seq.s if isinstance(ks.seq.s, str) else ks.seq.s.decode()
            if read_comment:
                if ks.comment.l > 0: comment = ks.comment.s if isinstance(ks.comment.s, str) else ks.comment.s.decode()
                else: comment = None
                yield name, seq, qual, comment
            else:
                yield name, seq, qual
    finally:
        # Close the reader even when the caller stops iterating early or a
        # decode error propagates; otherwise the handle would leak.
        cmappy.mm_fastx_close(ks)
|
262
|
+
|
263
|
+
def revcomp(seq):
    """Return the reverse complement of `seq` (str or bytes) as str."""
    cdef char *rc
    n = len(seq)
    if isinstance(seq, bytes):
        raw = seq
    else:
        raw = seq.encode()
    rc = cmappy.mappy_revcomp(n, raw)
    # Copy the C buffer into a Python object before releasing it.
    if isinstance(rc, str):
        out = rc[:n]
    else:
        out = rc[:n].decode()
    free(rc)
    return out
|
270
|
+
|
271
|
+
def verbose(v=None):
    """Forward `v` to mm_verbose_level and return its result; None is sent as -1."""
    return cmappy.mm_verbose_level(-1 if v is None else v)
|
@@ -0,0 +1,39 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
|
3
|
+
import sys
|
4
|
+
import getopt
|
5
|
+
import mappy as mp
|
6
|
+
|
7
|
+
def main(argv):
    """Command-line driver: map every read of a query file against a reference.

    argv -- full argument vector including the program name. Prints the
    usage text and exits with status 1 when fewer than two positional
    arguments are given; otherwise prints one line per alignment hit.
    """
    opts, args = getopt.getopt(argv[1:], "x:n:m:k:w:r:c")
    if len(args) < 2:
        print("Usage: minimap2.py [options] <ref.fa>|<ref.mmi> <query.fq>")
        print("Options:")
        print(" -x STR preset: sr, map-pb, map-ont, asm5, asm10 or splice")
        print(" -n INT mininum number of minimizers")
        print(" -m INT mininum chaining score")
        print(" -k INT k-mer length")
        print(" -w INT minimizer window length")
        print(" -r INT band width")
        print(" -c output the cs tag")
        sys.exit(1)

    preset = min_cnt = min_sc = k = w = bw = None
    out_cs = False
    for opt, arg in opts:
        if opt == '-x': preset = arg
        elif opt == '-n': min_cnt = int(arg)
        # Store into min_sc, the variable actually handed to the aligner;
        # writing to a differently named variable made -m a silent no-op.
        elif opt == '-m': min_sc = int(arg)
        elif opt == '-r': bw = int(arg)
        elif opt == '-k': k = int(arg)
        elif opt == '-w': w = int(arg)
        elif opt == '-c': out_cs = True

    a = mp.Aligner(args[0], preset=preset, min_cnt=min_cnt, min_chain_score=min_sc, k=k, w=w, bw=bw)
    if not a: raise Exception("ERROR: failed to load/build index file '{}'".format(args[0]))
    for name, seq, qual in mp.fastx_read(args[1]): # read one sequence
        for h in a.map(seq, cs=out_cs): # traverse hits
            print('{}\t{}\t{}'.format(name, len(seq), h))
|
37
|
+
|
38
|
+
if __name__ == "__main__":
|
39
|
+
main(sys.argv)
|
@@ -0,0 +1,213 @@
|
|
1
|
+
#include <string.h>
|
2
|
+
#include <stdint.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#include "kalloc.h"
|
5
|
+
#include "kdq.h"
|
6
|
+
#include "kvec.h"
|
7
|
+
#include "sdust.h"
|
8
|
+
|
9
|
+
#define SD_WLEN 3
|
10
|
+
#define SD_WTOT (1<<(SD_WLEN<<1))
|
11
|
+
#define SD_WMSK (SD_WTOT - 1)
|
12
|
+
|
13
|
+
typedef struct {
|
14
|
+
int start, finish;
|
15
|
+
int r, l;
|
16
|
+
} perf_intv_t;
|
17
|
+
|
18
|
+
typedef kvec_t(perf_intv_t) perf_intv_v;
|
19
|
+
typedef kvec_t(uint64_t) uint64_v;
|
20
|
+
|
21
|
+
KDQ_INIT(int)
|
22
|
+
|
23
|
+
#if defined(_NO_NT4_TBL) || defined(_SDUST_MAIN)
|
24
|
+
unsigned char seq_nt4_table[256] = {
|
25
|
+
0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
26
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
27
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
28
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
29
|
+
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
|
30
|
+
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
31
|
+
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
|
32
|
+
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
33
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
34
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
35
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
36
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
37
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
38
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
39
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
40
|
+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
|
41
|
+
};
|
42
|
+
#else
|
43
|
+
extern unsigned char seq_nt4_table[256];
|
44
|
+
#endif
|
45
|
+
|
46
|
+
struct sdust_buf_s {
|
47
|
+
kdq_t(int) *w;
|
48
|
+
perf_intv_v P; // the list of perfect intervals for the current window, sorted by descending start and then by ascending finish
|
49
|
+
uint64_v res; // the result
|
50
|
+
void *km; // memory pool
|
51
|
+
};
|
52
|
+
|
53
|
+
sdust_buf_t *sdust_buf_init(void *km)
|
54
|
+
{
|
55
|
+
sdust_buf_t *buf;
|
56
|
+
buf = (sdust_buf_t*)kcalloc(km, 1, sizeof(sdust_buf_t));
|
57
|
+
buf->km = km;
|
58
|
+
buf->w = kdq_init(int, buf->km);
|
59
|
+
kdq_resize(int, buf->w, 8);
|
60
|
+
return buf;
|
61
|
+
}
|
62
|
+
|
63
|
+
/*
 * Release a buffer created by sdust_buf_init(): the word queue, both
 * vectors and finally the buffer itself. Passing a null buffer is a no-op.
 */
void sdust_buf_destroy(sdust_buf_t *buf)
{
	void *pool;
	if (buf == 0) return;
	pool = buf->km;
	kdq_destroy(int, buf->w);
	kfree(pool, buf->P.a);
	kfree(pool, buf->res.a);
	kfree(pool, buf);
}
|
69
|
+
|
70
|
+
/*
 * Append word t to the window queue w and update the running statistics.
 *   cw / *rw : per-word counts and running score over the whole queue
 *   cv / *rv : per-word counts and running score over the last *L words
 * When the queue already holds W - SD_WLEN + 1 words the oldest one is
 * dropped first. After the push, the *L-word suffix is shortened from its
 * left end until the count of t no longer exceeds the threshold derived
 * from T.
 */
static inline void shift_window(int t, kdq_t(int) *w, int T, int W, int *L, int *rw, int *rv, int *cw, int *cv)
{
	if ((int)kdq_size(w) >= W - SD_WLEN + 1) { // TODO: is this right for SD_WLEN!=3?
		int oldest = *kdq_shift(int, w);
		cw[oldest] -= 1;
		*rw -= cw[oldest];
		if (*L > (int)kdq_size(w)) { /* the dropped word was part of the suffix as well */
			*L -= 1;
			cv[oldest] -= 1;
			*rv -= cv[oldest];
		}
	}
	kdq_push(int, w, t);
	*L += 1;
	*rw += cw[t];
	cw[t] += 1;
	*rv += cv[t];
	cv[t] += 1;
	if (cv[t] * 10 > (T << 1)) {
		int s;
		do { /* peel words off the left end of the suffix, up to and including one t */
			s = kdq_at(w, kdq_size(w) - *L);
			cv[s] -= 1;
			*rv -= cv[s];
			*L -= 1;
		} while (s != t);
	}
}
|
91
|
+
|
92
|
+
/*
 * If the last interval in P starts before `start`, record it in res (packed
 * as start<<32 | finish), merging it into the previous result when the two
 * overlap or touch. Then drop from the tail of P every interval that starts
 * before `start`.
 */
static inline void save_masked_regions(void *km, uint64_v *res, perf_intv_v *P, int start)
{
	const perf_intv_t *last;
	int i, merged = 0;

	if (P->n == 0) return;
	last = &P->a[P->n - 1];
	if (last->start >= start) return;

	if (res->n) {
		uint64_t *prev = &res->a[res->n - 1];
		int s = *prev>>32, f = (uint32_t)*prev;
		if (last->start <= f) { // overlapping with or adjacent to the previous interval
			int end = f > last->finish? f : last->finish;
			*prev = (uint64_t)s<<32 | end;
			merged = 1;
		}
	}
	if (!merged) kv_push(uint64_t, km, *res, (uint64_t)last->start<<32|last->finish);

	// remove perfect intervals that have fallen out of the window
	i = P->n - 1;
	while (i >= 0 && P->a[i].start < start) --i;
	P->n = i + 1;
}
|
107
|
+
|
108
|
+
/*
 * Walk the queue words that precede the L-word suffix, from nearest to
 * farthest, extending the suffix score one word at a time. Every extension
 * whose score/length ratio exceeds T/10 and is at least as good as the best
 * ratio seen so far (among stored intervals starting at or after it and
 * earlier candidates) is inserted into P at the position that keeps P
 * ordered by descending start.
 */
static void find_perfect(void *km, perf_intv_v *P, const kdq_t(int) *w, int T, int start, int L, int rv, const int *cv)
{
	int cnt[SD_WTOT];
	int score = rv, best_r = 0, best_l = 0;
	int i;

	memcpy(cnt, cv, SD_WTOT * sizeof(int));
	for (i = (long)kdq_size(w) - L - 1; i >= 0; --i) {
		int word = kdq_at(w, i);
		int cand_r, cand_l, j;

		score += cnt[word]++;
		cand_r = score;
		cand_l = kdq_size(w) - i - 1;
		if (cand_r * 10 <= T * cand_l) continue; /* below the score threshold */

		// find insertion position, tracking the best ratio on the way
		for (j = 0; j < (int)P->n && P->a[j].start >= i + start; ++j) {
			const perf_intv_t *p = &P->a[j];
			if (best_r == 0 || p->r * best_l > best_r * p->l) {
				best_r = p->r;
				best_l = p->l;
			}
		}
		if (best_r != 0 && cand_r * best_l < best_r * cand_l) continue; /* dominated */

		best_r = cand_r;
		best_l = cand_l;
		if (P->n == P->m) {
			kv_resize(perf_intv_t, km, *P, P->n + 1);
		}
		memmove(&P->a[j+1], &P->a[j], (P->n - j) * sizeof(perf_intv_t)); // make room
		++P->n;
		P->a[j].start = i + start;
		P->a[j].finish = kdq_size(w) + (SD_WLEN - 1) + start;
		P->a[j].r = cand_r;
		P->a[j].l = cand_l;
	}
}
|
133
|
+
|
134
|
+
/*
 * Scan seq and collect masked intervals into buf->res.
 *   l_seq : sequence length; a negative value means "use strlen(seq)"
 *   T, W  : score threshold and window length passed to the helpers
 *   n     : receives the number of intervals
 * Returns buf->res.a, i.e. storage owned by buf and reused by the next call;
 * each element is packed as start<<32 | finish.
 */
const uint64_t *sdust_core(const uint8_t *seq, int l_seq, int T, int W, int *n, sdust_buf_t *buf)
{
	int cv[SD_WTOT], cw[SD_WTOT];
	int rv = 0, rw = 0, L = 0;
	int i, start;
	int l = 0;      // length of the current contiguous A/C/G/T run
	unsigned t = 0; // current word

	/* reset the reusable buffer */
	buf->P.n = buf->res.n = 0;
	buf->w->front = buf->w->count = 0;
	memset(cv, 0, SD_WTOT * sizeof(int));
	memset(cw, 0, SD_WTOT * sizeof(int));
	if (l_seq < 0) l_seq = strlen((const char*)seq);

	/* i == l_seq is one extra step that behaves like an ambiguous base and flushes the tail */
	for (i = 0; i <= l_seq; ++i) {
		int b = i < l_seq? seq_nt4_table[seq[i]] : 4;
		if (b >= 4) { // N or the end of sequence; N effectively breaks input into pieces of independent sequences
			start = (l - W + 1 > 0? l - W + 1 : 0) + (i + 1 - l);
			while (buf->P.n) // clear up unsaved perfect intervals
				save_masked_regions(buf->km, &buf->res, &buf->P, start++);
			l = t = 0;
			continue;
		}
		// an A/C/G/T base
		++l;
		t = (t<<2 | b) & SD_WMSK;
		if (l < SD_WLEN) continue; /* not a full word yet */
		start = (l - W > 0? l - W : 0) + (i + 1 - l); // start of the current window
		save_masked_regions(buf->km, &buf->res, &buf->P, start); // save intervals falling out of the current window?
		shift_window(t, buf->w, T, W, &L, &rw, &rv, cw, cv);
		if (rw * 10 > L * T)
			find_perfect(buf->km, &buf->P, buf->w, T, start, L, rv, cv);
	}
	*n = buf->res.n;
	return buf->res.a;
}
|
165
|
+
|
166
|
+
uint64_t *sdust(void *km, const uint8_t *seq, int l_seq, int T, int W, int *n)
|
167
|
+
{
|
168
|
+
uint64_t *ret;
|
169
|
+
sdust_buf_t *buf;
|
170
|
+
buf = sdust_buf_init(km);
|
171
|
+
ret = (uint64_t*)sdust_core(seq, l_seq, T, W, n, buf);
|
172
|
+
buf->res.a = 0;
|
173
|
+
sdust_buf_destroy(buf);
|
174
|
+
return ret;
|
175
|
+
}
|
176
|
+
|
177
|
+
#ifdef _SDUST_MAIN
|
178
|
+
#include <zlib.h>
|
179
|
+
#include <stdio.h>
|
180
|
+
#include "ketopt.h"
|
181
|
+
#include "kseq.h"
|
182
|
+
KSEQ_INIT(gzFile, gzread)
|
183
|
+
|
184
|
+
int main(int argc, char *argv[])
|
185
|
+
{
|
186
|
+
gzFile fp;
|
187
|
+
kseq_t *ks;
|
188
|
+
int W = 64, T = 20, c;
|
189
|
+
ketopt_t o = KETOPT_INIT;
|
190
|
+
|
191
|
+
while ((c = ketopt(&o, argc, argv, 1, "w:t:", 0)) >= 0) {
|
192
|
+
if (c == 'w') W = atoi(o.arg);
|
193
|
+
else if (c == 't') T = atoi(o.arg);
|
194
|
+
}
|
195
|
+
if (o.ind == argc) {
|
196
|
+
fprintf(stderr, "Usage: sdust [-w %d] [-t %d] <in.fa>\n", W, T);
|
197
|
+
return 1;
|
198
|
+
}
|
199
|
+
fp = strcmp(argv[o.ind], "-")? gzopen(argv[o.ind], "r") : gzdopen(fileno(stdin), "r");
|
200
|
+
ks = kseq_init(fp);
|
201
|
+
while (kseq_read(ks) >= 0) {
|
202
|
+
uint64_t *r;
|
203
|
+
int i, n;
|
204
|
+
r = sdust(0, (uint8_t*)ks->seq.s, -1, T, W, &n);
|
205
|
+
for (i = 0; i < n; ++i)
|
206
|
+
printf("%s\t%d\t%d\n", ks->name.s, (int)(r[i]>>32), (int)r[i]);
|
207
|
+
free(r);
|
208
|
+
}
|
209
|
+
kseq_destroy(ks);
|
210
|
+
gzclose(fp);
|
211
|
+
return 0;
|
212
|
+
}
|
213
|
+
#endif
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#ifndef SDUST_H
|
2
|
+
#define SDUST_H
|
3
|
+
|
4
|
+
#include <stdint.h>
|
5
|
+
|
6
|
+
#ifdef __cplusplus
|
7
|
+
extern "C" {
|
8
|
+
#endif
|
9
|
+
|
10
|
+
struct sdust_buf_s;
|
11
|
+
typedef struct sdust_buf_s sdust_buf_t;
|
12
|
+
|
13
|
+
// the simple interface
|
14
|
+
uint64_t *sdust(void *km, const uint8_t *seq, int l_seq, int T, int W, int *n);
|
15
|
+
|
16
|
+
// the following interface dramatically reduces heap allocations when sdust is called frequently.
|
17
|
+
sdust_buf_t *sdust_buf_init(void *km);
|
18
|
+
void sdust_buf_destroy(sdust_buf_t *buf);
|
19
|
+
const uint64_t *sdust_core(const uint8_t *seq, int l_seq, int T, int W, int *n, sdust_buf_t *buf);
|
20
|
+
|
21
|
+
#ifdef __cplusplus
|
22
|
+
}
|
23
|
+
#endif
|
24
|
+
|
25
|
+
#endif
|
data/ext/minimap2/seed.c
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
#include "mmpriv.h"
|
2
|
+
#include "kalloc.h"
|
3
|
+
#include "ksort.h"
|
4
|
+
|
5
|
+
/*
 * Remove over-represented entries from mv in place. Entries are grouped by
 * their x value; a group is dropped when its size exceeds both q_occ_max
 * and q_occ_frac times the total count. Surviving entries keep their
 * relative order. Nothing happens when either threshold is non-positive or
 * mv holds no more than q_occ_max entries.
 */
void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac)
{
	mm128_t *sorted;
	size_t i, k, grp_st, n_kept;

	if (mv->n <= q_occ_max || q_occ_frac <= 0.0f || q_occ_max <= 0) return;

	/* sort a copy of the keys, remembering each entry's original index in y */
	KMALLOC(km, sorted, mv->n);
	for (i = 0; i < mv->n; ++i) {
		sorted[i].x = mv->a[i].x;
		sorted[i].y = i;
	}
	radix_sort_128x(sorted, sorted + mv->n);

	/* visit each run of equal keys; zero the x field of runs that are too large */
	grp_st = 0;
	for (i = 1; i <= mv->n; ++i) {
		int32_t cnt;
		if (i != mv->n && sorted[i].x == sorted[grp_st].x) continue;
		cnt = i - grp_st;
		if (cnt > q_occ_max && cnt > mv->n * q_occ_frac) {
			for (k = grp_st; k < i; ++k)
				mv->a[sorted[k].y].x = 0;
		}
		grp_st = i;
	}
	kfree(km, sorted);

	/* compact: keep only entries whose x is still non-zero */
	n_kept = 0;
	for (i = 0; i < mv->n; ++i) {
		if (mv->a[i].x != 0)
			mv->a[n_kept++] = mv->a[i];
	}
	mv->n = n_kept;
}
|
29
|
+
|
30
|
+
mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv, int32_t *n_m_)
|
31
|
+
{
|
32
|
+
mm_seed_t *m;
|
33
|
+
size_t i;
|
34
|
+
int32_t k;
|
35
|
+
m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t));
|
36
|
+
for (i = k = 0; i < mv->n; ++i) {
|
37
|
+
const uint64_t *cr;
|
38
|
+
mm_seed_t *q;
|
39
|
+
mm128_t *p = &mv->a[i];
|
40
|
+
uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
|
41
|
+
int t;
|
42
|
+
cr = mm_idx_get(mi, p->x>>8, &t);
|
43
|
+
if (t == 0) continue;
|
44
|
+
q = &m[k++];
|
45
|
+
q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
|
46
|
+
q->is_tandem = q->flt = 0;
|
47
|
+
if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1;
|
48
|
+
if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1;
|
49
|
+
}
|
50
|
+
*n_m_ = k;
|
51
|
+
return m;
|
52
|
+
}
|
53
|
+
|
54
|
+
#define MAX_MAX_HIGH_OCC 128
|
55
|
+
|
56
|
+
/*
 * Decide which high-occurrence seeds to keep. A seed is "high" when its hit
 * count n exceeds max_occ. For every maximal streak of consecutive high
 * seeds, a quota is derived from the query distance spanned by the streak
 * divided by dist (capped at MAX_MAX_HIGH_OCC); the quota-many seeds with
 * the smallest hit counts stay unfiltered and the rest of the streak gets
 * flt = 1. Seeds whose count exceeds max_max_occ are always filtered.
 * Seeds outside streaks are left untouched.
 */
void mm_seed_select(int32_t n, mm_seed_t *a, int len, int max_occ, int max_max_occ, int dist)
{
	extern void ks_heapdown_uint64_t(size_t i, size_t n, uint64_t*);
	extern void ks_heapmake_uint64_t(size_t n, uint64_t*);
	uint64_t heap[MAX_MAX_HIGH_OCC]; // fixed-size scratch space; avoids a heap allocation
	int32_t i, prev_low, n_high = 0;

	if (n == 0 || n == 1) return;
	for (i = 0; i < n; ++i)
		if (a[i].n > max_occ) ++n_high;
	if (n_high == 0) return; // no high-frequency k-mers; do nothing

	prev_low = -1; /* index of the most recent non-high seed, -1 before the first */
	for (i = 0; i <= n; ++i) {
		if (i < n && a[i].n > max_occ) continue; /* still inside a streak */
		if (i - prev_low > 1) { /* seeds prev_low+1 .. i-1 form a streak */
			int32_t ps = prev_low < 0? 0 : (uint32_t)a[prev_low].q_pos>>1;
			int32_t pe = i == n? len : (uint32_t)a[i].q_pos>>1;
			int32_t st = prev_low + 1, en = i, j, k;
			int32_t quota = (int32_t)((double)(pe - ps) / dist + .499);

			if (quota > 0) {
				if (quota > MAX_MAX_HIGH_OCC) quota = MAX_MAX_HIGH_OCC;
				/* seed the heap with the first `quota` seeds, keyed as n<<32 | index */
				for (j = st, k = 0; j < en && k < quota; ++j, ++k)
					heap[k] = (uint64_t)a[j].n<<32 | j;
				ks_heapmake_uint64_t(k, heap);
				/* for the remaining seeds, replace the heap top whenever a smaller count shows up */
				for (; j < en; ++j) {
					if (a[j].n < (int32_t)(heap[0]>>32)) {
						heap[0] = (uint64_t)a[j].n<<32 | j;
						ks_heapdown_uint64_t(0, k, heap);
					}
				}
				for (j = 0; j < k; ++j)
					a[(uint32_t)heap[j]].flt = 1; /* temporary mark on the chosen seeds */
			}
			for (j = st; j < en; ++j) {
				a[j].flt ^= 1; /* invert: chosen seeds end up unfiltered, the others filtered */
				if (a[j].n > max_max_occ) a[j].flt = 1;
			}
		}
		prev_low = i;
	}
}
|
97
|
+
|
98
|
+
mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos)
|
99
|
+
{
|
100
|
+
int rep_st = 0, rep_en = 0, n_m, n_m0;
|
101
|
+
size_t i;
|
102
|
+
mm_seed_t *m;
|
103
|
+
*n_mini_pos = 0;
|
104
|
+
*mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t));
|
105
|
+
m = mm_seed_collect_all(km, mi, mv, &n_m0);
|
106
|
+
if (dist > 0 && max_max_occ > max_occ) {
|
107
|
+
mm_seed_select(n_m0, m, qlen, max_occ, max_max_occ, dist);
|
108
|
+
} else {
|
109
|
+
for (i = 0; i < n_m0; ++i)
|
110
|
+
if (m[i].n > max_occ)
|
111
|
+
m[i].flt = 1;
|
112
|
+
}
|
113
|
+
for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < n_m0; ++i) {
|
114
|
+
mm_seed_t *q = &m[i];
|
115
|
+
//fprintf(stderr, "X\t%d\t%d\t%d\n", q->q_pos>>1, q->n, q->flt);
|
116
|
+
if (q->flt) {
|
117
|
+
int en = (q->q_pos >> 1) + 1, st = en - q->q_span;
|
118
|
+
if (st > rep_en) {
|
119
|
+
*rep_len += rep_en - rep_st;
|
120
|
+
rep_st = st, rep_en = en;
|
121
|
+
} else rep_en = en;
|
122
|
+
} else {
|
123
|
+
*n_a += q->n;
|
124
|
+
(*mini_pos)[(*n_mini_pos)++] = (uint64_t)q->q_span<<32 | q->q_pos>>1;
|
125
|
+
m[n_m++] = *q;
|
126
|
+
}
|
127
|
+
}
|
128
|
+
*rep_len += rep_en - rep_st;
|
129
|
+
*_n_m = n_m;
|
130
|
+
return m;
|
131
|
+
}
|