minimap2 0.2.21 → 0.2.24.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +53 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +821 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +369 -0
  41. data/ext/minimap2/main.c +459 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +410 -0
  44. data/ext/minimap2/minimap2.1 +725 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +132 -0
  50. data/ext/minimap2/options.c +234 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/lib/minimap2/aligner.rb +4 -4
  93. data/lib/minimap2/alignment.rb +11 -11
  94. data/lib/minimap2/ffi/constants.rb +75 -56
  95. data/lib/minimap2/ffi/functions.rb +5 -0
  96. data/lib/minimap2/ffi.rb +4 -5
  97. data/lib/minimap2/version.rb +2 -2
  98. data/lib/minimap2.rb +80 -28
  99. metadata +97 -65
  100. data/lib/minimap2/ffi_helper.rb +0 -53
  101. data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,273 @@
1
+ from libc.stdint cimport uint8_t, int8_t
2
+ from libc.stdlib cimport free
3
+ cimport cmappy
4
+ import sys
5
+
6
+ __version__ = '2.24'
7
+
8
+ cmappy.mm_reset_timer()
9
+
10
cdef class Alignment:
    """One alignment record yielded by Aligner.map().

    Every field is stored once in __cinit__ and exposed to Python through a
    read-only property of the same name (without the leading underscore).
    """
    cdef int _ctg_len, _r_st, _r_en
    cdef int _q_st, _q_en
    cdef int _NM, _mlen, _blen
    cdef int8_t _strand, _trans_strand
    cdef uint8_t _mapq, _is_primary
    cdef int _seg_id
    cdef _ctg, _cigar, _cs, _MD # these are python objects

    def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, mlen, blen, NM, trans_strand, seg_id, cs_str, MD_str):
        # the contig name may arrive as bytes; keep it as str
        if isinstance(ctg, str):
            self._ctg = ctg
        else:
            self._ctg = ctg.decode()
        # reference (contig) coordinates
        self._ctg_len = cl
        self._r_st = cs
        self._r_en = ce
        # strand and query coordinates
        self._strand = strand
        self._q_st = qs
        self._q_en = qe
        # alignment statistics
        self._NM = NM
        self._mlen = mlen
        self._blen = blen
        self._mapq = mapq
        self._cigar = cigar
        self._is_primary = is_primary
        self._trans_strand = trans_strand
        self._seg_id = seg_id
        # optional tags; empty strings when not requested
        self._cs = cs_str
        self._MD = MD_str

    @property
    def ctg(self):
        return self._ctg

    @property
    def ctg_len(self):
        return self._ctg_len

    @property
    def r_st(self):
        return self._r_st

    @property
    def r_en(self):
        return self._r_en

    @property
    def strand(self):
        return self._strand

    @property
    def trans_strand(self):
        return self._trans_strand

    @property
    def blen(self):
        return self._blen

    @property
    def mlen(self):
        return self._mlen

    @property
    def NM(self):
        return self._NM

    @property
    def is_primary(self):
        # stored as an integer flag; reported as a bool
        return (self._is_primary != 0)

    @property
    def q_st(self):
        return self._q_st

    @property
    def q_en(self):
        return self._q_en

    @property
    def mapq(self):
        return self._mapq

    @property
    def cigar(self):
        return self._cigar

    @property
    def read_num(self):
        # segment ids are 0-based; read numbers are 1-based
        return self._seg_id + 1

    @property
    def cs(self):
        return self._cs

    @property
    def MD(self):
        return self._MD

    @property
    def cigar_str(self):
        """The CIGAR as text: each [length, op] pair becomes '<length><op letter>'."""
        op_letters = 'MIDNSHP=XB'
        return "".join(str(x[0]) + op_letters[x[1]] for x in self._cigar)

    def __str__(self):
        """Tab-separated summary of the hit, followed by tp/ts/cg (and cs if present) tags."""
        if self._strand > 0:
            strand = '+'
        elif self._strand < 0:
            strand = '-'
        else:
            strand = '?'
        tp = 'tp:A:P' if self._is_primary != 0 else 'tp:A:S'
        if self._trans_strand > 0:
            ts = 'ts:A:+'
        elif self._trans_strand < 0:
            ts = 'ts:A:-'
        else:
            ts = 'ts:A:.'
        a = [
            str(self._q_st), str(self._q_en), strand,
            self._ctg, str(self._ctg_len), str(self._r_st), str(self._r_en),
            str(self._mlen), str(self._blen), str(self._mapq),
            tp, ts, "cg:Z:" + self.cigar_str,
        ]
        if self._cs != "":
            a.append("cs:Z:" + self._cs)
        return "\t".join(a)
100
+
101
cdef class ThreadBuffer:
    """Wraps a buffer obtained from mm_tbuf_init().

    The buffer is created when the object is constructed and handed to
    mm_tbuf_destroy() when the object is deallocated.
    """
    cdef cmappy.mm_tbuf_t *_b

    def __cinit__(self):
        # acquire the C-side buffer
        self._b = cmappy.mm_tbuf_init()

    def __dealloc__(self):
        # release the C-side buffer together with the Python object
        cmappy.mm_tbuf_destroy(self._b)
109
+
110
cdef class Aligner:
    """Holds a minimap2 index together with the indexing and mapping options.

    The index is either read/built from the file ``fn_idx_in`` or built from
    the single in-memory sequence ``seq``. When no index could be obtained the
    object is falsy (see __bool__), map()/seq() return None, and the k, w,
    n_seq and seq_names properties evaluate to None.
    """
    cdef cmappy.mm_idx_t *_idx
    cdef cmappy.mm_idxopt_t idx_opt
    cdef cmappy.mm_mapopt_t map_opt

    def __cinit__(self, fn_idx_in=None, preset=None, k=None, w=None, min_cnt=None, min_chain_score=None, min_dp_score=None, bw=None, best_n=None, n_threads=3, fn_idx_out=None, max_frag_len=None, extra_flags=None, seq=None, scoring=None):
        cdef cmappy.mm_idx_reader_t *r

        self._idx = NULL
        cmappy.mm_set_opt(NULL, &self.idx_opt, &self.map_opt) # set the default options
        if preset is not None:
            cmappy.mm_set_opt(str.encode(preset), &self.idx_opt, &self.map_opt) # apply preset
        self.map_opt.flag |= 4 # always perform alignment
        self.idx_opt.batch_size = 0x7fffffffffffffffL # always build a uni-part index

        # explicit arguments override whatever the preset selected
        if k is not None: self.idx_opt.k = k
        if w is not None: self.idx_opt.w = w
        if min_cnt is not None: self.map_opt.min_cnt = min_cnt
        if min_chain_score is not None: self.map_opt.min_chain_score = min_chain_score
        if min_dp_score is not None: self.map_opt.min_dp_max = min_dp_score
        if bw is not None: self.map_opt.bw = bw
        if best_n is not None: self.map_opt.best_n = best_n
        if max_frag_len is not None: self.map_opt.max_frag_len = max_frag_len
        if extra_flags is not None: self.map_opt.flag |= extra_flags

        # scoring is a sequence of 4, 6 or 7 values: a, b, q, e[, q2, e2[, sc_ambi]]
        if scoring is not None and len(scoring) >= 4:
            self.map_opt.a, self.map_opt.b = scoring[0], scoring[1]
            self.map_opt.q, self.map_opt.e = scoring[2], scoring[3]
            # without explicit q2/e2 the second gap cost mirrors the first
            self.map_opt.q2, self.map_opt.e2 = self.map_opt.q, self.map_opt.e
            if len(scoring) >= 6:
                self.map_opt.q2, self.map_opt.e2 = scoring[4], scoring[5]
                if len(scoring) >= 7:
                    self.map_opt.sc_ambi = scoring[6]

        if seq is None:
            if fn_idx_out is None:
                r = cmappy.mm_idx_reader_open(str.encode(fn_idx_in), &self.idx_opt, NULL)
            else:
                r = cmappy.mm_idx_reader_open(str.encode(fn_idx_in), &self.idx_opt, str.encode(fn_idx_out))
            if r is not NULL:
                self._idx = cmappy.mm_idx_reader_read(r, n_threads) # NB: ONLY read the first part
                cmappy.mm_idx_reader_close(r)
                # the reader may hand back no index; only finish the setup when one was produced
                if self._idx is not NULL:
                    cmappy.mm_mapopt_update(&self.map_opt, self._idx)
                    cmappy.mm_idx_index_name(self._idx)
        else:
            self._idx = cmappy.mappy_idx_seq(self.idx_opt.w, self.idx_opt.k, self.idx_opt.flag&1, self.idx_opt.bucket_bits, str.encode(seq), len(seq))
            cmappy.mm_mapopt_update(&self.map_opt, self._idx)
            self.map_opt.mid_occ = 1000 # don't filter high-occ seeds

    def __dealloc__(self):
        if self._idx is not NULL:
            cmappy.mm_idx_destroy(self._idx)

    def __bool__(self):
        return (self._idx != NULL)

    def map(self, seq, seq2=None, buf=None, cs=False, MD=False, max_frag_len=None, extra_flags=None):
        """Align ``seq`` (and optionally its mate ``seq2``) and yield Alignment objects.

        ``buf`` is an optional ThreadBuffer to reuse; a fresh one is created
        otherwise. ``cs`` / ``MD`` request the corresponding tag strings.
        ``max_frag_len`` and ``extra_flags`` override the aligner-wide options
        for this call only. Nothing is yielded when there is no index.
        """
        cdef cmappy.mm_reg1_t *regs
        cdef cmappy.mm_hitpy_t h
        cdef ThreadBuffer b
        cdef int n_regs
        cdef char *cs_str = NULL
        cdef int l_cs_str, m_cs_str = 0
        cdef void *km
        cdef cmappy.mm_mapopt_t map_opt

        if self._idx == NULL: return
        # work on a copy so per-call overrides do not leak into the aligner
        map_opt = self.map_opt
        if max_frag_len is not None: map_opt.max_frag_len = max_frag_len
        if extra_flags is not None: map_opt.flag |= extra_flags

        if buf is None: b = ThreadBuffer()
        else: b = buf
        km = cmappy.mm_tbuf_get_km(b._b)

        _seq = seq if isinstance(seq, bytes) else seq.encode()
        if seq2 is None:
            regs = cmappy.mm_map_aux(self._idx, _seq, NULL, &n_regs, b._b, &map_opt)
        else:
            _seq2 = seq2 if isinstance(seq2, bytes) else seq2.encode()
            regs = cmappy.mm_map_aux(self._idx, _seq, _seq2, &n_regs, b._b, &map_opt)

        try:
            i = 0
            while i < n_regs:
                cmappy.mm_reg2hitpy(self._idx, &regs[i], &h)
                cigar, _cs, _MD = [], '', ''
                for k in range(h.n_cigar32): # convert the 32-bit CIGAR encoding to Python array
                    c = h.cigar32[k]
                    cigar.append([c>>4, c&0xf])
                if cs or MD: # generate the cs and/or the MD tag, if requested
                    if cs:
                        l_cs_str = cmappy.mm_gen_cs(km, &cs_str, &m_cs_str, self._idx, &regs[i], _seq, 1)
                        _cs = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode()
                    if MD:
                        l_cs_str = cmappy.mm_gen_MD(km, &cs_str, &m_cs_str, self._idx, &regs[i], _seq)
                        _MD = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode()
                yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.mlen, h.blen, h.NM, h.trans_strand, h.seg_id, _cs, _MD)
                cmappy.mm_free_reg1(&regs[i])
                i += 1
        finally:
            # also reached when the consumer stops iterating early:
            # release the hits that were never yielded, then the arrays
            while i < n_regs:
                cmappy.mm_free_reg1(&regs[i])
                i += 1
            free(regs)
            free(cs_str)

    def seq(self, str name, int start=0, int end=0x7fffffff):
        """Return the [start, end) slice of the indexed sequence ``name``, or None."""
        cdef int l
        cdef char *s
        if self._idx == NULL: return
        s = cmappy.mappy_fetch_seq(self._idx, name.encode(), start, end, &l)
        if l == 0: return None
        r = s[:l] if isinstance(s, str) else s[:l].decode()
        free(s)
        return r

    @property
    def k(self):
        # no index -> nothing to read (avoids dereferencing NULL)
        if self._idx == NULL: return
        return self._idx.k

    @property
    def w(self):
        if self._idx == NULL: return
        return self._idx.w

    @property
    def n_seq(self):
        if self._idx == NULL: return
        return self._idx.n_seq

    @property
    def seq_names(self):
        """Names of all sequences in the index, in index order."""
        cdef char *p
        if self._idx == NULL: return
        sn = []
        for i in range(self._idx.n_seq):
            p = self._idx.seq[i].name
            s = p if isinstance(p, str) else p.decode()
            sn.append(s)
        return sn
245
+
246
def fastx_read(fn, read_comment=False):
    """Iterate over the records of the sequence file ``fn``.

    Yields ``(name, seq, qual)`` tuples, or ``(name, seq, qual, comment)``
    when ``read_comment`` is true. ``qual`` and ``comment`` are None when the
    record has none. Nothing is yielded if the file cannot be opened.
    """
    cdef cmappy.kseq_t *ks
    ks = cmappy.mm_fastx_open(str.encode(fn))
    if ks is NULL: return None
    try:
        while cmappy.kseq_read(ks) >= 0:
            if ks.qual.l > 0: qual = ks.qual.s if isinstance(ks.qual.s, str) else ks.qual.s.decode()
            else: qual = None
            name = ks.name.s if isinstance(ks.name.s, str) else ks.name.s.decode()
            seq = ks.seq.s if isinstance(ks.seq.s, str) else ks.seq.s.decode()
            if read_comment:
                if ks.comment.l > 0: comment = ks.comment.s if isinstance(ks.comment.s, str) else ks.comment.s.decode()
                else: comment = None
                yield name, seq, qual, comment
            else:
                yield name, seq, qual
    finally:
        # close the reader even if the consumer abandons the generator early
        # or a record fails to decode
        cmappy.mm_fastx_close(ks)
262
+
263
def revcomp(seq):
    """Return the reverse complement of ``seq`` (str or bytes) as a str."""
    bseq = seq if isinstance(seq, bytes) else seq.encode()
    # measure the encoded buffer, not the str: the C routine works on bytes,
    # and the two lengths differ when the str holds non-ASCII characters
    l = len(bseq)
    cdef char *s = cmappy.mappy_revcomp(l, bseq)
    r = s[:l] if isinstance(s, str) else s[:l].decode()
    free(s)
    return r
270
+
271
def verbose(v=None):
    """Call mm_verbose_level() with ``v`` (or -1 when ``v`` is None) and return its result."""
    level = -1 if v is None else v
    return cmappy.mm_verbose_level(level)
@@ -0,0 +1,39 @@
1
+ #!/usr/bin/env python
2
+
3
+ import sys
4
+ import getopt
5
+ import mappy as mp
6
+
7
def main(argv):
    """Command-line front end: map every read of <query.fq> against <ref>.

    ``argv`` is the full argument vector including the program name. Prints
    the usage text and exits with status 1 when fewer than two positional
    arguments are given. Raises Exception when the index cannot be loaded
    or built.
    """
    opts, args = getopt.getopt(argv[1:], "x:n:m:k:w:r:c")
    if len(args) < 2:
        print("Usage: minimap2.py [options] <ref.fa>|<ref.mmi> <query.fq>")
        print("Options:")
        print(" -x STR preset: sr, map-pb, map-ont, asm5, asm10 or splice")
        print(" -n INT minimum number of minimizers")
        print(" -m INT minimum chaining score")
        print(" -k INT k-mer length")
        print(" -w INT minimizer window length")
        print(" -r INT band width")
        print(" -c output the cs tag")
        sys.exit(1)

    preset = min_cnt = min_sc = k = w = bw = None
    out_cs = False
    for opt, arg in opts:
        if opt == '-x': preset = arg
        elif opt == '-n': min_cnt = int(arg)
        elif opt == '-m': min_sc = int(arg)  # must be the name handed to Aligner below
        elif opt == '-r': bw = int(arg)
        elif opt == '-k': k = int(arg)
        elif opt == '-w': w = int(arg)
        elif opt == '-c': out_cs = True

    a = mp.Aligner(args[0], preset=preset, min_cnt=min_cnt, min_chain_score=min_sc, k=k, w=w, bw=bw)
    if not a: raise Exception("ERROR: failed to load/build index file '{}'".format(args[0]))
    for name, seq, qual in mp.fastx_read(args[1]): # read one sequence
        for h in a.map(seq, cs=out_cs): # traverse hits
            print('{}\t{}\t{}'.format(name, len(seq), h))
+
38
+ if __name__ == "__main__":
39
+ main(sys.argv)
@@ -0,0 +1,213 @@
1
+ #include <string.h>
2
+ #include <stdint.h>
3
+ #include <stdio.h>
4
+ #include "kalloc.h"
5
+ #include "kdq.h"
6
+ #include "kvec.h"
7
+ #include "sdust.h"
8
+
9
+ #define SD_WLEN 3
10
+ #define SD_WTOT (1<<(SD_WLEN<<1))
11
+ #define SD_WMSK (SD_WTOT - 1)
12
+
13
/* One candidate interval kept in the list P while scanning a window. */
typedef struct {
	int start;  // interval start, as stored by find_perfect()
	int finish; // interval end, as stored by find_perfect()
	int r;      // score of the interval; intervals are ranked by the ratio r/l
	int l;      // length paired with r in that ratio
} perf_intv_t;
17
+
18
+ typedef kvec_t(perf_intv_t) perf_intv_v;
19
+ typedef kvec_t(uint64_t) uint64_v;
20
+
21
+ KDQ_INIT(int)
22
+
23
#if defined(_NO_NT4_TBL) || defined(_SDUST_MAIN)
/*
 * Maps a byte to a nucleotide code: A/a -> 0, C/c -> 1, G/g -> 2, T/t -> 3.
 * Bytes 0..3 map to themselves; every other byte maps to 4.
 * Defined here only for standalone builds; otherwise the table is expected
 * to be provided by another translation unit.
 */
unsigned char seq_nt4_table[256] = {
	/* 0x00 */ 0, 1, 2, 3,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x10 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x20 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x30 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x40 */ 4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x50 */ 4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x60 */ 4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x70 */ 4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x80 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x90 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xa0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xb0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xc0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xd0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xe0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xf0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
};
#else
extern unsigned char seq_nt4_table[256];
#endif
45
+
46
+ struct sdust_buf_s {
47
+ kdq_t(int) *w;
48
+ perf_intv_v P; // the list of perfect intervals for the current window, sorted by descending start and then by ascending finish
49
+ uint64_v res; // the result
50
+ void *km; // memory pool
51
+ };
52
+
53
+ sdust_buf_t *sdust_buf_init(void *km)
54
+ {
55
+ sdust_buf_t *buf;
56
+ buf = (sdust_buf_t*)kcalloc(km, 1, sizeof(sdust_buf_t));
57
+ buf->km = km;
58
+ buf->w = kdq_init(int, buf->km);
59
+ kdq_resize(int, buf->w, 8);
60
+ return buf;
61
+ }
62
+
63
+ void sdust_buf_destroy(sdust_buf_t *buf)
64
+ {
65
+ if (buf == 0) return;
66
+ kdq_destroy(int, buf->w);
67
+ kfree(buf->km, buf->P.a); kfree(buf->km, buf->res.a); kfree(buf->km, buf);
68
+ }
69
+
70
+ static inline void shift_window(int t, kdq_t(int) *w, int T, int W, int *L, int *rw, int *rv, int *cw, int *cv)
71
+ {
72
+ int s;
73
+ if ((int)kdq_size(w) >= W - SD_WLEN + 1) { // TODO: is this right for SD_WLEN!=3?
74
+ s = *kdq_shift(int, w);
75
+ *rw -= --cw[s];
76
+ if (*L > (int)kdq_size(w))
77
+ --*L, *rv -= --cv[s];
78
+ }
79
+ kdq_push(int, w, t);
80
+ ++*L;
81
+ *rw += cw[t]++;
82
+ *rv += cv[t]++;
83
+ if (cv[t] * 10 > T<<1) {
84
+ do {
85
+ s = kdq_at(w, kdq_size(w) - *L);
86
+ *rv -= --cv[s];
87
+ --*L;
88
+ } while (s != t);
89
+ }
90
+ }
91
+
92
+ static inline void save_masked_regions(void *km, uint64_v *res, perf_intv_v *P, int start)
93
+ {
94
+ int i, saved = 0;
95
+ perf_intv_t *p;
96
+ if (P->n == 0 || P->a[P->n - 1].start >= start) return;
97
+ p = &P->a[P->n - 1];
98
+ if (res->n) {
99
+ int s = res->a[res->n - 1]>>32, f = (uint32_t)res->a[res->n - 1];
100
+ if (p->start <= f) // if overlapping with or adjacent to the previous interval
101
+ saved = 1, res->a[res->n - 1] = (uint64_t)s<<32 | (f > p->finish? f : p->finish);
102
+ }
103
+ if (!saved) kv_push(uint64_t, km, *res, (uint64_t)p->start<<32|p->finish);
104
+ for (i = P->n - 1; i >= 0 && P->a[i].start < start; --i); // remove perfect intervals that have falled out of the window
105
+ P->n = i + 1;
106
+ }
107
+
108
+ static void find_perfect(void *km, perf_intv_v *P, const kdq_t(int) *w, int T, int start, int L, int rv, const int *cv)
109
+ {
110
+ int c[SD_WTOT], r = rv, i, max_r = 0, max_l = 0;
111
+ memcpy(c, cv, SD_WTOT * sizeof(int));
112
+ for (i = (long)kdq_size(w) - L - 1; i >= 0; --i) {
113
+ int j, t = kdq_at(w, i), new_r, new_l;
114
+ r += c[t]++;
115
+ new_r = r, new_l = kdq_size(w) - i - 1;
116
+ if (new_r * 10 > T * new_l) {
117
+ for (j = 0; j < (int)P->n && P->a[j].start >= i + start; ++j) { // find insertion position
118
+ perf_intv_t *p = &P->a[j];
119
+ if (max_r == 0 || p->r * max_l > max_r * p->l)
120
+ max_r = p->r, max_l = p->l;
121
+ }
122
+ if (max_r == 0 || new_r * max_l >= max_r * new_l) { // then insert
123
+ max_r = new_r, max_l = new_l;
124
+ if (P->n == P->m) kv_resize(perf_intv_t, km, *P, P->n + 1);
125
+ memmove(&P->a[j+1], &P->a[j], (P->n - j) * sizeof(perf_intv_t)); // make room
126
+ ++P->n;
127
+ P->a[j].start = i + start, P->a[j].finish = kdq_size(w) + (SD_WLEN - 1) + start;
128
+ P->a[j].r = new_r, P->a[j].l = new_l;
129
+ }
130
+ }
131
+ }
132
+ }
133
+
134
+ const uint64_t *sdust_core(const uint8_t *seq, int l_seq, int T, int W, int *n, sdust_buf_t *buf)
135
+ {
136
+ int rv = 0, rw = 0, L = 0, cv[SD_WTOT], cw[SD_WTOT];
137
+ int i, start, l; // _start_: start of the current window; _l_: length of a contiguous A/C/G/T (sub)sequence
138
+ unsigned t; // current word
139
+
140
+ buf->P.n = buf->res.n = 0;
141
+ buf->w->front = buf->w->count = 0;
142
+ memset(cv, 0, SD_WTOT * sizeof(int));
143
+ memset(cw, 0, SD_WTOT * sizeof(int));
144
+ if (l_seq < 0) l_seq = strlen((const char*)seq);
145
+ for (i = l = t = 0; i <= l_seq; ++i) {
146
+ int b = i < l_seq? seq_nt4_table[seq[i]] : 4;
147
+ if (b < 4) { // an A/C/G/T base
148
+ ++l, t = (t<<2 | b) & SD_WMSK;
149
+ if (l >= SD_WLEN) { // we have seen a word
150
+ start = (l - W > 0? l - W : 0) + (i + 1 - l); // set the start of the current window
151
+ save_masked_regions(buf->km, &buf->res, &buf->P, start); // save intervals falling out of the current window?
152
+ shift_window(t, buf->w, T, W, &L, &rw, &rv, cw, cv);
153
+ if (rw * 10 > L * T)
154
+ find_perfect(buf->km, &buf->P, buf->w, T, start, L, rv, cv);
155
+ }
156
+ } else { // N or the end of sequence; N effectively breaks input into pieces of independent sequences
157
+ start = (l - W + 1 > 0? l - W + 1 : 0) + (i + 1 - l);
158
+ while (buf->P.n) save_masked_regions(buf->km, &buf->res, &buf->P, start++); // clear up unsaved perfect intervals
159
+ l = t = 0;
160
+ }
161
+ }
162
+ *n = buf->res.n;
163
+ return buf->res.a;
164
+ }
165
+
166
+ uint64_t *sdust(void *km, const uint8_t *seq, int l_seq, int T, int W, int *n)
167
+ {
168
+ uint64_t *ret;
169
+ sdust_buf_t *buf;
170
+ buf = sdust_buf_init(km);
171
+ ret = (uint64_t*)sdust_core(seq, l_seq, T, W, n, buf);
172
+ buf->res.a = 0;
173
+ sdust_buf_destroy(buf);
174
+ return ret;
175
+ }
176
+
177
+ #ifdef _SDUST_MAIN
178
+ #include <zlib.h>
179
+ #include <stdio.h>
180
+ #include "ketopt.h"
181
+ #include "kseq.h"
182
+ KSEQ_INIT(gzFile, gzread)
183
+
184
+ int main(int argc, char *argv[])
185
+ {
186
+ gzFile fp;
187
+ kseq_t *ks;
188
+ int W = 64, T = 20, c;
189
+ ketopt_t o = KETOPT_INIT;
190
+
191
+ while ((c = ketopt(&o, argc, argv, 1, "w:t:", 0)) >= 0) {
192
+ if (c == 'w') W = atoi(o.arg);
193
+ else if (c == 't') T = atoi(o.arg);
194
+ }
195
+ if (o.ind == argc) {
196
+ fprintf(stderr, "Usage: sdust [-w %d] [-t %d] <in.fa>\n", W, T);
197
+ return 1;
198
+ }
199
+ fp = strcmp(argv[o.ind], "-")? gzopen(argv[o.ind], "r") : gzdopen(fileno(stdin), "r");
200
+ ks = kseq_init(fp);
201
+ while (kseq_read(ks) >= 0) {
202
+ uint64_t *r;
203
+ int i, n;
204
+ r = sdust(0, (uint8_t*)ks->seq.s, -1, T, W, &n);
205
+ for (i = 0; i < n; ++i)
206
+ printf("%s\t%d\t%d\n", ks->name.s, (int)(r[i]>>32), (int)r[i]);
207
+ free(r);
208
+ }
209
+ kseq_destroy(ks);
210
+ gzclose(fp);
211
+ return 0;
212
+ }
213
+ #endif
@@ -0,0 +1,25 @@
1
+ #ifndef SDUST_H
2
+ #define SDUST_H
3
+
4
+ #include <stdint.h>
5
+
6
+ #ifdef __cplusplus
7
+ extern "C" {
8
+ #endif
9
+
10
+ struct sdust_buf_s;
11
+ typedef struct sdust_buf_s sdust_buf_t;
12
+
13
+ // the simple interface
14
+ uint64_t *sdust(void *km, const uint8_t *seq, int l_seq, int T, int W, int *n);
15
+
16
+ // the following interface dramatically reduces heap allocations when sdust is called frequently.
17
+ sdust_buf_t *sdust_buf_init(void *km);
18
+ void sdust_buf_destroy(sdust_buf_t *buf);
19
+ const uint64_t *sdust_core(const uint8_t *seq, int l_seq, int T, int W, int *n, sdust_buf_t *buf);
20
+
21
+ #ifdef __cplusplus
22
+ }
23
+ #endif
24
+
25
+ #endif
@@ -0,0 +1,131 @@
1
+ #include "mmpriv.h"
2
+ #include "kalloc.h"
3
+ #include "ksort.h"
4
+
5
+ void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac)
6
+ {
7
+ mm128_t *a;
8
+ size_t i, j, st;
9
+ if (mv->n <= q_occ_max || q_occ_frac <= 0.0f || q_occ_max <= 0) return;
10
+ KMALLOC(km, a, mv->n);
11
+ for (i = 0; i < mv->n; ++i)
12
+ a[i].x = mv->a[i].x, a[i].y = i;
13
+ radix_sort_128x(a, a + mv->n);
14
+ for (st = 0, i = 1; i <= mv->n; ++i) {
15
+ if (i == mv->n || a[i].x != a[st].x) {
16
+ int32_t cnt = i - st;
17
+ if (cnt > q_occ_max && cnt > mv->n * q_occ_frac)
18
+ for (j = st; j < i; ++j)
19
+ mv->a[a[j].y].x = 0;
20
+ st = i;
21
+ }
22
+ }
23
+ kfree(km, a);
24
+ for (i = j = 0; i < mv->n; ++i)
25
+ if (mv->a[i].x != 0)
26
+ mv->a[j++] = mv->a[i];
27
+ mv->n = j;
28
+ }
29
+
30
+ mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv, int32_t *n_m_)
31
+ {
32
+ mm_seed_t *m;
33
+ size_t i;
34
+ int32_t k;
35
+ m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t));
36
+ for (i = k = 0; i < mv->n; ++i) {
37
+ const uint64_t *cr;
38
+ mm_seed_t *q;
39
+ mm128_t *p = &mv->a[i];
40
+ uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
41
+ int t;
42
+ cr = mm_idx_get(mi, p->x>>8, &t);
43
+ if (t == 0) continue;
44
+ q = &m[k++];
45
+ q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
46
+ q->is_tandem = q->flt = 0;
47
+ if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1;
48
+ if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1;
49
+ }
50
+ *n_m_ = k;
51
+ return m;
52
+ }
53
+
54
+ #define MAX_MAX_HIGH_OCC 128
55
+
56
+ void mm_seed_select(int32_t n, mm_seed_t *a, int len, int max_occ, int max_max_occ, int dist)
57
+ { // for high-occ minimizers, choose up to max_high_occ in each high-occ streak
58
+ extern void ks_heapdown_uint64_t(size_t i, size_t n, uint64_t*);
59
+ extern void ks_heapmake_uint64_t(size_t n, uint64_t*);
60
+ int32_t i, last0, m;
61
+ uint64_t b[MAX_MAX_HIGH_OCC]; // this is to avoid a heap allocation
62
+
63
+ if (n == 0 || n == 1) return;
64
+ for (i = m = 0; i < n; ++i)
65
+ if (a[i].n > max_occ) ++m;
66
+ if (m == 0) return; // no high-frequency k-mers; do nothing
67
+ for (i = 0, last0 = -1; i <= n; ++i) {
68
+ if (i == n || a[i].n <= max_occ) {
69
+ if (i - last0 > 1) {
70
+ int32_t ps = last0 < 0? 0 : (uint32_t)a[last0].q_pos>>1;
71
+ int32_t pe = i == n? len : (uint32_t)a[i].q_pos>>1;
72
+ int32_t j, k, st = last0 + 1, en = i;
73
+ int32_t max_high_occ = (int32_t)((double)(pe - ps) / dist + .499);
74
+ if (max_high_occ > 0) {
75
+ if (max_high_occ > MAX_MAX_HIGH_OCC)
76
+ max_high_occ = MAX_MAX_HIGH_OCC;
77
+ for (j = st, k = 0; j < en && k < max_high_occ; ++j, ++k)
78
+ b[k] = (uint64_t)a[j].n<<32 | j;
79
+ ks_heapmake_uint64_t(k, b); // initialize the binomial heap
80
+ for (; j < en; ++j) { // if there are more, choose top max_high_occ
81
+ if (a[j].n < (int32_t)(b[0]>>32)) { // then update the heap
82
+ b[0] = (uint64_t)a[j].n<<32 | j;
83
+ ks_heapdown_uint64_t(0, k, b);
84
+ }
85
+ }
86
+ for (j = 0; j < k; ++j) a[(uint32_t)b[j]].flt = 1;
87
+ }
88
+ for (j = st; j < en; ++j) a[j].flt ^= 1;
89
+ for (j = st; j < en; ++j)
90
+ if (a[j].n > max_max_occ)
91
+ a[j].flt = 1;
92
+ }
93
+ last0 = i;
94
+ }
95
+ }
96
+ }
97
+
98
+ mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos)
99
+ {
100
+ int rep_st = 0, rep_en = 0, n_m, n_m0;
101
+ size_t i;
102
+ mm_seed_t *m;
103
+ *n_mini_pos = 0;
104
+ *mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t));
105
+ m = mm_seed_collect_all(km, mi, mv, &n_m0);
106
+ if (dist > 0 && max_max_occ > max_occ) {
107
+ mm_seed_select(n_m0, m, qlen, max_occ, max_max_occ, dist);
108
+ } else {
109
+ for (i = 0; i < n_m0; ++i)
110
+ if (m[i].n > max_occ)
111
+ m[i].flt = 1;
112
+ }
113
+ for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < n_m0; ++i) {
114
+ mm_seed_t *q = &m[i];
115
+ //fprintf(stderr, "X\t%d\t%d\t%d\n", q->q_pos>>1, q->n, q->flt);
116
+ if (q->flt) {
117
+ int en = (q->q_pos >> 1) + 1, st = en - q->q_span;
118
+ if (st > rep_en) {
119
+ *rep_len += rep_en - rep_st;
120
+ rep_st = st, rep_en = en;
121
+ } else rep_en = en;
122
+ } else {
123
+ *n_a += q->n;
124
+ (*mini_pos)[(*n_mini_pos)++] = (uint64_t)q->q_span<<32 | q->q_pos>>1;
125
+ m[n_m++] = *q;
126
+ }
127
+ }
128
+ *rep_len += rep_en - rep_st;
129
+ *_n_m = n_m;
130
+ return m;
131
+ }