minimap2 0.2.22.0 → 0.2.24.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +55 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +821 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +369 -0
  41. data/ext/minimap2/main.c +459 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +410 -0
  44. data/ext/minimap2/minimap2.1 +725 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +132 -0
  50. data/ext/minimap2/options.c +234 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/lib/minimap2/aligner.rb +4 -4
  93. data/lib/minimap2/alignment.rb +11 -11
  94. data/lib/minimap2/ffi/constants.rb +20 -16
  95. data/lib/minimap2/ffi/functions.rb +5 -0
  96. data/lib/minimap2/ffi.rb +4 -5
  97. data/lib/minimap2/version.rb +2 -2
  98. data/lib/minimap2.rb +51 -15
  99. metadata +97 -79
  100. data/lib/minimap2/ffi_helper.rb +0 -53
  101. data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,273 @@
1
+ from libc.stdint cimport uint8_t, int8_t
2
+ from libc.stdlib cimport free
3
+ cimport cmappy
4
+ import sys
5
+
6
+ __version__ = '2.24'
7
+
8
+ cmappy.mm_reset_timer()
9
+
10
cdef class Alignment:
    """Read-only record describing one alignment hit.

    All values are supplied to the constructor and exposed through
    properties; nothing is computed lazily except the CIGAR string and the
    tab-separated text form returned by str().
    """
    cdef int _ctg_len, _r_st, _r_en
    cdef int _q_st, _q_en
    cdef int _NM, _mlen, _blen
    cdef int8_t _strand, _trans_strand
    cdef uint8_t _mapq, _is_primary
    cdef int _seg_id
    cdef _ctg, _cigar, _cs, _MD # these are python objects

    def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, mlen, blen, NM, trans_strand, seg_id, cs_str, MD_str):
        # The contig name may arrive as bytes; store it as str either way.
        if isinstance(ctg, str):
            self._ctg = ctg
        else:
            self._ctg = ctg.decode()
        # Reference (contig) coordinates.
        self._ctg_len = cl
        self._r_st = cs
        self._r_en = ce
        # Strand and query coordinates.
        self._strand = strand
        self._q_st = qs
        self._q_en = qe
        # Alignment statistics.
        self._NM = NM
        self._mlen = mlen
        self._blen = blen
        self._mapq = mapq
        self._cigar = cigar
        self._is_primary = is_primary
        self._trans_strand = trans_strand
        self._seg_id = seg_id
        # Optional tag strings; empty when not requested.
        self._cs = cs_str
        self._MD = MD_str

    @property
    def ctg(self):
        return self._ctg

    @property
    def ctg_len(self):
        return self._ctg_len

    @property
    def r_st(self):
        return self._r_st

    @property
    def r_en(self):
        return self._r_en

    @property
    def strand(self):
        return self._strand

    @property
    def trans_strand(self):
        return self._trans_strand

    @property
    def blen(self):
        return self._blen

    @property
    def mlen(self):
        return self._mlen

    @property
    def NM(self):
        return self._NM

    @property
    def is_primary(self):
        # Stored as an integer flag; exposed as a bool.
        return (self._is_primary != 0)

    @property
    def q_st(self):
        return self._q_st

    @property
    def q_en(self):
        return self._q_en

    @property
    def mapq(self):
        return self._mapq

    @property
    def cigar(self):
        return self._cigar

    @property
    def read_num(self):
        # The segment id is 0-based; read_num is 1-based.
        return self._seg_id + 1

    @property
    def cs(self):
        return self._cs

    @property
    def MD(self):
        return self._MD

    @property
    def cigar_str(self):
        # Each CIGAR entry is indexed as [length, op]; op selects a letter
        # from the operator alphabet below.
        pieces = []
        for item in self._cigar:
            pieces.append(str(item[0]) + 'MIDNSHP=XB'[item[1]])
        return "".join(pieces)

    def __str__(self):
        # Map the numeric strand / flags onto their text forms.
        strand = '+' if self._strand > 0 else ('-' if self._strand < 0 else '?')
        tp = 'tp:A:P' if self._is_primary != 0 else 'tp:A:S'
        if self._trans_strand > 0:
            ts = 'ts:A:+'
        elif self._trans_strand < 0:
            ts = 'ts:A:-'
        else:
            ts = 'ts:A:.'
        fields = [
            str(self._q_st), str(self._q_en), strand,
            self._ctg, str(self._ctg_len), str(self._r_st), str(self._r_en),
            str(self._mlen), str(self._blen), str(self._mapq),
            tp, ts, "cg:Z:" + self.cigar_str,
        ]
        # The cs tag is appended only when one was generated.
        if self._cs != "":
            fields.append("cs:Z:" + self._cs)
        return "\t".join(fields)
100
+
101
cdef class ThreadBuffer:
    """Owner of one mm_tbuf_t: created with mm_tbuf_init(), released with mm_tbuf_destroy()."""
    cdef cmappy.mm_tbuf_t *_b

    def __cinit__(self):
        # Acquire the C-side buffer as soon as the object exists.
        self._b = cmappy.mm_tbuf_init()

    def __dealloc__(self):
        # Give the buffer back when the Python object is collected.
        cmappy.mm_tbuf_destroy(self._b)
109
+
110
cdef class Aligner:
    """A minimap2 index together with its indexing and mapping options.

    The index is either loaded/built from a file (fn_idx_in) or built from a
    single in-memory sequence (seq). When no index could be obtained the
    object is falsy (see __bool__) and map(), seq(), seq_names, k, w and
    n_seq all yield None instead of touching the missing index.
    """
    cdef cmappy.mm_idx_t *_idx
    cdef cmappy.mm_idxopt_t idx_opt
    cdef cmappy.mm_mapopt_t map_opt

    def __cinit__(self, fn_idx_in=None, preset=None, k=None, w=None, min_cnt=None, min_chain_score=None, min_dp_score=None, bw=None, best_n=None, n_threads=3, fn_idx_out=None, max_frag_len=None, extra_flags=None, seq=None, scoring=None):
        cdef cmappy.mm_idx_reader_t *r

        self._idx = NULL
        cmappy.mm_set_opt(NULL, &self.idx_opt, &self.map_opt) # set the default options
        if preset is not None:
            cmappy.mm_set_opt(str.encode(preset), &self.idx_opt, &self.map_opt) # apply preset
        self.map_opt.flag |= 4 # always perform alignment
        self.idx_opt.batch_size = 0x7fffffffffffffffL # always build a uni-part index
        # Explicit keyword arguments override whatever the preset selected.
        if k is not None: self.idx_opt.k = k
        if w is not None: self.idx_opt.w = w
        if min_cnt is not None: self.map_opt.min_cnt = min_cnt
        if min_chain_score is not None: self.map_opt.min_chain_score = min_chain_score
        if min_dp_score is not None: self.map_opt.min_dp_max = min_dp_score
        if bw is not None: self.map_opt.bw = bw
        if best_n is not None: self.map_opt.best_n = best_n
        if max_frag_len is not None: self.map_opt.max_frag_len = max_frag_len
        if extra_flags is not None: self.map_opt.flag |= extra_flags
        # scoring: (a, b, q, e[, q2, e2[, sc_ambi]]); q2/e2 default to q/e.
        if scoring is not None and len(scoring) >= 4:
            self.map_opt.a, self.map_opt.b = scoring[0], scoring[1]
            self.map_opt.q, self.map_opt.e = scoring[2], scoring[3]
            self.map_opt.q2, self.map_opt.e2 = self.map_opt.q, self.map_opt.e
            if len(scoring) >= 6:
                self.map_opt.q2, self.map_opt.e2 = scoring[4], scoring[5]
                if len(scoring) >= 7:
                    self.map_opt.sc_ambi = scoring[6]

        if seq is None:
            if fn_idx_out is None:
                r = cmappy.mm_idx_reader_open(str.encode(fn_idx_in), &self.idx_opt, NULL)
            else:
                r = cmappy.mm_idx_reader_open(str.encode(fn_idx_in), &self.idx_opt, str.encode(fn_idx_out))
            if r is not NULL:
                self._idx = cmappy.mm_idx_reader_read(r, n_threads) # NB: ONLY read the first part
                cmappy.mm_idx_reader_close(r)
                # Only finish setup when an index was actually produced; the
                # rest of this class already treats a NULL index as a valid
                # "failed to load" state.
                if self._idx is not NULL:
                    cmappy.mm_mapopt_update(&self.map_opt, self._idx)
                    cmappy.mm_idx_index_name(self._idx)
        else:
            self._idx = cmappy.mappy_idx_seq(self.idx_opt.w, self.idx_opt.k, self.idx_opt.flag&1, self.idx_opt.bucket_bits, str.encode(seq), len(seq))
            if self._idx is not NULL:
                cmappy.mm_mapopt_update(&self.map_opt, self._idx)
            self.map_opt.mid_occ = 1000 # don't filter high-occ seeds

    def __dealloc__(self):
        if self._idx is not NULL:
            cmappy.mm_idx_destroy(self._idx)

    def __bool__(self):
        # True only when an index is available.
        return (self._idx != NULL)

    def map(self, seq, seq2=None, buf=None, cs=False, MD=False, max_frag_len=None, extra_flags=None):
        """Generator yielding one Alignment per hit of seq (and optionally seq2).

        buf may be a ThreadBuffer to reuse; otherwise a fresh one is created.
        cs / MD request generation of the corresponding tag strings.
        max_frag_len / extra_flags override a per-call copy of the mapping
        options and do not modify the Aligner itself.
        """
        cdef cmappy.mm_reg1_t *regs
        cdef cmappy.mm_hitpy_t h
        cdef ThreadBuffer b
        cdef int n_regs
        cdef char *cs_str = NULL
        cdef int l_cs_str, m_cs_str = 0
        cdef void *km
        cdef cmappy.mm_mapopt_t map_opt

        if self._idx == NULL: return
        map_opt = self.map_opt
        if max_frag_len is not None: map_opt.max_frag_len = max_frag_len
        if extra_flags is not None: map_opt.flag |= extra_flags

        if buf is None: b = ThreadBuffer()
        else: b = buf
        km = cmappy.mm_tbuf_get_km(b._b)

        _seq = seq if isinstance(seq, bytes) else seq.encode()
        if seq2 is None:
            regs = cmappy.mm_map_aux(self._idx, _seq, NULL, &n_regs, b._b, &map_opt)
        else:
            _seq2 = seq2 if isinstance(seq2, bytes) else seq2.encode()
            regs = cmappy.mm_map_aux(self._idx, _seq, _seq2, &n_regs, b._b, &map_opt)

        try:
            i = 0
            while i < n_regs:
                cmappy.mm_reg2hitpy(self._idx, &regs[i], &h)
                cigar, _cs, _MD = [], '', ''
                for k in range(h.n_cigar32): # convert the 32-bit CIGAR encoding to Python array
                    c = h.cigar32[k]
                    cigar.append([c>>4, c&0xf])
                if cs or MD: # generate the cs and/or the MD tag, if requested
                    if cs:
                        l_cs_str = cmappy.mm_gen_cs(km, &cs_str, &m_cs_str, self._idx, &regs[i], _seq, 1)
                        _cs = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode()
                    if MD:
                        l_cs_str = cmappy.mm_gen_MD(km, &cs_str, &m_cs_str, self._idx, &regs[i], _seq)
                        _MD = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode()
                yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.mlen, h.blen, h.NM, h.trans_strand, h.seg_id, _cs, _MD)
                cmappy.mm_free_reg1(&regs[i])
                i += 1
        finally:
            # Runs on normal exhaustion and when the generator is closed
            # early: release every hit not yet consumed, then the arrays.
            while i < n_regs:
                cmappy.mm_free_reg1(&regs[i])
                i += 1
            free(regs)
            free(cs_str)

    def seq(self, str name, int start=0, int end=0x7fffffff):
        """Return the [start, end) slice of the indexed sequence called name, or None."""
        cdef int l
        cdef char *s
        if self._idx == NULL: return
        s = cmappy.mappy_fetch_seq(self._idx, name.encode(), start, end, &l)
        if l == 0: return None
        r = s[:l] if isinstance(s, str) else s[:l].decode()
        free(s)
        return r

    @property
    def k(self):
        # None when no index is loaded, matching seq_names.
        if self._idx == NULL: return
        return self._idx.k

    @property
    def w(self):
        if self._idx == NULL: return
        return self._idx.w

    @property
    def n_seq(self):
        if self._idx == NULL: return
        return self._idx.n_seq

    @property
    def seq_names(self):
        cdef char *p
        if self._idx == NULL: return
        sn = []
        for i in range(self._idx.n_seq):
            p = self._idx.seq[i].name
            s = p if isinstance(p, str) else p.decode()
            sn.append(s)
        return sn
245
+
246
def fastx_read(fn, read_comment=False):
    """Generator over the records of a FASTA/FASTQ file opened with mm_fastx_open().

    Yields (name, seq, qual) tuples, or (name, seq, qual, comment) when
    read_comment is true. qual / comment are None when the record has none.
    Produces nothing if the file cannot be opened.
    """
    cdef cmappy.kseq_t *ks
    ks = cmappy.mm_fastx_open(str.encode(fn))
    if ks is NULL: return None
    try:
        while cmappy.kseq_read(ks) >= 0:
            if ks.qual.l > 0: qual = ks.qual.s if isinstance(ks.qual.s, str) else ks.qual.s.decode()
            else: qual = None
            name = ks.name.s if isinstance(ks.name.s, str) else ks.name.s.decode()
            seq = ks.seq.s if isinstance(ks.seq.s, str) else ks.seq.s.decode()
            if read_comment:
                if ks.comment.l > 0: comment = ks.comment.s if isinstance(ks.comment.s, str) else ks.comment.s.decode()
                else: comment = None
                yield name, seq, qual, comment
            else:
                yield name, seq, qual
    finally:
        # Close the reader even when the consumer stops iterating early or a
        # decode error propagates; otherwise the handle would leak.
        cmappy.mm_fastx_close(ks)
262
+
263
def revcomp(seq):
    """Return the reverse complement of seq (str or bytes) as a str."""
    cdef char *rc
    n = len(seq)
    if isinstance(seq, bytes):
        raw = seq
    else:
        raw = seq.encode()
    rc = cmappy.mappy_revcomp(n, raw)
    # Copy the C buffer into a Python object before releasing it.
    out = rc[:n] if isinstance(rc, str) else rc[:n].decode()
    free(rc)
    return out
270
+
271
def verbose(v=None):
    """Forward v to mm_verbose_level() and return its result; -1 is passed when v is omitted."""
    level = -1 if v is None else v
    return cmappy.mm_verbose_level(level)
@@ -0,0 +1,39 @@
1
+ #!/usr/bin/env python
2
+
3
+ import sys
4
+ import getopt
5
+ import mappy as mp
6
+
7
def main(argv):
    """Command-line driver: map every read in a FASTA/FASTQ file against a reference.

    argv is the full argument vector including the program name. Prints a
    usage message and exits with status 1 when fewer than two positional
    arguments are given. One tab-separated line is printed per hit.
    """
    opts, args = getopt.getopt(argv[1:], "x:n:m:k:w:r:c")
    if len(args) < 2:
        print("Usage: minimap2.py [options] <ref.fa>|<ref.mmi> <query.fq>")
        print("Options:")
        print(" -x STR preset: sr, map-pb, map-ont, asm5, asm10 or splice")
        print(" -n INT mininum number of minimizers")
        print(" -m INT mininum chaining score")
        print(" -k INT k-mer length")
        print(" -w INT minimizer window length")
        print(" -r INT band width")
        print(" -c output the cs tag")
        sys.exit(1)

    preset = min_cnt = min_sc = k = w = bw = None
    out_cs = False
    for opt, arg in opts:
        if opt == '-x': preset = arg
        elif opt == '-n': min_cnt = int(arg)
        # Store into min_sc, the variable that is actually forwarded to the
        # Aligner below; a differently named variable would drop the option.
        elif opt == '-m': min_sc = int(arg)
        elif opt == '-r': bw = int(arg)
        elif opt == '-k': k = int(arg)
        elif opt == '-w': w = int(arg)
        elif opt == '-c': out_cs = True

    a = mp.Aligner(args[0], preset=preset, min_cnt=min_cnt, min_chain_score=min_sc, k=k, w=w, bw=bw)
    if not a: raise Exception("ERROR: failed to load/build index file '{}'".format(args[0]))
    for name, seq, qual in mp.fastx_read(args[1]): # read one sequence
        for h in a.map(seq, cs=out_cs): # traverse hits
            print('{}\t{}\t{}'.format(name, len(seq), h))
37
+
38
+ if __name__ == "__main__":
39
+ main(sys.argv)
@@ -0,0 +1,213 @@
1
+ #include <string.h>
2
+ #include <stdint.h>
3
+ #include <stdio.h>
4
+ #include "kalloc.h"
5
+ #include "kdq.h"
6
+ #include "kvec.h"
7
+ #include "sdust.h"
8
+
9
+ #define SD_WLEN 3
10
+ #define SD_WTOT (1<<(SD_WLEN<<1))
11
+ #define SD_WMSK (SD_WTOT - 1)
12
+
13
// One candidate "perfect interval" kept in the list P.
typedef struct {
    int start;  // first position of the interval (packed into the high 32 bits of a result)
    int finish; // end position of the interval (packed into the low 32 bits of a result)
    int r;      // score of the interval as computed in find_perfect()
    int l;      // length the score is measured against; intervals are ranked by r/l
} perf_intv_t;
17
+
18
+ typedef kvec_t(perf_intv_t) perf_intv_v;
19
+ typedef kvec_t(uint64_t) uint64_v;
20
+
21
+ KDQ_INIT(int)
22
+
23
+ #if defined(_NO_NT4_TBL) || defined(_SDUST_MAIN)
24
+ unsigned char seq_nt4_table[256] = {
25
+ 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
26
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
27
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
28
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
29
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
30
+ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
31
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
32
+ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
33
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
34
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
35
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
36
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
37
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
38
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
39
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
40
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
41
+ };
42
+ #else
43
+ extern unsigned char seq_nt4_table[256];
44
+ #endif
45
+
46
+ struct sdust_buf_s {
47
+ kdq_t(int) *w;
48
+ perf_intv_v P; // the list of perfect intervals for the current window, sorted by descending start and then by ascending finish
49
+ uint64_v res; // the result
50
+ void *km; // memory pool
51
+ };
52
+
53
+ sdust_buf_t *sdust_buf_init(void *km)
54
+ {
55
+ sdust_buf_t *buf;
56
+ buf = (sdust_buf_t*)kcalloc(km, 1, sizeof(sdust_buf_t));
57
+ buf->km = km;
58
+ buf->w = kdq_init(int, buf->km);
59
+ kdq_resize(int, buf->w, 8);
60
+ return buf;
61
+ }
62
+
63
+ void sdust_buf_destroy(sdust_buf_t *buf)
64
+ {
65
+ if (buf == 0) return;
66
+ kdq_destroy(int, buf->w);
67
+ kfree(buf->km, buf->P.a); kfree(buf->km, buf->res.a); kfree(buf->km, buf);
68
+ }
69
+
70
+ static inline void shift_window(int t, kdq_t(int) *w, int T, int W, int *L, int *rw, int *rv, int *cw, int *cv)
71
+ {
72
+ int s;
73
+ if ((int)kdq_size(w) >= W - SD_WLEN + 1) { // TODO: is this right for SD_WLEN!=3?
74
+ s = *kdq_shift(int, w);
75
+ *rw -= --cw[s];
76
+ if (*L > (int)kdq_size(w))
77
+ --*L, *rv -= --cv[s];
78
+ }
79
+ kdq_push(int, w, t);
80
+ ++*L;
81
+ *rw += cw[t]++;
82
+ *rv += cv[t]++;
83
+ if (cv[t] * 10 > T<<1) {
84
+ do {
85
+ s = kdq_at(w, kdq_size(w) - *L);
86
+ *rv -= --cv[s];
87
+ --*L;
88
+ } while (s != t);
89
+ }
90
+ }
91
+
92
+ static inline void save_masked_regions(void *km, uint64_v *res, perf_intv_v *P, int start)
93
+ {
94
+ int i, saved = 0;
95
+ perf_intv_t *p;
96
+ if (P->n == 0 || P->a[P->n - 1].start >= start) return;
97
+ p = &P->a[P->n - 1];
98
+ if (res->n) {
99
+ int s = res->a[res->n - 1]>>32, f = (uint32_t)res->a[res->n - 1];
100
+ if (p->start <= f) // if overlapping with or adjacent to the previous interval
101
+ saved = 1, res->a[res->n - 1] = (uint64_t)s<<32 | (f > p->finish? f : p->finish);
102
+ }
103
+ if (!saved) kv_push(uint64_t, km, *res, (uint64_t)p->start<<32|p->finish);
104
+ for (i = P->n - 1; i >= 0 && P->a[i].start < start; --i); // remove perfect intervals that have falled out of the window
105
+ P->n = i + 1;
106
+ }
107
+
108
+ static void find_perfect(void *km, perf_intv_v *P, const kdq_t(int) *w, int T, int start, int L, int rv, const int *cv)
109
+ {
110
+ int c[SD_WTOT], r = rv, i, max_r = 0, max_l = 0;
111
+ memcpy(c, cv, SD_WTOT * sizeof(int));
112
+ for (i = (long)kdq_size(w) - L - 1; i >= 0; --i) {
113
+ int j, t = kdq_at(w, i), new_r, new_l;
114
+ r += c[t]++;
115
+ new_r = r, new_l = kdq_size(w) - i - 1;
116
+ if (new_r * 10 > T * new_l) {
117
+ for (j = 0; j < (int)P->n && P->a[j].start >= i + start; ++j) { // find insertion position
118
+ perf_intv_t *p = &P->a[j];
119
+ if (max_r == 0 || p->r * max_l > max_r * p->l)
120
+ max_r = p->r, max_l = p->l;
121
+ }
122
+ if (max_r == 0 || new_r * max_l >= max_r * new_l) { // then insert
123
+ max_r = new_r, max_l = new_l;
124
+ if (P->n == P->m) kv_resize(perf_intv_t, km, *P, P->n + 1);
125
+ memmove(&P->a[j+1], &P->a[j], (P->n - j) * sizeof(perf_intv_t)); // make room
126
+ ++P->n;
127
+ P->a[j].start = i + start, P->a[j].finish = kdq_size(w) + (SD_WLEN - 1) + start;
128
+ P->a[j].r = new_r, P->a[j].l = new_l;
129
+ }
130
+ }
131
+ }
132
+ }
133
+
134
+ const uint64_t *sdust_core(const uint8_t *seq, int l_seq, int T, int W, int *n, sdust_buf_t *buf)
135
+ {
136
+ int rv = 0, rw = 0, L = 0, cv[SD_WTOT], cw[SD_WTOT];
137
+ int i, start, l; // _start_: start of the current window; _l_: length of a contiguous A/C/G/T (sub)sequence
138
+ unsigned t; // current word
139
+
140
+ buf->P.n = buf->res.n = 0;
141
+ buf->w->front = buf->w->count = 0;
142
+ memset(cv, 0, SD_WTOT * sizeof(int));
143
+ memset(cw, 0, SD_WTOT * sizeof(int));
144
+ if (l_seq < 0) l_seq = strlen((const char*)seq);
145
+ for (i = l = t = 0; i <= l_seq; ++i) {
146
+ int b = i < l_seq? seq_nt4_table[seq[i]] : 4;
147
+ if (b < 4) { // an A/C/G/T base
148
+ ++l, t = (t<<2 | b) & SD_WMSK;
149
+ if (l >= SD_WLEN) { // we have seen a word
150
+ start = (l - W > 0? l - W : 0) + (i + 1 - l); // set the start of the current window
151
+ save_masked_regions(buf->km, &buf->res, &buf->P, start); // save intervals falling out of the current window?
152
+ shift_window(t, buf->w, T, W, &L, &rw, &rv, cw, cv);
153
+ if (rw * 10 > L * T)
154
+ find_perfect(buf->km, &buf->P, buf->w, T, start, L, rv, cv);
155
+ }
156
+ } else { // N or the end of sequence; N effectively breaks input into pieces of independent sequences
157
+ start = (l - W + 1 > 0? l - W + 1 : 0) + (i + 1 - l);
158
+ while (buf->P.n) save_masked_regions(buf->km, &buf->res, &buf->P, start++); // clear up unsaved perfect intervals
159
+ l = t = 0;
160
+ }
161
+ }
162
+ *n = buf->res.n;
163
+ return buf->res.a;
164
+ }
165
+
166
+ uint64_t *sdust(void *km, const uint8_t *seq, int l_seq, int T, int W, int *n)
167
+ {
168
+ uint64_t *ret;
169
+ sdust_buf_t *buf;
170
+ buf = sdust_buf_init(km);
171
+ ret = (uint64_t*)sdust_core(seq, l_seq, T, W, n, buf);
172
+ buf->res.a = 0;
173
+ sdust_buf_destroy(buf);
174
+ return ret;
175
+ }
176
+
177
+ #ifdef _SDUST_MAIN
178
+ #include <zlib.h>
179
+ #include <stdio.h>
180
+ #include "ketopt.h"
181
+ #include "kseq.h"
182
+ KSEQ_INIT(gzFile, gzread)
183
+
184
+ int main(int argc, char *argv[])
185
+ {
186
+ gzFile fp;
187
+ kseq_t *ks;
188
+ int W = 64, T = 20, c;
189
+ ketopt_t o = KETOPT_INIT;
190
+
191
+ while ((c = ketopt(&o, argc, argv, 1, "w:t:", 0)) >= 0) {
192
+ if (c == 'w') W = atoi(o.arg);
193
+ else if (c == 't') T = atoi(o.arg);
194
+ }
195
+ if (o.ind == argc) {
196
+ fprintf(stderr, "Usage: sdust [-w %d] [-t %d] <in.fa>\n", W, T);
197
+ return 1;
198
+ }
199
+ fp = strcmp(argv[o.ind], "-")? gzopen(argv[o.ind], "r") : gzdopen(fileno(stdin), "r");
200
+ ks = kseq_init(fp);
201
+ while (kseq_read(ks) >= 0) {
202
+ uint64_t *r;
203
+ int i, n;
204
+ r = sdust(0, (uint8_t*)ks->seq.s, -1, T, W, &n);
205
+ for (i = 0; i < n; ++i)
206
+ printf("%s\t%d\t%d\n", ks->name.s, (int)(r[i]>>32), (int)r[i]);
207
+ free(r);
208
+ }
209
+ kseq_destroy(ks);
210
+ gzclose(fp);
211
+ return 0;
212
+ }
213
+ #endif
@@ -0,0 +1,25 @@
1
+ #ifndef SDUST_H
2
+ #define SDUST_H
3
+
4
+ #include <stdint.h>
5
+
6
+ #ifdef __cplusplus
7
+ extern "C" {
8
+ #endif
9
+
10
+ struct sdust_buf_s;
11
+ typedef struct sdust_buf_s sdust_buf_t;
12
+
13
+ // the simple interface
14
+ uint64_t *sdust(void *km, const uint8_t *seq, int l_seq, int T, int W, int *n);
15
+
16
+ // the following interface dramatically reduces heap allocations when sdust is frequently called.
17
+ sdust_buf_t *sdust_buf_init(void *km);
18
+ void sdust_buf_destroy(sdust_buf_t *buf);
19
+ const uint64_t *sdust_core(const uint8_t *seq, int l_seq, int T, int W, int *n, sdust_buf_t *buf);
20
+
21
+ #ifdef __cplusplus
22
+ }
23
+ #endif
24
+
25
+ #endif
@@ -0,0 +1,131 @@
1
+ #include "mmpriv.h"
2
+ #include "kalloc.h"
3
+ #include "ksort.h"
4
+
5
+ void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac)
6
+ {
7
+ mm128_t *a;
8
+ size_t i, j, st;
9
+ if (mv->n <= q_occ_max || q_occ_frac <= 0.0f || q_occ_max <= 0) return;
10
+ KMALLOC(km, a, mv->n);
11
+ for (i = 0; i < mv->n; ++i)
12
+ a[i].x = mv->a[i].x, a[i].y = i;
13
+ radix_sort_128x(a, a + mv->n);
14
+ for (st = 0, i = 1; i <= mv->n; ++i) {
15
+ if (i == mv->n || a[i].x != a[st].x) {
16
+ int32_t cnt = i - st;
17
+ if (cnt > q_occ_max && cnt > mv->n * q_occ_frac)
18
+ for (j = st; j < i; ++j)
19
+ mv->a[a[j].y].x = 0;
20
+ st = i;
21
+ }
22
+ }
23
+ kfree(km, a);
24
+ for (i = j = 0; i < mv->n; ++i)
25
+ if (mv->a[i].x != 0)
26
+ mv->a[j++] = mv->a[i];
27
+ mv->n = j;
28
+ }
29
+
30
+ mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv, int32_t *n_m_)
31
+ {
32
+ mm_seed_t *m;
33
+ size_t i;
34
+ int32_t k;
35
+ m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t));
36
+ for (i = k = 0; i < mv->n; ++i) {
37
+ const uint64_t *cr;
38
+ mm_seed_t *q;
39
+ mm128_t *p = &mv->a[i];
40
+ uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
41
+ int t;
42
+ cr = mm_idx_get(mi, p->x>>8, &t);
43
+ if (t == 0) continue;
44
+ q = &m[k++];
45
+ q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
46
+ q->is_tandem = q->flt = 0;
47
+ if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1;
48
+ if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1;
49
+ }
50
+ *n_m_ = k;
51
+ return m;
52
+ }
53
+
54
+ #define MAX_MAX_HIGH_OCC 128
55
+
56
+ void mm_seed_select(int32_t n, mm_seed_t *a, int len, int max_occ, int max_max_occ, int dist)
57
+ { // for high-occ minimizers, choose up to max_high_occ in each high-occ streak
58
+ extern void ks_heapdown_uint64_t(size_t i, size_t n, uint64_t*);
59
+ extern void ks_heapmake_uint64_t(size_t n, uint64_t*);
60
+ int32_t i, last0, m;
61
+ uint64_t b[MAX_MAX_HIGH_OCC]; // this is to avoid a heap allocation
62
+
63
+ if (n == 0 || n == 1) return;
64
+ for (i = m = 0; i < n; ++i)
65
+ if (a[i].n > max_occ) ++m;
66
+ if (m == 0) return; // no high-frequency k-mers; do nothing
67
+ for (i = 0, last0 = -1; i <= n; ++i) {
68
+ if (i == n || a[i].n <= max_occ) {
69
+ if (i - last0 > 1) {
70
+ int32_t ps = last0 < 0? 0 : (uint32_t)a[last0].q_pos>>1;
71
+ int32_t pe = i == n? len : (uint32_t)a[i].q_pos>>1;
72
+ int32_t j, k, st = last0 + 1, en = i;
73
+ int32_t max_high_occ = (int32_t)((double)(pe - ps) / dist + .499);
74
+ if (max_high_occ > 0) {
75
+ if (max_high_occ > MAX_MAX_HIGH_OCC)
76
+ max_high_occ = MAX_MAX_HIGH_OCC;
77
+ for (j = st, k = 0; j < en && k < max_high_occ; ++j, ++k)
78
+ b[k] = (uint64_t)a[j].n<<32 | j;
79
+ ks_heapmake_uint64_t(k, b); // initialize the binomial heap
80
+ for (; j < en; ++j) { // if there are more, choose top max_high_occ
81
+ if (a[j].n < (int32_t)(b[0]>>32)) { // then update the heap
82
+ b[0] = (uint64_t)a[j].n<<32 | j;
83
+ ks_heapdown_uint64_t(0, k, b);
84
+ }
85
+ }
86
+ for (j = 0; j < k; ++j) a[(uint32_t)b[j]].flt = 1;
87
+ }
88
+ for (j = st; j < en; ++j) a[j].flt ^= 1;
89
+ for (j = st; j < en; ++j)
90
+ if (a[j].n > max_max_occ)
91
+ a[j].flt = 1;
92
+ }
93
+ last0 = i;
94
+ }
95
+ }
96
+ }
97
+
98
+ mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos)
99
+ {
100
+ int rep_st = 0, rep_en = 0, n_m, n_m0;
101
+ size_t i;
102
+ mm_seed_t *m;
103
+ *n_mini_pos = 0;
104
+ *mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t));
105
+ m = mm_seed_collect_all(km, mi, mv, &n_m0);
106
+ if (dist > 0 && max_max_occ > max_occ) {
107
+ mm_seed_select(n_m0, m, qlen, max_occ, max_max_occ, dist);
108
+ } else {
109
+ for (i = 0; i < n_m0; ++i)
110
+ if (m[i].n > max_occ)
111
+ m[i].flt = 1;
112
+ }
113
+ for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < n_m0; ++i) {
114
+ mm_seed_t *q = &m[i];
115
+ //fprintf(stderr, "X\t%d\t%d\t%d\n", q->q_pos>>1, q->n, q->flt);
116
+ if (q->flt) {
117
+ int en = (q->q_pos >> 1) + 1, st = en - q->q_span;
118
+ if (st > rep_en) {
119
+ *rep_len += rep_en - rep_st;
120
+ rep_st = st, rep_en = en;
121
+ } else rep_en = en;
122
+ } else {
123
+ *n_a += q->n;
124
+ (*mini_pos)[(*n_mini_pos)++] = (uint64_t)q->q_span<<32 | q->q_pos>>1;
125
+ m[n_m++] = *q;
126
+ }
127
+ }
128
+ *rep_len += rep_en - rep_st;
129
+ *_n_m = n_m;
130
+ return m;
131
+ }