minimap2 0.2.22.0 → 0.2.24.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +55 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +821 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +369 -0
  41. data/ext/minimap2/main.c +459 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +410 -0
  44. data/ext/minimap2/minimap2.1 +725 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +132 -0
  50. data/ext/minimap2/options.c +234 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/lib/minimap2/aligner.rb +4 -4
  93. data/lib/minimap2/alignment.rb +11 -11
  94. data/lib/minimap2/ffi/constants.rb +20 -16
  95. data/lib/minimap2/ffi/functions.rb +5 -0
  96. data/lib/minimap2/ffi.rb +4 -5
  97. data/lib/minimap2/version.rb +2 -2
  98. data/lib/minimap2.rb +51 -15
  99. metadata +97 -79
  100. data/lib/minimap2/ffi_helper.rb +0 -53
  101. data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,775 @@
1
+ #include <stdlib.h>
2
+ #include <assert.h>
3
+ #if defined(WIN32) || defined(_WIN32)
4
+ #include <io.h> // for open(2)
5
+ #else
6
+ #include <unistd.h>
7
+ #endif
8
+ #include <fcntl.h>
9
+ #include <stdio.h>
10
+ #define __STDC_LIMIT_MACROS
11
+ #include "kthread.h"
12
+ #include "bseq.h"
13
+ #include "minimap.h"
14
+ #include "mmpriv.h"
15
+ #include "kvec.h"
16
+ #include "khash.h"
17
+
18
+ #define idx_hash(a) ((a)>>1)
19
+ #define idx_eq(a, b) ((a)>>1 == (b)>>1)
20
+ KHASH_INIT(idx, uint64_t, uint64_t, 1, idx_hash, idx_eq)
21
+ typedef khash_t(idx) idxhash_t;
22
+
23
+ KHASH_MAP_INIT_STR(str, uint32_t)
24
+
25
+ #define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x))
26
+
27
+ typedef struct mm_idx_bucket_s {
28
+ mm128_v a; // (minimizer, position) array
29
+ int32_t n; // size of the _p_ array
30
+ uint64_t *p; // position array for minimizers appearing >1 times
31
+ void *h; // hash table indexing _p_ and minimizers appearing once
32
+ } mm_idx_bucket_t;
33
+
34
+ typedef struct {
35
+ int32_t st, en, max; // max is not used for now
36
+ int32_t score:30, strand:2;
37
+ } mm_idx_intv1_t;
38
+
39
+ typedef struct mm_idx_intv_s {
40
+ int32_t n, m;
41
+ mm_idx_intv1_t *a;
42
+ } mm_idx_intv_t;
43
+
44
+ mm_idx_t *mm_idx_init(int w, int k, int b, int flag)
45
+ {
46
+ mm_idx_t *mi;
47
+ if (k*2 < b) b = k * 2;
48
+ if (w < 1) w = 1;
49
+ mi = (mm_idx_t*)calloc(1, sizeof(mm_idx_t));
50
+ mi->w = w, mi->k = k, mi->b = b, mi->flag = flag;
51
+ mi->B = (mm_idx_bucket_t*)calloc(1<<b, sizeof(mm_idx_bucket_t));
52
+ if (!(mm_dbg_flag & 1)) mi->km = km_init();
53
+ return mi;
54
+ }
55
+
56
+ void mm_idx_destroy(mm_idx_t *mi)
57
+ {
58
+ uint32_t i;
59
+ if (mi == 0) return;
60
+ if (mi->h) kh_destroy(str, (khash_t(str)*)mi->h);
61
+ if (mi->B) {
62
+ for (i = 0; i < 1U<<mi->b; ++i) {
63
+ free(mi->B[i].p);
64
+ free(mi->B[i].a.a);
65
+ kh_destroy(idx, (idxhash_t*)mi->B[i].h);
66
+ }
67
+ }
68
+ if (mi->I) {
69
+ for (i = 0; i < mi->n_seq; ++i)
70
+ free(mi->I[i].a);
71
+ free(mi->I);
72
+ }
73
+ if (!mi->km) {
74
+ for (i = 0; i < mi->n_seq; ++i)
75
+ free(mi->seq[i].name);
76
+ free(mi->seq);
77
+ } else km_destroy(mi->km);
78
+ free(mi->B); free(mi->S); free(mi);
79
+ }
80
+
81
+ const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n)
82
+ {
83
+ int mask = (1<<mi->b) - 1;
84
+ khint_t k;
85
+ mm_idx_bucket_t *b = &mi->B[minier&mask];
86
+ idxhash_t *h = (idxhash_t*)b->h;
87
+ *n = 0;
88
+ if (h == 0) return 0;
89
+ k = kh_get(idx, h, minier>>mi->b<<1);
90
+ if (k == kh_end(h)) return 0;
91
+ if (kh_key(h, k)&1) { // special casing when there is only one k-mer
92
+ *n = 1;
93
+ return &kh_val(h, k);
94
+ } else {
95
+ *n = (uint32_t)kh_val(h, k);
96
+ return &b->p[kh_val(h, k)>>32];
97
+ }
98
+ }
99
+
100
+ void mm_idx_stat(const mm_idx_t *mi)
101
+ {
102
+ int n = 0, n1 = 0;
103
+ uint32_t i;
104
+ uint64_t sum = 0, len = 0;
105
+ fprintf(stderr, "[M::%s] kmer size: %d; skip: %d; is_hpc: %d; #seq: %d\n", __func__, mi->k, mi->w, mi->flag&MM_I_HPC, mi->n_seq);
106
+ for (i = 0; i < mi->n_seq; ++i)
107
+ len += mi->seq[i].len;
108
+ for (i = 0; i < 1U<<mi->b; ++i)
109
+ if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h);
110
+ for (i = 0; i < 1U<<mi->b; ++i) {
111
+ idxhash_t *h = (idxhash_t*)mi->B[i].h;
112
+ khint_t k;
113
+ if (h == 0) continue;
114
+ for (k = 0; k < kh_end(h); ++k)
115
+ if (kh_exist(h, k)) {
116
+ sum += kh_key(h, k)&1? 1 : (uint32_t)kh_val(h, k);
117
+ if (kh_key(h, k)&1) ++n1;
118
+ }
119
+ }
120
+ fprintf(stderr, "[M::%s::%.3f*%.2f] distinct minimizers: %d (%.2f%% are singletons); average occurrences: %.3lf; average spacing: %.3lf; total length: %ld\n",
121
+ __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), n, 100.0*n1/n, (double)sum / n, (double)len / sum, (long)len);
122
+ }
123
+
124
+ int mm_idx_index_name(mm_idx_t *mi)
125
+ {
126
+ khash_t(str) *h;
127
+ uint32_t i;
128
+ int has_dup = 0, absent;
129
+ if (mi->h) return 0;
130
+ h = kh_init(str);
131
+ for (i = 0; i < mi->n_seq; ++i) {
132
+ khint_t k;
133
+ k = kh_put(str, h, mi->seq[i].name, &absent);
134
+ if (absent) kh_val(h, k) = i;
135
+ else has_dup = 1;
136
+ }
137
+ mi->h = h;
138
+ if (has_dup && mm_verbose >= 2)
139
+ fprintf(stderr, "[WARNING] some database sequences have identical sequence names\n");
140
+ return has_dup;
141
+ }
142
+
143
+ int mm_idx_name2id(const mm_idx_t *mi, const char *name)
144
+ {
145
+ khash_t(str) *h = (khash_t(str)*)mi->h;
146
+ khint_t k;
147
+ if (h == 0) return -2;
148
+ k = kh_get(str, h, name);
149
+ return k == kh_end(h)? -1 : kh_val(h, k);
150
+ }
151
+
152
+ int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq)
153
+ {
154
+ uint64_t i, st1, en1;
155
+ if (rid >= mi->n_seq || st >= mi->seq[rid].len) return -1;
156
+ if (en > mi->seq[rid].len) en = mi->seq[rid].len;
157
+ st1 = mi->seq[rid].offset + st;
158
+ en1 = mi->seq[rid].offset + en;
159
+ for (i = st1; i < en1; ++i)
160
+ seq[i - st1] = mm_seq4_get(mi->S, i);
161
+ return en - st;
162
+ }
163
+
164
+ int mm_idx_getseq_rev(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq)
165
+ {
166
+ uint64_t i, st1, en1;
167
+ const mm_idx_seq_t *s;
168
+ if (rid >= mi->n_seq || st >= mi->seq[rid].len) return -1;
169
+ s = &mi->seq[rid];
170
+ if (en > s->len) en = s->len;
171
+ st1 = s->offset + (s->len - en);
172
+ en1 = s->offset + (s->len - st);
173
+ for (i = st1; i < en1; ++i) {
174
+ uint8_t c = mm_seq4_get(mi->S, i);
175
+ seq[en1 - i - 1] = c < 4? 3 - c : c;
176
+ }
177
+ return en - st;
178
+ }
179
+
180
+ int mm_idx_getseq2(const mm_idx_t *mi, int is_rev, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq)
181
+ {
182
+ if (is_rev) return mm_idx_getseq_rev(mi, rid, st, en, seq);
183
+ else return mm_idx_getseq(mi, rid, st, en, seq);
184
+ }
185
+
186
+ int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f)
187
+ {
188
+ int i;
189
+ size_t n = 0;
190
+ uint32_t thres;
191
+ khint_t *a, k;
192
+ if (f <= 0.) return INT32_MAX;
193
+ for (i = 0; i < 1<<mi->b; ++i)
194
+ if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h);
195
+ a = (uint32_t*)malloc(n * 4);
196
+ for (i = n = 0; i < 1<<mi->b; ++i) {
197
+ idxhash_t *h = (idxhash_t*)mi->B[i].h;
198
+ if (h == 0) continue;
199
+ for (k = 0; k < kh_end(h); ++k) {
200
+ if (!kh_exist(h, k)) continue;
201
+ a[n++] = kh_key(h, k)&1? 1 : (uint32_t)kh_val(h, k);
202
+ }
203
+ }
204
+ thres = ks_ksmall_uint32_t(n, a, (uint32_t)((1. - f) * n)) + 1;
205
+ free(a);
206
+ return thres;
207
+ }
208
+
209
+ /*********************************
210
+ * Sort and generate hash tables *
211
+ *********************************/
212
+
213
+ static void worker_post(void *g, long i, int tid)
214
+ {
215
+ int n, n_keys;
216
+ size_t j, start_a, start_p;
217
+ idxhash_t *h;
218
+ mm_idx_t *mi = (mm_idx_t*)g;
219
+ mm_idx_bucket_t *b = &mi->B[i];
220
+ if (b->a.n == 0) return;
221
+
222
+ // sort by minimizer
223
+ radix_sort_128x(b->a.a, b->a.a + b->a.n);
224
+
225
+ // count and preallocate
226
+ for (j = 1, n = 1, n_keys = 0, b->n = 0; j <= b->a.n; ++j) {
227
+ if (j == b->a.n || b->a.a[j].x>>8 != b->a.a[j-1].x>>8) {
228
+ ++n_keys;
229
+ if (n > 1) b->n += n;
230
+ n = 1;
231
+ } else ++n;
232
+ }
233
+ h = kh_init(idx);
234
+ kh_resize(idx, h, n_keys);
235
+ b->p = (uint64_t*)calloc(b->n, 8);
236
+
237
+ // create the hash table
238
+ for (j = 1, n = 1, start_a = start_p = 0; j <= b->a.n; ++j) {
239
+ if (j == b->a.n || b->a.a[j].x>>8 != b->a.a[j-1].x>>8) {
240
+ khint_t itr;
241
+ int absent;
242
+ mm128_t *p = &b->a.a[j-1];
243
+ itr = kh_put(idx, h, p->x>>8>>mi->b<<1, &absent);
244
+ assert(absent && j == start_a + n);
245
+ if (n == 1) {
246
+ kh_key(h, itr) |= 1;
247
+ kh_val(h, itr) = p->y;
248
+ } else {
249
+ int k;
250
+ for (k = 0; k < n; ++k)
251
+ b->p[start_p + k] = b->a.a[start_a + k].y;
252
+ radix_sort_64(&b->p[start_p], &b->p[start_p + n]); // sort by position; needed as in-place radix_sort_128x() is not stable
253
+ kh_val(h, itr) = (uint64_t)start_p<<32 | n;
254
+ start_p += n;
255
+ }
256
+ start_a = j, n = 1;
257
+ } else ++n;
258
+ }
259
+ b->h = h;
260
+ assert(b->n == (int32_t)start_p);
261
+
262
+ // deallocate and clear b->a
263
+ kfree(0, b->a.a);
264
+ b->a.n = b->a.m = 0, b->a.a = 0;
265
+ }
266
+
267
+ static void mm_idx_post(mm_idx_t *mi, int n_threads)
268
+ {
269
+ kt_for(n_threads, worker_post, mi, 1<<mi->b);
270
+ }
271
+
272
+ /******************
273
+ * Generate index *
274
+ ******************/
275
+
276
+ #include <string.h>
277
+ #include <zlib.h>
278
+ #include "bseq.h"
279
+
280
+ typedef struct {
281
+ int mini_batch_size;
282
+ uint64_t batch_size, sum_len;
283
+ mm_bseq_file_t *fp;
284
+ mm_idx_t *mi;
285
+ } pipeline_t;
286
+
287
+ typedef struct {
288
+ int n_seq;
289
+ mm_bseq1_t *seq;
290
+ mm128_v a;
291
+ } step_t;
292
+
293
+ static void mm_idx_add(mm_idx_t *mi, int n, const mm128_t *a)
294
+ {
295
+ int i, mask = (1<<mi->b) - 1;
296
+ for (i = 0; i < n; ++i) {
297
+ mm128_v *p = &mi->B[a[i].x>>8&mask].a;
298
+ kv_push(mm128_t, 0, *p, a[i]);
299
+ }
300
+ }
301
+
302
+ static void *worker_pipeline(void *shared, int step, void *in)
303
+ {
304
+ int i;
305
+ pipeline_t *p = (pipeline_t*)shared;
306
+ if (step == 0) { // step 0: read sequences
307
+ step_t *s;
308
+ if (p->sum_len > p->batch_size) return 0;
309
+ s = (step_t*)calloc(1, sizeof(step_t));
310
+ s->seq = mm_bseq_read(p->fp, p->mini_batch_size, 0, &s->n_seq); // read a mini-batch
311
+ if (s->seq) {
312
+ uint32_t old_m, m;
313
+ assert((uint64_t)p->mi->n_seq + s->n_seq <= UINT32_MAX); // to prevent integer overflow
314
+ // make room for p->mi->seq
315
+ old_m = p->mi->n_seq, m = p->mi->n_seq + s->n_seq;
316
+ kroundup32(m); kroundup32(old_m);
317
+ if (old_m != m)
318
+ p->mi->seq = (mm_idx_seq_t*)krealloc(p->mi->km, p->mi->seq, m * sizeof(mm_idx_seq_t));
319
+ // make room for p->mi->S
320
+ if (!(p->mi->flag & MM_I_NO_SEQ)) {
321
+ uint64_t sum_len, old_max_len, max_len;
322
+ for (i = 0, sum_len = 0; i < s->n_seq; ++i) sum_len += s->seq[i].l_seq;
323
+ old_max_len = (p->sum_len + 7) / 8;
324
+ max_len = (p->sum_len + sum_len + 7) / 8;
325
+ kroundup64(old_max_len); kroundup64(max_len);
326
+ if (old_max_len != max_len) {
327
+ p->mi->S = (uint32_t*)realloc(p->mi->S, max_len * 4);
328
+ memset(&p->mi->S[old_max_len], 0, 4 * (max_len - old_max_len));
329
+ }
330
+ }
331
+ // populate p->mi->seq
332
+ for (i = 0; i < s->n_seq; ++i) {
333
+ mm_idx_seq_t *seq = &p->mi->seq[p->mi->n_seq];
334
+ uint32_t j;
335
+ if (!(p->mi->flag & MM_I_NO_NAME)) {
336
+ seq->name = (char*)kmalloc(p->mi->km, strlen(s->seq[i].name) + 1);
337
+ strcpy(seq->name, s->seq[i].name);
338
+ } else seq->name = 0;
339
+ seq->len = s->seq[i].l_seq;
340
+ seq->offset = p->sum_len;
341
+ seq->is_alt = 0;
342
+ // copy the sequence
343
+ if (!(p->mi->flag & MM_I_NO_SEQ)) {
344
+ for (j = 0; j < seq->len; ++j) { // TODO: this is not the fastest way, but let's first see if speed matters here
345
+ uint64_t o = p->sum_len + j;
346
+ int c = seq_nt4_table[(uint8_t)s->seq[i].seq[j]];
347
+ mm_seq4_set(p->mi->S, o, c);
348
+ }
349
+ }
350
+ // update p->sum_len and p->mi->n_seq
351
+ p->sum_len += seq->len;
352
+ s->seq[i].rid = p->mi->n_seq++;
353
+ }
354
+ return s;
355
+ } else free(s);
356
+ } else if (step == 1) { // step 1: compute sketch
357
+ step_t *s = (step_t*)in;
358
+ for (i = 0; i < s->n_seq; ++i) {
359
+ mm_bseq1_t *t = &s->seq[i];
360
+ if (t->l_seq > 0)
361
+ mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->flag&MM_I_HPC, &s->a);
362
+ else if (mm_verbose >= 2)
363
+ fprintf(stderr, "[WARNING] the length database sequence '%s' is 0\n", t->name);
364
+ free(t->seq); free(t->name);
365
+ }
366
+ free(s->seq); s->seq = 0;
367
+ return s;
368
+ } else if (step == 2) { // dispatch sketch to buckets
369
+ step_t *s = (step_t*)in;
370
+ mm_idx_add(p->mi, s->a.n, s->a.a);
371
+ kfree(0, s->a.a); free(s);
372
+ }
373
+ return 0;
374
+ }
375
+
376
+ mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int flag, int mini_batch_size, int n_threads, uint64_t batch_size)
377
+ {
378
+ pipeline_t pl;
379
+ if (fp == 0 || mm_bseq_eof(fp)) return 0;
380
+ memset(&pl, 0, sizeof(pipeline_t));
381
+ pl.mini_batch_size = (uint64_t)mini_batch_size < batch_size? mini_batch_size : batch_size;
382
+ pl.batch_size = batch_size;
383
+ pl.fp = fp;
384
+ pl.mi = mm_idx_init(w, k, b, flag);
385
+
386
+ kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3);
387
+ if (mm_verbose >= 3)
388
+ fprintf(stderr, "[M::%s::%.3f*%.2f] collected minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
389
+
390
+ mm_idx_post(pl.mi, n_threads);
391
+ if (mm_verbose >= 3)
392
+ fprintf(stderr, "[M::%s::%.3f*%.2f] sorted minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
393
+
394
+ return pl.mi;
395
+ }
396
+
397
+ mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads) // a simpler interface; deprecated
398
+ {
399
+ mm_bseq_file_t *fp;
400
+ mm_idx_t *mi;
401
+ fp = mm_bseq_open(fn);
402
+ if (fp == 0) return 0;
403
+ mi = mm_idx_gen(fp, w, k, 14, flag, 1<<18, n_threads, UINT64_MAX);
404
+ mm_bseq_close(fp);
405
+ return mi;
406
+ }
407
+
408
+ mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const char **seq, const char **name)
409
+ {
410
+ uint64_t sum_len = 0;
411
+ mm128_v a = {0,0,0};
412
+ mm_idx_t *mi;
413
+ khash_t(str) *h;
414
+ int i, flag = 0;
415
+
416
+ if (n <= 0) return 0;
417
+ for (i = 0; i < n; ++i) // get the total length
418
+ sum_len += strlen(seq[i]);
419
+ if (is_hpc) flag |= MM_I_HPC;
420
+ if (name == 0) flag |= MM_I_NO_NAME;
421
+ if (bucket_bits < 0) bucket_bits = 14;
422
+ mi = mm_idx_init(w, k, bucket_bits, flag);
423
+ mi->n_seq = n;
424
+ mi->seq = (mm_idx_seq_t*)kcalloc(mi->km, n, sizeof(mm_idx_seq_t)); // ->seq is allocated from km
425
+ mi->S = (uint32_t*)calloc((sum_len + 7) / 8, 4);
426
+ mi->h = h = kh_init(str);
427
+ for (i = 0, sum_len = 0; i < n; ++i) {
428
+ const char *s = seq[i];
429
+ mm_idx_seq_t *p = &mi->seq[i];
430
+ uint32_t j;
431
+ if (name && name[i]) {
432
+ int absent;
433
+ p->name = (char*)kmalloc(mi->km, strlen(name[i]) + 1);
434
+ strcpy(p->name, name[i]);
435
+ kh_put(str, h, p->name, &absent);
436
+ assert(absent);
437
+ }
438
+ p->offset = sum_len;
439
+ p->len = strlen(s);
440
+ p->is_alt = 0;
441
+ for (j = 0; j < p->len; ++j) {
442
+ int c = seq_nt4_table[(uint8_t)s[j]];
443
+ uint64_t o = sum_len + j;
444
+ mm_seq4_set(mi->S, o, c);
445
+ }
446
+ sum_len += p->len;
447
+ if (p->len > 0) {
448
+ a.n = 0;
449
+ mm_sketch(0, s, p->len, w, k, i, is_hpc, &a);
450
+ mm_idx_add(mi, a.n, a.a);
451
+ }
452
+ }
453
+ free(a.a);
454
+ mm_idx_post(mi, 1);
455
+ return mi;
456
+ }
457
+
458
+ /*************
459
+ * index I/O *
460
+ *************/
461
+
462
+ void mm_idx_dump(FILE *fp, const mm_idx_t *mi)
463
+ {
464
+ uint64_t sum_len = 0;
465
+ uint32_t x[5], i;
466
+
467
+ x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n_seq, x[4] = mi->flag;
468
+ fwrite(MM_IDX_MAGIC, 1, 4, fp);
469
+ fwrite(x, 4, 5, fp);
470
+ for (i = 0; i < mi->n_seq; ++i) {
471
+ if (mi->seq[i].name) {
472
+ uint8_t l = strlen(mi->seq[i].name);
473
+ fwrite(&l, 1, 1, fp);
474
+ fwrite(mi->seq[i].name, 1, l, fp);
475
+ } else {
476
+ uint8_t l = 0;
477
+ fwrite(&l, 1, 1, fp);
478
+ }
479
+ fwrite(&mi->seq[i].len, 4, 1, fp);
480
+ sum_len += mi->seq[i].len;
481
+ }
482
+ for (i = 0; i < 1<<mi->b; ++i) {
483
+ mm_idx_bucket_t *b = &mi->B[i];
484
+ khint_t k;
485
+ idxhash_t *h = (idxhash_t*)b->h;
486
+ uint32_t size = h? h->size : 0;
487
+ fwrite(&b->n, 4, 1, fp);
488
+ fwrite(b->p, 8, b->n, fp);
489
+ fwrite(&size, 4, 1, fp);
490
+ if (size == 0) continue;
491
+ for (k = 0; k < kh_end(h); ++k) {
492
+ uint64_t x[2];
493
+ if (!kh_exist(h, k)) continue;
494
+ x[0] = kh_key(h, k), x[1] = kh_val(h, k);
495
+ fwrite(x, 8, 2, fp);
496
+ }
497
+ }
498
+ if (!(mi->flag & MM_I_NO_SEQ))
499
+ fwrite(mi->S, 4, (sum_len + 7) / 8, fp);
500
+ fflush(fp);
501
+ }
502
+
503
+ mm_idx_t *mm_idx_load(FILE *fp)
504
+ {
505
+ char magic[4];
506
+ uint32_t x[5], i;
507
+ uint64_t sum_len = 0;
508
+ mm_idx_t *mi;
509
+
510
+ if (fread(magic, 1, 4, fp) != 4) return 0;
511
+ if (strncmp(magic, MM_IDX_MAGIC, 4) != 0) return 0;
512
+ if (fread(x, 4, 5, fp) != 5) return 0;
513
+ mi = mm_idx_init(x[0], x[1], x[2], x[4]);
514
+ mi->n_seq = x[3];
515
+ mi->seq = (mm_idx_seq_t*)kcalloc(mi->km, mi->n_seq, sizeof(mm_idx_seq_t));
516
+ for (i = 0; i < mi->n_seq; ++i) {
517
+ uint8_t l;
518
+ mm_idx_seq_t *s = &mi->seq[i];
519
+ fread(&l, 1, 1, fp);
520
+ if (l) {
521
+ s->name = (char*)kmalloc(mi->km, l + 1);
522
+ fread(s->name, 1, l, fp);
523
+ s->name[l] = 0;
524
+ }
525
+ fread(&s->len, 4, 1, fp);
526
+ s->offset = sum_len;
527
+ s->is_alt = 0;
528
+ sum_len += s->len;
529
+ }
530
+ for (i = 0; i < 1<<mi->b; ++i) {
531
+ mm_idx_bucket_t *b = &mi->B[i];
532
+ uint32_t j, size;
533
+ khint_t k;
534
+ idxhash_t *h;
535
+ fread(&b->n, 4, 1, fp);
536
+ b->p = (uint64_t*)malloc(b->n * 8);
537
+ fread(b->p, 8, b->n, fp);
538
+ fread(&size, 4, 1, fp);
539
+ if (size == 0) continue;
540
+ b->h = h = kh_init(idx);
541
+ kh_resize(idx, h, size);
542
+ for (j = 0; j < size; ++j) {
543
+ uint64_t x[2];
544
+ int absent;
545
+ fread(x, 8, 2, fp);
546
+ k = kh_put(idx, h, x[0], &absent);
547
+ assert(absent);
548
+ kh_val(h, k) = x[1];
549
+ }
550
+ }
551
+ if (!(mi->flag & MM_I_NO_SEQ)) {
552
+ mi->S = (uint32_t*)malloc((sum_len + 7) / 8 * 4);
553
+ fread(mi->S, 4, (sum_len + 7) / 8, fp);
554
+ }
555
+ return mi;
556
+ }
557
+
558
+ int64_t mm_idx_is_idx(const char *fn)
559
+ {
560
+ int fd, is_idx = 0;
561
+ int64_t ret, off_end;
562
+ char magic[4];
563
+
564
+ if (strcmp(fn, "-") == 0) return 0; // read from pipe; not an index
565
+ fd = open(fn, O_RDONLY);
566
+ if (fd < 0) return -1; // error
567
+ #ifdef WIN32
568
+ if ((off_end = _lseeki64(fd, 0, SEEK_END)) >= 4) {
569
+ _lseeki64(fd, 0, SEEK_SET);
570
+ #else
571
+ if ((off_end = lseek(fd, 0, SEEK_END)) >= 4) {
572
+ lseek(fd, 0, SEEK_SET);
573
+ #endif // WIN32
574
+ ret = read(fd, magic, 4);
575
+ if (ret == 4 && strncmp(magic, MM_IDX_MAGIC, 4) == 0)
576
+ is_idx = 1;
577
+ }
578
+ close(fd);
579
+ return is_idx? off_end : 0;
580
+ }
581
+
582
+ mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out)
583
+ {
584
+ int64_t is_idx;
585
+ mm_idx_reader_t *r;
586
+ is_idx = mm_idx_is_idx(fn);
587
+ if (is_idx < 0) return 0; // failed to open the index
588
+ r = (mm_idx_reader_t*)calloc(1, sizeof(mm_idx_reader_t));
589
+ r->is_idx = is_idx;
590
+ if (opt) r->opt = *opt;
591
+ else mm_idxopt_init(&r->opt);
592
+ if (r->is_idx) {
593
+ r->fp.idx = fopen(fn, "rb");
594
+ r->idx_size = is_idx;
595
+ } else r->fp.seq = mm_bseq_open(fn);
596
+ if (fn_out) r->fp_out = fopen(fn_out, "wb");
597
+ return r;
598
+ }
599
+
600
+ void mm_idx_reader_close(mm_idx_reader_t *r)
601
+ {
602
+ if (r->is_idx) fclose(r->fp.idx);
603
+ else mm_bseq_close(r->fp.seq);
604
+ if (r->fp_out) fclose(r->fp_out);
605
+ free(r);
606
+ }
607
+
608
+ mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
609
+ {
610
+ mm_idx_t *mi;
611
+ if (r->is_idx) {
612
+ mi = mm_idx_load(r->fp.idx);
613
+ if (mi && mm_verbose >= 2 && (mi->k != r->opt.k || mi->w != r->opt.w || (mi->flag&MM_I_HPC) != (r->opt.flag&MM_I_HPC)))
614
+ fprintf(stderr, "[WARNING]\033[1;31m Indexing parameters (-k, -w or -H) overridden by parameters used in the prebuilt index.\033[0m\n");
615
+ } else
616
+ mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.flag, r->opt.mini_batch_size, n_threads, r->opt.batch_size);
617
+ if (mi) {
618
+ if (r->fp_out) mm_idx_dump(r->fp_out, mi);
619
+ mi->index = r->n_parts++;
620
+ }
621
+ return mi;
622
+ }
623
+
624
+ int mm_idx_reader_eof(const mm_idx_reader_t *r) // TODO: in extremely rare cases, mm_bseq_eof() might not work
625
+ {
626
+ return r->is_idx? (feof(r->fp.idx) || ftell(r->fp.idx) == r->idx_size) : mm_bseq_eof(r->fp.seq);
627
+ }
628
+
629
+ #include <ctype.h>
630
+ #include <zlib.h>
631
+ #include "ksort.h"
632
+ #include "kseq.h"
633
+ KSTREAM_DECLARE(gzFile, gzread)
634
+
635
+ int mm_idx_alt_read(mm_idx_t *mi, const char *fn)
636
+ {
637
+ int n_alt = 0;
638
+ gzFile fp;
639
+ kstream_t *ks;
640
+ kstring_t str = {0,0,0};
641
+ fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
642
+ if (fp == 0) return -1;
643
+ ks = ks_init(fp);
644
+ if (mi->h == 0) mm_idx_index_name(mi);
645
+ while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0) {
646
+ char *p;
647
+ int id;
648
+ for (p = str.s; *p && !isspace(*p); ++p) { }
649
+ *p = 0;
650
+ id = mm_idx_name2id(mi, str.s);
651
+ if (id >= 0) mi->seq[id].is_alt = 1, ++n_alt;
652
+ }
653
+ mi->n_alt = n_alt;
654
+ if (mm_verbose >= 3)
655
+ fprintf(stderr, "[M::%s] found %d ALT contigs\n", __func__, n_alt);
656
+ return n_alt;
657
+ }
658
+
659
+ #define sort_key_bed(a) ((a).st)
660
+ KRADIX_SORT_INIT(bed, mm_idx_intv1_t, sort_key_bed, 4)
661
+
662
+ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc)
663
+ {
664
+ gzFile fp;
665
+ kstream_t *ks;
666
+ kstring_t str = {0,0,0};
667
+ mm_idx_intv_t *I;
668
+
669
+ fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
670
+ if (fp == 0) return 0;
671
+ I = (mm_idx_intv_t*)calloc(mi->n_seq, sizeof(*I));
672
+ ks = ks_init(fp);
673
+ while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0) {
674
+ mm_idx_intv_t *r;
675
+ mm_idx_intv1_t t = {-1,-1,-1,-1,0};
676
+ char *p, *q, *bl, *bs;
677
+ int32_t i, id = -1, n_blk = 0;
678
+ for (p = q = str.s, i = 0;; ++p) {
679
+ if (*p == 0 || *p == '\t') {
680
+ int32_t c = *p;
681
+ *p = 0;
682
+ if (i == 0) { // chr
683
+ id = mm_idx_name2id(mi, q);
684
+ if (id < 0) break; // unknown name; TODO: throw a warning
685
+ } else if (i == 1) { // start
686
+ t.st = atol(q); // TODO: watch out integer overflow!
687
+ if (t.st < 0) break;
688
+ } else if (i == 2) { // end
689
+ t.en = atol(q);
690
+ if (t.en < 0) break;
691
+ } else if (i == 4) { // BED score
692
+ t.score = atol(q);
693
+ } else if (i == 5) { // strand
694
+ t.strand = *q == '+'? 1 : *q == '-'? -1 : 0;
695
+ } else if (i == 9) {
696
+ if (!isdigit(*q)) break;
697
+ n_blk = atol(q);
698
+ } else if (i == 10) {
699
+ bl = q;
700
+ } else if (i == 11) {
701
+ bs = q;
702
+ break;
703
+ }
704
+ if (c == 0) break;
705
+ ++i, q = p + 1;
706
+ }
707
+ }
708
+ if (id < 0 || t.st < 0 || t.st >= t.en) continue;
709
+ r = &I[id];
710
+ if (i >= 11 && read_junc) { // BED12
711
+ int32_t st, sz, en;
712
+ st = strtol(bs, &bs, 10); ++bs;
713
+ sz = strtol(bl, &bl, 10); ++bl;
714
+ en = t.st + st + sz;
715
+ for (i = 1; i < n_blk; ++i) {
716
+ mm_idx_intv1_t s = t;
717
+ if (r->n == r->m) {
718
+ r->m = r->m? r->m + (r->m>>1) : 16;
719
+ r->a = (mm_idx_intv1_t*)realloc(r->a, sizeof(*r->a) * r->m);
720
+ }
721
+ st = strtol(bs, &bs, 10); ++bs;
722
+ sz = strtol(bl, &bl, 10); ++bl;
723
+ s.st = en, s.en = t.st + st;
724
+ en = t.st + st + sz;
725
+ if (s.en > s.st) r->a[r->n++] = s;
726
+ }
727
+ } else {
728
+ if (r->n == r->m) {
729
+ r->m = r->m? r->m + (r->m>>1) : 16;
730
+ r->a = (mm_idx_intv1_t*)realloc(r->a, sizeof(*r->a) * r->m);
731
+ }
732
+ r->a[r->n++] = t;
733
+ }
734
+ }
735
+ free(str.s);
736
+ ks_destroy(ks);
737
+ gzclose(fp);
738
+ return I;
739
+ }
740
+
741
+ int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc)
742
+ {
743
+ int32_t i;
744
+ if (mi->h == 0) mm_idx_index_name(mi);
745
+ mi->I = mm_idx_read_bed(mi, fn, read_junc);
746
+ if (mi->I == 0) return -1;
747
+ for (i = 0; i < mi->n_seq; ++i) // TODO: eliminate redundant intervals
748
+ radix_sort_bed(mi->I[i].a, mi->I[i].a + mi->I[i].n);
749
+ return 0;
750
+ }
751
+
752
+ int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s)
753
+ {
754
+ int32_t i, left, right;
755
+ mm_idx_intv_t *r;
756
+ memset(s, 0, en - st);
757
+ if (mi->I == 0 || ctg < 0 || ctg >= mi->n_seq) return -1;
758
+ r = &mi->I[ctg];
759
+ left = 0, right = r->n;
760
+ while (right > left) {
761
+ int32_t mid = left + ((right - left) >> 1);
762
+ if (r->a[mid].st >= st) right = mid;
763
+ else left = mid + 1;
764
+ }
765
+ for (i = left; i < r->n; ++i) {
766
+ if (st <= r->a[i].st && en >= r->a[i].en && r->a[i].strand != 0) {
767
+ if (r->a[i].strand > 0) {
768
+ s[r->a[i].st - st] |= 1, s[r->a[i].en - 1 - st] |= 2;
769
+ } else {
770
+ s[r->a[i].st - st] |= 8, s[r->a[i].en - 1 - st] |= 4;
771
+ }
772
+ }
773
+ }
774
+ return left;
775
+ }