minimap2 0.2.22.0 → 0.2.24.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +55 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +821 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +369 -0
  41. data/ext/minimap2/main.c +459 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +410 -0
  44. data/ext/minimap2/minimap2.1 +725 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +132 -0
  50. data/ext/minimap2/options.c +234 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/lib/minimap2/aligner.rb +4 -4
  93. data/lib/minimap2/alignment.rb +11 -11
  94. data/lib/minimap2/ffi/constants.rb +20 -16
  95. data/lib/minimap2/ffi/functions.rb +5 -0
  96. data/lib/minimap2/ffi.rb +4 -5
  97. data/lib/minimap2/version.rb +2 -2
  98. data/lib/minimap2.rb +51 -15
  99. metadata +97 -79
  100. data/lib/minimap2/ffi_helper.rb +0 -53
  101. data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,775 @@
1
+ #include <stdlib.h>
2
+ #include <assert.h>
3
+ #if defined(WIN32) || defined(_WIN32)
4
+ #include <io.h> // for open(2)
5
+ #else
6
+ #include <unistd.h>
7
+ #endif
8
+ #include <fcntl.h>
9
+ #include <stdio.h>
10
+ #define __STDC_LIMIT_MACROS
11
+ #include "kthread.h"
12
+ #include "bseq.h"
13
+ #include "minimap.h"
14
+ #include "mmpriv.h"
15
+ #include "kvec.h"
16
+ #include "khash.h"
17
+
18
+ #define idx_hash(a) ((a)>>1)
19
+ #define idx_eq(a, b) ((a)>>1 == (b)>>1)
20
+ KHASH_INIT(idx, uint64_t, uint64_t, 1, idx_hash, idx_eq)
21
+ typedef khash_t(idx) idxhash_t;
22
+
23
+ KHASH_MAP_INIT_STR(str, uint32_t)
24
+
25
// Round a 64-bit lvalue up, in place, to the next power of two. A power of two
// is left unchanged and 0 stays 0. The argument is evaluated several times.
#define kroundup64(x) ( \
	--(x), \
	(x) |= (x) >> 1,  (x) |= (x) >> 2,  (x) |= (x) >> 4, \
	(x) |= (x) >> 8,  (x) |= (x) >> 16, (x) |= (x) >> 32, \
	++(x))
26
+
27
+ typedef struct mm_idx_bucket_s {
28
+ mm128_v a; // (minimizer, position) array
29
+ int32_t n; // size of the _p_ array
30
+ uint64_t *p; // position array for minimizers appearing >1 times
31
+ void *h; // hash table indexing _p_ and minimizers appearing once
32
+ } mm_idx_bucket_t;
33
+
34
+ typedef struct {
35
+ int32_t st, en, max; // max is not used for now
36
+ int32_t score:30, strand:2;
37
+ } mm_idx_intv1_t;
38
+
39
+ typedef struct mm_idx_intv_s {
40
+ int32_t n, m;
41
+ mm_idx_intv1_t *a;
42
+ } mm_idx_intv_t;
43
+
44
+ mm_idx_t *mm_idx_init(int w, int k, int b, int flag)
45
+ {
46
+ mm_idx_t *mi;
47
+ if (k*2 < b) b = k * 2;
48
+ if (w < 1) w = 1;
49
+ mi = (mm_idx_t*)calloc(1, sizeof(mm_idx_t));
50
+ mi->w = w, mi->k = k, mi->b = b, mi->flag = flag;
51
+ mi->B = (mm_idx_bucket_t*)calloc(1<<b, sizeof(mm_idx_bucket_t));
52
+ if (!(mm_dbg_flag & 1)) mi->km = km_init();
53
+ return mi;
54
+ }
55
+
56
+ void mm_idx_destroy(mm_idx_t *mi)
57
+ {
58
+ uint32_t i;
59
+ if (mi == 0) return;
60
+ if (mi->h) kh_destroy(str, (khash_t(str)*)mi->h);
61
+ if (mi->B) {
62
+ for (i = 0; i < 1U<<mi->b; ++i) {
63
+ free(mi->B[i].p);
64
+ free(mi->B[i].a.a);
65
+ kh_destroy(idx, (idxhash_t*)mi->B[i].h);
66
+ }
67
+ }
68
+ if (mi->I) {
69
+ for (i = 0; i < mi->n_seq; ++i)
70
+ free(mi->I[i].a);
71
+ free(mi->I);
72
+ }
73
+ if (!mi->km) {
74
+ for (i = 0; i < mi->n_seq; ++i)
75
+ free(mi->seq[i].name);
76
+ free(mi->seq);
77
+ } else km_destroy(mi->km);
78
+ free(mi->B); free(mi->S); free(mi);
79
+ }
80
+
81
+ const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n)
82
+ {
83
+ int mask = (1<<mi->b) - 1;
84
+ khint_t k;
85
+ mm_idx_bucket_t *b = &mi->B[minier&mask];
86
+ idxhash_t *h = (idxhash_t*)b->h;
87
+ *n = 0;
88
+ if (h == 0) return 0;
89
+ k = kh_get(idx, h, minier>>mi->b<<1);
90
+ if (k == kh_end(h)) return 0;
91
+ if (kh_key(h, k)&1) { // special casing when there is only one k-mer
92
+ *n = 1;
93
+ return &kh_val(h, k);
94
+ } else {
95
+ *n = (uint32_t)kh_val(h, k);
96
+ return &b->p[kh_val(h, k)>>32];
97
+ }
98
+ }
99
+
100
+ void mm_idx_stat(const mm_idx_t *mi)
101
+ {
102
+ int n = 0, n1 = 0;
103
+ uint32_t i;
104
+ uint64_t sum = 0, len = 0;
105
+ fprintf(stderr, "[M::%s] kmer size: %d; skip: %d; is_hpc: %d; #seq: %d\n", __func__, mi->k, mi->w, mi->flag&MM_I_HPC, mi->n_seq);
106
+ for (i = 0; i < mi->n_seq; ++i)
107
+ len += mi->seq[i].len;
108
+ for (i = 0; i < 1U<<mi->b; ++i)
109
+ if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h);
110
+ for (i = 0; i < 1U<<mi->b; ++i) {
111
+ idxhash_t *h = (idxhash_t*)mi->B[i].h;
112
+ khint_t k;
113
+ if (h == 0) continue;
114
+ for (k = 0; k < kh_end(h); ++k)
115
+ if (kh_exist(h, k)) {
116
+ sum += kh_key(h, k)&1? 1 : (uint32_t)kh_val(h, k);
117
+ if (kh_key(h, k)&1) ++n1;
118
+ }
119
+ }
120
+ fprintf(stderr, "[M::%s::%.3f*%.2f] distinct minimizers: %d (%.2f%% are singletons); average occurrences: %.3lf; average spacing: %.3lf; total length: %ld\n",
121
+ __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), n, 100.0*n1/n, (double)sum / n, (double)len / sum, (long)len);
122
+ }
123
+
124
+ int mm_idx_index_name(mm_idx_t *mi)
125
+ {
126
+ khash_t(str) *h;
127
+ uint32_t i;
128
+ int has_dup = 0, absent;
129
+ if (mi->h) return 0;
130
+ h = kh_init(str);
131
+ for (i = 0; i < mi->n_seq; ++i) {
132
+ khint_t k;
133
+ k = kh_put(str, h, mi->seq[i].name, &absent);
134
+ if (absent) kh_val(h, k) = i;
135
+ else has_dup = 1;
136
+ }
137
+ mi->h = h;
138
+ if (has_dup && mm_verbose >= 2)
139
+ fprintf(stderr, "[WARNING] some database sequences have identical sequence names\n");
140
+ return has_dup;
141
+ }
142
+
143
+ int mm_idx_name2id(const mm_idx_t *mi, const char *name)
144
+ {
145
+ khash_t(str) *h = (khash_t(str)*)mi->h;
146
+ khint_t k;
147
+ if (h == 0) return -2;
148
+ k = kh_get(str, h, name);
149
+ return k == kh_end(h)? -1 : kh_val(h, k);
150
+ }
151
+
152
+ int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq)
153
+ {
154
+ uint64_t i, st1, en1;
155
+ if (rid >= mi->n_seq || st >= mi->seq[rid].len) return -1;
156
+ if (en > mi->seq[rid].len) en = mi->seq[rid].len;
157
+ st1 = mi->seq[rid].offset + st;
158
+ en1 = mi->seq[rid].offset + en;
159
+ for (i = st1; i < en1; ++i)
160
+ seq[i - st1] = mm_seq4_get(mi->S, i);
161
+ return en - st;
162
+ }
163
+
164
+ int mm_idx_getseq_rev(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq)
165
+ {
166
+ uint64_t i, st1, en1;
167
+ const mm_idx_seq_t *s;
168
+ if (rid >= mi->n_seq || st >= mi->seq[rid].len) return -1;
169
+ s = &mi->seq[rid];
170
+ if (en > s->len) en = s->len;
171
+ st1 = s->offset + (s->len - en);
172
+ en1 = s->offset + (s->len - st);
173
+ for (i = st1; i < en1; ++i) {
174
+ uint8_t c = mm_seq4_get(mi->S, i);
175
+ seq[en1 - i - 1] = c < 4? 3 - c : c;
176
+ }
177
+ return en - st;
178
+ }
179
+
180
+ int mm_idx_getseq2(const mm_idx_t *mi, int is_rev, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq)
181
+ {
182
+ if (is_rev) return mm_idx_getseq_rev(mi, rid, st, en, seq);
183
+ else return mm_idx_getseq(mi, rid, st, en, seq);
184
+ }
185
+
186
+ int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f)
187
+ {
188
+ int i;
189
+ size_t n = 0;
190
+ uint32_t thres;
191
+ khint_t *a, k;
192
+ if (f <= 0.) return INT32_MAX;
193
+ for (i = 0; i < 1<<mi->b; ++i)
194
+ if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h);
195
+ a = (uint32_t*)malloc(n * 4);
196
+ for (i = n = 0; i < 1<<mi->b; ++i) {
197
+ idxhash_t *h = (idxhash_t*)mi->B[i].h;
198
+ if (h == 0) continue;
199
+ for (k = 0; k < kh_end(h); ++k) {
200
+ if (!kh_exist(h, k)) continue;
201
+ a[n++] = kh_key(h, k)&1? 1 : (uint32_t)kh_val(h, k);
202
+ }
203
+ }
204
+ thres = ks_ksmall_uint32_t(n, a, (uint32_t)((1. - f) * n)) + 1;
205
+ free(a);
206
+ return thres;
207
+ }
208
+
209
+ /*********************************
210
+ * Sort and generate hash tables *
211
+ *********************************/
212
+
213
+ static void worker_post(void *g, long i, int tid)
214
+ {
215
+ int n, n_keys;
216
+ size_t j, start_a, start_p;
217
+ idxhash_t *h;
218
+ mm_idx_t *mi = (mm_idx_t*)g;
219
+ mm_idx_bucket_t *b = &mi->B[i];
220
+ if (b->a.n == 0) return;
221
+
222
+ // sort by minimizer
223
+ radix_sort_128x(b->a.a, b->a.a + b->a.n);
224
+
225
+ // count and preallocate
226
+ for (j = 1, n = 1, n_keys = 0, b->n = 0; j <= b->a.n; ++j) {
227
+ if (j == b->a.n || b->a.a[j].x>>8 != b->a.a[j-1].x>>8) {
228
+ ++n_keys;
229
+ if (n > 1) b->n += n;
230
+ n = 1;
231
+ } else ++n;
232
+ }
233
+ h = kh_init(idx);
234
+ kh_resize(idx, h, n_keys);
235
+ b->p = (uint64_t*)calloc(b->n, 8);
236
+
237
+ // create the hash table
238
+ for (j = 1, n = 1, start_a = start_p = 0; j <= b->a.n; ++j) {
239
+ if (j == b->a.n || b->a.a[j].x>>8 != b->a.a[j-1].x>>8) {
240
+ khint_t itr;
241
+ int absent;
242
+ mm128_t *p = &b->a.a[j-1];
243
+ itr = kh_put(idx, h, p->x>>8>>mi->b<<1, &absent);
244
+ assert(absent && j == start_a + n);
245
+ if (n == 1) {
246
+ kh_key(h, itr) |= 1;
247
+ kh_val(h, itr) = p->y;
248
+ } else {
249
+ int k;
250
+ for (k = 0; k < n; ++k)
251
+ b->p[start_p + k] = b->a.a[start_a + k].y;
252
+ radix_sort_64(&b->p[start_p], &b->p[start_p + n]); // sort by position; needed as in-place radix_sort_128x() is not stable
253
+ kh_val(h, itr) = (uint64_t)start_p<<32 | n;
254
+ start_p += n;
255
+ }
256
+ start_a = j, n = 1;
257
+ } else ++n;
258
+ }
259
+ b->h = h;
260
+ assert(b->n == (int32_t)start_p);
261
+
262
+ // deallocate and clear b->a
263
+ kfree(0, b->a.a);
264
+ b->a.n = b->a.m = 0, b->a.a = 0;
265
+ }
266
+
267
+ static void mm_idx_post(mm_idx_t *mi, int n_threads)
268
+ {
269
+ kt_for(n_threads, worker_post, mi, 1<<mi->b);
270
+ }
271
+
272
+ /******************
273
+ * Generate index *
274
+ ******************/
275
+
276
+ #include <string.h>
277
+ #include <zlib.h>
278
+ #include "bseq.h"
279
+
280
+ typedef struct {
281
+ int mini_batch_size;
282
+ uint64_t batch_size, sum_len;
283
+ mm_bseq_file_t *fp;
284
+ mm_idx_t *mi;
285
+ } pipeline_t;
286
+
287
+ typedef struct {
288
+ int n_seq;
289
+ mm_bseq1_t *seq;
290
+ mm128_v a;
291
+ } step_t;
292
+
293
+ static void mm_idx_add(mm_idx_t *mi, int n, const mm128_t *a)
294
+ {
295
+ int i, mask = (1<<mi->b) - 1;
296
+ for (i = 0; i < n; ++i) {
297
+ mm128_v *p = &mi->B[a[i].x>>8&mask].a;
298
+ kv_push(mm128_t, 0, *p, a[i]);
299
+ }
300
+ }
301
+
302
+ static void *worker_pipeline(void *shared, int step, void *in)
303
+ {
304
+ int i;
305
+ pipeline_t *p = (pipeline_t*)shared;
306
+ if (step == 0) { // step 0: read sequences
307
+ step_t *s;
308
+ if (p->sum_len > p->batch_size) return 0;
309
+ s = (step_t*)calloc(1, sizeof(step_t));
310
+ s->seq = mm_bseq_read(p->fp, p->mini_batch_size, 0, &s->n_seq); // read a mini-batch
311
+ if (s->seq) {
312
+ uint32_t old_m, m;
313
+ assert((uint64_t)p->mi->n_seq + s->n_seq <= UINT32_MAX); // to prevent integer overflow
314
+ // make room for p->mi->seq
315
+ old_m = p->mi->n_seq, m = p->mi->n_seq + s->n_seq;
316
+ kroundup32(m); kroundup32(old_m);
317
+ if (old_m != m)
318
+ p->mi->seq = (mm_idx_seq_t*)krealloc(p->mi->km, p->mi->seq, m * sizeof(mm_idx_seq_t));
319
+ // make room for p->mi->S
320
+ if (!(p->mi->flag & MM_I_NO_SEQ)) {
321
+ uint64_t sum_len, old_max_len, max_len;
322
+ for (i = 0, sum_len = 0; i < s->n_seq; ++i) sum_len += s->seq[i].l_seq;
323
+ old_max_len = (p->sum_len + 7) / 8;
324
+ max_len = (p->sum_len + sum_len + 7) / 8;
325
+ kroundup64(old_max_len); kroundup64(max_len);
326
+ if (old_max_len != max_len) {
327
+ p->mi->S = (uint32_t*)realloc(p->mi->S, max_len * 4);
328
+ memset(&p->mi->S[old_max_len], 0, 4 * (max_len - old_max_len));
329
+ }
330
+ }
331
+ // populate p->mi->seq
332
+ for (i = 0; i < s->n_seq; ++i) {
333
+ mm_idx_seq_t *seq = &p->mi->seq[p->mi->n_seq];
334
+ uint32_t j;
335
+ if (!(p->mi->flag & MM_I_NO_NAME)) {
336
+ seq->name = (char*)kmalloc(p->mi->km, strlen(s->seq[i].name) + 1);
337
+ strcpy(seq->name, s->seq[i].name);
338
+ } else seq->name = 0;
339
+ seq->len = s->seq[i].l_seq;
340
+ seq->offset = p->sum_len;
341
+ seq->is_alt = 0;
342
+ // copy the sequence
343
+ if (!(p->mi->flag & MM_I_NO_SEQ)) {
344
+ for (j = 0; j < seq->len; ++j) { // TODO: this is not the fastest way, but let's first see if speed matters here
345
+ uint64_t o = p->sum_len + j;
346
+ int c = seq_nt4_table[(uint8_t)s->seq[i].seq[j]];
347
+ mm_seq4_set(p->mi->S, o, c);
348
+ }
349
+ }
350
+ // update p->sum_len and p->mi->n_seq
351
+ p->sum_len += seq->len;
352
+ s->seq[i].rid = p->mi->n_seq++;
353
+ }
354
+ return s;
355
+ } else free(s);
356
+ } else if (step == 1) { // step 1: compute sketch
357
+ step_t *s = (step_t*)in;
358
+ for (i = 0; i < s->n_seq; ++i) {
359
+ mm_bseq1_t *t = &s->seq[i];
360
+ if (t->l_seq > 0)
361
+ mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->flag&MM_I_HPC, &s->a);
362
+ else if (mm_verbose >= 2)
363
+ fprintf(stderr, "[WARNING] the length database sequence '%s' is 0\n", t->name);
364
+ free(t->seq); free(t->name);
365
+ }
366
+ free(s->seq); s->seq = 0;
367
+ return s;
368
+ } else if (step == 2) { // dispatch sketch to buckets
369
+ step_t *s = (step_t*)in;
370
+ mm_idx_add(p->mi, s->a.n, s->a.a);
371
+ kfree(0, s->a.a); free(s);
372
+ }
373
+ return 0;
374
+ }
375
+
376
+ mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int flag, int mini_batch_size, int n_threads, uint64_t batch_size)
377
+ {
378
+ pipeline_t pl;
379
+ if (fp == 0 || mm_bseq_eof(fp)) return 0;
380
+ memset(&pl, 0, sizeof(pipeline_t));
381
+ pl.mini_batch_size = (uint64_t)mini_batch_size < batch_size? mini_batch_size : batch_size;
382
+ pl.batch_size = batch_size;
383
+ pl.fp = fp;
384
+ pl.mi = mm_idx_init(w, k, b, flag);
385
+
386
+ kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3);
387
+ if (mm_verbose >= 3)
388
+ fprintf(stderr, "[M::%s::%.3f*%.2f] collected minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
389
+
390
+ mm_idx_post(pl.mi, n_threads);
391
+ if (mm_verbose >= 3)
392
+ fprintf(stderr, "[M::%s::%.3f*%.2f] sorted minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
393
+
394
+ return pl.mi;
395
+ }
396
+
397
+ mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads) // a simpler interface; deprecated
398
+ {
399
+ mm_bseq_file_t *fp;
400
+ mm_idx_t *mi;
401
+ fp = mm_bseq_open(fn);
402
+ if (fp == 0) return 0;
403
+ mi = mm_idx_gen(fp, w, k, 14, flag, 1<<18, n_threads, UINT64_MAX);
404
+ mm_bseq_close(fp);
405
+ return mi;
406
+ }
407
+
408
+ mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const char **seq, const char **name)
409
+ {
410
+ uint64_t sum_len = 0;
411
+ mm128_v a = {0,0,0};
412
+ mm_idx_t *mi;
413
+ khash_t(str) *h;
414
+ int i, flag = 0;
415
+
416
+ if (n <= 0) return 0;
417
+ for (i = 0; i < n; ++i) // get the total length
418
+ sum_len += strlen(seq[i]);
419
+ if (is_hpc) flag |= MM_I_HPC;
420
+ if (name == 0) flag |= MM_I_NO_NAME;
421
+ if (bucket_bits < 0) bucket_bits = 14;
422
+ mi = mm_idx_init(w, k, bucket_bits, flag);
423
+ mi->n_seq = n;
424
+ mi->seq = (mm_idx_seq_t*)kcalloc(mi->km, n, sizeof(mm_idx_seq_t)); // ->seq is allocated from km
425
+ mi->S = (uint32_t*)calloc((sum_len + 7) / 8, 4);
426
+ mi->h = h = kh_init(str);
427
+ for (i = 0, sum_len = 0; i < n; ++i) {
428
+ const char *s = seq[i];
429
+ mm_idx_seq_t *p = &mi->seq[i];
430
+ uint32_t j;
431
+ if (name && name[i]) {
432
+ int absent;
433
+ p->name = (char*)kmalloc(mi->km, strlen(name[i]) + 1);
434
+ strcpy(p->name, name[i]);
435
+ kh_put(str, h, p->name, &absent);
436
+ assert(absent);
437
+ }
438
+ p->offset = sum_len;
439
+ p->len = strlen(s);
440
+ p->is_alt = 0;
441
+ for (j = 0; j < p->len; ++j) {
442
+ int c = seq_nt4_table[(uint8_t)s[j]];
443
+ uint64_t o = sum_len + j;
444
+ mm_seq4_set(mi->S, o, c);
445
+ }
446
+ sum_len += p->len;
447
+ if (p->len > 0) {
448
+ a.n = 0;
449
+ mm_sketch(0, s, p->len, w, k, i, is_hpc, &a);
450
+ mm_idx_add(mi, a.n, a.a);
451
+ }
452
+ }
453
+ free(a.a);
454
+ mm_idx_post(mi, 1);
455
+ return mi;
456
+ }
457
+
458
+ /*************
459
+ * index I/O *
460
+ *************/
461
+
462
+ void mm_idx_dump(FILE *fp, const mm_idx_t *mi)
463
+ {
464
+ uint64_t sum_len = 0;
465
+ uint32_t x[5], i;
466
+
467
+ x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n_seq, x[4] = mi->flag;
468
+ fwrite(MM_IDX_MAGIC, 1, 4, fp);
469
+ fwrite(x, 4, 5, fp);
470
+ for (i = 0; i < mi->n_seq; ++i) {
471
+ if (mi->seq[i].name) {
472
+ uint8_t l = strlen(mi->seq[i].name);
473
+ fwrite(&l, 1, 1, fp);
474
+ fwrite(mi->seq[i].name, 1, l, fp);
475
+ } else {
476
+ uint8_t l = 0;
477
+ fwrite(&l, 1, 1, fp);
478
+ }
479
+ fwrite(&mi->seq[i].len, 4, 1, fp);
480
+ sum_len += mi->seq[i].len;
481
+ }
482
+ for (i = 0; i < 1<<mi->b; ++i) {
483
+ mm_idx_bucket_t *b = &mi->B[i];
484
+ khint_t k;
485
+ idxhash_t *h = (idxhash_t*)b->h;
486
+ uint32_t size = h? h->size : 0;
487
+ fwrite(&b->n, 4, 1, fp);
488
+ fwrite(b->p, 8, b->n, fp);
489
+ fwrite(&size, 4, 1, fp);
490
+ if (size == 0) continue;
491
+ for (k = 0; k < kh_end(h); ++k) {
492
+ uint64_t x[2];
493
+ if (!kh_exist(h, k)) continue;
494
+ x[0] = kh_key(h, k), x[1] = kh_val(h, k);
495
+ fwrite(x, 8, 2, fp);
496
+ }
497
+ }
498
+ if (!(mi->flag & MM_I_NO_SEQ))
499
+ fwrite(mi->S, 4, (sum_len + 7) / 8, fp);
500
+ fflush(fp);
501
+ }
502
+
503
+ mm_idx_t *mm_idx_load(FILE *fp)
504
+ {
505
+ char magic[4];
506
+ uint32_t x[5], i;
507
+ uint64_t sum_len = 0;
508
+ mm_idx_t *mi;
509
+
510
+ if (fread(magic, 1, 4, fp) != 4) return 0;
511
+ if (strncmp(magic, MM_IDX_MAGIC, 4) != 0) return 0;
512
+ if (fread(x, 4, 5, fp) != 5) return 0;
513
+ mi = mm_idx_init(x[0], x[1], x[2], x[4]);
514
+ mi->n_seq = x[3];
515
+ mi->seq = (mm_idx_seq_t*)kcalloc(mi->km, mi->n_seq, sizeof(mm_idx_seq_t));
516
+ for (i = 0; i < mi->n_seq; ++i) {
517
+ uint8_t l;
518
+ mm_idx_seq_t *s = &mi->seq[i];
519
+ fread(&l, 1, 1, fp);
520
+ if (l) {
521
+ s->name = (char*)kmalloc(mi->km, l + 1);
522
+ fread(s->name, 1, l, fp);
523
+ s->name[l] = 0;
524
+ }
525
+ fread(&s->len, 4, 1, fp);
526
+ s->offset = sum_len;
527
+ s->is_alt = 0;
528
+ sum_len += s->len;
529
+ }
530
+ for (i = 0; i < 1<<mi->b; ++i) {
531
+ mm_idx_bucket_t *b = &mi->B[i];
532
+ uint32_t j, size;
533
+ khint_t k;
534
+ idxhash_t *h;
535
+ fread(&b->n, 4, 1, fp);
536
+ b->p = (uint64_t*)malloc(b->n * 8);
537
+ fread(b->p, 8, b->n, fp);
538
+ fread(&size, 4, 1, fp);
539
+ if (size == 0) continue;
540
+ b->h = h = kh_init(idx);
541
+ kh_resize(idx, h, size);
542
+ for (j = 0; j < size; ++j) {
543
+ uint64_t x[2];
544
+ int absent;
545
+ fread(x, 8, 2, fp);
546
+ k = kh_put(idx, h, x[0], &absent);
547
+ assert(absent);
548
+ kh_val(h, k) = x[1];
549
+ }
550
+ }
551
+ if (!(mi->flag & MM_I_NO_SEQ)) {
552
+ mi->S = (uint32_t*)malloc((sum_len + 7) / 8 * 4);
553
+ fread(mi->S, 4, (sum_len + 7) / 8, fp);
554
+ }
555
+ return mi;
556
+ }
557
+
558
+ int64_t mm_idx_is_idx(const char *fn)
559
+ {
560
+ int fd, is_idx = 0;
561
+ int64_t ret, off_end;
562
+ char magic[4];
563
+
564
+ if (strcmp(fn, "-") == 0) return 0; // read from pipe; not an index
565
+ fd = open(fn, O_RDONLY);
566
+ if (fd < 0) return -1; // error
567
+ #ifdef WIN32
568
+ if ((off_end = _lseeki64(fd, 0, SEEK_END)) >= 4) {
569
+ _lseeki64(fd, 0, SEEK_SET);
570
+ #else
571
+ if ((off_end = lseek(fd, 0, SEEK_END)) >= 4) {
572
+ lseek(fd, 0, SEEK_SET);
573
+ #endif // WIN32
574
+ ret = read(fd, magic, 4);
575
+ if (ret == 4 && strncmp(magic, MM_IDX_MAGIC, 4) == 0)
576
+ is_idx = 1;
577
+ }
578
+ close(fd);
579
+ return is_idx? off_end : 0;
580
+ }
581
+
582
+ mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out)
583
+ {
584
+ int64_t is_idx;
585
+ mm_idx_reader_t *r;
586
+ is_idx = mm_idx_is_idx(fn);
587
+ if (is_idx < 0) return 0; // failed to open the index
588
+ r = (mm_idx_reader_t*)calloc(1, sizeof(mm_idx_reader_t));
589
+ r->is_idx = is_idx;
590
+ if (opt) r->opt = *opt;
591
+ else mm_idxopt_init(&r->opt);
592
+ if (r->is_idx) {
593
+ r->fp.idx = fopen(fn, "rb");
594
+ r->idx_size = is_idx;
595
+ } else r->fp.seq = mm_bseq_open(fn);
596
+ if (fn_out) r->fp_out = fopen(fn_out, "wb");
597
+ return r;
598
+ }
599
+
600
+ void mm_idx_reader_close(mm_idx_reader_t *r)
601
+ {
602
+ if (r->is_idx) fclose(r->fp.idx);
603
+ else mm_bseq_close(r->fp.seq);
604
+ if (r->fp_out) fclose(r->fp_out);
605
+ free(r);
606
+ }
607
+
608
+ mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
609
+ {
610
+ mm_idx_t *mi;
611
+ if (r->is_idx) {
612
+ mi = mm_idx_load(r->fp.idx);
613
+ if (mi && mm_verbose >= 2 && (mi->k != r->opt.k || mi->w != r->opt.w || (mi->flag&MM_I_HPC) != (r->opt.flag&MM_I_HPC)))
614
+ fprintf(stderr, "[WARNING]\033[1;31m Indexing parameters (-k, -w or -H) overridden by parameters used in the prebuilt index.\033[0m\n");
615
+ } else
616
+ mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.flag, r->opt.mini_batch_size, n_threads, r->opt.batch_size);
617
+ if (mi) {
618
+ if (r->fp_out) mm_idx_dump(r->fp_out, mi);
619
+ mi->index = r->n_parts++;
620
+ }
621
+ return mi;
622
+ }
623
+
624
+ int mm_idx_reader_eof(const mm_idx_reader_t *r) // TODO: in extremely rare cases, mm_bseq_eof() might not work
625
+ {
626
+ return r->is_idx? (feof(r->fp.idx) || ftell(r->fp.idx) == r->idx_size) : mm_bseq_eof(r->fp.seq);
627
+ }
628
+
629
+ #include <ctype.h>
630
+ #include <zlib.h>
631
+ #include "ksort.h"
632
+ #include "kseq.h"
633
+ KSTREAM_DECLARE(gzFile, gzread)
634
+
635
+ int mm_idx_alt_read(mm_idx_t *mi, const char *fn)
636
+ {
637
+ int n_alt = 0;
638
+ gzFile fp;
639
+ kstream_t *ks;
640
+ kstring_t str = {0,0,0};
641
+ fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
642
+ if (fp == 0) return -1;
643
+ ks = ks_init(fp);
644
+ if (mi->h == 0) mm_idx_index_name(mi);
645
+ while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0) {
646
+ char *p;
647
+ int id;
648
+ for (p = str.s; *p && !isspace(*p); ++p) { }
649
+ *p = 0;
650
+ id = mm_idx_name2id(mi, str.s);
651
+ if (id >= 0) mi->seq[id].is_alt = 1, ++n_alt;
652
+ }
653
+ mi->n_alt = n_alt;
654
+ if (mm_verbose >= 3)
655
+ fprintf(stderr, "[M::%s] found %d ALT contigs\n", __func__, n_alt);
656
+ return n_alt;
657
+ }
658
+
659
+ #define sort_key_bed(a) ((a).st)
660
+ KRADIX_SORT_INIT(bed, mm_idx_intv1_t, sort_key_bed, 4)
661
+
662
+ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc)
663
+ {
664
+ gzFile fp;
665
+ kstream_t *ks;
666
+ kstring_t str = {0,0,0};
667
+ mm_idx_intv_t *I;
668
+
669
+ fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
670
+ if (fp == 0) return 0;
671
+ I = (mm_idx_intv_t*)calloc(mi->n_seq, sizeof(*I));
672
+ ks = ks_init(fp);
673
+ while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0) {
674
+ mm_idx_intv_t *r;
675
+ mm_idx_intv1_t t = {-1,-1,-1,-1,0};
676
+ char *p, *q, *bl, *bs;
677
+ int32_t i, id = -1, n_blk = 0;
678
+ for (p = q = str.s, i = 0;; ++p) {
679
+ if (*p == 0 || *p == '\t') {
680
+ int32_t c = *p;
681
+ *p = 0;
682
+ if (i == 0) { // chr
683
+ id = mm_idx_name2id(mi, q);
684
+ if (id < 0) break; // unknown name; TODO: throw a warning
685
+ } else if (i == 1) { // start
686
+ t.st = atol(q); // TODO: watch out integer overflow!
687
+ if (t.st < 0) break;
688
+ } else if (i == 2) { // end
689
+ t.en = atol(q);
690
+ if (t.en < 0) break;
691
+ } else if (i == 4) { // BED score
692
+ t.score = atol(q);
693
+ } else if (i == 5) { // strand
694
+ t.strand = *q == '+'? 1 : *q == '-'? -1 : 0;
695
+ } else if (i == 9) {
696
+ if (!isdigit(*q)) break;
697
+ n_blk = atol(q);
698
+ } else if (i == 10) {
699
+ bl = q;
700
+ } else if (i == 11) {
701
+ bs = q;
702
+ break;
703
+ }
704
+ if (c == 0) break;
705
+ ++i, q = p + 1;
706
+ }
707
+ }
708
+ if (id < 0 || t.st < 0 || t.st >= t.en) continue;
709
+ r = &I[id];
710
+ if (i >= 11 && read_junc) { // BED12
711
+ int32_t st, sz, en;
712
+ st = strtol(bs, &bs, 10); ++bs;
713
+ sz = strtol(bl, &bl, 10); ++bl;
714
+ en = t.st + st + sz;
715
+ for (i = 1; i < n_blk; ++i) {
716
+ mm_idx_intv1_t s = t;
717
+ if (r->n == r->m) {
718
+ r->m = r->m? r->m + (r->m>>1) : 16;
719
+ r->a = (mm_idx_intv1_t*)realloc(r->a, sizeof(*r->a) * r->m);
720
+ }
721
+ st = strtol(bs, &bs, 10); ++bs;
722
+ sz = strtol(bl, &bl, 10); ++bl;
723
+ s.st = en, s.en = t.st + st;
724
+ en = t.st + st + sz;
725
+ if (s.en > s.st) r->a[r->n++] = s;
726
+ }
727
+ } else {
728
+ if (r->n == r->m) {
729
+ r->m = r->m? r->m + (r->m>>1) : 16;
730
+ r->a = (mm_idx_intv1_t*)realloc(r->a, sizeof(*r->a) * r->m);
731
+ }
732
+ r->a[r->n++] = t;
733
+ }
734
+ }
735
+ free(str.s);
736
+ ks_destroy(ks);
737
+ gzclose(fp);
738
+ return I;
739
+ }
740
+
741
+ int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc)
742
+ {
743
+ int32_t i;
744
+ if (mi->h == 0) mm_idx_index_name(mi);
745
+ mi->I = mm_idx_read_bed(mi, fn, read_junc);
746
+ if (mi->I == 0) return -1;
747
+ for (i = 0; i < mi->n_seq; ++i) // TODO: eliminate redundant intervals
748
+ radix_sort_bed(mi->I[i].a, mi->I[i].a + mi->I[i].n);
749
+ return 0;
750
+ }
751
+
752
+ int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s)
753
+ {
754
+ int32_t i, left, right;
755
+ mm_idx_intv_t *r;
756
+ memset(s, 0, en - st);
757
+ if (mi->I == 0 || ctg < 0 || ctg >= mi->n_seq) return -1;
758
+ r = &mi->I[ctg];
759
+ left = 0, right = r->n;
760
+ while (right > left) {
761
+ int32_t mid = left + ((right - left) >> 1);
762
+ if (r->a[mid].st >= st) right = mid;
763
+ else left = mid + 1;
764
+ }
765
+ for (i = left; i < r->n; ++i) {
766
+ if (st <= r->a[i].st && en >= r->a[i].en && r->a[i].strand != 0) {
767
+ if (r->a[i].strand > 0) {
768
+ s[r->a[i].st - st] |= 1, s[r->a[i].en - 1 - st] |= 2;
769
+ } else {
770
+ s[r->a[i].st - st] |= 8, s[r->a[i].en - 1 - st] |= 4;
771
+ }
772
+ }
773
+ }
774
+ return left;
775
+ }