minimap2 0.2.22.0 → 0.2.24.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +55 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +821 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +369 -0
  41. data/ext/minimap2/main.c +459 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +410 -0
  44. data/ext/minimap2/minimap2.1 +725 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +132 -0
  50. data/ext/minimap2/options.c +234 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/lib/minimap2/aligner.rb +4 -4
  93. data/lib/minimap2/alignment.rb +11 -11
  94. data/lib/minimap2/ffi/constants.rb +20 -16
  95. data/lib/minimap2/ffi/functions.rb +5 -0
  96. data/lib/minimap2/ffi.rb +4 -5
  97. data/lib/minimap2/version.rb +2 -2
  98. data/lib/minimap2.rb +51 -15
  99. metadata +97 -79
  100. data/lib/minimap2/ffi_helper.rb +0 -53
  101. data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,714 @@
1
+ #include <stdlib.h>
2
+ #include <string.h>
3
+ #include <assert.h>
4
+ #include <errno.h>
5
+ #include "kthread.h"
6
+ #include "kvec.h"
7
+ #include "kalloc.h"
8
+ #include "sdust.h"
9
+ #include "mmpriv.h"
10
+ #include "bseq.h"
11
+ #include "khash.h"
12
+
13
// Per-thread buffer used by mm_map_frag(). Holds the thread-local kalloc
// memory pool plus per-fragment results handed back to the caller.
struct mm_tbuf_s {
	void *km;               // thread-local kalloc pool; NULL when kalloc is disabled for debugging
	int rep_len, frag_gap;  // repetitive-seed length and max reference gap from the last mapped fragment
};
17
+
18
+ mm_tbuf_t *mm_tbuf_init(void)
19
+ {
20
+ mm_tbuf_t *b;
21
+ b = (mm_tbuf_t*)calloc(1, sizeof(mm_tbuf_t));
22
+ if (!(mm_dbg_flag & 1)) b->km = km_init();
23
+ return b;
24
+ }
25
+
26
+ void mm_tbuf_destroy(mm_tbuf_t *b)
27
+ {
28
+ if (b == 0) return;
29
+ km_destroy(b->km);
30
+ free(b);
31
+ }
32
+
33
// Expose the thread-local kalloc pool of a per-thread buffer (may be NULL
// when kalloc is disabled by the debug flag).
void *mm_tbuf_get_km(mm_tbuf_t *b)
{
	return b->km;
}
37
+
38
// Filter out minimizers that significantly overlap low-complexity regions
// (LCRs) reported by SDUST. Each LCR in dreg[] is packed as start<<32|end.
// a[] is compacted in place; returns the new number of minimizers.
// sdust_thres <= 0 disables the filter.
static int mm_dust_minier(void *km, int n, mm128_t *a, int l_seq, const char *seq, int sdust_thres)
{
	int n_dreg, j, k, u = 0;
	const uint64_t *dreg;
	sdust_buf_t *sdb;
	if (sdust_thres <= 0) return n;
	sdb = sdust_buf_init(km);
	dreg = sdust_core((const uint8_t*)seq, l_seq, sdust_thres, 64, &n_dreg, sdb);
	for (j = k = 0; j < n; ++j) { // squeeze out minimizers that significantly overlap with LCRs
		// a[j].y>>1 is the minimizer's query end position; a[j].x&0xff its span
		int32_t qpos = (uint32_t)a[j].y>>1, span = a[j].x&0xff;
		int32_t s = qpos - (span - 1), e = s + span; // [s,e): minimizer interval on the query
		while (u < n_dreg && (int32_t)dreg[u] <= s) ++u; // skip LCRs that end at or before s
		if (u < n_dreg && (int32_t)(dreg[u]>>32) < e) {
			int v, l = 0;
			for (v = u; v < n_dreg && (int32_t)(dreg[v]>>32) < e; ++v) { // iterate over LCRs overlapping this minimizer
				int ss = s > (int32_t)(dreg[v]>>32)? s : dreg[v]>>32;
				int ee = e < (int32_t)dreg[v]? e : (uint32_t)dreg[v];
				l += ee - ss; // accumulate overlap length
			}
			if (l <= span>>1) a[k++] = a[j]; // keep the minimizer if less than half of it falls in masked region
		} else a[k++] = a[j];
	}
	sdust_buf_destroy(sdb);
	return k; // the new size
}
63
+
64
// Collect minimizers for all segments of a fragment into mv. Query positions
// of later segments are offset by the cumulative length of earlier segments
// (sum<<1, since the position field is stored shifted left by one bit).
// Optionally applies the SDUST low-complexity filter per segment.
static void collect_minimizers(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, mm128_v *mv)
{
	int i, n, sum = 0;
	mv->n = 0;
	for (i = n = 0; i < n_segs; ++i) {
		size_t j;
		mm_sketch(km, seqs[i], qlens[i], mi->w, mi->k, i, mi->flag&MM_I_HPC, mv);
		for (j = n; j < mv->n; ++j) // shift this segment's minimizers to fragment coordinates
			mv->a[j].y += sum << 1;
		if (opt->sdust_thres > 0) // mask low-complexity minimizers
			mv->n = n + mm_dust_minier(km, mv->n - n, mv->a + n, qlens[i], seqs[i], opt->sdust_thres);
		sum += qlens[i], n = mv->n;
	}
}
78
+
79
+ #include "ksort.h"
80
+ #define heap_lt(a, b) ((a).x > (b).x)
81
+ KSORT_INIT(heap, mm128_t, heap_lt)
82
+
83
// Decide whether a seed hit r (reference position word) should be skipped,
// honoring the no-diagonal, no-dual (all-vs-all map-once), forward-only and
// reverse-only modes. Sets *is_self when query and reference are the same
// sequence on the same strand. Returns 1 to skip the seed, 0 to keep it.
static inline int skip_seed(int flag, uint64_t r, const mm_seed_t *q, const char *qname, int qlen, const mm_idx_t *mi, int *is_self)
{
	*is_self = 0;
	if (qname && (flag & (MM_F_NO_DIAG|MM_F_NO_DUAL))) {
		const mm_idx_seq_t *s = &mi->seq[r>>32];
		int cmp;
		cmp = strcmp(qname, s->name);
		if ((flag&MM_F_NO_DIAG) && cmp == 0 && (int)s->len == qlen) {
			if ((uint32_t)r>>1 == (q->q_pos>>1)) return 1; // avoid the diagonal anchors
			if ((r&1) == (q->q_pos&1)) *is_self = 1; // this flag is used to avoid spurious extension on self chain
		}
		if ((flag&MM_F_NO_DUAL) && cmp > 0) // all-vs-all mode: map once
			return 1;
	}
	if (flag & (MM_F_FOR_ONLY|MM_F_REV_ONLY)) {
		if ((r&1) == (q->q_pos&1)) { // forward strand
			if (flag & MM_F_REV_ONLY) return 1;
		} else {
			if (flag & MM_F_FOR_ONLY) return 1;
		}
	}
	return 0;
}
106
+
107
// Collect seed anchors via an n_m-way heap merge so the output comes out
// sorted by reference position without an explicit radix sort. Forward-strand
// anchors are filled from the front of a[] and reverse-strand anchors from
// the back (in descending order, fixed up afterwards). On return *n_a is the
// number of anchors kept after skip_seed() filtering.
static mm128_t *collect_seed_hits_heap(void *km, const mm_mapopt_t *opt, int max_occ, const mm_idx_t *mi, const char *qname, const mm128_v *mv, int qlen, int64_t *n_a, int *rep_len,
									   int *n_mini_pos, uint64_t **mini_pos)
{
	int i, n_m, heap_size = 0;
	int64_t j, n_for = 0, n_rev = 0;
	mm_seed_t *m;
	mm128_t *a, *heap;

	m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);

	heap = (mm128_t*)kmalloc(km, n_m * sizeof(mm128_t));
	a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t));

	// seed the heap with the first occurrence of every matched minimizer;
	// heap->y keeps seed index (high 32 bits) and occurrence index (low 32)
	for (i = 0, heap_size = 0; i < n_m; ++i) {
		if (m[i].n > 0) {
			heap[heap_size].x = m[i].cr[0];
			heap[heap_size].y = (uint64_t)i<<32;
			++heap_size;
		}
	}
	ks_heapmake_heap(heap_size, heap);
	while (heap_size > 0) {
		mm_seed_t *q = &m[heap->y>>32]; // the seed owning the smallest pending occurrence
		mm128_t *p;
		uint64_t r = heap->x;
		int32_t is_self, rpos = (uint32_t)r >> 1;
		if (!skip_seed(opt->flag, r, q, qname, qlen, mi, &is_self)) {
			if ((r&1) == (q->q_pos&1)) { // forward strand
				p = &a[n_for++];
				p->x = (r&0xffffffff00000000ULL) | rpos;
				p->y = (uint64_t)q->q_span << 32 | q->q_pos >> 1;
			} else { // reverse strand
				p = &a[(*n_a) - (++n_rev)]; // filled from the back of a[]
				p->x = 1ULL<<63 | (r&0xffffffff00000000ULL) | rpos;
				p->y = (uint64_t)q->q_span << 32 | (qlen - ((q->q_pos>>1) + 1 - q->q_span) - 1);
			}
			p->y |= (uint64_t)q->seg_id << MM_SEED_SEG_SHIFT;
			if (q->is_tandem) p->y |= MM_SEED_TANDEM;
			if (is_self) p->y |= MM_SEED_SELF;
		}
		// update the heap: advance to the seed's next occurrence, or pop
		if ((uint32_t)heap->y < q->n - 1) {
			++heap[0].y;
			heap[0].x = m[heap[0].y>>32].cr[(uint32_t)heap[0].y];
		} else {
			heap[0] = heap[heap_size - 1];
			--heap_size;
		}
		ks_heapdown_heap(0, heap_size, heap);
	}
	kfree(km, m);
	kfree(km, heap);

	// reverse anchors on the reverse strand, as they are in the descending order
	for (j = 0; j < n_rev>>1; ++j) {
		mm128_t t = a[(*n_a) - 1 - j];
		a[(*n_a) - 1 - j] = a[(*n_a) - (n_rev - j)];
		a[(*n_a) - (n_rev - j)] = t;
	}
	// close the gap left by skipped seeds between the two halves of a[]
	if (*n_a > n_for + n_rev) {
		memmove(a + n_for, a + (*n_a) - n_rev, n_rev * sizeof(mm128_t));
		*n_a = n_for + n_rev;
	}
	return a;
}
172
+
173
// Collect seed anchors by iterating over every occurrence of every matched
// minimizer, then radix-sorting the anchors. Anchor encoding: x = strand<<63 |
// rid<<32 | rpos; y = span<<32 | seg_id<<MM_SEED_SEG_SHIFT | flags | qpos.
// In query-strand mode (MM_F_QSTRAND) reverse hits keep query coordinates and
// the reference coordinate is flipped instead.
static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ, const mm_idx_t *mi, const char *qname, const mm128_v *mv, int qlen, int64_t *n_a, int *rep_len,
								  int *n_mini_pos, uint64_t **mini_pos)
{
	int i, n_m;
	mm_seed_t *m;
	mm128_t *a;
	m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
	a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t));
	for (i = 0, *n_a = 0; i < n_m; ++i) { // *n_a is recounted: skip_seed() may drop hits
		mm_seed_t *q = &m[i];
		const uint64_t *r = q->cr;
		uint32_t k;
		for (k = 0; k < q->n; ++k) {
			int32_t is_self, rpos = (uint32_t)r[k] >> 1;
			mm128_t *p;
			if (skip_seed(opt->flag, r[k], q, qname, qlen, mi, &is_self)) continue;
			p = &a[(*n_a)++];
			if ((r[k]&1) == (q->q_pos&1)) { // forward strand
				p->x = (r[k]&0xffffffff00000000ULL) | rpos;
				p->y = (uint64_t)q->q_span << 32 | q->q_pos >> 1;
			} else if (!(opt->flag & MM_F_QSTRAND)) { // reverse strand and not in the query-strand mode
				p->x = 1ULL<<63 | (r[k]&0xffffffff00000000ULL) | rpos;
				p->y = (uint64_t)q->q_span << 32 | (qlen - ((q->q_pos>>1) + 1 - q->q_span) - 1);
			} else { // reverse strand; query-strand
				int32_t len = mi->seq[r[k]>>32].len;
				p->x = 1ULL<<63 | (r[k]&0xffffffff00000000ULL) | (len - (rpos + 1 - q->q_span) - 1); // coordinate only accurate for non-HPC seeds
				p->y = (uint64_t)q->q_span << 32 | q->q_pos >> 1;
			}
			p->y |= (uint64_t)q->seg_id << MM_SEED_SEG_SHIFT;
			if (q->is_tandem) p->y |= MM_SEED_TANDEM;
			if (is_self) p->y |= MM_SEED_SELF;
		}
	}
	kfree(km, m);
	radix_sort_128x(a, a + (*n_a));
	return a;
}
210
+
211
// Post-process chains: pick primary chains and drop weak secondary ones,
// unless MM_F_ALL_CHAINS asks to keep everything. Multi-segment fragments
// use the multi-segment selection variant.
static void chain_post(const mm_mapopt_t *opt, int max_chain_gap_ref, const mm_idx_t *mi, void *km, int qlen, int n_segs, const int *qlens, int *n_regs, mm_reg1_t *regs, mm128_t *a)
{
	if (!(opt->flag & MM_F_ALL_CHAINS)) { // don't choose primary mapping(s)
		mm_set_parent(km, opt->mask_level, opt->mask_len, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop);
		if (n_segs <= 1) mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, 1, opt->max_gap * 0.8, n_regs, regs);
		else mm_select_sub_multi(km, opt->pri_ratio, 0.2f, 0.7f, max_chain_gap_ref, mi->k*2, opt->best_n, n_segs, qlens, n_regs, regs);
	}
}
219
+
220
// Run base-level alignment on the chained regions when CIGAR output is
// requested, then redo primary selection since alignment scores may reorder
// regions. Returns the (possibly reallocated) region array.
static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *km, int qlen, const char *seq, int *n_regs, mm_reg1_t *regs, mm128_t *a)
{
	if (!(opt->flag & MM_F_CIGAR)) return regs;
	regs = mm_align_skeleton(km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs()
	if (!(opt->flag & MM_F_ALL_CHAINS)) { // don't choose primary mapping(s)
		mm_set_parent(km, opt->mask_level, opt->mask_len, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop);
		mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, 0, opt->max_gap * 0.8, n_regs, regs);
		mm_set_sam_pri(*n_regs, regs);
	}
	return regs;
}
231
+
232
// Map one fragment (one read, or a set of paired/multi-segment reads) against
// the index: collect minimizers, gather seed anchors, chain them (DP or RMQ),
// optionally re-chain (long-join for long reads, higher max_occ for short
// reads), generate regions, and align. Results are returned per segment via
// n_regs[]/regs[] (caller owns regs[i]). b carries the thread-local kalloc
// pool and receives rep_len/frag_gap as side outputs.
void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)
{
	int i, j, rep_len, qlen_sum, n_regs0, n_mini_pos;
	int max_chain_gap_qry, max_chain_gap_ref, is_splice = !!(opt->flag & MM_F_SPLICE), is_sr = !!(opt->flag & MM_F_SR);
	uint32_t hash;
	int64_t n_a;
	uint64_t *u, *mini_pos;
	mm128_t *a;
	mm128_v mv = {0,0,0};
	mm_reg1_t *regs0;
	km_stat_t kmst;
	float chn_pen_gap, chn_pen_skip;

	for (i = 0, qlen_sum = 0; i < n_segs; ++i)
		qlen_sum += qlens[i], n_regs[i] = 0, regs[i] = 0;

	if (qlen_sum == 0 || n_segs <= 0 || n_segs > MM_MAX_SEG) return;
	if (opt->max_qlen > 0 && qlen_sum > opt->max_qlen) return;

	// per-read hash used for deterministic tie-breaking downstream
	hash = qname && !(opt->flag & MM_F_NO_HASH_NAME)? __ac_X31_hash_string(qname) : 0;
	hash ^= __ac_Wang_hash(qlen_sum) + __ac_Wang_hash(opt->seed);
	hash = __ac_Wang_hash(hash);

	collect_minimizers(b->km, opt, mi, n_segs, qlens, seqs, &mv);
	if (opt->q_occ_frac > 0.0f) mm_seed_mz_flt(b->km, &mv, opt->mid_occ, opt->q_occ_frac); // drop over-frequent query minimizers
	if (opt->flag & MM_F_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->mid_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
	else a = collect_seed_hits(b->km, opt, opt->mid_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);

	if (mm_dbg_flag & MM_DBG_PRINT_SEED) {
		fprintf(stderr, "RS\t%d\n", rep_len);
		for (i = 0; i < n_a; ++i)
			fprintf(stderr, "SD\t%s\t%d\t%c\t%d\t%d\t%d\n", mi->seq[a[i].x<<1>>33].name, (int32_t)a[i].x, "+-"[a[i].x>>63], (int32_t)a[i].y, (int32_t)(a[i].y>>32&0xff),
					i == 0? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x));
	}

	// set max chaining gap on the query and the reference sequence
	if (is_sr)
		max_chain_gap_qry = qlen_sum > opt->max_gap? qlen_sum : opt->max_gap;
	else max_chain_gap_qry = opt->max_gap;
	if (opt->max_gap_ref > 0) {
		max_chain_gap_ref = opt->max_gap_ref; // always honor mm_mapopt_t::max_gap_ref if set
	} else if (opt->max_frag_len > 0) {
		max_chain_gap_ref = opt->max_frag_len - qlen_sum;
		if (max_chain_gap_ref < opt->max_gap) max_chain_gap_ref = opt->max_gap;
	} else max_chain_gap_ref = opt->max_gap;

	// chaining penalties scale with k
	chn_pen_gap = opt->chain_gap_scale * 0.01 * mi->k;
	chn_pen_skip = opt->chain_skip_scale * 0.01 * mi->k;
	if (opt->flag & MM_F_RMQ) { // RMQ-based chaining (minigraph-style)
		a = mg_lchain_rmq(opt->max_gap, opt->rmq_inner_dist, opt->bw, opt->max_chain_skip, opt->rmq_size_cap, opt->min_cnt, opt->min_chain_score,
						  chn_pen_gap, chn_pen_skip, n_a, a, &n_regs0, &u, b->km);
	} else { // classic DP chaining
		a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score,
						 chn_pen_gap, chn_pen_skip, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
	}

	if (opt->bw_long > opt->bw && (opt->flag & (MM_F_SPLICE|MM_F_SR|MM_F_NO_LJOIN)) == 0 && n_segs == 1 && n_regs0 > 1) { // re-chain/long-join for long sequences
		int32_t st = (int32_t)a[0].y, en = (int32_t)a[(int32_t)u[0] - 1].y;
		if (qlen_sum - (en - st) > opt->rmq_rescue_size || en - st > qlen_sum * opt->rmq_rescue_ratio) {
			int32_t i;
			for (i = 0, n_a = 0; i < n_regs0; ++i) n_a += (int32_t)u[i];
			kfree(b->km, u);
			radix_sort_128x(a, a + n_a); // chaining reordered a[]; restore sort order before RMQ re-chaining
			a = mg_lchain_rmq(opt->max_gap, opt->rmq_inner_dist, opt->bw_long, opt->max_chain_skip, opt->rmq_size_cap, opt->min_cnt, opt->min_chain_score,
							  chn_pen_gap, chn_pen_skip, n_a, a, &n_regs0, &u, b->km);
		}
	} else if (opt->max_occ > opt->mid_occ && rep_len > 0 && !(opt->flag & MM_F_RMQ)) { // re-chain, mostly for short reads
		int rechain = 0;
		if (n_regs0 > 0) { // test if the best chain has all the segments
			int n_chained_segs = 1, max = 0, max_i = -1, max_off = -1, off = 0;
			for (i = 0; i < n_regs0; ++i) { // find the best chain
				if (max < (int)(u[i]>>32)) max = u[i]>>32, max_i = i, max_off = off;
				off += (uint32_t)u[i];
			}
			for (i = 1; i < (int32_t)u[max_i]; ++i) // count the number of segments in the best chain
				if ((a[max_off+i].y&MM_SEED_SEG_MASK) != (a[max_off+i-1].y&MM_SEED_SEG_MASK))
					++n_chained_segs;
			if (n_chained_segs < n_segs)
				rechain = 1;
		} else rechain = 1;
		if (rechain) { // redo chaining with a higher max_occ threshold
			kfree(b->km, a);
			kfree(b->km, u);
			kfree(b->km, mini_pos);
			if (opt->flag & MM_F_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
			else a = collect_seed_hits(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
			a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score,
							 chn_pen_gap, chn_pen_skip, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
		}
	}
	b->frag_gap = max_chain_gap_ref;
	b->rep_len = rep_len;

	regs0 = mm_gen_regs(b->km, hash, qlen_sum, n_regs0, u, a, !!(opt->flag&MM_F_QSTRAND));
	if (mi->n_alt) {
		mm_mark_alt(mi, n_regs0, regs0);
		mm_hit_sort(b->km, &n_regs0, regs0, opt->alt_drop); // this step can be merged into mm_gen_regs(); will do if this shows up in profile
	}

	if (mm_dbg_flag & (MM_DBG_PRINT_SEED|MM_DBG_PRINT_CHAIN))
		for (j = 0; j < n_regs0; ++j)
			for (i = regs0[j].as; i < regs0[j].as + regs0[j].cnt; ++i)
				fprintf(stderr, "CN\t%d\t%s\t%d\t%c\t%d\t%d\t%d\n", j, mi->seq[a[i].x<<1>>33].name, (int32_t)a[i].x, "+-"[a[i].x>>63], (int32_t)a[i].y, (int32_t)(a[i].y>>32&0xff),
						i == regs0[j].as? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x));

	chain_post(opt, max_chain_gap_ref, mi, b->km, qlen_sum, n_segs, qlens, &n_regs0, regs0, a);
	if (!is_sr && !(opt->flag&MM_F_QSTRAND)) {
		mm_est_err(mi, qlen_sum, n_regs0, regs0, a, n_mini_pos, mini_pos);
		n_regs0 = mm_filter_strand_retained(n_regs0, regs0);
	}

	if (n_segs == 1) { // uni-segment
		regs0 = align_regs(opt, mi, b->km, qlens[0], seqs[0], &n_regs0, regs0, a);
		regs0 = (mm_reg1_t*)realloc(regs0, sizeof(*regs0) * n_regs0); // shrink to fit
		mm_set_mapq(b->km, n_regs0, regs0, opt->min_chain_score, opt->a, rep_len, is_sr);
		n_regs[0] = n_regs0, regs[0] = regs0;
	} else { // multi-segment
		mm_seg_t *seg;
		seg = mm_seg_gen(b->km, hash, n_segs, qlens, n_regs0, regs0, n_regs, regs, a); // split fragment chain to separate segment chains
		free(regs0);
		for (i = 0; i < n_segs; ++i) {
			mm_set_parent(b->km, opt->mask_level, opt->mask_len, n_regs[i], regs[i], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); // update mm_reg1_t::parent
			regs[i] = align_regs(opt, mi, b->km, qlens[i], seqs[i], &n_regs[i], regs[i], seg[i].a);
			mm_set_mapq(b->km, n_regs[i], regs[i], opt->min_chain_score, opt->a, rep_len, is_sr);
		}
		mm_seg_free(b->km, n_segs, seg);
		if (n_segs == 2 && opt->pe_ori >= 0 && (opt->flag&MM_F_CIGAR))
			mm_pair(b->km, max_chain_gap_ref, opt->pe_bonus, opt->a * 2 + opt->b, opt->a, qlens, n_regs, regs); // pairing
	}

	kfree(b->km, mv.a);
	kfree(b->km, a);
	kfree(b->km, u);
	kfree(b->km, mini_pos);

	if (b->km) {
		km_stat(b->km, &kmst);
		if (mm_dbg_flag & MM_DBG_PRINT_QNAME)
			fprintf(stderr, "QM\t%s\t%d\tcap=%ld,nCore=%ld,largest=%ld\n", qname, qlen_sum, kmst.capacity, kmst.n_cores, kmst.largest);
		assert(kmst.n_blocks == kmst.n_cores); // otherwise, there is a memory leak
		// reset the pool when it grows too big, to bound per-thread memory
		if (kmst.largest > 1U<<28 || (opt->cap_kalloc > 0 && kmst.capacity > opt->cap_kalloc)) {
			if (mm_dbg_flag & MM_DBG_PRINT_QNAME)
				fprintf(stderr, "[W::%s] reset thread-local memory after read %s\n", __func__, qname);
			km_destroy(b->km);
			b->km = km_init();
		}
	}
}
380
+
381
+ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)
382
+ {
383
+ mm_reg1_t *regs;
384
+ mm_map_frag(mi, 1, &qlen, &seq, n_regs, &regs, b, opt, qname);
385
+ return regs;
386
+ }
387
+
388
+ /**************************
389
+ * Multi-threaded mapping *
390
+ **************************/
391
+
392
// Shared state of the 3-step kt_pipeline() (read -> map -> write).
typedef struct {
	int n_processed, n_threads, n_fp;  // reads seen so far; worker threads; input files
	int64_t mini_batch_size;           // bases per pipeline batch
	const mm_mapopt_t *opt;
	mm_bseq_file_t **fp;               // one open sequence file per segment
	const mm_idx_t *mi;
	kstring_t str;                     // reusable output line buffer

	// split-index (--split-prefix) support
	int n_parts;                       // >0 when merging per-part hit files
	uint32_t *rid_shift;               // rid offset of each index part
	FILE *fp_split, **fp_parts;        // temp file to write; per-part files to merge
} pipeline_t;
404
+
405
// One batch of reads flowing through the pipeline. Fragments group
// consecutive sequences with the same base name (paired segments).
typedef struct {
	const pipeline_t *p;
	int n_seq, n_frag;                 // sequences and fragments in this batch
	mm_bseq1_t *seq;
	// per-sequence arrays; seg_off/n_seg are indexed per fragment
	int *n_reg, *seg_off, *n_seg, *rep_len, *frag_gap;
	mm_reg1_t **reg;                   // per-sequence mapping results
	mm_tbuf_t **buf;                   // per-thread buffers
} step_t;
413
+
414
// Map fragment i of the batch on thread tid. For paired segments, reads are
// reverse-complemented before mapping according to pe_ori and flipped back
// (with their hit coordinates) afterwards, so output is on the original
// read strand.
static void worker_for(void *_data, long i, int tid) // kt_for() callback
{
	step_t *s = (step_t*)_data;
	int qlens[MM_MAX_SEG], j, off = s->seg_off[i], pe_ori = s->p->opt->pe_ori;
	const char *qseqs[MM_MAX_SEG];
	double t = 0.0;
	mm_tbuf_t *b = s->buf[tid];
	assert(s->n_seg[i] <= MM_MAX_SEG);
	if (mm_dbg_flag & MM_DBG_PRINT_QNAME) {
		fprintf(stderr, "QR\t%s\t%d\t%d\n", s->seq[off].name, tid, s->seq[off].l_seq);
		t = realtime();
	}
	for (j = 0; j < s->n_seg[i]; ++j) {
		// orient paired ends per pe_ori before mapping
		if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1))))
			mm_revcomp_bseq(&s->seq[off + j]);
		qlens[j] = s->seq[off + j].l_seq;
		qseqs[j] = s->seq[off + j].seq;
	}
	if (s->p->opt->flag & MM_F_INDEPEND_SEG) { // map each segment on its own
		for (j = 0; j < s->n_seg[i]; ++j) {
			mm_map_frag(s->p->mi, 1, &qlens[j], &qseqs[j], &s->n_reg[off+j], &s->reg[off+j], b, s->p->opt, s->seq[off+j].name);
			s->rep_len[off + j] = b->rep_len;
			s->frag_gap[off + j] = b->frag_gap;
		}
	} else { // map all segments jointly as one fragment
		mm_map_frag(s->p->mi, s->n_seg[i], qlens, qseqs, &s->n_reg[off], &s->reg[off], b, s->p->opt, s->seq[off].name);
		for (j = 0; j < s->n_seg[i]; ++j) {
			s->rep_len[off + j] = b->rep_len;
			s->frag_gap[off + j] = b->frag_gap;
		}
	}
	for (j = 0; j < s->n_seg[i]; ++j) // flip the query strand and coordinate to the original read strand
		if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1)))) {
			int k, t;
			mm_revcomp_bseq(&s->seq[off + j]);
			for (k = 0; k < s->n_reg[off + j]; ++k) {
				mm_reg1_t *r = &s->reg[off + j][k];
				t = r->qs;
				r->qs = qlens[j] - r->qe;
				r->qe = qlens[j] - t;
				r->rev = !r->rev;
			}
		}
	if (mm_dbg_flag & MM_DBG_PRINT_QNAME)
		fprintf(stderr, "QT\t%s\t%d\t%.6f\n", s->seq[off].name, tid, realtime() - t);
}
460
+
461
// Split-index merge: for every sequence in the batch, read back the hits
// written by each index part (same record layout as the step-2 temp-file
// writer), concatenate them, shift reference ids by the part's rid offset,
// then redo sorting, parent/primary selection, MAPQ and pairing over the
// merged set.
static void merge_hits(step_t *s)
{
	int f, i, k0, k, max_seg = 0, *n_reg_part, *rep_len_part, *frag_gap_part, *qlens;
	void *km;
	FILE **fp = s->p->fp_parts;
	const mm_mapopt_t *opt = s->p->opt;

	km = km_init();
	for (f = 0; f < s->n_frag; ++f)
		max_seg = max_seg > s->n_seg[f]? max_seg : s->n_seg[f];
	// one allocation carved into four arrays
	qlens = CALLOC(int, max_seg + s->p->n_parts * 3);
	n_reg_part = qlens + max_seg;
	rep_len_part = n_reg_part + s->p->n_parts;
	frag_gap_part = rep_len_part + s->p->n_parts;
	for (f = 0, k = k0 = 0; f < s->n_frag; ++f) {
		k0 = k; // index of the fragment's first segment, needed for pairing
		for (i = 0; i < s->n_seg[f]; ++i, ++k) {
			int j, l, t, rep_len = 0;
			qlens[i] = s->seq[k].l_seq;
			for (j = 0, s->n_reg[k] = 0; j < s->p->n_parts; ++j) { // per-part headers
				mm_err_fread(&n_reg_part[j], sizeof(int), 1, fp[j]);
				mm_err_fread(&rep_len_part[j], sizeof(int), 1, fp[j]);
				mm_err_fread(&frag_gap_part[j], sizeof(int), 1, fp[j]);
				s->n_reg[k] += n_reg_part[j];
				if (rep_len < rep_len_part[j]) // keep the max rep_len across parts
					rep_len = rep_len_part[j];
			}
			s->reg[k] = CALLOC(mm_reg1_t, s->n_reg[k]);
			for (j = 0, l = 0; j < s->p->n_parts; ++j) { // per-part hit records
				for (t = 0; t < n_reg_part[j]; ++t, ++l) {
					mm_reg1_t *r = &s->reg[k][l];
					uint32_t capacity;
					mm_err_fread(r, sizeof(mm_reg1_t), 1, fp[j]);
					r->rid += s->p->rid_shift[j]; // map part-local rid to global rid
					if (opt->flag & MM_F_CIGAR) { // mm_extra_t payload: capacity 32-bit words
						mm_err_fread(&capacity, 4, 1, fp[j]);
						r->p = (mm_extra_t*)calloc(capacity, 4);
						r->p->capacity = capacity;
						mm_err_fread(r->p, r->p->capacity, 4, fp[j]);
					}
				}
			}
			if (!(opt->flag&MM_F_SR) && s->seq[k].l_seq >= opt->rank_min_len)
				mm_update_dp_max(s->seq[k].l_seq, s->n_reg[k], s->reg[k], opt->rank_frac, opt->a, opt->b);
			for (j = 0; j < s->n_reg[k]; ++j) {
				mm_reg1_t *r = &s->reg[k][j];
				if (r->p) r->p->dp_max2 = 0; // reset ->dp_max2 as mm_set_parent() doesn't clear it; necessary with mm_update_dp_max()
				r->subsc = 0; // this may not be necessary
				r->n_sub = 0; // n_sub will be an underestimate as we don't see all the chains now, but it can't be accurate anyway
			}
			mm_hit_sort(km, &s->n_reg[k], s->reg[k], opt->alt_drop);
			mm_set_parent(km, opt->mask_level, opt->mask_len, s->n_reg[k], s->reg[k], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop);
			if (!(opt->flag & MM_F_ALL_CHAINS)) {
				mm_select_sub(km, opt->pri_ratio, s->p->mi->k*2, opt->best_n, 0, opt->max_gap * 0.8, &s->n_reg[k], s->reg[k]);
				mm_set_sam_pri(s->n_reg[k], s->reg[k]);
			}
			mm_set_mapq(km, s->n_reg[k], s->reg[k], opt->min_chain_score, opt->a, rep_len, !!(opt->flag & MM_F_SR));
		}
		if (s->n_seg[f] == 2 && opt->pe_ori >= 0 && (opt->flag&MM_F_CIGAR))
			mm_pair(km, frag_gap_part[0], opt->pe_bonus, opt->a * 2 + opt->b, opt->a, qlens, &s->n_reg[k0], &s->reg[k0]);
	}
	free(qlens);
	km_destroy(km);
}
525
+
526
// kt_pipeline() callback implementing the 3-step pipeline:
// step 0 reads a mini-batch and groups sequences into fragments;
// step 1 maps (or, in split-merge mode, merges per-part hits);
// step 2 writes SAM/PAF (or per-part temp records) and frees the batch.
static void *worker_pipeline(void *shared, int step, void *in)
{
	int i, j, k;
	pipeline_t *p = (pipeline_t*)shared;
	if (step == 0) { // step 0: read sequences
		int with_qual = (!!(p->opt->flag & MM_F_OUT_SAM) && !(p->opt->flag & MM_F_NO_QUAL));
		int with_comment = !!(p->opt->flag & MM_F_COPY_COMMENT);
		int frag_mode = (p->n_fp > 1 || !!(p->opt->flag & MM_F_FRAG_MODE));
		step_t *s;
		s = (step_t*)calloc(1, sizeof(step_t));
		if (p->n_fp > 1) s->seq = mm_bseq_read_frag2(p->n_fp, p->fp, p->mini_batch_size, with_qual, with_comment, &s->n_seq);
		else s->seq = mm_bseq_read3(p->fp[0], p->mini_batch_size, with_qual, with_comment, frag_mode, &s->n_seq);
		if (s->seq) {
			s->p = p;
			for (i = 0; i < s->n_seq; ++i)
				s->seq[i].rid = p->n_processed++; // globally increasing read id
			s->buf = (mm_tbuf_t**)calloc(p->n_threads, sizeof(mm_tbuf_t*));
			for (i = 0; i < p->n_threads; ++i)
				s->buf[i] = mm_tbuf_init();
			s->n_reg = (int*)calloc(5 * s->n_seq, sizeof(int));
			s->seg_off = s->n_reg + s->n_seq; // seg_off, n_seg, rep_len and frag_gap are allocated together with n_reg
			s->n_seg = s->seg_off + s->n_seq;
			s->rep_len = s->n_seg + s->n_seq;
			s->frag_gap = s->rep_len + s->n_seq;
			s->reg = (mm_reg1_t**)calloc(s->n_seq, sizeof(mm_reg1_t*));
			// group consecutive same-name sequences into fragments
			for (i = 1, j = 0; i <= s->n_seq; ++i)
				if (i == s->n_seq || !frag_mode || !mm_qname_same(s->seq[i-1].name, s->seq[i].name)) {
					s->n_seg[s->n_frag] = i - j;
					s->seg_off[s->n_frag++] = j;
					j = i;
				}
			return s;
		} else free(s); // EOF: returning NULL ends the pipeline
	} else if (step == 1) { // step 1: map
		if (p->n_parts > 0) merge_hits((step_t*)in);
		else kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_frag);
		return in;
	} else if (step == 2) { // step 2: output
		void *km = 0;
		step_t *s = (step_t*)in;
		const mm_idx_t *mi = p->mi;
		for (i = 0; i < p->n_threads; ++i) mm_tbuf_destroy(s->buf[i]);
		free(s->buf);
		if ((p->opt->flag & MM_F_OUT_CS) && !(mm_dbg_flag & MM_DBG_NO_KALLOC)) km = km_init();
		for (k = 0; k < s->n_frag; ++k) {
			int seg_st = s->seg_off[k], seg_en = s->seg_off[k] + s->n_seg[k];
			for (i = seg_st; i < seg_en; ++i) {
				mm_bseq1_t *t = &s->seq[i];
				if (p->opt->split_prefix && p->n_parts == 0) { // then write to temporary files
					// record layout must stay in sync with merge_hits()
					mm_err_fwrite(&s->n_reg[i], sizeof(int), 1, p->fp_split);
					mm_err_fwrite(&s->rep_len[i], sizeof(int), 1, p->fp_split);
					mm_err_fwrite(&s->frag_gap[i], sizeof(int), 1, p->fp_split);
					for (j = 0; j < s->n_reg[i]; ++j) {
						mm_reg1_t *r = &s->reg[i][j];
						mm_err_fwrite(r, sizeof(mm_reg1_t), 1, p->fp_split);
						if (p->opt->flag & MM_F_CIGAR) {
							mm_err_fwrite(&r->p->capacity, 4, 1, p->fp_split);
							mm_err_fwrite(r->p, r->p->capacity, 4, p->fp_split);
						}
					}
				} else if (s->n_reg[i] > 0) { // the query has at least one hit
					for (j = 0; j < s->n_reg[i]; ++j) {
						mm_reg1_t *r = &s->reg[i][j];
						assert(!r->sam_pri || r->id == r->parent);
						if ((p->opt->flag & MM_F_NO_PRINT_2ND) && r->id != r->parent)
							continue;
						if (p->opt->flag & MM_F_OUT_SAM)
							mm_write_sam3(&p->str, mi, t, i - seg_st, j, s->n_seg[k], &s->n_reg[seg_st], (const mm_reg1_t*const*)&s->reg[seg_st], km, p->opt->flag, s->rep_len[i]);
						else
							mm_write_paf3(&p->str, mi, t, r, km, p->opt->flag, s->rep_len[i]);
						mm_err_puts(p->str.s);
					}
				} else if ((p->opt->flag & MM_F_PAF_NO_HIT) || ((p->opt->flag & MM_F_OUT_SAM) && !(p->opt->flag & MM_F_SAM_HIT_ONLY))) { // output an empty hit, if requested
					if (p->opt->flag & MM_F_OUT_SAM)
						mm_write_sam3(&p->str, mi, t, i - seg_st, -1, s->n_seg[k], &s->n_reg[seg_st], (const mm_reg1_t*const*)&s->reg[seg_st], km, p->opt->flag, s->rep_len[i]);
					else
						mm_write_paf3(&p->str, mi, t, 0, 0, p->opt->flag, s->rep_len[i]);
					mm_err_puts(p->str.s);
				}
			}
			for (i = seg_st; i < seg_en; ++i) { // free the fragment's sequences and hits
				for (j = 0; j < s->n_reg[i]; ++j) free(s->reg[i][j].p);
				free(s->reg[i]);
				free(s->seq[i].seq); free(s->seq[i].name);
				if (s->seq[i].qual) free(s->seq[i].qual);
				if (s->seq[i].comment) free(s->seq[i].comment);
			}
		}
		free(s->reg); free(s->n_reg); free(s->seq); // seg_off, n_seg, rep_len and frag_gap were allocated with reg; no memory leak here
		km_destroy(km);
		if (mm_verbose >= 3)
			fprintf(stderr, "[M::%s::%.3f*%.2f] mapped %d sequences\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), s->n_seq);
		free(s);
	}
	return 0;
}
622
+
623
// Open n sequence files. On any failure, closes every file opened so far and
// returns NULL (an error is printed with errno); otherwise returns an array
// of n open handles (caller frees the array and closes the handles).
static mm_bseq_file_t **open_bseqs(int n, const char **fn)
{
	mm_bseq_file_t **fp;
	int i, j;
	fp = (mm_bseq_file_t**)calloc(n, sizeof(mm_bseq_file_t*));
	for (i = 0; i < n; ++i) {
		if ((fp[i] = mm_bseq_open(fn[i])) == 0) {
			if (mm_verbose >= 1)
				fprintf(stderr, "ERROR: failed to open file '%s': %s\n", fn[i], strerror(errno));
			for (j = 0; j < i; ++j) // undo the files already opened
				mm_bseq_close(fp[j]);
			free(fp);
			return 0;
		}
	}
	return fp;
}
640
+
641
// Map n_segs input files (one per read segment) against an index using the
// 3-step pipeline. With --split-prefix, hits are written to a temporary file
// for later merging. Returns 0 on success, -1 on bad arguments or if an
// input file cannot be opened.
int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads)
{
	int i, pl_threads;
	pipeline_t pl;
	if (n_segs < 1) return -1;
	memset(&pl, 0, sizeof(pipeline_t));
	pl.n_fp = n_segs;
	pl.fp = open_bseqs(pl.n_fp, fn);
	if (pl.fp == 0) return -1;
	pl.opt = opt, pl.mi = idx;
	pl.n_threads = n_threads > 1? n_threads : 1;
	pl.mini_batch_size = opt->mini_batch_size;
	if (opt->split_prefix)
		pl.fp_split = mm_split_init(opt->split_prefix, idx);
	// 2 pipeline threads normally; 3 when a second I/O thread is requested
	pl_threads = n_threads == 1? 1 : (opt->flag&MM_F_2_IO_THREADS)? 3 : 2;
	kt_pipeline(pl_threads, worker_pipeline, &pl, 3);

	free(pl.str.s);
	if (pl.fp_split) fclose(pl.fp_split);
	for (i = 0; i < pl.n_fp; ++i)
		mm_bseq_close(pl.fp[i]);
	free(pl.fp);
	return 0;
}
665
+
666
+ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads)
667
+ {
668
+ return mm_map_file_frag(idx, 1, &fn, opt, n_threads);
669
+ }
670
+
671
// Merge the per-part hit files produced by split-index mapping
// (--split-prefix with n_split_idx index parts) and print the final SAM/PAF.
// Reuses the 3-step pipeline with merge_hits() as the map step, then removes
// the temporary files. Returns 0 on success, -1 on error.
int mm_split_merge(int n_segs, const char **fn, const mm_mapopt_t *opt, int n_split_idx)
{
	int i;
	pipeline_t pl;
	mm_idx_t *mi;
	if (n_segs < 1 || n_split_idx < 1) return -1;
	memset(&pl, 0, sizeof(pipeline_t));
	pl.n_fp = n_segs;
	pl.fp = open_bseqs(pl.n_fp, fn);
	if (pl.fp == 0) return -1;
	pl.opt = opt;
	pl.mini_batch_size = opt->mini_batch_size;

	pl.n_parts = n_split_idx;
	pl.fp_parts = CALLOC(FILE*, pl.n_parts);
	pl.rid_shift = CALLOC(uint32_t, pl.n_parts);
	pl.mi = mi = mm_split_merge_prep(opt->split_prefix, n_split_idx, pl.fp_parts, pl.rid_shift);
	if (pl.mi == 0) {
		free(pl.fp_parts);
		free(pl.rid_shift);
		return -1;
	}
	// turn per-part sequence counts into prefix-sum rid offsets:
	// shift right by one, then accumulate
	for (i = n_split_idx - 1; i > 0; --i)
		pl.rid_shift[i] = pl.rid_shift[i - 1];
	for (pl.rid_shift[0] = 0, i = 1; i < n_split_idx; ++i)
		pl.rid_shift[i] += pl.rid_shift[i - 1];
	if (opt->flag & MM_F_OUT_SAM)
		for (i = 0; i < (int32_t)pl.mi->n_seq; ++i)
			printf("@SQ\tSN:%s\tLN:%d\n", pl.mi->seq[i].name, pl.mi->seq[i].len);

	kt_pipeline(2, worker_pipeline, &pl, 3);

	free(pl.str.s);
	mm_idx_destroy(mi);
	free(pl.rid_shift);
	for (i = 0; i < n_split_idx; ++i)
		fclose(pl.fp_parts[i]);
	free(pl.fp_parts);
	for (i = 0; i < pl.n_fp; ++i)
		mm_bseq_close(pl.fp[i]);
	free(pl.fp);
	mm_split_rm_tmp(opt->split_prefix, n_split_idx);
	return 0;
}