ruby-minigraph 0.0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +62 -0
  4. data/ext/Rakefile +56 -0
  5. data/ext/cmappy/cmappy.c +7 -0
  6. data/ext/cmappy/cmappy.h +8 -0
  7. data/ext/minigraph/LICENSE.txt +23 -0
  8. data/ext/minigraph/Makefile +66 -0
  9. data/ext/minigraph/NEWS.md +317 -0
  10. data/ext/minigraph/README.md +207 -0
  11. data/ext/minigraph/algo.c +194 -0
  12. data/ext/minigraph/algo.h +33 -0
  13. data/ext/minigraph/asm-call.c +147 -0
  14. data/ext/minigraph/bseq.c +133 -0
  15. data/ext/minigraph/bseq.h +76 -0
  16. data/ext/minigraph/cal_cov.c +139 -0
  17. data/ext/minigraph/doc/example1.png +0 -0
  18. data/ext/minigraph/doc/example2.png +0 -0
  19. data/ext/minigraph/doc/examples.graffle +0 -0
  20. data/ext/minigraph/format.c +241 -0
  21. data/ext/minigraph/galign.c +140 -0
  22. data/ext/minigraph/gchain1.c +532 -0
  23. data/ext/minigraph/gcmisc.c +223 -0
  24. data/ext/minigraph/gfa-aug.c +260 -0
  25. data/ext/minigraph/gfa-base.c +526 -0
  26. data/ext/minigraph/gfa-bbl.c +372 -0
  27. data/ext/minigraph/gfa-ed.c +617 -0
  28. data/ext/minigraph/gfa-io.c +395 -0
  29. data/ext/minigraph/gfa-priv.h +154 -0
  30. data/ext/minigraph/gfa.h +166 -0
  31. data/ext/minigraph/ggen.c +182 -0
  32. data/ext/minigraph/ggen.h +21 -0
  33. data/ext/minigraph/ggsimple.c +570 -0
  34. data/ext/minigraph/gmap.c +211 -0
  35. data/ext/minigraph/index.c +230 -0
  36. data/ext/minigraph/kalloc.c +224 -0
  37. data/ext/minigraph/kalloc.h +82 -0
  38. data/ext/minigraph/kavl.h +414 -0
  39. data/ext/minigraph/kdq.h +134 -0
  40. data/ext/minigraph/ketopt.h +116 -0
  41. data/ext/minigraph/khashl.h +348 -0
  42. data/ext/minigraph/krmq.h +474 -0
  43. data/ext/minigraph/kseq.h +256 -0
  44. data/ext/minigraph/ksort.h +164 -0
  45. data/ext/minigraph/kstring.h +165 -0
  46. data/ext/minigraph/kthread.c +159 -0
  47. data/ext/minigraph/kthread.h +15 -0
  48. data/ext/minigraph/kvec-km.h +105 -0
  49. data/ext/minigraph/kvec.h +110 -0
  50. data/ext/minigraph/lchain.c +441 -0
  51. data/ext/minigraph/main.c +301 -0
  52. data/ext/minigraph/map-algo.c +500 -0
  53. data/ext/minigraph/mgpriv.h +128 -0
  54. data/ext/minigraph/minigraph.1 +359 -0
  55. data/ext/minigraph/minigraph.h +176 -0
  56. data/ext/minigraph/miniwfa.c +834 -0
  57. data/ext/minigraph/miniwfa.h +95 -0
  58. data/ext/minigraph/misc/mgutils.js +1451 -0
  59. data/ext/minigraph/misc.c +12 -0
  60. data/ext/minigraph/options.c +134 -0
  61. data/ext/minigraph/shortk.c +251 -0
  62. data/ext/minigraph/sketch.c +109 -0
  63. data/ext/minigraph/sys.c +147 -0
  64. data/ext/minigraph/sys.h +20 -0
  65. data/ext/minigraph/test/MT-chimp.fa +277 -0
  66. data/ext/minigraph/test/MT-human.fa +239 -0
  67. data/ext/minigraph/test/MT-orangA.fa +276 -0
  68. data/ext/minigraph/test/MT.gfa +19 -0
  69. data/ext/minigraph/tex/Makefile +13 -0
  70. data/ext/minigraph/tex/minigraph.bib +676 -0
  71. data/ext/minigraph/tex/minigraph.tex +986 -0
  72. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
  73. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
  74. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
  75. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
  76. data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
  77. data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
  78. data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
  79. data/ext/minigraph/tex/plots/bedutils.js +367 -0
  80. data/ext/minigraph/tex/plots/chr-plot.js +130 -0
  81. data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
  82. data/ext/minigraph.patch +21 -0
  83. data/lib/minigraph/ffi/constants.rb +230 -0
  84. data/lib/minigraph/ffi/functions.rb +70 -0
  85. data/lib/minigraph/ffi/mappy.rb +8 -0
  86. data/lib/minigraph/ffi.rb +27 -0
  87. data/lib/minigraph/version.rb +5 -0
  88. data/lib/minigraph.rb +72 -0
  89. metadata +159 -0
@@ -0,0 +1,500 @@
1
+ #include <stdlib.h>
2
+ #include <assert.h>
3
+ #include <math.h>
4
+ #include "kalloc.h"
5
+ #include "mgpriv.h"
6
+ #include "khashl.h"
7
+ #include "sys.h"
8
+
9
/* Per-thread mapping buffer; this is the definition behind the mg_tbuf_t handle. */
struct mg_tbuf_s {
	void *km;     /* kalloc arena handle; mg_tbuf_init() leaves it zeroed when MG_DBG_NO_KALLOC is set */
	int frag_gap; /* reference-side chaining gap recorded by mg_map_frag() */
};
13
+
14
+ mg_tbuf_t *mg_tbuf_init(void)
15
+ {
16
+ mg_tbuf_t *b;
17
+ b = (mg_tbuf_t*)calloc(1, sizeof(mg_tbuf_t));
18
+ if (!(mg_dbg_flag & MG_DBG_NO_KALLOC)) b->km = km_init();
19
+ return b;
20
+ }
21
+
22
+ void mg_tbuf_destroy(mg_tbuf_t *b)
23
+ {
24
+ if (b == 0) return;
25
+ if (b->km) km_destroy(b->km);
26
+ free(b);
27
+ }
28
+
29
+ void *mg_tbuf_get_km(mg_tbuf_t *b)
30
+ {
31
+ return b->km;
32
+ }
33
+
34
+ static void collect_minimizers(void *km, const mg_mapopt_t *opt, const mg_idx_t *gi, int n_segs, const int *qlens, const char **seqs, mg128_v *mv)
35
+ {
36
+ int i, n, sum = 0;
37
+ mv->n = 0;
38
+ for (i = n = 0; i < n_segs; ++i) {
39
+ size_t j;
40
+ mg_sketch(km, seqs[i], qlens[i], gi->w, gi->k, i, mv);
41
+ for (j = n; j < mv->n; ++j)
42
+ mv->a[j].y += sum << 1;
43
+ sum += qlens[i], n = mv->n;
44
+ }
45
+ }
46
+
47
+ #include "ksort.h"
48
// The heap's "less than" is deliberately the reverse of the numeric order on x.
// NOTE(review): presumably this makes the ksort heap surface the smallest x first - confirm against ksort.h.
+ #define heap_lt(a, b) ((a).x > (b).x)
49
// Instantiates the ksort routines for mg128_t under the suffix "heap"
// (ks_heapmake_heap and ks_heapdown_heap are the ones called in this file).
+ KSORT_INIT(heap, mg128_t, heap_lt)
50
+
51
/* One query minimizer kept by collect_matches(), together with its index hits. */
typedef struct {
	uint32_t n;                      /* number of entries in cr[] (count reported by mg_idx_get()) */
	uint32_t q_pos, q_span;          /* low 32 bits of the minimizer's y word; low 8 bits of its x word */
	uint32_t seg_id:31, is_tandem:1; /* y>>32 of the minimizer; 1 if an adjacent minimizer has the same key */
	const uint64_t *cr;              /* occurrence array returned by mg_idx_get() */
} mg_match_t;
57
+
58
+ static mg_match_t *collect_matches(void *km, int *_n_m, int max_occ, const mg_idx_t *gi, const mg128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, int32_t **mini_pos)
59
+ {
60
+ int rep_st = 0, rep_en = 0, n_m;
61
+ size_t i;
62
+ mg_match_t *m;
63
+ *n_mini_pos = 0;
64
+ KMALLOC(km, *mini_pos, mv->n);
65
+ m = (mg_match_t*)kmalloc(km, mv->n * sizeof(mg_match_t));
66
+ for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < mv->n; ++i) {
67
+ const uint64_t *cr;
68
+ mg128_t *p = &mv->a[i];
69
+ uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
70
+ int t;
71
+ cr = mg_idx_get(gi, p->x>>8, &t);
72
+ if (t >= max_occ) {
73
+ int en = (q_pos >> 1) + 1, st = en - q_span;
74
+ if (st > rep_en) {
75
+ *rep_len += rep_en - rep_st;
76
+ rep_st = st, rep_en = en;
77
+ } else rep_en = en;
78
+ } else {
79
+ mg_match_t *q = &m[n_m++];
80
+ q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
81
+ q->is_tandem = 0;
82
+ if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1;
83
+ if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1;
84
+ *n_a += q->n;
85
+ (*mini_pos)[(*n_mini_pos)++] = q_pos>>1;
86
+ }
87
+ }
88
+ *rep_len += rep_en - rep_st;
89
+ *_n_m = n_m;
90
+ return m;
91
+ }
92
+
93
+ static mg128_t *collect_seed_hits_heap(void *km, const mg_mapopt_t *opt, int max_occ, const mg_idx_t *gi, const char *qname, const mg128_v *mv, int qlen, int64_t *n_a, int *rep_len,
94
+ int *n_mini_pos, int32_t **mini_pos)
95
+ {
96
+ int i, n_m, heap_size = 0;
97
+ int64_t n_for = 0, n_rev = 0;
98
+ mg_match_t *m;
99
+ mg128_t *a, *heap;
100
+
101
+ m = collect_matches(km, &n_m, max_occ, gi, mv, n_a, rep_len, n_mini_pos, mini_pos);
102
+
103
+ heap = (mg128_t*)kmalloc(km, n_m * sizeof(mg128_t));
104
+ a = (mg128_t*)kmalloc(km, *n_a * sizeof(mg128_t));
105
+
106
+ for (i = 0, heap_size = 0; i < n_m; ++i) {
107
+ if (m[i].n > 0) {
108
+ heap[heap_size].x = m[i].cr[0];
109
+ heap[heap_size].y = (uint64_t)i<<32;
110
+ ++heap_size;
111
+ }
112
+ }
113
+ ks_heapmake_heap(heap_size, heap);
114
+ while (heap_size > 0) {
115
+ mg_match_t *q = &m[heap->y>>32];
116
+ mg128_t *p;
117
+ uint64_t r = heap->x;
118
+ int32_t rpos = (uint32_t)r >> 1;
119
+ // TODO: skip anchor if MG_F_NO_DIAL
120
+ if ((r&1) == (q->q_pos&1)) { // forward strand
121
+ p = &a[n_for++];
122
+ p->x = r>>32<<33 | rpos;
123
+ } else { // reverse strand; TODO: more testing needed for this block
124
+ p = &a[(*n_a) - (++n_rev)];
125
+ p->x = r>>32<<33 | 1ULL<<32 | (gi->g->seg[r>>32].len - (rpos + 1 - q->q_span) - 1);
126
+ }
127
+ p->y = (uint64_t)q->q_span << 32 | q->q_pos >> 1;
128
+ p->y |= (uint64_t)q->seg_id << MG_SEED_SEG_SHIFT;
129
+ if (q->is_tandem) p->y |= MG_SEED_TANDEM;
130
+ p->y |= (uint64_t)(q->n < 255? q->n : 255) << MG_SEED_OCC_SHIFT;
131
+ // update the heap
132
+ if ((uint32_t)heap->y < q->n - 1) {
133
+ ++heap[0].y;
134
+ heap[0].x = m[heap[0].y>>32].cr[(uint32_t)heap[0].y];
135
+ } else {
136
+ heap[0] = heap[heap_size - 1];
137
+ --heap_size;
138
+ }
139
+ ks_heapdown_heap(0, heap_size, heap);
140
+ }
141
+ kfree(km, m);
142
+ kfree(km, heap);
143
+
144
+ // reverse anchors on the reverse strand, as they are in the descending order
145
+ if (*n_a > n_for + n_rev) {
146
+ memmove(a + n_for, a + (*n_a) - n_rev, n_rev * sizeof(mg128_t));
147
+ *n_a = n_for + n_rev;
148
+ }
149
+ return a;
150
+ }
151
+
152
+ static mg128_t *collect_seed_hits(void *km, const mg_mapopt_t *opt, int max_occ, const mg_idx_t *gi, const char *qname, const mg128_v *mv, int qlen, int64_t *n_a, int *rep_len,
153
+ int *n_mini_pos, int32_t **mini_pos)
154
+ {
155
+ int i, n_m;
156
+ mg_match_t *m;
157
+ mg128_t *a;
158
+ m = collect_matches(km, &n_m, max_occ, gi, mv, n_a, rep_len, n_mini_pos, mini_pos);
159
+ a = (mg128_t*)kmalloc(km, *n_a * sizeof(mg128_t));
160
+ for (i = 0, *n_a = 0; i < n_m; ++i) {
161
+ mg_match_t *q = &m[i];
162
+ const uint64_t *r = q->cr;
163
+ uint32_t k;
164
+ for (k = 0; k < q->n; ++k) {
165
+ int32_t rpos = (uint32_t)r[k] >> 1;
166
+ mg128_t *p;
167
+ if (qname && (opt->flag & MG_M_NO_DIAG)) {
168
+ const gfa_seg_t *s = &gi->g->seg[r[k]>>32];
169
+ const char *gname = s->snid >= 0 && gi->g->sseq? gi->g->sseq[s->snid].name : s->name;
170
+ int32_t g_pos;
171
+ if (s->snid >= 0 && gi->g->sseq)
172
+ gname = gi->g->sseq[s->snid].name, g_pos = s->soff + (uint32_t)r[k];
173
+ else
174
+ gname = s->name, g_pos = (uint32_t)r[k];
175
+ if (g_pos == q->q_pos && strcmp(qname, gname) == 0)
176
+ continue;
177
+ }
178
+ p = &a[(*n_a)++];
179
+ if ((r[k]&1) == (q->q_pos&1)) // forward strand
180
+ p->x = r[k]>>32<<33 | rpos;
181
+ else // reverse strand
182
+ p->x = r[k]>>32<<33 | 1ULL<<32 | (gi->g->seg[r[k]>>32].len - (rpos + 1 - q->q_span) - 1);
183
+ p->y = (uint64_t)q->q_span << 32 | q->q_pos >> 1;
184
+ p->y |= (uint64_t)q->seg_id << MG_SEED_SEG_SHIFT;
185
+ if (q->is_tandem) p->y |= MG_SEED_TANDEM;
186
+ p->y |= (uint64_t)(q->n < 255? q->n : 255) << MG_SEED_OCC_SHIFT;
187
+ }
188
+ }
189
+ kfree(km, m);
190
+ radix_sort_128x(a, a + (*n_a));
191
+ return a;
192
+ }
193
+
194
+ static void mm_fix_bad_ends(const mg128_t *a, int32_t lc_max_occ, int32_t lc_max_trim, int32_t *as, int32_t *cnt)
195
+ {
196
+ int32_t i, k, as0 = *as, cnt0 = *cnt;
197
+ for (i = as0 + cnt0 - 1, k = 0; k < lc_max_trim && k < cnt0; ++k, --i)
198
+ if (a[i].y>>MG_SEED_OCC_SHIFT <= lc_max_occ)
199
+ break;
200
+ *cnt -= k;
201
+ for (i = as0, k = 0; k < *cnt && k < lc_max_trim; ++i, ++k)
202
+ if (a[i].y>>MG_SEED_OCC_SHIFT <= lc_max_occ)
203
+ break;
204
+ *as += k, *cnt -= k;
205
+ }
206
+
207
+ static void mm_fix_bad_ends_alt(const mg128_t *a, int32_t score, int bw, int min_match, int32_t *as, int32_t *cnt)
208
+ {
209
+ int32_t i, l, m, as0 = *as, cnt0 = *cnt;
210
+ if (cnt0 < 3) return;
211
+ m = l = a[as0].y >> 32 & 0xff;
212
+ for (i = as0 + 1; i < as0 + cnt0 - 1; ++i) {
213
+ int32_t lq, lr, min, max;
214
+ int32_t q_span = a[i].y >> 32 & 0xff;
215
+ lr = (int32_t)a[i].x - (int32_t)a[i-1].x;
216
+ lq = (int32_t)a[i].y - (int32_t)a[i-1].y;
217
+ min = lr < lq? lr : lq;
218
+ max = lr > lq? lr : lq;
219
+ if (max - min > l >> 1) *as = i;
220
+ l += min;
221
+ m += min < q_span? min : q_span;
222
+ if (l >= bw << 1 || (m >= min_match && m >= bw) || m >= score>>1) break;
223
+ }
224
+ *cnt = as0 + cnt0 - *as;
225
+ m = l = a[as0 + cnt0 - 1].y >> 32 & 0xff;
226
+ for (i = as0 + cnt0 - 2; i > *as; --i) {
227
+ int32_t lq, lr, min, max;
228
+ int32_t q_span = a[i+1].y >> 32 & 0xff;
229
+ lr = (int32_t)a[i+1].x - (int32_t)a[i].x;
230
+ lq = (int32_t)a[i+1].y - (int32_t)a[i].y;
231
+ min = lr < lq? lr : lq;
232
+ max = lr > lq? lr : lq;
233
+ if (max - min > l >> 1) *cnt = i + 1 - *as;
234
+ l += min;
235
+ m += min < q_span? min : q_span;
236
+ if (l >= bw << 1 || (m >= min_match && m >= bw) || m >= score>>1) break;
237
+ }
238
+ }
239
+
240
+ static int *collect_long_gaps(void *km, int as1, int cnt1, mg128_t *a, int min_gap, int *n_)
241
+ {
242
+ int i, n, *K;
243
+ *n_ = 0;
244
+ for (i = 1, n = 0; i < cnt1; ++i) { // count the number of gaps longer than min_gap
245
+ int gap = ((int32_t)a[as1 + i].y - a[as1 + i - 1].y) - ((int32_t)a[as1 + i].x - a[as1 + i - 1].x);
246
+ if (gap < -min_gap || gap > min_gap) ++n;
247
+ }
248
+ if (n <= 1) return 0;
249
+ K = (int*)kmalloc(km, n * sizeof(int));
250
+ for (i = 1, n = 0; i < cnt1; ++i) { // store the positions of long gaps
251
+ int gap = ((int32_t)a[as1 + i].y - a[as1 + i - 1].y) - ((int32_t)a[as1 + i].x - a[as1 + i - 1].x);
252
+ if (gap < -min_gap || gap > min_gap)
253
+ K[n++] = i;
254
+ }
255
+ *n_ = n;
256
+ return K;
257
+ }
258
+
259
// Mark anchors that lie between a run of mutually cancelling long gaps with MG_SEED_IGNORE.
//
// The chain is a[as1 .. as1 + cnt1 - 1]. K (from collect_long_gaps) lists the positions of gaps
// longer than min_gap. For each long gap k, the following long gaps (at most max_ext_cnt of them,
// and only while they stay within max_ext_len of the anchor before gap k on both query and
// reference) are accumulated into n_ins/n_del. The score of a window is
// n_ins + n_del - |n_ins - n_del|, i.e. twice the smaller of the two totals. The best window
// whose score exceeds diff_thres is remembered in [max_st, max_en]; when k reaches max_en (or
// the list ends), anchors K[max_st] .. K[max_en]-1 are flagged and the search restarts.
// Nothing is done when collect_long_gaps returns NULL.
+ static void mm_filter_bad_seeds(void *km, int as1, int cnt1, mg128_t *a, int min_gap, int diff_thres, int max_ext_len, int max_ext_cnt)
260
+ {
261
+ int max_st, max_en, n, i, k, max, *K;
262
+ K = collect_long_gaps(km, as1, cnt1, a, min_gap, &n);
263
+ if (K == 0) return;
264
+ max = 0, max_st = max_en = -1;
265
+ for (k = 0;; ++k) { // traverse long gaps
266
+ int gap, l, n_ins = 0, n_del = 0, qs, rs, max_diff = 0, max_diff_l = -1;
267
// With no pending window max_en is -1, so this branch is entered on every iteration and only resets the state.
+ if (k == n || k >= max_en) {
268
+ if (max_en > 0)
269
+ for (i = K[max_st]; i < K[max_en]; ++i)
270
+ a[as1 + i].y |= MG_SEED_IGNORE;
271
+ max = 0, max_st = max_en = -1;
272
+ if (k == n) break;
273
+ }
274
+ i = K[k];
275
// gap > 0: the query advanced more than the reference (counted in n_ins); otherwise counted in n_del
+ gap = ((int32_t)a[as1 + i].y - (int32_t)a[as1 + i - 1].y) - (int32_t)(a[as1 + i].x - a[as1 + i - 1].x);
276
+ if (gap > 0) n_ins += gap;
277
+ else n_del += -gap;
278
// coordinates of the anchor just before gap k; the extension limits below are measured from here
+ qs = (int32_t)a[as1 + i - 1].y;
279
+ rs = (int32_t)a[as1 + i - 1].x;
280
+ for (l = k + 1; l < n && l <= k + max_ext_cnt; ++l) {
281
+ int j = K[l], diff;
282
+ if ((int32_t)a[as1 + j].y - qs > max_ext_len || (int32_t)a[as1 + j].x - rs > max_ext_len) break;
283
+ gap = ((int32_t)a[as1 + j].y - (int32_t)a[as1 + j - 1].y) - (int32_t)(a[as1 + j].x - a[as1 + j - 1].x);
284
+ if (gap > 0) n_ins += gap;
285
+ else n_del += -gap;
286
// equals 2 * min(n_ins, n_del): large only when insertions and deletions offset each other
+ diff = n_ins + n_del - abs(n_ins - n_del);
287
+ if (max_diff < diff)
288
+ max_diff = diff, max_diff_l = l;
289
+ }
290
+ if (max_diff > diff_thres && max_diff > max)
291
+ max = max_diff, max_st = k, max_en = max_diff_l;
292
+ }
293
+ kfree(km, K);
294
+ }
295
+
296
+ static void mm_filter_bad_seeds_alt(void *km, int as1, int cnt1, mg128_t *a, int min_gap, int max_ext)
297
+ {
298
+ int n, k, *K;
299
+ K = collect_long_gaps(km, as1, cnt1, a, min_gap, &n);
300
+ if (K == 0) return;
301
+ for (k = 0; k < n;) {
302
+ int i = K[k], l;
303
+ int gap1 = ((int32_t)a[as1 + i].y - (int32_t)a[as1 + i - 1].y) - ((int32_t)a[as1 + i].x - (int32_t)a[as1 + i - 1].x);
304
+ int re1 = (int32_t)a[as1 + i].x;
305
+ int qe1 = (int32_t)a[as1 + i].y;
306
+ gap1 = gap1 > 0? gap1 : -gap1;
307
+ for (l = k + 1; l < n; ++l) {
308
+ int j = K[l], gap2, q_span_pre, rs2, qs2, m;
309
+ if ((int32_t)a[as1 + j].y - qe1 > max_ext || (int32_t)a[as1 + j].x - re1 > max_ext) break;
310
+ gap2 = ((int32_t)a[as1 + j].y - (int32_t)a[as1 + j - 1].y) - (int32_t)(a[as1 + j].x - a[as1 + j - 1].x);
311
+ q_span_pre = a[as1 + j - 1].y >> 32 & 0xff;
312
+ rs2 = (int32_t)a[as1 + j - 1].x + q_span_pre;
313
+ qs2 = (int32_t)a[as1 + j - 1].y + q_span_pre;
314
+ m = rs2 - re1 < qs2 - qe1? rs2 - re1 : qs2 - qe1;
315
+ gap2 = gap2 > 0? gap2 : -gap2;
316
+ if (m > gap1 + gap2) break;
317
+ re1 = (int32_t)a[as1 + j].x;
318
+ qe1 = (int32_t)a[as1 + j].y;
319
+ gap1 = gap2;
320
+ }
321
+ if (l > k + 1) {
322
+ int j, end = K[l - 1];
323
+ for (j = K[k]; j < end; ++j)
324
+ a[as1 + j].y |= MG_SEED_IGNORE;
325
+ a[as1 + end].y |= MG_SEED_FIXED;
326
+ }
327
+ k = l;
328
+ }
329
+ kfree(km, K);
330
+ }
331
+
332
/*
 * Debug helper: print "Q<stage>\t<qname>\t<seconds since t0>" to stderr and
 * return the current realtime() reading so the caller can time the next stage.
 */
static double print_time(double t0, int stage, const char *qname)
{
	const double now = realtime();
	fprintf(stderr, "Q%d\t%s\t%.3f\n", stage, qname, now - t0);
	return now;
}
339
+
340
// Map one fragment made of n_segs query segments against the graph index gi.
//
// gcs[0 .. n_segs-1] are reset to NULL first. The function then returns without a result when the
// total query length is zero, when n_segs is not in [1, MG_MAX_SEG], or when opt->max_qlen is
// positive and exceeded. Otherwise the pipeline is: minimizers -> seed hits -> linear chaining
// (DP or RMQ, with an optional RMQ re-chaining pass) -> chain-end trimming and bad-seed filtering
// -> graph chaining -> mg_gchain_gen -> filtering and mapping quality, plus CIGAR generation
// when MG_M_CIGAR is set and n_segs == 1. Only gcs[0] receives the result.
// All temporary memory comes from b->km and is released before returning; the trailing block
// aborts if km_stat reports a different number of blocks and cores, which it reports as a leak.
// qname may be NULL as far as hashing is concerned; it is also passed to "%s" in debug output.
+ void mg_map_frag(const mg_idx_t *gi, int n_segs, const int *qlens, const char **seqs, mg_gchains_t **gcs, mg_tbuf_t *b, const mg_mapopt_t *opt, const char *qname)
341
+ {
342
+ int i, l, rep_len, qlen_sum, n_lc, n_gc, n_mini_pos;
343
+ int max_chain_gap_qry, max_chain_gap_ref, is_splice = !!(opt->flag & MG_M_SPLICE), is_sr = !!(opt->flag & MG_M_SR);
344
+ uint32_t hash;
345
+ int64_t n_a;
346
+ uint64_t *u;
347
+ int32_t *mini_pos;
348
+ mg128_t *a;
349
+ mg128_v mv = {0,0,0};
350
+ mg_lchain_t *lc;
351
+ char *seq_cat;
352
+ km_stat_t kmst;
353
+ float tmp, chn_pen_gap, chn_pen_skip;
354
+ double t = 0.0;
355
+
356
+ for (i = 0, qlen_sum = 0; i < n_segs; ++i)
357
+ qlen_sum += qlens[i], gcs[i] = 0;
358
+
359
+ if (qlen_sum == 0 || n_segs <= 0 || n_segs > MG_MAX_SEG) return;
360
+ if (opt->max_qlen > 0 && qlen_sum > opt->max_qlen) return;
361
+
362
// per-query hash mixed from the name, the total length and opt->seed; handed to the chain generators below
+ hash = qname? kh_hash_str(qname) : 0;
363
+ hash ^= kh_hash_uint32(qlen_sum) + kh_hash_uint32(opt->seed);
364
+ hash = kh_hash_uint32(hash);
365
+
366
+ collect_minimizers(b->km, opt, gi, n_segs, qlens, seqs, &mv);
367
+ if (opt->flag & MG_M_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->occ_max1, gi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
368
+ else a = collect_seed_hits(b->km, opt, opt->occ_max1, gi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
369
+
370
+ if (mg_dbg_flag & MG_DBG_SEED) {
371
+ fprintf(stderr, "RS\t%d\n", rep_len);
372
+ for (i = 0; i < n_a; ++i)
373
+ fprintf(stderr, "SD\t%s\t%d\t%c\t%d\t%d\t%d\n", gi->g->seg[a[i].x>>33].name, (int32_t)a[i].x, "+-"[a[i].x>>32&1], (int32_t)a[i].y, (int32_t)(a[i].y>>32&0xff),
374
+ i == 0? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x));
375
+ }
376
+
377
+ // set max chaining gap on the query and the reference sequence
378
+ if (is_sr)
379
+ max_chain_gap_qry = qlen_sum > opt->max_gap? qlen_sum : opt->max_gap;
380
+ else max_chain_gap_qry = opt->max_gap;
381
+ if (opt->max_gap_ref > 0) {
382
+ max_chain_gap_ref = opt->max_gap_ref; // always honor mg_mapopt_t::max_gap_ref if set
383
+ } else if (opt->max_frag_len > 0) {
384
+ max_chain_gap_ref = opt->max_frag_len - qlen_sum;
385
+ if (max_chain_gap_ref < opt->max_gap) max_chain_gap_ref = opt->max_gap;
386
+ } else max_chain_gap_ref = opt->max_gap;
387
+
388
// both chaining penalties are scaled by exp(-div * k)
+ tmp = expf(-opt->div * gi->k);
389
+ chn_pen_gap = opt->chn_pen_gap * tmp;
390
+ chn_pen_skip = opt->chn_pen_skip * tmp;
391
+
392
+ if (mg_dbg_flag & MG_DBG_QNAME) t = realtime();
393
// linear chaining; with no anchors the anchor array is released and n_lc/u are zeroed
+ if (n_a == 0) {
394
+ if (a) kfree(b->km, a);
395
+ a = 0, n_lc = 0, u = 0;
396
+ } else {
397
+ if (opt->flag & MG_M_RMQ) {
398
+ a = mg_lchain_rmq(opt->max_gap, opt->max_gap_pre, opt->bw, opt->max_lc_skip, opt->rmq_size_cap, opt->min_lc_cnt, opt->min_lc_score,
399
+ chn_pen_gap, chn_pen_skip, n_a, a, &n_lc, &u, b->km);
400
+ } else {
401
+ a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_lc_skip, opt->max_lc_iter, opt->min_lc_cnt, opt->min_lc_score,
402
+ chn_pen_gap, chn_pen_skip, is_splice, n_segs, n_a, a, &n_lc, &u, b->km);
403
+ }
404
+ }
405
+ if (mg_dbg_flag & MG_DBG_QNAME) t = print_time(t, 1, qname);
406
+
407
+ if (opt->bw_long > opt->bw && (opt->flag & (MG_M_SPLICE|MG_M_SR)) == 0 && n_segs == 1 && n_lc > 1) { // re-chain/long-join for long sequences
408
// st/en: query coordinates of the first and last anchor of chain 0 (its anchor count is the low 32 bits of u[0])
+ int32_t st = (int32_t)a[0].y, en = (int32_t)a[(int32_t)u[0] - 1].y;
409
+ if (qlen_sum - (en - st) > opt->rmq_rescue_size || qlen_sum - (en - st) > qlen_sum * opt->rmq_rescue_ratio) {
410
+ int32_t i;
411
// pool the anchors of all chains, re-sort them and chain again with the wider bandwidth bw_long
+ for (i = 0, n_a = 0; i < n_lc; ++i) n_a += (int32_t)u[i];
412
+ kfree(b->km, u);
413
+ radix_sort_128x(a, a + n_a);
414
+ a = mg_lchain_rmq(opt->max_gap, opt->max_gap_pre, opt->bw_long, opt->max_lc_skip, opt->rmq_size_cap, opt->min_lc_cnt, opt->min_lc_score,
415
+ chn_pen_gap, chn_pen_skip, n_a, a, &n_lc, &u, b->km);
416
+ }
417
+ }
418
+
419
+ b->frag_gap = max_chain_gap_ref;
420
// only mv.a is released here; mv.n is still read further down by mg_gchain_set_mapq
+ kfree(b->km, mv.a);
421
+
422
+ if (n_lc) {
423
+ lc = mg_lchain_gen(b->km, hash, qlen_sum, n_lc, u, a);
424
// end trimming and seed filtering are applied only when there is more than one linear chain
+ if (n_lc > 1) {
425
+ int32_t n_lc_new = 0;
426
+ for (i = 0; i < n_lc; ++i) {
427
+ mg_lchain_t *p = &lc[i];
428
+ int32_t cnt = p->cnt, off = p->off;
429
+ mm_fix_bad_ends(a, opt->lc_max_occ, opt->lc_max_trim, &off, &cnt);
430
+ mm_fix_bad_ends_alt(a, p->score, opt->bw, 100, &off, &cnt);
431
+ mm_filter_bad_seeds(b->km, off, cnt, a, 10, 40, opt->max_gap>>1, 10);
432
+ mm_filter_bad_seeds_alt(b->km, off, cnt, a, 30, opt->max_gap>>1);
433
+ //printf("X\t%d\t%d\t%d\t%d\t%d\t%d\n", p->qs, p->qe, p->off, p->cnt, off, cnt);
434
+ p->off = off, p->cnt = cnt;
435
// chains that became shorter than min_lc_cnt are dropped; survivors get their coordinates recomputed and are compacted in place
+ if (cnt >= opt->min_lc_cnt) {
436
+ int32_t q_span = a[p->off].y>>32 & 0xff;
437
+ p->rs = (int32_t)a[p->off].x + 1 - q_span;
438
+ p->qs = (int32_t)a[p->off].y + 1 - q_span;
439
+ p->re = (int32_t)a[p->off + p->cnt - 1].x + 1;
440
+ p->qe = (int32_t)a[p->off + p->cnt - 1].y + 1;
441
+ lc[n_lc_new++] = *p;
442
+ }
443
+ }
444
+ n_lc = n_lc_new;
445
+ }
446
+ for (i = 0; i < n_lc; ++i)
447
+ mg_update_anchors(lc[i].cnt, &a[lc[i].off], n_mini_pos, mini_pos);
448
+ } else lc = 0;
449
+ kfree(b->km, mini_pos);
450
+ kfree(b->km, u);
451
+ if (mg_dbg_flag & MG_DBG_QNAME) t = print_time(t, 2, qname);
452
+
453
+ if (mg_dbg_flag & MG_DBG_LCHAIN)
454
+ mg_print_lchain(stdout, gi, n_lc, lc, a, qname);
455
+
456
// concatenate all segments into one buffer of exactly qlen_sum bytes (no terminating NUL is added)
+ KMALLOC(b->km, seq_cat, qlen_sum);
457
+ for (i = l = 0; i < n_segs; ++i) {
458
+ strncpy(&seq_cat[l], seqs[i], qlens[i]);
459
+ l += qlens[i];
460
+ }
461
+ n_gc = mg_gchain1_dp(b->km, gi->g, &n_lc, lc, qlen_sum, opt->bw_long, opt->bw_long, opt->bw_long, opt->max_gc_skip, opt->ref_bonus,
462
+ chn_pen_gap, chn_pen_skip, opt->mask_level, a, &u);
463
+ if (mg_dbg_flag & MG_DBG_QNAME) t = print_time(t, 3, qname);
464
// the first argument (km_dst) is 0 here; NOTE(review): presumably the result is then heap-allocated and released with mg_gchain_free - confirm
+ gcs[0] = mg_gchain_gen(0, b->km, gi->g, gi->es, n_gc, u, lc, a, hash, opt->min_gc_cnt, opt->min_gc_score, opt->gdp_max_ed, n_segs, seq_cat);
465
+ if (mg_dbg_flag & MG_DBG_QNAME) t = print_time(t, 4, qname);
466
+ gcs[0]->rep_len = rep_len;
467
+ kfree(b->km, a);
468
+ kfree(b->km, lc);
469
+ kfree(b->km, u);
470
+
471
+ mg_gchain_set_parent(b->km, opt->mask_level, gcs[0]->n_gc, gcs[0]->gc, opt->sub_diff, 0);
472
+ mg_gchain_flt_sub(opt->pri_ratio, gi->k * 2, opt->best_n, gcs[0]->n_gc, gcs[0]->gc);
473
+ mg_gchain_drop_flt(b->km, gcs[0]);
474
+ mg_gchain_set_mapq(b->km, gcs[0], qlen_sum, mv.n, opt->min_gc_score);
475
+ if ((opt->flag&MG_M_CIGAR) && n_segs == 1)
476
+ mg_gchain_cigar(b->km, gi->g, gi->es, seq_cat, gcs[0], qname);
477
+ kfree(b->km, seq_cat);
478
+ if (mg_dbg_flag & MG_DBG_QNAME) t = print_time(t, 5, qname);
479
+
480
// arena check: abort when the block and core counts differ (reported as a leak); recreate the arena when it has grown too large
+ if (b->km) {
481
+ km_stat(b->km, &kmst);
482
+ if (mg_dbg_flag & MG_DBG_QNAME)
483
+ fprintf(stderr, "QM\t%s\t%d\tcap=%ld,nCore=%ld,largest=%ld\n", qname, qlen_sum, kmst.capacity, kmst.n_cores, kmst.largest);
484
+ if (kmst.n_blocks != kmst.n_cores) {
485
+ fprintf(stderr, "[E::%s] memory leak at %s\n", __func__, qname);
486
+ abort();
487
+ }
488
+ if (kmst.largest > 1U<<28 || (opt->cap_kalloc > 0 && kmst.capacity > opt->cap_kalloc)) {
489
+ km_destroy(b->km);
490
+ b->km = km_init();
491
+ }
492
+ }
493
+ }
494
+
495
+ mg_gchains_t *mg_map(const mg_idx_t *gi, int qlen, const char *seq, mg_tbuf_t *b, const mg_mapopt_t *opt, const char *qname)
496
+ {
497
+ mg_gchains_t *gcs;
498
+ mg_map_frag(gi, 1, &qlen, &seq, &gcs, b, opt, qname);
499
+ return gcs;
500
+ }
@@ -0,0 +1,128 @@
1
+ #ifndef MGPRIV_H
2
+ #define MGPRIV_H
3
+
4
+ #include <stdlib.h>
5
+ #include "minigraph.h"
6
+
7
// Debug switches: single-bit flags that are tested against the global mg_dbg_flag.
+ #define MG_DBG_NO_KALLOC 0x1
8
+ #define MG_DBG_QNAME 0x2
9
+ #define MG_DBG_SEED 0x4
10
+ #define MG_DBG_LCHAIN 0x8
11
+ #define MG_DBG_INSERT 0x10
12
+ #define MG_DBG_SHORTK 0x20
13
+ #define MG_DBG_GC1 0x40
14
+ #define MG_DBG_LC_PROF 0x80
15
+ #define MG_DBG_MINIWFA 0x100
16
+ #define MG_DBG_MWF_SEQ 0x200
17
+
18
// Per-anchor flag bits that are OR-ed into the y word of an mg128_t anchor.
+ #define MG_SEED_IGNORE (1ULL<<41)
19
+ #define MG_SEED_TANDEM (1ULL<<42)
20
+ #define MG_SEED_FIXED (1ULL<<43)
21
+
22
// Query segment id: an 8-bit field of the anchor's y word starting at bit 48; mg_seg_id() extracts it.
+ #define MG_MAX_SEG 255
23
+ #define MG_SEED_SEG_SHIFT 48
24
+ #define MG_SEED_SEG_MASK (0xffULL<<(MG_SEED_SEG_SHIFT))
25
+ #define mg_seg_id(a) ((int32_t)(((a).y&MG_SEED_SEG_MASK) >> MG_SEED_SEG_SHIFT))
26
+
27
// Minimizer occurrence count (capped at 255 by the seed collectors): the top 8 bits of the anchor's y word.
+ #define MG_SEED_OCC_SHIFT 56
28
+
29
+ #define MG_MAX_SHORT_K 15
30
+
31
// Fallback definition of kstring_t, compiled only when KSTRING_T has not been defined already,
// so that a header which defines KSTRING_T first supplies the type instead.
+ #ifndef KSTRING_T
31
+ #define KSTRING_T kstring_t
32
+ typedef struct __kstring_t {
33
// NOTE(review): presumably l is the string length and m the allocated capacity, as in kstring.h - confirm
+ unsigned l, m;
34
+ char *s;
35
+ } kstring_t;
36
+ #endif
38
+
39
+ // shortest path
40
/*
 * One destination of a shortest-path query (the dst array taken by
 * mg_shortest_k()). The first group of fields is marked as input, the second
 * group as output.
 */
typedef struct {
	/* input */
	uint32_t v;
	int32_t target_dist;
	uint32_t target_hash;
	uint32_t meta:30, check_hash:1, inner:1;
	int32_t qlen;
	/* output */
	uint32_t n_path:31, is_0:1;
	int32_t path_end;
	int32_t dist;
	uint32_t hash;
} mg_path_dst_t;
53
+
54
/*
 * Element of the array returned by mg_shortest_k().
 * NOTE(review): presumably v is a vertex, d a distance and pre the index of
 * the predecessor element - confirm against shortk.c.
 */
typedef struct {
	uint32_t v;
	uint32_t d;
	int32_t pre;
} mg_pathv_t;
58
+
59
+ #ifdef __cplusplus
60
+ extern "C" {
61
+ #endif
62
+
63
/*
 * Fast approximate log2() for positive, normal floats.
 *
 * The biased exponent is read straight from the IEEE-754 bit pattern and the
 * mantissa, rescaled into [1,2), is fed through a quadratic polynomial.
 *
 * The exponent term is computed in signed arithmetic. The previous unsigned
 * form wrapped around for biased exponents below 128, which is why the
 * function used to be unusable for x < 2; results for x >= 2 are unchanged.
 * Zero, negative, subnormal and non-finite inputs are still not meaningful.
 */
static inline float mg_log2(float x)
{
	union { float f; uint32_t i; } z = { x };
	float log_2 = (float)((int)((z.i >> 23) & 255) - 128);
	z.i &= ~(255u << 23); /* clear the exponent field ... */
	z.i += 127u << 23;    /* ... and set it to the bias, so z.f lies in [1,2) */
	log_2 += (-0.34484843f * z.f + 2.02466578f) * z.f - 0.67487759f;
	return log_2;
}
72
+
73
+ extern unsigned char seq_nt4_table[256];
74
+
75
+ void mg_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, mg128_v *p);
76
+
77
+ void *mg_idx_a2h(void *km, int32_t n_a, mg128_t *a, int suflen, uint64_t **q_, int32_t *n_);
78
+ const uint64_t *mg_idx_hget(const void *h_, const uint64_t *q, int suflen, uint64_t minier, int *n);
79
+ void mg_idx_hfree(void *h_);
80
+
81
+ const uint64_t *mg_idx_get(const mg_idx_t *gi, uint64_t minier, int *n);
82
+ void mg_idx_cal_quantile(const mg_idx_t *gi, int32_t m, float f[], int32_t q[]);
83
+
84
+ uint64_t *mg_chain_backtrack(void *km, int64_t n, const int32_t *f, const int64_t *p, int32_t *v, int32_t *t, int32_t min_cnt, int32_t min_sc, int32_t max_drop,
85
+ int32_t extra_u, int32_t *n_u_, int32_t *n_v_);
86
+ mg128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
87
+ int is_cdna, int n_seg, int64_t n, mg128_t *a, int *n_u_, uint64_t **_u, void *km);
88
+ mg128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
89
+ int64_t n, mg128_t *a, int *n_u_, uint64_t **_u, void *km);
90
+ mg_lchain_t *mg_lchain_gen(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mg128_t *a);
91
+ void mg_update_anchors(int32_t n_a, mg128_t *a, int32_t n, const int32_t *mini_pos);
92
+
93
+ mg_pathv_t *mg_shortest_k(void *km0, const gfa_t *g, uint32_t src, int32_t n_dst, mg_path_dst_t *dst, int32_t max_dist, int32_t max_k, int32_t *n_pathv);
94
+ int32_t mg_gchain1_dp(void *km, const gfa_t *g, int32_t *n_lc_, mg_lchain_t *lc, int32_t qlen, int32_t max_dist_g, int32_t max_dist_q, int32_t bw, int32_t max_skip,
95
+ int32_t ref_bonus, float chn_pen_gap, float chn_pen_skip, float mask_level, const mg128_t *an, uint64_t **u_);
96
+ mg_gchains_t *mg_gchain_gen(void *km_dst, void *km, const gfa_t *g, const gfa_edseq_t *es, int32_t n_u, const uint64_t *u,
97
+ mg_lchain_t *lc, const mg128_t *a, uint32_t hash, int32_t min_gc_cnt, int32_t min_gc_score,
98
+ int32_t gdp_max_ed, int32_t n_seg, const char *qseq);
99
+ void mg_gchain_cigar(void *km, const gfa_t *g, const gfa_edseq_t *es, const char *qseq, mg_gchains_t *gt, const char *qname);
100
+ void mg_gchain_free(mg_gchains_t *gs);
101
+
102
+ uint32_t *lv_ed_unified(void *km, int32_t tl, const char *ts, int32_t ql, const char *qs, int32_t is_ext, int32_t *score, int32_t *t_endl, int32_t *q_endl, int32_t *n_cigar);
103
+
104
+ void mg_gchain_restore_order(void *km, mg_gchains_t *gcs);
105
+ void mg_gchain_restore_offset(mg_gchains_t *gcs);
106
+ void mg_gchain_sort_by_score(void *km, mg_gchains_t *gcs);
107
+ void mg_gchain_set_parent(void *km, float mask_level, int n, mg_gchain_t *r, int sub_diff, int hard_mask_level);
108
+ int mg_gchain_flt_sub(float pri_ratio, int min_diff, int best_n, int n, mg_gchain_t *r);
109
+ void mg_gchain_drop_flt(void *km, mg_gchains_t *gcs);
110
+ void mg_gchain_set_mapq(void *km, mg_gchains_t *gcs, int qlen, int max_mini, int min_gc_score);
111
+
112
+ void mg_cov_map(const gfa_t *g, const mg_gchains_t *gt, int32_t min_mapq, int32_t min_blen, double *c_seg, double *c_link, const char *qname);
113
+ void mg_cov_asm(const gfa_t *g, int32_t n_seq, mg_gchains_t *const *gcs, int32_t min_mapq, int32_t min_blen, double *cov_seg, double *cov_link);
114
+
115
+ void mg_print_lchain(FILE *fp, const mg_idx_t *gi, int n_lc0, const mg_lchain_t *lc, const mg128_t *a, const char *qname);
116
+ void mg_write_gaf(kstring_t *s, const gfa_t *g, const mg_gchains_t *gs, int32_t n_seg, const int32_t *qlens, const char *qname, uint64_t flag, void *km);
117
+
118
+ void mg_sprintf_lite(kstring_t *s, const char *fmt, ...);
119
+
120
+ void radix_sort_128x(mg128_t *beg, mg128_t *end);
121
+ void radix_sort_gfa64(uint64_t *beg, uint64_t *end);
122
+ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
123
+
124
+ #ifdef __cplusplus
125
+ }
126
+ #endif
127
+
128
+ #endif