ruby-minigraph 0.0.20.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +62 -0
  4. data/ext/Rakefile +56 -0
  5. data/ext/cmappy/cmappy.c +7 -0
  6. data/ext/cmappy/cmappy.h +8 -0
  7. data/ext/minigraph/LICENSE.txt +23 -0
  8. data/ext/minigraph/Makefile +66 -0
  9. data/ext/minigraph/NEWS.md +317 -0
  10. data/ext/minigraph/README.md +207 -0
  11. data/ext/minigraph/algo.c +194 -0
  12. data/ext/minigraph/algo.h +33 -0
  13. data/ext/minigraph/asm-call.c +147 -0
  14. data/ext/minigraph/bseq.c +133 -0
  15. data/ext/minigraph/bseq.h +76 -0
  16. data/ext/minigraph/cal_cov.c +139 -0
  17. data/ext/minigraph/doc/example1.png +0 -0
  18. data/ext/minigraph/doc/example2.png +0 -0
  19. data/ext/minigraph/doc/examples.graffle +0 -0
  20. data/ext/minigraph/format.c +241 -0
  21. data/ext/minigraph/galign.c +140 -0
  22. data/ext/minigraph/gchain1.c +532 -0
  23. data/ext/minigraph/gcmisc.c +223 -0
  24. data/ext/minigraph/gfa-aug.c +260 -0
  25. data/ext/minigraph/gfa-base.c +526 -0
  26. data/ext/minigraph/gfa-bbl.c +372 -0
  27. data/ext/minigraph/gfa-ed.c +617 -0
  28. data/ext/minigraph/gfa-io.c +395 -0
  29. data/ext/minigraph/gfa-priv.h +154 -0
  30. data/ext/minigraph/gfa.h +166 -0
  31. data/ext/minigraph/ggen.c +182 -0
  32. data/ext/minigraph/ggen.h +21 -0
  33. data/ext/minigraph/ggsimple.c +570 -0
  34. data/ext/minigraph/gmap.c +211 -0
  35. data/ext/minigraph/index.c +230 -0
  36. data/ext/minigraph/kalloc.c +224 -0
  37. data/ext/minigraph/kalloc.h +82 -0
  38. data/ext/minigraph/kavl.h +414 -0
  39. data/ext/minigraph/kdq.h +134 -0
  40. data/ext/minigraph/ketopt.h +116 -0
  41. data/ext/minigraph/khashl.h +348 -0
  42. data/ext/minigraph/krmq.h +474 -0
  43. data/ext/minigraph/kseq.h +256 -0
  44. data/ext/minigraph/ksort.h +164 -0
  45. data/ext/minigraph/kstring.h +165 -0
  46. data/ext/minigraph/kthread.c +159 -0
  47. data/ext/minigraph/kthread.h +15 -0
  48. data/ext/minigraph/kvec-km.h +105 -0
  49. data/ext/minigraph/kvec.h +110 -0
  50. data/ext/minigraph/lchain.c +441 -0
  51. data/ext/minigraph/main.c +301 -0
  52. data/ext/minigraph/map-algo.c +500 -0
  53. data/ext/minigraph/mgpriv.h +128 -0
  54. data/ext/minigraph/minigraph.1 +359 -0
  55. data/ext/minigraph/minigraph.h +176 -0
  56. data/ext/minigraph/miniwfa.c +834 -0
  57. data/ext/minigraph/miniwfa.h +95 -0
  58. data/ext/minigraph/misc/mgutils.js +1451 -0
  59. data/ext/minigraph/misc.c +12 -0
  60. data/ext/minigraph/options.c +134 -0
  61. data/ext/minigraph/shortk.c +251 -0
  62. data/ext/minigraph/sketch.c +109 -0
  63. data/ext/minigraph/sys.c +147 -0
  64. data/ext/minigraph/sys.h +20 -0
  65. data/ext/minigraph/test/MT-chimp.fa +277 -0
  66. data/ext/minigraph/test/MT-human.fa +239 -0
  67. data/ext/minigraph/test/MT-orangA.fa +276 -0
  68. data/ext/minigraph/test/MT.gfa +19 -0
  69. data/ext/minigraph/tex/Makefile +13 -0
  70. data/ext/minigraph/tex/minigraph.bib +676 -0
  71. data/ext/minigraph/tex/minigraph.tex +986 -0
  72. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
  73. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
  74. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
  75. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
  76. data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
  77. data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
  78. data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
  79. data/ext/minigraph/tex/plots/bedutils.js +367 -0
  80. data/ext/minigraph/tex/plots/chr-plot.js +130 -0
  81. data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
  82. data/ext/minigraph.patch +21 -0
  83. data/lib/minigraph/ffi/constants.rb +230 -0
  84. data/lib/minigraph/ffi/functions.rb +70 -0
  85. data/lib/minigraph/ffi/mappy.rb +8 -0
  86. data/lib/minigraph/ffi.rb +27 -0
  87. data/lib/minigraph/version.rb +5 -0
  88. data/lib/minigraph.rb +72 -0
  89. metadata +159 -0
@@ -0,0 +1,500 @@
1
+ #include <stdlib.h>
2
+ #include <assert.h>
3
+ #include <math.h>
4
+ #include "kalloc.h"
5
+ #include "mgpriv.h"
6
+ #include "khashl.h"
7
+ #include "sys.h"
8
+
9
+ struct mg_tbuf_s { // working buffer handed to mg_map_frag(); created by mg_tbuf_init(), released by mg_tbuf_destroy()
10
+ void *km; // allocator handle obtained from km_init(); stays NULL when MG_DBG_NO_KALLOC is set in mg_dbg_flag
11
+ int frag_gap; // overwritten by every mg_map_frag() call with the reference chaining gap it used
12
+ };
13
+
14
+ mg_tbuf_t *mg_tbuf_init(void)
15
+ {
16
+ mg_tbuf_t *b;
17
+ b = (mg_tbuf_t*)calloc(1, sizeof(mg_tbuf_t));
18
+ if (!(mg_dbg_flag & MG_DBG_NO_KALLOC)) b->km = km_init();
19
+ return b;
20
+ }
21
+
22
+ void mg_tbuf_destroy(mg_tbuf_t *b)
23
+ {
24
+ if (b == 0) return;
25
+ if (b->km) km_destroy(b->km);
26
+ free(b);
27
+ }
28
+
29
+ void *mg_tbuf_get_km(mg_tbuf_t *b)
30
+ {
31
+ return b->km;
32
+ }
33
+
34
+ static void collect_minimizers(void *km, const mg_mapopt_t *opt, const mg_idx_t *gi, int n_segs, const int *qlens, const char **seqs, mg128_v *mv)
35
+ {
36
+ int i, n, sum = 0;
37
+ mv->n = 0;
38
+ for (i = n = 0; i < n_segs; ++i) {
39
+ size_t j;
40
+ mg_sketch(km, seqs[i], qlens[i], gi->w, gi->k, i, mv);
41
+ for (j = n; j < mv->n; ++j)
42
+ mv->a[j].y += sum << 1;
43
+ sum += qlens[i], n = mv->n;
44
+ }
45
+ }
46
+
47
+ #include "ksort.h"
48
+ #define heap_lt(a, b) ((a).x > (b).x) // compares the .x field only, with the sense inverted; presumably this puts the smallest x at the heap root (confirm against ksort.h)
49
+ KSORT_INIT(heap, mg128_t, heap_lt) // instantiates the ks_heapmake_heap()/ks_heapdown_heap() routines used by collect_seed_hits_heap()
50
+
51
+ typedef struct { // one query minimizer together with its hit list, as filled in by collect_matches()
52
+ uint32_t n; // number of hits reported by mg_idx_get() for this minimizer
53
+ uint32_t q_pos, q_span; // q_pos: low 32 bits of the minimizer's y (users shift by 1 for the position and test bit 0 for strand); q_span: low 8 bits of its x
54
+ uint32_t seg_id:31, is_tandem:1; // seg_id: y>>32 of the minimizer; is_tandem: the previous or next minimizer in the list has the same key
55
+ const uint64_t *cr; // pointer returned by mg_idx_get(); never freed by the code that uses this struct
56
+ } mg_match_t;
57
+
58
+ static mg_match_t *collect_matches(void *km, int *_n_m, int max_occ, const mg_idx_t *gi, const mg128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, int32_t **mini_pos)
59
+ { // Look up every minimizer of mv in the index; keep those with fewer than max_occ hits as matches and account the rest as repetitive query length. Returns the match array (allocated from km; callers kfree it).
60
+ int rep_st = 0, rep_en = 0, n_m; // [rep_st, rep_en): query interval currently being merged from skipped minimizers
61
+ size_t i;
62
+ mg_match_t *m;
63
+ *n_mini_pos = 0;
64
+ KMALLOC(km, *mini_pos, mv->n); // one slot per minimizer is an upper bound; mg_map_frag() frees this array
65
+ m = (mg_match_t*)kmalloc(km, mv->n * sizeof(mg_match_t));
66
+ for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < mv->n; ++i) { // all output counters are reset here
67
+ const uint64_t *cr;
68
+ mg128_t *p = &mv->a[i];
69
+ uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
70
+ int t;
71
+ cr = mg_idx_get(gi, p->x>>8, &t); // key is x without its low 8 span bits; t receives the hit count
72
+ if (t >= max_occ) { // too frequent: skip it but remember the query interval it covers
73
+ int en = (q_pos >> 1) + 1, st = en - q_span;
74
+ if (st > rep_en) { // disjoint from the current interval: flush it and start a new one
75
+ *rep_len += rep_en - rep_st;
76
+ rep_st = st, rep_en = en;
77
+ } else rep_en = en; // overlapping or adjacent: extend the current interval
78
+ } else {
79
+ mg_match_t *q = &m[n_m++];
80
+ q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
81
+ q->is_tandem = 0;
82
+ if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1; // same key as the previous minimizer
83
+ if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1; // same key as the next minimizer
84
+ *n_a += q->n; // total number of hits over all kept minimizers
85
+ (*mini_pos)[(*n_mini_pos)++] = q_pos>>1; // record the position of each kept minimizer
86
+ }
87
+ }
88
+ *rep_len += rep_en - rep_st; // flush the last pending interval
89
+ *_n_m = n_m;
90
+ return m;
91
+ }
92
+
93
+ static mg128_t *collect_seed_hits_heap(void *km, const mg_mapopt_t *opt, int max_occ, const mg_idx_t *gi, const char *qname, const mg128_v *mv, int qlen, int64_t *n_a, int *rep_len,
94
+ int *n_mini_pos, int32_t **mini_pos)
95
+ { // Variant of collect_seed_hits() that merges the per-minimizer hit lists through a heap instead of radix-sorting afterwards. opt, qname and qlen are not referenced in this body. Returns the anchor array (allocated from km).
96
+ int i, n_m, heap_size = 0;
97
+ int64_t n_for = 0, n_rev = 0; // number of forward / reverse-strand anchors written so far
98
+ mg_match_t *m;
99
+ mg128_t *a, *heap;
100
+
101
+ m = collect_matches(km, &n_m, max_occ, gi, mv, n_a, rep_len, n_mini_pos, mini_pos);
102
+
103
+ heap = (mg128_t*)kmalloc(km, n_m * sizeof(mg128_t));
104
+ a = (mg128_t*)kmalloc(km, *n_a * sizeof(mg128_t));
105
+
106
+ for (i = 0, heap_size = 0; i < n_m; ++i) { // seed the heap with the first hit of every non-empty list
107
+ if (m[i].n > 0) {
108
+ heap[heap_size].x = m[i].cr[0];
109
+ heap[heap_size].y = (uint64_t)i<<32; // high 32 bits: index into m[]; low 32 bits: cursor into that match's cr[] (starts at 0)
110
+ ++heap_size;
111
+ }
112
+ }
113
+ ks_heapmake_heap(heap_size, heap);
114
+ while (heap_size > 0) { // each iteration consumes the hit at the heap root
115
+ mg_match_t *q = &m[heap->y>>32];
116
+ mg128_t *p;
117
+ uint64_t r = heap->x;
118
+ int32_t rpos = (uint32_t)r >> 1;
119
+ // TODO: skip anchor if MG_F_NO_DIAL
120
+ if ((r&1) == (q->q_pos&1)) { // forward strand
121
+ p = &a[n_for++]; // forward anchors fill a[] from the front
122
+ p->x = r>>32<<33 | rpos;
123
+ } else { // reverse strand; TODO: more testing needed for this block
124
+ p = &a[(*n_a) - (++n_rev)]; // reverse anchors fill a[] from the back
125
+ p->x = r>>32<<33 | 1ULL<<32 | (gi->g->seg[r>>32].len - (rpos + 1 - q->q_span) - 1);
126
+ }
127
+ p->y = (uint64_t)q->q_span << 32 | q->q_pos >> 1;
128
+ p->y |= (uint64_t)q->seg_id << MG_SEED_SEG_SHIFT;
129
+ if (q->is_tandem) p->y |= MG_SEED_TANDEM;
130
+ p->y |= (uint64_t)(q->n < 255? q->n : 255) << MG_SEED_OCC_SHIFT; // occurrence count saturates at 255
131
+ // update the heap
132
+ if ((uint32_t)heap->y < q->n - 1) { // more hits left in this list: advance its cursor and reload the key
133
+ ++heap[0].y;
134
+ heap[0].x = m[heap[0].y>>32].cr[(uint32_t)heap[0].y];
135
+ } else { // list exhausted: replace the root by the last heap entry and shrink
136
+ heap[0] = heap[heap_size - 1];
137
+ --heap_size;
138
+ }
139
+ ks_heapdown_heap(0, heap_size, heap);
140
+ }
141
+ kfree(km, m);
142
+ kfree(km, heap);
143
+
144
+ // reverse anchors on the reverse strand, as they are in the descending order -- NOTE(review): the code below only closes a gap between the forward and reverse parts with memmove; no element order is reversed here. Confirm whether a reversal step is missing.
145
+ if (*n_a > n_for + n_rev) { // only taken when fewer anchors were written than *n_a announced
146
+ memmove(a + n_for, a + (*n_a) - n_rev, n_rev * sizeof(mg128_t));
147
+ *n_a = n_for + n_rev;
148
+ }
149
+ return a;
150
+ }
151
+
152
+ static mg128_t *collect_seed_hits(void *km, const mg_mapopt_t *opt, int max_occ, const mg_idx_t *gi, const char *qname, const mg128_v *mv, int qlen, int64_t *n_a, int *rep_len,
153
+ int *n_mini_pos, int32_t **mini_pos)
154
+ { // Expand every match from collect_matches() into anchors and sort them with radix_sort_128x(). qlen is not referenced in this body. Returns the anchor array (allocated from km); *n_a receives the number of anchors actually written.
155
+ int i, n_m;
156
+ mg_match_t *m;
157
+ mg128_t *a;
158
+ m = collect_matches(km, &n_m, max_occ, gi, mv, n_a, rep_len, n_mini_pos, mini_pos);
159
+ a = (mg128_t*)kmalloc(km, *n_a * sizeof(mg128_t)); // sized for all hits; hits skipped below leave it partly unused
160
+ for (i = 0, *n_a = 0; i < n_m; ++i) { // *n_a is recounted from zero because some hits may be skipped
161
+ mg_match_t *q = &m[i];
162
+ const uint64_t *r = q->cr;
163
+ uint32_t k;
164
+ for (k = 0; k < q->n; ++k) {
165
+ int32_t rpos = (uint32_t)r[k] >> 1;
166
+ mg128_t *p;
167
+ if (qname && (opt->flag & MG_M_NO_DIAG)) { // optional filter: drop hits where the query hits itself at the same position
168
+ const gfa_seg_t *s = &gi->g->seg[r[k]>>32];
169
+ const char *gname = s->snid >= 0 && gi->g->sseq? gi->g->sseq[s->snid].name : s->name; // this initial value is overwritten by both branches below
170
+ int32_t g_pos;
171
+ if (s->snid >= 0 && gi->g->sseq)
172
+ gname = gi->g->sseq[s->snid].name, g_pos = s->soff + (uint32_t)r[k]; // NOTE(review): uses the unshifted low 32 bits of r[k] (rpos above is that value >> 1) plus soff -- confirm the units match q->q_pos
173
+ else
174
+ gname = s->name, g_pos = (uint32_t)r[k];
175
+ if (g_pos == q->q_pos && strcmp(qname, gname) == 0)
176
+ continue;
177
+ }
178
+ p = &a[(*n_a)++];
179
+ if ((r[k]&1) == (q->q_pos&1)) // forward strand
180
+ p->x = r[k]>>32<<33 | rpos; // x layout: segment id << 33 | strand bit << 32 | reference position
181
+ else // reverse strand
182
+ p->x = r[k]>>32<<33 | 1ULL<<32 | (gi->g->seg[r[k]>>32].len - (rpos + 1 - q->q_span) - 1); // position is mirrored against the segment length
183
+ p->y = (uint64_t)q->q_span << 32 | q->q_pos >> 1; // y layout: span << 32 | query position, plus the flag fields OR-ed in below
184
+ p->y |= (uint64_t)q->seg_id << MG_SEED_SEG_SHIFT;
185
+ if (q->is_tandem) p->y |= MG_SEED_TANDEM;
186
+ p->y |= (uint64_t)(q->n < 255? q->n : 255) << MG_SEED_OCC_SHIFT; // occurrence count saturates at 255
187
+ }
188
+ }
189
+ kfree(km, m);
190
+ radix_sort_128x(a, a + (*n_a));
191
+ return a;
192
+ }
193
+
194
+ static void mm_fix_bad_ends(const mg128_t *a, int32_t lc_max_occ, int32_t lc_max_trim, int32_t *as, int32_t *cnt)
195
+ {
196
+ int32_t i, k, as0 = *as, cnt0 = *cnt;
197
+ for (i = as0 + cnt0 - 1, k = 0; k < lc_max_trim && k < cnt0; ++k, --i)
198
+ if (a[i].y>>MG_SEED_OCC_SHIFT <= lc_max_occ)
199
+ break;
200
+ *cnt -= k;
201
+ for (i = as0, k = 0; k < *cnt && k < lc_max_trim; ++i, ++k)
202
+ if (a[i].y>>MG_SEED_OCC_SHIFT <= lc_max_occ)
203
+ break;
204
+ *as += k, *cnt -= k;
205
+ }
206
+
207
+ static void mm_fix_bad_ends_alt(const mg128_t *a, int32_t score, int bw, int min_match, int32_t *as, int32_t *cnt)
208
+ { // Shrink the anchor range [*as, *as + *cnt) of a[] from both ends wherever the reference and query steps between neighbouring anchors disagree too much; *as and *cnt are updated in place.
209
+ int32_t i, l, m, as0 = *as, cnt0 = *cnt; // l: accumulated min(ref step, query step); m: the same, but each step capped at the anchor span
210
+ if (cnt0 < 3) return; // ranges with fewer than three anchors are left untouched
211
+ m = l = a[as0].y >> 32 & 0xff; // both sums start from the span of the first anchor
212
+ for (i = as0 + 1; i < as0 + cnt0 - 1; ++i) { // forward scan from the left end
213
+ int32_t lq, lr, min, max;
214
+ int32_t q_span = a[i].y >> 32 & 0xff;
215
+ lr = (int32_t)a[i].x - (int32_t)a[i-1].x; // step on the reference
216
+ lq = (int32_t)a[i].y - (int32_t)a[i-1].y; // step on the query
217
+ min = lr < lq? lr : lq;
218
+ max = lr > lq? lr : lq;
219
+ if (max - min > l >> 1) *as = i; // step difference exceeds half of the length accumulated so far: restart the range at i
220
+ l += min;
221
+ m += min < q_span? min : q_span;
222
+ if (l >= bw << 1 || (m >= min_match && m >= bw) || m >= score>>1) break; // enough has been accumulated; stop scanning this end
223
+ }
224
+ *cnt = as0 + cnt0 - *as; // right end still at its original place
225
+ m = l = a[as0 + cnt0 - 1].y >> 32 & 0xff; // restart both sums from the span of the last anchor
226
+ for (i = as0 + cnt0 - 2; i > *as; --i) { // backward scan from the right end, mirroring the loop above
227
+ int32_t lq, lr, min, max;
228
+ int32_t q_span = a[i+1].y >> 32 & 0xff;
229
+ lr = (int32_t)a[i+1].x - (int32_t)a[i].x;
230
+ lq = (int32_t)a[i+1].y - (int32_t)a[i].y;
231
+ min = lr < lq? lr : lq;
232
+ max = lr > lq? lr : lq;
233
+ if (max - min > l >> 1) *cnt = i + 1 - *as; // cut the range so that it ends at anchor i
234
+ l += min;
235
+ m += min < q_span? min : q_span;
236
+ if (l >= bw << 1 || (m >= min_match && m >= bw) || m >= score>>1) break;
237
+ }
238
+ }
239
+
240
+ static int *collect_long_gaps(void *km, int as1, int cnt1, mg128_t *a, int min_gap, int *n_)
241
+ {
242
+ int i, n, *K;
243
+ *n_ = 0;
244
+ for (i = 1, n = 0; i < cnt1; ++i) { // count the number of gaps longer than min_gap
245
+ int gap = ((int32_t)a[as1 + i].y - a[as1 + i - 1].y) - ((int32_t)a[as1 + i].x - a[as1 + i - 1].x);
246
+ if (gap < -min_gap || gap > min_gap) ++n;
247
+ }
248
+ if (n <= 1) return 0;
249
+ K = (int*)kmalloc(km, n * sizeof(int));
250
+ for (i = 1, n = 0; i < cnt1; ++i) { // store the positions of long gaps
251
+ int gap = ((int32_t)a[as1 + i].y - a[as1 + i - 1].y) - ((int32_t)a[as1 + i].x - a[as1 + i - 1].x);
252
+ if (gap < -min_gap || gap > min_gap)
253
+ K[n++] = i;
254
+ }
255
+ *n_ = n;
256
+ return K;
257
+ }
258
+
259
+ static void mm_filter_bad_seeds(void *km, int as1, int cnt1, mg128_t *a, int min_gap, int diff_thres, int max_ext_len, int max_ext_cnt)
260
+ { // Flag anchors of a[as1 .. as1 + cnt1) with MG_SEED_IGNORE when they lie between nearby long gaps whose insertions and deletions cancel out. Does nothing when collect_long_gaps() finds at most one long gap.
261
+ int max_st, max_en, n, i, k, max, *K; // K[0..n): anchors following a long gap; [max_st, max_en]: best pending run of long gaps, as indices into K
262
+ K = collect_long_gaps(km, as1, cnt1, a, min_gap, &n);
263
+ if (K == 0) return;
264
+ max = 0, max_st = max_en = -1;
265
+ for (k = 0;; ++k) { // traverse long gaps
266
+ int gap, l, n_ins = 0, n_del = 0, qs, rs, max_diff = 0, max_diff_l = -1;
267
+ if (k == n || k >= max_en) { // end of input, or past the pending run: apply it and reset
268
+ if (max_en > 0)
269
+ for (i = K[max_st]; i < K[max_en]; ++i)
270
+ a[as1 + i].y |= MG_SEED_IGNORE;
271
+ max = 0, max_st = max_en = -1;
272
+ if (k == n) break;
273
+ }
274
+ i = K[k];
275
+ gap = ((int32_t)a[as1 + i].y - (int32_t)a[as1 + i - 1].y) - (int32_t)(a[as1 + i].x - a[as1 + i - 1].x); // query step minus reference step
276
+ if (gap > 0) n_ins += gap; // positive gap: counted as inserted length
277
+ else n_del += -gap; // otherwise counted as deleted length
278
+ qs = (int32_t)a[as1 + i - 1].y; // query / reference coordinates just before the first gap
279
+ rs = (int32_t)a[as1 + i - 1].x;
280
+ for (l = k + 1; l < n && l <= k + max_ext_cnt; ++l) { // look ahead over at most max_ext_cnt further long gaps
281
+ int j = K[l], diff;
282
+ if ((int32_t)a[as1 + j].y - qs > max_ext_len || (int32_t)a[as1 + j].x - rs > max_ext_len) break; // too far from the first gap
283
+ gap = ((int32_t)a[as1 + j].y - (int32_t)a[as1 + j - 1].y) - (int32_t)(a[as1 + j].x - a[as1 + j - 1].x);
284
+ if (gap > 0) n_ins += gap;
285
+ else n_del += -gap;
286
+ diff = n_ins + n_del - abs(n_ins - n_del); // equals 2 * min(n_ins, n_del): the amount that cancels out
287
+ if (max_diff < diff)
288
+ max_diff = diff, max_diff_l = l;
289
+ }
290
+ if (max_diff > diff_thres && max_diff > max) // remember the run with the largest cancelling amount
291
+ max = max_diff, max_st = k, max_en = max_diff_l;
292
+ }
293
+ kfree(km, K);
294
+ }
295
+
296
+ static void mm_filter_bad_seeds_alt(void *km, int as1, int cnt1, mg128_t *a, int min_gap, int max_ext)
297
+ { // Group consecutive long gaps of a[as1 .. as1 + cnt1) that lie close together; anchors inside a group get MG_SEED_IGNORE and the anchor ending the group gets MG_SEED_FIXED. Does nothing when collect_long_gaps() finds at most one long gap.
298
+ int n, k, *K; // K[0..n): anchors following a long gap
299
+ K = collect_long_gaps(km, as1, cnt1, a, min_gap, &n);
300
+ if (K == 0) return;
301
+ for (k = 0; k < n;) { // k jumps to the end of each group (k = l at the bottom)
302
+ int i = K[k], l;
303
+ int gap1 = ((int32_t)a[as1 + i].y - (int32_t)a[as1 + i - 1].y) - ((int32_t)a[as1 + i].x - (int32_t)a[as1 + i - 1].x);
304
+ int re1 = (int32_t)a[as1 + i].x; // reference / query coordinates of the anchor after the current gap
305
+ int qe1 = (int32_t)a[as1 + i].y;
306
+ gap1 = gap1 > 0? gap1 : -gap1; // absolute size of the current gap
307
+ for (l = k + 1; l < n; ++l) { // try to extend the group with the next long gap
308
+ int j = K[l], gap2, q_span_pre, rs2, qs2, m;
309
+ if ((int32_t)a[as1 + j].y - qe1 > max_ext || (int32_t)a[as1 + j].x - re1 > max_ext) break; // next gap is too far away
310
+ gap2 = ((int32_t)a[as1 + j].y - (int32_t)a[as1 + j - 1].y) - (int32_t)(a[as1 + j].x - a[as1 + j - 1].x);
311
+ q_span_pre = a[as1 + j - 1].y >> 32 & 0xff;
312
+ rs2 = (int32_t)a[as1 + j - 1].x + q_span_pre;
313
+ qs2 = (int32_t)a[as1 + j - 1].y + q_span_pre;
314
+ m = rs2 - re1 < qs2 - qe1? rs2 - re1 : qs2 - qe1; // smaller of the reference / query distances between the two gaps
315
+ gap2 = gap2 > 0? gap2 : -gap2;
316
+ if (m > gap1 + gap2) break; // the stretch between the gaps is longer than both gaps combined: keep them separate
317
+ re1 = (int32_t)a[as1 + j].x;
318
+ qe1 = (int32_t)a[as1 + j].y;
319
+ gap1 = gap2;
320
+ }
321
+ if (l > k + 1) { // at least two gaps were grouped
322
+ int j, end = K[l - 1];
323
+ for (j = K[k]; j < end; ++j)
324
+ a[as1 + j].y |= MG_SEED_IGNORE;
325
+ a[as1 + end].y |= MG_SEED_FIXED;
326
+ }
327
+ k = l;
328
+ }
329
+ kfree(km, K);
330
+ }
331
+
332
/*
 * Print the time elapsed since t0 for one mapping stage to stderr and
 * return the current time, so that calls can be chained.
 *
 * Output line: "Q<stage>\t<qname>\t<seconds with 3 decimals>".
 *
 * qname may be NULL (mg_map_frag() accepts unnamed queries); "(null)" is
 * printed in that case instead of handing a null pointer to %s, which is
 * undefined behaviour.
 */
static double print_time(double t0, int stage, const char *qname)
{
	double t = realtime();
	fprintf(stderr, "Q%d\t%s\t%.3f\n", stage, qname ? qname : "(null)", t - t0);
	return t;
}
339
+
340
+ void mg_map_frag(const mg_idx_t *gi, int n_segs, const int *qlens, const char **seqs, mg_gchains_t **gcs, mg_tbuf_t *b, const mg_mapopt_t *opt, const char *qname)
341
+ { // Map a query made of n_segs segments against the index gi: collect seeds, build linear chains, clean them up, build graph chains and store the result in gcs[0]. All gcs[i] are set to 0 first and stay 0 on the early returns below. qname may be NULL.
342
+ int i, l, rep_len, qlen_sum, n_lc, n_gc, n_mini_pos;
343
+ int max_chain_gap_qry, max_chain_gap_ref, is_splice = !!(opt->flag & MG_M_SPLICE), is_sr = !!(opt->flag & MG_M_SR);
344
+ uint32_t hash;
345
+ int64_t n_a;
346
+ uint64_t *u;
347
+ int32_t *mini_pos;
348
+ mg128_t *a;
349
+ mg128_v mv = {0,0,0};
350
+ mg_lchain_t *lc;
351
+ char *seq_cat;
352
+ km_stat_t kmst;
353
+ float tmp, chn_pen_gap, chn_pen_skip;
354
+ double t = 0.0;
355
+
356
+ for (i = 0, qlen_sum = 0; i < n_segs; ++i)
357
+ qlen_sum += qlens[i], gcs[i] = 0;
358
+
359
+ if (qlen_sum == 0 || n_segs <= 0 || n_segs > MG_MAX_SEG) return; // nothing to map, or too many segments
360
+ if (opt->max_qlen > 0 && qlen_sum > opt->max_qlen) return; // query longer than the configured limit
361
+
362
+ hash = qname? kh_hash_str(qname) : 0; // hash derived from the query name, total length and opt->seed; handed to mg_lchain_gen() and mg_gchain_gen()
363
+ hash ^= kh_hash_uint32(qlen_sum) + kh_hash_uint32(opt->seed);
364
+ hash = kh_hash_uint32(hash);
365
+
366
+ collect_minimizers(b->km, opt, gi, n_segs, qlens, seqs, &mv);
367
+ if (opt->flag & MG_M_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->occ_max1, gi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
368
+ else a = collect_seed_hits(b->km, opt, opt->occ_max1, gi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
369
+
370
+ if (mg_dbg_flag & MG_DBG_SEED) { // debugging aid: dump every seed to stderr
371
+ fprintf(stderr, "RS\t%d\n", rep_len);
372
+ for (i = 0; i < n_a; ++i)
373
+ fprintf(stderr, "SD\t%s\t%d\t%c\t%d\t%d\t%d\n", gi->g->seg[a[i].x>>33].name, (int32_t)a[i].x, "+-"[a[i].x>>32&1], (int32_t)a[i].y, (int32_t)(a[i].y>>32&0xff),
374
+ i == 0? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x));
375
+ }
376
+
377
+ // set max chaining gap on the query and the reference sequence
378
+ if (is_sr)
379
+ max_chain_gap_qry = qlen_sum > opt->max_gap? qlen_sum : opt->max_gap;
380
+ else max_chain_gap_qry = opt->max_gap;
381
+ if (opt->max_gap_ref > 0) {
382
+ max_chain_gap_ref = opt->max_gap_ref; // always honor mg_mapopt_t::max_gap_ref if set
383
+ } else if (opt->max_frag_len > 0) {
384
+ max_chain_gap_ref = opt->max_frag_len - qlen_sum;
385
+ if (max_chain_gap_ref < opt->max_gap) max_chain_gap_ref = opt->max_gap;
386
+ } else max_chain_gap_ref = opt->max_gap;
387
+
388
+ tmp = expf(-opt->div * gi->k); // both chaining penalties are scaled by exp(-div * k)
389
+ chn_pen_gap = opt->chn_pen_gap * tmp;
390
+ chn_pen_skip = opt->chn_pen_skip * tmp;
391
+
392
+ if (mg_dbg_flag & MG_DBG_QNAME) t = realtime();
393
+ if (n_a == 0) { // no seeds: skip linear chaining altogether
394
+ if (a) kfree(b->km, a);
395
+ a = 0, n_lc = 0, u = 0;
396
+ } else {
397
+ if (opt->flag & MG_M_RMQ) {
398
+ a = mg_lchain_rmq(opt->max_gap, opt->max_gap_pre, opt->bw, opt->max_lc_skip, opt->rmq_size_cap, opt->min_lc_cnt, opt->min_lc_score,
399
+ chn_pen_gap, chn_pen_skip, n_a, a, &n_lc, &u, b->km);
400
+ } else {
401
+ a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_lc_skip, opt->max_lc_iter, opt->min_lc_cnt, opt->min_lc_score,
402
+ chn_pen_gap, chn_pen_skip, is_splice, n_segs, n_a, a, &n_lc, &u, b->km);
403
+ }
404
+ }
405
+ if (mg_dbg_flag & MG_DBG_QNAME) t = print_time(t, 1, qname);
406
+
407
+ if (opt->bw_long > opt->bw && (opt->flag & (MG_M_SPLICE|MG_M_SR)) == 0 && n_segs == 1 && n_lc > 1) { // re-chain/long-join for long sequences
408
+ int32_t st = (int32_t)a[0].y, en = (int32_t)a[(int32_t)u[0] - 1].y; // query span of the first (int32_t)u[0] anchors -- presumably the first chain
409
+ if (qlen_sum - (en - st) > opt->rmq_rescue_size || qlen_sum - (en - st) > qlen_sum * opt->rmq_rescue_ratio) { // too much of the query lies outside that span
410
+ int32_t i;
411
+ for (i = 0, n_a = 0; i < n_lc; ++i) n_a += (int32_t)u[i]; // total number of anchors kept in chains
412
+ kfree(b->km, u);
413
+ radix_sort_128x(a, a + n_a); // re-sort the chained anchors before chaining again with the wider bandwidth
414
+ a = mg_lchain_rmq(opt->max_gap, opt->max_gap_pre, opt->bw_long, opt->max_lc_skip, opt->rmq_size_cap, opt->min_lc_cnt, opt->min_lc_score,
415
+ chn_pen_gap, chn_pen_skip, n_a, a, &n_lc, &u, b->km);
416
+ }
417
+ }
418
+
419
+ b->frag_gap = max_chain_gap_ref;
420
+ kfree(b->km, mv.a); // only mv.a is released; mv.n is still read by mg_gchain_set_mapq() below
421
+
422
+ if (n_lc) {
423
+ lc = mg_lchain_gen(b->km, hash, qlen_sum, n_lc, u, a);
424
+ if (n_lc > 1) { // clean-up of chain ends and bad seeds only runs when there are several chains
425
+ int32_t n_lc_new = 0;
426
+ for (i = 0; i < n_lc; ++i) {
427
+ mg_lchain_t *p = &lc[i];
428
+ int32_t cnt = p->cnt, off = p->off;
429
+ mm_fix_bad_ends(a, opt->lc_max_occ, opt->lc_max_trim, &off, &cnt);
430
+ mm_fix_bad_ends_alt(a, p->score, opt->bw, 100, &off, &cnt);
431
+ mm_filter_bad_seeds(b->km, off, cnt, a, 10, 40, opt->max_gap>>1, 10);
432
+ mm_filter_bad_seeds_alt(b->km, off, cnt, a, 30, opt->max_gap>>1);
433
+ //printf("X\t%d\t%d\t%d\t%d\t%d\t%d\n", p->qs, p->qe, p->off, p->cnt, off, cnt);
434
+ p->off = off, p->cnt = cnt;
435
+ if (cnt >= opt->min_lc_cnt) { // keep the chain and recompute its coordinates from the trimmed anchors
436
+ int32_t q_span = a[p->off].y>>32 & 0xff;
437
+ p->rs = (int32_t)a[p->off].x + 1 - q_span;
438
+ p->qs = (int32_t)a[p->off].y + 1 - q_span;
439
+ p->re = (int32_t)a[p->off + p->cnt - 1].x + 1;
440
+ p->qe = (int32_t)a[p->off + p->cnt - 1].y + 1;
441
+ lc[n_lc_new++] = *p; // compact surviving chains to the front of lc[]
442
+ }
443
+ }
444
+ n_lc = n_lc_new;
445
+ }
446
+ for (i = 0; i < n_lc; ++i)
447
+ mg_update_anchors(lc[i].cnt, &a[lc[i].off], n_mini_pos, mini_pos);
448
+ } else lc = 0;
449
+ kfree(b->km, mini_pos);
450
+ kfree(b->km, u); // u is handed out again by mg_gchain1_dp() below
451
+ if (mg_dbg_flag & MG_DBG_QNAME) t = print_time(t, 2, qname);
452
+
453
+ if (mg_dbg_flag & MG_DBG_LCHAIN)
454
+ mg_print_lchain(stdout, gi, n_lc, lc, a, qname);
455
+
456
+ KMALLOC(b->km, seq_cat, qlen_sum); // concatenation of all segments; no terminating NUL is stored
457
+ for (i = l = 0; i < n_segs; ++i) {
458
+ strncpy(&seq_cat[l], seqs[i], qlens[i]);
459
+ l += qlens[i];
460
+ }
461
+ n_gc = mg_gchain1_dp(b->km, gi->g, &n_lc, lc, qlen_sum, opt->bw_long, opt->bw_long, opt->bw_long, opt->max_gc_skip, opt->ref_bonus,
462
+ chn_pen_gap, chn_pen_skip, opt->mask_level, a, &u);
463
+ if (mg_dbg_flag & MG_DBG_QNAME) t = print_time(t, 3, qname);
464
+ gcs[0] = mg_gchain_gen(0, b->km, gi->g, gi->es, n_gc, u, lc, a, hash, opt->min_gc_cnt, opt->min_gc_score, opt->gdp_max_ed, n_segs, seq_cat); // first argument (km_dst) is 0
465
+ if (mg_dbg_flag & MG_DBG_QNAME) t = print_time(t, 4, qname);
466
+ gcs[0]->rep_len = rep_len; // gcs[0] is dereferenced unconditionally from here on
467
+ kfree(b->km, a);
468
+ kfree(b->km, lc);
469
+ kfree(b->km, u);
470
+
471
+ mg_gchain_set_parent(b->km, opt->mask_level, gcs[0]->n_gc, gcs[0]->gc, opt->sub_diff, 0);
472
+ mg_gchain_flt_sub(opt->pri_ratio, gi->k * 2, opt->best_n, gcs[0]->n_gc, gcs[0]->gc);
473
+ mg_gchain_drop_flt(b->km, gcs[0]);
474
+ mg_gchain_set_mapq(b->km, gcs[0], qlen_sum, mv.n, opt->min_gc_score);
475
+ if ((opt->flag&MG_M_CIGAR) && n_segs == 1) // base alignment is only generated for single-segment queries
476
+ mg_gchain_cigar(b->km, gi->g, gi->es, seq_cat, gcs[0], qname);
477
+ kfree(b->km, seq_cat);
478
+ if (mg_dbg_flag & MG_DBG_QNAME) t = print_time(t, 5, qname);
479
+
480
+ if (b->km) { // allocator bookkeeping; skipped when no allocator handle is in use
481
+ km_stat(b->km, &kmst);
482
+ if (mg_dbg_flag & MG_DBG_QNAME)
483
+ fprintf(stderr, "QM\t%s\t%d\tcap=%ld,nCore=%ld,largest=%ld\n", qname, qlen_sum, kmst.capacity, kmst.n_cores, kmst.largest); // NOTE(review): %ld assumes these km_stat_t fields are long-compatible -- confirm against kalloc.h
484
+ if (kmst.n_blocks != kmst.n_cores) { // treated as a leak of memory taken from b->km during this call
485
+ fprintf(stderr, "[E::%s] memory leak at %s\n", __func__, qname);
486
+ abort();
487
+ }
488
+ if (kmst.largest > 1U<<28 || (opt->cap_kalloc > 0 && kmst.capacity > opt->cap_kalloc)) { // allocator grew too large: throw it away and start a fresh one
489
+ km_destroy(b->km);
490
+ b->km = km_init();
491
+ }
492
+ }
493
+ }
494
+
495
+ mg_gchains_t *mg_map(const mg_idx_t *gi, int qlen, const char *seq, mg_tbuf_t *b, const mg_mapopt_t *opt, const char *qname)
496
+ {
497
+ mg_gchains_t *gcs;
498
+ mg_map_frag(gi, 1, &qlen, &seq, &gcs, b, opt, qname);
499
+ return gcs;
500
+ }
@@ -0,0 +1,128 @@
1
+ #ifndef MGPRIV_H
2
+ #define MGPRIV_H
3
+
4
+ #include <stdlib.h>
5
+ #include "minigraph.h"
6
+
7
+ #define MG_DBG_NO_KALLOC 0x1 // bits of mg_dbg_flag; this one makes mg_tbuf_init() skip km_init()
8
+ #define MG_DBG_QNAME 0x2 // mg_map_frag() prints per-stage timing and allocator statistics to stderr
9
+ #define MG_DBG_SEED 0x4 // mg_map_frag() dumps every seed to stderr
10
+ #define MG_DBG_LCHAIN 0x8 // mg_map_frag() prints linear chains to stdout via mg_print_lchain()
11
+ #define MG_DBG_INSERT 0x10
12
+ #define MG_DBG_SHORTK 0x20
13
+ #define MG_DBG_GC1 0x40
14
+ #define MG_DBG_LC_PROF 0x80
15
+ #define MG_DBG_MINIWFA 0x100
16
+ #define MG_DBG_MWF_SEQ 0x200
17
+
18
+ #define MG_SEED_IGNORE (1ULL<<41) // flag bits OR-ed into the y field of an anchor; this one is set by mm_filter_bad_seeds() and mm_filter_bad_seeds_alt()
19
+ #define MG_SEED_TANDEM (1ULL<<42) // set when a neighbouring query minimizer has the same key
20
+ #define MG_SEED_FIXED (1ULL<<43) // set by mm_filter_bad_seeds_alt() on the anchor that ends a group of long gaps
21
+
22
+ #define MG_MAX_SEG 255 // mg_map_frag() returns without mapping when given more segments than this
23
+ #define MG_SEED_SEG_SHIFT 48 // bit offset of the 8-bit segment id inside the y field of an anchor
24
+ #define MG_SEED_SEG_MASK (0xffULL<<(MG_SEED_SEG_SHIFT))
25
+ #define mg_seg_id(a) ((int32_t)(((a).y&MG_SEED_SEG_MASK) >> MG_SEED_SEG_SHIFT)) // extracts that segment id from anchor a
26
+
27
+ #define MG_SEED_OCC_SHIFT 56 // bit offset of the occurrence count (saturated at 255 by the writers) inside the y field of an anchor
28
+
29
+ #define MG_MAX_SHORT_K 15
30
+
31
+ #ifndef KSTRING_T // skip this definition when KSTRING_T is already defined -- presumably by another header that provides kstring_t
32
+ #define KSTRING_T kstring_t
33
+ typedef struct __kstring_t {
34
+ unsigned l, m; // presumably current length and allocated capacity of s -- confirm against kstring.h
35
+ char *s;
36
+ } kstring_t;
37
+ #endif
38
+
39
+ // shortest path
40
+ typedef struct { // element type of the dst array passed to mg_shortest_k(); fields are split into caller-provided input and computed output
41
+ // input
42
+ uint32_t v;
43
+ int32_t target_dist;
44
+ uint32_t target_hash;
45
+ uint32_t meta:30, check_hash:1, inner:1; // three bit-fields sharing one 32-bit word
46
+ int32_t qlen;
47
+ // output
48
+ uint32_t n_path:31, is_0:1; // two bit-fields sharing one 32-bit word
49
+ int32_t path_end;
50
+ int32_t dist;
51
+ uint32_t hash;
52
+ } mg_path_dst_t;
53
+
54
+ typedef struct { // element type of the array returned by mg_shortest_k()
55
+ uint32_t v, d;
56
+ int32_t pre; // signed, unlike v and d -- presumably an index of a preceding element with a negative value meaning none (confirm in shortk.c)
57
+ } mg_pathv_t;
58
+
59
+ #ifdef __cplusplus
60
+ extern "C" {
61
+ #endif
62
+
63
/*
 * Fast approximate base-2 logarithm of a positive, normal float.
 *
 * The biased exponent field gives the integer part; a quadratic polynomial
 * in the mantissa (rescaled to [1,2)) gives the fractional part. The
 * absolute error is roughly 0.005.
 *
 * The exponent is unbiased in signed arithmetic: the earlier unsigned
 * subtraction wrapped around for x < 2 and produced a huge positive value.
 * Results for x >= 2 are unchanged. The sign bit is ignored; zero and
 * denormal inputs are not handled specially.
 */
static inline float mg_log2(float x)
{
	union { float f; uint32_t i; } z = { x };
	float log_2 = (float)((int32_t)((z.i >> 23) & 255) - 128); // signed: may be negative for x < 2
	z.i &= ~(255U << 23); // clear the exponent field ...
	z.i += 127U << 23;    // ... and set it to the bias, so that z.f lies in [1,2)
	log_2 += (-0.34484843f * z.f + 2.02466578f) * z.f - 0.67487759f;
	return log_2;
}
72
+
73
+ extern unsigned char seq_nt4_table[256];
74
+
75
+ void mg_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, mg128_v *p);
76
+
77
+ void *mg_idx_a2h(void *km, int32_t n_a, mg128_t *a, int suflen, uint64_t **q_, int32_t *n_);
78
+ const uint64_t *mg_idx_hget(const void *h_, const uint64_t *q, int suflen, uint64_t minier, int *n);
79
+ void mg_idx_hfree(void *h_);
80
+
81
+ const uint64_t *mg_idx_get(const mg_idx_t *gi, uint64_t minier, int *n);
82
+ void mg_idx_cal_quantile(const mg_idx_t *gi, int32_t m, float f[], int32_t q[]);
83
+
84
+ uint64_t *mg_chain_backtrack(void *km, int64_t n, const int32_t *f, const int64_t *p, int32_t *v, int32_t *t, int32_t min_cnt, int32_t min_sc, int32_t max_drop,
85
+ int32_t extra_u, int32_t *n_u_, int32_t *n_v_);
86
+ mg128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
87
+ int is_cdna, int n_seg, int64_t n, mg128_t *a, int *n_u_, uint64_t **_u, void *km);
88
+ mg128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
89
+ int64_t n, mg128_t *a, int *n_u_, uint64_t **_u, void *km);
90
+ mg_lchain_t *mg_lchain_gen(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mg128_t *a);
91
+ void mg_update_anchors(int32_t n_a, mg128_t *a, int32_t n, const int32_t *mini_pos);
92
+
93
+ mg_pathv_t *mg_shortest_k(void *km0, const gfa_t *g, uint32_t src, int32_t n_dst, mg_path_dst_t *dst, int32_t max_dist, int32_t max_k, int32_t *n_pathv);
94
+ int32_t mg_gchain1_dp(void *km, const gfa_t *g, int32_t *n_lc_, mg_lchain_t *lc, int32_t qlen, int32_t max_dist_g, int32_t max_dist_q, int32_t bw, int32_t max_skip,
95
+ int32_t ref_bonus, float chn_pen_gap, float chn_pen_skip, float mask_level, const mg128_t *an, uint64_t **u_);
96
+ mg_gchains_t *mg_gchain_gen(void *km_dst, void *km, const gfa_t *g, const gfa_edseq_t *es, int32_t n_u, const uint64_t *u,
97
+ mg_lchain_t *lc, const mg128_t *a, uint32_t hash, int32_t min_gc_cnt, int32_t min_gc_score,
98
+ int32_t gdp_max_ed, int32_t n_seg, const char *qseq);
99
+ void mg_gchain_cigar(void *km, const gfa_t *g, const gfa_edseq_t *es, const char *qseq, mg_gchains_t *gt, const char *qname);
100
+ void mg_gchain_free(mg_gchains_t *gs);
101
+
102
+ uint32_t *lv_ed_unified(void *km, int32_t tl, const char *ts, int32_t ql, const char *qs, int32_t is_ext, int32_t *score, int32_t *t_endl, int32_t *q_endl, int32_t *n_cigar);
103
+
104
+ void mg_gchain_restore_order(void *km, mg_gchains_t *gcs);
105
+ void mg_gchain_restore_offset(mg_gchains_t *gcs);
106
+ void mg_gchain_sort_by_score(void *km, mg_gchains_t *gcs);
107
+ void mg_gchain_set_parent(void *km, float mask_level, int n, mg_gchain_t *r, int sub_diff, int hard_mask_level);
108
+ int mg_gchain_flt_sub(float pri_ratio, int min_diff, int best_n, int n, mg_gchain_t *r);
109
+ void mg_gchain_drop_flt(void *km, mg_gchains_t *gcs);
110
+ void mg_gchain_set_mapq(void *km, mg_gchains_t *gcs, int qlen, int max_mini, int min_gc_score);
111
+
112
+ void mg_cov_map(const gfa_t *g, const mg_gchains_t *gt, int32_t min_mapq, int32_t min_blen, double *c_seg, double *c_link, const char *qname);
113
+ void mg_cov_asm(const gfa_t *g, int32_t n_seq, mg_gchains_t *const *gcs, int32_t min_mapq, int32_t min_blen, double *cov_seg, double *cov_link);
114
+
115
+ void mg_print_lchain(FILE *fp, const mg_idx_t *gi, int n_lc0, const mg_lchain_t *lc, const mg128_t *a, const char *qname);
116
+ void mg_write_gaf(kstring_t *s, const gfa_t *g, const mg_gchains_t *gs, int32_t n_seg, const int32_t *qlens, const char *qname, uint64_t flag, void *km);
117
+
118
+ void mg_sprintf_lite(kstring_t *s, const char *fmt, ...);
119
+
120
+ void radix_sort_128x(mg128_t *beg, mg128_t *end);
121
+ void radix_sort_gfa64(uint64_t *beg, uint64_t *end);
122
+ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
123
+
124
+ #ifdef __cplusplus
125
+ }
126
+ #endif
127
+
128
+ #endif