ruby-minigraph 0.0.20.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +62 -0
  4. data/ext/Rakefile +56 -0
  5. data/ext/cmappy/cmappy.c +7 -0
  6. data/ext/cmappy/cmappy.h +8 -0
  7. data/ext/minigraph/LICENSE.txt +23 -0
  8. data/ext/minigraph/Makefile +66 -0
  9. data/ext/minigraph/NEWS.md +317 -0
  10. data/ext/minigraph/README.md +207 -0
  11. data/ext/minigraph/algo.c +194 -0
  12. data/ext/minigraph/algo.h +33 -0
  13. data/ext/minigraph/asm-call.c +147 -0
  14. data/ext/minigraph/bseq.c +133 -0
  15. data/ext/minigraph/bseq.h +76 -0
  16. data/ext/minigraph/cal_cov.c +139 -0
  17. data/ext/minigraph/doc/example1.png +0 -0
  18. data/ext/minigraph/doc/example2.png +0 -0
  19. data/ext/minigraph/doc/examples.graffle +0 -0
  20. data/ext/minigraph/format.c +241 -0
  21. data/ext/minigraph/galign.c +140 -0
  22. data/ext/minigraph/gchain1.c +532 -0
  23. data/ext/minigraph/gcmisc.c +223 -0
  24. data/ext/minigraph/gfa-aug.c +260 -0
  25. data/ext/minigraph/gfa-base.c +526 -0
  26. data/ext/minigraph/gfa-bbl.c +372 -0
  27. data/ext/minigraph/gfa-ed.c +617 -0
  28. data/ext/minigraph/gfa-io.c +395 -0
  29. data/ext/minigraph/gfa-priv.h +154 -0
  30. data/ext/minigraph/gfa.h +166 -0
  31. data/ext/minigraph/ggen.c +182 -0
  32. data/ext/minigraph/ggen.h +21 -0
  33. data/ext/minigraph/ggsimple.c +570 -0
  34. data/ext/minigraph/gmap.c +211 -0
  35. data/ext/minigraph/index.c +230 -0
  36. data/ext/minigraph/kalloc.c +224 -0
  37. data/ext/minigraph/kalloc.h +82 -0
  38. data/ext/minigraph/kavl.h +414 -0
  39. data/ext/minigraph/kdq.h +134 -0
  40. data/ext/minigraph/ketopt.h +116 -0
  41. data/ext/minigraph/khashl.h +348 -0
  42. data/ext/minigraph/krmq.h +474 -0
  43. data/ext/minigraph/kseq.h +256 -0
  44. data/ext/minigraph/ksort.h +164 -0
  45. data/ext/minigraph/kstring.h +165 -0
  46. data/ext/minigraph/kthread.c +159 -0
  47. data/ext/minigraph/kthread.h +15 -0
  48. data/ext/minigraph/kvec-km.h +105 -0
  49. data/ext/minigraph/kvec.h +110 -0
  50. data/ext/minigraph/lchain.c +441 -0
  51. data/ext/minigraph/main.c +301 -0
  52. data/ext/minigraph/map-algo.c +500 -0
  53. data/ext/minigraph/mgpriv.h +128 -0
  54. data/ext/minigraph/minigraph.1 +359 -0
  55. data/ext/minigraph/minigraph.h +176 -0
  56. data/ext/minigraph/miniwfa.c +834 -0
  57. data/ext/minigraph/miniwfa.h +95 -0
  58. data/ext/minigraph/misc/mgutils.js +1451 -0
  59. data/ext/minigraph/misc.c +12 -0
  60. data/ext/minigraph/options.c +134 -0
  61. data/ext/minigraph/shortk.c +251 -0
  62. data/ext/minigraph/sketch.c +109 -0
  63. data/ext/minigraph/sys.c +147 -0
  64. data/ext/minigraph/sys.h +20 -0
  65. data/ext/minigraph/test/MT-chimp.fa +277 -0
  66. data/ext/minigraph/test/MT-human.fa +239 -0
  67. data/ext/minigraph/test/MT-orangA.fa +276 -0
  68. data/ext/minigraph/test/MT.gfa +19 -0
  69. data/ext/minigraph/tex/Makefile +13 -0
  70. data/ext/minigraph/tex/minigraph.bib +676 -0
  71. data/ext/minigraph/tex/minigraph.tex +986 -0
  72. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
  73. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
  74. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
  75. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
  76. data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
  77. data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
  78. data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
  79. data/ext/minigraph/tex/plots/bedutils.js +367 -0
  80. data/ext/minigraph/tex/plots/chr-plot.js +130 -0
  81. data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
  82. data/ext/minigraph.patch +21 -0
  83. data/lib/minigraph/ffi/constants.rb +230 -0
  84. data/lib/minigraph/ffi/functions.rb +70 -0
  85. data/lib/minigraph/ffi/mappy.rb +8 -0
  86. data/lib/minigraph/ffi.rb +27 -0
  87. data/lib/minigraph/version.rb +5 -0
  88. data/lib/minigraph.rb +72 -0
  89. metadata +159 -0
@@ -0,0 +1,570 @@
1
+ #include <assert.h>
2
+ #include "mgpriv.h"
3
+ #include "gfa-priv.h"
4
+ #include "kalloc.h"
5
+ #include "bseq.h"
6
+ #include "algo.h"
7
+ #include "sys.h"
8
+ #include "ggen.h"
9
+ #include "kvec-km.h"
10
+
11
+ int32_t mg_gc_index(void *km, int min_mapq, int min_map_len, int min_depth_len, const gfa_t *g, int32_t n_seq, mg_gchains_t *const* gcs,
12
+ double *a_dens, int32_t **soff_, int32_t **qoff_, mg_intv_t **sintv_, mg_intv_t **qintv_)
13
+ {
14
+ int32_t t, i, j, max_acnt, *scnt, *soff, *qcnt, *qoff;
15
+ int64_t sum_acnt, sum_alen;
16
+ mg_intv_t *sintv, *qintv;
17
+
18
+ // count the number of intervals on each segment
19
+ KCALLOC(km, scnt, g->n_seg);
20
+ KCALLOC(km, qcnt, n_seq);
21
+ for (t = 0, max_acnt = 0; t < n_seq; ++t) {
22
+ const mg_gchains_t *gt = gcs[t];
23
+ for (i = 0; i < gt->n_gc; ++i) {
24
+ const mg_gchain_t *gc = &gt->gc[i];
25
+ if (gc->id != gc->parent) continue;
26
+ if (gc->blen < min_depth_len || gc->mapq < min_mapq) continue;
27
+ if (gc->n_anchor > max_acnt) max_acnt = gc->n_anchor;
28
+ ++qcnt[t];
29
+ for (j = 0; j < gc->cnt; ++j)
30
+ ++scnt[gt->lc[gc->off + j].v>>1];
31
+ }
32
+ }
33
+ if (max_acnt == 0) { // no gchain
34
+ kfree(km, scnt); kfree(km, qcnt);
35
+ return 0;
36
+ }
37
+
38
+ // compute soff[] and qoff[]
39
+ KMALLOC(km, soff, g->n_seg + 1);
40
+ KMALLOC(km, qoff, n_seq + 1);
41
+ for (soff[0] = 0, i = 1; i <= g->n_seg; ++i)
42
+ soff[i] = soff[i - 1] + scnt[i - 1];
43
+ for (qoff[0] = 0, i = 1; i <= n_seq; ++i)
44
+ qoff[i] = qoff[i - 1] + qcnt[i - 1];
45
+
46
+ // populate the interval list
47
+ memset(scnt, 0, 4 * g->n_seg);
48
+ memset(qcnt, 0, 4 * n_seq);
49
+ KMALLOC(km, sintv, soff[g->n_seg]);
50
+ KMALLOC(km, qintv, qoff[n_seq]);
51
+ sum_acnt = sum_alen = 0;
52
+ for (t = 0; t < n_seq; ++t) {
53
+ const mg_gchains_t *gt = gcs[t];
54
+ for (i = 0; i < gt->n_gc; ++i) {
55
+ const mg_gchain_t *gc = &gt->gc[i];
56
+ mg_intv_t *p;
57
+ if (gc->id != gc->parent) continue;
58
+ if (gc->blen < min_depth_len || gc->mapq < min_mapq) continue;
59
+ p = &qintv[qoff[t] + qcnt[t]];
60
+ ++qcnt[t];
61
+ p->st = gc->qs, p->en = gc->qe, p->rev = 0, p->far = -1, p->i = -1;
62
+ for (j = 0; j < gc->cnt; ++j) {
63
+ const mg_llchain_t *lc = &gt->lc[gc->off + j];
64
+ int32_t rs, re, tmp;
65
+ if (lc->cnt > 0) { // compute start and end on the forward strand on the segment
66
+ const mg128_t *qs = &gt->a[lc->off];
67
+ const mg128_t *qe = &gt->a[lc->off + lc->cnt - 1];
68
+ int32_t rs0 = (int32_t)qs->x + 1 - (int32_t)(qs->y>>32&0xff);
69
+ int32_t re0 = (int32_t)qe->x;
70
+ assert(rs0 >= 0 && re0 > rs0 && re0 < g->seg[lc->v>>1].len);
71
+ sum_alen += re0 - rs0, sum_acnt += (qe->x>>32) - (qs->x>>32) + 1;
72
+ rs = 0, re = g->seg[lc->v>>1].len;
73
+ if (j == 0) rs = gc->p? gc->p->ss : rs0;
74
+ if (j == gc->cnt - 1) re = gc->p? gc->p->ee : re0;
75
+ if (lc->v&1) // swap rs and re
76
+ tmp = rs, rs = g->seg[lc->v>>1].len - re, re = g->seg[lc->v>>1].len - tmp;
77
+ } else rs = 0, re = g->seg[lc->v>>1].len;
78
+ p = &sintv[soff[lc->v>>1] + scnt[lc->v>>1]];
79
+ ++scnt[lc->v>>1];
80
+ p->st = rs, p->en = re, p->rev = lc->v&1, p->far = -1, p->i = -1;
81
+ }
82
+ }
83
+ }
84
+ *a_dens = (double)sum_acnt / sum_alen;
85
+
86
+ // sort and index intervals
87
+ for (i = 0; i < g->n_seg; ++i) {
88
+ assert(soff[i+1] - soff[i] == scnt[i]);
89
+ mg_intv_index(soff[i+1] - soff[i], &sintv[soff[i]]);
90
+ }
91
+ kfree(km, scnt);
92
+ for (i = 0; i < n_seq; ++i) {
93
+ assert(qoff[i+1] - qoff[i] == qcnt[i]);
94
+ mg_intv_index(qoff[i+1] - qoff[i], &qintv[qoff[i]]);
95
+ }
96
+ kfree(km, qcnt);
97
+
98
+ *sintv_ = sintv, *qintv_ = qintv;
99
+ *soff_ = soff, *qoff_ = qoff;
100
+ return max_acnt;
101
+ }
102
+
103
+ /**********************
104
+ * Graph augmentation *
105
+ **********************/
106
+
107
+ void mg_ggsimple(void *km, const mg_ggopt_t *opt, gfa_t *g, int32_t n_seq, const mg_bseq1_t *seq, mg_gchains_t *const* gcs)
108
+ {
109
+ int32_t t, i, j, *soff, *qoff, max_acnt, *sc, m_ovlp = 0, *ovlp = 0, n_ins, m_ins, n_inv;
110
+ int32_t l_pseq, m_pseq;
111
+ uint64_t *meta;
112
+ mg_intv_t *sintv, *qintv;
113
+ double a_dens;
114
+ gfa_ins_t *ins;
115
+ char *pseq;
116
+
117
+ max_acnt = mg_gc_index(km, opt->min_mapq, opt->min_map_len, opt->min_depth_len, g, n_seq, gcs, &a_dens, &soff, &qoff, &sintv, &qintv);
118
+ if (max_acnt == 0) return;
119
+
120
+ // extract poorly regions
121
+ m_pseq = l_pseq = 0, pseq = 0;
122
+ m_ins = n_ins = 0, ins = 0;
123
+ n_inv = 0;
124
+ KMALLOC(km, sc, max_acnt);
125
+ KMALLOC(km, meta, max_acnt);
126
+ for (t = 0; t < n_seq; ++t) {
127
+ const mg_gchains_t *gt = gcs[t];
128
+ for (i = 0; i < gt->n_gc; ++i) {
129
+ const mg_gchain_t *gc = &gt->gc[i];
130
+ int32_t off_a, off_l, n_ss, far_q;
131
+ mg_msseg_t *ss;
132
+ if (gc->id != gc->parent) continue;
133
+ if (gc->blen < opt->min_map_len || gc->mapq < opt->min_mapq) continue;
134
+ assert(gc->cnt > 0);
135
+
136
+ // fill sc[]. This part achieves a similar goal to the one in mg_gchain_extra(). It makes more assumptions, but is logically simpler.
137
+ off_l = gc->off;
138
+ off_a = gt->lc[off_l].off + 1;
139
+ far_q = 0;
140
+ for (j = 1; j < gc->n_anchor; ++j, ++off_a) {
141
+ const mg128_t *q = &gt->a[off_a - 1], *p = &gt->a[off_a];
142
+ const mg_llchain_t *lc = &gt->lc[off_l];
143
+ int32_t s, ed = -1, off_l0 = off_l, pd, qd = (int32_t)p->y - (int32_t)q->y, c = (int32_t)(p->x>>32) - (int32_t)(q->x>>32) - 1;
144
+ if ((int32_t)q->y > far_q) far_q = (int32_t)q->y; // far_q keeps the rightmost query position seen so far
145
+ if (off_a == lc->off + lc->cnt) { // we are at the end of the current lchain
146
+ pd = g->seg[lc->v>>1].len - (int32_t)q->x - 1;
147
+ for (++off_l; off_l < gc->off + gc->cnt && gt->lc[off_l].cnt == 0; ++off_l)
148
+ pd += g->seg[gt->lc[off_l].v>>1].len;
149
+ assert(off_l < gc->off + gc->cnt);
150
+ if (gt->lc[off_l].ed >= 0) ed = gt->lc[off_l].ed;
151
+ pd += (int32_t)p->x + 1;
152
+ } else pd = (int32_t)p->x - (int32_t)q->x;
153
+ if ((opt->flag&MG_G_NO_QOVLP) && (int32_t)p->y < far_q) s = 1; // query overlap
154
+ else if (pd == qd && c == 0) s = -opt->match_pen;
155
+ else if (ed >= 0) {
156
+ int32_t min_d = pd < qd? pd : qd;
157
+ double t = 1. / (1.01 - opt->ggs_max_iden);
158
+ if (t > 10.) t = 10.;
159
+ s = (int32_t)(ed * t - min_d);
160
+ } else if (pd > qd) {
161
+ double x = qd * a_dens;
162
+ x = x > c? x : c;
163
+ s = (int32_t)(x + (pd - qd) * a_dens + .499);
164
+ } else {
165
+ s = (int32_t)(qd * a_dens + .499);
166
+ s = s > c? s : c;
167
+ }
168
+ sc[j - 1] = s;
169
+ meta[j-1] = (uint64_t)pd<<32 | off_l0;
170
+ }
171
+
172
+ // get regions to insert
173
+ ss = mg_mss_all(0, gc->n_anchor - 1, sc, 10, 0, &n_ss);
174
+ off_a = gt->lc[gc->off].off;
175
+ for (j = 0; j < n_ss; ++j) {
176
+ const mg128_t *p, *q;
177
+ int32_t st, en, ls, le, span, pd, k, n_ovlp, min_len, is_inv = 0;
178
+ gfa_ins_t I;
179
+
180
+ // find the initial positions
181
+ min_len = opt->ggs_min_end_cnt > 0? opt->ggs_min_end_cnt : 0;
182
+ if (min_len < ss[j].sc * opt->ggs_min_end_frac) min_len = ss[j].sc * opt->ggs_min_end_frac;
183
+ if (ss[j].st <= min_len || ss[j].en >= gc->n_anchor - 1 - min_len) continue; // too close to ends
184
+ st = ss[j].st, en = ss[j].en;
185
+ q = &gt->a[off_a + st];
186
+ p = &gt->a[off_a + en];
187
+ span = p->y>>32&0xff;
188
+ I.ctg = t;
189
+ ls = (int32_t)meta[st], le = (int32_t)meta[en]; // first and last lchain; CLOSED interval
190
+ assert(ls <= le);
191
+ I.v[0] = gt->lc[ls].v;
192
+ I.v[1] = gt->lc[le].v;
193
+ I.voff[0] = (int32_t)q->x + 1 - span;
194
+ I.voff[1] = (int32_t)p->x + 1;
195
+ I.coff[0] = (int32_t)q->y + 1 - span;
196
+ I.coff[1] = (int32_t)p->y + 1;
197
+ assert(I.voff[0] <= g->seg[I.v[0]>>1].len);
198
+ assert(I.voff[1] <= g->seg[I.v[1]>>1].len);
199
+ for (k = st, pd = span; k < en; ++k)
200
+ pd += meta[k]>>32;
201
+
202
+ if (I.coff[0] > I.coff[1]) {
203
+ if (mg_verbose >= 2 && pd + (I.coff[0] - I.coff[1]) >= opt->min_var_len)
204
+ fprintf(stderr, "[W::%s] query overlap on gchain %d: [%c%s:%d,%c%s:%d|%d] <=> %s:[%d,%d|%d]\n", __func__, t, "><"[I.v[0]&1], g->seg[I.v[0]>>1].name, I.voff[0], "><"[I.v[1]&1], g->seg[I.v[1]>>1].name, I.voff[1], pd, seq[t].name, I.coff[0], I.coff[1], I.coff[1] - I.coff[0]);
205
+ continue; // such overlap can't be properly resolved
206
+ }
207
+ pd -= gfa_ins_adj(g, opt->ggs_shrink_pen, &I, seq[t].seq);
208
+
209
+ min_len = pd > I.coff[1] - I.coff[0]? pd : I.coff[1] - I.coff[0];
210
+ if (I.coff[0] <= min_len || I.coff[1] >= seq[t].l_seq - min_len) continue; // test if the event is close to ends again
211
+
212
+ // filtering
213
+ if (I.coff[1] - I.coff[0] < opt->min_var_len && pd < opt->min_var_len)
214
+ continue;
215
+ for (k = I.coff[0]; k < I.coff[1]; ++k) { // test ambiguous bases
216
+ int c = seq[t].seq[k];
217
+ if (c == 'n' || c == 'N') break;
218
+ }
219
+ if (k != I.coff[1]) continue; // no ambiguous bases on the insert
220
+ n_ovlp = mg_intv_overlap(km, qoff[t+1] - qoff[t], &qintv[qoff[t]], I.coff[0], I.coff[1], &ovlp, &m_ovlp); // test overlapping on the query
221
+ if (n_ovlp == 0) fprintf(stderr, "[W::%s] query interval %s:%d-%d is not covered\n", __func__, seq[t].name, I.coff[0], I.coff[1]);
222
+ if (n_ovlp != 1) continue;
223
+ for (k = ls; k <= le; ++k) { // find other mappings overlapping with the insert on the graph
224
+ uint32_t v = gt->lc[k].v, len = g->seg[v>>1].len;
225
+ int32_t s = 0, e = len, tmp;
226
+ if (k == ls) s = (int32_t)gt->a[off_a+st].x + 1 - (int32_t)(gt->a[off_a+st].y>>32&0xff);
227
+ if (k == le) e = (int32_t)gt->a[off_a+en].x + 1;
228
+ if (v&1) tmp = s, s = len - e, e = len - tmp;
229
+ n_ovlp = mg_intv_overlap(km, soff[(v>>1)+1] - soff[v>>1], &sintv[soff[v>>1]], s, e, &ovlp, &m_ovlp);
230
+ if (n_ovlp == 0) fprintf(stderr, "[W::%s] graph interval %s:%d-%d is not covered by %s:%d-%d\n", __func__, g->seg[v>>1].name, s, e, seq[t].name, I.coff[0], I.coff[1]); // this should be an assert()
231
+ if (n_ovlp != 1) break;
232
+ }
233
+ if (k <= le) continue;
234
+ if (pd - (I.coff[1] - I.coff[0]) < opt->min_var_len && (I.coff[1] - I.coff[0]) - pd < opt->min_var_len) { // if length difference > min_var_len, just insert
235
+ int32_t qd = I.coff[1] - I.coff[0], mlen, blen, score;
236
+ l_pseq = mg_path2seq(km, g, gt, ls, le, I.voff, &pseq, &m_pseq);
237
+ score = mg_wfa_cmp(km, l_pseq, pseq, qd, &seq[t].seq[I.coff[0]], 5000, &mlen, &blen);
238
+ if (score > 0) {
239
+ if (mlen > blen * opt->ggs_max_iden) continue; // make sure k-mer identity is small enough
240
+ if (blen - mlen < opt->min_var_len * opt->ggs_max_iden) continue;
241
+ } else if (!(opt->flag & MG_G_NO_INV)) {
242
+ mg_revcomp_seq(l_pseq, pseq);
243
+ score = mg_wfa_cmp(km, l_pseq, pseq, qd, &seq[t].seq[I.coff[0]], 5000, &mlen, &blen);
244
+ if (score > 0 && mlen > blen * opt->ggs_min_inv_iden) is_inv = 1;
245
+ }
246
+ }
247
+ if (mg_dbg_flag & MG_DBG_INSERT) {
248
+ int32_t mlen, blen, score, qd = I.coff[1] - I.coff[0];
249
+ l_pseq = mg_path2seq(km, g, gt, ls, le, I.voff, &pseq, &m_pseq);
250
+ fprintf(stderr, "IN\t[%c%s:%d,%c%s:%d|%d] <=> %s:[%d,%d|%d] inv:%d\n", "><"[I.v[0]&1], g->seg[I.v[0]>>1].name, I.voff[0], "><"[I.v[1]&1], g->seg[I.v[1]>>1].name, I.voff[1], pd, seq[t].name, I.coff[0], I.coff[1], I.coff[1] - I.coff[0], is_inv);
251
+ fprintf(stderr, "IP\t%s\nIQ\t", pseq);
252
+ fwrite(&seq[t].seq[I.coff[0]], 1, qd, stderr);
253
+ if (pd - qd < opt->min_var_len && qd - pd < opt->min_var_len) {
254
+ score = mg_wfa_cmp(km, l_pseq, pseq, qd, &seq[t].seq[I.coff[0]], 5000, &mlen, &blen);
255
+ } else score = -1, mlen = 0, blen = pd > qd? pd : qd;
256
+ fprintf(stderr, "\nIS\t%d==%d\tnwcmp:%d\tmlen:%d\tblen:%d\n", pd, l_pseq, score, mlen, blen);
257
+ }
258
+ if (is_inv) { // turn one inversion to two events
259
+ gfa_ins_t I_inv[2];
260
+ I_inv[0].ctg = I_inv[1].ctg = I.ctg;
261
+ // the first event
262
+ I_inv[0].coff[0] = I_inv[0].coff[1] = I.coff[0];
263
+ I_inv[0].v[0] = I.v[0];
264
+ I_inv[0].voff[0] = I.voff[0];
265
+ I_inv[0].v[1] = I.v[1]^1;
266
+ I_inv[0].voff[1] = g->seg[I.v[1]>>1].len - I.voff[1];
267
+ // the second event
268
+ I_inv[1].coff[0] = I_inv[1].coff[1] = I.coff[1];
269
+ I_inv[1].v[0] = I.v[0]^1;
270
+ I_inv[1].voff[0] = g->seg[I.v[0]>>1].len - I.voff[0];
271
+ I_inv[1].v[1] = I.v[1];
272
+ I_inv[1].voff[1] = I.voff[1];
273
+ // insert
274
+ if (n_ins == m_ins) KEXPAND(km, ins, m_ins);
275
+ ins[n_ins++] = I_inv[0];
276
+ if (n_ins == m_ins) KEXPAND(km, ins, m_ins);
277
+ ins[n_ins++] = I_inv[1];
278
+ ++n_inv;
279
+ } else {
280
+ if (n_ins == m_ins) KEXPAND(km, ins, m_ins);
281
+ ins[n_ins++] = I;
282
+ }
283
+ }
284
+ kfree(0, ss);
285
+ }
286
+ }
287
+ kfree(km, pseq);
288
+ kfree(km, ovlp);
289
+ kfree(km, sc);
290
+ kfree(km, meta);
291
+ kfree(km, soff); kfree(km, qoff);
292
+ kfree(km, sintv); kfree(km, qintv);
293
+
294
+ if (n_ins > 0) {
295
+ char **names, **seqs;
296
+ KMALLOC(km, names, n_seq);
297
+ KMALLOC(km, seqs, n_seq);
298
+ for (i = 0; i < n_seq; ++i)
299
+ names[i] = seq[i].name, seqs[i] = seq[i].seq;
300
+ n_ins = gfa_ins_filter(g, n_ins, ins);
301
+ gfa_augment(g, n_ins, ins, n_seq, (const char*const*)names, (const char*const*)seqs);
302
+ kfree(km, ins);
303
+ kfree(km, names);
304
+ kfree(km, seqs);
305
+ }
306
+ if (mg_verbose >= 3)
307
+ fprintf(stderr, "[M::%s::%.3f*%.2f] inserted %d events, including %d inversions\n", __func__,
308
+ realtime() - mg_realtime0, cputime() / (realtime() - mg_realtime0), n_ins, n_inv);
309
+ }
310
+
311
+ /************************************
312
+ * Graph augmentation (CIGAR-based) *
313
+ ************************************/
314
+
315
/* One piece of a chain's CIGAR, confined to a single linear chain (see gg_write_intv()). */
typedef struct {
	int32_t lc;  // index into gt->lc[] of the linear chain this piece lies on
	int32_t vo;  // start offset of the piece on that chain's segment
	int32_t qo;  // start offset of the piece on the query
	int32_t po;  // start offset of the piece along the chain's path (0 at the chain start)
	int32_t len; // length of the piece
	int32_t op;  // CIGAR operation code (low 4 bits of the CIGAR word)
	int32_t sc;  // score assigned by gg_score_intv()
} ed_intv_t;
318
+
319
+ static int32_t gg_count_intv(const gfa_t *g, const mg_gchains_t *gt, int32_t i)
320
+ {
321
+ const mg_gchain_t *gc = &gt->gc[i];
322
+ int32_t j, l = gc->off, x = gc->ps, n = 0;
323
+ assert(gc->p);
324
+ for (j = 0; j < gc->p->n_cigar; ++j) {
325
+ int32_t op = gc->p->cigar[j]&0xf, len = gc->p->cigar[j]>>4, rl = len;
326
+ assert(op == 1 || op == 2 || op == 7 || op == 8);
327
+ if (op == 2 || op == 7 || op == 8) {
328
+ while (x + rl > g->seg[gt->lc[l].v>>1].len) {
329
+ rl -= g->seg[gt->lc[l].v>>1].len - x;
330
+ ++n, ++l, x = 0;
331
+ }
332
+ x += rl;
333
+ }
334
+ ++n;
335
+ }
336
+ return n;
337
+ }
338
+
339
+ static void gg_write_intv(const gfa_t *g, const mg_gchains_t *gt, int32_t i, ed_intv_t *intv)
340
+ {
341
+ const mg_gchain_t *gc = &gt->gc[i];
342
+ int32_t j, l = gc->off, pl = 0, x = gc->ps, y = gc->qs, n = 0;
343
+ ed_intv_t *p;
344
+ assert(gc->p);
345
+ for (j = 0; j < gc->p->n_cigar; ++j) {
346
+ int32_t op = gc->p->cigar[j]&0xf, len = gc->p->cigar[j]>>4, rl = len;
347
+ if (op == 2 || op == 7 || op == 8) {
348
+ while (x + rl > g->seg[gt->lc[l].v>>1].len) {
349
+ p = &intv[n++];
350
+ p->lc = l, p->vo = x, p->qo = y, p->po = pl, p->len = g->seg[gt->lc[l].v>>1].len - x, p->op = op;
351
+ if (op == 7 || op == 8) y += p->len;
352
+ rl -= p->len, pl += p->len, ++l, x = 0;
353
+ }
354
+ }
355
+ p = &intv[n++];
356
+ p->lc = l, p->vo = x, p->qo = y, p->po = pl, p->len = rl, p->op = op;
357
+ if (op == 7 || op == 8) x += rl, y += rl, pl += rl;
358
+ else if (op == 1) y += rl;
359
+ else if (op == 2) x += rl, pl += rl;
360
+ }
361
+ assert(y == gc->qe && pl == gc->pe - gc->ps);
362
+ }
363
+
364
+ static void gg_score_intv(int32_t n_intv, ed_intv_t *intv)
365
+ {
366
+ int32_t j;
367
+ for (j = 0; j < n_intv; ++j) {
368
+ int32_t s;
369
+ if (intv[j].op == 7)
370
+ s = intv[j].len >= 10? -intv[j].len : 0;
371
+ else s = intv[j].len;
372
+ intv[j].sc = s;
373
+ }
374
+ }
375
+
376
+ static void gg_merge_seg(const ed_intv_t *intv, int32_t n_ss, mg_msseg_t *ss)
377
+ {
378
+ int32_t j0, j;
379
+ for (j0 = 0, j = 1; j < n_ss; ++j) {
380
+ mg_msseg_t *s0 = &ss[j0], *s1 = &ss[j];
381
+ int32_t i, mid = 0;
382
+ for (i = s0->en + 1; i < s1->st; ++i)
383
+ mid += intv[i].sc;
384
+ //fprintf(stderr, "XX\t%d\t%d\t%d\t%d\t%d\t%d\n", j, s0->sc, mid, s1->sc, s0->en+1, s1->st);
385
+ if (-mid < s0->sc * 0.2 && -mid < s1->sc * 0.2) { // FIXME: mid is sometimes 0
386
+ s0->en = s1->en, s0->sc += s1->sc + mid;
387
+ s1->st = s1->en, s1->sc = 0;
388
+ } else j0 = j;
389
+ }
390
+ }
391
+
392
+ void mg_ggsimple_cigar(void *km, const mg_ggopt_t *opt, gfa_t *g, int32_t n_seq, const mg_bseq1_t *seq, mg_gchains_t *const* gcs)
393
+ {
394
+ int32_t t, i, *soff, *qoff, max_acnt, m_ovlp = 0, *ovlp = 0, n_ins = 0, m_ins, n_inv;
395
+ int32_t l_pseq, m_pseq;
396
+ mg_intv_t *sintv, *qintv;
397
+ double a_dens;
398
+ gfa_ins_t *ins;
399
+ char *pseq;
400
+
401
+ max_acnt = mg_gc_index(km, opt->min_mapq, opt->min_map_len, opt->min_depth_len, g, n_seq, gcs, &a_dens, &soff, &qoff, &sintv, &qintv);
402
+ if (max_acnt == 0) return;
403
+
404
+ // extract poorly regions
405
+ m_pseq = l_pseq = 0, pseq = 0;
406
+ m_ins = n_ins = 0, ins = 0;
407
+ n_inv = 0;
408
+ for (t = 0; t < n_seq; ++t) {
409
+ const mg_gchains_t *gt = gcs[t];
410
+ for (i = 0; i < gt->n_gc; ++i) {
411
+ const mg_gchain_t *gc = &gt->gc[i];
412
+ int32_t j, n_ss, n_intv, *sc;
413
+ ed_intv_t *intv;
414
+ mg_msseg_t *ss;
415
+ if (gc->id != gc->parent) continue;
416
+ if (gc->p == 0 || gc->blen < opt->min_map_len || gc->mapq < opt->min_mapq) continue;
417
+ assert(gc->cnt > 0);
418
+
419
+ n_intv = gg_count_intv(g, gt, i);
420
+ KCALLOC(km, intv, n_intv);
421
+ gg_write_intv(g, gt, i, intv);
422
+ gg_score_intv(n_intv, intv);
423
+ KCALLOC(km, sc, n_intv);
424
+ for (j = 0; j < n_intv; ++j) sc[j] = intv[j].sc;
425
+ ss = mg_mss_all(0, n_intv, sc, opt->min_var_len, 2 * opt->min_var_len, &n_ss);
426
+ gg_merge_seg(intv, n_ss, ss);
427
+
428
+ // get regions to insert
429
+ for (j = 0; j < n_ss; ++j) {
430
+ int32_t st, en, pd, k, n_ovlp, min_len, is_inv = 0, ls, le;
431
+ gfa_ins_t I;
432
+ ed_intv_t *is, *ie;
433
+
434
+ // find the initial positions
435
+ st = ss[j].st, en = ss[j].en; // this is a CLOSED interval
436
+ if (st == en) continue;
437
+ is = &intv[st], ie = &intv[en - 1];
438
+ assert(is->op != 7 && ie->op != 7);
439
+
440
+ ls = is->lc, le = ie->lc;
441
+ I.ctg = t;
442
+ I.v[0] = gt->lc[ls].v;
443
+ I.v[1] = gt->lc[le].v;
444
+ I.voff[0] = is->vo;
445
+ I.voff[1] = ie->vo + (ie->op != 1? ie->len : 0);
446
+ I.coff[0] = is->qo;
447
+ I.coff[1] = ie->qo + (ie->op != 2? ie->len : 0);
448
+ assert(I.voff[0] <= g->seg[I.v[0]>>1].len);
449
+ assert(I.voff[1] <= g->seg[I.v[1]>>1].len);
450
+
451
+ if (I.voff[0] == 0) { // if an insert starts at pos 0, make it start at the end of the previous vertex in the chain
452
+ assert(ls - 1 >= gc->off);
453
+ I.v[0] = gt->lc[--ls].v;
454
+ I.voff[0] = g->seg[I.v[0]>>1].len;
455
+ }
456
+ if (I.voff[1] == g->seg[I.v[1]>>1].len) { // if an insert ends at the end of the vertex, make it end at the beginning of the next vertex
457
+ assert(le + 1 < gc->off + gc->cnt);
458
+ I.v[1] = gt->lc[++le].v;
459
+ I.voff[1] = 0;
460
+ }
461
+
462
+ pd = ie->po + (ie->op != 1? ie->len : 0) - is->po;
463
+ pd -= gfa_ins_adj(g, opt->ggs_shrink_pen, &I, seq[t].seq);
464
+
465
+ min_len = pd > I.coff[1] - I.coff[0]? pd : I.coff[1] - I.coff[0];
466
+ if (I.coff[0] <= min_len || I.coff[1] >= seq[t].l_seq - min_len) continue; // test if the event is close to ends again
467
+
468
+ // filtering
469
+ if (I.coff[1] - I.coff[0] < opt->min_var_len && pd < opt->min_var_len)
470
+ continue;
471
+ for (k = I.coff[0]; k < I.coff[1]; ++k) { // test ambiguous bases
472
+ int c = seq[t].seq[k];
473
+ if (c == 'n' || c == 'N') break;
474
+ }
475
+ if (k != I.coff[1]) continue; // no ambiguous bases on the insert
476
+ n_ovlp = mg_intv_overlap(km, qoff[t+1] - qoff[t], &qintv[qoff[t]], I.coff[0], I.coff[1], &ovlp, &m_ovlp); // test overlapping on the query
477
+ if (n_ovlp == 0) fprintf(stderr, "[W::%s] query interval %s:%d-%d is not covered\n", __func__, seq[t].name, I.coff[0], I.coff[1]);
478
+ if (n_ovlp != 1) continue;
479
+ for (k = is->lc; k <= ie->lc; ++k) { // find other mappings overlapping with the insert on the graph
480
+ uint32_t v = gt->lc[k].v, len = g->seg[v>>1].len;
481
+ int32_t s = 0, e = len, tmp;
482
+ if (k == is->lc) s = is->vo;
483
+ if (k == ie->lc) e = ie->vo + (ie->op != 1? ie->len : 0);
484
+ if (v&1) tmp = s, s = len - e, e = len - tmp;
485
+ if (s == e) {
486
+ if (s == 0) ++e;
487
+ else --s;
488
+ }
489
+ n_ovlp = mg_intv_overlap(km, soff[(v>>1)+1] - soff[v>>1], &sintv[soff[v>>1]], s, e, &ovlp, &m_ovlp);
490
+ if (n_ovlp == 0) fprintf(stderr, "[W::%s] graph interval %c%s:%d-%d is not covered by %s:%d-%d\n", __func__, "><"[v&1], g->seg[v>>1].name, s, e, seq[t].name, I.coff[0], I.coff[1]); // this should be an assert()
491
+ if (n_ovlp != 1) break;
492
+ }
493
+ if (k <= ie->lc) continue;
494
+ if (pd - (I.coff[1] - I.coff[0]) < opt->min_var_len && (I.coff[1] - I.coff[0]) - pd < opt->min_var_len) { // if length difference > min_var_len, just insert
495
+ int32_t qd = I.coff[1] - I.coff[0], mlen, blen, score = 0;
496
+ l_pseq = mg_path2seq(km, g, gt, ls, le, I.voff, &pseq, &m_pseq);
497
+ score = mg_wfa_cmp(km, l_pseq, pseq, qd, &seq[t].seq[I.coff[0]], 5000, &mlen, &blen);
498
+ if (score > 0) {
499
+ if (mlen > blen * opt->ggs_max_iden) continue; // make sure k-mer identity is small enough
500
+ if (blen - mlen < opt->min_var_len * opt->ggs_max_iden) continue;
501
+ } else if (!(opt->flag & MG_G_NO_INV)) {
502
+ mg_revcomp_seq(l_pseq, pseq);
503
+ score = mg_wfa_cmp(km, l_pseq, pseq, qd, &seq[t].seq[I.coff[0]], 5000, &mlen, &blen);
504
+ if (score > 0 && mlen > blen * opt->ggs_min_inv_iden) is_inv = 1;
505
+ }
506
+ }
507
+ if (mg_dbg_flag & MG_DBG_INSERT) {
508
+ int32_t mlen, blen, score, qd = I.coff[1] - I.coff[0];
509
+ l_pseq = mg_path2seq(km, g, gt, ls, le, I.voff, &pseq, &m_pseq);
510
+ fprintf(stderr, "IN\t[%c%s:%d,%c%s:%d|%d] <=> %s:[%d,%d|%d] inv:%d\n", "><"[I.v[0]&1], g->seg[I.v[0]>>1].name, I.voff[0], "><"[I.v[1]&1], g->seg[I.v[1]>>1].name, I.voff[1], pd, seq[t].name, I.coff[0], I.coff[1], I.coff[1] - I.coff[0], is_inv);
511
+ fprintf(stderr, "IP\t%s\nIQ\t", pseq);
512
+ fwrite(&seq[t].seq[I.coff[0]], 1, qd, stderr);
513
+ if (pd - qd < opt->min_var_len && qd - pd < opt->min_var_len) {
514
+ score = mg_wfa_cmp(km, l_pseq, pseq, qd, &seq[t].seq[I.coff[0]], 5000, &mlen, &blen);
515
+ } else score = -1, mlen = 0, blen = pd > qd? pd : qd;
516
+ fprintf(stderr, "\nIS\t%d==%d\tnwcmp:%d\tmlen:%d\tblen:%d\n", pd, l_pseq, score, mlen, blen);
517
+ //if (I.voff[0] == 2305301) { for (k = st; k < en; ++k) fprintf(stderr, "%d%c", intv[k].len, "MIDNSHP=XB"[intv[k].op]); fprintf(stderr, "\n"); }
518
+ }
519
+ if (is_inv) { // turn one inversion to two events
520
+ gfa_ins_t I_inv[2];
521
+ I_inv[0].ctg = I_inv[1].ctg = I.ctg;
522
+ // the first event
523
+ I_inv[0].coff[0] = I_inv[0].coff[1] = I.coff[0];
524
+ I_inv[0].v[0] = I.v[0];
525
+ I_inv[0].voff[0] = I.voff[0];
526
+ I_inv[0].v[1] = I.v[1]^1;
527
+ I_inv[0].voff[1] = g->seg[I.v[1]>>1].len - I.voff[1];
528
+ // the second event
529
+ I_inv[1].coff[0] = I_inv[1].coff[1] = I.coff[1];
530
+ I_inv[1].v[0] = I.v[0]^1;
531
+ I_inv[1].voff[0] = g->seg[I.v[0]>>1].len - I.voff[0];
532
+ I_inv[1].v[1] = I.v[1];
533
+ I_inv[1].voff[1] = I.voff[1];
534
+ // insert
535
+ if (n_ins == m_ins) KEXPAND(km, ins, m_ins);
536
+ ins[n_ins++] = I_inv[0];
537
+ if (n_ins == m_ins) KEXPAND(km, ins, m_ins);
538
+ ins[n_ins++] = I_inv[1];
539
+ ++n_inv;
540
+ } else {
541
+ if (n_ins == m_ins) KEXPAND(km, ins, m_ins);
542
+ ins[n_ins++] = I;
543
+ }
544
+ }
545
+ kfree(0, ss); // this is allocated from malloc() inside mg_mss_all()
546
+ kfree(km, intv);
547
+ kfree(km, sc);
548
+ }
549
+ }
550
+ kfree(km, pseq);
551
+ kfree(km, ovlp);
552
+ kfree(km, soff); kfree(km, qoff);
553
+ kfree(km, sintv); kfree(km, qintv);
554
+
555
+ if (n_ins > 0) {
556
+ char **names, **seqs;
557
+ KMALLOC(km, names, n_seq);
558
+ KMALLOC(km, seqs, n_seq);
559
+ for (i = 0; i < n_seq; ++i)
560
+ names[i] = seq[i].name, seqs[i] = seq[i].seq;
561
+ n_ins = gfa_ins_filter(g, n_ins, ins);
562
+ gfa_augment(g, n_ins, ins, n_seq, (const char*const*)names, (const char*const*)seqs);
563
+ kfree(km, ins);
564
+ kfree(km, names);
565
+ kfree(km, seqs);
566
+ }
567
+ if (mg_verbose >= 3)
568
+ fprintf(stderr, "[M::%s::%.3f*%.2f] inserted %d events, including %d inversions\n", __func__,
569
+ realtime() - mg_realtime0, cputime() / (realtime() - mg_realtime0), n_ins, n_inv);
570
+ }