ruby-minigraph 0.0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +62 -0
  4. data/ext/Rakefile +56 -0
  5. data/ext/cmappy/cmappy.c +7 -0
  6. data/ext/cmappy/cmappy.h +8 -0
  7. data/ext/minigraph/LICENSE.txt +23 -0
  8. data/ext/minigraph/Makefile +66 -0
  9. data/ext/minigraph/NEWS.md +317 -0
  10. data/ext/minigraph/README.md +207 -0
  11. data/ext/minigraph/algo.c +194 -0
  12. data/ext/minigraph/algo.h +33 -0
  13. data/ext/minigraph/asm-call.c +147 -0
  14. data/ext/minigraph/bseq.c +133 -0
  15. data/ext/minigraph/bseq.h +76 -0
  16. data/ext/minigraph/cal_cov.c +139 -0
  17. data/ext/minigraph/doc/example1.png +0 -0
  18. data/ext/minigraph/doc/example2.png +0 -0
  19. data/ext/minigraph/doc/examples.graffle +0 -0
  20. data/ext/minigraph/format.c +241 -0
  21. data/ext/minigraph/galign.c +140 -0
  22. data/ext/minigraph/gchain1.c +532 -0
  23. data/ext/minigraph/gcmisc.c +223 -0
  24. data/ext/minigraph/gfa-aug.c +260 -0
  25. data/ext/minigraph/gfa-base.c +526 -0
  26. data/ext/minigraph/gfa-bbl.c +372 -0
  27. data/ext/minigraph/gfa-ed.c +617 -0
  28. data/ext/minigraph/gfa-io.c +395 -0
  29. data/ext/minigraph/gfa-priv.h +154 -0
  30. data/ext/minigraph/gfa.h +166 -0
  31. data/ext/minigraph/ggen.c +182 -0
  32. data/ext/minigraph/ggen.h +21 -0
  33. data/ext/minigraph/ggsimple.c +570 -0
  34. data/ext/minigraph/gmap.c +211 -0
  35. data/ext/minigraph/index.c +230 -0
  36. data/ext/minigraph/kalloc.c +224 -0
  37. data/ext/minigraph/kalloc.h +82 -0
  38. data/ext/minigraph/kavl.h +414 -0
  39. data/ext/minigraph/kdq.h +134 -0
  40. data/ext/minigraph/ketopt.h +116 -0
  41. data/ext/minigraph/khashl.h +348 -0
  42. data/ext/minigraph/krmq.h +474 -0
  43. data/ext/minigraph/kseq.h +256 -0
  44. data/ext/minigraph/ksort.h +164 -0
  45. data/ext/minigraph/kstring.h +165 -0
  46. data/ext/minigraph/kthread.c +159 -0
  47. data/ext/minigraph/kthread.h +15 -0
  48. data/ext/minigraph/kvec-km.h +105 -0
  49. data/ext/minigraph/kvec.h +110 -0
  50. data/ext/minigraph/lchain.c +441 -0
  51. data/ext/minigraph/main.c +301 -0
  52. data/ext/minigraph/map-algo.c +500 -0
  53. data/ext/minigraph/mgpriv.h +128 -0
  54. data/ext/minigraph/minigraph.1 +359 -0
  55. data/ext/minigraph/minigraph.h +176 -0
  56. data/ext/minigraph/miniwfa.c +834 -0
  57. data/ext/minigraph/miniwfa.h +95 -0
  58. data/ext/minigraph/misc/mgutils.js +1451 -0
  59. data/ext/minigraph/misc.c +12 -0
  60. data/ext/minigraph/options.c +134 -0
  61. data/ext/minigraph/shortk.c +251 -0
  62. data/ext/minigraph/sketch.c +109 -0
  63. data/ext/minigraph/sys.c +147 -0
  64. data/ext/minigraph/sys.h +20 -0
  65. data/ext/minigraph/test/MT-chimp.fa +277 -0
  66. data/ext/minigraph/test/MT-human.fa +239 -0
  67. data/ext/minigraph/test/MT-orangA.fa +276 -0
  68. data/ext/minigraph/test/MT.gfa +19 -0
  69. data/ext/minigraph/tex/Makefile +13 -0
  70. data/ext/minigraph/tex/minigraph.bib +676 -0
  71. data/ext/minigraph/tex/minigraph.tex +986 -0
  72. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
  73. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
  74. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
  75. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
  76. data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
  77. data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
  78. data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
  79. data/ext/minigraph/tex/plots/bedutils.js +367 -0
  80. data/ext/minigraph/tex/plots/chr-plot.js +130 -0
  81. data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
  82. data/ext/minigraph.patch +21 -0
  83. data/lib/minigraph/ffi/constants.rb +230 -0
  84. data/lib/minigraph/ffi/functions.rb +70 -0
  85. data/lib/minigraph/ffi/mappy.rb +8 -0
  86. data/lib/minigraph/ffi.rb +27 -0
  87. data/lib/minigraph/version.rb +5 -0
  88. data/lib/minigraph.rb +72 -0
  89. metadata +159 -0
@@ -0,0 +1,570 @@
1
+ #include <assert.h>
2
+ #include "mgpriv.h"
3
+ #include "gfa-priv.h"
4
+ #include "kalloc.h"
5
+ #include "bseq.h"
6
+ #include "algo.h"
7
+ #include "sys.h"
8
+ #include "ggen.h"
9
+ #include "kvec-km.h"
10
+
11
+ int32_t mg_gc_index(void *km, int min_mapq, int min_map_len, int min_depth_len, const gfa_t *g, int32_t n_seq, mg_gchains_t *const* gcs,
12
+ double *a_dens, int32_t **soff_, int32_t **qoff_, mg_intv_t **sintv_, mg_intv_t **qintv_)
13
+ {
14
+ int32_t t, i, j, max_acnt, *scnt, *soff, *qcnt, *qoff;
15
+ int64_t sum_acnt, sum_alen;
16
+ mg_intv_t *sintv, *qintv;
17
+
18
+ // count the number of intervals on each segment
19
+ KCALLOC(km, scnt, g->n_seg);
20
+ KCALLOC(km, qcnt, n_seq);
21
+ for (t = 0, max_acnt = 0; t < n_seq; ++t) {
22
+ const mg_gchains_t *gt = gcs[t];
23
+ for (i = 0; i < gt->n_gc; ++i) {
24
+ const mg_gchain_t *gc = &gt->gc[i];
25
+ if (gc->id != gc->parent) continue;
26
+ if (gc->blen < min_depth_len || gc->mapq < min_mapq) continue;
27
+ if (gc->n_anchor > max_acnt) max_acnt = gc->n_anchor;
28
+ ++qcnt[t];
29
+ for (j = 0; j < gc->cnt; ++j)
30
+ ++scnt[gt->lc[gc->off + j].v>>1];
31
+ }
32
+ }
33
+ if (max_acnt == 0) { // no gchain
34
+ kfree(km, scnt); kfree(km, qcnt);
35
+ return 0;
36
+ }
37
+
38
+ // compute soff[] and qoff[]
39
+ KMALLOC(km, soff, g->n_seg + 1);
40
+ KMALLOC(km, qoff, n_seq + 1);
41
+ for (soff[0] = 0, i = 1; i <= g->n_seg; ++i)
42
+ soff[i] = soff[i - 1] + scnt[i - 1];
43
+ for (qoff[0] = 0, i = 1; i <= n_seq; ++i)
44
+ qoff[i] = qoff[i - 1] + qcnt[i - 1];
45
+
46
+ // populate the interval list
47
+ memset(scnt, 0, 4 * g->n_seg);
48
+ memset(qcnt, 0, 4 * n_seq);
49
+ KMALLOC(km, sintv, soff[g->n_seg]);
50
+ KMALLOC(km, qintv, qoff[n_seq]);
51
+ sum_acnt = sum_alen = 0;
52
+ for (t = 0; t < n_seq; ++t) {
53
+ const mg_gchains_t *gt = gcs[t];
54
+ for (i = 0; i < gt->n_gc; ++i) {
55
+ const mg_gchain_t *gc = &gt->gc[i];
56
+ mg_intv_t *p;
57
+ if (gc->id != gc->parent) continue;
58
+ if (gc->blen < min_depth_len || gc->mapq < min_mapq) continue;
59
+ p = &qintv[qoff[t] + qcnt[t]];
60
+ ++qcnt[t];
61
+ p->st = gc->qs, p->en = gc->qe, p->rev = 0, p->far = -1, p->i = -1;
62
+ for (j = 0; j < gc->cnt; ++j) {
63
+ const mg_llchain_t *lc = &gt->lc[gc->off + j];
64
+ int32_t rs, re, tmp;
65
+ if (lc->cnt > 0) { // compute start and end on the forward strand on the segment
66
+ const mg128_t *qs = &gt->a[lc->off];
67
+ const mg128_t *qe = &gt->a[lc->off + lc->cnt - 1];
68
+ int32_t rs0 = (int32_t)qs->x + 1 - (int32_t)(qs->y>>32&0xff);
69
+ int32_t re0 = (int32_t)qe->x;
70
+ assert(rs0 >= 0 && re0 > rs0 && re0 < g->seg[lc->v>>1].len);
71
+ sum_alen += re0 - rs0, sum_acnt += (qe->x>>32) - (qs->x>>32) + 1;
72
+ rs = 0, re = g->seg[lc->v>>1].len;
73
+ if (j == 0) rs = gc->p? gc->p->ss : rs0;
74
+ if (j == gc->cnt - 1) re = gc->p? gc->p->ee : re0;
75
+ if (lc->v&1) // swap rs and re
76
+ tmp = rs, rs = g->seg[lc->v>>1].len - re, re = g->seg[lc->v>>1].len - tmp;
77
+ } else rs = 0, re = g->seg[lc->v>>1].len;
78
+ p = &sintv[soff[lc->v>>1] + scnt[lc->v>>1]];
79
+ ++scnt[lc->v>>1];
80
+ p->st = rs, p->en = re, p->rev = lc->v&1, p->far = -1, p->i = -1;
81
+ }
82
+ }
83
+ }
84
+ *a_dens = (double)sum_acnt / sum_alen;
85
+
86
+ // sort and index intervals
87
+ for (i = 0; i < g->n_seg; ++i) {
88
+ assert(soff[i+1] - soff[i] == scnt[i]);
89
+ mg_intv_index(soff[i+1] - soff[i], &sintv[soff[i]]);
90
+ }
91
+ kfree(km, scnt);
92
+ for (i = 0; i < n_seq; ++i) {
93
+ assert(qoff[i+1] - qoff[i] == qcnt[i]);
94
+ mg_intv_index(qoff[i+1] - qoff[i], &qintv[qoff[i]]);
95
+ }
96
+ kfree(km, qcnt);
97
+
98
+ *sintv_ = sintv, *qintv_ = qintv;
99
+ *soff_ = soff, *qoff_ = qoff;
100
+ return max_acnt;
101
+ }
102
+
103
+ /**********************
104
+ * Graph augmentation *
105
+ **********************/
106
+
107
+ void mg_ggsimple(void *km, const mg_ggopt_t *opt, gfa_t *g, int32_t n_seq, const mg_bseq1_t *seq, mg_gchains_t *const* gcs)
108
+ {
109
+ int32_t t, i, j, *soff, *qoff, max_acnt, *sc, m_ovlp = 0, *ovlp = 0, n_ins, m_ins, n_inv;
110
+ int32_t l_pseq, m_pseq;
111
+ uint64_t *meta;
112
+ mg_intv_t *sintv, *qintv;
113
+ double a_dens;
114
+ gfa_ins_t *ins;
115
+ char *pseq;
116
+
117
+ max_acnt = mg_gc_index(km, opt->min_mapq, opt->min_map_len, opt->min_depth_len, g, n_seq, gcs, &a_dens, &soff, &qoff, &sintv, &qintv);
118
+ if (max_acnt == 0) return;
119
+
120
+ // extract poorly regions
121
+ m_pseq = l_pseq = 0, pseq = 0;
122
+ m_ins = n_ins = 0, ins = 0;
123
+ n_inv = 0;
124
+ KMALLOC(km, sc, max_acnt);
125
+ KMALLOC(km, meta, max_acnt);
126
+ for (t = 0; t < n_seq; ++t) {
127
+ const mg_gchains_t *gt = gcs[t];
128
+ for (i = 0; i < gt->n_gc; ++i) {
129
+ const mg_gchain_t *gc = &gt->gc[i];
130
+ int32_t off_a, off_l, n_ss, far_q;
131
+ mg_msseg_t *ss;
132
+ if (gc->id != gc->parent) continue;
133
+ if (gc->blen < opt->min_map_len || gc->mapq < opt->min_mapq) continue;
134
+ assert(gc->cnt > 0);
135
+
136
+ // fill sc[]. This part achieves a similar goal to the one in mg_gchain_extra(). It makes more assumptions, but is logically simpler.
137
+ off_l = gc->off;
138
+ off_a = gt->lc[off_l].off + 1;
139
+ far_q = 0;
140
+ for (j = 1; j < gc->n_anchor; ++j, ++off_a) {
141
+ const mg128_t *q = &gt->a[off_a - 1], *p = &gt->a[off_a];
142
+ const mg_llchain_t *lc = &gt->lc[off_l];
143
+ int32_t s, ed = -1, off_l0 = off_l, pd, qd = (int32_t)p->y - (int32_t)q->y, c = (int32_t)(p->x>>32) - (int32_t)(q->x>>32) - 1;
144
+ if ((int32_t)q->y > far_q) far_q = (int32_t)q->y; // far_q keeps the rightmost query position seen so far
145
+ if (off_a == lc->off + lc->cnt) { // we are at the end of the current lchain
146
+ pd = g->seg[lc->v>>1].len - (int32_t)q->x - 1;
147
+ for (++off_l; off_l < gc->off + gc->cnt && gt->lc[off_l].cnt == 0; ++off_l)
148
+ pd += g->seg[gt->lc[off_l].v>>1].len;
149
+ assert(off_l < gc->off + gc->cnt);
150
+ if (gt->lc[off_l].ed >= 0) ed = gt->lc[off_l].ed;
151
+ pd += (int32_t)p->x + 1;
152
+ } else pd = (int32_t)p->x - (int32_t)q->x;
153
+ if ((opt->flag&MG_G_NO_QOVLP) && (int32_t)p->y < far_q) s = 1; // query overlap
154
+ else if (pd == qd && c == 0) s = -opt->match_pen;
155
+ else if (ed >= 0) {
156
+ int32_t min_d = pd < qd? pd : qd;
157
+ double t = 1. / (1.01 - opt->ggs_max_iden);
158
+ if (t > 10.) t = 10.;
159
+ s = (int32_t)(ed * t - min_d);
160
+ } else if (pd > qd) {
161
+ double x = qd * a_dens;
162
+ x = x > c? x : c;
163
+ s = (int32_t)(x + (pd - qd) * a_dens + .499);
164
+ } else {
165
+ s = (int32_t)(qd * a_dens + .499);
166
+ s = s > c? s : c;
167
+ }
168
+ sc[j - 1] = s;
169
+ meta[j-1] = (uint64_t)pd<<32 | off_l0;
170
+ }
171
+
172
+ // get regions to insert
173
+ ss = mg_mss_all(0, gc->n_anchor - 1, sc, 10, 0, &n_ss);
174
+ off_a = gt->lc[gc->off].off;
175
+ for (j = 0; j < n_ss; ++j) {
176
+ const mg128_t *p, *q;
177
+ int32_t st, en, ls, le, span, pd, k, n_ovlp, min_len, is_inv = 0;
178
+ gfa_ins_t I;
179
+
180
+ // find the initial positions
181
+ min_len = opt->ggs_min_end_cnt > 0? opt->ggs_min_end_cnt : 0;
182
+ if (min_len < ss[j].sc * opt->ggs_min_end_frac) min_len = ss[j].sc * opt->ggs_min_end_frac;
183
+ if (ss[j].st <= min_len || ss[j].en >= gc->n_anchor - 1 - min_len) continue; // too close to ends
184
+ st = ss[j].st, en = ss[j].en;
185
+ q = &gt->a[off_a + st];
186
+ p = &gt->a[off_a + en];
187
+ span = p->y>>32&0xff;
188
+ I.ctg = t;
189
+ ls = (int32_t)meta[st], le = (int32_t)meta[en]; // first and last lchain; CLOSED interval
190
+ assert(ls <= le);
191
+ I.v[0] = gt->lc[ls].v;
192
+ I.v[1] = gt->lc[le].v;
193
+ I.voff[0] = (int32_t)q->x + 1 - span;
194
+ I.voff[1] = (int32_t)p->x + 1;
195
+ I.coff[0] = (int32_t)q->y + 1 - span;
196
+ I.coff[1] = (int32_t)p->y + 1;
197
+ assert(I.voff[0] <= g->seg[I.v[0]>>1].len);
198
+ assert(I.voff[1] <= g->seg[I.v[1]>>1].len);
199
+ for (k = st, pd = span; k < en; ++k)
200
+ pd += meta[k]>>32;
201
+
202
+ if (I.coff[0] > I.coff[1]) {
203
+ if (mg_verbose >= 2 && pd + (I.coff[0] - I.coff[1]) >= opt->min_var_len)
204
+ fprintf(stderr, "[W::%s] query overlap on gchain %d: [%c%s:%d,%c%s:%d|%d] <=> %s:[%d,%d|%d]\n", __func__, t, "><"[I.v[0]&1], g->seg[I.v[0]>>1].name, I.voff[0], "><"[I.v[1]&1], g->seg[I.v[1]>>1].name, I.voff[1], pd, seq[t].name, I.coff[0], I.coff[1], I.coff[1] - I.coff[0]);
205
+ continue; // such overlap can't be properly resolved
206
+ }
207
+ pd -= gfa_ins_adj(g, opt->ggs_shrink_pen, &I, seq[t].seq);
208
+
209
+ min_len = pd > I.coff[1] - I.coff[0]? pd : I.coff[1] - I.coff[0];
210
+ if (I.coff[0] <= min_len || I.coff[1] >= seq[t].l_seq - min_len) continue; // test if the event is close to ends again
211
+
212
+ // filtering
213
+ if (I.coff[1] - I.coff[0] < opt->min_var_len && pd < opt->min_var_len)
214
+ continue;
215
+ for (k = I.coff[0]; k < I.coff[1]; ++k) { // test ambiguous bases
216
+ int c = seq[t].seq[k];
217
+ if (c == 'n' || c == 'N') break;
218
+ }
219
+ if (k != I.coff[1]) continue; // no ambiguous bases on the insert
220
+ n_ovlp = mg_intv_overlap(km, qoff[t+1] - qoff[t], &qintv[qoff[t]], I.coff[0], I.coff[1], &ovlp, &m_ovlp); // test overlapping on the query
221
+ if (n_ovlp == 0) fprintf(stderr, "[W::%s] query interval %s:%d-%d is not covered\n", __func__, seq[t].name, I.coff[0], I.coff[1]);
222
+ if (n_ovlp != 1) continue;
223
+ for (k = ls; k <= le; ++k) { // find other mappings overlapping with the insert on the graph
224
+ uint32_t v = gt->lc[k].v, len = g->seg[v>>1].len;
225
+ int32_t s = 0, e = len, tmp;
226
+ if (k == ls) s = (int32_t)gt->a[off_a+st].x + 1 - (int32_t)(gt->a[off_a+st].y>>32&0xff);
227
+ if (k == le) e = (int32_t)gt->a[off_a+en].x + 1;
228
+ if (v&1) tmp = s, s = len - e, e = len - tmp;
229
+ n_ovlp = mg_intv_overlap(km, soff[(v>>1)+1] - soff[v>>1], &sintv[soff[v>>1]], s, e, &ovlp, &m_ovlp);
230
+ if (n_ovlp == 0) fprintf(stderr, "[W::%s] graph interval %s:%d-%d is not covered by %s:%d-%d\n", __func__, g->seg[v>>1].name, s, e, seq[t].name, I.coff[0], I.coff[1]); // this should be an assert()
231
+ if (n_ovlp != 1) break;
232
+ }
233
+ if (k <= le) continue;
234
+ if (pd - (I.coff[1] - I.coff[0]) < opt->min_var_len && (I.coff[1] - I.coff[0]) - pd < opt->min_var_len) { // if length difference > min_var_len, just insert
235
+ int32_t qd = I.coff[1] - I.coff[0], mlen, blen, score;
236
+ l_pseq = mg_path2seq(km, g, gt, ls, le, I.voff, &pseq, &m_pseq);
237
+ score = mg_wfa_cmp(km, l_pseq, pseq, qd, &seq[t].seq[I.coff[0]], 5000, &mlen, &blen);
238
+ if (score > 0) {
239
+ if (mlen > blen * opt->ggs_max_iden) continue; // make sure k-mer identity is small enough
240
+ if (blen - mlen < opt->min_var_len * opt->ggs_max_iden) continue;
241
+ } else if (!(opt->flag & MG_G_NO_INV)) {
242
+ mg_revcomp_seq(l_pseq, pseq);
243
+ score = mg_wfa_cmp(km, l_pseq, pseq, qd, &seq[t].seq[I.coff[0]], 5000, &mlen, &blen);
244
+ if (score > 0 && mlen > blen * opt->ggs_min_inv_iden) is_inv = 1;
245
+ }
246
+ }
247
+ if (mg_dbg_flag & MG_DBG_INSERT) {
248
+ int32_t mlen, blen, score, qd = I.coff[1] - I.coff[0];
249
+ l_pseq = mg_path2seq(km, g, gt, ls, le, I.voff, &pseq, &m_pseq);
250
+ fprintf(stderr, "IN\t[%c%s:%d,%c%s:%d|%d] <=> %s:[%d,%d|%d] inv:%d\n", "><"[I.v[0]&1], g->seg[I.v[0]>>1].name, I.voff[0], "><"[I.v[1]&1], g->seg[I.v[1]>>1].name, I.voff[1], pd, seq[t].name, I.coff[0], I.coff[1], I.coff[1] - I.coff[0], is_inv);
251
+ fprintf(stderr, "IP\t%s\nIQ\t", pseq);
252
+ fwrite(&seq[t].seq[I.coff[0]], 1, qd, stderr);
253
+ if (pd - qd < opt->min_var_len && qd - pd < opt->min_var_len) {
254
+ score = mg_wfa_cmp(km, l_pseq, pseq, qd, &seq[t].seq[I.coff[0]], 5000, &mlen, &blen);
255
+ } else score = -1, mlen = 0, blen = pd > qd? pd : qd;
256
+ fprintf(stderr, "\nIS\t%d==%d\tnwcmp:%d\tmlen:%d\tblen:%d\n", pd, l_pseq, score, mlen, blen);
257
+ }
258
+ if (is_inv) { // turn one inversion to two events
259
+ gfa_ins_t I_inv[2];
260
+ I_inv[0].ctg = I_inv[1].ctg = I.ctg;
261
+ // the first event
262
+ I_inv[0].coff[0] = I_inv[0].coff[1] = I.coff[0];
263
+ I_inv[0].v[0] = I.v[0];
264
+ I_inv[0].voff[0] = I.voff[0];
265
+ I_inv[0].v[1] = I.v[1]^1;
266
+ I_inv[0].voff[1] = g->seg[I.v[1]>>1].len - I.voff[1];
267
+ // the second event
268
+ I_inv[1].coff[0] = I_inv[1].coff[1] = I.coff[1];
269
+ I_inv[1].v[0] = I.v[0]^1;
270
+ I_inv[1].voff[0] = g->seg[I.v[0]>>1].len - I.voff[0];
271
+ I_inv[1].v[1] = I.v[1];
272
+ I_inv[1].voff[1] = I.voff[1];
273
+ // insert
274
+ if (n_ins == m_ins) KEXPAND(km, ins, m_ins);
275
+ ins[n_ins++] = I_inv[0];
276
+ if (n_ins == m_ins) KEXPAND(km, ins, m_ins);
277
+ ins[n_ins++] = I_inv[1];
278
+ ++n_inv;
279
+ } else {
280
+ if (n_ins == m_ins) KEXPAND(km, ins, m_ins);
281
+ ins[n_ins++] = I;
282
+ }
283
+ }
284
+ kfree(0, ss);
285
+ }
286
+ }
287
+ kfree(km, pseq);
288
+ kfree(km, ovlp);
289
+ kfree(km, sc);
290
+ kfree(km, meta);
291
+ kfree(km, soff); kfree(km, qoff);
292
+ kfree(km, sintv); kfree(km, qintv);
293
+
294
+ if (n_ins > 0) {
295
+ char **names, **seqs;
296
+ KMALLOC(km, names, n_seq);
297
+ KMALLOC(km, seqs, n_seq);
298
+ for (i = 0; i < n_seq; ++i)
299
+ names[i] = seq[i].name, seqs[i] = seq[i].seq;
300
+ n_ins = gfa_ins_filter(g, n_ins, ins);
301
+ gfa_augment(g, n_ins, ins, n_seq, (const char*const*)names, (const char*const*)seqs);
302
+ kfree(km, ins);
303
+ kfree(km, names);
304
+ kfree(km, seqs);
305
+ }
306
+ if (mg_verbose >= 3)
307
+ fprintf(stderr, "[M::%s::%.3f*%.2f] inserted %d events, including %d inversions\n", __func__,
308
+ realtime() - mg_realtime0, cputime() / (realtime() - mg_realtime0), n_ins, n_inv);
309
+ }
310
+
311
+ /**********************
312
+ * Graph augmentation *
313
+ **********************/
314
+
315
/* One piece of a graph-chain CIGAR that lies within a single linear chain. */
typedef struct {
	int32_t lc;  // index of the linear chain (into gt->lc[]) holding this piece
	int32_t vo;  // start offset of the piece on that chain's segment
	int32_t qo;  // start offset of the piece on the query
	int32_t po;  // start offset of the piece on the path, counted from the chain start
	int32_t len; // length of the piece
	int32_t op;  // CIGAR operator code (1, 2, 7 or 8 are the ones handled here)
	int32_t sc;  // score assigned by gg_score_intv()
} ed_intv_t;
318
+
319
+ static int32_t gg_count_intv(const gfa_t *g, const mg_gchains_t *gt, int32_t i)
320
+ {
321
+ const mg_gchain_t *gc = &gt->gc[i];
322
+ int32_t j, l = gc->off, x = gc->ps, n = 0;
323
+ assert(gc->p);
324
+ for (j = 0; j < gc->p->n_cigar; ++j) {
325
+ int32_t op = gc->p->cigar[j]&0xf, len = gc->p->cigar[j]>>4, rl = len;
326
+ assert(op == 1 || op == 2 || op == 7 || op == 8);
327
+ if (op == 2 || op == 7 || op == 8) {
328
+ while (x + rl > g->seg[gt->lc[l].v>>1].len) {
329
+ rl -= g->seg[gt->lc[l].v>>1].len - x;
330
+ ++n, ++l, x = 0;
331
+ }
332
+ x += rl;
333
+ }
334
+ ++n;
335
+ }
336
+ return n;
337
+ }
338
+
339
+ static void gg_write_intv(const gfa_t *g, const mg_gchains_t *gt, int32_t i, ed_intv_t *intv)
340
+ {
341
+ const mg_gchain_t *gc = &gt->gc[i];
342
+ int32_t j, l = gc->off, pl = 0, x = gc->ps, y = gc->qs, n = 0;
343
+ ed_intv_t *p;
344
+ assert(gc->p);
345
+ for (j = 0; j < gc->p->n_cigar; ++j) {
346
+ int32_t op = gc->p->cigar[j]&0xf, len = gc->p->cigar[j]>>4, rl = len;
347
+ if (op == 2 || op == 7 || op == 8) {
348
+ while (x + rl > g->seg[gt->lc[l].v>>1].len) {
349
+ p = &intv[n++];
350
+ p->lc = l, p->vo = x, p->qo = y, p->po = pl, p->len = g->seg[gt->lc[l].v>>1].len - x, p->op = op;
351
+ if (op == 7 || op == 8) y += p->len;
352
+ rl -= p->len, pl += p->len, ++l, x = 0;
353
+ }
354
+ }
355
+ p = &intv[n++];
356
+ p->lc = l, p->vo = x, p->qo = y, p->po = pl, p->len = rl, p->op = op;
357
+ if (op == 7 || op == 8) x += rl, y += rl, pl += rl;
358
+ else if (op == 1) y += rl;
359
+ else if (op == 2) x += rl, pl += rl;
360
+ }
361
+ assert(y == gc->qe && pl == gc->pe - gc->ps);
362
+ }
363
+
364
+ static void gg_score_intv(int32_t n_intv, ed_intv_t *intv)
365
+ {
366
+ int32_t j;
367
+ for (j = 0; j < n_intv; ++j) {
368
+ int32_t s;
369
+ if (intv[j].op == 7)
370
+ s = intv[j].len >= 10? -intv[j].len : 0;
371
+ else s = intv[j].len;
372
+ intv[j].sc = s;
373
+ }
374
+ }
375
+
376
+ static void gg_merge_seg(const ed_intv_t *intv, int32_t n_ss, mg_msseg_t *ss)
377
+ {
378
+ int32_t j0, j;
379
+ for (j0 = 0, j = 1; j < n_ss; ++j) {
380
+ mg_msseg_t *s0 = &ss[j0], *s1 = &ss[j];
381
+ int32_t i, mid = 0;
382
+ for (i = s0->en + 1; i < s1->st; ++i)
383
+ mid += intv[i].sc;
384
+ //fprintf(stderr, "XX\t%d\t%d\t%d\t%d\t%d\t%d\n", j, s0->sc, mid, s1->sc, s0->en+1, s1->st);
385
+ if (-mid < s0->sc * 0.2 && -mid < s1->sc * 0.2) { // FIXME: mid is sometimes 0
386
+ s0->en = s1->en, s0->sc += s1->sc + mid;
387
+ s1->st = s1->en, s1->sc = 0;
388
+ } else j0 = j;
389
+ }
390
+ }
391
+
392
+ void mg_ggsimple_cigar(void *km, const mg_ggopt_t *opt, gfa_t *g, int32_t n_seq, const mg_bseq1_t *seq, mg_gchains_t *const* gcs)
393
+ {
394
+ int32_t t, i, *soff, *qoff, max_acnt, m_ovlp = 0, *ovlp = 0, n_ins = 0, m_ins, n_inv;
395
+ int32_t l_pseq, m_pseq;
396
+ mg_intv_t *sintv, *qintv;
397
+ double a_dens;
398
+ gfa_ins_t *ins;
399
+ char *pseq;
400
+
401
+ max_acnt = mg_gc_index(km, opt->min_mapq, opt->min_map_len, opt->min_depth_len, g, n_seq, gcs, &a_dens, &soff, &qoff, &sintv, &qintv);
402
+ if (max_acnt == 0) return;
403
+
404
+ // extract poorly regions
405
+ m_pseq = l_pseq = 0, pseq = 0;
406
+ m_ins = n_ins = 0, ins = 0;
407
+ n_inv = 0;
408
+ for (t = 0; t < n_seq; ++t) {
409
+ const mg_gchains_t *gt = gcs[t];
410
+ for (i = 0; i < gt->n_gc; ++i) {
411
+ const mg_gchain_t *gc = &gt->gc[i];
412
+ int32_t j, n_ss, n_intv, *sc;
413
+ ed_intv_t *intv;
414
+ mg_msseg_t *ss;
415
+ if (gc->id != gc->parent) continue;
416
+ if (gc->p == 0 || gc->blen < opt->min_map_len || gc->mapq < opt->min_mapq) continue;
417
+ assert(gc->cnt > 0);
418
+
419
+ n_intv = gg_count_intv(g, gt, i);
420
+ KCALLOC(km, intv, n_intv);
421
+ gg_write_intv(g, gt, i, intv);
422
+ gg_score_intv(n_intv, intv);
423
+ KCALLOC(km, sc, n_intv);
424
+ for (j = 0; j < n_intv; ++j) sc[j] = intv[j].sc;
425
+ ss = mg_mss_all(0, n_intv, sc, opt->min_var_len, 2 * opt->min_var_len, &n_ss);
426
+ gg_merge_seg(intv, n_ss, ss);
427
+
428
+ // get regions to insert
429
+ for (j = 0; j < n_ss; ++j) {
430
+ int32_t st, en, pd, k, n_ovlp, min_len, is_inv = 0, ls, le;
431
+ gfa_ins_t I;
432
+ ed_intv_t *is, *ie;
433
+
434
+ // find the initial positions
435
+ st = ss[j].st, en = ss[j].en; // this is a CLOSED interval
436
+ if (st == en) continue;
437
+ is = &intv[st], ie = &intv[en - 1];
438
+ assert(is->op != 7 && ie->op != 7);
439
+
440
+ ls = is->lc, le = ie->lc;
441
+ I.ctg = t;
442
+ I.v[0] = gt->lc[ls].v;
443
+ I.v[1] = gt->lc[le].v;
444
+ I.voff[0] = is->vo;
445
+ I.voff[1] = ie->vo + (ie->op != 1? ie->len : 0);
446
+ I.coff[0] = is->qo;
447
+ I.coff[1] = ie->qo + (ie->op != 2? ie->len : 0);
448
+ assert(I.voff[0] <= g->seg[I.v[0]>>1].len);
449
+ assert(I.voff[1] <= g->seg[I.v[1]>>1].len);
450
+
451
+ if (I.voff[0] == 0) { // if an insert starts at pos 0, make it start at the end of the previous vertex in the chain
452
+ assert(ls - 1 >= gc->off);
453
+ I.v[0] = gt->lc[--ls].v;
454
+ I.voff[0] = g->seg[I.v[0]>>1].len;
455
+ }
456
+ if (I.voff[1] == g->seg[I.v[1]>>1].len) { // if an insert ends at the end of the vertex, make it end at the beginning of the next vertex
457
+ assert(le + 1 < gc->off + gc->cnt);
458
+ I.v[1] = gt->lc[++le].v;
459
+ I.voff[1] = 0;
460
+ }
461
+
462
+ pd = ie->po + (ie->op != 1? ie->len : 0) - is->po;
463
+ pd -= gfa_ins_adj(g, opt->ggs_shrink_pen, &I, seq[t].seq);
464
+
465
+ min_len = pd > I.coff[1] - I.coff[0]? pd : I.coff[1] - I.coff[0];
466
+ if (I.coff[0] <= min_len || I.coff[1] >= seq[t].l_seq - min_len) continue; // test if the event is close to ends again
467
+
468
+ // filtering
469
+ if (I.coff[1] - I.coff[0] < opt->min_var_len && pd < opt->min_var_len)
470
+ continue;
471
+ for (k = I.coff[0]; k < I.coff[1]; ++k) { // test ambiguous bases
472
+ int c = seq[t].seq[k];
473
+ if (c == 'n' || c == 'N') break;
474
+ }
475
+ if (k != I.coff[1]) continue; // no ambiguous bases on the insert
476
+ n_ovlp = mg_intv_overlap(km, qoff[t+1] - qoff[t], &qintv[qoff[t]], I.coff[0], I.coff[1], &ovlp, &m_ovlp); // test overlapping on the query
477
+ if (n_ovlp == 0) fprintf(stderr, "[W::%s] query interval %s:%d-%d is not covered\n", __func__, seq[t].name, I.coff[0], I.coff[1]);
478
+ if (n_ovlp != 1) continue;
479
+ for (k = is->lc; k <= ie->lc; ++k) { // find other mappings overlapping with the insert on the graph
480
+ uint32_t v = gt->lc[k].v, len = g->seg[v>>1].len;
481
+ int32_t s = 0, e = len, tmp;
482
+ if (k == is->lc) s = is->vo;
483
+ if (k == ie->lc) e = ie->vo + (ie->op != 1? ie->len : 0);
484
+ if (v&1) tmp = s, s = len - e, e = len - tmp;
485
+ if (s == e) {
486
+ if (s == 0) ++e;
487
+ else --s;
488
+ }
489
+ n_ovlp = mg_intv_overlap(km, soff[(v>>1)+1] - soff[v>>1], &sintv[soff[v>>1]], s, e, &ovlp, &m_ovlp);
490
+ if (n_ovlp == 0) fprintf(stderr, "[W::%s] graph interval %c%s:%d-%d is not covered by %s:%d-%d\n", __func__, "><"[v&1], g->seg[v>>1].name, s, e, seq[t].name, I.coff[0], I.coff[1]); // this should be an assert()
491
+ if (n_ovlp != 1) break;
492
+ }
493
+ if (k <= ie->lc) continue;
494
+ if (pd - (I.coff[1] - I.coff[0]) < opt->min_var_len && (I.coff[1] - I.coff[0]) - pd < opt->min_var_len) { // if length difference > min_var_len, just insert
495
+ int32_t qd = I.coff[1] - I.coff[0], mlen, blen, score = 0;
496
+ l_pseq = mg_path2seq(km, g, gt, ls, le, I.voff, &pseq, &m_pseq);
497
+ score = mg_wfa_cmp(km, l_pseq, pseq, qd, &seq[t].seq[I.coff[0]], 5000, &mlen, &blen);
498
+ if (score > 0) {
499
+ if (mlen > blen * opt->ggs_max_iden) continue; // make sure k-mer identity is small enough
500
+ if (blen - mlen < opt->min_var_len * opt->ggs_max_iden) continue;
501
+ } else if (!(opt->flag & MG_G_NO_INV)) {
502
+ mg_revcomp_seq(l_pseq, pseq);
503
+ score = mg_wfa_cmp(km, l_pseq, pseq, qd, &seq[t].seq[I.coff[0]], 5000, &mlen, &blen);
504
+ if (score > 0 && mlen > blen * opt->ggs_min_inv_iden) is_inv = 1;
505
+ }
506
+ }
507
+ if (mg_dbg_flag & MG_DBG_INSERT) {
508
+ int32_t mlen, blen, score, qd = I.coff[1] - I.coff[0];
509
+ l_pseq = mg_path2seq(km, g, gt, ls, le, I.voff, &pseq, &m_pseq);
510
+ fprintf(stderr, "IN\t[%c%s:%d,%c%s:%d|%d] <=> %s:[%d,%d|%d] inv:%d\n", "><"[I.v[0]&1], g->seg[I.v[0]>>1].name, I.voff[0], "><"[I.v[1]&1], g->seg[I.v[1]>>1].name, I.voff[1], pd, seq[t].name, I.coff[0], I.coff[1], I.coff[1] - I.coff[0], is_inv);
511
+ fprintf(stderr, "IP\t%s\nIQ\t", pseq);
512
+ fwrite(&seq[t].seq[I.coff[0]], 1, qd, stderr);
513
+ if (pd - qd < opt->min_var_len && qd - pd < opt->min_var_len) {
514
+ score = mg_wfa_cmp(km, l_pseq, pseq, qd, &seq[t].seq[I.coff[0]], 5000, &mlen, &blen);
515
+ } else score = -1, mlen = 0, blen = pd > qd? pd : qd;
516
+ fprintf(stderr, "\nIS\t%d==%d\tnwcmp:%d\tmlen:%d\tblen:%d\n", pd, l_pseq, score, mlen, blen);
517
+ //if (I.voff[0] == 2305301) { for (k = st; k < en; ++k) fprintf(stderr, "%d%c", intv[k].len, "MIDNSHP=XB"[intv[k].op]); fprintf(stderr, "\n"); }
518
+ }
519
+ if (is_inv) { // turn one inversion to two events
520
+ gfa_ins_t I_inv[2];
521
+ I_inv[0].ctg = I_inv[1].ctg = I.ctg;
522
+ // the first event
523
+ I_inv[0].coff[0] = I_inv[0].coff[1] = I.coff[0];
524
+ I_inv[0].v[0] = I.v[0];
525
+ I_inv[0].voff[0] = I.voff[0];
526
+ I_inv[0].v[1] = I.v[1]^1;
527
+ I_inv[0].voff[1] = g->seg[I.v[1]>>1].len - I.voff[1];
528
+ // the second event
529
+ I_inv[1].coff[0] = I_inv[1].coff[1] = I.coff[1];
530
+ I_inv[1].v[0] = I.v[0]^1;
531
+ I_inv[1].voff[0] = g->seg[I.v[0]>>1].len - I.voff[0];
532
+ I_inv[1].v[1] = I.v[1];
533
+ I_inv[1].voff[1] = I.voff[1];
534
+ // insert
535
+ if (n_ins == m_ins) KEXPAND(km, ins, m_ins);
536
+ ins[n_ins++] = I_inv[0];
537
+ if (n_ins == m_ins) KEXPAND(km, ins, m_ins);
538
+ ins[n_ins++] = I_inv[1];
539
+ ++n_inv;
540
+ } else {
541
+ if (n_ins == m_ins) KEXPAND(km, ins, m_ins);
542
+ ins[n_ins++] = I;
543
+ }
544
+ }
545
+ kfree(0, ss); // this is allocated from malloc() inside mg_mss_all()
546
+ kfree(km, intv);
547
+ kfree(km, sc);
548
+ }
549
+ }
550
+ kfree(km, pseq);
551
+ kfree(km, ovlp);
552
+ kfree(km, soff); kfree(km, qoff);
553
+ kfree(km, sintv); kfree(km, qintv);
554
+
555
+ if (n_ins > 0) {
556
+ char **names, **seqs;
557
+ KMALLOC(km, names, n_seq);
558
+ KMALLOC(km, seqs, n_seq);
559
+ for (i = 0; i < n_seq; ++i)
560
+ names[i] = seq[i].name, seqs[i] = seq[i].seq;
561
+ n_ins = gfa_ins_filter(g, n_ins, ins);
562
+ gfa_augment(g, n_ins, ins, n_seq, (const char*const*)names, (const char*const*)seqs);
563
+ kfree(km, ins);
564
+ kfree(km, names);
565
+ kfree(km, seqs);
566
+ }
567
+ if (mg_verbose >= 3)
568
+ fprintf(stderr, "[M::%s::%.3f*%.2f] inserted %d events, including %d inversions\n", __func__,
569
+ realtime() - mg_realtime0, cputime() / (realtime() - mg_realtime0), n_ins, n_inv);
570
+ }