ruby-minigraph 0.0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +62 -0
- data/ext/Rakefile +56 -0
- data/ext/cmappy/cmappy.c +7 -0
- data/ext/cmappy/cmappy.h +8 -0
- data/ext/minigraph/LICENSE.txt +23 -0
- data/ext/minigraph/Makefile +66 -0
- data/ext/minigraph/NEWS.md +317 -0
- data/ext/minigraph/README.md +207 -0
- data/ext/minigraph/algo.c +194 -0
- data/ext/minigraph/algo.h +33 -0
- data/ext/minigraph/asm-call.c +147 -0
- data/ext/minigraph/bseq.c +133 -0
- data/ext/minigraph/bseq.h +76 -0
- data/ext/minigraph/cal_cov.c +139 -0
- data/ext/minigraph/doc/example1.png +0 -0
- data/ext/minigraph/doc/example2.png +0 -0
- data/ext/minigraph/doc/examples.graffle +0 -0
- data/ext/minigraph/format.c +241 -0
- data/ext/minigraph/galign.c +140 -0
- data/ext/minigraph/gchain1.c +532 -0
- data/ext/minigraph/gcmisc.c +223 -0
- data/ext/minigraph/gfa-aug.c +260 -0
- data/ext/minigraph/gfa-base.c +526 -0
- data/ext/minigraph/gfa-bbl.c +372 -0
- data/ext/minigraph/gfa-ed.c +617 -0
- data/ext/minigraph/gfa-io.c +395 -0
- data/ext/minigraph/gfa-priv.h +154 -0
- data/ext/minigraph/gfa.h +166 -0
- data/ext/minigraph/ggen.c +182 -0
- data/ext/minigraph/ggen.h +21 -0
- data/ext/minigraph/ggsimple.c +570 -0
- data/ext/minigraph/gmap.c +211 -0
- data/ext/minigraph/index.c +230 -0
- data/ext/minigraph/kalloc.c +224 -0
- data/ext/minigraph/kalloc.h +82 -0
- data/ext/minigraph/kavl.h +414 -0
- data/ext/minigraph/kdq.h +134 -0
- data/ext/minigraph/ketopt.h +116 -0
- data/ext/minigraph/khashl.h +348 -0
- data/ext/minigraph/krmq.h +474 -0
- data/ext/minigraph/kseq.h +256 -0
- data/ext/minigraph/ksort.h +164 -0
- data/ext/minigraph/kstring.h +165 -0
- data/ext/minigraph/kthread.c +159 -0
- data/ext/minigraph/kthread.h +15 -0
- data/ext/minigraph/kvec-km.h +105 -0
- data/ext/minigraph/kvec.h +110 -0
- data/ext/minigraph/lchain.c +441 -0
- data/ext/minigraph/main.c +301 -0
- data/ext/minigraph/map-algo.c +500 -0
- data/ext/minigraph/mgpriv.h +128 -0
- data/ext/minigraph/minigraph.1 +359 -0
- data/ext/minigraph/minigraph.h +176 -0
- data/ext/minigraph/miniwfa.c +834 -0
- data/ext/minigraph/miniwfa.h +95 -0
- data/ext/minigraph/misc/mgutils.js +1451 -0
- data/ext/minigraph/misc.c +12 -0
- data/ext/minigraph/options.c +134 -0
- data/ext/minigraph/shortk.c +251 -0
- data/ext/minigraph/sketch.c +109 -0
- data/ext/minigraph/sys.c +147 -0
- data/ext/minigraph/sys.h +20 -0
- data/ext/minigraph/test/MT-chimp.fa +277 -0
- data/ext/minigraph/test/MT-human.fa +239 -0
- data/ext/minigraph/test/MT-orangA.fa +276 -0
- data/ext/minigraph/test/MT.gfa +19 -0
- data/ext/minigraph/tex/Makefile +13 -0
- data/ext/minigraph/tex/minigraph.bib +676 -0
- data/ext/minigraph/tex/minigraph.tex +986 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
- data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
- data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
- data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
- data/ext/minigraph/tex/plots/bedutils.js +367 -0
- data/ext/minigraph/tex/plots/chr-plot.js +130 -0
- data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
- data/ext/minigraph.patch +21 -0
- data/lib/minigraph/ffi/constants.rb +230 -0
- data/lib/minigraph/ffi/functions.rb +70 -0
- data/lib/minigraph/ffi/mappy.rb +8 -0
- data/lib/minigraph/ffi.rb +27 -0
- data/lib/minigraph/version.rb +5 -0
- data/lib/minigraph.rb +72 -0
- metadata +159 -0
@@ -0,0 +1,570 @@
|
|
1
|
+
#include <assert.h>
|
2
|
+
#include "mgpriv.h"
|
3
|
+
#include "gfa-priv.h"
|
4
|
+
#include "kalloc.h"
|
5
|
+
#include "bseq.h"
|
6
|
+
#include "algo.h"
|
7
|
+
#include "sys.h"
|
8
|
+
#include "ggen.h"
|
9
|
+
#include "kvec-km.h"
|
10
|
+
|
11
|
+
/**
 * Build interval indices, on graph segments and on query sequences, for the
 * graph chains that pass the mapping filters.
 *
 * A gchain is accepted when it is its own parent (gc->id == gc->parent), its
 * blen is at least min_depth_len and its mapq is at least min_mapq. Each
 * accepted gchain contributes one interval [qs,qe) on its query and one
 * interval per linear chain on the corresponding segment (forward-strand
 * coordinates; p->rev records the orientation bit of the vertex).
 *
 * @param km             memory pool passed to KCALLOC/KMALLOC/kfree
 * @param min_mapq       minimum gc->mapq of an accepted gchain
 * @param min_map_len    not read by this function; kept so callers stay unchanged
 * @param min_depth_len  minimum gc->blen of an accepted gchain
 * @param g              graph; g->n_seg and g->seg[].len are read
 * @param n_seq          number of entries in gcs[]
 * @param gcs            gcs[t] holds the chains of query sequence t
 * @param a_dens         (out) accumulated anchor-count term divided by the
 *                       accumulated anchored length over all non-empty linear chains
 * @param soff_          (out) g->n_seg+1 offsets into *sintv_; segment i owns
 *                       entries [soff[i], soff[i+1])
 * @param qoff_          (out) n_seq+1 offsets into *qintv_; query t owns
 *                       entries [qoff[t], qoff[t+1])
 * @param sintv_         (out) segment intervals, each per-segment slice processed by mg_intv_index()
 * @param qintv_         (out) query intervals, each per-query slice processed by mg_intv_index()
 *
 * @return the largest n_anchor among accepted gchains, or 0 when that maximum
 *         is zero; in the latter case no output argument is written. The four
 *         output arrays are allocated from km and are not freed here.
 */
int32_t mg_gc_index(void *km, int min_mapq, int min_map_len, int min_depth_len, const gfa_t *g, int32_t n_seq, mg_gchains_t *const* gcs,
					double *a_dens, int32_t **soff_, int32_t **qoff_, mg_intv_t **sintv_, mg_intv_t **qintv_)
{
	int32_t t, i, j, max_acnt, *scnt, *soff, *qcnt, *qoff;
	int64_t sum_acnt, sum_alen;
	mg_intv_t *sintv, *qintv;

	(void)min_map_len; // unused; see the parameter documentation above

	// pass 1: count the number of intervals on each segment and on each query
	KCALLOC(km, scnt, g->n_seg);
	KCALLOC(km, qcnt, n_seq);
	for (t = 0, max_acnt = 0; t < n_seq; ++t) {
		const mg_gchains_t *gt = gcs[t];
		for (i = 0; i < gt->n_gc; ++i) {
			const mg_gchain_t *gc = &gt->gc[i];
			if (gc->id != gc->parent) continue;
			if (gc->blen < min_depth_len || gc->mapq < min_mapq) continue;
			if (gc->n_anchor > max_acnt) max_acnt = gc->n_anchor;
			++qcnt[t];
			for (j = 0; j < gc->cnt; ++j)
				++scnt[gt->lc[gc->off + j].v>>1];
		}
	}
	if (max_acnt == 0) { // no gchain
		kfree(km, scnt); kfree(km, qcnt);
		return 0;
	}

	// compute soff[] and qoff[] as prefix sums of the counts
	KMALLOC(km, soff, g->n_seg + 1);
	KMALLOC(km, qoff, n_seq + 1);
	for (soff[0] = 0, i = 1; i <= g->n_seg; ++i)
		soff[i] = soff[i - 1] + scnt[i - 1];
	for (qoff[0] = 0, i = 1; i <= n_seq; ++i)
		qoff[i] = qoff[i - 1] + qcnt[i - 1];

	// pass 2: populate the interval lists; the counters are reused as per-slot write cursors
	memset(scnt, 0, sizeof(*scnt) * g->n_seg);
	memset(qcnt, 0, sizeof(*qcnt) * n_seq);
	KMALLOC(km, sintv, soff[g->n_seg]);
	KMALLOC(km, qintv, qoff[n_seq]);
	sum_acnt = sum_alen = 0;
	for (t = 0; t < n_seq; ++t) {
		const mg_gchains_t *gt = gcs[t];
		for (i = 0; i < gt->n_gc; ++i) {
			const mg_gchain_t *gc = &gt->gc[i];
			mg_intv_t *p;
			if (gc->id != gc->parent) continue;
			if (gc->blen < min_depth_len || gc->mapq < min_mapq) continue;
			p = &qintv[qoff[t] + qcnt[t]];
			++qcnt[t];
			p->st = gc->qs, p->en = gc->qe, p->rev = 0, p->far = -1, p->i = -1;
			for (j = 0; j < gc->cnt; ++j) {
				const mg_llchain_t *lc = &gt->lc[gc->off + j];
				int32_t rs, re, tmp;
				if (lc->cnt > 0) { // compute start and end on the forward strand on the segment
					const mg128_t *qs = &gt->a[lc->off];
					const mg128_t *qe = &gt->a[lc->off + lc->cnt - 1];
					int32_t rs0 = (int32_t)qs->x + 1 - (int32_t)(qs->y>>32&0xff);
					int32_t re0 = (int32_t)qe->x;
					assert(rs0 >= 0 && re0 > rs0 && re0 < g->seg[lc->v>>1].len);
					sum_alen += re0 - rs0, sum_acnt += (qe->x>>32) - (qs->x>>32) + 1;
					// interior linear chains span the whole segment; only the first/last are clipped
					rs = 0, re = g->seg[lc->v>>1].len;
					if (j == 0) rs = gc->p? gc->p->ss : rs0;
					if (j == gc->cnt - 1) re = gc->p? gc->p->ee : re0;
					if (lc->v&1) // reverse orientation: mirror the interval onto the forward strand
						tmp = rs, rs = g->seg[lc->v>>1].len - re, re = g->seg[lc->v>>1].len - tmp;
				} else rs = 0, re = g->seg[lc->v>>1].len; // no anchors: the whole segment
				p = &sintv[soff[lc->v>>1] + scnt[lc->v>>1]];
				++scnt[lc->v>>1];
				p->st = rs, p->en = re, p->rev = lc->v&1, p->far = -1, p->i = -1;
			}
		}
	}
	*a_dens = (double)sum_acnt / sum_alen;

	// sort and index intervals
	for (i = 0; i < g->n_seg; ++i) {
		assert(soff[i+1] - soff[i] == scnt[i]);
		mg_intv_index(soff[i+1] - soff[i], &sintv[soff[i]]);
	}
	kfree(km, scnt);
	for (i = 0; i < n_seq; ++i) {
		assert(qoff[i+1] - qoff[i] == qcnt[i]);
		mg_intv_index(qoff[i+1] - qoff[i], &qintv[qoff[i]]);
	}
	kfree(km, qcnt);

	*sintv_ = sintv, *qintv_ = qintv;
	*soff_ = soff, *qoff_ = qoff;
	return max_acnt;
}
|
102
|
+
|
103
|
+
/**********************
|
104
|
+
* Graph augmentation *
|
105
|
+
**********************/
|
106
|
+
|
107
|
+
/**
 * Augment graph g with events derived from the anchors of graph chains.
 *
 * For every accepted gchain (own parent, blen >= opt->min_map_len,
 * mapq >= opt->min_mapq) the gap between each pair of adjacent anchors is
 * scored, mg_mss_all() extracts high-scoring runs of gaps, and each run is
 * turned into a candidate gfa_ins_t. Candidates are dropped when they are too
 * close to chain or sequence ends, too short, contain 'N'/'n', are not covered
 * exactly once in the interval index built by mg_gc_index(), or are too
 * similar to the existing path. Unless MG_G_NO_INV is set, a candidate whose
 * reverse complement matches the path is recorded as an inversion (two
 * zero-length events). Survivors go through gfa_ins_filter() and gfa_augment().
 *
 * @param km     memory pool for temporary allocations
 * @param opt    graph-generation options
 * @param g      graph to augment (modified by gfa_augment())
 * @param n_seq  number of query sequences
 * @param seq    query sequences; name, seq and l_seq are read
 * @param gcs    gcs[t] holds the graph chains of seq[t]
 */
void mg_ggsimple(void *km, const mg_ggopt_t *opt, gfa_t *g, int32_t n_seq, const mg_bseq1_t *seq, mg_gchains_t *const* gcs)
{
	int32_t t, i, j, *soff, *qoff, max_acnt, *sc, m_ovlp = 0, *ovlp = 0, n_ins, m_ins, n_inv;
	int32_t l_pseq, m_pseq;
	uint64_t *meta;
	mg_intv_t *sintv, *qintv;
	double a_dens, ed_scale;
	gfa_ins_t *ins;
	char *pseq;

	max_acnt = mg_gc_index(km, opt->min_mapq, opt->min_map_len, opt->min_depth_len, g, n_seq, gcs, &a_dens, &soff, &qoff, &sintv, &qintv);
	if (max_acnt == 0) return;

	// multiplier applied to an edit distance when scoring a gap; loop-invariant and capped at 10
	ed_scale = 1. / (1.01 - opt->ggs_max_iden);
	if (ed_scale > 10.) ed_scale = 10.;

	// extract poorly aligned regions
	m_pseq = l_pseq = 0, pseq = 0;
	m_ins = n_ins = 0, ins = 0;
	n_inv = 0;
	KMALLOC(km, sc, max_acnt);
	KMALLOC(km, meta, max_acnt);
	for (t = 0; t < n_seq; ++t) {
		const mg_gchains_t *gt = gcs[t];
		for (i = 0; i < gt->n_gc; ++i) {
			const mg_gchain_t *gc = &gt->gc[i];
			int32_t off_a, off_l, n_ss, far_q;
			mg_msseg_t *ss;
			if (gc->id != gc->parent) continue;
			if (gc->blen < opt->min_map_len || gc->mapq < opt->min_mapq) continue;
			// sc[] and meta[] hold max_acnt entries, a bound mg_gc_index() derives under the
			// min_depth_len filter. A chain selected here by min_map_len may exceed it when
			// min_map_len < min_depth_len; skip it rather than write past the buffers.
			if (gc->n_anchor - 1 > max_acnt) continue;
			assert(gc->cnt > 0);

			// fill sc[]. This part achieves a similar goal to the one in mg_gchain_extra(). It makes more assumptions, but is logically simpler.
			// sc[j-1] scores the gap between anchors j-1 and j; meta[j-1] packs (path distance)<<32 | index of the lchain holding anchor j-1
			off_l = gc->off;
			off_a = gt->lc[off_l].off + 1;
			far_q = 0;
			for (j = 1; j < gc->n_anchor; ++j, ++off_a) {
				const mg128_t *q = &gt->a[off_a - 1], *p = &gt->a[off_a];
				const mg_llchain_t *lc = &gt->lc[off_l];
				int32_t s, ed = -1, off_l0 = off_l, pd, qd = (int32_t)p->y - (int32_t)q->y, c = (int32_t)(p->x>>32) - (int32_t)(q->x>>32) - 1;
				if ((int32_t)q->y > far_q) far_q = (int32_t)q->y; // far_q keeps the rightmost query position seen so far
				if (off_a == lc->off + lc->cnt) { // we are at the end of the current lchain
					pd = g->seg[lc->v>>1].len - (int32_t)q->x - 1;
					for (++off_l; off_l < gc->off + gc->cnt && gt->lc[off_l].cnt == 0; ++off_l)
						pd += g->seg[gt->lc[off_l].v>>1].len; // anchor-free lchains contribute their whole segment
					assert(off_l < gc->off + gc->cnt);
					if (gt->lc[off_l].ed >= 0) ed = gt->lc[off_l].ed;
					pd += (int32_t)p->x + 1;
				} else pd = (int32_t)p->x - (int32_t)q->x;
				if ((opt->flag&MG_G_NO_QOVLP) && (int32_t)p->y < far_q) s = 1; // query overlap
				else if (pd == qd && c == 0) s = -opt->match_pen;
				else if (ed >= 0) {
					int32_t min_d = pd < qd? pd : qd;
					s = (int32_t)(ed * ed_scale - min_d);
				} else if (pd > qd) {
					double x = qd * a_dens;
					x = x > c? x : c;
					s = (int32_t)(x + (pd - qd) * a_dens + .499);
				} else {
					s = (int32_t)(qd * a_dens + .499);
					s = s > c? s : c;
				}
				sc[j - 1] = s;
				meta[j-1] = (uint64_t)pd<<32 | off_l0;
			}

			// get regions to insert
			ss = mg_mss_all(0, gc->n_anchor - 1, sc, 10, 0, &n_ss);
			off_a = gt->lc[gc->off].off;
			for (j = 0; j < n_ss; ++j) {
				const mg128_t *p, *q;
				int32_t st, en, ls, le, span, pd, k, n_ovlp, min_len, is_inv = 0;
				gfa_ins_t I;

				// find the initial positions
				min_len = opt->ggs_min_end_cnt > 0? opt->ggs_min_end_cnt : 0;
				if (min_len < ss[j].sc * opt->ggs_min_end_frac) min_len = ss[j].sc * opt->ggs_min_end_frac;
				if (ss[j].st <= min_len || ss[j].en >= gc->n_anchor - 1 - min_len) continue; // too close to ends
				st = ss[j].st, en = ss[j].en;
				q = &gt->a[off_a + st];
				p = &gt->a[off_a + en];
				span = p->y>>32&0xff;
				I.ctg = t;
				ls = (int32_t)meta[st], le = (int32_t)meta[en]; // first and last lchain; CLOSED interval
				assert(ls <= le);
				I.v[0] = gt->lc[ls].v;
				I.v[1] = gt->lc[le].v;
				I.voff[0] = (int32_t)q->x + 1 - span;
				I.voff[1] = (int32_t)p->x + 1;
				I.coff[0] = (int32_t)q->y + 1 - span;
				I.coff[1] = (int32_t)p->y + 1;
				assert(I.voff[0] <= g->seg[I.v[0]>>1].len);
				assert(I.voff[1] <= g->seg[I.v[1]>>1].len);
				for (k = st, pd = span; k < en; ++k) // path length between the two anchors
					pd += meta[k]>>32;

				if (I.coff[0] > I.coff[1]) {
					if (mg_verbose >= 2 && pd + (I.coff[0] - I.coff[1]) >= opt->min_var_len)
						fprintf(stderr, "[W::%s] query overlap on gchain %d: [%c%s:%d,%c%s:%d|%d] <=> %s:[%d,%d|%d]\n", __func__, t, "><"[I.v[0]&1], g->seg[I.v[0]>>1].name, I.voff[0], "><"[I.v[1]&1], g->seg[I.v[1]>>1].name, I.voff[1], pd, seq[t].name, I.coff[0], I.coff[1], I.coff[1] - I.coff[0]);
					continue; // such overlap can't be properly resolved
				}
				pd -= gfa_ins_adj(g, opt->ggs_shrink_pen, &I, seq[t].seq);

				min_len = pd > I.coff[1] - I.coff[0]? pd : I.coff[1] - I.coff[0];
				if (I.coff[0] <= min_len || I.coff[1] >= seq[t].l_seq - min_len) continue; // test if the event is close to ends again

				// filtering
				if (I.coff[1] - I.coff[0] < opt->min_var_len && pd < opt->min_var_len)
					continue;
				for (k = I.coff[0]; k < I.coff[1]; ++k) { // test ambiguous bases
					int c = seq[t].seq[k];
					if (c == 'n' || c == 'N') break;
				}
				if (k != I.coff[1]) continue; // no ambiguous bases on the insert
				n_ovlp = mg_intv_overlap(km, qoff[t+1] - qoff[t], &qintv[qoff[t]], I.coff[0], I.coff[1], &ovlp, &m_ovlp); // test overlapping on the query
				if (n_ovlp == 0) fprintf(stderr, "[W::%s] query interval %s:%d-%d is not covered\n", __func__, seq[t].name, I.coff[0], I.coff[1]);
				if (n_ovlp != 1) continue;
				for (k = ls; k <= le; ++k) { // find other mappings overlapping with the insert on the graph
					uint32_t v = gt->lc[k].v, len = g->seg[v>>1].len;
					int32_t s = 0, e = len, tmp;
					if (k == ls) s = (int32_t)gt->a[off_a+st].x + 1 - (int32_t)(gt->a[off_a+st].y>>32&0xff);
					if (k == le) e = (int32_t)gt->a[off_a+en].x + 1;
					if (v&1) tmp = s, s = len - e, e = len - tmp;
					n_ovlp = mg_intv_overlap(km, soff[(v>>1)+1] - soff[v>>1], &sintv[soff[v>>1]], s, e, &ovlp, &m_ovlp);
					if (n_ovlp == 0) fprintf(stderr, "[W::%s] graph interval %s:%d-%d is not covered by %s:%d-%d\n", __func__, g->seg[v>>1].name, s, e, seq[t].name, I.coff[0], I.coff[1]); // this should be an assert()
					if (n_ovlp != 1) break;
				}
				if (k <= le) continue;
				if (pd - (I.coff[1] - I.coff[0]) < opt->min_var_len && (I.coff[1] - I.coff[0]) - pd < opt->min_var_len) { // if length difference > min_var_len, just insert
					int32_t qd = I.coff[1] - I.coff[0], mlen, blen, score;
					l_pseq = mg_path2seq(km, g, gt, ls, le, I.voff, &pseq, &m_pseq);
					score = mg_wfa_cmp(km, l_pseq, pseq, qd, &seq[t].seq[I.coff[0]], 5000, &mlen, &blen);
					if (score > 0) {
						if (mlen > blen * opt->ggs_max_iden) continue; // make sure k-mer identity is small enough
						if (blen - mlen < opt->min_var_len * opt->ggs_max_iden) continue;
					} else if (!(opt->flag & MG_G_NO_INV)) {
						mg_revcomp_seq(l_pseq, pseq);
						score = mg_wfa_cmp(km, l_pseq, pseq, qd, &seq[t].seq[I.coff[0]], 5000, &mlen, &blen);
						if (score > 0 && mlen > blen * opt->ggs_min_inv_iden) is_inv = 1;
					}
				}
				if (mg_dbg_flag & MG_DBG_INSERT) {
					int32_t mlen, blen, score, qd = I.coff[1] - I.coff[0];
					l_pseq = mg_path2seq(km, g, gt, ls, le, I.voff, &pseq, &m_pseq);
					fprintf(stderr, "IN\t[%c%s:%d,%c%s:%d|%d] <=> %s:[%d,%d|%d] inv:%d\n", "><"[I.v[0]&1], g->seg[I.v[0]>>1].name, I.voff[0], "><"[I.v[1]&1], g->seg[I.v[1]>>1].name, I.voff[1], pd, seq[t].name, I.coff[0], I.coff[1], I.coff[1] - I.coff[0], is_inv);
					fprintf(stderr, "IP\t%s\nIQ\t", pseq);
					fwrite(&seq[t].seq[I.coff[0]], 1, qd, stderr);
					if (pd - qd < opt->min_var_len && qd - pd < opt->min_var_len) {
						score = mg_wfa_cmp(km, l_pseq, pseq, qd, &seq[t].seq[I.coff[0]], 5000, &mlen, &blen);
					} else score = -1, mlen = 0, blen = pd > qd? pd : qd;
					fprintf(stderr, "\nIS\t%d==%d\tnwcmp:%d\tmlen:%d\tblen:%d\n", pd, l_pseq, score, mlen, blen);
				}
				if (is_inv) { // turn one inversion to two events
					gfa_ins_t I_inv[2];
					I_inv[0].ctg = I_inv[1].ctg = I.ctg;
					// the first event
					I_inv[0].coff[0] = I_inv[0].coff[1] = I.coff[0];
					I_inv[0].v[0] = I.v[0];
					I_inv[0].voff[0] = I.voff[0];
					I_inv[0].v[1] = I.v[1]^1;
					I_inv[0].voff[1] = g->seg[I.v[1]>>1].len - I.voff[1];
					// the second event
					I_inv[1].coff[0] = I_inv[1].coff[1] = I.coff[1];
					I_inv[1].v[0] = I.v[0]^1;
					I_inv[1].voff[0] = g->seg[I.v[0]>>1].len - I.voff[0];
					I_inv[1].v[1] = I.v[1];
					I_inv[1].voff[1] = I.voff[1];
					// insert
					if (n_ins == m_ins) KEXPAND(km, ins, m_ins);
					ins[n_ins++] = I_inv[0];
					if (n_ins == m_ins) KEXPAND(km, ins, m_ins);
					ins[n_ins++] = I_inv[1];
					++n_inv;
				} else {
					if (n_ins == m_ins) KEXPAND(km, ins, m_ins);
					ins[n_ins++] = I;
				}
			}
			kfree(0, ss); // ss was requested with a NULL pool above, so release it the same way
		}
	}
	kfree(km, pseq);
	kfree(km, ovlp);
	kfree(km, sc);
	kfree(km, meta);
	kfree(km, soff); kfree(km, qoff);
	kfree(km, sintv); kfree(km, qintv);

	if (n_ins > 0) {
		char **names, **seqs;
		KMALLOC(km, names, n_seq);
		KMALLOC(km, seqs, n_seq);
		for (i = 0; i < n_seq; ++i)
			names[i] = seq[i].name, seqs[i] = seq[i].seq;
		n_ins = gfa_ins_filter(g, n_ins, ins);
		gfa_augment(g, n_ins, ins, n_seq, (const char*const*)names, (const char*const*)seqs);
		kfree(km, ins);
		kfree(km, names);
		kfree(km, seqs);
	}
	if (mg_verbose >= 3)
		fprintf(stderr, "[M::%s::%.3f*%.2f] inserted %d events, including %d inversions\n", __func__,
				realtime() - mg_realtime0, cputime() / (realtime() - mg_realtime0), n_ins, n_inv);
}
|
310
|
+
|
311
|
+
/**********************
|
312
|
+
* Graph augmentation *
|
313
|
+
**********************/
|
314
|
+
|
315
|
+
/* One piece of a gchain CIGAR, cut so that it never crosses a segment boundary. */
typedef struct {
	int32_t lc;  /* index of the linear chain (into gt->lc[]) the piece lies on */
	int32_t vo;  /* offset of the piece on that vertex */
	int32_t qo;  /* offset of the piece on the query */
	int32_t po;  /* offset of the piece on the path, counted from the path start */
	int32_t len; /* length of the piece */
	int32_t op;  /* CIGAR operator code of the piece */
	int32_t sc;  /* score assigned by gg_score_intv() */
} ed_intv_t;
|
318
|
+
|
319
|
+
static int32_t gg_count_intv(const gfa_t *g, const mg_gchains_t *gt, int32_t i)
|
320
|
+
{
|
321
|
+
const mg_gchain_t *gc = >->gc[i];
|
322
|
+
int32_t j, l = gc->off, x = gc->ps, n = 0;
|
323
|
+
assert(gc->p);
|
324
|
+
for (j = 0; j < gc->p->n_cigar; ++j) {
|
325
|
+
int32_t op = gc->p->cigar[j]&0xf, len = gc->p->cigar[j]>>4, rl = len;
|
326
|
+
assert(op == 1 || op == 2 || op == 7 || op == 8);
|
327
|
+
if (op == 2 || op == 7 || op == 8) {
|
328
|
+
while (x + rl > g->seg[gt->lc[l].v>>1].len) {
|
329
|
+
rl -= g->seg[gt->lc[l].v>>1].len - x;
|
330
|
+
++n, ++l, x = 0;
|
331
|
+
}
|
332
|
+
x += rl;
|
333
|
+
}
|
334
|
+
++n;
|
335
|
+
}
|
336
|
+
return n;
|
337
|
+
}
|
338
|
+
|
339
|
+
/**
 * Cut the CIGAR of gchain i into pieces that never cross a segment boundary
 * and store them in intv[].
 *
 * The walk mirrors gg_count_intv(): intv[] must have room for the number of
 * pieces that function returns for the same chain. The sc field is not written
 * here. Operator codes follow "MIDNSHP=XB": 1 advances the query only, 2 the
 * graph only, 7 and 8 both.
 *
 * @param g     graph; segment lengths are read
 * @param gt    chains of one query; gt->gc[i].p must be non-NULL
 * @param i     index of the gchain in gt->gc[]
 * @param intv  (out) array receiving the pieces
 */
static void gg_write_intv(const gfa_t *g, const mg_gchains_t *gt, int32_t i, ed_intv_t *intv)
{
	const mg_gchain_t *gc = &gt->gc[i];
	// l: current lchain; pl: offset on the path; x: offset on the segment; y: offset on the query
	int32_t j, l = gc->off, pl = 0, x = gc->ps, y = gc->qs, n = 0;
	ed_intv_t *p;
	assert(gc->p);
	for (j = 0; j < gc->p->n_cigar; ++j) {
		int32_t op = gc->p->cigar[j]&0xf, len = gc->p->cigar[j]>>4, rl = len;
		if (op == 2 || op == 7 || op == 8) {
			while (x + rl > g->seg[gt->lc[l].v>>1].len) { // emit the part that fits on this segment, then move on
				p = &intv[n++];
				p->lc = l, p->vo = x, p->qo = y, p->po = pl, p->len = g->seg[gt->lc[l].v>>1].len - x, p->op = op;
				if (op == 7 || op == 8) y += p->len;
				rl -= p->len, pl += p->len, ++l, x = 0;
			}
		}
		p = &intv[n++]; // the remainder (or the whole operation when no split was needed)
		p->lc = l, p->vo = x, p->qo = y, p->po = pl, p->len = rl, p->op = op;
		if (op == 7 || op == 8) x += rl, y += rl, pl += rl;
		else if (op == 1) y += rl;
		else if (op == 2) x += rl, pl += rl;
	}
	assert(y == gc->qe && pl == gc->pe - gc->ps);
}
|
363
|
+
|
364
|
+
/**
 * Assign a score to every piece in intv[0..n_intv).
 *
 * Pieces whose operator code is 7 score -len when len >= 10 and 0 when
 * shorter; every other piece scores +len.
 */
static void gg_score_intv(int32_t n_intv, ed_intv_t *intv)
{
	int32_t k = 0;
	while (k < n_intv) {
		ed_intv_t *piece = &intv[k++];
		if (piece->op != 7)
			piece->sc = piece->len;
		else if (piece->len >= 10)
			piece->sc = -piece->len;
		else
			piece->sc = 0;
	}
}
|
375
|
+
|
376
|
+
/**
 * Merge neighbouring segments of ss[] when the stretch between them is cheap.
 *
 * Walking left to right, the scores of intv[] entries from left->en + 1 up to
 * (not including) right->st are summed into `gap`. If -gap is below 20% of
 * both segments' scores, the right segment is absorbed into the left one: the
 * left end and score are extended, and the right segment is emptied
 * (st = en, sc = 0). Otherwise the right segment becomes the new merge target.
 */
static void gg_merge_seg(const ed_intv_t *intv, int32_t n_ss, mg_msseg_t *ss)
{
	int32_t keep = 0, cur;
	for (cur = 1; cur < n_ss; ++cur) {
		mg_msseg_t *left = &ss[keep], *right = &ss[cur];
		int32_t gap = 0, k = left->en + 1;
		while (k < right->st)
			gap += intv[k++].sc;
		//fprintf(stderr, "XX\t%d\t%d\t%d\t%d\t%d\t%d\n", cur, left->sc, gap, right->sc, left->en+1, right->st);
		if (!(-gap < left->sc * 0.2 && -gap < right->sc * 0.2)) { // FIXME: gap is sometimes 0
			keep = cur;
			continue;
		}
		left->en = right->en;
		left->sc += right->sc + gap;
		right->st = right->en;
		right->sc = 0;
	}
}
|
391
|
+
|
392
|
+
/**
 * Augment graph g with events derived from the base-level CIGAR of graph chains.
 *
 * For every accepted gchain (own parent, has a CIGAR in gc->p,
 * blen >= opt->min_map_len, mapq >= opt->min_mapq) the CIGAR is cut into
 * per-segment pieces (gg_count_intv/gg_write_intv), scored (gg_score_intv),
 * and high-scoring runs are extracted with mg_mss_all() and merged with
 * gg_merge_seg(). Each run becomes a candidate gfa_ins_t that is dropped when
 * it is too close to sequence ends, too short, contains 'N'/'n', is not
 * covered exactly once in the interval index built by mg_gc_index(), or is
 * too similar to the existing path. Unless MG_G_NO_INV is set, a candidate
 * whose reverse complement matches the path is recorded as an inversion (two
 * zero-length events). Survivors go through gfa_ins_filter() and gfa_augment().
 *
 * @param km     memory pool for temporary allocations
 * @param opt    graph-generation options
 * @param g      graph to augment (modified by gfa_augment())
 * @param n_seq  number of query sequences
 * @param seq    query sequences; name, seq and l_seq are read
 * @param gcs    gcs[t] holds the graph chains of seq[t]
 */
void mg_ggsimple_cigar(void *km, const mg_ggopt_t *opt, gfa_t *g, int32_t n_seq, const mg_bseq1_t *seq, mg_gchains_t *const* gcs)
{
	int32_t t, i, *soff, *qoff, max_acnt, m_ovlp = 0, *ovlp = 0, n_ins, m_ins, n_inv;
	int32_t l_pseq, m_pseq;
	mg_intv_t *sintv, *qintv;
	double a_dens;
	gfa_ins_t *ins;
	char *pseq;

	// the return value is only used to tell whether any gchain was indexed
	max_acnt = mg_gc_index(km, opt->min_mapq, opt->min_map_len, opt->min_depth_len, g, n_seq, gcs, &a_dens, &soff, &qoff, &sintv, &qintv);
	if (max_acnt == 0) return;

	// extract poorly aligned regions
	m_pseq = l_pseq = 0, pseq = 0;
	m_ins = n_ins = 0, ins = 0;
	n_inv = 0;
	for (t = 0; t < n_seq; ++t) {
		const mg_gchains_t *gt = gcs[t];
		for (i = 0; i < gt->n_gc; ++i) {
			const mg_gchain_t *gc = &gt->gc[i];
			int32_t j, n_ss, n_intv, *sc;
			ed_intv_t *intv;
			mg_msseg_t *ss;
			if (gc->id != gc->parent) continue;
			if (gc->p == 0 || gc->blen < opt->min_map_len || gc->mapq < opt->min_mapq) continue;
			assert(gc->cnt > 0);

			// cut the CIGAR into per-segment pieces, score them and find high-scoring runs
			n_intv = gg_count_intv(g, gt, i);
			KCALLOC(km, intv, n_intv);
			gg_write_intv(g, gt, i, intv);
			gg_score_intv(n_intv, intv);
			KCALLOC(km, sc, n_intv);
			for (j = 0; j < n_intv; ++j) sc[j] = intv[j].sc;
			ss = mg_mss_all(0, n_intv, sc, opt->min_var_len, 2 * opt->min_var_len, &n_ss);
			gg_merge_seg(intv, n_ss, ss);

			// get regions to insert
			for (j = 0; j < n_ss; ++j) {
				int32_t st, en, pd, k, n_ovlp, min_len, is_inv = 0, ls, le;
				gfa_ins_t I;
				ed_intv_t *is, *ie;

				// find the initial positions
				// [st,en) is handled as half-open below: intv[en-1] is the last piece, and
				// st == en marks an empty segment (gg_merge_seg() empties absorbed segments that way)
				st = ss[j].st, en = ss[j].en;
				if (st == en) continue;
				is = &intv[st], ie = &intv[en - 1];
				assert(is->op != 7 && ie->op != 7);

				ls = is->lc, le = ie->lc;
				I.ctg = t;
				I.v[0] = gt->lc[ls].v;
				I.v[1] = gt->lc[le].v;
				I.voff[0] = is->vo;
				I.voff[1] = ie->vo + (ie->op != 1? ie->len : 0);
				I.coff[0] = is->qo;
				I.coff[1] = ie->qo + (ie->op != 2? ie->len : 0);
				assert(I.voff[0] <= g->seg[I.v[0]>>1].len);
				assert(I.voff[1] <= g->seg[I.v[1]>>1].len);

				if (I.voff[0] == 0) { // if an insert starts at pos 0, make it start at the end of the previous vertex in the chain
					assert(ls - 1 >= gc->off);
					I.v[0] = gt->lc[--ls].v;
					I.voff[0] = g->seg[I.v[0]>>1].len;
				}
				if (I.voff[1] == g->seg[I.v[1]>>1].len) { // if an insert ends at the end of the vertex, make it end at the beginning of the next vertex
					assert(le + 1 < gc->off + gc->cnt);
					I.v[1] = gt->lc[++le].v;
					I.voff[1] = 0;
				}

				// path length spanned by the run
				pd = ie->po + (ie->op != 1? ie->len : 0) - is->po;
				pd -= gfa_ins_adj(g, opt->ggs_shrink_pen, &I, seq[t].seq);

				min_len = pd > I.coff[1] - I.coff[0]? pd : I.coff[1] - I.coff[0];
				if (I.coff[0] <= min_len || I.coff[1] >= seq[t].l_seq - min_len) continue; // test if the event is close to ends again

				// filtering
				if (I.coff[1] - I.coff[0] < opt->min_var_len && pd < opt->min_var_len)
					continue;
				for (k = I.coff[0]; k < I.coff[1]; ++k) { // test ambiguous bases
					int c = seq[t].seq[k];
					if (c == 'n' || c == 'N') break;
				}
				if (k != I.coff[1]) continue; // no ambiguous bases on the insert
				n_ovlp = mg_intv_overlap(km, qoff[t+1] - qoff[t], &qintv[qoff[t]], I.coff[0], I.coff[1], &ovlp, &m_ovlp); // test overlapping on the query
				if (n_ovlp == 0) fprintf(stderr, "[W::%s] query interval %s:%d-%d is not covered\n", __func__, seq[t].name, I.coff[0], I.coff[1]);
				if (n_ovlp != 1) continue;
				for (k = is->lc; k <= ie->lc; ++k) { // find other mappings overlapping with the insert on the graph
					uint32_t v = gt->lc[k].v, len = g->seg[v>>1].len;
					int32_t s = 0, e = len, tmp;
					if (k == is->lc) s = is->vo;
					if (k == ie->lc) e = ie->vo + (ie->op != 1? ie->len : 0);
					if (v&1) tmp = s, s = len - e, e = len - tmp;
					if (s == e) { // widen an empty interval by one base so it can be looked up
						if (s == 0) ++e;
						else --s;
					}
					n_ovlp = mg_intv_overlap(km, soff[(v>>1)+1] - soff[v>>1], &sintv[soff[v>>1]], s, e, &ovlp, &m_ovlp);
					if (n_ovlp == 0) fprintf(stderr, "[W::%s] graph interval %c%s:%d-%d is not covered by %s:%d-%d\n", __func__, "><"[v&1], g->seg[v>>1].name, s, e, seq[t].name, I.coff[0], I.coff[1]); // this should be an assert()
					if (n_ovlp != 1) break;
				}
				if (k <= ie->lc) continue;
				if (pd - (I.coff[1] - I.coff[0]) < opt->min_var_len && (I.coff[1] - I.coff[0]) - pd < opt->min_var_len) { // if length difference > min_var_len, just insert
					int32_t qd = I.coff[1] - I.coff[0], mlen, blen, score = 0;
					l_pseq = mg_path2seq(km, g, gt, ls, le, I.voff, &pseq, &m_pseq);
					score = mg_wfa_cmp(km, l_pseq, pseq, qd, &seq[t].seq[I.coff[0]], 5000, &mlen, &blen);
					if (score > 0) {
						if (mlen > blen * opt->ggs_max_iden) continue; // make sure k-mer identity is small enough
						if (blen - mlen < opt->min_var_len * opt->ggs_max_iden) continue;
					} else if (!(opt->flag & MG_G_NO_INV)) {
						mg_revcomp_seq(l_pseq, pseq);
						score = mg_wfa_cmp(km, l_pseq, pseq, qd, &seq[t].seq[I.coff[0]], 5000, &mlen, &blen);
						if (score > 0 && mlen > blen * opt->ggs_min_inv_iden) is_inv = 1;
					}
				}
				if (mg_dbg_flag & MG_DBG_INSERT) {
					int32_t mlen, blen, score, qd = I.coff[1] - I.coff[0];
					l_pseq = mg_path2seq(km, g, gt, ls, le, I.voff, &pseq, &m_pseq);
					fprintf(stderr, "IN\t[%c%s:%d,%c%s:%d|%d] <=> %s:[%d,%d|%d] inv:%d\n", "><"[I.v[0]&1], g->seg[I.v[0]>>1].name, I.voff[0], "><"[I.v[1]&1], g->seg[I.v[1]>>1].name, I.voff[1], pd, seq[t].name, I.coff[0], I.coff[1], I.coff[1] - I.coff[0], is_inv);
					fprintf(stderr, "IP\t%s\nIQ\t", pseq);
					fwrite(&seq[t].seq[I.coff[0]], 1, qd, stderr);
					if (pd - qd < opt->min_var_len && qd - pd < opt->min_var_len) {
						score = mg_wfa_cmp(km, l_pseq, pseq, qd, &seq[t].seq[I.coff[0]], 5000, &mlen, &blen);
					} else score = -1, mlen = 0, blen = pd > qd? pd : qd;
					fprintf(stderr, "\nIS\t%d==%d\tnwcmp:%d\tmlen:%d\tblen:%d\n", pd, l_pseq, score, mlen, blen);
					//if (I.voff[0] == 2305301) { for (k = st; k < en; ++k) fprintf(stderr, "%d%c", intv[k].len, "MIDNSHP=XB"[intv[k].op]); fprintf(stderr, "\n"); }
				}
				if (is_inv) { // turn one inversion to two events
					gfa_ins_t I_inv[2];
					I_inv[0].ctg = I_inv[1].ctg = I.ctg;
					// the first event
					I_inv[0].coff[0] = I_inv[0].coff[1] = I.coff[0];
					I_inv[0].v[0] = I.v[0];
					I_inv[0].voff[0] = I.voff[0];
					I_inv[0].v[1] = I.v[1]^1;
					I_inv[0].voff[1] = g->seg[I.v[1]>>1].len - I.voff[1];
					// the second event
					I_inv[1].coff[0] = I_inv[1].coff[1] = I.coff[1];
					I_inv[1].v[0] = I.v[0]^1;
					I_inv[1].voff[0] = g->seg[I.v[0]>>1].len - I.voff[0];
					I_inv[1].v[1] = I.v[1];
					I_inv[1].voff[1] = I.voff[1];
					// insert
					if (n_ins == m_ins) KEXPAND(km, ins, m_ins);
					ins[n_ins++] = I_inv[0];
					if (n_ins == m_ins) KEXPAND(km, ins, m_ins);
					ins[n_ins++] = I_inv[1];
					++n_inv;
				} else {
					if (n_ins == m_ins) KEXPAND(km, ins, m_ins);
					ins[n_ins++] = I;
				}
			}
			kfree(0, ss); // this is allocated from malloc() inside mg_mss_all()
			kfree(km, intv);
			kfree(km, sc);
		}
	}
	kfree(km, pseq);
	kfree(km, ovlp);
	kfree(km, soff); kfree(km, qoff);
	kfree(km, sintv); kfree(km, qintv);

	if (n_ins > 0) {
		char **names, **seqs;
		KMALLOC(km, names, n_seq);
		KMALLOC(km, seqs, n_seq);
		for (i = 0; i < n_seq; ++i)
			names[i] = seq[i].name, seqs[i] = seq[i].seq;
		n_ins = gfa_ins_filter(g, n_ins, ins);
		gfa_augment(g, n_ins, ins, n_seq, (const char*const*)names, (const char*const*)seqs);
		kfree(km, ins);
		kfree(km, names);
		kfree(km, seqs);
	}
	if (mg_verbose >= 3)
		fprintf(stderr, "[M::%s::%.3f*%.2f] inserted %d events, including %d inversions\n", __func__,
				realtime() - mg_realtime0, cputime() / (realtime() - mg_realtime0), n_ins, n_inv);
}
|