ruby-minigraph 0.0.20.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +62 -0
- data/ext/Rakefile +56 -0
- data/ext/cmappy/cmappy.c +7 -0
- data/ext/cmappy/cmappy.h +8 -0
- data/ext/minigraph/LICENSE.txt +23 -0
- data/ext/minigraph/Makefile +66 -0
- data/ext/minigraph/NEWS.md +317 -0
- data/ext/minigraph/README.md +207 -0
- data/ext/minigraph/algo.c +194 -0
- data/ext/minigraph/algo.h +33 -0
- data/ext/minigraph/asm-call.c +147 -0
- data/ext/minigraph/bseq.c +133 -0
- data/ext/minigraph/bseq.h +76 -0
- data/ext/minigraph/cal_cov.c +139 -0
- data/ext/minigraph/doc/example1.png +0 -0
- data/ext/minigraph/doc/example2.png +0 -0
- data/ext/minigraph/doc/examples.graffle +0 -0
- data/ext/minigraph/format.c +241 -0
- data/ext/minigraph/galign.c +140 -0
- data/ext/minigraph/gchain1.c +532 -0
- data/ext/minigraph/gcmisc.c +223 -0
- data/ext/minigraph/gfa-aug.c +260 -0
- data/ext/minigraph/gfa-base.c +526 -0
- data/ext/minigraph/gfa-bbl.c +372 -0
- data/ext/minigraph/gfa-ed.c +617 -0
- data/ext/minigraph/gfa-io.c +395 -0
- data/ext/minigraph/gfa-priv.h +154 -0
- data/ext/minigraph/gfa.h +166 -0
- data/ext/minigraph/ggen.c +182 -0
- data/ext/minigraph/ggen.h +21 -0
- data/ext/minigraph/ggsimple.c +570 -0
- data/ext/minigraph/gmap.c +211 -0
- data/ext/minigraph/index.c +230 -0
- data/ext/minigraph/kalloc.c +224 -0
- data/ext/minigraph/kalloc.h +82 -0
- data/ext/minigraph/kavl.h +414 -0
- data/ext/minigraph/kdq.h +134 -0
- data/ext/minigraph/ketopt.h +116 -0
- data/ext/minigraph/khashl.h +348 -0
- data/ext/minigraph/krmq.h +474 -0
- data/ext/minigraph/kseq.h +256 -0
- data/ext/minigraph/ksort.h +164 -0
- data/ext/minigraph/kstring.h +165 -0
- data/ext/minigraph/kthread.c +159 -0
- data/ext/minigraph/kthread.h +15 -0
- data/ext/minigraph/kvec-km.h +105 -0
- data/ext/minigraph/kvec.h +110 -0
- data/ext/minigraph/lchain.c +441 -0
- data/ext/minigraph/main.c +301 -0
- data/ext/minigraph/map-algo.c +500 -0
- data/ext/minigraph/mgpriv.h +128 -0
- data/ext/minigraph/minigraph.1 +359 -0
- data/ext/minigraph/minigraph.h +176 -0
- data/ext/minigraph/miniwfa.c +834 -0
- data/ext/minigraph/miniwfa.h +95 -0
- data/ext/minigraph/misc/mgutils.js +1451 -0
- data/ext/minigraph/misc.c +12 -0
- data/ext/minigraph/options.c +134 -0
- data/ext/minigraph/shortk.c +251 -0
- data/ext/minigraph/sketch.c +109 -0
- data/ext/minigraph/sys.c +147 -0
- data/ext/minigraph/sys.h +20 -0
- data/ext/minigraph/test/MT-chimp.fa +277 -0
- data/ext/minigraph/test/MT-human.fa +239 -0
- data/ext/minigraph/test/MT-orangA.fa +276 -0
- data/ext/minigraph/test/MT.gfa +19 -0
- data/ext/minigraph/tex/Makefile +13 -0
- data/ext/minigraph/tex/minigraph.bib +676 -0
- data/ext/minigraph/tex/minigraph.tex +986 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
- data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
- data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
- data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
- data/ext/minigraph/tex/plots/bedutils.js +367 -0
- data/ext/minigraph/tex/plots/chr-plot.js +130 -0
- data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
- data/ext/minigraph.patch +21 -0
- data/lib/minigraph/ffi/constants.rb +230 -0
- data/lib/minigraph/ffi/functions.rb +70 -0
- data/lib/minigraph/ffi/mappy.rb +8 -0
- data/lib/minigraph/ffi.rb +27 -0
- data/lib/minigraph/version.rb +5 -0
- data/lib/minigraph.rb +72 -0
- metadata +159 -0
@@ -0,0 +1,223 @@
|
|
1
|
+
#include <math.h>
|
2
|
+
#include <assert.h>
|
3
|
+
#include <string.h>
|
4
|
+
#include "mgpriv.h"
|
5
|
+
#include "kalloc.h"
|
6
|
+
|
7
|
+
// reorder gcs->a[] and gcs->lc[] such that they are in the same order as gcs->gc[]
|
8
|
+
void mg_gchain_restore_order(void *km, mg_gchains_t *gcs)
|
9
|
+
{
|
10
|
+
int32_t i, n_a, n_lc;
|
11
|
+
mg_llchain_t *lc;
|
12
|
+
mg128_t *a;
|
13
|
+
KMALLOC(km, lc, gcs->n_lc);
|
14
|
+
KMALLOC(km, a, gcs->n_a);
|
15
|
+
for (i = 0, n_a = n_lc = 0; i < gcs->n_gc; ++i) {
|
16
|
+
mg_gchain_t *gc = &gcs->gc[i];
|
17
|
+
assert(gc->cnt > 0);
|
18
|
+
memcpy(&lc[n_lc], &gcs->lc[gc->off], gc->cnt * sizeof(mg_llchain_t));
|
19
|
+
memcpy(&a[n_a], &gcs->a[gcs->lc[gc->off].off], gc->n_anchor * sizeof(mg128_t));
|
20
|
+
n_lc += gc->cnt, n_a += gc->n_anchor;
|
21
|
+
}
|
22
|
+
memcpy(gcs->lc, lc, gcs->n_lc * sizeof(mg_llchain_t));
|
23
|
+
memcpy(gcs->a, a, gcs->n_a * sizeof(mg128_t));
|
24
|
+
kfree(km, lc); kfree(km, a);
|
25
|
+
for (i = 0, n_lc = 0; i < gcs->n_gc; ++i) {
|
26
|
+
mg_gchain_t *gc = &gcs->gc[i];
|
27
|
+
gc->off = n_lc;
|
28
|
+
n_lc += gc->cnt;
|
29
|
+
}
|
30
|
+
for (i = 0, n_a = 0; i < gcs->n_lc; ++i) {
|
31
|
+
mg_llchain_t *lc = &gcs->lc[i];
|
32
|
+
lc->off = n_a;
|
33
|
+
n_a += lc->cnt;
|
34
|
+
}
|
35
|
+
}
|
36
|
+
|
37
|
+
// recompute gcs->gc[].{off,n_anchor} and gcs->lc[].off, ASSUMING they are properly ordered (see mg_gchain_restore_order)
|
38
|
+
void mg_gchain_restore_offset(mg_gchains_t *gcs)
|
39
|
+
{
|
40
|
+
int32_t i, j, n_a, n_lc;
|
41
|
+
for (i = 0, n_a = n_lc = 0; i < gcs->n_gc; ++i) {
|
42
|
+
mg_gchain_t *gc = &gcs->gc[i];
|
43
|
+
gc->off = n_lc;
|
44
|
+
for (j = 0, gc->n_anchor = 0; j < gc->cnt; ++j) {
|
45
|
+
mg_llchain_t *lc = &gcs->lc[n_lc + j];
|
46
|
+
lc->off = n_a;
|
47
|
+
n_a += lc->cnt;
|
48
|
+
gc->n_anchor += lc->cnt;
|
49
|
+
}
|
50
|
+
n_lc += gc->cnt;
|
51
|
+
}
|
52
|
+
assert(n_lc == gcs->n_lc && n_a == gcs->n_a);
|
53
|
+
}
|
54
|
+
|
55
|
+
// sort chains by score
|
56
|
+
void mg_gchain_sort_by_score(void *km, mg_gchains_t *gcs)
|
57
|
+
{
|
58
|
+
mg128_t *z;
|
59
|
+
mg_gchain_t *gc;
|
60
|
+
int32_t i;
|
61
|
+
KMALLOC(km, z, gcs->n_gc);
|
62
|
+
KMALLOC(km, gc, gcs->n_gc);
|
63
|
+
for (i = 0; i < gcs->n_gc; ++i)
|
64
|
+
z[i].x = (uint64_t)gcs->gc[i].score << 32 | gcs->gc[i].hash, z[i].y = i;
|
65
|
+
radix_sort_128x(z, z + gcs->n_gc);
|
66
|
+
for (i = gcs->n_gc - 1; i >= 0; --i)
|
67
|
+
gc[gcs->n_gc - 1 - i] = gcs->gc[z[i].y];
|
68
|
+
memcpy(gcs->gc, gc, gcs->n_gc * sizeof(mg_gchain_t));
|
69
|
+
kfree(km, z); kfree(km, gc);
|
70
|
+
mg_gchain_restore_order(km, gcs); // this put gcs in the proper order
|
71
|
+
}
|
72
|
+
|
73
|
+
// set r[].{id,parent,subsc}, ASSUMING r[] is sorted by score
|
74
|
+
void mg_gchain_set_parent(void *km, float mask_level, int n, mg_gchain_t *r, int sub_diff, int hard_mask_level)
|
75
|
+
{
|
76
|
+
int i, j, k, *w;
|
77
|
+
uint64_t *cov;
|
78
|
+
if (n <= 0) return;
|
79
|
+
for (i = 0; i < n; ++i) r[i].id = i;
|
80
|
+
cov = (uint64_t*)kmalloc(km, n * sizeof(uint64_t));
|
81
|
+
w = (int*)kmalloc(km, n * sizeof(int));
|
82
|
+
w[0] = 0, r[0].parent = 0;
|
83
|
+
for (i = 1, k = 1; i < n; ++i) {
|
84
|
+
mg_gchain_t *ri = &r[i];
|
85
|
+
int si = ri->qs, ei = ri->qe, n_cov = 0, uncov_len = 0;
|
86
|
+
if (hard_mask_level) goto skip_uncov;
|
87
|
+
for (j = 0; j < k; ++j) { // traverse existing primary hits to find overlapping hits
|
88
|
+
mg_gchain_t *rp = &r[w[j]];
|
89
|
+
int sj = rp->qs, ej = rp->qe;
|
90
|
+
if (ej <= si || sj >= ei) continue;
|
91
|
+
if (sj < si) sj = si;
|
92
|
+
if (ej > ei) ej = ei;
|
93
|
+
cov[n_cov++] = (uint64_t)sj<<32 | ej;
|
94
|
+
}
|
95
|
+
if (n_cov == 0) {
|
96
|
+
goto set_parent_test; // no overlapping primary hits; then i is a new primary hit
|
97
|
+
} else if (n_cov > 0) { // there are overlapping primary hits; find the length not covered by existing primary hits
|
98
|
+
int j, x = si;
|
99
|
+
radix_sort_gfa64(cov, cov + n_cov);
|
100
|
+
for (j = 0; j < n_cov; ++j) {
|
101
|
+
if ((int)(cov[j]>>32) > x) uncov_len += (cov[j]>>32) - x;
|
102
|
+
x = (int32_t)cov[j] > x? (int32_t)cov[j] : x;
|
103
|
+
}
|
104
|
+
if (ei > x) uncov_len += ei - x;
|
105
|
+
}
|
106
|
+
skip_uncov:
|
107
|
+
for (j = 0; j < k; ++j) { // traverse existing primary hits again
|
108
|
+
mg_gchain_t *rp = &r[w[j]];
|
109
|
+
int sj = rp->qs, ej = rp->qe, min, max, ol;
|
110
|
+
if (ej <= si || sj >= ei) continue; // no overlap
|
111
|
+
min = ej - sj < ei - si? ej - sj : ei - si;
|
112
|
+
max = ej - sj > ei - si? ej - sj : ei - si;
|
113
|
+
ol = si < sj? (ei < sj? 0 : ei < ej? ei - sj : ej - sj) : (ej < si? 0 : ej < ei? ej - si : ei - si); // overlap length; TODO: this can be simplified
|
114
|
+
if ((float)ol / min - (float)uncov_len / max > mask_level) {
|
115
|
+
int cnt_sub = 0;
|
116
|
+
ri->parent = rp->parent;
|
117
|
+
rp->subsc = rp->subsc > ri->score? rp->subsc : ri->score;
|
118
|
+
if (ri->cnt >= rp->cnt) cnt_sub = 1;
|
119
|
+
if (cnt_sub) ++rp->n_sub;
|
120
|
+
break;
|
121
|
+
}
|
122
|
+
}
|
123
|
+
set_parent_test:
|
124
|
+
if (j == k) w[k++] = i, ri->parent = i, ri->n_sub = 0;
|
125
|
+
}
|
126
|
+
kfree(km, cov);
|
127
|
+
kfree(km, w);
|
128
|
+
}
|
129
|
+
|
130
|
+
// set r[].flt, i.e. mark weak suboptimal chains as filtered
|
131
|
+
int mg_gchain_flt_sub(float pri_ratio, int min_diff, int best_n, int n, mg_gchain_t *r)
|
132
|
+
{
|
133
|
+
if (pri_ratio > 0.0f && n > 0) {
|
134
|
+
int i, k, n_2nd = 0;
|
135
|
+
for (i = k = 0; i < n; ++i) {
|
136
|
+
int p = r[i].parent;
|
137
|
+
if (p == i) { // primary
|
138
|
+
r[i].flt = 0, ++k;
|
139
|
+
} else if ((r[i].score >= r[p].score * pri_ratio || r[i].score + min_diff >= r[p].score) && n_2nd < best_n) {
|
140
|
+
if (!(r[i].qs == r[p].qs && r[i].qe == r[p].qe && r[i].ps == r[p].ps && r[i].pe == r[p].pe)) // not identical hits; TODO: check path as well
|
141
|
+
r[i].flt = 0, ++n_2nd, ++k;
|
142
|
+
else r[i].flt = 1;
|
143
|
+
} else r[i].flt = 1;
|
144
|
+
}
|
145
|
+
return k;
|
146
|
+
}
|
147
|
+
return n;
|
148
|
+
}
|
149
|
+
|
150
|
+
// hard drop filtered chains, ASSUMING gcs is properly ordered
|
151
|
+
void mg_gchain_drop_flt(void *km, mg_gchains_t *gcs)
|
152
|
+
{
|
153
|
+
int32_t i, n_gc, n_lc, n_a, n_lc0, n_a0, *o2n;
|
154
|
+
if (gcs->n_gc == 0) return;
|
155
|
+
KMALLOC(km, o2n, gcs->n_gc);
|
156
|
+
for (i = 0, n_gc = 0; i < gcs->n_gc; ++i) {
|
157
|
+
mg_gchain_t *r = &gcs->gc[i];
|
158
|
+
o2n[i] = -1;
|
159
|
+
if (r->flt || r->cnt == 0) {
|
160
|
+
kfree(gcs->km, r->p);
|
161
|
+
continue;
|
162
|
+
}
|
163
|
+
o2n[i] = n_gc++;
|
164
|
+
}
|
165
|
+
n_gc = n_lc = n_a = 0;
|
166
|
+
n_lc0 = n_a0 = 0;
|
167
|
+
for (i = 0; i < gcs->n_gc; ++i) {
|
168
|
+
mg_gchain_t *r = &gcs->gc[i];
|
169
|
+
if (o2n[i] >= 0) {
|
170
|
+
memmove(&gcs->a[n_a], &gcs->a[n_a0], r->n_anchor * sizeof(mg128_t));
|
171
|
+
memmove(&gcs->lc[n_lc], &gcs->lc[n_lc0], r->cnt * sizeof(mg_llchain_t));
|
172
|
+
gcs->gc[n_gc] = *r;
|
173
|
+
gcs->gc[n_gc].id = n_gc;
|
174
|
+
gcs->gc[n_gc].parent = o2n[gcs->gc[n_gc].parent];
|
175
|
+
++n_gc, n_lc += r->cnt, n_a += r->n_anchor;
|
176
|
+
}
|
177
|
+
n_lc0 += r->cnt, n_a0 += r->n_anchor;
|
178
|
+
}
|
179
|
+
assert(n_lc0 == gcs->n_lc && n_a0 == gcs->n_a);
|
180
|
+
kfree(km, o2n);
|
181
|
+
gcs->n_gc = n_gc, gcs->n_lc = n_lc, gcs->n_a = n_a;
|
182
|
+
if (n_a != n_a0) {
|
183
|
+
KREALLOC(gcs->km, gcs->a, gcs->n_a);
|
184
|
+
KREALLOC(gcs->km, gcs->lc, gcs->n_lc);
|
185
|
+
KREALLOC(gcs->km, gcs->gc, gcs->n_gc);
|
186
|
+
}
|
187
|
+
mg_gchain_restore_offset(gcs);
|
188
|
+
}
|
189
|
+
|
190
|
+
// estimate mapping quality
|
191
|
+
void mg_gchain_set_mapq(void *km, mg_gchains_t *gcs, int qlen, int max_mini, int min_gc_score)
|
192
|
+
{
|
193
|
+
static const float q_coef = 40.0f;
|
194
|
+
int64_t sum_sc = 0;
|
195
|
+
float uniq_ratio, r_sc, r_cnt;
|
196
|
+
int i, t_sc, t_cnt;
|
197
|
+
if (gcs == 0 || gcs->n_gc == 0) return;
|
198
|
+
t_sc = qlen < 100? qlen : 100;
|
199
|
+
t_cnt = max_mini < 10? max_mini : 10;
|
200
|
+
if (t_cnt < 5) t_cnt = 5;
|
201
|
+
r_sc = 1.0 / t_sc;
|
202
|
+
r_cnt = 1.0 / t_cnt;
|
203
|
+
for (i = 0; i < gcs->n_gc; ++i)
|
204
|
+
if (gcs->gc[i].parent == gcs->gc[i].id)
|
205
|
+
sum_sc += gcs->gc[i].score;
|
206
|
+
uniq_ratio = (float)sum_sc / (sum_sc + gcs->rep_len);
|
207
|
+
for (i = 0; i < gcs->n_gc; ++i) {
|
208
|
+
mg_gchain_t *r = &gcs->gc[i];
|
209
|
+
if (r->parent == r->id) {
|
210
|
+
int mapq, subsc;
|
211
|
+
float pen_s1 = (r->score > t_sc? 1.0f : r->score * r_sc) * uniq_ratio;
|
212
|
+
float x, pen_cm = r->n_anchor > t_cnt? 1.0f : r->n_anchor * r_cnt;
|
213
|
+
pen_cm = pen_s1 < pen_cm? pen_s1 : pen_cm;
|
214
|
+
subsc = r->subsc > min_gc_score? r->subsc : min_gc_score;
|
215
|
+
x = (float)subsc / r->score;
|
216
|
+
mapq = (int)(pen_cm * q_coef * (1.0f - x) * logf(r->score));
|
217
|
+
mapq -= (int)(4.343f * logf(r->n_sub + 1) + .499f);
|
218
|
+
mapq = mapq > 0? mapq : 0;
|
219
|
+
if (r->score > subsc && mapq == 0) mapq = 1;
|
220
|
+
r->mapq = mapq < 60? mapq : 60;
|
221
|
+
} else r->mapq = 0;
|
222
|
+
}
|
223
|
+
}
|
@@ -0,0 +1,260 @@
|
|
1
|
+
#include <assert.h>
|
2
|
+
#include <ctype.h>
|
3
|
+
#include "gfa-priv.h"
|
4
|
+
#include "ksort.h"
|
5
|
+
|
6
|
+
typedef struct {
|
7
|
+
uint32_t side;
|
8
|
+
uint32_t ins:31, end:1;
|
9
|
+
} gfa_split_t;
|
10
|
+
|
11
|
+
#define split_key(p) ((p).side)
|
12
|
+
KRADIX_SORT_INIT(split, gfa_split_t, split_key, 4)
|
13
|
+
|
14
|
+
// Append one directed arc v->w to g->arc[], growing the array when it is full.
// seg[] supplies the length of v's segment, which is stored in the low half of v_lv.
// Both overlap lengths are set to 0 and the arc is not marked as deleted; comp records
// whether is_comp is non-zero.
static inline void create_first_arc_semi(gfa_t *g, const gfa_seg_t *seg, uint32_t v, uint32_t w, int32_t rank, uint64_t link_id, int is_comp)
{
	gfa_arc_t *arc;

	if (g->n_arc == g->m_arc) {
		GFA_EXPAND(g->arc, g->m_arc);
	}
	arc = &g->arc[g->n_arc];
	++g->n_arc;

	arc->v_lv = (uint64_t)v<<32 | seg[v>>1].len;
	arc->w = w;
	arc->rank = rank;
	arc->ow = 0;
	arc->ov = 0;
	arc->link_id = link_id;
	arc->del = 0;
	arc->comp = is_comp != 0;
}
|
27
|
+
|
28
|
+
// Add the link v->w together with its complement (w^1)->(v^1).
// Both arcs share one link_id, taken as the arc count before either is appended;
// only the second arc is flagged as the complement.
static inline void create_first_arc(gfa_t *g, const gfa_seg_t *seg, uint32_t v, uint32_t w, int32_t rank)
{
	const uint64_t shared_id = g->n_arc;

	create_first_arc_semi(g, seg, v, w, rank, shared_id, 0);
	create_first_arc_semi(g, seg, w^1, v^1, rank, shared_id, 1);
}
|
34
|
+
|
35
|
+
// Augment graph g with n_ins insertions described by ins[].
//
// Each insertion has two ends, each a position voff[] on an oriented segment v[], and an
// interval [coff[0],coff[1]) on contig ctg. name[] and seq[] are indexed by ctg and give the
// contig names and sequences. The function
//   1. buckets the 2*n_ins cut points by segment and sorts them within each bucket;
//   2. cuts every old segment at its interior cut points, building a fresh segment array
//      in which every segment is named "s<index+1>", and links consecutive pieces;
//   3. redirects the pre-existing arcs to the first/last piece of their old segment;
//   4. adds one new segment per insertion with coff[0] < coff[1] and links it to both cut
//      points; an insertion with an empty contig interval (a pure deletion) only links
//      the two cut points to each other;
//   5. frees the old segments (name, seq, aux.aux), installs the new array, increments
//      g->max_rank, extends link_aux and re-sorts/re-indexes the arcs.
// New segments and links get rank g->max_rank + 1 (value before the increment).
// Returns immediately, leaving g untouched, if n_ins <= 0 or n_ctg <= 0.
void gfa_augment(gfa_t *g, int32_t n_ins, const gfa_ins_t *ins, int32_t n_ctg, const char *const* name, const char *const* seq)
{
	int32_t i, j, k, *fill, *first, n_ctg_seg = 0, n_old_seg = 0, n_seg;
	gfa_split_t *sp;
	gfa_seg_t *seg;
	char buf[16];
	uint64_t ai, n_old_arc = g->n_arc, *ins_side, *oldcnt;

	if (n_ins <= 0 || n_ctg <= 0) return;

	// count cut points per segment, then turn the counts into bucket start offsets first[]
	GFA_CALLOC(fill, g->n_seg);
	for (i = 0; i < n_ins; ++i) {
		++fill[ins[i].v[0]>>1];
		++fill[ins[i].v[1]>>1];
	}
	GFA_MALLOC(first, g->n_seg + 1);
	first[0] = 0;
	for (j = 1; j <= g->n_seg; ++j)
		first[j] = first[j-1] + fill[j-1];

	// distribute the cut points into sp[]; fill[] now serves as the per-bucket write cursor
	GFA_MALLOC(sp, first[g->n_seg]);
	GFA_BZERO(fill, g->n_seg);
	for (i = 0; i < n_ins; ++i) {
		const gfa_ins_t *p = &ins[i];
		for (k = 0; k < 2; ++k) {
			uint32_t vlen = g->seg[p->v[k]>>1].len;
			gfa_split_t *q = &sp[first[p->v[k]>>1] + fill[p->v[k]>>1]];
			q->ins = i;
			q->end = k;
			q->side = (p->v[k]&1? vlen - p->voff[k] : p->voff[k]) << 1 | ((p->v[k]&1) ^ k);
			assert(q->side != (0<<1|0) && q->side != (vlen<<1|1)); // not possible to link such sides
			++fill[p->v[k]>>1];
		}
		if (p->coff[1] > p->coff[0])
			++n_ctg_seg; // this insertion contributes a new segment
	}
	free(fill);

	// sort the cut points of each segment by position
	for (j = 0; j < g->n_seg; ++j) {
		if (first[j+1] - first[j] > 1)
			radix_sort_split(&sp[first[j]], &sp[first[j+1]]);
	}

	// count the segments that will exist after cutting the old ones
	for (j = 0; j < g->n_seg; ++j) {
		int32_t grp = first[j], n_cut = 0;
		for (i = grp + 1; i <= first[j+1]; ++i) {
			if (i != first[j+1] && sp[grp].side>>1 == sp[i].side>>1) continue; // same position as the group head
			if (sp[grp].side>>1 != 0 && sp[grp].side>>1 != g->seg[j].len) // a cut at either end creates no new segment
				++n_cut;
			grp = i;
		}
		n_old_seg += n_cut + 1;
	}

	// cut the old segments and record in ins_side[] which vertices each insertion attaches to
	n_seg = n_old_seg + n_ctg_seg;
	GFA_CALLOC(seg, n_seg);
	GFA_CALLOC(ins_side, n_ins);
	GFA_MALLOC(oldcnt, g->n_seg);
	for (j = 0, k = 0; j < g->n_seg; ++j) {
		int32_t grp, l, off = 0, k0 = k;
		gfa_seg_t *old = &g->seg[j];
		gfa_seg_t *cur = &seg[k]; // piece under construction; its length is not known yet
		snprintf(buf, 15, "s%d", k + 1);
		cur->name = gfa_strdup(buf);
		cur->snid = old->snid, cur->soff = old->soff, cur->rank = old->rank;
		// walk the groups of cut points that share one position
		for (grp = first[j], i = grp + 1; i <= first[j+1]; ++i) {
			gfa_split_t *q0;
			if (i != first[j+1] && sp[i].side>>1 == sp[grp].side>>1) continue;
			q0 = &sp[grp];
			for (l = grp; l < i; ++l) {
				gfa_split_t *q = &sp[l];
				int32_t shift = q->end == 0? 32 : 0; // first end goes to the higher 32 bits
				int32_t side = q->side & 1;
				int32_t which = q->side>>1 == 0? 0 : side; // a cut at offset 0 creates no new piece, so stay on piece k
				ins_side[q->ins] |= (uint64_t)((uint32_t)(k + which) << 1 | (side^q->end)) << shift;
			}
			if (q0->side>>1 != 0 && q0->side>>1 != g->seg[j].len) { // interior cut: close this piece, open the next
				cur->len = (q0->side>>1) - off;
				GFA_MALLOC(cur->seq, cur->len + 1);
				memcpy(cur->seq, &old->seq[off], cur->len);
				cur->seq[cur->len] = 0;
				off += cur->len;
				cur = &seg[++k];
				snprintf(buf, 15, "s%d", k + 1);
				cur->name = gfa_strdup(buf);
				cur->snid = old->snid, cur->soff = old->soff + off, cur->rank = old->rank;
			}
			grp = i;
		}
		// the last piece takes whatever is left of the old sequence
		cur->len = old->len - off;
		GFA_MALLOC(cur->seq, cur->len + 1);
		memcpy(cur->seq, &old->seq[off], cur->len);
		cur->seq[cur->len] = 0;
		++k;
		oldcnt[j] = (uint64_t)k0 << 32 | (k - k0); // index of the first piece << 32 | number of pieces
		// chain consecutive pieces of the same old segment
		for (i = k0; i + 1 < k; ++i)
			create_first_arc(g, seg, (uint32_t)i<<1, (uint32_t)(i+1)<<1, old->rank);
	}
	assert(k == n_old_seg);
	free(first);
	free(sp);

	// redirect the arcs that existed on entry to the outermost pieces of their old segments
	for (ai = 0; ai < n_old_arc; ++ai) {
		gfa_arc_t *a = &g->arc[ai];
		uint32_t v = a->v_lv >> 32;
		uint32_t off = oldcnt[v>>1]>>32, cnt = (uint32_t)oldcnt[v>>1];
		if ((v&1) == 0) v = (off+cnt-1)<<1; // forward v: last piece
		else v = off<<1 | 1;                // reverse v: first piece
		a->v_lv = (uint64_t)v << 32 | seg[v>>1].len;
		off = oldcnt[a->w>>1]>>32, cnt = (uint32_t)oldcnt[a->w>>1];
		if ((a->w&1) == 0) a->w = off<<1;   // forward w: first piece
		else a->w = (off+cnt-1)<<1 | 1;     // reverse w: last piece
	}
	free(oldcnt);

	// create the segments carrying inserted contig sequence and link them in
	for (i = 0, k = n_old_seg; i < n_ins; ++i) {
		const gfa_ins_t *p = &ins[i];
		if (p->coff[0] < p->coff[1]) { // not a pure deletion
			gfa_seg_t *ns = &seg[k];
			int32_t c;
			snprintf(buf, 15, "s%d", k + 1);
			ns->name = gfa_strdup(buf);
			GFA_MALLOC(ns->seq, p->coff[1] - p->coff[0] + 1);
			for (c = 0; c < p->coff[1] - p->coff[0]; ++c)
				ns->seq[c] = seq[p->ctg][p->coff[0] + c];
			ns->seq[c] = 0;
			ns->len = c;
			ns->snid = gfa_sseq_add(g, name[p->ctg]);
			ns->soff = p->coff[0];
			ns->rank = g->max_rank + 1; // TODO: to deal with SN/SO/SR tags somewhere
			gfa_sseq_update(g, ns);
			create_first_arc(g, seg, ins_side[i]>>32, (uint32_t)k<<1, ns->rank);
			create_first_arc(g, seg, (uint32_t)k<<1, (uint32_t)ins_side[i], ns->rank);
			++k;
		} else { // a pure deletion: join the two cut points directly
			create_first_arc(g, seg, ins_side[i]>>32, (uint32_t)ins_side[i], g->max_rank + 1);
		}
	}
	free(ins_side);

	// swap the new segment array into *g and rebuild the arc index
	for (j = 0; j < g->n_seg; ++j) {
		gfa_seg_t *old = &g->seg[j];
		free(old->name);
		free(old->seq);
		free(old->aux.aux);
	}
	free(g->seg);
	g->seg = seg;
	g->n_seg = g->m_seg = n_seg;
	++g->max_rank;
	GFA_REALLOC(g->link_aux, g->m_arc);
	GFA_BZERO(&g->link_aux[n_old_arc], g->m_arc - n_old_arc);
	gfa_arc_sort(g);
	gfa_arc_index(g);
	gfa_fix_multi(g);
	// k = gfa_fix_symm(g); assert(k == 0); // for debugging; the graph should be symmetric
}
|
194
|
+
|
195
|
+
static int32_t gfa_ins_shrink_semi(const gfa_t *g, int32_t pen, uint32_t v, int32_t voff, int32_t coff, uint32_t vv, int32_t vend, int32_t cend, const char *seq)
|
196
|
+
{
|
197
|
+
int32_t i, j, l, dir, score, max, max_l;
|
198
|
+
if (cend == coff) return 0;
|
199
|
+
dir = cend > coff? +1 : -1;
|
200
|
+
for (i = coff, j = voff, l = max_l = 0, score = max = 0; i != cend; i += dir, j += dir) {
|
201
|
+
int32_t cg, vlen = g->seg[v>>1].len;
|
202
|
+
if (j == vlen || j == -1) break;
|
203
|
+
if (vv == v && j == vend) break;
|
204
|
+
++l;
|
205
|
+
cg = (v&1) == 0? g->seg[v>>1].seq[j] : gfa_comp_table[(uint8_t)g->seg[v>>1].seq[vlen - 1 - j]];
|
206
|
+
score += tolower(cg) == tolower(seq[i])? +1 : -pen;
|
207
|
+
if (score > max) max = score, max_l = l;
|
208
|
+
if (score < max - pen * pen) break; // X-drop
|
209
|
+
}
|
210
|
+
return max_l;
|
211
|
+
}
|
212
|
+
|
213
|
+
// Trim both ends of an insertion where the contig sequence matches the graph.
// The first end is advanced (voff[0] and coff[0] grow) by the length found when walking
// forwards; the second end is then pulled back (voff[1] and coff[1] shrink) by the length
// found when walking backwards from one base before it, using the already-updated first
// end as the stopping point. pen is the mismatch penalty passed to gfa_ins_shrink_semi.
// Returns the total number of bases trimmed from the two ends.
int gfa_ins_adj(const gfa_t *g, int pen, gfa_ins_t *ins, const char *seq)
{
	int32_t head, tail;

	head = gfa_ins_shrink_semi(g, pen, ins->v[0], ins->voff[0], ins->coff[0], ins->v[1], ins->voff[1], ins->coff[1], seq);
	ins->voff[0] += head;
	ins->coff[0] += head;

	tail = gfa_ins_shrink_semi(g, pen, ins->v[1], ins->voff[1] - 1, ins->coff[1] - 1, ins->v[0], ins->voff[0] - 1, ins->coff[0] - 1, seq);
	ins->voff[1] -= tail;
	ins->coff[1] -= tail;

	return head + tail;
}
|
222
|
+
|
223
|
+
static inline int check_multi(const gfa_t *g, const gfa_ins_t *ins)
|
224
|
+
{
|
225
|
+
if (ins->v[0] != ins->v[1] && ins->coff[1] - ins->coff[0] == 0) {
|
226
|
+
const gfa_seg_t *s[2];
|
227
|
+
uint32_t v[2];
|
228
|
+
s[0] = &g->seg[ins->v[0]>>1];
|
229
|
+
s[1] = &g->seg[ins->v[1]>>1];
|
230
|
+
if (ins->voff[0] != 0 && ins->voff[0] != s[0]->len) return 0;
|
231
|
+
if (ins->voff[1] != 0 && ins->voff[1] != s[1]->len) return 0;
|
232
|
+
v[0] = ins->voff[0] == 0? ins->v[0]^1 : ins->v[0];
|
233
|
+
v[1] = ins->voff[1] == 0? ins->v[1] : ins->v[1]^1;
|
234
|
+
if (gfa_find_arc(g, v[0], v[1]) >= 0) return 1;
|
235
|
+
return 0;
|
236
|
+
} else return 0;
|
237
|
+
}
|
238
|
+
|
239
|
+
// Remove insertions that cannot be added to the graph, compacting ins[] in place.
// An insertion is dropped when either end falls on a side that cannot be linked (the same
// condition gfa_augment() asserts against), or when check_multi() reports that it would
// duplicate an existing arc; multi-links may arise from inconsistency between graph
// chaining and WFA alignment. A warning is printed to stderr for each dropped insertion
// when gfa_verbose >= 2. Returns the number of insertions kept.
int32_t gfa_ins_filter(const gfa_t *g, int32_t n_ins, gfa_ins_t *ins)
{
	int32_t i, n_kept = 0;

	for (i = 0; i < n_ins; ++i) {
		gfa_ins_t *p = &ins[i];
		int32_t k;
		int impossible = 0;

		for (k = 0; k < 2 && !impossible; ++k) {
			uint32_t vlen = g->seg[p->v[k]>>1].len;
			uint32_t side = (p->v[k]&1? vlen - p->voff[k] : p->voff[k]) << 1 | ((p->v[k]&1) ^ k);
			if (side == (0<<1|0) || side == (vlen<<1|1))
				impossible = 1;
		}
		if (impossible || check_multi(g, p)) {
			if (gfa_verbose >= 2)
				fprintf(stderr, "[W::%s] %s between %c%s and %c%s derived from the %d-th query at %d-%d\n",
						__func__, impossible? "impossible insert" : "multi-link",
						"><"[p->v[0]&1], g->seg[p->v[0]>>1].name, "><"[p->v[1]&1], g->seg[p->v[1]>>1].name, p->ctg, p->coff[0], p->coff[1]);
			continue;
		}
		ins[n_kept++] = *p;
	}
	return n_kept;
}
|