ruby-minigraph 0.0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +62 -0
- data/ext/Rakefile +56 -0
- data/ext/cmappy/cmappy.c +7 -0
- data/ext/cmappy/cmappy.h +8 -0
- data/ext/minigraph/LICENSE.txt +23 -0
- data/ext/minigraph/Makefile +66 -0
- data/ext/minigraph/NEWS.md +317 -0
- data/ext/minigraph/README.md +207 -0
- data/ext/minigraph/algo.c +194 -0
- data/ext/minigraph/algo.h +33 -0
- data/ext/minigraph/asm-call.c +147 -0
- data/ext/minigraph/bseq.c +133 -0
- data/ext/minigraph/bseq.h +76 -0
- data/ext/minigraph/cal_cov.c +139 -0
- data/ext/minigraph/doc/example1.png +0 -0
- data/ext/minigraph/doc/example2.png +0 -0
- data/ext/minigraph/doc/examples.graffle +0 -0
- data/ext/minigraph/format.c +241 -0
- data/ext/minigraph/galign.c +140 -0
- data/ext/minigraph/gchain1.c +532 -0
- data/ext/minigraph/gcmisc.c +223 -0
- data/ext/minigraph/gfa-aug.c +260 -0
- data/ext/minigraph/gfa-base.c +526 -0
- data/ext/minigraph/gfa-bbl.c +372 -0
- data/ext/minigraph/gfa-ed.c +617 -0
- data/ext/minigraph/gfa-io.c +395 -0
- data/ext/minigraph/gfa-priv.h +154 -0
- data/ext/minigraph/gfa.h +166 -0
- data/ext/minigraph/ggen.c +182 -0
- data/ext/minigraph/ggen.h +21 -0
- data/ext/minigraph/ggsimple.c +570 -0
- data/ext/minigraph/gmap.c +211 -0
- data/ext/minigraph/index.c +230 -0
- data/ext/minigraph/kalloc.c +224 -0
- data/ext/minigraph/kalloc.h +82 -0
- data/ext/minigraph/kavl.h +414 -0
- data/ext/minigraph/kdq.h +134 -0
- data/ext/minigraph/ketopt.h +116 -0
- data/ext/minigraph/khashl.h +348 -0
- data/ext/minigraph/krmq.h +474 -0
- data/ext/minigraph/kseq.h +256 -0
- data/ext/minigraph/ksort.h +164 -0
- data/ext/minigraph/kstring.h +165 -0
- data/ext/minigraph/kthread.c +159 -0
- data/ext/minigraph/kthread.h +15 -0
- data/ext/minigraph/kvec-km.h +105 -0
- data/ext/minigraph/kvec.h +110 -0
- data/ext/minigraph/lchain.c +441 -0
- data/ext/minigraph/main.c +301 -0
- data/ext/minigraph/map-algo.c +500 -0
- data/ext/minigraph/mgpriv.h +128 -0
- data/ext/minigraph/minigraph.1 +359 -0
- data/ext/minigraph/minigraph.h +176 -0
- data/ext/minigraph/miniwfa.c +834 -0
- data/ext/minigraph/miniwfa.h +95 -0
- data/ext/minigraph/misc/mgutils.js +1451 -0
- data/ext/minigraph/misc.c +12 -0
- data/ext/minigraph/options.c +134 -0
- data/ext/minigraph/shortk.c +251 -0
- data/ext/minigraph/sketch.c +109 -0
- data/ext/minigraph/sys.c +147 -0
- data/ext/minigraph/sys.h +20 -0
- data/ext/minigraph/test/MT-chimp.fa +277 -0
- data/ext/minigraph/test/MT-human.fa +239 -0
- data/ext/minigraph/test/MT-orangA.fa +276 -0
- data/ext/minigraph/test/MT.gfa +19 -0
- data/ext/minigraph/tex/Makefile +13 -0
- data/ext/minigraph/tex/minigraph.bib +676 -0
- data/ext/minigraph/tex/minigraph.tex +986 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
- data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
- data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
- data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
- data/ext/minigraph/tex/plots/bedutils.js +367 -0
- data/ext/minigraph/tex/plots/chr-plot.js +130 -0
- data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
- data/ext/minigraph.patch +21 -0
- data/lib/minigraph/ffi/constants.rb +230 -0
- data/lib/minigraph/ffi/functions.rb +70 -0
- data/lib/minigraph/ffi/mappy.rb +8 -0
- data/lib/minigraph/ffi.rb +27 -0
- data/lib/minigraph/version.rb +5 -0
- data/lib/minigraph.rb +72 -0
- metadata +159 -0
@@ -0,0 +1,223 @@
|
|
1
|
+
#include <math.h>
|
2
|
+
#include <assert.h>
|
3
|
+
#include <string.h>
|
4
|
+
#include "mgpriv.h"
|
5
|
+
#include "kalloc.h"
|
6
|
+
|
7
|
+
// reorder gcs->a[] and gcs->lc[] such that they are in the same order as gcs->gc[]
|
8
|
+
void mg_gchain_restore_order(void *km, mg_gchains_t *gcs)
|
9
|
+
{
|
10
|
+
int32_t i, n_a, n_lc;
|
11
|
+
mg_llchain_t *lc;
|
12
|
+
mg128_t *a;
|
13
|
+
KMALLOC(km, lc, gcs->n_lc);
|
14
|
+
KMALLOC(km, a, gcs->n_a);
|
15
|
+
for (i = 0, n_a = n_lc = 0; i < gcs->n_gc; ++i) {
|
16
|
+
mg_gchain_t *gc = &gcs->gc[i];
|
17
|
+
assert(gc->cnt > 0);
|
18
|
+
memcpy(&lc[n_lc], &gcs->lc[gc->off], gc->cnt * sizeof(mg_llchain_t));
|
19
|
+
memcpy(&a[n_a], &gcs->a[gcs->lc[gc->off].off], gc->n_anchor * sizeof(mg128_t));
|
20
|
+
n_lc += gc->cnt, n_a += gc->n_anchor;
|
21
|
+
}
|
22
|
+
memcpy(gcs->lc, lc, gcs->n_lc * sizeof(mg_llchain_t));
|
23
|
+
memcpy(gcs->a, a, gcs->n_a * sizeof(mg128_t));
|
24
|
+
kfree(km, lc); kfree(km, a);
|
25
|
+
for (i = 0, n_lc = 0; i < gcs->n_gc; ++i) {
|
26
|
+
mg_gchain_t *gc = &gcs->gc[i];
|
27
|
+
gc->off = n_lc;
|
28
|
+
n_lc += gc->cnt;
|
29
|
+
}
|
30
|
+
for (i = 0, n_a = 0; i < gcs->n_lc; ++i) {
|
31
|
+
mg_llchain_t *lc = &gcs->lc[i];
|
32
|
+
lc->off = n_a;
|
33
|
+
n_a += lc->cnt;
|
34
|
+
}
|
35
|
+
}
|
36
|
+
|
37
|
+
// recompute gcs->gc[].{off,n_anchor} and gcs->lc[].off, ASSUMING they are properly ordered (see mg_gchain_restore_order)
|
38
|
+
void mg_gchain_restore_offset(mg_gchains_t *gcs)
|
39
|
+
{
|
40
|
+
int32_t i, j, n_a, n_lc;
|
41
|
+
for (i = 0, n_a = n_lc = 0; i < gcs->n_gc; ++i) {
|
42
|
+
mg_gchain_t *gc = &gcs->gc[i];
|
43
|
+
gc->off = n_lc;
|
44
|
+
for (j = 0, gc->n_anchor = 0; j < gc->cnt; ++j) {
|
45
|
+
mg_llchain_t *lc = &gcs->lc[n_lc + j];
|
46
|
+
lc->off = n_a;
|
47
|
+
n_a += lc->cnt;
|
48
|
+
gc->n_anchor += lc->cnt;
|
49
|
+
}
|
50
|
+
n_lc += gc->cnt;
|
51
|
+
}
|
52
|
+
assert(n_lc == gcs->n_lc && n_a == gcs->n_a);
|
53
|
+
}
|
54
|
+
|
55
|
+
// sort chains by score
|
56
|
+
void mg_gchain_sort_by_score(void *km, mg_gchains_t *gcs)
|
57
|
+
{
|
58
|
+
mg128_t *z;
|
59
|
+
mg_gchain_t *gc;
|
60
|
+
int32_t i;
|
61
|
+
KMALLOC(km, z, gcs->n_gc);
|
62
|
+
KMALLOC(km, gc, gcs->n_gc);
|
63
|
+
for (i = 0; i < gcs->n_gc; ++i)
|
64
|
+
z[i].x = (uint64_t)gcs->gc[i].score << 32 | gcs->gc[i].hash, z[i].y = i;
|
65
|
+
radix_sort_128x(z, z + gcs->n_gc);
|
66
|
+
for (i = gcs->n_gc - 1; i >= 0; --i)
|
67
|
+
gc[gcs->n_gc - 1 - i] = gcs->gc[z[i].y];
|
68
|
+
memcpy(gcs->gc, gc, gcs->n_gc * sizeof(mg_gchain_t));
|
69
|
+
kfree(km, z); kfree(km, gc);
|
70
|
+
mg_gchain_restore_order(km, gcs); // this put gcs in the proper order
|
71
|
+
}
|
72
|
+
|
73
|
+
// set r[].{id,parent,subsc}, ASSUMING r[] is sorted by score
|
74
|
+
void mg_gchain_set_parent(void *km, float mask_level, int n, mg_gchain_t *r, int sub_diff, int hard_mask_level)
|
75
|
+
{
|
76
|
+
int i, j, k, *w;
|
77
|
+
uint64_t *cov;
|
78
|
+
if (n <= 0) return;
|
79
|
+
for (i = 0; i < n; ++i) r[i].id = i;
|
80
|
+
cov = (uint64_t*)kmalloc(km, n * sizeof(uint64_t));
|
81
|
+
w = (int*)kmalloc(km, n * sizeof(int));
|
82
|
+
w[0] = 0, r[0].parent = 0;
|
83
|
+
for (i = 1, k = 1; i < n; ++i) {
|
84
|
+
mg_gchain_t *ri = &r[i];
|
85
|
+
int si = ri->qs, ei = ri->qe, n_cov = 0, uncov_len = 0;
|
86
|
+
if (hard_mask_level) goto skip_uncov;
|
87
|
+
for (j = 0; j < k; ++j) { // traverse existing primary hits to find overlapping hits
|
88
|
+
mg_gchain_t *rp = &r[w[j]];
|
89
|
+
int sj = rp->qs, ej = rp->qe;
|
90
|
+
if (ej <= si || sj >= ei) continue;
|
91
|
+
if (sj < si) sj = si;
|
92
|
+
if (ej > ei) ej = ei;
|
93
|
+
cov[n_cov++] = (uint64_t)sj<<32 | ej;
|
94
|
+
}
|
95
|
+
if (n_cov == 0) {
|
96
|
+
goto set_parent_test; // no overlapping primary hits; then i is a new primary hit
|
97
|
+
} else if (n_cov > 0) { // there are overlapping primary hits; find the length not covered by existing primary hits
|
98
|
+
int j, x = si;
|
99
|
+
radix_sort_gfa64(cov, cov + n_cov);
|
100
|
+
for (j = 0; j < n_cov; ++j) {
|
101
|
+
if ((int)(cov[j]>>32) > x) uncov_len += (cov[j]>>32) - x;
|
102
|
+
x = (int32_t)cov[j] > x? (int32_t)cov[j] : x;
|
103
|
+
}
|
104
|
+
if (ei > x) uncov_len += ei - x;
|
105
|
+
}
|
106
|
+
skip_uncov:
|
107
|
+
for (j = 0; j < k; ++j) { // traverse existing primary hits again
|
108
|
+
mg_gchain_t *rp = &r[w[j]];
|
109
|
+
int sj = rp->qs, ej = rp->qe, min, max, ol;
|
110
|
+
if (ej <= si || sj >= ei) continue; // no overlap
|
111
|
+
min = ej - sj < ei - si? ej - sj : ei - si;
|
112
|
+
max = ej - sj > ei - si? ej - sj : ei - si;
|
113
|
+
ol = si < sj? (ei < sj? 0 : ei < ej? ei - sj : ej - sj) : (ej < si? 0 : ej < ei? ej - si : ei - si); // overlap length; TODO: this can be simplified
|
114
|
+
if ((float)ol / min - (float)uncov_len / max > mask_level) {
|
115
|
+
int cnt_sub = 0;
|
116
|
+
ri->parent = rp->parent;
|
117
|
+
rp->subsc = rp->subsc > ri->score? rp->subsc : ri->score;
|
118
|
+
if (ri->cnt >= rp->cnt) cnt_sub = 1;
|
119
|
+
if (cnt_sub) ++rp->n_sub;
|
120
|
+
break;
|
121
|
+
}
|
122
|
+
}
|
123
|
+
set_parent_test:
|
124
|
+
if (j == k) w[k++] = i, ri->parent = i, ri->n_sub = 0;
|
125
|
+
}
|
126
|
+
kfree(km, cov);
|
127
|
+
kfree(km, w);
|
128
|
+
}
|
129
|
+
|
130
|
+
// set r[].flt, i.e. mark weak suboptimal chains as filtered
|
131
|
+
int mg_gchain_flt_sub(float pri_ratio, int min_diff, int best_n, int n, mg_gchain_t *r)
|
132
|
+
{
|
133
|
+
if (pri_ratio > 0.0f && n > 0) {
|
134
|
+
int i, k, n_2nd = 0;
|
135
|
+
for (i = k = 0; i < n; ++i) {
|
136
|
+
int p = r[i].parent;
|
137
|
+
if (p == i) { // primary
|
138
|
+
r[i].flt = 0, ++k;
|
139
|
+
} else if ((r[i].score >= r[p].score * pri_ratio || r[i].score + min_diff >= r[p].score) && n_2nd < best_n) {
|
140
|
+
if (!(r[i].qs == r[p].qs && r[i].qe == r[p].qe && r[i].ps == r[p].ps && r[i].pe == r[p].pe)) // not identical hits; TODO: check path as well
|
141
|
+
r[i].flt = 0, ++n_2nd, ++k;
|
142
|
+
else r[i].flt = 1;
|
143
|
+
} else r[i].flt = 1;
|
144
|
+
}
|
145
|
+
return k;
|
146
|
+
}
|
147
|
+
return n;
|
148
|
+
}
|
149
|
+
|
150
|
+
// hard drop filtered chains, ASSUMING gcs is properly ordered
|
151
|
+
void mg_gchain_drop_flt(void *km, mg_gchains_t *gcs)
|
152
|
+
{
|
153
|
+
int32_t i, n_gc, n_lc, n_a, n_lc0, n_a0, *o2n;
|
154
|
+
if (gcs->n_gc == 0) return;
|
155
|
+
KMALLOC(km, o2n, gcs->n_gc);
|
156
|
+
for (i = 0, n_gc = 0; i < gcs->n_gc; ++i) {
|
157
|
+
mg_gchain_t *r = &gcs->gc[i];
|
158
|
+
o2n[i] = -1;
|
159
|
+
if (r->flt || r->cnt == 0) {
|
160
|
+
kfree(gcs->km, r->p);
|
161
|
+
continue;
|
162
|
+
}
|
163
|
+
o2n[i] = n_gc++;
|
164
|
+
}
|
165
|
+
n_gc = n_lc = n_a = 0;
|
166
|
+
n_lc0 = n_a0 = 0;
|
167
|
+
for (i = 0; i < gcs->n_gc; ++i) {
|
168
|
+
mg_gchain_t *r = &gcs->gc[i];
|
169
|
+
if (o2n[i] >= 0) {
|
170
|
+
memmove(&gcs->a[n_a], &gcs->a[n_a0], r->n_anchor * sizeof(mg128_t));
|
171
|
+
memmove(&gcs->lc[n_lc], &gcs->lc[n_lc0], r->cnt * sizeof(mg_llchain_t));
|
172
|
+
gcs->gc[n_gc] = *r;
|
173
|
+
gcs->gc[n_gc].id = n_gc;
|
174
|
+
gcs->gc[n_gc].parent = o2n[gcs->gc[n_gc].parent];
|
175
|
+
++n_gc, n_lc += r->cnt, n_a += r->n_anchor;
|
176
|
+
}
|
177
|
+
n_lc0 += r->cnt, n_a0 += r->n_anchor;
|
178
|
+
}
|
179
|
+
assert(n_lc0 == gcs->n_lc && n_a0 == gcs->n_a);
|
180
|
+
kfree(km, o2n);
|
181
|
+
gcs->n_gc = n_gc, gcs->n_lc = n_lc, gcs->n_a = n_a;
|
182
|
+
if (n_a != n_a0) {
|
183
|
+
KREALLOC(gcs->km, gcs->a, gcs->n_a);
|
184
|
+
KREALLOC(gcs->km, gcs->lc, gcs->n_lc);
|
185
|
+
KREALLOC(gcs->km, gcs->gc, gcs->n_gc);
|
186
|
+
}
|
187
|
+
mg_gchain_restore_offset(gcs);
|
188
|
+
}
|
189
|
+
|
190
|
+
// estimate mapping quality
|
191
|
+
void mg_gchain_set_mapq(void *km, mg_gchains_t *gcs, int qlen, int max_mini, int min_gc_score)
|
192
|
+
{
|
193
|
+
static const float q_coef = 40.0f;
|
194
|
+
int64_t sum_sc = 0;
|
195
|
+
float uniq_ratio, r_sc, r_cnt;
|
196
|
+
int i, t_sc, t_cnt;
|
197
|
+
if (gcs == 0 || gcs->n_gc == 0) return;
|
198
|
+
t_sc = qlen < 100? qlen : 100;
|
199
|
+
t_cnt = max_mini < 10? max_mini : 10;
|
200
|
+
if (t_cnt < 5) t_cnt = 5;
|
201
|
+
r_sc = 1.0 / t_sc;
|
202
|
+
r_cnt = 1.0 / t_cnt;
|
203
|
+
for (i = 0; i < gcs->n_gc; ++i)
|
204
|
+
if (gcs->gc[i].parent == gcs->gc[i].id)
|
205
|
+
sum_sc += gcs->gc[i].score;
|
206
|
+
uniq_ratio = (float)sum_sc / (sum_sc + gcs->rep_len);
|
207
|
+
for (i = 0; i < gcs->n_gc; ++i) {
|
208
|
+
mg_gchain_t *r = &gcs->gc[i];
|
209
|
+
if (r->parent == r->id) {
|
210
|
+
int mapq, subsc;
|
211
|
+
float pen_s1 = (r->score > t_sc? 1.0f : r->score * r_sc) * uniq_ratio;
|
212
|
+
float x, pen_cm = r->n_anchor > t_cnt? 1.0f : r->n_anchor * r_cnt;
|
213
|
+
pen_cm = pen_s1 < pen_cm? pen_s1 : pen_cm;
|
214
|
+
subsc = r->subsc > min_gc_score? r->subsc : min_gc_score;
|
215
|
+
x = (float)subsc / r->score;
|
216
|
+
mapq = (int)(pen_cm * q_coef * (1.0f - x) * logf(r->score));
|
217
|
+
mapq -= (int)(4.343f * logf(r->n_sub + 1) + .499f);
|
218
|
+
mapq = mapq > 0? mapq : 0;
|
219
|
+
if (r->score > subsc && mapq == 0) mapq = 1;
|
220
|
+
r->mapq = mapq < 60? mapq : 60;
|
221
|
+
} else r->mapq = 0;
|
222
|
+
}
|
223
|
+
}
|
@@ -0,0 +1,260 @@
|
|
1
|
+
#include <assert.h>
|
2
|
+
#include <ctype.h>
|
3
|
+
#include "gfa-priv.h"
|
4
|
+
#include "ksort.h"
|
5
|
+
|
6
|
+
// One end of an insert, attached to the segment it touches.
typedef struct {
	uint32_t side;   // offset on the segment in the upper 31 bits, a side flag in bit 0; also the sort key
	uint32_t ins:31; // index of the insert this end belongs to
	uint32_t end:1;  // which end of the insert: 0 or 1
} gfa_split_t;
|
10
|
+
|
11
|
+
#define split_key(p) ((p).side)
|
12
|
+
KRADIX_SORT_INIT(split, gfa_split_t, split_key, 4)
|
13
|
+
|
14
|
+
// Append one directed arc v->w to g->arc[], growing the array when it is full.
// seg[] supplies the length stored in the low 32 bits of v_lv; overlaps are set to 0,
// the arc is marked as not deleted, and comp is set to 1 when is_comp is non-zero.
static inline void create_first_arc_semi(gfa_t *g, const gfa_seg_t *seg, uint32_t v, uint32_t w, int32_t rank, uint64_t link_id, int is_comp)
{
	gfa_arc_t *arc;

	if (g->n_arc == g->m_arc) {
		GFA_EXPAND(g->arc, g->m_arc);
	}
	arc = g->arc + g->n_arc;
	++g->n_arc;

	arc->v_lv = (uint64_t)v<<32 | seg[v>>1].len;
	arc->w = w;
	arc->rank = rank;
	arc->ow = 0;
	arc->ov = 0;
	arc->link_id = link_id;
	arc->del = 0;
	arc->comp = is_comp != 0;
}
|
27
|
+
|
28
|
+
// Add the arc v->w together with its complement (w^1)->(v^1). Both share one link_id,
// taken from g->n_arc before either arc is appended; the second one is flagged comp.
static inline void create_first_arc(gfa_t *g, const gfa_seg_t *seg, uint32_t v, uint32_t w, int32_t rank)
{
	const uint64_t shared_id = g->n_arc;

	create_first_arc_semi(g, seg, v, w, rank, shared_id, 0);
	create_first_arc_semi(g, seg, w ^ 1, v ^ 1, rank, shared_id, 1);
}
|
34
|
+
|
35
|
+
// Augment graph g with n_ins inserts.
//   ins[]          inserts; each has two ends (v[k], voff[k]) on existing segments and a
//                  contig interval [coff[0], coff[1]) on contig ctg
//   n_ctg          number of contigs; the function returns at once when n_ins or n_ctg <= 0
//   name[], seq[]  contig names and sequences, indexed by ins[].ctg
// Existing segments are cut at every insert end that lies strictly inside them, the pieces
// are chained with new arcs, existing arcs are moved onto the outermost pieces, and each
// insert becomes either a new segment with two arcs (non-empty contig interval) or a single
// arc (empty interval, i.e. a pure deletion). All segments are renamed "s<index+1>", the
// old segment array is freed, g->max_rank is incremented, and arcs are re-sorted and re-indexed.
void gfa_augment(gfa_t *g, int32_t n_ins, const gfa_ins_t *ins, int32_t n_ctg, const char *const* name, const char *const* seq)
{
	int32_t i, j, k, *n_split, *sp_off, n_ctg_seg, n_old_seg, n_seg;
	gfa_split_t *sp;
	gfa_seg_t *new_seg;
	char label[16];
	uint64_t a_idx, n_old_arc = g->n_arc, *ends, *remap;

	if (n_ins <= 0 || n_ctg <= 0) return;

	// count insert ends per segment and turn the counts into offsets into sp[]
	GFA_CALLOC(n_split, g->n_seg);
	for (i = 0; i < n_ins; ++i) {
		++n_split[ins[i].v[0]>>1];
		++n_split[ins[i].v[1]>>1];
	}
	GFA_MALLOC(sp_off, g->n_seg + 1);
	sp_off[0] = 0;
	for (j = 1; j <= g->n_seg; ++j)
		sp_off[j] = sp_off[j-1] + n_split[j-1];

	// fill sp[], grouped by segment; n_split[] is reused as a per-segment fill cursor
	GFA_MALLOC(sp, sp_off[g->n_seg]);
	GFA_BZERO(n_split, g->n_seg);
	n_ctg_seg = 0;
	for (i = 0; i < n_ins; ++i) {
		const gfa_ins_t *p = &ins[i];
		for (k = 0; k < 2; ++k) {
			uint32_t vlen = g->seg[p->v[k]>>1].len;
			int32_t *cursor = &n_split[p->v[k]>>1];
			gfa_split_t *q = &sp[sp_off[p->v[k]>>1] + *cursor];
			q->ins = i;
			q->end = k;
			q->side = (p->v[k]&1? vlen - p->voff[k] : p->voff[k]) << 1 | ((p->v[k]&1) ^ k);
			assert(q->side != (0<<1|0) && q->side != (vlen<<1|1)); // not possible to link such sides
			++*cursor;
		}
		if (p->coff[1] > p->coff[0]) // a non-empty contig interval yields a new segment
			++n_ctg_seg;
	}
	free(n_split);

	// sort the splits of each segment by side
	for (j = 0; j < g->n_seg; ++j) {
		if (sp_off[j+1] - sp_off[j] > 1)
			radix_sort_split(&sp[sp_off[j]], &sp[sp_off[j+1]]);
	}

	// count the pieces the old segments fall into
	n_old_seg = 0;
	for (j = 0; j < g->n_seg; ++j) {
		int32_t i0, n_cut = 0;
		for (i0 = sp_off[j], i = i0 + 1; i <= sp_off[j+1]; ++i) {
			if (i != sp_off[j+1] && sp[i0].side>>1 == sp[i].side>>1) continue; // same cut position
			if (sp[i0].side>>1 != 0 && sp[i0].side>>1 != g->seg[j].len) // a cut at either end creates no piece
				++n_cut;
			i0 = i;
		}
		n_old_seg += n_cut + 1;
	}

	// split the old segments and record, per insert, the two vertices it attaches to
	n_seg = n_old_seg + n_ctg_seg;
	GFA_CALLOC(new_seg, n_seg);
	GFA_CALLOC(ends, n_ins);
	GFA_MALLOC(remap, g->n_seg);
	for (j = 0, k = 0; j < g->n_seg; ++j) {
		int32_t i0, l, done = 0, k0 = k;
		gfa_seg_t *src = &g->seg[j];
		gfa_seg_t *dst = &new_seg[k]; // a placeholder until its length is known
		// start the first piece
		snprintf(label, 15, "s%d", k + 1);
		dst->name = gfa_strdup(label);
		dst->snid = src->snid, dst->soff = src->soff, dst->rank = src->rank;
		// walk over groups of splits that share a cut position
		for (i0 = sp_off[j], i = i0 + 1; i <= sp_off[j+1]; ++i) {
			gfa_split_t *head;
			if (i != sp_off[j+1] && sp[i].side>>1 == sp[i0].side>>1) continue;
			head = &sp[i0];
			for (l = i0; l < i; ++l) {
				gfa_split_t *q = &sp[l];
				int32_t shift = q->end == 0? 32 : 0; // first end on the higher 32 bits
				int32_t side = q->side & 1;
				int32_t which = q->side>>1 == 0? 0 : side; // no new piece is created for a cut at position 0
				ends[q->ins] |= (uint64_t)((uint32_t)(k + which) << 1 | (side^q->end)) << shift;
			}
			if (head->side>>1 != 0 && head->side>>1 != g->seg[j].len) { // close this piece, open the next
				dst->len = (head->side>>1) - done;
				GFA_MALLOC(dst->seq, dst->len + 1);
				memcpy(dst->seq, &src->seq[done], dst->len);
				dst->seq[dst->len] = 0;
				done += dst->len;
				dst = &new_seg[++k];
				snprintf(label, 15, "s%d", k + 1);
				dst->name = gfa_strdup(label);
				dst->snid = src->snid, dst->soff = src->soff + done, dst->rank = src->rank;
			}
			i0 = i;
		}
		// the last piece takes whatever is left of the old segment
		dst->len = src->len - done;
		GFA_MALLOC(dst->seq, dst->len + 1);
		memcpy(dst->seq, &src->seq[done], dst->len);
		dst->seq[dst->len] = 0;
		++k;
		remap[j] = (uint64_t)k0 << 32 | (k - k0); // first new index and number of pieces
		// link consecutive pieces
		for (i = 0; i < k - k0 - 1; ++i)
			create_first_arc(g, new_seg, (uint32_t)(k0+i)<<1, (uint32_t)(k0+i+1)<<1, src->rank);
	}
	assert(k == n_old_seg);
	free(sp_off);
	free(sp);

	// move the pre-existing arcs onto the outermost pieces of their segments
	for (a_idx = 0; a_idx < n_old_arc; ++a_idx) {
		gfa_arc_t *a = &g->arc[a_idx];
		uint32_t v = a->v_lv >> 32;
		uint32_t base = remap[v>>1]>>32, n_piece = (uint32_t)remap[v>>1];
		if ((v&1) == 0) v = (base+n_piece-1)<<1;
		else v = base<<1 | 1;
		a->v_lv = (uint64_t)v << 32 | new_seg[v>>1].len;
		base = remap[a->w>>1]>>32, n_piece = (uint32_t)remap[a->w>>1];
		a->w = (a->w&1) == 0? base<<1 : (base+n_piece-1)<<1 | 1;
	}
	free(remap);

	// create the inserted segments and their arcs
	for (i = 0, k = n_old_seg; i < n_ins; ++i) {
		const gfa_ins_t *p = &ins[i];
		gfa_seg_t *fresh;
		if (!(p->coff[0] < p->coff[1])) { // a pure deletion: one arc between the two ends
			create_first_arc(g, new_seg, ends[i]>>32, (uint32_t)ends[i], g->max_rank + 1);
			continue;
		}
		fresh = &new_seg[k];
		snprintf(label, 15, "s%d", k + 1);
		fresh->name = gfa_strdup(label);
		GFA_MALLOC(fresh->seq, p->coff[1] - p->coff[0] + 1);
		for (j = 0; j < p->coff[1] - p->coff[0]; ++j)
			fresh->seq[j] = seq[p->ctg][p->coff[0] + j];
		fresh->seq[j] = 0;
		fresh->len = j;
		fresh->snid = gfa_sseq_add(g, name[p->ctg]);
		fresh->soff = p->coff[0];
		fresh->rank = g->max_rank + 1; // TODO: to deal with SN/SO/SR tags somewhere
		gfa_sseq_update(g, fresh);
		create_first_arc(g, new_seg, ends[i]>>32, (uint32_t)k<<1, fresh->rank);
		create_first_arc(g, new_seg, (uint32_t)k<<1, (uint32_t)ends[i], fresh->rank);
		++k;
	}
	free(ends);

	// swap the new segment array into *g
	for (j = 0; j < g->n_seg; ++j) {
		free(g->seg[j].name);
		free(g->seg[j].seq);
		free(g->seg[j].aux.aux);
	}
	free(g->seg);
	g->seg = new_seg, g->n_seg = g->m_seg = n_seg;
	++g->max_rank;
	GFA_REALLOC(g->link_aux, g->m_arc);
	GFA_BZERO(&g->link_aux[n_old_arc], g->m_arc - n_old_arc);
	gfa_arc_sort(g);
	gfa_arc_index(g);
	gfa_fix_multi(g);
	// k = gfa_fix_symm(g); assert(k == 0); // for debugging; the graph should be symmetric
}
|
194
|
+
|
195
|
+
static int32_t gfa_ins_shrink_semi(const gfa_t *g, int32_t pen, uint32_t v, int32_t voff, int32_t coff, uint32_t vv, int32_t vend, int32_t cend, const char *seq)
|
196
|
+
{
|
197
|
+
int32_t i, j, l, dir, score, max, max_l;
|
198
|
+
if (cend == coff) return 0;
|
199
|
+
dir = cend > coff? +1 : -1;
|
200
|
+
for (i = coff, j = voff, l = max_l = 0, score = max = 0; i != cend; i += dir, j += dir) {
|
201
|
+
int32_t cg, vlen = g->seg[v>>1].len;
|
202
|
+
if (j == vlen || j == -1) break;
|
203
|
+
if (vv == v && j == vend) break;
|
204
|
+
++l;
|
205
|
+
cg = (v&1) == 0? g->seg[v>>1].seq[j] : gfa_comp_table[(uint8_t)g->seg[v>>1].seq[vlen - 1 - j]];
|
206
|
+
score += tolower(cg) == tolower(seq[i])? +1 : -pen;
|
207
|
+
if (score > max) max = score, max_l = l;
|
208
|
+
if (score < max - pen * pen) break; // X-drop
|
209
|
+
}
|
210
|
+
return max_l;
|
211
|
+
}
|
212
|
+
|
213
|
+
// Shrink an insert from both ends with gfa_ins_shrink_semi().
// The first end (voff[0], coff[0]) is advanced by the length found walking forward; the
// second end (voff[1], coff[1]) is then pulled back by the length found walking backward
// from the last base, bounded by the already-updated first end.
// Returns the total number of bases removed from the insert.
int gfa_ins_adj(const gfa_t *g, int pen, gfa_ins_t *ins, const char *seq)
{
	int32_t head, tail;

	head = gfa_ins_shrink_semi(g, pen, ins->v[0], ins->voff[0], ins->coff[0], ins->v[1], ins->voff[1], ins->coff[1], seq);
	ins->voff[0] += head;
	ins->coff[0] += head;

	tail = gfa_ins_shrink_semi(g, pen, ins->v[1], ins->voff[1] - 1, ins->coff[1] - 1, ins->v[0], ins->voff[0] - 1, ins->coff[0] - 1, seq);
	ins->voff[1] -= tail;
	ins->coff[1] -= tail;

	return head + tail;
}
|
222
|
+
|
223
|
+
static inline int check_multi(const gfa_t *g, const gfa_ins_t *ins)
|
224
|
+
{
|
225
|
+
if (ins->v[0] != ins->v[1] && ins->coff[1] - ins->coff[0] == 0) {
|
226
|
+
const gfa_seg_t *s[2];
|
227
|
+
uint32_t v[2];
|
228
|
+
s[0] = &g->seg[ins->v[0]>>1];
|
229
|
+
s[1] = &g->seg[ins->v[1]>>1];
|
230
|
+
if (ins->voff[0] != 0 && ins->voff[0] != s[0]->len) return 0;
|
231
|
+
if (ins->voff[1] != 0 && ins->voff[1] != s[1]->len) return 0;
|
232
|
+
v[0] = ins->voff[0] == 0? ins->v[0]^1 : ins->v[0];
|
233
|
+
v[1] = ins->voff[1] == 0? ins->v[1] : ins->v[1]^1;
|
234
|
+
if (gfa_find_arc(g, v[0], v[1]) >= 0) return 1;
|
235
|
+
return 0;
|
236
|
+
} else return 0;
|
237
|
+
}
|
238
|
+
|
239
|
+
// Compact ins[] in place, dropping inserts that cannot be applied: those with an end on a
// side that cannot be linked (the same condition gfa_augment() asserts against) and those
// for which check_multi() reports an already existing link. Each dropped insert is reported
// on stderr when gfa_verbose >= 2. Returns the number of inserts kept.
int32_t gfa_ins_filter(const gfa_t *g, int32_t n_ins, gfa_ins_t *ins)
{
	int32_t i, n_kept = 0;

	for (i = 0; i < n_ins; ++i) {
		gfa_ins_t *p = &ins[i];
		int32_t k = 0;

		// k stops short of 2 at the first end that sits on an unlinkable side
		while (k < 2) {
			uint32_t vlen = g->seg[p->v[k]>>1].len;
			uint32_t side = (p->v[k]&1? vlen - p->voff[k] : p->voff[k]) << 1 | ((p->v[k]&1) ^ k);
			if (side == (0<<1|0) || side == (vlen<<1|1))
				break;
			++k;
		}
		if (k == 2 && !check_multi(g, p)) {
			ins[n_kept++] = ins[i];
			continue;
		}
		// multi-link may happen due to inconsistency between graph chaining and WFA alignment
		if (gfa_verbose >= 2)
			fprintf(stderr, "[W::%s] %s between %c%s and %c%s derived from the %d-th query at %d-%d\n",
					__func__, k != 2? "impossible insert" : "multi-link",
					"><"[p->v[0]&1], g->seg[p->v[0]>>1].name, "><"[p->v[1]&1], g->seg[p->v[1]>>1].name, p->ctg, p->coff[0], p->coff[1]);
	}
	return n_kept;
}
|