ruby-minigraph 0.0.20.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +62 -0
- data/ext/Rakefile +56 -0
- data/ext/cmappy/cmappy.c +7 -0
- data/ext/cmappy/cmappy.h +8 -0
- data/ext/minigraph/LICENSE.txt +23 -0
- data/ext/minigraph/Makefile +66 -0
- data/ext/minigraph/NEWS.md +317 -0
- data/ext/minigraph/README.md +207 -0
- data/ext/minigraph/algo.c +194 -0
- data/ext/minigraph/algo.h +33 -0
- data/ext/minigraph/asm-call.c +147 -0
- data/ext/minigraph/bseq.c +133 -0
- data/ext/minigraph/bseq.h +76 -0
- data/ext/minigraph/cal_cov.c +139 -0
- data/ext/minigraph/doc/example1.png +0 -0
- data/ext/minigraph/doc/example2.png +0 -0
- data/ext/minigraph/doc/examples.graffle +0 -0
- data/ext/minigraph/format.c +241 -0
- data/ext/minigraph/galign.c +140 -0
- data/ext/minigraph/gchain1.c +532 -0
- data/ext/minigraph/gcmisc.c +223 -0
- data/ext/minigraph/gfa-aug.c +260 -0
- data/ext/minigraph/gfa-base.c +526 -0
- data/ext/minigraph/gfa-bbl.c +372 -0
- data/ext/minigraph/gfa-ed.c +617 -0
- data/ext/minigraph/gfa-io.c +395 -0
- data/ext/minigraph/gfa-priv.h +154 -0
- data/ext/minigraph/gfa.h +166 -0
- data/ext/minigraph/ggen.c +182 -0
- data/ext/minigraph/ggen.h +21 -0
- data/ext/minigraph/ggsimple.c +570 -0
- data/ext/minigraph/gmap.c +211 -0
- data/ext/minigraph/index.c +230 -0
- data/ext/minigraph/kalloc.c +224 -0
- data/ext/minigraph/kalloc.h +82 -0
- data/ext/minigraph/kavl.h +414 -0
- data/ext/minigraph/kdq.h +134 -0
- data/ext/minigraph/ketopt.h +116 -0
- data/ext/minigraph/khashl.h +348 -0
- data/ext/minigraph/krmq.h +474 -0
- data/ext/minigraph/kseq.h +256 -0
- data/ext/minigraph/ksort.h +164 -0
- data/ext/minigraph/kstring.h +165 -0
- data/ext/minigraph/kthread.c +159 -0
- data/ext/minigraph/kthread.h +15 -0
- data/ext/minigraph/kvec-km.h +105 -0
- data/ext/minigraph/kvec.h +110 -0
- data/ext/minigraph/lchain.c +441 -0
- data/ext/minigraph/main.c +301 -0
- data/ext/minigraph/map-algo.c +500 -0
- data/ext/minigraph/mgpriv.h +128 -0
- data/ext/minigraph/minigraph.1 +359 -0
- data/ext/minigraph/minigraph.h +176 -0
- data/ext/minigraph/miniwfa.c +834 -0
- data/ext/minigraph/miniwfa.h +95 -0
- data/ext/minigraph/misc/mgutils.js +1451 -0
- data/ext/minigraph/misc.c +12 -0
- data/ext/minigraph/options.c +134 -0
- data/ext/minigraph/shortk.c +251 -0
- data/ext/minigraph/sketch.c +109 -0
- data/ext/minigraph/sys.c +147 -0
- data/ext/minigraph/sys.h +20 -0
- data/ext/minigraph/test/MT-chimp.fa +277 -0
- data/ext/minigraph/test/MT-human.fa +239 -0
- data/ext/minigraph/test/MT-orangA.fa +276 -0
- data/ext/minigraph/test/MT.gfa +19 -0
- data/ext/minigraph/tex/Makefile +13 -0
- data/ext/minigraph/tex/minigraph.bib +676 -0
- data/ext/minigraph/tex/minigraph.tex +986 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
- data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
- data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
- data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
- data/ext/minigraph/tex/plots/bedutils.js +367 -0
- data/ext/minigraph/tex/plots/chr-plot.js +130 -0
- data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
- data/ext/minigraph.patch +21 -0
- data/lib/minigraph/ffi/constants.rb +230 -0
- data/lib/minigraph/ffi/functions.rb +70 -0
- data/lib/minigraph/ffi/mappy.rb +8 -0
- data/lib/minigraph/ffi.rb +27 -0
- data/lib/minigraph/version.rb +5 -0
- data/lib/minigraph.rb +72 -0
- metadata +159 -0
@@ -0,0 +1,526 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <assert.h>
|
3
|
+
#include <ctype.h>
|
4
|
+
#include "gfa-priv.h"
|
5
|
+
#include "kstring.h"
|
6
|
+
|
7
|
+
#include "khashl.h"
|
8
|
+
/* File-local string -> uint32_t hash map; both g->h_names and g->h_snames are of this type. */
KHASHL_MAP_INIT(KH_LOCAL, h_s2i_t, h_s2i, kh_cstr_t, uint32_t, kh_hash_str, kh_eq_str)

#include "ksort.h"

/* radix_sort_arc(): sorts gfa_arc_t records by their packed v_lv field. */
#define gfa_arc_key(a) ((a).v_lv)
KRADIX_SORT_INIT(arc, gfa_arc_t, gfa_arc_key, 8)

/* radix_sort_gfa64(): sorts plain 64-bit integers by value. */
#define generic_key(x) (x)
KRADIX_SORT_INIT(gfa64, uint64_t, generic_key, 8)

/* Verbosity level; the warnings in this file are emitted when it is >= 2. */
int gfa_verbose = 2;
|
18
|
+
|
19
|
+
gfa_t *gfa_init(void)
|
20
|
+
{
|
21
|
+
gfa_t *g;
|
22
|
+
g = (gfa_t*)calloc(1, sizeof(gfa_t));
|
23
|
+
g->h_names = h_s2i_init();
|
24
|
+
g->h_snames = h_s2i_init();
|
25
|
+
return g;
|
26
|
+
}
|
27
|
+
|
28
|
+
/*
 * Release a graph and everything it owns: both name tables, per-segment
 * name/sequence/aux/unitig data, stable-sequence names, per-link aux data
 * and the top-level arrays. Passing NULL is a no-op.
 */
void gfa_destroy(gfa_t *g)
{
	uint32_t si, ui;
	uint64_t li;
	if (g == 0) return;
	h_s2i_destroy((h_s2i_t*)g->h_names);
	for (si = 0; si < g->n_seg; ++si) {
		gfa_seg_t *seg = &g->seg[si];
		free(seg->name);
		free(seg->seq);
		free(seg->aux.aux);
		if (seg->utg == 0) continue; // no unitig record attached to this segment
		for (ui = 0; ui < seg->utg->n; ++ui)
			free(seg->utg->name[ui]);
		free(seg->utg->name);
		free(seg->utg->a);
		free(seg->utg);
	}
	for (si = 0; si < g->n_sseq; ++si)
		free(g->sseq[si].name);
	h_s2i_destroy((h_s2i_t*)g->h_snames);
	if (g->link_aux) {
		for (li = 0; li < g->n_arc; ++li)
			free(g->link_aux[li].aux);
	}
	free(g->idx);
	free(g->seg);
	free(g->arc);
	free(g->link_aux);
	free(g->sseq);
	free(g);
}
|
55
|
+
|
56
|
+
/*
 * Duplicate a NUL-terminated string into freshly allocated memory.
 *
 * The length is kept in a size_t so that very long strings are not truncated
 * (the previous int32_t could not represent lengths >= 2^31).
 *
 * Returns the copy (to be released with free()), or NULL on allocation failure.
 */
char *gfa_strdup(const char *src)
{
	size_t len = strlen(src);
	char *dst = (char*)malloc(len + 1);
	if (dst == 0) return 0;
	memcpy(dst, src, len + 1); // includes the terminating NUL
	return dst;
}
|
65
|
+
|
66
|
+
/*
 * Duplicate at most n bytes of src into a freshly allocated buffer of n + 1
 * bytes. Copying stops at the first NUL in src; the rest of the buffer,
 * including the final byte, is zero-filled, so the result is always
 * NUL-terminated.
 *
 * Returns the copy (to be released with free()), or NULL on allocation failure.
 */
char *gfa_strndup(const char *src, size_t n)
{
	size_t len = 0;
	char *dst;
	while (len < n && src[len] != 0) ++len; // bounded length, never reads past n bytes
	dst = (char*)malloc(n + 1);
	if (dst == 0) return 0;
	memcpy(dst, src, len);
	memset(dst + len, 0, n + 1 - len);
	return dst;
}
|
74
|
+
|
75
|
+
/*
 * Look up a segment by name, creating it if it does not exist yet.
 *
 * A new segment gets its own copy of the name, del/len set to 0 and
 * snid/soff/rank set to -1. The segment array doubles (starting at 16)
 * when full, and the newly added slots are zeroed.
 *
 * Returns the index of the segment in g->seg.
 */
int32_t gfa_add_seg(gfa_t *g, const char *name)
{
	h_s2i_t *names = (h_s2i_t*)g->h_names;
	int is_new;
	khint_t itr = h_s2i_put(names, name, &is_new);
	if (is_new) {
		gfa_seg_t *seg;
		if (g->n_seg == g->m_seg) { // array is full: grow and zero the tail
			uint32_t prev_cap = g->m_seg;
			g->m_seg = g->m_seg? g->m_seg<<1 : 16;
			g->seg = (gfa_seg_t*)realloc(g->seg, g->m_seg * sizeof(gfa_seg_t));
			memset(&g->seg[prev_cap], 0, (g->m_seg - prev_cap) * sizeof(gfa_seg_t));
		}
		seg = &g->seg[g->n_seg++];
		// the table key must point at memory owned by the graph, not at the caller's buffer
		kh_key(names, itr) = seg->name = gfa_strdup(name);
		seg->del = seg->len = 0;
		seg->snid = seg->soff = seg->rank = -1;
		kh_val(names, itr) = g->n_seg - 1;
	}
	return kh_val(names, itr);
}
|
97
|
+
|
98
|
+
/*
 * Look up a stable sequence by name, creating it if it does not exist yet.
 * A new entry gets its own copy of the name and min/max/rank set to -1.
 *
 * Returns the index of the entry in g->sseq.
 */
int32_t gfa_sseq_add(gfa_t *g, const char *sname)
{
	h_s2i_t *tbl = (h_s2i_t*)g->h_snames;
	int is_new;
	khint_t itr = h_s2i_put(tbl, sname, &is_new);
	if (is_new) {
		gfa_sseq_t *entry;
		if (g->n_sseq == g->m_sseq) GFA_EXPAND(g->sseq, g->m_sseq);
		entry = &g->sseq[g->n_sseq++];
		kh_val(tbl, itr) = g->n_sseq - 1;
		// replace the caller's pointer stored by put() with a graph-owned copy
		kh_key(tbl, itr) = entry->name = gfa_strdup(sname);
		entry->min = -1;
		entry->max = -1;
		entry->rank = -1;
	}
	return kh_val(tbl, itr);
}
|
114
|
+
|
115
|
+
int32_t gfa_sseq_get(const gfa_t *g, const char *sname)
|
116
|
+
{
|
117
|
+
h_s2i_t *h = (h_s2i_t*)g->h_snames;
|
118
|
+
khint_t k;
|
119
|
+
k = h_s2i_get(h, sname);
|
120
|
+
return k == kh_end(h)? -1 : kh_val(h, k);
|
121
|
+
}
|
122
|
+
|
123
|
+
/*
 * Fold segment s into the record of the stable sequence it belongs to:
 * widen the [min, max) interval to cover [soff, soff + len) and record the
 * rank. A negative min/max/rank in the record means "not set yet".
 * Segments whose snid is out of range are ignored. A rank mismatch only
 * produces a warning (when gfa_verbose >= 2); the stored rank is kept.
 */
void gfa_sseq_update(gfa_t *g, const gfa_seg_t *s)
{
	gfa_sseq_t *ss;
	if (s->snid < 0 || s->snid >= g->n_sseq) return;
	ss = &g->sseq[s->snid];
	if (ss->min < 0 || s->soff < ss->min) ss->min = s->soff;
	if (ss->max < 0 || s->soff + s->len > ss->max) ss->max = s->soff + s->len;
	if (ss->rank < 0) {
		ss->rank = s->rank;
		return;
	}
	if (ss->rank != s->rank && gfa_verbose >= 2)
		fprintf(stderr, "[W] stable sequence '%s' associated with different ranks on segment '%s': %d != %d\n", ss->name, s->name, ss->rank, s->rank);
}
|
136
|
+
|
137
|
+
int32_t gfa_name2id(const gfa_t *g, const char *name)
|
138
|
+
{
|
139
|
+
h_s2i_t *h = (h_s2i_t*)g->h_names;
|
140
|
+
khint_t k;
|
141
|
+
k = h_s2i_get(h, name);
|
142
|
+
return k == kh_end(h)? -1 : kh_val(h, k);
|
143
|
+
}
|
144
|
+
|
145
|
+
/*
 * Append one arc v -> w with overlap lengths ov/ow to g->arc.
 *
 * g->arc and g->link_aux grow together (doubling, starting at 16) and new
 * slots are zeroed. The upper 32 bits of v_lv are set to v; the lower 32
 * bits are left at 0 here. If link_id is negative, the arc's own index is
 * used as its link id and its rank is -1; otherwise the rank is copied from
 * g->arc[link_id].
 *
 * Returns a pointer to the new arc; it is invalidated by the next growth of
 * g->arc.
 */
gfa_arc_t *gfa_add_arc1(gfa_t *g, uint32_t v, uint32_t w, int32_t ov, int32_t ow, int64_t link_id, int comp)
{
	gfa_arc_t *arc;
	if (g->m_arc == g->n_arc) { // both parallel arrays are full
		uint64_t prev_cap = g->m_arc;
		g->m_arc = g->m_arc? g->m_arc<<1 : 16;
		g->arc = (gfa_arc_t*)realloc(g->arc, g->m_arc * sizeof(gfa_arc_t));
		memset(&g->arc[prev_cap], 0, (g->m_arc - prev_cap) * sizeof(gfa_arc_t));
		g->link_aux = (gfa_aux_t*)realloc(g->link_aux, g->m_arc * sizeof(gfa_aux_t));
		memset(&g->link_aux[prev_cap], 0, (g->m_arc - prev_cap) * sizeof(gfa_aux_t));
	}
	arc = &g->arc[g->n_arc++];
	arc->v_lv = (uint64_t)v << 32;
	arc->w = w;
	arc->ov = ov;
	arc->ow = ow;
	arc->rank = -1;
	if (link_id >= 0) {
		arc->link_id = link_id;
		arc->rank = g->arc[link_id].rank; // TODO: this is not always correct!
	} else {
		arc->link_id = g->n_arc - 1;
	}
	arc->del = arc->strong = 0;
	arc->comp = comp;
	return arc;
}
|
165
|
+
|
166
|
+
int gfa_arc_is_sorted(const gfa_t *g)
|
167
|
+
{
|
168
|
+
uint64_t e;
|
169
|
+
for (e = 1; e < g->n_arc; ++e)
|
170
|
+
if (g->arc[e-1].v_lv > g->arc[e].v_lv)
|
171
|
+
break;
|
172
|
+
return (e == g->n_arc);
|
173
|
+
}
|
174
|
+
|
175
|
+
/* Sort all arcs of the graph in place by their v_lv key. */
void gfa_arc_sort(gfa_t *g)
{
	gfa_arc_t *first = g->arc;
	radix_sort_arc(first, first + g->n_arc);
}
|
179
|
+
|
180
|
+
/*
 * Build a per-vertex index over an arc array in which arcs with the same
 * head vertex are contiguous.
 *
 * The result has max_seq * 2 entries, zero-initialised. For each head vertex
 * h that occurs in a[0..n), idx[h] = (offset of its first arc) << 32 |
 * (number of its arcs).
 *
 * Returns the calloc()-allocated index; the caller frees it.
 */
uint64_t *gfa_arc_index_core(size_t max_seq, size_t n, const gfa_arc_t *a)
{
	size_t beg = 0, end;
	uint64_t *idx = (uint64_t*)calloc(max_seq * 2, 8);
	while (beg < n) {
		// extend [beg, end) over the run of arcs sharing the same head
		end = beg + 1;
		while (end < n && gfa_arc_head(a[end-1]) == gfa_arc_head(a[end])) ++end;
		idx[gfa_arc_head(a[end-1])] = (uint64_t)beg<<32 | (end - beg);
		beg = end;
	}
	return idx;
}
|
190
|
+
|
191
|
+
/* Discard any existing arc index and rebuild it from the current arc array. */
void gfa_arc_index(gfa_t *g)
{
	free(g->idx); // free(NULL) is a no-op
	g->idx = gfa_arc_index_core(g->n_seg, g->n_arc, g->arc);
}
|
196
|
+
|
197
|
+
/********************
|
198
|
+
* Fix graph issues *
|
199
|
+
********************/
|
200
|
+
|
201
|
+
/*
 * Mark every zero-length segment as deleted, warning about each one when
 * gfa_verbose >= 2.
 *
 * Returns the number of segments marked.
 */
uint32_t gfa_fix_no_seg(gfa_t *g)
{
	uint32_t k, n_bad = 0;
	for (k = 0; k < g->n_seg; ++k) {
		gfa_seg_t *seg = &g->seg[k];
		if (seg->len != 0) continue;
		++n_bad;
		seg->del = 1;
		if (gfa_verbose >= 2)
			fprintf(stderr, "[W] segment '%s' is used on an L-line but not defined on an S-line\n", seg->name);
	}
	return n_bad;
}
|
214
|
+
|
215
|
+
/*
 * For every arc: clamp the overlap on the head segment to that segment's
 * length (with a warning when gfa_verbose >= 2), mark the arc deleted if
 * either endpoint segment is deleted, and otherwise OR (segment length -
 * overlap) into v_lv.
 */
void gfa_fix_arc_len(gfa_t *g)
{
	uint64_t e;
	for (e = 0; e < g->n_arc; ++e) {
		gfa_arc_t *arc = &g->arc[e];
		uint32_t head = gfa_arc_head(*arc), tail = gfa_arc_tail(*arc);
		const gfa_seg_t *hs = &g->seg[head>>1];
		int dead;
		if (!hs->del && hs->len < arc->ov) {
			if (gfa_verbose >= 2)
				fprintf(stderr, "[W] overlap length longer than segment length for '%s': %d > %d\n", hs->name, arc->ov, hs->len);
			arc->ov = hs->len;
		}
		dead = hs->del || g->seg[tail>>1].del;
		if (dead) arc->del = 1;
		else arc->v_lv |= hs->len - arc->ov;
	}
}
|
234
|
+
|
235
|
+
/*
 * Fill in missing overlap lengths (stored as INT32_MAX) from the reverse arc.
 *
 * For each live arc v -> x with a missing ov or ow, the live arcs leaving
 * x^1 and pointing at v^1 are counted. If there is exactly one and its known
 * lengths do not contradict this arc's known lengths, the missing values are
 * copied across (ov <-> ow swapped). Otherwise the arc is marked deleted and
 * a warning is printed when gfa_verbose >= 2.
 *
 * Returns the number of arcs deleted this way.
 */
uint32_t gfa_fix_semi_arc(gfa_t *g)
{
	uint32_t n_err = 0, v, n_vtx = gfa_n_vtx(g);
	int i, j;
	for (v = 0; v < n_vtx; ++v) {
		int nv = gfa_arc_n(g, v);
		gfa_arc_t *av = gfa_arc_a(g, v);
		for (i = 0; i < nv; ++i) {
			gfa_arc_t *fwd = &av[i], *rev = 0, *aw;
			uint32_t w;
			int n_rev = 0, conflict = 0, nw;
			if (fwd->del) continue;
			if (fwd->ow != INT32_MAX && fwd->ov != INT32_MAX) continue; // nothing missing
			w = fwd->w^1;
			nw = gfa_arc_n(g, w);
			aw = gfa_arc_a(g, w);
			for (j = 0; j < nw; ++j) {
				if (!aw[j].del && aw[j].w == (v^1)) {
					++n_rev;
					rev = &aw[j];
				}
			}
			if (n_rev == 1) { // a unique reverse arc: make sure known lengths agree
				if (fwd->ov != INT32_MAX && rev->ow != INT32_MAX && fwd->ov != rev->ow) conflict = 1;
				if (fwd->ow != INT32_MAX && rev->ov != INT32_MAX && fwd->ow != rev->ov) conflict = 1;
			}
			if (n_rev == 1 && !conflict) {
				if (rev->ov != INT32_MAX) fwd->ow = rev->ov;
				if (rev->ow != INT32_MAX) fwd->ov = rev->ow;
			} else {
				if (gfa_verbose >= 2)
					fprintf(stderr, "[W] can't infer overlap length for %s%c -> %s%c\n",
							g->seg[v>>1].name, "+-"[v&1], g->seg[w>>1].name, "+-"[(w^1)&1]);
				++n_err;
				fwd->del = 1;
			}
		}
	}
	return n_err;
}
|
268
|
+
|
269
|
+
/*
 * Make the arc set symmetric: every live, non-complement arc v -> w must
 * have a matching arc w^1 -> v^1 with swapped overlap lengths.
 *
 * If such an arc already exists it is flagged as the complement and given
 * the same link_id. Otherwise a new complement arc is appended with the same
 * link_id and rank.
 *
 * When arcs were appended, the arc array is re-sorted and re-indexed before
 * returning. The previous guard compared vertex counts, which this function
 * never changes, so the re-sort never ran and g->idx was left stale.
 *
 * Returns n_err, which is never incremented here, i.e. always 0.
 */
uint32_t gfa_fix_symm_add(gfa_t *g)
{
	uint32_t n_err = 0, v, n_vtx = gfa_n_vtx(g);
	uint64_t n_arc_before = g->n_arc;
	int i;
	for (v = 0; v < n_vtx; ++v) {
		int nv = gfa_arc_n(g, v);
		gfa_arc_t *av = gfa_arc_a(g, v);
		for (i = 0; i < nv; ++i) {
			int j, nw;
			gfa_arc_t *aw, *avi = &av[i];
			if (avi->del || avi->comp) continue;
			nw = gfa_arc_n(g, avi->w^1);
			aw = gfa_arc_a(g, avi->w^1);
			for (j = 0; j < nw; ++j) {
				gfa_arc_t *awj = &aw[j];
				if (awj->del || awj->comp) continue;
				if (awj->w == (v^1) && awj->ov == avi->ow && awj->ow == avi->ov) { // complement found
					awj->comp = 1;
					awj->link_id = avi->link_id;
					break;
				}
			}
			if (j == nw) { // no complement present: create one
				gfa_arc_t *arc_old = g->arc, *arc_new;
				arc_new = gfa_add_arc1(g, avi->w^1, v^1, avi->ow, avi->ov, avi->link_id, 1);
				if (arc_old != g->arc) av = gfa_arc_a(g, v); // g->arc may be reallocated
				arc_new->rank = av[i].rank;
			}
		}
	}
	if (g->n_arc > n_arc_before) { // appended arcs are out of order and not indexed
		gfa_arc_sort(g);
		gfa_arc_index(g);
	}
	return n_err;
}
|
305
|
+
|
306
|
+
/*
 * Compact g->arc in place, dropping arcs that are marked deleted or that
 * touch a deleted segment. For each dropped arc whose link_id is below the
 * pre-compaction arc count, the link's aux data is freed and reset.
 *
 * If anything was dropped the arc index is freed and set to NULL; the caller
 * must rebuild it. Counters are 64-bit to match g->n_arc (the previous
 * 32-bit counters could not cover arc counts >= 2^32).
 */
void gfa_arc_rm(gfa_t *g)
{
	uint64_t e, n;
	for (e = n = 0; e < g->n_arc; ++e) {
		uint32_t u = g->arc[e].v_lv>>32, v = g->arc[e].w;
		if (!g->arc[e].del && !g->seg[u>>1].del && !g->seg[v>>1].del) {
			g->arc[n++] = g->arc[e];
		} else {
			gfa_aux_t *aux = g->arc[e].link_id < g->n_arc? &g->link_aux[g->arc[e].link_id] : 0;
			if (aux) {
				free(aux->aux);
				aux->aux = 0, aux->l_aux = aux->m_aux = 0;
			}
		}
	}
	if (n < g->n_arc) { // arc index is out of sync
		free(g->idx);
		g->idx = 0;
	}
	g->n_arc = n;
}
|
327
|
+
|
328
|
+
/*
 * Remove deleted arcs, then make sure the arc array is sorted and that an
 * up-to-date arc index exists.
 */
void gfa_cleanup(gfa_t *g)
{
	int sorted;
	gfa_arc_rm(g);
	sorted = gfa_arc_is_sorted(g);
	if (!sorted) {
		gfa_arc_sort(g);
		free(g->idx); // any existing index refers to the old order
		g->idx = 0;
	}
	if (g->idx == 0)
		gfa_arc_index(g);
}
|
338
|
+
|
339
|
+
int32_t gfa_check_multi(const gfa_t *g)
|
340
|
+
{
|
341
|
+
uint32_t v, n_vtx = gfa_n_vtx(g);
|
342
|
+
int32_t max_nv = -1, n_multi = 0;
|
343
|
+
uint64_t *buf; // actually, uint32_t is enough
|
344
|
+
for (v = 0; v < n_vtx; ++v) {
|
345
|
+
int32_t nv = gfa_arc_n(g, v);
|
346
|
+
max_nv = max_nv > nv? max_nv : nv;
|
347
|
+
}
|
348
|
+
if (max_nv == 1 || max_nv < 0) return 0;
|
349
|
+
GFA_MALLOC(buf, max_nv);
|
350
|
+
for (v = 0; v < n_vtx; ++v) {
|
351
|
+
int32_t i, s, nv = gfa_arc_n(g, v);
|
352
|
+
const gfa_arc_t *av = gfa_arc_a(g, v);
|
353
|
+
for (i = 0; i < nv; ++i) buf[i] = av[i].w;
|
354
|
+
radix_sort_gfa64(buf, buf + nv);
|
355
|
+
for (s = 0, i = 1; i <= nv; ++i)
|
356
|
+
if (i == nv || buf[i] != buf[s])
|
357
|
+
n_multi += i - s - 1, s = i;
|
358
|
+
}
|
359
|
+
free(buf);
|
360
|
+
return n_multi;
|
361
|
+
}
|
362
|
+
|
363
|
+
/*
 * Remove multi-arcs: for each vertex and each target reached by more than
 * one live arc, keep one arc and mark the others (and the matching arcs on
 * the complement side) deleted.
 *
 * The kept arc is the first of the group in sorted order unless another arc
 * in the group has a non-negative rank smaller than the current minimum.
 * Groups whose kept arc targets v^1 are left untouched, with a warning when
 * gfa_verbose >= 2. If anything was removed, deleted arcs are purged and the
 * index rebuilt.
 *
 * Fixes relative to the previous version:
 *  - the end-of-group test used the raw arc count nv instead of the number
 *    of live arcs nb, so with deleted arcs present buf[nb] (stale or
 *    uninitialised) was read and the last group could be missed;
 *  - the early exit now also covers max_nv <= 0, as gfa_check_multi() does,
 *    instead of allocating a buffer with a zero or negative length.
 *
 * Returns the number of arcs removed on the forward side.
 */
uint32_t gfa_fix_multi(gfa_t *g)
{
	uint32_t v, n_vtx = gfa_n_vtx(g), n_rm = 0;
	int32_t max_nv = -1;
	uint64_t *buf; // each entry packs target vertex (high 32 bits) and arc offset (low 32 bits)
	for (v = 0; v < n_vtx; ++v) {
		int32_t nv = gfa_arc_n(g, v);
		max_nv = max_nv > nv? max_nv : nv;
	}
	if (max_nv <= 1) return 0; // no vertex can have duplicate arcs
	GFA_MALLOC(buf, max_nv);
	for (v = 0; v < n_vtx; ++v) {
		int32_t i, j, s, nv = gfa_arc_n(g, v), nb;
		gfa_arc_t *av = gfa_arc_a(g, v);
		for (i = j = 0; i < nv; ++i)
			if (!av[i].del) buf[j++] = (uint64_t)av[i].w<<32 | i;
		nb = j;
		if (nb < 1) continue;
		radix_sort_gfa64(buf, buf + nb);
		for (s = 0, i = 1; i <= nb; ++i) {
			if (i == nb || buf[i]>>32 != buf[s]>>32) { // [s, i) is one group with a common target
				if (i - s > 1) {
					int32_t k = (int32_t)buf[s], min_rank = av[k].rank; // prefer longest overlap
					for (j = s + 1; j < i; ++j) { // rank has higher priority
						int32_t t = (int32_t)buf[j];
						if (av[t].rank >= 0 && av[t].rank < min_rank)
							min_rank = av[t].rank, k = t;
					}
					if (av[k].w == (v^1)) { // a weird loop
						if (gfa_verbose >= 2)
							fprintf(stderr, "[W::%s] can't fix multiple edges due to '>v -- <v' involving segment %s\n", __func__, g->seg[v>>1].name);
					} else {
						int32_t nw = gfa_arc_n(g, av[k].w^1), n_wdel;
						gfa_arc_t *aw = gfa_arc_a(g, av[k].w^1);
						uint64_t link_id = av[k].link_id;
						n_rm += i - s - 1;
						for (j = s + 1; j < i; ++j)
							av[(int32_t)buf[j]].del = 1;
						// delete the complement-side arcs that belong to other links
						for (j = 0, n_wdel = 0; j < nw; ++j)
							if (aw[j].w == (v^1) && aw[j].link_id != link_id)
								aw[j].del = 1, ++n_wdel;
						assert(n_wdel == i - s - 1);
					}
				}
				s = i;
			}
		}
	}
	free(buf);
	if (n_rm > 0) {
		if (gfa_verbose >= 2)
			fprintf(stderr, "[W::%s] removed %d multiple link(s)\n", __func__, (int)n_rm);
		gfa_arc_rm(g);
		gfa_arc_index(g);
	}
	return n_rm;
}
|
420
|
+
|
421
|
+
/*
 * Run the standard sequence of fix-ups on a graph. The order matters: the
 * semi-arc and symmetry passes look arcs up through the index built just
 * before them, and the final cleanup purges whatever the earlier passes
 * marked as deleted.
 */
void gfa_finalize(gfa_t *g)
{
	gfa_fix_no_seg(g);   /* flag zero-length segments as deleted */

	gfa_arc_sort(g);     /* sort arcs and build the per-vertex index */
	gfa_arc_index(g);

	gfa_fix_semi_arc(g); /* infer missing overlap lengths */
	gfa_fix_symm_add(g); /* pair or add complement arcs */
	gfa_fix_arc_len(g);  /* clamp overlaps, fill the low bits of v_lv */

	gfa_cleanup(g);      /* drop deleted arcs, re-sort and re-index */
}
|
431
|
+
|
432
|
+
/********************
|
433
|
+
* Tag manipulation *
|
434
|
+
********************/
|
435
|
+
|
436
|
+
/*
 * Size in bytes of a fixed-width aux value with type code x.
 * Returns 1 for 'A'/'c'/'C', 2 for 's'/'S', 4 for 'i'/'I'/'f', and 0 for
 * any other code.
 */
static inline int gfa_aux_type2size(int x)
{
	switch (x) {
	case 'A': case 'c': case 'C':
		return 1;
	case 's': case 'S':
		return 2;
	case 'i': case 'I': case 'f':
		return 4;
	default:
		return 0;
	}
}
|
443
|
+
|
444
|
+
/*
 * Advance s, which must point at the type byte of an aux entry, past that
 * entry's value:
 *   'Z'   - skip to just after the terminating NUL;
 *   'B'   - skip the element-type byte, the 4-byte element count and the
 *           elements themselves;
 *   other - skip gfa_aux_type2size(type) bytes.
 * The element count is fetched with memcpy because (s)+1 is not guaranteed
 * to be suitably aligned for a direct int32_t load.
 */
#define __skip_tag(s) do { \
		int type = *(s); \
		++(s); \
		if (type == 'Z') { while (*(s)) ++(s); ++(s); } \
		else if (type == 'B') { \
			int32_t n_elem_; \
			memcpy(&n_elem_, (s) + 1, 4); \
			(s) += 5 + gfa_aux_type2size(*(s)) * n_elem_; \
		} else (s) += gfa_aux_type2size(type); \
	} while(0)
|
451
|
+
|
452
|
+
/*
 * Search the packed aux buffer data[0..l_data) for an entry with the given
 * two-character tag.
 *
 * Returns a pointer to the entry's type byte (the byte right after the
 * tag), or NULL if no entry matches.
 */
uint8_t *gfa_aux_get(int l_data, const uint8_t *data, const char tag[2])
{
	const uint8_t *cur = data, *end = data + l_data;
	const int wanted = tag[0]<<8 | tag[1];
	while (cur < end) {
		const int found = (int)cur[0]<<8 | cur[1];
		cur += 2; // now at the type byte
		if (found == wanted) return (uint8_t*)cur;
		__skip_tag(cur);
	}
	return 0;
}
|
464
|
+
|
465
|
+
/*
 * Remove one entry from the packed aux buffer data[0..l_data) by moving the
 * bytes that follow it down over it.
 *
 * s MUST BE a pointer returned by gfa_aux_get(), i.e. it points at the
 * entry's type byte, two bytes after the start of the entry.
 *
 * Returns the new length of the buffer.
 */
int gfa_aux_del(int l_data, uint8_t *data, uint8_t *s)
{
	uint8_t *entry = s - 2; // start of the entry (its tag)
	uint8_t *next = s;
	__skip_tag(next);       // next now points just past the entry
	memmove(entry, next, l_data - (next - data));
	return l_data - (next - entry);
}
|
474
|
+
|
475
|
+
/*
 * Set the float-typed aux entry `tag` of a to x.
 *
 * If an entry with this tag already exists and has type 'f', its value is
 * overwritten in place. If it exists with another type, it is removed first
 * so that the float is not written under a mismatching type byte. Otherwise,
 * and after such a removal, a new tag + 'f' + 4-byte value entry is appended.
 *
 * The previous version always looked up the literal tag "cv" regardless of
 * the `tag` argument, so other tags were either appended repeatedly or
 * overwrote the "cv" value.
 */
void gfa_aux_update_f(gfa_aux_t *a, const char tag[2], float x)
{
	uint8_t *p = 0;
	kstring_t str;
	if (a->l_aux > 0)
		p = gfa_aux_get(a->l_aux, a->aux, tag);
	if (p && *p == 'f') {
		memcpy(p + 1, &x, 4);
		return;
	}
	if (p) // same tag with a different type: drop it before appending
		a->l_aux = gfa_aux_del(a->l_aux, a->aux, p);
	str.l = a->l_aux, str.m = a->m_aux, str.s = (char*)a->aux;
	ks_resize(&str, str.l + 7); // 2 (tag) + 1 (type) + 4 (value)
	kputsn_(tag, 2, &str);
	kputc_('f', &str);
	kputsn_(&x, 4, &str);
	a->l_aux = str.l, a->m_aux = str.m, a->aux = (uint8_t*)str.s;
}
|
492
|
+
|
493
|
+
/*
 * Store per-segment and/or per-link values as float aux entries under `tag`.
 *
 * cov_seg, if non-NULL, is indexed by segment; cov_link, if non-NULL, is
 * indexed by arc. Only arcs with comp == 0 are written, and the value goes
 * to the aux record of the arc's link_id.
 */
void gfa_aux_update_cv(gfa_t *g, const char *tag, const double *cov_seg, const double *cov_link)
{
	int64_t k;
	if (cov_seg) {
		for (k = 0; k < g->n_seg; ++k)
			gfa_aux_update_f(&g->seg[k].aux, tag, cov_seg[k]);
	}
	if (cov_link) {
		for (k = 0; k < g->n_arc; ++k) {
			if (g->arc[k].comp != 0) continue; // skip complement arcs
			gfa_aux_update_f(&g->link_aux[g->arc[k].link_id], tag, cov_link[k]);
		}
	}
}
|
504
|
+
|
505
|
+
/*********************
|
506
|
+
* Translation table *
|
507
|
+
*********************/
|
508
|
+
|
509
|
+
/*
 * Byte-to-byte complement table. Upper- and lower-case letters map to their
 * complement in the same case (e.g. A<->T, C<->G, R<->Y, K<->M, B<->V,
 * D<->H; U maps to A); every other byte maps to itself.
 */
unsigned char gfa_comp_table[256] = {
	/* 0x00 */   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
	/* 0x10 */  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
	/* 0x20 */  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
	/* 0x30 */  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
	/* 0x40 */  64, 'T', 'V', 'G', 'H', 'E', 'F', 'C', 'D', 'I', 'J', 'M', 'L', 'K', 'N', 'O',
	/* 0x50 */ 'P', 'Q', 'Y', 'S', 'A', 'A', 'B', 'W', 'X', 'R', 'Z',  91,  92,  93,  94,  95,
	/* 0x60 */  96, 't', 'v', 'g', 'h', 'e', 'f', 'c', 'd', 'i', 'j', 'm', 'l', 'k', 'n', 'o',
	/* 0x70 */ 'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127,
	/* 0x80 */ 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
	/* 0x90 */ 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	/* 0xA0 */ 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
	/* 0xB0 */ 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
	/* 0xC0 */ 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
	/* 0xD0 */ 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
	/* 0xE0 */ 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
	/* 0xF0 */ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
};
|