ruby-minigraph 0.0.20.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +62 -0
- data/ext/Rakefile +56 -0
- data/ext/cmappy/cmappy.c +7 -0
- data/ext/cmappy/cmappy.h +8 -0
- data/ext/minigraph/LICENSE.txt +23 -0
- data/ext/minigraph/Makefile +66 -0
- data/ext/minigraph/NEWS.md +317 -0
- data/ext/minigraph/README.md +207 -0
- data/ext/minigraph/algo.c +194 -0
- data/ext/minigraph/algo.h +33 -0
- data/ext/minigraph/asm-call.c +147 -0
- data/ext/minigraph/bseq.c +133 -0
- data/ext/minigraph/bseq.h +76 -0
- data/ext/minigraph/cal_cov.c +139 -0
- data/ext/minigraph/doc/example1.png +0 -0
- data/ext/minigraph/doc/example2.png +0 -0
- data/ext/minigraph/doc/examples.graffle +0 -0
- data/ext/minigraph/format.c +241 -0
- data/ext/minigraph/galign.c +140 -0
- data/ext/minigraph/gchain1.c +532 -0
- data/ext/minigraph/gcmisc.c +223 -0
- data/ext/minigraph/gfa-aug.c +260 -0
- data/ext/minigraph/gfa-base.c +526 -0
- data/ext/minigraph/gfa-bbl.c +372 -0
- data/ext/minigraph/gfa-ed.c +617 -0
- data/ext/minigraph/gfa-io.c +395 -0
- data/ext/minigraph/gfa-priv.h +154 -0
- data/ext/minigraph/gfa.h +166 -0
- data/ext/minigraph/ggen.c +182 -0
- data/ext/minigraph/ggen.h +21 -0
- data/ext/minigraph/ggsimple.c +570 -0
- data/ext/minigraph/gmap.c +211 -0
- data/ext/minigraph/index.c +230 -0
- data/ext/minigraph/kalloc.c +224 -0
- data/ext/minigraph/kalloc.h +82 -0
- data/ext/minigraph/kavl.h +414 -0
- data/ext/minigraph/kdq.h +134 -0
- data/ext/minigraph/ketopt.h +116 -0
- data/ext/minigraph/khashl.h +348 -0
- data/ext/minigraph/krmq.h +474 -0
- data/ext/minigraph/kseq.h +256 -0
- data/ext/minigraph/ksort.h +164 -0
- data/ext/minigraph/kstring.h +165 -0
- data/ext/minigraph/kthread.c +159 -0
- data/ext/minigraph/kthread.h +15 -0
- data/ext/minigraph/kvec-km.h +105 -0
- data/ext/minigraph/kvec.h +110 -0
- data/ext/minigraph/lchain.c +441 -0
- data/ext/minigraph/main.c +301 -0
- data/ext/minigraph/map-algo.c +500 -0
- data/ext/minigraph/mgpriv.h +128 -0
- data/ext/minigraph/minigraph.1 +359 -0
- data/ext/minigraph/minigraph.h +176 -0
- data/ext/minigraph/miniwfa.c +834 -0
- data/ext/minigraph/miniwfa.h +95 -0
- data/ext/minigraph/misc/mgutils.js +1451 -0
- data/ext/minigraph/misc.c +12 -0
- data/ext/minigraph/options.c +134 -0
- data/ext/minigraph/shortk.c +251 -0
- data/ext/minigraph/sketch.c +109 -0
- data/ext/minigraph/sys.c +147 -0
- data/ext/minigraph/sys.h +20 -0
- data/ext/minigraph/test/MT-chimp.fa +277 -0
- data/ext/minigraph/test/MT-human.fa +239 -0
- data/ext/minigraph/test/MT-orangA.fa +276 -0
- data/ext/minigraph/test/MT.gfa +19 -0
- data/ext/minigraph/tex/Makefile +13 -0
- data/ext/minigraph/tex/minigraph.bib +676 -0
- data/ext/minigraph/tex/minigraph.tex +986 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
- data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
- data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
- data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
- data/ext/minigraph/tex/plots/bedutils.js +367 -0
- data/ext/minigraph/tex/plots/chr-plot.js +130 -0
- data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
- data/ext/minigraph.patch +21 -0
- data/lib/minigraph/ffi/constants.rb +230 -0
- data/lib/minigraph/ffi/functions.rb +70 -0
- data/lib/minigraph/ffi/mappy.rb +8 -0
- data/lib/minigraph/ffi.rb +27 -0
- data/lib/minigraph/version.rb +5 -0
- data/lib/minigraph.rb +72 -0
- metadata +159 -0
@@ -0,0 +1,395 @@
|
|
1
|
+
#include <zlib.h>
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <ctype.h>
|
5
|
+
#include <string.h>
|
6
|
+
#include "kstring.h"
|
7
|
+
#include "gfa-priv.h"
|
8
|
+
|
9
|
+
#include "kseq.h"
|
10
|
+
KSTREAM_INIT(gzFile, gzread, 65536)
|
11
|
+
|
12
|
+
/***********
|
13
|
+
* Tag I/O *
|
14
|
+
***********/
|
15
|
+
|
16
|
+
int gfa_aux_parse(char *s, uint8_t **data, int *max)
|
17
|
+
{
|
18
|
+
char *q, *p;
|
19
|
+
kstring_t str;
|
20
|
+
if (s == 0) return 0;
|
21
|
+
str.l = 0, str.m = *max, str.s = (char*)*data;
|
22
|
+
if (*s == '\t') ++s;
|
23
|
+
for (p = q = s;; ++p) {
|
24
|
+
if (*p == 0 || *p == '\t') {
|
25
|
+
int c = *p;
|
26
|
+
*p = 0;
|
27
|
+
if (p - q >= 5 && q[2] == ':' && q[4] == ':' && (q[3] == 'A' || q[3] == 'i' || q[3] == 'f' || q[3] == 'Z' || q[3] == 'B')) {
|
28
|
+
int type = q[3];
|
29
|
+
kputsn_(q, 2, &str);
|
30
|
+
q += 5;
|
31
|
+
if (type == 'A') {
|
32
|
+
kputc_('A', &str);
|
33
|
+
kputc_(*q, &str);
|
34
|
+
} else if (type == 'i') {
|
35
|
+
int32_t x;
|
36
|
+
x = strtol(q, &q, 10);
|
37
|
+
kputc_(type, &str); kputsn_((char*)&x, 4, &str);
|
38
|
+
} else if (type == 'f') {
|
39
|
+
float x;
|
40
|
+
x = strtod(q, &q);
|
41
|
+
kputc_('f', &str); kputsn_(&x, 4, &str);
|
42
|
+
} else if (type == 'Z') {
|
43
|
+
kputc_('Z', &str); kputsn_(q, p - q + 1, &str); // note that this include the trailing NULL
|
44
|
+
} else if (type == 'B') {
|
45
|
+
type = *q++; // q points to the first ',' following the typing byte
|
46
|
+
if (p - q >= 2 && (type == 'c' || type == 'C' || type == 's' || type == 'S' || type == 'i' || type == 'I' || type != 'f')) {
|
47
|
+
int32_t n;
|
48
|
+
char *r;
|
49
|
+
for (r = q, n = 0; *r; ++r)
|
50
|
+
if (*r == ',') ++n;
|
51
|
+
kputc_('B', &str); kputc_(type, &str); kputsn_(&n, 4, &str);
|
52
|
+
// TODO: to evaluate which is faster: a) aligned array and then memmove(); b) unaligned array; c) kputsn_()
|
53
|
+
if (type == 'c') while (q + 1 < p) { int8_t x = strtol(q + 1, &q, 0); kputc_(x, &str); }
|
54
|
+
else if (type == 'C') while (q + 1 < p) { uint8_t x = strtol(q + 1, &q, 0); kputc_(x, &str); }
|
55
|
+
else if (type == 's') while (q + 1 < p) { int16_t x = strtol(q + 1, &q, 0); kputsn_(&x, 2, &str); }
|
56
|
+
else if (type == 'S') while (q + 1 < p) { uint16_t x = strtol(q + 1, &q, 0); kputsn_(&x, 2, &str); }
|
57
|
+
else if (type == 'i') while (q + 1 < p) { int32_t x = strtol(q + 1, &q, 0); kputsn_(&x, 4, &str); }
|
58
|
+
else if (type == 'I') while (q + 1 < p) { uint32_t x = strtol(q + 1, &q, 0); kputsn_(&x, 4, &str); }
|
59
|
+
else if (type == 'f') while (q + 1 < p) { float x = strtod(q + 1, &q); kputsn_(&x, 4, &str); }
|
60
|
+
}
|
61
|
+
} // should not be here, as we have tested all types
|
62
|
+
}
|
63
|
+
q = p + 1;
|
64
|
+
if (c == 0) break;
|
65
|
+
}
|
66
|
+
}
|
67
|
+
if (str.l > 0 && str.l == str.m) ks_resize(&str, str.l + 1);
|
68
|
+
if (str.s) str.s[str.l] = 0;
|
69
|
+
*max = str.m, *data = (uint8_t*)str.s;
|
70
|
+
return str.l;
|
71
|
+
}
|
72
|
+
|
73
|
+
int gfa_aux_format(int l_aux, const uint8_t *aux, char **t, int *max)
|
74
|
+
{
|
75
|
+
kstring_t str;
|
76
|
+
const uint8_t *s = aux;
|
77
|
+
str.l = 0, str.s = *t, str.m = *max;
|
78
|
+
while (s < aux + l_aux) {
|
79
|
+
uint8_t type, key[2];
|
80
|
+
key[0] = s[0]; key[1] = s[1];
|
81
|
+
s += 2; type = *s++;
|
82
|
+
kputc('\t', &str); kputsn((char*)key, 2, &str); kputc(':', &str);
|
83
|
+
if (type == 'A') { kputsn("A:", 2, &str); kputc(*s, &str); ++s; }
|
84
|
+
else if (type == 'i') { kputsn("i:", 2, &str); kputw(*(int32_t*)s, &str); s += 4; }
|
85
|
+
else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; }
|
86
|
+
else if (type == 'Z') { kputc(type, &str); kputc(':', &str); while (*s) kputc(*s++, &str); ++s; }
|
87
|
+
else if (type == 'B') {
|
88
|
+
uint8_t sub_type = *(s++);
|
89
|
+
int32_t i, n;
|
90
|
+
memcpy(&n, s, 4);
|
91
|
+
s += 4; // no point to the start of the array
|
92
|
+
kputsn("B:", 2, &str); kputc(sub_type, &str); // write the typing
|
93
|
+
for (i = 0; i < n; ++i) { // FIXME: for better performance, put the loop after "if"
|
94
|
+
kputc(',', &str);
|
95
|
+
if ('c' == sub_type) { kputw(*(int8_t*)s, &str); ++s; }
|
96
|
+
else if ('C' == sub_type) { kputw(*(uint8_t*)s, &str); ++s; }
|
97
|
+
else if ('s' == sub_type) { kputw(*(int16_t*)s, &str); s += 2; }
|
98
|
+
else if ('S' == sub_type) { kputw(*(uint16_t*)s, &str); s += 2; }
|
99
|
+
else if ('i' == sub_type) { kputw(*(int32_t*)s, &str); s += 4; }
|
100
|
+
else if ('I' == sub_type) { kputuw(*(uint32_t*)s, &str); s += 4; }
|
101
|
+
else if ('f' == sub_type) { ksprintf(&str, "%g", *(float*)s); s += 4; }
|
102
|
+
}
|
103
|
+
}
|
104
|
+
}
|
105
|
+
*t = str.s, *max = str.m;
|
106
|
+
return str.l;
|
107
|
+
}
|
108
|
+
|
109
|
+
/****************
|
110
|
+
* Line parsers *
|
111
|
+
****************/
|
112
|
+
|
113
|
+
/**
 * Parse one GFA S-line ("S\t<name>\t<sequence>[\t<tags>]") into the graph.
 *
 * The line is modified in place (field-separating TABs become NULs). The
 * sequence is duplicated with gfa_strdup() unless it is "*". The tags LN, SN,
 * SO and SR are consumed into segment fields when they have the expected
 * type; whatever tags remain are attached to the segment.
 *
 * @param g  graph to update
 * @param s  the line, starting at the 'S'; the caller (gfa_read) only passes
 *           lines of at least 3 characters whose second character is a TAB
 * @return 0 on success, -1 if the name or sequence field is missing
 */
int gfa_parse_S(gfa_t *g, char *s)
{
	int i, is_ok = 0;
	int l_aux, m_aux = 0, LN = -1;
	char *p, *q, *seg = 0, *seq = 0, *rest = 0;
	uint8_t *aux = 0, *s_LN = 0;
	uint32_t sid, len = 0;
	int32_t val; // scratch for unaligned 4-byte tag values
	gfa_seg_t *sp;

	for (i = 0, p = q = s + 2;; ++p) {
		int c;
		if (*p != 0 && *p != '\t') continue;
		c = *p;
		*p = 0;
		if (i == 0) seg = q;
		else if (i == 1) {
			seq = q[0] == '*'? 0 : gfa_strdup(q);
			is_ok = 1, rest = c? p + 1 : 0;
			break;
		}
		++i, q = p + 1;
		if (c == 0) break;
	}
	if (!is_ok) return -1; // mandatory fields missing

	l_aux = gfa_aux_parse(rest, &aux, &m_aux); // parse optional tags
	s_LN = l_aux? gfa_aux_get(l_aux, aux, "LN") : 0;
	if (s_LN && s_LN[0] == 'i') {
		// the value follows the type byte and is not 4-byte aligned
		memcpy(&val, s_LN + 1, sizeof(val));
		LN = val;
		l_aux = gfa_aux_del(l_aux, aux, s_LN);
	}
	if (seq == 0) {
		if (LN >= 0) len = LN;
	} else len = strlen(seq);
	if (LN >= 0 && len != (uint32_t)LN && gfa_verbose >= 2)
		fprintf(stderr, "[W] for segment '%s', LN:i:%d tag is different from sequence length %u\n", seg, LN, len);
	sid = gfa_add_seg(g, seg);
	sp = &g->seg[sid];
	sp->len = len, sp->seq = seq;
	if (l_aux > 0) {
		uint8_t *s_SN = 0, *s_SO = 0, *s_SR = 0;
		s_SN = gfa_aux_get(l_aux, aux, "SN");
		if (s_SN && *s_SN == 'Z') { // then parse stable tags
			sp->snid = gfa_sseq_add(g, (char*)(s_SN + 1)), sp->soff = 0;
			l_aux = gfa_aux_del(l_aux, aux, s_SN);
			s_SO = gfa_aux_get(l_aux, aux, "SO"); // SO is only honoured together with SN
			if (s_SO && *s_SO == 'i') {
				memcpy(&val, s_SO + 1, sizeof(val));
				sp->soff = val;
				l_aux = gfa_aux_del(l_aux, aux, s_SO);
			}
		}
		s_SR = gfa_aux_get(l_aux, aux, "SR");
		if (s_SR && *s_SR == 'i') {
			memcpy(&val, s_SR + 1, sizeof(val));
			sp->rank = val;
			if (sp->rank > g->max_rank) g->max_rank = sp->rank;
			l_aux = gfa_aux_del(l_aux, aux, s_SR);
		}
		gfa_sseq_update(g, sp);
	}
	if (l_aux > 0) // hand the remaining tags over to the segment
		sp->aux.m_aux = m_aux, sp->aux.l_aux = l_aux, sp->aux.aux = aux;
	else if (aux)
		free(aux);
	return 0;
}
|
177
|
+
|
178
|
+
/**
 * Parse one GFA L-line
 * ("L\t<segv>\t<+|->\t<segw>\t<+|->[\t<overlap>[\t<tags>]]") into the graph.
 *
 * The overlap field may be "*", a CIGAR string, "<ov>:<ow>", or ":<ow>".
 * INT32_MAX marks an overlap length that was not given. When the line stops
 * after the four mandatory fields, or the fifth field is not recognized as an
 * overlap, both overlaps are set to 0 and no tags are read.
 * The tags SR, L1 and L2 are consumed; remaining tags are attached to the link.
 * The line is modified in place (field-separating TABs become NULs).
 *
 * @param g  graph to update
 * @param s  the line, starting at the 'L'; the caller (gfa_read) only passes
 *           lines of at least 3 characters whose second character is a TAB
 * @return 0 on success, -1 if mandatory fields are missing, -2 if an
 *         orientation field is neither '+' nor '-'
 */
int gfa_parse_L(gfa_t *g, char *s)
{
	int i, oriv = -1, oriw = -1, is_ok = 0;
	int l_aux, m_aux = 0;
	char *p, *q, *segv = 0, *segw = 0, *rest = 0;
	int32_t ov = INT32_MAX, ow = INT32_MAX;
	uint32_t v, w;
	uint8_t *aux = 0;
	gfa_arc_t *arc;

	for (i = 0, p = q = s + 2;; ++p) {
		int c;
		if (*p != 0 && *p != '\t') continue;
		c = *p;
		*p = 0;
		if (i == 0) {
			segv = q;
		} else if (i == 1) {
			if (*q != '+' && *q != '-') return -2;
			oriv = (*q != '+');
		} else if (i == 2) {
			segw = q;
		} else if (i == 3) {
			if (*q != '+' && *q != '-') return -2;
			oriw = (*q != '+');
		} else if (i == 4) {
			// <ctype.h> functions require an unsigned char value, hence the casts
			if (*q == '*') {
				ov = ow = 0;
			} else if (*q == ':') {
				ov = INT32_MAX;
				ow = isdigit((unsigned char)q[1])? strtol(q + 1, &q, 10) : INT32_MAX;
			} else if (isdigit((unsigned char)*q)) {
				char *r;
				ov = strtol(q, &r, 10);
				if (isupper((unsigned char)*r)) { // CIGAR
					ov = ow = 0;
					do {
						long l;
						l = strtol(q, &q, 10);
						if (*q == 'M' || *q == 'D' || *q == 'N') ov += l;
						if (*q == 'M' || *q == 'I' || *q == 'S') ow += l;
						if (*q == 0) break; // trailing number without operator: do not step past the field
						++q;
					} while (isdigit((unsigned char)*q));
				} else if (*r == ':') { // overlap lengths
					ow = isdigit((unsigned char)r[1])? strtol(r + 1, &r, 10) : INT32_MAX;
				} else break;
			} else break;
			is_ok = 1, rest = c? p + 1 : 0;
			break;
		}
		++i, q = p + 1;
		if (c == 0) break;
	}
	if (i == 4 && is_ok == 0) ov = ow = 0, is_ok = 1; // no overlap field
	if (!is_ok) return -1;

	// vertex id = segment id << 1 | orientation; shift as unsigned
	v = (uint32_t)gfa_add_seg(g, segv) << 1 | (uint32_t)oriv;
	w = (uint32_t)gfa_add_seg(g, segw) << 1 | (uint32_t)oriw;
	arc = gfa_add_arc1(g, v, w, ov, ow, -1, 0);
	l_aux = gfa_aux_parse(rest, &aux, &m_aux); // parse optional tags
	if (l_aux) {
		gfa_aux_t *a = &g->link_aux[arc->link_id];
		uint8_t *s_L1, *s_L2, *s_SR;
		int32_t val; // scratch for unaligned 4-byte tag values
		a->l_aux = l_aux, a->m_aux = m_aux, a->aux = aux;
		s_SR = gfa_aux_get(a->l_aux, a->aux, "SR");
		if (s_SR && s_SR[0] == 'i') {
			memcpy(&val, s_SR + 1, sizeof(val));
			arc->rank = val;
			a->l_aux = gfa_aux_del(a->l_aux, a->aux, s_SR);
		}
		s_L1 = gfa_aux_get(a->l_aux, a->aux, "L1");
		if (s_L1) {
			if (ov != INT32_MAX && s_L1[0] == 'i') {
				// grow the length of segment v to at least ov + L1
				memcpy(&val, s_L1 + 1, sizeof(val));
				val += ov;
				g->seg[v>>1].len = g->seg[v>>1].len > val? g->seg[v>>1].len : val;
			}
			a->l_aux = gfa_aux_del(a->l_aux, a->aux, s_L1);
		}
		s_L2 = gfa_aux_get(a->l_aux, a->aux, "L2");
		if (s_L2) {
			if (ow != INT32_MAX && s_L2[0] == 'i') {
				// grow the length of segment w to at least ow + L2
				memcpy(&val, s_L2 + 1, sizeof(val));
				val += ow;
				g->seg[w>>1].len = g->seg[w>>1].len > val? g->seg[w>>1].len : val;
			}
			a->l_aux = gfa_aux_del(a->l_aux, a->aux, s_L2);
		}
		if (a->l_aux == 0) { // every tag was consumed above
			free(a->aux);
			a->aux = 0, a->m_aux = 0;
		}
	} else if (aux) { // mirrors gfa_parse_S(): never keep an empty tag buffer
		free(aux);
	}
	return 0;
}
|
265
|
+
|
266
|
+
/**
 * Create a segment for a FASTA header line.
 *
 * The header is cut at the first whitespace character (in place). The segment
 * is named "s<N>", where N is the current segment count plus one, and the
 * header text after the leading '>' is registered through gfa_sseq_add().
 *
 * @param g  graph to update
 * @param s  header line starting with '>'; modified in place
 * @return the new segment, with soff and rank set to 0
 */
static gfa_seg_t *gfa_parse_fa_hdr(gfa_t *g, char *s)
{
	int32_t i;
	char buf[16];
	gfa_seg_t *seg;
	for (i = 0; s[i]; ++i)
		if (isspace((unsigned char)s[i])) break; // isspace() needs an unsigned char value
	s[i] = 0;
	// bounded write; the cast makes the argument match "%d" whatever the
	// declared type of n_seg is
	snprintf(buf, sizeof(buf), "s%d", (int)(g->n_seg + 1));
	i = gfa_add_seg(g, buf);
	seg = &g->seg[i];
	seg->snid = gfa_sseq_add(g, s + 1);
	seg->soff = seg->rank = 0;
	return seg;
}
|
281
|
+
|
282
|
+
/**
 * Attach the accumulated FASTA sequence to a segment and refresh the
 * stable-sequence bookkeeping via gfa_sseq_update().
 *
 * gfa_read() restarts its accumulation buffer by resetting only the length,
 * so for a record without sequence lines `seq` is either NULL (nothing was
 * ever appended) or still holds the previous record's text. Both cases are
 * stored as an empty sequence.
 *
 * @param g      graph to update
 * @param seg    segment to fill; NULL makes the call a no-op
 * @param l_seq  number of valid characters in seq
 * @param seq    sequence text; only consulted when l_seq is non-zero
 */
static void gfa_update_fa_seq(gfa_t *g, gfa_seg_t *seg, int32_t l_seq, const char *seq)
{
	if (seg == 0) return;
	if (seq == 0) l_seq = 0;
	if (l_seq == 0) seq = ""; // ignore stale buffer contents
	seg->seq = gfa_strdup(seq);
	seg->len = l_seq;
	gfa_sseq_update(g, seg);
}
|
289
|
+
|
290
|
+
/****************
|
291
|
+
* User-end I/O *
|
292
|
+
****************/
|
293
|
+
|
294
|
+
gfa_t *gfa_read(const char *fn)
|
295
|
+
{
|
296
|
+
gzFile fp;
|
297
|
+
gfa_t *g;
|
298
|
+
kstring_t s = {0,0,0}, fa_seq = {0,0,0};
|
299
|
+
kstream_t *ks;
|
300
|
+
int dret, is_fa = 0;
|
301
|
+
gfa_seg_t *fa_seg = 0;
|
302
|
+
uint64_t lineno = 0;
|
303
|
+
|
304
|
+
fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(0, "r");
|
305
|
+
if (fp == 0) return 0;
|
306
|
+
ks = ks_init(fp);
|
307
|
+
g = gfa_init();
|
308
|
+
while (ks_getuntil(ks, KS_SEP_LINE, &s, &dret) >= 0) {
|
309
|
+
int ret = 0;
|
310
|
+
++lineno;
|
311
|
+
if (s.l > 0 && s.s[0] == '>') { // FASTA header
|
312
|
+
is_fa = 1;
|
313
|
+
if (fa_seg) gfa_update_fa_seq(g, fa_seg, fa_seq.l, fa_seq.s);
|
314
|
+
fa_seg = gfa_parse_fa_hdr(g, s.s);
|
315
|
+
fa_seq.l = 0;
|
316
|
+
} else if (is_fa) { // FASTA mode
|
317
|
+
if (s.l >= 3 && s.s[1] == '\t') { // likely a GFA line
|
318
|
+
gfa_update_fa_seq(g, fa_seg, fa_seq.l, fa_seq.s); // finalize fa_seg
|
319
|
+
fa_seg = 0;
|
320
|
+
is_fa = 0;
|
321
|
+
} else kputsn(s.s, s.l, &fa_seq); // likely a FASTA sequence line
|
322
|
+
}
|
323
|
+
if (is_fa) continue;
|
324
|
+
if (s.l < 3 || s.s[1] != '\t') continue; // empty line
|
325
|
+
if (s.s[0] == 'S') ret = gfa_parse_S(g, s.s);
|
326
|
+
else if (s.s[0] == 'L') ret = gfa_parse_L(g, s.s);
|
327
|
+
if (ret < 0 && gfa_verbose >= 1)
|
328
|
+
fprintf(stderr, "[E] invalid %c-line at line %ld (error code %d)\n", s.s[0], (long)lineno, ret);
|
329
|
+
}
|
330
|
+
if (is_fa && fa_seg) gfa_update_fa_seq(g, fa_seg, fa_seq.l, fa_seq.s);
|
331
|
+
free(fa_seq.s);
|
332
|
+
free(s.s);
|
333
|
+
gfa_finalize(g);
|
334
|
+
ks_destroy(ks);
|
335
|
+
gzclose(fp);
|
336
|
+
return g;
|
337
|
+
}
|
338
|
+
|
339
|
+
void gfa_print(const gfa_t *g, FILE *fp, int flag)
|
340
|
+
{
|
341
|
+
uint32_t i;
|
342
|
+
uint64_t k;
|
343
|
+
for (i = 0; i < g->n_seg; ++i) {
|
344
|
+
const gfa_seg_t *s = &g->seg[i];
|
345
|
+
if (s->del) continue;
|
346
|
+
fprintf(fp, "S\t%s\t", s->name);
|
347
|
+
if (s->seq && !(flag & GFA_O_NO_SEQ)) fputs(s->seq, fp);
|
348
|
+
else fputc('*', fp);
|
349
|
+
fprintf(fp, "\tLN:i:%d", s->len);
|
350
|
+
if (s->snid >= 0 && s->soff >= 0)
|
351
|
+
fprintf(fp, "\tSN:Z:%s\tSO:i:%d", g->sseq[s->snid].name, s->soff);
|
352
|
+
if (s->rank >= 0)
|
353
|
+
fprintf(fp, "\tSR:i:%d", s->rank);
|
354
|
+
if (s->utg && s->utg->n) fprintf(fp, "\tRC:i:%d\tlc:i:%d", s->utg->n, s->utg->len_comp);
|
355
|
+
if (s->aux.l_aux > 0) {
|
356
|
+
char *t = 0;
|
357
|
+
int max = 0;
|
358
|
+
gfa_aux_format(s->aux.l_aux, s->aux.aux, &t, &max);
|
359
|
+
fputs(t, fp);
|
360
|
+
free(t);
|
361
|
+
}
|
362
|
+
fputc('\n', fp);
|
363
|
+
if (s->utg && s->utg->n) {
|
364
|
+
uint32_t j, l;
|
365
|
+
for (j = l = 0; j < s->utg->n; ++j) {
|
366
|
+
const gfa_utg_t *u = s->utg;
|
367
|
+
fprintf(fp, "A\t%s\t%d\t%c\t%s\t%d\t%d\n", s->name, l, "+-"[u->a[j]>>32&1], u->name[j], (int32_t)(u->r[j]>>32), (int32_t)u->r[j]);
|
368
|
+
l += (uint32_t)u->a[j];
|
369
|
+
}
|
370
|
+
}
|
371
|
+
}
|
372
|
+
for (k = 0; k < g->n_arc; ++k) {
|
373
|
+
const gfa_arc_t *a = &g->arc[k];
|
374
|
+
const gfa_aux_t *aux = a->link_id < g->n_arc? &g->link_aux[a->link_id] : 0;
|
375
|
+
if (a->del || a->comp) continue;
|
376
|
+
fprintf(fp, "L\t%s\t%c\t%s\t%c", g->seg[a->v_lv>>33].name, "+-"[a->v_lv>>32&1], g->seg[a->w>>1].name, "+-"[a->w&1]);
|
377
|
+
if (!(flag & GFA_O_OV_EXT)) {
|
378
|
+
fprintf(fp, "\t%dM", a->ov < a->ow? a->ov : a->ow);
|
379
|
+
} else {
|
380
|
+
if (a->ov == a->ow) fprintf(fp, "\t%dM", a->ov);
|
381
|
+
else fprintf(fp, "\t%d:%d", a->ov, a->ow);
|
382
|
+
}
|
383
|
+
if (a->rank >= 0) fprintf(fp, "\tSR:i:%d", a->rank);
|
384
|
+
fprintf(fp, "\tL1:i:%d", gfa_arc_len(*a));
|
385
|
+
fprintf(fp, "\tL2:i:%d", gfa_arc_lw(g, *a));
|
386
|
+
if (aux && aux->l_aux) {
|
387
|
+
char *t = 0;
|
388
|
+
int max = 0;
|
389
|
+
gfa_aux_format(aux->l_aux, aux->aux, &t, &max);
|
390
|
+
if (t) fputs(t, fp);
|
391
|
+
free(t);
|
392
|
+
}
|
393
|
+
fputc('\n', fp);
|
394
|
+
}
|
395
|
+
}
|
@@ -0,0 +1,154 @@
|
|
1
|
+
#ifndef __GFA_PRIV_H__
|
2
|
+
#define __GFA_PRIV_H__
|
3
|
+
|
4
|
+
#include <stdlib.h>
|
5
|
+
#include "gfa.h"
|
6
|
+
|
7
|
+
/*
 * Typed allocation helpers. `len` is a number of elements of *ptr, not a byte
 * count; the result is assigned back to `ptr`. Return values are not checked.
 */
#define GFA_MALLOC(ptr, len) ((ptr) = (__typeof__(ptr))malloc(sizeof(*(ptr)) * (len)))
#define GFA_CALLOC(ptr, len) ((ptr) = (__typeof__(ptr))calloc((len), sizeof(*(ptr))))
#define GFA_REALLOC(ptr, len) ((ptr) = (__typeof__(ptr))realloc((ptr), sizeof(*(ptr)) * (len)))
/* Zero the first `len` elements of ptr. */
#define GFA_BZERO(ptr, len) memset((ptr), 0, sizeof(*(ptr)) * (len))
/* Grow array `a` with capacity `m`: 16 elements initially, then by half each time. */
#define GFA_EXPAND(a, m) do { \
		if ((m) == 0) (m) = 16; \
		else (m) += (m) >> 1; \
		GFA_REALLOC((a), (m)); \
	} while (0)
|
15
|
+
|
16
|
+
/* A pair of 64-bit unsigned integers. */
typedef struct {
	uint64_t x;
	uint64_t y;
} gfa128_t;
|
17
|
+
|
18
|
+
// linearized subgraphs
|
19
|
+
|
20
|
+
/* One vertex record of a linearized subgraph (element type of gfa_sub_t::v). */
typedef struct {
	uint32_t v;
	uint32_t d;
	int32_t off;
	int32_t n;
} gfa_subv_t;
|
24
|
+
|
25
|
+
typedef struct {
|
26
|
+
int32_t n_v, n_a, is_dag;
|
27
|
+
gfa_subv_t *v;
|
28
|
+
uint64_t *a; // high 32 bits: point to the neighbor; low 32 bit: arc index in the graph
|
29
|
+
void *km;
|
30
|
+
} gfa_sub_t;
|
31
|
+
|
32
|
+
/* Description of one bubble (see gfa_bubble() and gfa_query_by_reg()). */
typedef struct {
	int32_t snid;
	int32_t ss;
	int32_t se;
	uint32_t vs;
	uint32_t ve;
	int32_t is_bidir;
	int32_t n_seg;
	int32_t len_max;
	int32_t len_min;
	uint32_t *v;
	uint32_t n_paths;
	/* seq_max and seq_min point to v[] */
	char *seq_max;
	char *seq_min;
} gfa_bubble_t;
|
39
|
+
|
40
|
+
struct gfa_scbuf_s;
|
41
|
+
typedef struct gfa_scbuf_s gfa_scbuf_t;
|
42
|
+
|
43
|
+
#ifdef __cplusplus
|
44
|
+
extern "C" {
|
45
|
+
#endif
|
46
|
+
|
47
|
+
char *gfa_strdup(const char *src);
|
48
|
+
char *gfa_strndup(const char *src, size_t n);
|
49
|
+
void radix_sort_gfa64(uint64_t *st, uint64_t *en);
|
50
|
+
|
51
|
+
// add/delete one segment/arc/stable sequence
|
52
|
+
int32_t gfa_add_seg(gfa_t *g, const char *name);
|
53
|
+
gfa_arc_t *gfa_add_arc1(gfa_t *g, uint32_t v, uint32_t w, int32_t ov, int32_t ow, int64_t link_id, int comp);
|
54
|
+
int32_t gfa_sseq_get(const gfa_t *g, const char *sname);
|
55
|
+
int32_t gfa_sseq_add(gfa_t *g, const char *sname);
|
56
|
+
void gfa_sseq_update(gfa_t *g, const gfa_seg_t *s);
|
57
|
+
|
58
|
+
// whole graph operations
|
59
|
+
void gfa_arc_sort(gfa_t *g);
|
60
|
+
void gfa_arc_index(gfa_t *g);
|
61
|
+
uint32_t gfa_fix_symm_add(gfa_t *g);
|
62
|
+
void gfa_fix_symm_del(gfa_t *g); // delete multiple edges and restore skew-symmetry
|
63
|
+
void gfa_arc_rm(gfa_t *g);
|
64
|
+
void gfa_cleanup(gfa_t *g); // permanently delete arcs marked as deleted, sort and then index
|
65
|
+
void gfa_finalize(gfa_t *g);
|
66
|
+
int32_t gfa_check_multi(const gfa_t *g);
|
67
|
+
uint32_t gfa_fix_multi(gfa_t *g);
|
68
|
+
|
69
|
+
int gfa_arc_del_multi_risky(gfa_t *g);
|
70
|
+
int gfa_arc_del_asymm_risky(gfa_t *g);
|
71
|
+
|
72
|
+
// edit distance
|
73
|
+
/* Options for the graph edit-distance routines (initialized by gfa_edopt_init()). */
typedef struct {
	int32_t traceback;
	int32_t bw_dyn;
	int32_t max_lag;
	int32_t max_chk;
	int32_t s_term;
	int64_t i_term;
} gfa_edopt_t;
|
79
|
+
|
80
|
+
/* Result record filled in by gfa_ed_step() and gfa_edit_dist(). */
typedef struct {
	int32_t s;
	uint32_t end_v;
	int32_t end_off, wlen; /* wlen: length of walk */
	int32_t n_end, nv;
	int64_t n_iter;
	int32_t *v;
} gfa_edrst_t;
|
90
|
+
|
91
|
+
void gfa_edopt_init(gfa_edopt_t *opt);
|
92
|
+
void *gfa_ed_init(void *km, const gfa_edopt_t *opt, const gfa_t *g, const gfa_edseq_t *es, int32_t ql, const char *q, uint32_t v0, int32_t off0);
|
93
|
+
void gfa_ed_step(void *z_, uint32_t v1, int32_t off1, int32_t s_term, gfa_edrst_t *r);
|
94
|
+
void gfa_ed_destroy(void *z_);
|
95
|
+
|
96
|
+
int32_t gfa_edit_dist(void *km, const gfa_edopt_t *opt, const gfa_t *g, const gfa_edseq_t *es, int32_t ql, const char *q, uint32_t v0, int32_t off0, gfa_edrst_t *rst);
|
97
|
+
|
98
|
+
// assembly related routines
|
99
|
+
int gfa_arc_del_trans(gfa_t *g, int fuzz); // transitive reduction
|
100
|
+
int gfa_arc_del_weak(gfa_t *g);
|
101
|
+
int gfa_arc_pair_strong(gfa_t *g);
|
102
|
+
int gfa_arc_del_short(gfa_t *g, int min_ovlp_len, float drop_ratio); // delete short arcs
|
103
|
+
int gfa_drop_tip(gfa_t *g, int tip_cnt, int tip_len); // cut tips
|
104
|
+
int gfa_drop_internal(gfa_t *g, int max_ext);
|
105
|
+
int gfa_cut_z(gfa_t *g, int32_t min_dist, int32_t max_dist);
|
106
|
+
int gfa_topocut(gfa_t *g, float drop_ratio, int32_t tip_cnt, int32_t tip_len);
|
107
|
+
int gfa_bub_simple(gfa_t *g, int min_side, int max_side);
|
108
|
+
int gfa_pop_bubble(gfa_t *g, int radius, int max_del, int protect_tip); // bubble popping
|
109
|
+
gfa_t *gfa_ug_gen(const gfa_t *g);
|
110
|
+
void gfa_scc_all(const gfa_t *g);
|
111
|
+
|
112
|
+
// subset, modifying the graph
|
113
|
+
void gfa_sub(gfa_t *g, int n, char *const* seg, int step);
|
114
|
+
char **gfa_query_by_reg(const gfa_t *g, int32_t n_bb, const gfa_bubble_t *bb, const char *reg, int *n_seg);
|
115
|
+
|
116
|
+
// subset, without modifying the graph
|
117
|
+
gfa_sub_t *gfa_sub_from(void *km0, const gfa_t *g, uint32_t v0, int32_t max_dist);
|
118
|
+
void gfa_sub_destroy(gfa_sub_t *sub);
|
119
|
+
void gfa_sub_print(FILE *fp, const gfa_t *g, const gfa_sub_t *sub);
|
120
|
+
|
121
|
+
gfa_scbuf_t *gfa_scbuf_init(const gfa_t *g);
|
122
|
+
gfa_sub_t *gfa_scc1(void *km0, const gfa_t *g, gfa_scbuf_t *b, uint32_t v0);
|
123
|
+
void gfa_scbuf_destroy(gfa_scbuf_t *b);
|
124
|
+
|
125
|
+
// graph augmentation
|
126
|
+
int gfa_ins_adj(const gfa_t *g, int min_len, gfa_ins_t *ins, const char *seq);
|
127
|
+
int32_t gfa_ins_filter(const gfa_t *g, int32_t n_ins, gfa_ins_t *ins);
|
128
|
+
void gfa_augment(gfa_t *g, int32_t n_ins, const gfa_ins_t *ins, int32_t n_ctg, const char *const* name, const char *const* seq);
|
129
|
+
|
130
|
+
gfa_sfa_t *gfa_gfa2sfa(const gfa_t *g, int32_t *n_sfa_, int32_t write_seq);
|
131
|
+
|
132
|
+
void gfa_sort_ref_arc(gfa_t *g);
|
133
|
+
gfa_bubble_t *gfa_bubble(const gfa_t *g, int32_t *n_); // FIXME: doesn't work with translocation
|
134
|
+
|
135
|
+
void gfa_gt_simple_print(const gfa_t *g, float min_dc, int32_t is_path); // FIXME: doesn't work with translocations
|
136
|
+
|
137
|
+
void gfa_aux_update_cv(gfa_t *g, const char *tag, const double *cov_seg, const double *cov_link);
|
138
|
+
|
139
|
+
void gfa_sql_write(FILE *fp, const gfa_t *g, int write_seq);
|
140
|
+
|
141
|
+
static inline int64_t gfa_find_arc(const gfa_t *g, uint32_t v, uint32_t w)
|
142
|
+
{
|
143
|
+
uint32_t i, nv = gfa_arc_n(g, v), nw = 0, k = (uint32_t)-1;
|
144
|
+
gfa_arc_t *av = gfa_arc_a(g, v);
|
145
|
+
for (i = 0; i < nv; ++i)
|
146
|
+
if (av[i].w == w) ++nw, k = i;
|
147
|
+
return nw == 1? (int64_t)(&av[k] - g->arc) : nw == 0? -1 : -2;
|
148
|
+
}
|
149
|
+
|
150
|
+
#ifdef __cplusplus
|
151
|
+
}
|
152
|
+
#endif
|
153
|
+
|
154
|
+
#endif // ~__GFA_PRIV_H__
|