ruby-minigraph 0.0.20.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +62 -0
  4. data/ext/Rakefile +56 -0
  5. data/ext/cmappy/cmappy.c +7 -0
  6. data/ext/cmappy/cmappy.h +8 -0
  7. data/ext/minigraph/LICENSE.txt +23 -0
  8. data/ext/minigraph/Makefile +66 -0
  9. data/ext/minigraph/NEWS.md +317 -0
  10. data/ext/minigraph/README.md +207 -0
  11. data/ext/minigraph/algo.c +194 -0
  12. data/ext/minigraph/algo.h +33 -0
  13. data/ext/minigraph/asm-call.c +147 -0
  14. data/ext/minigraph/bseq.c +133 -0
  15. data/ext/minigraph/bseq.h +76 -0
  16. data/ext/minigraph/cal_cov.c +139 -0
  17. data/ext/minigraph/doc/example1.png +0 -0
  18. data/ext/minigraph/doc/example2.png +0 -0
  19. data/ext/minigraph/doc/examples.graffle +0 -0
  20. data/ext/minigraph/format.c +241 -0
  21. data/ext/minigraph/galign.c +140 -0
  22. data/ext/minigraph/gchain1.c +532 -0
  23. data/ext/minigraph/gcmisc.c +223 -0
  24. data/ext/minigraph/gfa-aug.c +260 -0
  25. data/ext/minigraph/gfa-base.c +526 -0
  26. data/ext/minigraph/gfa-bbl.c +372 -0
  27. data/ext/minigraph/gfa-ed.c +617 -0
  28. data/ext/minigraph/gfa-io.c +395 -0
  29. data/ext/minigraph/gfa-priv.h +154 -0
  30. data/ext/minigraph/gfa.h +166 -0
  31. data/ext/minigraph/ggen.c +182 -0
  32. data/ext/minigraph/ggen.h +21 -0
  33. data/ext/minigraph/ggsimple.c +570 -0
  34. data/ext/minigraph/gmap.c +211 -0
  35. data/ext/minigraph/index.c +230 -0
  36. data/ext/minigraph/kalloc.c +224 -0
  37. data/ext/minigraph/kalloc.h +82 -0
  38. data/ext/minigraph/kavl.h +414 -0
  39. data/ext/minigraph/kdq.h +134 -0
  40. data/ext/minigraph/ketopt.h +116 -0
  41. data/ext/minigraph/khashl.h +348 -0
  42. data/ext/minigraph/krmq.h +474 -0
  43. data/ext/minigraph/kseq.h +256 -0
  44. data/ext/minigraph/ksort.h +164 -0
  45. data/ext/minigraph/kstring.h +165 -0
  46. data/ext/minigraph/kthread.c +159 -0
  47. data/ext/minigraph/kthread.h +15 -0
  48. data/ext/minigraph/kvec-km.h +105 -0
  49. data/ext/minigraph/kvec.h +110 -0
  50. data/ext/minigraph/lchain.c +441 -0
  51. data/ext/minigraph/main.c +301 -0
  52. data/ext/minigraph/map-algo.c +500 -0
  53. data/ext/minigraph/mgpriv.h +128 -0
  54. data/ext/minigraph/minigraph.1 +359 -0
  55. data/ext/minigraph/minigraph.h +176 -0
  56. data/ext/minigraph/miniwfa.c +834 -0
  57. data/ext/minigraph/miniwfa.h +95 -0
  58. data/ext/minigraph/misc/mgutils.js +1451 -0
  59. data/ext/minigraph/misc.c +12 -0
  60. data/ext/minigraph/options.c +134 -0
  61. data/ext/minigraph/shortk.c +251 -0
  62. data/ext/minigraph/sketch.c +109 -0
  63. data/ext/minigraph/sys.c +147 -0
  64. data/ext/minigraph/sys.h +20 -0
  65. data/ext/minigraph/test/MT-chimp.fa +277 -0
  66. data/ext/minigraph/test/MT-human.fa +239 -0
  67. data/ext/minigraph/test/MT-orangA.fa +276 -0
  68. data/ext/minigraph/test/MT.gfa +19 -0
  69. data/ext/minigraph/tex/Makefile +13 -0
  70. data/ext/minigraph/tex/minigraph.bib +676 -0
  71. data/ext/minigraph/tex/minigraph.tex +986 -0
  72. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
  73. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
  74. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
  75. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
  76. data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
  77. data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
  78. data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
  79. data/ext/minigraph/tex/plots/bedutils.js +367 -0
  80. data/ext/minigraph/tex/plots/chr-plot.js +130 -0
  81. data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
  82. data/ext/minigraph.patch +21 -0
  83. data/lib/minigraph/ffi/constants.rb +230 -0
  84. data/lib/minigraph/ffi/functions.rb +70 -0
  85. data/lib/minigraph/ffi/mappy.rb +8 -0
  86. data/lib/minigraph/ffi.rb +27 -0
  87. data/lib/minigraph/version.rb +5 -0
  88. data/lib/minigraph.rb +72 -0
  89. metadata +159 -0
@@ -0,0 +1,395 @@
1
+ #include <zlib.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include <ctype.h>
5
+ #include <string.h>
6
+ #include "kstring.h"
7
+ #include "gfa-priv.h"
8
+
9
+ #include "kseq.h"
10
+ KSTREAM_INIT(gzFile, gzread, 65536)
11
+
12
+ /***********
13
+ * Tag I/O *
14
+ ***********/
15
+
16
+ int gfa_aux_parse(char *s, uint8_t **data, int *max)
17
+ {
18
+ char *q, *p;
19
+ kstring_t str;
20
+ if (s == 0) return 0;
21
+ str.l = 0, str.m = *max, str.s = (char*)*data;
22
+ if (*s == '\t') ++s;
23
+ for (p = q = s;; ++p) {
24
+ if (*p == 0 || *p == '\t') {
25
+ int c = *p;
26
+ *p = 0;
27
+ if (p - q >= 5 && q[2] == ':' && q[4] == ':' && (q[3] == 'A' || q[3] == 'i' || q[3] == 'f' || q[3] == 'Z' || q[3] == 'B')) {
28
+ int type = q[3];
29
+ kputsn_(q, 2, &str);
30
+ q += 5;
31
+ if (type == 'A') {
32
+ kputc_('A', &str);
33
+ kputc_(*q, &str);
34
+ } else if (type == 'i') {
35
+ int32_t x;
36
+ x = strtol(q, &q, 10);
37
+ kputc_(type, &str); kputsn_((char*)&x, 4, &str);
38
+ } else if (type == 'f') {
39
+ float x;
40
+ x = strtod(q, &q);
41
+ kputc_('f', &str); kputsn_(&x, 4, &str);
42
+ } else if (type == 'Z') {
43
+ kputc_('Z', &str); kputsn_(q, p - q + 1, &str); // note that this include the trailing NULL
44
+ } else if (type == 'B') {
45
+ type = *q++; // q points to the first ',' following the typing byte
46
+ if (p - q >= 2 && (type == 'c' || type == 'C' || type == 's' || type == 'S' || type == 'i' || type == 'I' || type != 'f')) {
47
+ int32_t n;
48
+ char *r;
49
+ for (r = q, n = 0; *r; ++r)
50
+ if (*r == ',') ++n;
51
+ kputc_('B', &str); kputc_(type, &str); kputsn_(&n, 4, &str);
52
+ // TODO: to evaluate which is faster: a) aligned array and then memmove(); b) unaligned array; c) kputsn_()
53
+ if (type == 'c') while (q + 1 < p) { int8_t x = strtol(q + 1, &q, 0); kputc_(x, &str); }
54
+ else if (type == 'C') while (q + 1 < p) { uint8_t x = strtol(q + 1, &q, 0); kputc_(x, &str); }
55
+ else if (type == 's') while (q + 1 < p) { int16_t x = strtol(q + 1, &q, 0); kputsn_(&x, 2, &str); }
56
+ else if (type == 'S') while (q + 1 < p) { uint16_t x = strtol(q + 1, &q, 0); kputsn_(&x, 2, &str); }
57
+ else if (type == 'i') while (q + 1 < p) { int32_t x = strtol(q + 1, &q, 0); kputsn_(&x, 4, &str); }
58
+ else if (type == 'I') while (q + 1 < p) { uint32_t x = strtol(q + 1, &q, 0); kputsn_(&x, 4, &str); }
59
+ else if (type == 'f') while (q + 1 < p) { float x = strtod(q + 1, &q); kputsn_(&x, 4, &str); }
60
+ }
61
+ } // should not be here, as we have tested all types
62
+ }
63
+ q = p + 1;
64
+ if (c == 0) break;
65
+ }
66
+ }
67
+ if (str.l > 0 && str.l == str.m) ks_resize(&str, str.l + 1);
68
+ if (str.s) str.s[str.l] = 0;
69
+ *max = str.m, *data = (uint8_t*)str.s;
70
+ return str.l;
71
+ }
72
+
73
+ int gfa_aux_format(int l_aux, const uint8_t *aux, char **t, int *max)
74
+ {
75
+ kstring_t str;
76
+ const uint8_t *s = aux;
77
+ str.l = 0, str.s = *t, str.m = *max;
78
+ while (s < aux + l_aux) {
79
+ uint8_t type, key[2];
80
+ key[0] = s[0]; key[1] = s[1];
81
+ s += 2; type = *s++;
82
+ kputc('\t', &str); kputsn((char*)key, 2, &str); kputc(':', &str);
83
+ if (type == 'A') { kputsn("A:", 2, &str); kputc(*s, &str); ++s; }
84
+ else if (type == 'i') { kputsn("i:", 2, &str); kputw(*(int32_t*)s, &str); s += 4; }
85
+ else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; }
86
+ else if (type == 'Z') { kputc(type, &str); kputc(':', &str); while (*s) kputc(*s++, &str); ++s; }
87
+ else if (type == 'B') {
88
+ uint8_t sub_type = *(s++);
89
+ int32_t i, n;
90
+ memcpy(&n, s, 4);
91
+ s += 4; // no point to the start of the array
92
+ kputsn("B:", 2, &str); kputc(sub_type, &str); // write the typing
93
+ for (i = 0; i < n; ++i) { // FIXME: for better performance, put the loop after "if"
94
+ kputc(',', &str);
95
+ if ('c' == sub_type) { kputw(*(int8_t*)s, &str); ++s; }
96
+ else if ('C' == sub_type) { kputw(*(uint8_t*)s, &str); ++s; }
97
+ else if ('s' == sub_type) { kputw(*(int16_t*)s, &str); s += 2; }
98
+ else if ('S' == sub_type) { kputw(*(uint16_t*)s, &str); s += 2; }
99
+ else if ('i' == sub_type) { kputw(*(int32_t*)s, &str); s += 4; }
100
+ else if ('I' == sub_type) { kputuw(*(uint32_t*)s, &str); s += 4; }
101
+ else if ('f' == sub_type) { ksprintf(&str, "%g", *(float*)s); s += 4; }
102
+ }
103
+ }
104
+ }
105
+ *t = str.s, *max = str.m;
106
+ return str.l;
107
+ }
108
+
109
+ /****************
110
+ * Line parsers *
111
+ ****************/
112
+
113
+ int gfa_parse_S(gfa_t *g, char *s)
114
+ {
115
+ int i, is_ok = 0;
116
+ char *p, *q, *seg = 0, *seq = 0, *rest = 0;
117
+ uint32_t sid, len = 0;
118
+ for (i = 0, p = q = s + 2;; ++p) {
119
+ if (*p == 0 || *p == '\t') {
120
+ int c = *p;
121
+ *p = 0;
122
+ if (i == 0) seg = q;
123
+ else if (i == 1) {
124
+ seq = q[0] == '*'? 0 : gfa_strdup(q);
125
+ is_ok = 1, rest = c? p + 1 : 0;
126
+ break;
127
+ }
128
+ ++i, q = p + 1;
129
+ if (c == 0) break;
130
+ }
131
+ }
132
+ if (is_ok) { // all mandatory fields read
133
+ int l_aux, m_aux = 0, LN = -1;
134
+ uint8_t *aux = 0, *s_LN = 0;
135
+ gfa_seg_t *s;
136
+ l_aux = gfa_aux_parse(rest, &aux, &m_aux); // parse optional tags
137
+ s_LN = l_aux? gfa_aux_get(l_aux, aux, "LN") : 0;
138
+ if (s_LN && s_LN[0] == 'i') {
139
+ LN = *(int32_t*)(s_LN + 1);
140
+ l_aux = gfa_aux_del(l_aux, aux, s_LN);
141
+ }
142
+ if (seq == 0) {
143
+ if (LN >= 0) len = LN;
144
+ } else len = strlen(seq);
145
+ if (LN >= 0 && len != LN && gfa_verbose >= 2)
146
+ fprintf(stderr, "[W] for segment '%s', LN:i:%d tag is different from sequence length %d\n", seg, LN, len);
147
+ sid = gfa_add_seg(g, seg);
148
+ s = &g->seg[sid];
149
+ s->len = len, s->seq = seq;
150
+ if (l_aux > 0) {
151
+ uint8_t *s_SN = 0, *s_SO = 0, *s_SR = 0;
152
+ s_SN = gfa_aux_get(l_aux, aux, "SN");
153
+ if (s_SN && *s_SN == 'Z') { // then parse stable tags
154
+ s->snid = gfa_sseq_add(g, (char*)(s_SN + 1)), s->soff = 0;
155
+ l_aux = gfa_aux_del(l_aux, aux, s_SN);
156
+ s_SO = gfa_aux_get(l_aux, aux, "SO");
157
+ if (s_SO && *s_SO == 'i') {
158
+ s->soff = *(int32_t*)(s_SO + 1);
159
+ l_aux = gfa_aux_del(l_aux, aux, s_SO);
160
+ }
161
+ }
162
+ s_SR = gfa_aux_get(l_aux, aux, "SR");
163
+ if (s_SR && *s_SR == 'i') {
164
+ s->rank = *(int32_t*)(s_SR + 1);
165
+ if (s->rank > g->max_rank) g->max_rank = s->rank;
166
+ l_aux = gfa_aux_del(l_aux, aux, s_SR);
167
+ }
168
+ gfa_sseq_update(g, s);
169
+ }
170
+ if (l_aux > 0)
171
+ s->aux.m_aux = m_aux, s->aux.l_aux = l_aux, s->aux.aux = aux;
172
+ else if (aux)
173
+ free(aux);
174
+ } else return -1;
175
+ return 0;
176
+ }
177
+
178
+ int gfa_parse_L(gfa_t *g, char *s)
179
+ {
180
+ int i, oriv = -1, oriw = -1, is_ok = 0;
181
+ char *p, *q, *segv = 0, *segw = 0, *rest = 0;
182
+ int32_t ov = INT32_MAX, ow = INT32_MAX;
183
+ for (i = 0, p = q = s + 2;; ++p) {
184
+ if (*p == 0 || *p == '\t') {
185
+ int c = *p;
186
+ *p = 0;
187
+ if (i == 0) {
188
+ segv = q;
189
+ } else if (i == 1) {
190
+ if (*q != '+' && *q != '-') return -2;
191
+ oriv = (*q != '+');
192
+ } else if (i == 2) {
193
+ segw = q;
194
+ } else if (i == 3) {
195
+ if (*q != '+' && *q != '-') return -2;
196
+ oriw = (*q != '+');
197
+ } else if (i == 4) {
198
+ if (*q == '*') {
199
+ ov = ow = 0;
200
+ } else if (*q == ':') {
201
+ ov = INT32_MAX;
202
+ ow = isdigit(*(q+1))? strtol(q+1, &q, 10) : INT32_MAX;
203
+ } else if (isdigit(*q)) {
204
+ char *r;
205
+ ov = strtol(q, &r, 10);
206
+ if (isupper(*r)) { // CIGAR
207
+ ov = ow = 0;
208
+ do {
209
+ long l;
210
+ l = strtol(q, &q, 10);
211
+ if (*q == 'M' || *q == 'D' || *q == 'N') ov += l;
212
+ if (*q == 'M' || *q == 'I' || *q == 'S') ow += l;
213
+ ++q;
214
+ } while (isdigit(*q));
215
+ } else if (*r == ':') { // overlap lengths
216
+ ow = isdigit(*(r+1))? strtol(r+1, &r, 10) : INT32_MAX;
217
+ } else break;
218
+ } else break;
219
+ is_ok = 1, rest = c? p + 1 : 0;
220
+ break;
221
+ }
222
+ ++i, q = p + 1;
223
+ if (c == 0) break;
224
+ }
225
+ }
226
+ if (i == 4 && is_ok == 0) ov = ow = 0, is_ok = 1; // no overlap field
227
+ if (is_ok) {
228
+ uint32_t v, w;
229
+ int l_aux, m_aux = 0;
230
+ uint8_t *aux = 0;
231
+ gfa_arc_t *arc;
232
+ v = gfa_add_seg(g, segv) << 1 | oriv;
233
+ w = gfa_add_seg(g, segw) << 1 | oriw;
234
+ arc = gfa_add_arc1(g, v, w, ov, ow, -1, 0);
235
+ l_aux = gfa_aux_parse(rest, &aux, &m_aux); // parse optional tags
236
+ if (l_aux) {
237
+ gfa_aux_t *a = &g->link_aux[arc->link_id];
238
+ uint8_t *s_L1, *s_L2, *s_SR;
239
+ a->l_aux = l_aux, a->m_aux = m_aux, a->aux = aux;
240
+ s_SR = gfa_aux_get(a->l_aux, a->aux, "SR");
241
+ if (s_SR && s_SR[0] == 'i') {
242
+ arc->rank = *(int32_t*)(s_SR+1);
243
+ a->l_aux = gfa_aux_del(a->l_aux, a->aux, s_SR);
244
+ }
245
+ s_L1 = gfa_aux_get(a->l_aux, a->aux, "L1");
246
+ if (s_L1) {
247
+ if (ov != INT32_MAX && s_L1[0] == 'i')
248
+ g->seg[v>>1].len = g->seg[v>>1].len > ov + *(int32_t*)(s_L1+1)? g->seg[v>>1].len : ov + *(int32_t*)(s_L1+1);
249
+ a->l_aux = gfa_aux_del(a->l_aux, a->aux, s_L1);
250
+ }
251
+ s_L2 = gfa_aux_get(a->l_aux, a->aux, "L2");
252
+ if (s_L2) {
253
+ if (ow != INT32_MAX && s_L2[0] == 'i')
254
+ g->seg[w>>1].len = g->seg[w>>1].len > ow + *(int32_t*)(s_L2+1)? g->seg[w>>1].len : ow + *(int32_t*)(s_L2+1);
255
+ a->l_aux = gfa_aux_del(a->l_aux, a->aux, s_L2);
256
+ }
257
+ if (a->l_aux == 0) {
258
+ free(a->aux);
259
+ a->aux = 0, a->m_aux = 0;
260
+ }
261
+ }
262
+ } else return -1;
263
+ return 0;
264
+ }
265
+
266
+ static gfa_seg_t *gfa_parse_fa_hdr(gfa_t *g, char *s)
267
+ {
268
+ int32_t i;
269
+ char buf[16];
270
+ gfa_seg_t *seg;
271
+ for (i = 0; s[i]; ++i)
272
+ if (isspace(s[i])) break;
273
+ s[i] = 0;
274
+ sprintf(buf, "s%d", g->n_seg + 1);
275
+ i = gfa_add_seg(g, buf);
276
+ seg = &g->seg[i];
277
+ seg->snid = gfa_sseq_add(g, s + 1);
278
+ seg->soff = seg->rank = 0;
279
+ return seg;
280
+ }
281
+
282
+ static void gfa_update_fa_seq(gfa_t *g, gfa_seg_t *seg, int32_t l_seq, const char *seq)
283
+ {
284
+ if (seg == 0) return;
285
+ seg->seq = gfa_strdup(seq);
286
+ seg->len = l_seq;
287
+ gfa_sseq_update(g, seg);
288
+ }
289
+
290
+ /****************
291
+ * User-end I/O *
292
+ ****************/
293
+
294
+ gfa_t *gfa_read(const char *fn)
295
+ {
296
+ gzFile fp;
297
+ gfa_t *g;
298
+ kstring_t s = {0,0,0}, fa_seq = {0,0,0};
299
+ kstream_t *ks;
300
+ int dret, is_fa = 0;
301
+ gfa_seg_t *fa_seg = 0;
302
+ uint64_t lineno = 0;
303
+
304
+ fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(0, "r");
305
+ if (fp == 0) return 0;
306
+ ks = ks_init(fp);
307
+ g = gfa_init();
308
+ while (ks_getuntil(ks, KS_SEP_LINE, &s, &dret) >= 0) {
309
+ int ret = 0;
310
+ ++lineno;
311
+ if (s.l > 0 && s.s[0] == '>') { // FASTA header
312
+ is_fa = 1;
313
+ if (fa_seg) gfa_update_fa_seq(g, fa_seg, fa_seq.l, fa_seq.s);
314
+ fa_seg = gfa_parse_fa_hdr(g, s.s);
315
+ fa_seq.l = 0;
316
+ } else if (is_fa) { // FASTA mode
317
+ if (s.l >= 3 && s.s[1] == '\t') { // likely a GFA line
318
+ gfa_update_fa_seq(g, fa_seg, fa_seq.l, fa_seq.s); // finalize fa_seg
319
+ fa_seg = 0;
320
+ is_fa = 0;
321
+ } else kputsn(s.s, s.l, &fa_seq); // likely a FASTA sequence line
322
+ }
323
+ if (is_fa) continue;
324
+ if (s.l < 3 || s.s[1] != '\t') continue; // empty line
325
+ if (s.s[0] == 'S') ret = gfa_parse_S(g, s.s);
326
+ else if (s.s[0] == 'L') ret = gfa_parse_L(g, s.s);
327
+ if (ret < 0 && gfa_verbose >= 1)
328
+ fprintf(stderr, "[E] invalid %c-line at line %ld (error code %d)\n", s.s[0], (long)lineno, ret);
329
+ }
330
+ if (is_fa && fa_seg) gfa_update_fa_seq(g, fa_seg, fa_seq.l, fa_seq.s);
331
+ free(fa_seq.s);
332
+ free(s.s);
333
+ gfa_finalize(g);
334
+ ks_destroy(ks);
335
+ gzclose(fp);
336
+ return g;
337
+ }
338
+
339
+ void gfa_print(const gfa_t *g, FILE *fp, int flag)
340
+ {
341
+ uint32_t i;
342
+ uint64_t k;
343
+ for (i = 0; i < g->n_seg; ++i) {
344
+ const gfa_seg_t *s = &g->seg[i];
345
+ if (s->del) continue;
346
+ fprintf(fp, "S\t%s\t", s->name);
347
+ if (s->seq && !(flag & GFA_O_NO_SEQ)) fputs(s->seq, fp);
348
+ else fputc('*', fp);
349
+ fprintf(fp, "\tLN:i:%d", s->len);
350
+ if (s->snid >= 0 && s->soff >= 0)
351
+ fprintf(fp, "\tSN:Z:%s\tSO:i:%d", g->sseq[s->snid].name, s->soff);
352
+ if (s->rank >= 0)
353
+ fprintf(fp, "\tSR:i:%d", s->rank);
354
+ if (s->utg && s->utg->n) fprintf(fp, "\tRC:i:%d\tlc:i:%d", s->utg->n, s->utg->len_comp);
355
+ if (s->aux.l_aux > 0) {
356
+ char *t = 0;
357
+ int max = 0;
358
+ gfa_aux_format(s->aux.l_aux, s->aux.aux, &t, &max);
359
+ fputs(t, fp);
360
+ free(t);
361
+ }
362
+ fputc('\n', fp);
363
+ if (s->utg && s->utg->n) {
364
+ uint32_t j, l;
365
+ for (j = l = 0; j < s->utg->n; ++j) {
366
+ const gfa_utg_t *u = s->utg;
367
+ fprintf(fp, "A\t%s\t%d\t%c\t%s\t%d\t%d\n", s->name, l, "+-"[u->a[j]>>32&1], u->name[j], (int32_t)(u->r[j]>>32), (int32_t)u->r[j]);
368
+ l += (uint32_t)u->a[j];
369
+ }
370
+ }
371
+ }
372
+ for (k = 0; k < g->n_arc; ++k) {
373
+ const gfa_arc_t *a = &g->arc[k];
374
+ const gfa_aux_t *aux = a->link_id < g->n_arc? &g->link_aux[a->link_id] : 0;
375
+ if (a->del || a->comp) continue;
376
+ fprintf(fp, "L\t%s\t%c\t%s\t%c", g->seg[a->v_lv>>33].name, "+-"[a->v_lv>>32&1], g->seg[a->w>>1].name, "+-"[a->w&1]);
377
+ if (!(flag & GFA_O_OV_EXT)) {
378
+ fprintf(fp, "\t%dM", a->ov < a->ow? a->ov : a->ow);
379
+ } else {
380
+ if (a->ov == a->ow) fprintf(fp, "\t%dM", a->ov);
381
+ else fprintf(fp, "\t%d:%d", a->ov, a->ow);
382
+ }
383
+ if (a->rank >= 0) fprintf(fp, "\tSR:i:%d", a->rank);
384
+ fprintf(fp, "\tL1:i:%d", gfa_arc_len(*a));
385
+ fprintf(fp, "\tL2:i:%d", gfa_arc_lw(g, *a));
386
+ if (aux && aux->l_aux) {
387
+ char *t = 0;
388
+ int max = 0;
389
+ gfa_aux_format(aux->l_aux, aux->aux, &t, &max);
390
+ if (t) fputs(t, fp);
391
+ free(t);
392
+ }
393
+ fputc('\n', fp);
394
+ }
395
+ }
@@ -0,0 +1,154 @@
1
+ #ifndef __GFA_PRIV_H__
2
+ #define __GFA_PRIV_H__
3
+
4
+ #include <stdlib.h>
5
+ #include "gfa.h"
6
+
7
+ #define GFA_MALLOC(ptr, len) ((ptr) = (__typeof__(ptr))malloc((len) * sizeof(*(ptr)))) // ptr = uninitialized array of len elements of ptr's pointee type; the result is not checked
8
+ #define GFA_CALLOC(ptr, len) ((ptr) = (__typeof__(ptr))calloc((len), sizeof(*(ptr)))) // same as GFA_MALLOC, but zero-initialized
9
+ #define GFA_REALLOC(ptr, len) ((ptr) = (__typeof__(ptr))realloc((ptr), (len) * sizeof(*(ptr)))) // resize ptr to len elements; ptr is overwritten with realloc()'s result, so the old block is lost if realloc() fails
10
+ #define GFA_BZERO(ptr, len) memset((ptr), 0, (len) * sizeof(*(ptr))) // zero the first len elements of ptr; needs <string.h> at the point of use
11
+ #define GFA_EXPAND(a, m) do { /* grow capacity m by half (or to 16 if m is 0), then resize array a to m elements */ \
12
+ (m) = (m)? (m) + ((m)>>1) : 16; \
13
+ GFA_REALLOC((a), (m)); \
14
+ } while (0)
15
+
16
+ typedef struct { uint64_t x, y; } gfa128_t; // a pair of 64-bit unsigned integers
17
+
18
+ // linearized subgraphs
19
+
20
+ typedef struct { // one vertex record of a linearized subgraph; gfa_sub_t holds an array of these
21
+ uint32_t v, d; // NOTE(review): presumably the vertex id and its distance from the start vertex — confirm against gfa_sub_from()
22
+ int32_t off, n; // NOTE(review): presumably offset and count of this vertex's entries in gfa_sub_t::a — confirm
23
+ } gfa_subv_t;
24
+
25
+ typedef struct { // a linearized subgraph, as returned by gfa_sub_from() and gfa_scc1()
26
+ int32_t n_v, n_a, is_dag; // NOTE(review): presumably the sizes of v[] and a[], and a flag telling whether the subgraph is acyclic — confirm
27
+ gfa_subv_t *v;
28
+ uint64_t *a; // high 32 bits: points to the neighbor; low 32 bits: arc index in the graph
29
+ void *km; // NOTE(review): presumably the kalloc memory pool the arrays were allocated from — confirm
30
+ } gfa_sub_t;
31
+
32
+ typedef struct { // one bubble, as returned in an array by gfa_bubble()
33
+ int32_t snid, ss, se; // NOTE(review): presumably the stable sequence id and the start/end coordinates on it — confirm
34
+ uint32_t vs, ve; // NOTE(review): presumably the source and sink vertices of the bubble — confirm
35
+ int32_t is_bidir, n_seg, len_max, len_min;
36
+ uint32_t *v, n_paths;
37
+ char *seq_max, *seq_min; // seq_max and seq_min point into the memory block of v[] (they are not separate allocations — TODO confirm how they are freed)
38
+ } gfa_bubble_t;
39
+
40
+ struct gfa_scbuf_s;
41
+ typedef struct gfa_scbuf_s gfa_scbuf_t;
42
+
43
+ #ifdef __cplusplus
44
+ extern "C" {
45
+ #endif
46
+
47
+ char *gfa_strdup(const char *src);
48
+ char *gfa_strndup(const char *src, size_t n);
49
+ void radix_sort_gfa64(uint64_t *st, uint64_t *en);
50
+
51
+ // add/delete one segment/arc/stable sequence
52
+ int32_t gfa_add_seg(gfa_t *g, const char *name);
53
+ gfa_arc_t *gfa_add_arc1(gfa_t *g, uint32_t v, uint32_t w, int32_t ov, int32_t ow, int64_t link_id, int comp);
54
+ int32_t gfa_sseq_get(const gfa_t *g, const char *sname);
55
+ int32_t gfa_sseq_add(gfa_t *g, const char *sname);
56
+ void gfa_sseq_update(gfa_t *g, const gfa_seg_t *s);
57
+
58
+ // whole graph operations
59
+ void gfa_arc_sort(gfa_t *g);
60
+ void gfa_arc_index(gfa_t *g);
61
+ uint32_t gfa_fix_symm_add(gfa_t *g);
62
+ void gfa_fix_symm_del(gfa_t *g); // delete multiple edges and restore skew-symmetry
63
+ void gfa_arc_rm(gfa_t *g);
64
+ void gfa_cleanup(gfa_t *g); // permanently delete arcs marked as deleted, sort and then index
65
+ void gfa_finalize(gfa_t *g);
66
+ int32_t gfa_check_multi(const gfa_t *g);
67
+ uint32_t gfa_fix_multi(gfa_t *g);
68
+
69
+ int gfa_arc_del_multi_risky(gfa_t *g);
70
+ int gfa_arc_del_asymm_risky(gfa_t *g);
71
+
72
+ // edit distance
73
+ typedef struct { // options for the graph edit-distance routines; passed to gfa_ed_init() and gfa_edit_dist(), initialized by gfa_edopt_init()
74
+ int32_t traceback; // NOTE(review): presumably non-zero to record the walk for traceback — confirm
75
+ int32_t bw_dyn, max_lag, max_chk; // NOTE(review): meanings are not visible in this header; see gfa_edopt_init() for the defaults
76
+ int32_t s_term; // NOTE(review): presumably a score limit that terminates the search (gfa_ed_step() takes an s_term argument too) — confirm
77
+ int64_t i_term; // NOTE(review): presumably an iteration limit that terminates the search — confirm
78
+ } gfa_edopt_t;
79
+
80
+ typedef struct { // result record passed by pointer to gfa_ed_step() and gfa_edit_dist()
81
+ int32_t s; // NOTE(review): presumably the edit distance/score reached — confirm
82
+ uint32_t end_v; // NOTE(review): presumably the vertex where the alignment ends — confirm
83
+ int32_t end_off; // NOTE(review): presumably the offset within end_v — confirm
84
+ int32_t wlen; // length of walk
85
+ int32_t n_end;
86
+ int32_t nv; // NOTE(review): presumably the number of entries in v[] — confirm
87
+ int64_t n_iter; // NOTE(review): presumably the number of iterations performed — confirm
88
+ int32_t *v; // NOTE(review): presumably the vertices of the walk; ownership is not visible here — confirm who frees it
89
+ } gfa_edrst_t;
90
+
91
+ void gfa_edopt_init(gfa_edopt_t *opt);
92
+ void *gfa_ed_init(void *km, const gfa_edopt_t *opt, const gfa_t *g, const gfa_edseq_t *es, int32_t ql, const char *q, uint32_t v0, int32_t off0);
93
+ void gfa_ed_step(void *z_, uint32_t v1, int32_t off1, int32_t s_term, gfa_edrst_t *r);
94
+ void gfa_ed_destroy(void *z_);
95
+
96
+ int32_t gfa_edit_dist(void *km, const gfa_edopt_t *opt, const gfa_t *g, const gfa_edseq_t *es, int32_t ql, const char *q, uint32_t v0, int32_t off0, gfa_edrst_t *rst);
97
+
98
+ // assembly related routines
99
+ int gfa_arc_del_trans(gfa_t *g, int fuzz); // transitive reduction
100
+ int gfa_arc_del_weak(gfa_t *g);
101
+ int gfa_arc_pair_strong(gfa_t *g);
102
+ int gfa_arc_del_short(gfa_t *g, int min_ovlp_len, float drop_ratio); // delete short arcs
103
+ int gfa_drop_tip(gfa_t *g, int tip_cnt, int tip_len); // cut tips
104
+ int gfa_drop_internal(gfa_t *g, int max_ext);
105
+ int gfa_cut_z(gfa_t *g, int32_t min_dist, int32_t max_dist);
106
+ int gfa_topocut(gfa_t *g, float drop_ratio, int32_t tip_cnt, int32_t tip_len);
107
+ int gfa_bub_simple(gfa_t *g, int min_side, int max_side);
108
+ int gfa_pop_bubble(gfa_t *g, int radius, int max_del, int protect_tip); // bubble popping
109
+ gfa_t *gfa_ug_gen(const gfa_t *g);
110
+ void gfa_scc_all(const gfa_t *g);
111
+
112
+ // subset, modifying the graph
113
+ void gfa_sub(gfa_t *g, int n, char *const* seg, int step);
114
+ char **gfa_query_by_reg(const gfa_t *g, int32_t n_bb, const gfa_bubble_t *bb, const char *reg, int *n_seg);
115
+
116
+ // subset, without modifying the graph
117
+ gfa_sub_t *gfa_sub_from(void *km0, const gfa_t *g, uint32_t v0, int32_t max_dist);
118
+ void gfa_sub_destroy(gfa_sub_t *sub);
119
+ void gfa_sub_print(FILE *fp, const gfa_t *g, const gfa_sub_t *sub);
120
+
121
+ gfa_scbuf_t *gfa_scbuf_init(const gfa_t *g);
122
+ gfa_sub_t *gfa_scc1(void *km0, const gfa_t *g, gfa_scbuf_t *b, uint32_t v0);
123
+ void gfa_scbuf_destroy(gfa_scbuf_t *b);
124
+
125
+ // graph augmentation
126
+ int gfa_ins_adj(const gfa_t *g, int min_len, gfa_ins_t *ins, const char *seq);
127
+ int32_t gfa_ins_filter(const gfa_t *g, int32_t n_ins, gfa_ins_t *ins);
128
+ void gfa_augment(gfa_t *g, int32_t n_ins, const gfa_ins_t *ins, int32_t n_ctg, const char *const* name, const char *const* seq);
129
+
130
+ gfa_sfa_t *gfa_gfa2sfa(const gfa_t *g, int32_t *n_sfa_, int32_t write_seq);
131
+
132
+ void gfa_sort_ref_arc(gfa_t *g);
133
+ gfa_bubble_t *gfa_bubble(const gfa_t *g, int32_t *n_); // FIXME: doesn't work with translocation
134
+
135
+ void gfa_gt_simple_print(const gfa_t *g, float min_dc, int32_t is_path); // FIXME: doesn't work with translocations
136
+
137
+ void gfa_aux_update_cv(gfa_t *g, const char *tag, const double *cov_seg, const double *cov_link);
138
+
139
+ void gfa_sql_write(FILE *fp, const gfa_t *g, int write_seq);
140
+
141
+ static inline int64_t gfa_find_arc(const gfa_t *g, uint32_t v, uint32_t w)
142
+ {
143
+ uint32_t i, nv = gfa_arc_n(g, v), nw = 0, k = (uint32_t)-1;
144
+ gfa_arc_t *av = gfa_arc_a(g, v);
145
+ for (i = 0; i < nv; ++i)
146
+ if (av[i].w == w) ++nw, k = i;
147
+ return nw == 1? (int64_t)(&av[k] - g->arc) : nw == 0? -1 : -2;
148
+ }
149
+
150
+ #ifdef __cplusplus
151
+ }
152
+ #endif
153
+
154
+ #endif // ~__GFA_PRIV_H__