ruby-minigraph 0.0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +62 -0
  4. data/ext/Rakefile +56 -0
  5. data/ext/cmappy/cmappy.c +7 -0
  6. data/ext/cmappy/cmappy.h +8 -0
  7. data/ext/minigraph/LICENSE.txt +23 -0
  8. data/ext/minigraph/Makefile +66 -0
  9. data/ext/minigraph/NEWS.md +317 -0
  10. data/ext/minigraph/README.md +207 -0
  11. data/ext/minigraph/algo.c +194 -0
  12. data/ext/minigraph/algo.h +33 -0
  13. data/ext/minigraph/asm-call.c +147 -0
  14. data/ext/minigraph/bseq.c +133 -0
  15. data/ext/minigraph/bseq.h +76 -0
  16. data/ext/minigraph/cal_cov.c +139 -0
  17. data/ext/minigraph/doc/example1.png +0 -0
  18. data/ext/minigraph/doc/example2.png +0 -0
  19. data/ext/minigraph/doc/examples.graffle +0 -0
  20. data/ext/minigraph/format.c +241 -0
  21. data/ext/minigraph/galign.c +140 -0
  22. data/ext/minigraph/gchain1.c +532 -0
  23. data/ext/minigraph/gcmisc.c +223 -0
  24. data/ext/minigraph/gfa-aug.c +260 -0
  25. data/ext/minigraph/gfa-base.c +526 -0
  26. data/ext/minigraph/gfa-bbl.c +372 -0
  27. data/ext/minigraph/gfa-ed.c +617 -0
  28. data/ext/minigraph/gfa-io.c +395 -0
  29. data/ext/minigraph/gfa-priv.h +154 -0
  30. data/ext/minigraph/gfa.h +166 -0
  31. data/ext/minigraph/ggen.c +182 -0
  32. data/ext/minigraph/ggen.h +21 -0
  33. data/ext/minigraph/ggsimple.c +570 -0
  34. data/ext/minigraph/gmap.c +211 -0
  35. data/ext/minigraph/index.c +230 -0
  36. data/ext/minigraph/kalloc.c +224 -0
  37. data/ext/minigraph/kalloc.h +82 -0
  38. data/ext/minigraph/kavl.h +414 -0
  39. data/ext/minigraph/kdq.h +134 -0
  40. data/ext/minigraph/ketopt.h +116 -0
  41. data/ext/minigraph/khashl.h +348 -0
  42. data/ext/minigraph/krmq.h +474 -0
  43. data/ext/minigraph/kseq.h +256 -0
  44. data/ext/minigraph/ksort.h +164 -0
  45. data/ext/minigraph/kstring.h +165 -0
  46. data/ext/minigraph/kthread.c +159 -0
  47. data/ext/minigraph/kthread.h +15 -0
  48. data/ext/minigraph/kvec-km.h +105 -0
  49. data/ext/minigraph/kvec.h +110 -0
  50. data/ext/minigraph/lchain.c +441 -0
  51. data/ext/minigraph/main.c +301 -0
  52. data/ext/minigraph/map-algo.c +500 -0
  53. data/ext/minigraph/mgpriv.h +128 -0
  54. data/ext/minigraph/minigraph.1 +359 -0
  55. data/ext/minigraph/minigraph.h +176 -0
  56. data/ext/minigraph/miniwfa.c +834 -0
  57. data/ext/minigraph/miniwfa.h +95 -0
  58. data/ext/minigraph/misc/mgutils.js +1451 -0
  59. data/ext/minigraph/misc.c +12 -0
  60. data/ext/minigraph/options.c +134 -0
  61. data/ext/minigraph/shortk.c +251 -0
  62. data/ext/minigraph/sketch.c +109 -0
  63. data/ext/minigraph/sys.c +147 -0
  64. data/ext/minigraph/sys.h +20 -0
  65. data/ext/minigraph/test/MT-chimp.fa +277 -0
  66. data/ext/minigraph/test/MT-human.fa +239 -0
  67. data/ext/minigraph/test/MT-orangA.fa +276 -0
  68. data/ext/minigraph/test/MT.gfa +19 -0
  69. data/ext/minigraph/tex/Makefile +13 -0
  70. data/ext/minigraph/tex/minigraph.bib +676 -0
  71. data/ext/minigraph/tex/minigraph.tex +986 -0
  72. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
  73. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
  74. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
  75. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
  76. data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
  77. data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
  78. data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
  79. data/ext/minigraph/tex/plots/bedutils.js +367 -0
  80. data/ext/minigraph/tex/plots/chr-plot.js +130 -0
  81. data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
  82. data/ext/minigraph.patch +21 -0
  83. data/lib/minigraph/ffi/constants.rb +230 -0
  84. data/lib/minigraph/ffi/functions.rb +70 -0
  85. data/lib/minigraph/ffi/mappy.rb +8 -0
  86. data/lib/minigraph/ffi.rb +27 -0
  87. data/lib/minigraph/version.rb +5 -0
  88. data/lib/minigraph.rb +72 -0
  89. metadata +159 -0
@@ -0,0 +1,395 @@
1
+ #include <zlib.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include <ctype.h>
5
+ #include <string.h>
6
+ #include "kstring.h"
7
+ #include "gfa-priv.h"
8
+
9
+ #include "kseq.h"
10
+ KSTREAM_INIT(gzFile, gzread, 65536)
11
+
12
+ /***********
13
+ * Tag I/O *
14
+ ***********/
15
+
16
+ int gfa_aux_parse(char *s, uint8_t **data, int *max)
17
+ {
18
+ char *q, *p;
19
+ kstring_t str;
20
+ if (s == 0) return 0;
21
+ str.l = 0, str.m = *max, str.s = (char*)*data;
22
+ if (*s == '\t') ++s;
23
+ for (p = q = s;; ++p) {
24
+ if (*p == 0 || *p == '\t') {
25
+ int c = *p;
26
+ *p = 0;
27
+ if (p - q >= 5 && q[2] == ':' && q[4] == ':' && (q[3] == 'A' || q[3] == 'i' || q[3] == 'f' || q[3] == 'Z' || q[3] == 'B')) {
28
+ int type = q[3];
29
+ kputsn_(q, 2, &str);
30
+ q += 5;
31
+ if (type == 'A') {
32
+ kputc_('A', &str);
33
+ kputc_(*q, &str);
34
+ } else if (type == 'i') {
35
+ int32_t x;
36
+ x = strtol(q, &q, 10);
37
+ kputc_(type, &str); kputsn_((char*)&x, 4, &str);
38
+ } else if (type == 'f') {
39
+ float x;
40
+ x = strtod(q, &q);
41
+ kputc_('f', &str); kputsn_(&x, 4, &str);
42
+ } else if (type == 'Z') {
43
+ kputc_('Z', &str); kputsn_(q, p - q + 1, &str); // note that this include the trailing NULL
44
+ } else if (type == 'B') {
45
+ type = *q++; // q points to the first ',' following the typing byte
46
+ if (p - q >= 2 && (type == 'c' || type == 'C' || type == 's' || type == 'S' || type == 'i' || type == 'I' || type != 'f')) {
47
+ int32_t n;
48
+ char *r;
49
+ for (r = q, n = 0; *r; ++r)
50
+ if (*r == ',') ++n;
51
+ kputc_('B', &str); kputc_(type, &str); kputsn_(&n, 4, &str);
52
+ // TODO: to evaluate which is faster: a) aligned array and then memmove(); b) unaligned array; c) kputsn_()
53
+ if (type == 'c') while (q + 1 < p) { int8_t x = strtol(q + 1, &q, 0); kputc_(x, &str); }
54
+ else if (type == 'C') while (q + 1 < p) { uint8_t x = strtol(q + 1, &q, 0); kputc_(x, &str); }
55
+ else if (type == 's') while (q + 1 < p) { int16_t x = strtol(q + 1, &q, 0); kputsn_(&x, 2, &str); }
56
+ else if (type == 'S') while (q + 1 < p) { uint16_t x = strtol(q + 1, &q, 0); kputsn_(&x, 2, &str); }
57
+ else if (type == 'i') while (q + 1 < p) { int32_t x = strtol(q + 1, &q, 0); kputsn_(&x, 4, &str); }
58
+ else if (type == 'I') while (q + 1 < p) { uint32_t x = strtol(q + 1, &q, 0); kputsn_(&x, 4, &str); }
59
+ else if (type == 'f') while (q + 1 < p) { float x = strtod(q + 1, &q); kputsn_(&x, 4, &str); }
60
+ }
61
+ } // should not be here, as we have tested all types
62
+ }
63
+ q = p + 1;
64
+ if (c == 0) break;
65
+ }
66
+ }
67
+ if (str.l > 0 && str.l == str.m) ks_resize(&str, str.l + 1);
68
+ if (str.s) str.s[str.l] = 0;
69
+ *max = str.m, *data = (uint8_t*)str.s;
70
+ return str.l;
71
+ }
72
+
73
+ int gfa_aux_format(int l_aux, const uint8_t *aux, char **t, int *max)
74
+ {
75
+ kstring_t str;
76
+ const uint8_t *s = aux;
77
+ str.l = 0, str.s = *t, str.m = *max;
78
+ while (s < aux + l_aux) {
79
+ uint8_t type, key[2];
80
+ key[0] = s[0]; key[1] = s[1];
81
+ s += 2; type = *s++;
82
+ kputc('\t', &str); kputsn((char*)key, 2, &str); kputc(':', &str);
83
+ if (type == 'A') { kputsn("A:", 2, &str); kputc(*s, &str); ++s; }
84
+ else if (type == 'i') { kputsn("i:", 2, &str); kputw(*(int32_t*)s, &str); s += 4; }
85
+ else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; }
86
+ else if (type == 'Z') { kputc(type, &str); kputc(':', &str); while (*s) kputc(*s++, &str); ++s; }
87
+ else if (type == 'B') {
88
+ uint8_t sub_type = *(s++);
89
+ int32_t i, n;
90
+ memcpy(&n, s, 4);
91
+ s += 4; // no point to the start of the array
92
+ kputsn("B:", 2, &str); kputc(sub_type, &str); // write the typing
93
+ for (i = 0; i < n; ++i) { // FIXME: for better performance, put the loop after "if"
94
+ kputc(',', &str);
95
+ if ('c' == sub_type) { kputw(*(int8_t*)s, &str); ++s; }
96
+ else if ('C' == sub_type) { kputw(*(uint8_t*)s, &str); ++s; }
97
+ else if ('s' == sub_type) { kputw(*(int16_t*)s, &str); s += 2; }
98
+ else if ('S' == sub_type) { kputw(*(uint16_t*)s, &str); s += 2; }
99
+ else if ('i' == sub_type) { kputw(*(int32_t*)s, &str); s += 4; }
100
+ else if ('I' == sub_type) { kputuw(*(uint32_t*)s, &str); s += 4; }
101
+ else if ('f' == sub_type) { ksprintf(&str, "%g", *(float*)s); s += 4; }
102
+ }
103
+ }
104
+ }
105
+ *t = str.s, *max = str.m;
106
+ return str.l;
107
+ }
108
+
109
+ /****************
110
+ * Line parsers *
111
+ ****************/
112
+
113
+ int gfa_parse_S(gfa_t *g, char *s)
114
+ {
115
+ int i, is_ok = 0;
116
+ char *p, *q, *seg = 0, *seq = 0, *rest = 0;
117
+ uint32_t sid, len = 0;
118
+ for (i = 0, p = q = s + 2;; ++p) {
119
+ if (*p == 0 || *p == '\t') {
120
+ int c = *p;
121
+ *p = 0;
122
+ if (i == 0) seg = q;
123
+ else if (i == 1) {
124
+ seq = q[0] == '*'? 0 : gfa_strdup(q);
125
+ is_ok = 1, rest = c? p + 1 : 0;
126
+ break;
127
+ }
128
+ ++i, q = p + 1;
129
+ if (c == 0) break;
130
+ }
131
+ }
132
+ if (is_ok) { // all mandatory fields read
133
+ int l_aux, m_aux = 0, LN = -1;
134
+ uint8_t *aux = 0, *s_LN = 0;
135
+ gfa_seg_t *s;
136
+ l_aux = gfa_aux_parse(rest, &aux, &m_aux); // parse optional tags
137
+ s_LN = l_aux? gfa_aux_get(l_aux, aux, "LN") : 0;
138
+ if (s_LN && s_LN[0] == 'i') {
139
+ LN = *(int32_t*)(s_LN + 1);
140
+ l_aux = gfa_aux_del(l_aux, aux, s_LN);
141
+ }
142
+ if (seq == 0) {
143
+ if (LN >= 0) len = LN;
144
+ } else len = strlen(seq);
145
+ if (LN >= 0 && len != LN && gfa_verbose >= 2)
146
+ fprintf(stderr, "[W] for segment '%s', LN:i:%d tag is different from sequence length %d\n", seg, LN, len);
147
+ sid = gfa_add_seg(g, seg);
148
+ s = &g->seg[sid];
149
+ s->len = len, s->seq = seq;
150
+ if (l_aux > 0) {
151
+ uint8_t *s_SN = 0, *s_SO = 0, *s_SR = 0;
152
+ s_SN = gfa_aux_get(l_aux, aux, "SN");
153
+ if (s_SN && *s_SN == 'Z') { // then parse stable tags
154
+ s->snid = gfa_sseq_add(g, (char*)(s_SN + 1)), s->soff = 0;
155
+ l_aux = gfa_aux_del(l_aux, aux, s_SN);
156
+ s_SO = gfa_aux_get(l_aux, aux, "SO");
157
+ if (s_SO && *s_SO == 'i') {
158
+ s->soff = *(int32_t*)(s_SO + 1);
159
+ l_aux = gfa_aux_del(l_aux, aux, s_SO);
160
+ }
161
+ }
162
+ s_SR = gfa_aux_get(l_aux, aux, "SR");
163
+ if (s_SR && *s_SR == 'i') {
164
+ s->rank = *(int32_t*)(s_SR + 1);
165
+ if (s->rank > g->max_rank) g->max_rank = s->rank;
166
+ l_aux = gfa_aux_del(l_aux, aux, s_SR);
167
+ }
168
+ gfa_sseq_update(g, s);
169
+ }
170
+ if (l_aux > 0)
171
+ s->aux.m_aux = m_aux, s->aux.l_aux = l_aux, s->aux.aux = aux;
172
+ else if (aux)
173
+ free(aux);
174
+ } else return -1;
175
+ return 0;
176
+ }
177
+
178
+ int gfa_parse_L(gfa_t *g, char *s)
179
+ {
180
+ int i, oriv = -1, oriw = -1, is_ok = 0;
181
+ char *p, *q, *segv = 0, *segw = 0, *rest = 0;
182
+ int32_t ov = INT32_MAX, ow = INT32_MAX;
183
+ for (i = 0, p = q = s + 2;; ++p) {
184
+ if (*p == 0 || *p == '\t') {
185
+ int c = *p;
186
+ *p = 0;
187
+ if (i == 0) {
188
+ segv = q;
189
+ } else if (i == 1) {
190
+ if (*q != '+' && *q != '-') return -2;
191
+ oriv = (*q != '+');
192
+ } else if (i == 2) {
193
+ segw = q;
194
+ } else if (i == 3) {
195
+ if (*q != '+' && *q != '-') return -2;
196
+ oriw = (*q != '+');
197
+ } else if (i == 4) {
198
+ if (*q == '*') {
199
+ ov = ow = 0;
200
+ } else if (*q == ':') {
201
+ ov = INT32_MAX;
202
+ ow = isdigit(*(q+1))? strtol(q+1, &q, 10) : INT32_MAX;
203
+ } else if (isdigit(*q)) {
204
+ char *r;
205
+ ov = strtol(q, &r, 10);
206
+ if (isupper(*r)) { // CIGAR
207
+ ov = ow = 0;
208
+ do {
209
+ long l;
210
+ l = strtol(q, &q, 10);
211
+ if (*q == 'M' || *q == 'D' || *q == 'N') ov += l;
212
+ if (*q == 'M' || *q == 'I' || *q == 'S') ow += l;
213
+ ++q;
214
+ } while (isdigit(*q));
215
+ } else if (*r == ':') { // overlap lengths
216
+ ow = isdigit(*(r+1))? strtol(r+1, &r, 10) : INT32_MAX;
217
+ } else break;
218
+ } else break;
219
+ is_ok = 1, rest = c? p + 1 : 0;
220
+ break;
221
+ }
222
+ ++i, q = p + 1;
223
+ if (c == 0) break;
224
+ }
225
+ }
226
+ if (i == 4 && is_ok == 0) ov = ow = 0, is_ok = 1; // no overlap field
227
+ if (is_ok) {
228
+ uint32_t v, w;
229
+ int l_aux, m_aux = 0;
230
+ uint8_t *aux = 0;
231
+ gfa_arc_t *arc;
232
+ v = gfa_add_seg(g, segv) << 1 | oriv;
233
+ w = gfa_add_seg(g, segw) << 1 | oriw;
234
+ arc = gfa_add_arc1(g, v, w, ov, ow, -1, 0);
235
+ l_aux = gfa_aux_parse(rest, &aux, &m_aux); // parse optional tags
236
+ if (l_aux) {
237
+ gfa_aux_t *a = &g->link_aux[arc->link_id];
238
+ uint8_t *s_L1, *s_L2, *s_SR;
239
+ a->l_aux = l_aux, a->m_aux = m_aux, a->aux = aux;
240
+ s_SR = gfa_aux_get(a->l_aux, a->aux, "SR");
241
+ if (s_SR && s_SR[0] == 'i') {
242
+ arc->rank = *(int32_t*)(s_SR+1);
243
+ a->l_aux = gfa_aux_del(a->l_aux, a->aux, s_SR);
244
+ }
245
+ s_L1 = gfa_aux_get(a->l_aux, a->aux, "L1");
246
+ if (s_L1) {
247
+ if (ov != INT32_MAX && s_L1[0] == 'i')
248
+ g->seg[v>>1].len = g->seg[v>>1].len > ov + *(int32_t*)(s_L1+1)? g->seg[v>>1].len : ov + *(int32_t*)(s_L1+1);
249
+ a->l_aux = gfa_aux_del(a->l_aux, a->aux, s_L1);
250
+ }
251
+ s_L2 = gfa_aux_get(a->l_aux, a->aux, "L2");
252
+ if (s_L2) {
253
+ if (ow != INT32_MAX && s_L2[0] == 'i')
254
+ g->seg[w>>1].len = g->seg[w>>1].len > ow + *(int32_t*)(s_L2+1)? g->seg[w>>1].len : ow + *(int32_t*)(s_L2+1);
255
+ a->l_aux = gfa_aux_del(a->l_aux, a->aux, s_L2);
256
+ }
257
+ if (a->l_aux == 0) {
258
+ free(a->aux);
259
+ a->aux = 0, a->m_aux = 0;
260
+ }
261
+ }
262
+ } else return -1;
263
+ return 0;
264
+ }
265
+
266
+ static gfa_seg_t *gfa_parse_fa_hdr(gfa_t *g, char *s)
267
+ {
268
+ int32_t i;
269
+ char buf[16];
270
+ gfa_seg_t *seg;
271
+ for (i = 0; s[i]; ++i)
272
+ if (isspace(s[i])) break;
273
+ s[i] = 0;
274
+ sprintf(buf, "s%d", g->n_seg + 1);
275
+ i = gfa_add_seg(g, buf);
276
+ seg = &g->seg[i];
277
+ seg->snid = gfa_sseq_add(g, s + 1);
278
+ seg->soff = seg->rank = 0;
279
+ return seg;
280
+ }
281
+
282
+ static void gfa_update_fa_seq(gfa_t *g, gfa_seg_t *seg, int32_t l_seq, const char *seq)
283
+ {
284
+ if (seg == 0) return;
285
+ seg->seq = gfa_strdup(seq);
286
+ seg->len = l_seq;
287
+ gfa_sseq_update(g, seg);
288
+ }
289
+
290
+ /****************
291
+ * User-end I/O *
292
+ ****************/
293
+
294
+ gfa_t *gfa_read(const char *fn)
295
+ {
296
+ gzFile fp;
297
+ gfa_t *g;
298
+ kstring_t s = {0,0,0}, fa_seq = {0,0,0};
299
+ kstream_t *ks;
300
+ int dret, is_fa = 0;
301
+ gfa_seg_t *fa_seg = 0;
302
+ uint64_t lineno = 0;
303
+
304
+ fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(0, "r");
305
+ if (fp == 0) return 0;
306
+ ks = ks_init(fp);
307
+ g = gfa_init();
308
+ while (ks_getuntil(ks, KS_SEP_LINE, &s, &dret) >= 0) {
309
+ int ret = 0;
310
+ ++lineno;
311
+ if (s.l > 0 && s.s[0] == '>') { // FASTA header
312
+ is_fa = 1;
313
+ if (fa_seg) gfa_update_fa_seq(g, fa_seg, fa_seq.l, fa_seq.s);
314
+ fa_seg = gfa_parse_fa_hdr(g, s.s);
315
+ fa_seq.l = 0;
316
+ } else if (is_fa) { // FASTA mode
317
+ if (s.l >= 3 && s.s[1] == '\t') { // likely a GFA line
318
+ gfa_update_fa_seq(g, fa_seg, fa_seq.l, fa_seq.s); // finalize fa_seg
319
+ fa_seg = 0;
320
+ is_fa = 0;
321
+ } else kputsn(s.s, s.l, &fa_seq); // likely a FASTA sequence line
322
+ }
323
+ if (is_fa) continue;
324
+ if (s.l < 3 || s.s[1] != '\t') continue; // empty line
325
+ if (s.s[0] == 'S') ret = gfa_parse_S(g, s.s);
326
+ else if (s.s[0] == 'L') ret = gfa_parse_L(g, s.s);
327
+ if (ret < 0 && gfa_verbose >= 1)
328
+ fprintf(stderr, "[E] invalid %c-line at line %ld (error code %d)\n", s.s[0], (long)lineno, ret);
329
+ }
330
+ if (is_fa && fa_seg) gfa_update_fa_seq(g, fa_seg, fa_seq.l, fa_seq.s);
331
+ free(fa_seq.s);
332
+ free(s.s);
333
+ gfa_finalize(g);
334
+ ks_destroy(ks);
335
+ gzclose(fp);
336
+ return g;
337
+ }
338
+
339
+ void gfa_print(const gfa_t *g, FILE *fp, int flag)
340
+ {
341
+ uint32_t i;
342
+ uint64_t k;
343
+ for (i = 0; i < g->n_seg; ++i) {
344
+ const gfa_seg_t *s = &g->seg[i];
345
+ if (s->del) continue;
346
+ fprintf(fp, "S\t%s\t", s->name);
347
+ if (s->seq && !(flag & GFA_O_NO_SEQ)) fputs(s->seq, fp);
348
+ else fputc('*', fp);
349
+ fprintf(fp, "\tLN:i:%d", s->len);
350
+ if (s->snid >= 0 && s->soff >= 0)
351
+ fprintf(fp, "\tSN:Z:%s\tSO:i:%d", g->sseq[s->snid].name, s->soff);
352
+ if (s->rank >= 0)
353
+ fprintf(fp, "\tSR:i:%d", s->rank);
354
+ if (s->utg && s->utg->n) fprintf(fp, "\tRC:i:%d\tlc:i:%d", s->utg->n, s->utg->len_comp);
355
+ if (s->aux.l_aux > 0) {
356
+ char *t = 0;
357
+ int max = 0;
358
+ gfa_aux_format(s->aux.l_aux, s->aux.aux, &t, &max);
359
+ fputs(t, fp);
360
+ free(t);
361
+ }
362
+ fputc('\n', fp);
363
+ if (s->utg && s->utg->n) {
364
+ uint32_t j, l;
365
+ for (j = l = 0; j < s->utg->n; ++j) {
366
+ const gfa_utg_t *u = s->utg;
367
+ fprintf(fp, "A\t%s\t%d\t%c\t%s\t%d\t%d\n", s->name, l, "+-"[u->a[j]>>32&1], u->name[j], (int32_t)(u->r[j]>>32), (int32_t)u->r[j]);
368
+ l += (uint32_t)u->a[j];
369
+ }
370
+ }
371
+ }
372
+ for (k = 0; k < g->n_arc; ++k) {
373
+ const gfa_arc_t *a = &g->arc[k];
374
+ const gfa_aux_t *aux = a->link_id < g->n_arc? &g->link_aux[a->link_id] : 0;
375
+ if (a->del || a->comp) continue;
376
+ fprintf(fp, "L\t%s\t%c\t%s\t%c", g->seg[a->v_lv>>33].name, "+-"[a->v_lv>>32&1], g->seg[a->w>>1].name, "+-"[a->w&1]);
377
+ if (!(flag & GFA_O_OV_EXT)) {
378
+ fprintf(fp, "\t%dM", a->ov < a->ow? a->ov : a->ow);
379
+ } else {
380
+ if (a->ov == a->ow) fprintf(fp, "\t%dM", a->ov);
381
+ else fprintf(fp, "\t%d:%d", a->ov, a->ow);
382
+ }
383
+ if (a->rank >= 0) fprintf(fp, "\tSR:i:%d", a->rank);
384
+ fprintf(fp, "\tL1:i:%d", gfa_arc_len(*a));
385
+ fprintf(fp, "\tL2:i:%d", gfa_arc_lw(g, *a));
386
+ if (aux && aux->l_aux) {
387
+ char *t = 0;
388
+ int max = 0;
389
+ gfa_aux_format(aux->l_aux, aux->aux, &t, &max);
390
+ if (t) fputs(t, fp);
391
+ free(t);
392
+ }
393
+ fputc('\n', fp);
394
+ }
395
+ }
@@ -0,0 +1,154 @@
1
+ #ifndef __GFA_PRIV_H__
2
+ #define __GFA_PRIV_H__
3
+
4
+ #include <stdlib.h>
5
+ #include "gfa.h"
6
+
7
+ #define GFA_MALLOC(ptr, len) ((ptr) = (__typeof__(ptr))malloc((len) * sizeof(*(ptr))))
8
+ #define GFA_CALLOC(ptr, len) ((ptr) = (__typeof__(ptr))calloc((len), sizeof(*(ptr))))
9
+ #define GFA_REALLOC(ptr, len) ((ptr) = (__typeof__(ptr))realloc((ptr), (len) * sizeof(*(ptr))))
10
+ #define GFA_BZERO(ptr, len) memset((ptr), 0, (len) * sizeof(*(ptr)))
11
+ #define GFA_EXPAND(a, m) do { \
12
+ (m) = (m)? (m) + ((m)>>1) : 16; \
13
+ GFA_REALLOC((a), (m)); \
14
+ } while (0)
15
+
16
+ typedef struct { uint64_t x, y; } gfa128_t;
17
+
18
+ // linearized subgraphs
19
+
20
+ typedef struct {
21
+ uint32_t v, d;
22
+ int32_t off, n;
23
+ } gfa_subv_t;
24
+
25
+ typedef struct {
26
+ int32_t n_v, n_a, is_dag;
27
+ gfa_subv_t *v;
28
+ uint64_t *a; // high 32 bits: point to the neighbor; low 32 bit: arc index in the graph
29
+ void *km;
30
+ } gfa_sub_t;
31
+
32
+ typedef struct {
33
+ int32_t snid, ss, se;
34
+ uint32_t vs, ve;
35
+ int32_t is_bidir, n_seg, len_max, len_min;
36
+ uint32_t *v, n_paths;
37
+ char *seq_max, *seq_min; // seq_max and seq_min point to v[]
38
+ } gfa_bubble_t;
39
+
40
+ struct gfa_scbuf_s;
41
+ typedef struct gfa_scbuf_s gfa_scbuf_t;
42
+
43
+ #ifdef __cplusplus
44
+ extern "C" {
45
+ #endif
46
+
47
+ char *gfa_strdup(const char *src);
48
+ char *gfa_strndup(const char *src, size_t n);
49
+ void radix_sort_gfa64(uint64_t *st, uint64_t *en);
50
+
51
+ // add/delete one segment/arc/stable sequence
52
+ int32_t gfa_add_seg(gfa_t *g, const char *name);
53
+ gfa_arc_t *gfa_add_arc1(gfa_t *g, uint32_t v, uint32_t w, int32_t ov, int32_t ow, int64_t link_id, int comp);
54
+ int32_t gfa_sseq_get(const gfa_t *g, const char *sname);
55
+ int32_t gfa_sseq_add(gfa_t *g, const char *sname);
56
+ void gfa_sseq_update(gfa_t *g, const gfa_seg_t *s);
57
+
58
+ // whole graph operations
59
+ void gfa_arc_sort(gfa_t *g);
60
+ void gfa_arc_index(gfa_t *g);
61
+ uint32_t gfa_fix_symm_add(gfa_t *g);
62
+ void gfa_fix_symm_del(gfa_t *g); // delete multiple edges and restore skew-symmetry
63
+ void gfa_arc_rm(gfa_t *g);
64
+ void gfa_cleanup(gfa_t *g); // permanently delete arcs marked as deleted, sort and then index
65
+ void gfa_finalize(gfa_t *g);
66
+ int32_t gfa_check_multi(const gfa_t *g);
67
+ uint32_t gfa_fix_multi(gfa_t *g);
68
+
69
+ int gfa_arc_del_multi_risky(gfa_t *g);
70
+ int gfa_arc_del_asymm_risky(gfa_t *g);
71
+
72
+ // edit distance
73
+ typedef struct {
74
+ int32_t traceback;
75
+ int32_t bw_dyn, max_lag, max_chk;
76
+ int32_t s_term;
77
+ int64_t i_term;
78
+ } gfa_edopt_t;
79
+
80
+ typedef struct {
81
+ int32_t s;
82
+ uint32_t end_v;
83
+ int32_t end_off;
84
+ int32_t wlen; // length of walk
85
+ int32_t n_end;
86
+ int32_t nv;
87
+ int64_t n_iter;
88
+ int32_t *v;
89
+ } gfa_edrst_t;
90
+
91
+ void gfa_edopt_init(gfa_edopt_t *opt);
92
+ void *gfa_ed_init(void *km, const gfa_edopt_t *opt, const gfa_t *g, const gfa_edseq_t *es, int32_t ql, const char *q, uint32_t v0, int32_t off0);
93
+ void gfa_ed_step(void *z_, uint32_t v1, int32_t off1, int32_t s_term, gfa_edrst_t *r);
94
+ void gfa_ed_destroy(void *z_);
95
+
96
+ int32_t gfa_edit_dist(void *km, const gfa_edopt_t *opt, const gfa_t *g, const gfa_edseq_t *es, int32_t ql, const char *q, uint32_t v0, int32_t off0, gfa_edrst_t *rst);
97
+
98
+ // assembly related routines
99
+ int gfa_arc_del_trans(gfa_t *g, int fuzz); // transitive reduction
100
+ int gfa_arc_del_weak(gfa_t *g);
101
+ int gfa_arc_pair_strong(gfa_t *g);
102
+ int gfa_arc_del_short(gfa_t *g, int min_ovlp_len, float drop_ratio); // delete short arcs
103
+ int gfa_drop_tip(gfa_t *g, int tip_cnt, int tip_len); // cut tips
104
+ int gfa_drop_internal(gfa_t *g, int max_ext);
105
+ int gfa_cut_z(gfa_t *g, int32_t min_dist, int32_t max_dist);
106
+ int gfa_topocut(gfa_t *g, float drop_ratio, int32_t tip_cnt, int32_t tip_len);
107
+ int gfa_bub_simple(gfa_t *g, int min_side, int max_side);
108
+ int gfa_pop_bubble(gfa_t *g, int radius, int max_del, int protect_tip); // bubble popping
109
+ gfa_t *gfa_ug_gen(const gfa_t *g);
110
+ void gfa_scc_all(const gfa_t *g);
111
+
112
+ // subset, modifying the graph
113
+ void gfa_sub(gfa_t *g, int n, char *const* seg, int step);
114
+ char **gfa_query_by_reg(const gfa_t *g, int32_t n_bb, const gfa_bubble_t *bb, const char *reg, int *n_seg);
115
+
116
+ // subset, without modifying the graph
117
+ gfa_sub_t *gfa_sub_from(void *km0, const gfa_t *g, uint32_t v0, int32_t max_dist);
118
+ void gfa_sub_destroy(gfa_sub_t *sub);
119
+ void gfa_sub_print(FILE *fp, const gfa_t *g, const gfa_sub_t *sub);
120
+
121
+ gfa_scbuf_t *gfa_scbuf_init(const gfa_t *g);
122
+ gfa_sub_t *gfa_scc1(void *km0, const gfa_t *g, gfa_scbuf_t *b, uint32_t v0);
123
+ void gfa_scbuf_destroy(gfa_scbuf_t *b);
124
+
125
+ // graph augmentation
126
+ int gfa_ins_adj(const gfa_t *g, int min_len, gfa_ins_t *ins, const char *seq);
127
+ int32_t gfa_ins_filter(const gfa_t *g, int32_t n_ins, gfa_ins_t *ins);
128
+ void gfa_augment(gfa_t *g, int32_t n_ins, const gfa_ins_t *ins, int32_t n_ctg, const char *const* name, const char *const* seq);
129
+
130
+ gfa_sfa_t *gfa_gfa2sfa(const gfa_t *g, int32_t *n_sfa_, int32_t write_seq);
131
+
132
+ void gfa_sort_ref_arc(gfa_t *g);
133
+ gfa_bubble_t *gfa_bubble(const gfa_t *g, int32_t *n_); // FIXME: doesn't work with translocation
134
+
135
+ void gfa_gt_simple_print(const gfa_t *g, float min_dc, int32_t is_path); // FIXME: doesn't work with translocations
136
+
137
+ void gfa_aux_update_cv(gfa_t *g, const char *tag, const double *cov_seg, const double *cov_link);
138
+
139
+ void gfa_sql_write(FILE *fp, const gfa_t *g, int write_seq);
140
+
141
+ static inline int64_t gfa_find_arc(const gfa_t *g, uint32_t v, uint32_t w)
142
+ {
143
+ uint32_t i, nv = gfa_arc_n(g, v), nw = 0, k = (uint32_t)-1;
144
+ gfa_arc_t *av = gfa_arc_a(g, v);
145
+ for (i = 0; i < nv; ++i)
146
+ if (av[i].w == w) ++nw, k = i;
147
+ return nw == 1? (int64_t)(&av[k] - g->arc) : nw == 0? -1 : -2;
148
+ }
149
+
150
+ #ifdef __cplusplus
151
+ }
152
+ #endif
153
+
154
+ #endif // ~__GFA_PRIV_H__