ruby-minigraph 0.0.20.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +62 -0
  4. data/ext/Rakefile +56 -0
  5. data/ext/cmappy/cmappy.c +7 -0
  6. data/ext/cmappy/cmappy.h +8 -0
  7. data/ext/minigraph/LICENSE.txt +23 -0
  8. data/ext/minigraph/Makefile +66 -0
  9. data/ext/minigraph/NEWS.md +317 -0
  10. data/ext/minigraph/README.md +207 -0
  11. data/ext/minigraph/algo.c +194 -0
  12. data/ext/minigraph/algo.h +33 -0
  13. data/ext/minigraph/asm-call.c +147 -0
  14. data/ext/minigraph/bseq.c +133 -0
  15. data/ext/minigraph/bseq.h +76 -0
  16. data/ext/minigraph/cal_cov.c +139 -0
  17. data/ext/minigraph/doc/example1.png +0 -0
  18. data/ext/minigraph/doc/example2.png +0 -0
  19. data/ext/minigraph/doc/examples.graffle +0 -0
  20. data/ext/minigraph/format.c +241 -0
  21. data/ext/minigraph/galign.c +140 -0
  22. data/ext/minigraph/gchain1.c +532 -0
  23. data/ext/minigraph/gcmisc.c +223 -0
  24. data/ext/minigraph/gfa-aug.c +260 -0
  25. data/ext/minigraph/gfa-base.c +526 -0
  26. data/ext/minigraph/gfa-bbl.c +372 -0
  27. data/ext/minigraph/gfa-ed.c +617 -0
  28. data/ext/minigraph/gfa-io.c +395 -0
  29. data/ext/minigraph/gfa-priv.h +154 -0
  30. data/ext/minigraph/gfa.h +166 -0
  31. data/ext/minigraph/ggen.c +182 -0
  32. data/ext/minigraph/ggen.h +21 -0
  33. data/ext/minigraph/ggsimple.c +570 -0
  34. data/ext/minigraph/gmap.c +211 -0
  35. data/ext/minigraph/index.c +230 -0
  36. data/ext/minigraph/kalloc.c +224 -0
  37. data/ext/minigraph/kalloc.h +82 -0
  38. data/ext/minigraph/kavl.h +414 -0
  39. data/ext/minigraph/kdq.h +134 -0
  40. data/ext/minigraph/ketopt.h +116 -0
  41. data/ext/minigraph/khashl.h +348 -0
  42. data/ext/minigraph/krmq.h +474 -0
  43. data/ext/minigraph/kseq.h +256 -0
  44. data/ext/minigraph/ksort.h +164 -0
  45. data/ext/minigraph/kstring.h +165 -0
  46. data/ext/minigraph/kthread.c +159 -0
  47. data/ext/minigraph/kthread.h +15 -0
  48. data/ext/minigraph/kvec-km.h +105 -0
  49. data/ext/minigraph/kvec.h +110 -0
  50. data/ext/minigraph/lchain.c +441 -0
  51. data/ext/minigraph/main.c +301 -0
  52. data/ext/minigraph/map-algo.c +500 -0
  53. data/ext/minigraph/mgpriv.h +128 -0
  54. data/ext/minigraph/minigraph.1 +359 -0
  55. data/ext/minigraph/minigraph.h +176 -0
  56. data/ext/minigraph/miniwfa.c +834 -0
  57. data/ext/minigraph/miniwfa.h +95 -0
  58. data/ext/minigraph/misc/mgutils.js +1451 -0
  59. data/ext/minigraph/misc.c +12 -0
  60. data/ext/minigraph/options.c +134 -0
  61. data/ext/minigraph/shortk.c +251 -0
  62. data/ext/minigraph/sketch.c +109 -0
  63. data/ext/minigraph/sys.c +147 -0
  64. data/ext/minigraph/sys.h +20 -0
  65. data/ext/minigraph/test/MT-chimp.fa +277 -0
  66. data/ext/minigraph/test/MT-human.fa +239 -0
  67. data/ext/minigraph/test/MT-orangA.fa +276 -0
  68. data/ext/minigraph/test/MT.gfa +19 -0
  69. data/ext/minigraph/tex/Makefile +13 -0
  70. data/ext/minigraph/tex/minigraph.bib +676 -0
  71. data/ext/minigraph/tex/minigraph.tex +986 -0
  72. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
  73. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
  74. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
  75. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
  76. data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
  77. data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
  78. data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
  79. data/ext/minigraph/tex/plots/bedutils.js +367 -0
  80. data/ext/minigraph/tex/plots/chr-plot.js +130 -0
  81. data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
  82. data/ext/minigraph.patch +21 -0
  83. data/lib/minigraph/ffi/constants.rb +230 -0
  84. data/lib/minigraph/ffi/functions.rb +70 -0
  85. data/lib/minigraph/ffi/mappy.rb +8 -0
  86. data/lib/minigraph/ffi.rb +27 -0
  87. data/lib/minigraph/version.rb +5 -0
  88. data/lib/minigraph.rb +72 -0
  89. metadata +159 -0
@@ -0,0 +1,372 @@
1
+ #include <assert.h>
2
+ #include <stdio.h>
3
+ #include "gfa-priv.h"
4
+ #include "kalloc.h"
5
+ #include "ksort.h"
6
+ #include "kvec.h"
7
+
8
+ #define generic_key(x) (x) // identity key: each element is sorted by its own value
9
+ KRADIX_SORT_INIT(gfa32, uint32_t, generic_key, 4) // presumably generates radix_sort_gfa32() (called from gfa_bubble() in this file) over uint32_t; the 4 looks like the key size in bytes - confirm in ksort.h
10
+
11
+ void gfa_sort_ref_arc(gfa_t *g)
12
+ {
13
+ uint32_t v, n_vtx = gfa_n_vtx(g);
14
+ for (v = 0; v < n_vtx; ++v) {
15
+ gfa_seg_t *s = &g->seg[v>>1];
16
+ int32_t i, nv;
17
+ gfa_arc_t *av, b;
18
+ if (s->rank != 0) continue;
19
+ nv = gfa_arc_n(g, v);
20
+ av = gfa_arc_a(g, v);
21
+ for (i = 0; i < nv; ++i) {
22
+ uint32_t w = av[i].w;
23
+ gfa_seg_t *t = &g->seg[w>>1];
24
+ if (t->rank == 0 && t->snid == s->snid && (v&1) == (w&1)) {
25
+ if (((v&1) == 0 && s->soff + s->len == t->soff) || ((v&1) == 1 && t->soff + t->len == s->soff))
26
+ break;
27
+ }
28
+ }
29
+ if (nv > 0 && i == nv) fprintf(stderr, "X\t%c%s\t%d\t%s\t%d\n", "><"[v&1], s->name, i, g->sseq[s->snid].name, s->soff);
30
+ assert(nv == 0 || i < nv);
31
+ if (i > 0 && i < nv) b = av[i], av[i] = av[0], av[0] = b;
32
+ }
33
+ }
34
+
35
+ void gfa_sub_print(FILE *fp, const gfa_t *g, const gfa_sub_t *sub)
36
+ {
37
+ int32_t i, j;
38
+ for (i = 0; i < sub->n_v; ++i) {
39
+ gfa_subv_t *p = &sub->v[i];
40
+ fprintf(fp, "[%d]\t%d\t%c%s\t%d\t%d", i, p->v, "><"[p->v&1], g->seg[p->v>>1].name, p->d, p->n);
41
+ if (p->n > 0) {
42
+ fputc('\t', fp);
43
+ for (j = 0; j < p->n; ++j) {
44
+ if (j) fputc(',', fp);
45
+ fprintf(fp, "%d", (uint32_t)(sub->a[p->off + j]>>32));
46
+ }
47
+ }
48
+ fputc('\n', fp);
49
+ }
50
+ }
51
+
52
+ /****************
53
+ * Tarjan's SCC *
54
+ ****************/
55
+
56
+ typedef struct {
57
+ uint32_t index, low:31, stack:1;
58
+ uint32_t i; // index in gfa_sub_t::v[]; a temporary field
59
+ uint32_t start; // starting vertex
60
+ } gfa_scinfo_t;
61
+
62
+ struct gfa_scbuf_s {
63
+ uint32_t index;
64
+ gfa_scinfo_t *a; // node information
65
+ kvec_t(uint32_t) ts; // Tarjan's stack
66
+ kvec_t(uint64_t) ds; // DFS stack
67
+ };
68
+
69
+ gfa_scbuf_t *gfa_scbuf_init(const gfa_t *g)
70
+ {
71
+ uint32_t v, n_vtx = gfa_n_vtx(g);
72
+ gfa_scbuf_t *b;
73
+ GFA_CALLOC(b, 1);
74
+ GFA_CALLOC(b->a, n_vtx);
75
+ for (v = 0; v < n_vtx; ++v)
76
+ b->a[v].index = b->a[v].start = (uint32_t)-1;
77
+ return b;
78
+ }
79
+
80
+ void gfa_scbuf_destroy(gfa_scbuf_t *b)
81
+ {
82
+ free(b->a); free(b->ts.a); free(b->ds.a); free(b);
83
+ }
84
+
85
+ gfa_sub_t *gfa_scc1(void *km0, const gfa_t *g, gfa_scbuf_t *b, uint32_t v0)
86
+ {
87
+ gfa_sub_t *sub;
88
+ uint32_t k, off, m_v = 0;
89
+
90
+ KCALLOC(km0, sub, 1);
91
+ sub->km = km0;
92
+
93
+ kv_push(uint64_t, b->ds, (uint64_t)v0<<32);
94
+ while (b->ds.n > 0) {
95
+ uint64_t x = kv_pop(b->ds);
96
+ uint32_t i = (uint32_t)x, v = x>>32, nv;
97
+ if (i == 0) { // i is the number of outgoing edges already visited
98
+ b->a[v].low = b->a[v].index = b->index++;
99
+ b->a[v].stack = 1;
100
+ kv_push(uint32_t, b->ts, v);
101
+ }
102
+ nv = gfa_arc_n(g, v);
103
+ if (i == nv) { // done with v
104
+ if (b->a[v].low == b->a[v].index) {
105
+ int32_t i, j = b->ts.n - 1;
106
+ while (b->ts.a[j] != v) --j;
107
+ for (i = b->ts.n - 1; i >= j; --i) {
108
+ uint32_t w = b->ts.a[i];
109
+ gfa_subv_t *p;
110
+ //fprintf(stderr, "V\t%c%s\t%d\t%c%s\t%d\t%d\n", "><"[v&1], g->seg[v>>1].name, i, "><"[w&1], g->seg[w>>1].name, b->a[w^1].stack, b->a[w].index);
111
+ if (sub->n_v == m_v) KEXPAND(sub->km, sub->v, m_v);
112
+ p = &sub->v[sub->n_v++];
113
+ p->v = w;
114
+ b->a[w].stack = 0;
115
+ }
116
+ b->ts.n = j;
117
+ }
118
+ if (b->ds.n > 0) { // if the DFS stack is not empty, update the top element
119
+ uint32_t w = v;
120
+ v = b->ds.a[b->ds.n - 1] >> 32;
121
+ b->a[v].low = b->a[v].low < b->a[w].low? b->a[v].low : b->a[w].low;
122
+ }
123
+ } else { // process v's neighbor av[i].w
124
+ gfa_arc_t *av = gfa_arc_a(g, v);
125
+ uint32_t w = av[i].w;
126
+ kv_push(uint64_t, b->ds, (uint64_t)v<<32 | (i+1)); // update the old top of the stack
127
+ if (b->a[w].index == (uint32_t)-1 && b->a[w^1].stack == 0)
128
+ kv_push(uint64_t, b->ds, (uint64_t)w<<32);
129
+ else if (b->a[w].stack)
130
+ b->a[v].low = b->a[v].low < b->a[w].index? b->a[v].low : b->a[w].index;
131
+ }
132
+ }
133
+
134
+ // reverse the vertex array
135
+ for (k = 0; k < sub->n_v>>1; ++k) {
136
+ gfa_subv_t x;
137
+ x = sub->v[k], sub->v[k] = sub->v[sub->n_v - k - 1], sub->v[sub->n_v - k - 1] = x;
138
+ }
139
+
140
+ // fill other fields in sub
141
+ for (k = 0; k < sub->n_v; ++k)
142
+ b->a[sub->v[k].v].start = v0, b->a[sub->v[k].v].i = k;
143
+ for (k = 0, off = 0; k < sub->n_v; ++k) { // precompute the length of gfa_sub_t::a[]
144
+ uint32_t v = sub->v[k].v;
145
+ int32_t i, nv = gfa_arc_n(g, v);
146
+ gfa_arc_t *av = gfa_arc_a(g, v);
147
+ for (i = 0; i < nv; ++i)
148
+ if (b->a[av[i].w].start == v0)
149
+ ++off;
150
+ }
151
+ sub->n_a = off;
152
+ KCALLOC(sub->km, sub->a, sub->n_a);
153
+ for (k = 0, off = 0; k < sub->n_v; ++k) {
154
+ uint32_t o0, v = sub->v[k].v;
155
+ int32_t i, nv = gfa_arc_n(g, v);
156
+ gfa_arc_t *av = gfa_arc_a(g, v);
157
+ for (i = 0, o0 = off; i < nv; ++i)
158
+ if (b->a[av[i].w].start == v0)
159
+ sub->a[off++] = (uint64_t)b->a[av[i].w].i << 32 | (&av[i] - g->arc);
160
+ sub->v[k].d = 0;
161
+ sub->v[k].off = o0;
162
+ sub->v[k].n = off - o0;
163
+ if (o0 < off) {
164
+ radix_sort_gfa64(&sub->a[o0], &sub->a[off]);
165
+ if (sub->a[o0]>>32 <= k) sub->is_dag = 0;
166
+ }
167
+ }
168
+ return sub;
169
+ }
170
+
171
+ void gfa_scc_all(const gfa_t *g)
172
+ {
173
+ uint32_t v, n_vtx = gfa_n_vtx(g);
174
+ gfa_scbuf_t *b;
175
+ b = gfa_scbuf_init(g);
176
+ for (v = 0; v < n_vtx; ++v)
177
+ if (b->a[v].index == (uint32_t)-1 && b->a[v^1].index == (uint32_t)-1) {
178
+ gfa_sub_t *sub;
179
+ sub = gfa_scc1(0, g, b, v);
180
+ gfa_sub_print(stderr, g, sub);
181
+ gfa_sub_destroy(sub);
182
+ }
183
+ gfa_scbuf_destroy(b);
184
+ }
185
+
186
+ void gfa_sub_destroy(gfa_sub_t *sub)
187
+ {
188
+ void *km;
189
+ if (sub == 0) return;
190
+ km = sub->km;
191
+ kfree(km, sub->v); kfree(km, sub->a); kfree(km, sub);
192
+ }
193
+
194
+ /******************
195
+ * Bubble calling *
196
+ ******************/
197
+
198
+ typedef struct {
199
+ int32_t ld, sd, rd;
200
+ int32_t lp, sp;
201
+ float lf, sf, rf;
202
+ } bb_aux_t;
203
+
204
+ static void bb_write_seq(const gfa_t *g, int32_t n, const uint32_t *v, int32_t l_seq, char *seq)
205
+ {
206
+ int32_t k, l;
207
+ for (k = n - 1, l = 0; k >= 0; --k) {
208
+ const gfa_seg_t *s = &g->seg[v[k]>>1];
209
+ if (v[k]&1) {
210
+ int32_t p;
211
+ for (p = s->len - 1; p >= 0; --p)
212
+ seq[l++] = gfa_comp_table[(uint8_t)s->seq[p]];
213
+ } else {
214
+ memcpy(&seq[l], s->seq, s->len);
215
+ l += s->len;
216
+ }
217
+ }
218
+ assert(l == l_seq);
219
+ seq[l] = 0;
220
+ }
221
+
222
+ static int32_t bb_n_paths(const gfa_t *g, const gfa_sub_t *sub, int32_t js, int32_t je)
223
+ {
224
+ int32_t j, k;
225
+ int64_t *cnt, c;
226
+ GFA_CALLOC(cnt, je - js + 1);
227
+ cnt[0] = 1;
228
+ for (j = js; j < je; ++j) {
229
+ const gfa_subv_t *t = &sub->v[j];
230
+ for (k = 0; k < t->n; ++k) {
231
+ uint64_t a = sub->a[t->off + k];
232
+ int32_t jv = (int32_t)(a>>32);
233
+ if (jv <= j || jv > je) continue;
234
+ if (cnt[jv - js] + cnt[j - js] > INT32_MAX)
235
+ cnt[jv - js] = INT32_MAX;
236
+ else cnt[jv - js] += cnt[j - js];
237
+ }
238
+ }
239
+ c = cnt[je - js];
240
+ free(cnt);
241
+ return c < INT32_MAX? c : INT32_MAX;
242
+ }
243
+
/*
 * gfa_bubble(): find bubbles along each stable sequence of the graph.
 *
 * Steps, as implemented below:
 *   1. For every stable sequence, pick as start vertex the forward vertex of
 *      the rank-0 segment with the smallest soff.
 *   2. For each start vertex, build a subgraph with gfa_scc1() and compute,
 *      per vertex, the shortest (sd) and longest (ld) distance from
 *      sub->v[0], following only arcs to a larger index in sub->v[].
 *   3. Scan the vertices in order while tracking the largest arc target seen
 *      since the current start jst. When the scan reaches that target and
 *      its segment offset exceeds every offset seen since jst on this
 *      sequence, [jst, j] is closed; it is recorded as a bubble if both end
 *      segments belong to the current stable sequence.
 *
 * g      graph to analyse
 * n_bb_  output: number of bubbles in the returned array
 * return array of bubbles grown with GFA_EXPAND (0 if none were found).
 *        Each bubble's v[] is one GFA_CALLOC block that also stores
 *        seq_min and seq_max right after the n_seg vertex ids.
 */
244
+ gfa_bubble_t *gfa_bubble(const gfa_t *g, int32_t *n_bb_)
245
+ {
246
+ uint32_t i, *vs, *vmin, *vtmp = 0; // vs[]: start vertex per stable sequence; vmin[]: smallest soff seen so far; vtmp: scratch for the strand test
247
+ int32_t n_bb = 0, m_bb = 0, m_vtmp = 0;
248
+ gfa_bubble_t *bb = 0;
249
+ gfa_scbuf_t *scbuf;
250
+
251
+ GFA_MALLOC(vs, g->n_sseq);
252
+ GFA_MALLOC(vmin, g->n_sseq);
253
+ for (i = 0; i < g->n_sseq; ++i)
254
+ vs[i] = (uint32_t)-1, vmin[i] = UINT32_MAX; // (uint32_t)-1 marks "no rank-0 segment found on this sequence"
255
+ for (i = 0; i < g->n_seg; ++i) {
256
+ const gfa_seg_t *s = &g->seg[i];
257
+ if (s->rank != 0 || s->snid < 0) continue;
258
+ if ((uint32_t)s->soff < vmin[s->snid])
259
+ vmin[s->snid] = s->soff, vs[s->snid] = i<<1; // keep the forward vertex of the segment with the smallest offset
260
+ }
261
+ free(vmin);
262
+
263
+ scbuf = gfa_scbuf_init(g); // one traversal buffer shared by all gfa_scc1() calls below
264
+ for (i = 0; i < g->n_sseq; ++i) {
265
+ gfa_sub_t *sub;
266
+ int32_t j, jst, max_a, max_soff;
267
+ bb_aux_t *ba;
268
+
269
+ if (vs[i] == (uint32_t)-1) continue;
270
+ #if 0
271
+ sub = gfa_sub_from(0, g, vs[i], 0);
272
+ #else
273
+ sub = gfa_scc1(0, g, scbuf, vs[i]);
274
+ #endif
275
+ //gfa_sub_print(stderr, g, sub);
276
+ GFA_CALLOC(ba, sub->n_v); // ld is never set explicitly; presumably it starts at 0 from a zero-filling GFA_CALLOC - confirm
277
+ for (j = 0; j < sub->n_v; ++j)
278
+ ba[j].sd = INT32_MAX, ba[j].lp = ba[j].sp = -1;
279
+ ba[0].sd = 0; // distances are measured from sub->v[0]
280
+ for (j = 0; j < sub->n_v; ++j) {
281
+ gfa_subv_t *t = &sub->v[j];
282
+ int32_t k;
283
+ for (k = 0; k < t->n; ++k) {
284
+ uint64_t a = sub->a[t->off + k];
285
+ int32_t jv = (int32_t)(a>>32); // index of the arc target in sub->v[]
286
+ int32_t l = (int32_t)g->arc[(uint32_t)a].v_lv; // low 32 bits of the arc's v_lv field, used as the arc weight
287
+ if (jv <= j) continue; // skip loop or cycle
288
+ if (ba[jv].sd >= ba[j].sd + l)
289
+ ba[jv].sd = ba[j].sd + l, ba[jv].sp = j; // shorter (or equal) route to jv: remember the predecessor
290
+ if (ba[jv].ld < ba[j].ld + l)
291
+ ba[jv].ld = ba[j].ld + l, ba[jv].lp = j; // longer route to jv: remember the predecessor
292
+ }
293
+ }
294
+ for (j = 0, jst = 0, max_a = max_soff = -1; j < sub->n_v; ++j) { // jst: current candidate start; max_a: farthest arc target since jst; max_soff: largest offset on this sequence since jst
295
+ gfa_subv_t *t = &sub->v[j];
296
+ int32_t k;
297
+ if (j == max_a && g->seg[t->v>>1].soff > max_soff) { // every arc seen since jst ends at or before j
298
+ const gfa_seg_t *sst = &g->seg[sub->v[jst].v>>1];
299
+ const gfa_seg_t *sen = &g->seg[t->v>>1];
300
+ if (sst->snid == i && sen->snid == i) { // both end segments lie on the current stable sequence
301
+ int32_t n, l;
302
+ uint32_t *v;
303
+ gfa_bubble_t *b;
304
+
305
+ // basic information
306
+ if (n_bb == m_bb) GFA_EXPAND(bb, m_bb);
307
+ b = &bb[n_bb++];
308
+ b->snid = i;
309
+ b->vs = sub->v[jst].v;
310
+ b->ve = t->v;
311
+ b->ss = sst->soff + sst->len;
312
+ b->se = sen->soff;
313
+ b->len_min = ba[j].sd - ba[jst].sd - sst->len;
314
+ b->len_max = ba[j].ld - ba[jst].ld - sst->len;
315
+ b->n_paths = bb_n_paths(g, sub, jst, j);
316
+ //fprintf(stderr, "X\t%s[%d]\tvs=%c%s\tve=%c%s\tlen_min=%d\n", g->sseq[i].name, i, "><"[b->vs&1], g->seg[b->vs>>1].name, "><"[b->ve&1], g->seg[b->ve>>1].name, b->len_min);
317
+ assert(b->len_min >= 0);
318
+ assert(b->len_max >= 0 && b->len_max >= b->len_min);
319
+ b->n_seg = j - jst + 1;
320
+ l = (b->len_min + 1) + (b->len_max + 1); // bytes for both NUL-terminated sequences
321
+ l = (l + 3) / 4 + b->n_seg; // rounded up to 4-byte units, plus one unit per vertex id
322
+ GFA_CALLOC(b->v, l);
323
+ b->seq_min = (char*)(b->v + b->n_seg); // sequences live right after the vertex ids in the same block
324
+ b->seq_max = b->seq_min + b->len_min + 1;
325
+ for (k = jst; k <= j; ++k)
326
+ b->v[k - jst] = sub->v[k].v;
327
+
328
+ // test bubble involving both strands (mostly inversions)
329
+ if (b->n_seg > m_vtmp) {
330
+ m_vtmp = b->n_seg;
331
+ kroundup32(m_vtmp);
332
+ GFA_REALLOC(vtmp, m_vtmp);
333
+ }
334
+ for (k = 0; k < b->n_seg; ++k) vtmp[k] = b->v[k]>>1; // drop the strand bit to get segment ids
335
+ radix_sort_gfa32(vtmp, vtmp + b->n_seg);
336
+ for (k = 1; k < b->n_seg; ++k)
337
+ if (vtmp[k] == vtmp[k-1]) break; // same segment id twice in the sorted list
338
+ b->is_bidir = (k < b->n_seg);
339
+
340
+ // generate sequences and cf_min/cf_max
341
+ GFA_MALLOC(v, j - jst);
342
+ k = j, n = 0;
343
+ while (k > jst) { // walk the shortest-path predecessors from j back towards jst
344
+ if (k < j) v[n++] = sub->v[k].v; // collect interior vertices only; bb_write_seq() reads v[] back to front
345
+ k = ba[k].sp;
346
+ }
347
+ bb_write_seq(g, n, v, b->len_min, b->seq_min);
348
+ k = j, n = 0;
349
+ while (k > jst) { // same walk along the longest-path predecessors
350
+ if (k < j) v[n++] = sub->v[k].v;
351
+ k = ba[k].lp;
352
+ }
353
+ bb_write_seq(g, n, v, b->len_max, b->seq_max);
354
+ free(v);
355
+ } // ~if(sst->snid==i&&sen->snid==i)
356
+ max_a = max_soff = -1, jst = j; // restart the scan with j as the new candidate start
357
+ } // ~if(j==max_a)
358
+ for (k = 0; k < t->n; ++k)
359
+ if ((int32_t)(sub->a[t->off + k]>>32) > max_a)
360
+ max_a = sub->a[t->off + k]>>32; // farthest arc target reached from any vertex since jst
361
+ if (g->seg[t->v>>1].snid == i && g->seg[t->v>>1].soff > max_soff)
362
+ max_soff = g->seg[t->v>>1].soff;
363
+ }
364
+ free(ba);
365
+ gfa_sub_destroy(sub);
366
+ }
367
+ free(vtmp);
368
+ gfa_scbuf_destroy(scbuf);
369
+ free(vs);
370
+ *n_bb_ = n_bb; // report the bubble count through the out-parameter
371
+ return bb;
372
+ }