ruby-minigraph 0.0.20.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +62 -0
  4. data/ext/Rakefile +56 -0
  5. data/ext/cmappy/cmappy.c +7 -0
  6. data/ext/cmappy/cmappy.h +8 -0
  7. data/ext/minigraph/LICENSE.txt +23 -0
  8. data/ext/minigraph/Makefile +66 -0
  9. data/ext/minigraph/NEWS.md +317 -0
  10. data/ext/minigraph/README.md +207 -0
  11. data/ext/minigraph/algo.c +194 -0
  12. data/ext/minigraph/algo.h +33 -0
  13. data/ext/minigraph/asm-call.c +147 -0
  14. data/ext/minigraph/bseq.c +133 -0
  15. data/ext/minigraph/bseq.h +76 -0
  16. data/ext/minigraph/cal_cov.c +139 -0
  17. data/ext/minigraph/doc/example1.png +0 -0
  18. data/ext/minigraph/doc/example2.png +0 -0
  19. data/ext/minigraph/doc/examples.graffle +0 -0
  20. data/ext/minigraph/format.c +241 -0
  21. data/ext/minigraph/galign.c +140 -0
  22. data/ext/minigraph/gchain1.c +532 -0
  23. data/ext/minigraph/gcmisc.c +223 -0
  24. data/ext/minigraph/gfa-aug.c +260 -0
  25. data/ext/minigraph/gfa-base.c +526 -0
  26. data/ext/minigraph/gfa-bbl.c +372 -0
  27. data/ext/minigraph/gfa-ed.c +617 -0
  28. data/ext/minigraph/gfa-io.c +395 -0
  29. data/ext/minigraph/gfa-priv.h +154 -0
  30. data/ext/minigraph/gfa.h +166 -0
  31. data/ext/minigraph/ggen.c +182 -0
  32. data/ext/minigraph/ggen.h +21 -0
  33. data/ext/minigraph/ggsimple.c +570 -0
  34. data/ext/minigraph/gmap.c +211 -0
  35. data/ext/minigraph/index.c +230 -0
  36. data/ext/minigraph/kalloc.c +224 -0
  37. data/ext/minigraph/kalloc.h +82 -0
  38. data/ext/minigraph/kavl.h +414 -0
  39. data/ext/minigraph/kdq.h +134 -0
  40. data/ext/minigraph/ketopt.h +116 -0
  41. data/ext/minigraph/khashl.h +348 -0
  42. data/ext/minigraph/krmq.h +474 -0
  43. data/ext/minigraph/kseq.h +256 -0
  44. data/ext/minigraph/ksort.h +164 -0
  45. data/ext/minigraph/kstring.h +165 -0
  46. data/ext/minigraph/kthread.c +159 -0
  47. data/ext/minigraph/kthread.h +15 -0
  48. data/ext/minigraph/kvec-km.h +105 -0
  49. data/ext/minigraph/kvec.h +110 -0
  50. data/ext/minigraph/lchain.c +441 -0
  51. data/ext/minigraph/main.c +301 -0
  52. data/ext/minigraph/map-algo.c +500 -0
  53. data/ext/minigraph/mgpriv.h +128 -0
  54. data/ext/minigraph/minigraph.1 +359 -0
  55. data/ext/minigraph/minigraph.h +176 -0
  56. data/ext/minigraph/miniwfa.c +834 -0
  57. data/ext/minigraph/miniwfa.h +95 -0
  58. data/ext/minigraph/misc/mgutils.js +1451 -0
  59. data/ext/minigraph/misc.c +12 -0
  60. data/ext/minigraph/options.c +134 -0
  61. data/ext/minigraph/shortk.c +251 -0
  62. data/ext/minigraph/sketch.c +109 -0
  63. data/ext/minigraph/sys.c +147 -0
  64. data/ext/minigraph/sys.h +20 -0
  65. data/ext/minigraph/test/MT-chimp.fa +277 -0
  66. data/ext/minigraph/test/MT-human.fa +239 -0
  67. data/ext/minigraph/test/MT-orangA.fa +276 -0
  68. data/ext/minigraph/test/MT.gfa +19 -0
  69. data/ext/minigraph/tex/Makefile +13 -0
  70. data/ext/minigraph/tex/minigraph.bib +676 -0
  71. data/ext/minigraph/tex/minigraph.tex +986 -0
  72. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
  73. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
  74. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
  75. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
  76. data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
  77. data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
  78. data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
  79. data/ext/minigraph/tex/plots/bedutils.js +367 -0
  80. data/ext/minigraph/tex/plots/chr-plot.js +130 -0
  81. data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
  82. data/ext/minigraph.patch +21 -0
  83. data/lib/minigraph/ffi/constants.rb +230 -0
  84. data/lib/minigraph/ffi/functions.rb +70 -0
  85. data/lib/minigraph/ffi/mappy.rb +8 -0
  86. data/lib/minigraph/ffi.rb +27 -0
  87. data/lib/minigraph/version.rb +5 -0
  88. data/lib/minigraph.rb +72 -0
  89. metadata +159 -0
@@ -0,0 +1,166 @@
1
+ #ifndef __GFA_H__
2
+ #define __GFA_H__
3
+
4
+ #include <stdio.h>
5
+ #include <stdint.h>
6
+
7
+ #define GFA_VERSION "0.5-r247-dirty"
8
+
9
+ #define GFA_O_OV_EXT 0x1
10
+ #define GFA_O_NO_SEQ 0x2
11
+
12
+ /*
13
+ A segment is a sequence. A vertex is one side of a segment. In the code,
14
+ segment_id is an integer, and vertex_id=segment_id<<1|orientation. The
15
+ convention is to use variable u, v or w for a vertex, not for a segment. An
16
+ arc is a directed edge between two vertices in the graph. Each arc has a
17
+ complement arc. A link represents an arc and its complement. The following
18
+ diagram shows an arc v->w, and the lengths used in the gfa_arc_t struct:
19
+
20
+ |<--- lv --->|<-- ov -->|
21
+ v: ------------------------>
22
+ ||overlap|||
23
+ w: -------------------------->
24
+ |<-- ow -->|<---- lw ---->|
25
+
26
+ The graph topology is solely represented by an array of gfa_arc_t objects
27
+ (see gfa_t::arc[]), where both an arc and its complement are present. The
28
+ array is sorted by gfa_arc_t::v_lv and indexed by gfa_t::idx[] most of the time.
29
+ gfa_arc_a(g, v), of size gfa_arc_n(g, v), gives the array of arcs that leave
30
+ a vertex v in the graph g.
31
+ */
32
+
33
+ typedef struct { // one directed arc v->w; the lengths follow the diagram in the comment above
34
+ uint64_t v_lv; // higher 32 bits: vertex_id; lower 32 bits: lv; packed together for sorting
35
+ uint32_t w; // vertex the arc points to; note that gfa_arc_tail() returns this and gfa_arc_head() returns v
36
+ int32_t rank;
37
+ int32_t ov, ow; // overlap lengths measured on v and on w respectively (see the diagram above)
38
+ uint64_t link_id:61, strong:1, del:1, comp:1; // link_id: a pair of dual arcs are supposed to have the same link_id; del: deletion mark written by gfa_arc_del() and gfa_seg_del()
39
+ } gfa_arc_t;
40
+
41
+ #define gfa_arc_head(a) ((uint32_t)((a).v_lv>>32))
42
+ #define gfa_arc_tail(a) ((a).w)
43
+ #define gfa_arc_len(a) ((uint32_t)(a).v_lv) // different from the original string graph
44
+ #define gfa_arc_lw(g, a) ((g)->seg[(a).w>>1].len - (a).ow)
45
+
46
+ #define gfa_arc_n(g, v) ((uint32_t)(g)->idx[(v)])
47
+ #define gfa_arc_a(g, v) (&(g)->arc[(g)->idx[(v)]>>32])
48
+
49
+ typedef struct {
50
+ uint32_t m_aux, l_aux;
51
+ uint8_t *aux;
52
+ } gfa_aux_t;
53
+
54
+ typedef struct {
55
+ uint32_t start, end; // start: starting vertex in the string graph; end: ending vertex
56
+ uint32_t len_comp, dummy; // len_comp: the length of the complement unitig
57
+ uint32_t m, n; // number of reads
58
+ uint64_t *a; // list of reads
59
+ uint64_t *r; // start and end on each read
60
+ char **name;
61
+ } gfa_utg_t;
62
+
63
+ typedef struct { // one segment (a sequence); its two vertices are segment_id<<1|orientation
64
+ int32_t len; // segment length; gfa_arc_lw() subtracts the overlap ow from it
65
+ uint32_t del:16, circ:16; // del: set to 1 by gfa_seg_del()
66
+ int32_t snid; // stable name ID
67
+ int32_t soff; // stable start position
68
+ int32_t rank; // stable rank
69
+ char *name, *seq;
70
+ gfa_utg_t *utg;
71
+ gfa_aux_t aux;
72
+ } gfa_seg_t;
73
+
74
+ typedef struct {
75
+ int32_t len, snid, soff, rank;
76
+ uint64_t end[2];
77
+ char *seq;
78
+ } gfa_sfa_t;
79
+
80
+ typedef struct {
81
+ char *name;
82
+ int32_t min, max, rank;
83
+ } gfa_sseq_t;
84
+
85
+ #define gfa_n_vtx(g) ((g)->n_seg << 1)
86
+
87
+ typedef struct { // a whole graph: segments, persistent names and arcs
88
+ // segments
89
+ uint32_t m_seg, n_seg, max_rank; // n_seg: segment count, so gfa_n_vtx() is n_seg<<1; m_seg: presumably the allocated capacity -- TODO confirm
90
+ gfa_seg_t *seg; // indexed by segment_id, i.e. vertex_id>>1 (see gfa_arc_lw())
91
+ void *h_names;
92
+ // persistent names
93
+ uint32_t m_sseq, n_sseq;
94
+ gfa_sseq_t *sseq;
95
+ void *h_snames;
96
+ // links
97
+ uint64_t m_arc, n_arc; // n_arc: presumably the number of used entries in arc[]; m_arc: presumably the allocated capacity -- TODO confirm
98
+ gfa_arc_t *arc; // all arcs of the graph; gfa_arc_a() returns a pointer into this array
99
+ gfa_aux_t *link_aux;
100
+ uint64_t *idx; // per vertex: high 32 bits = offset of its first arc in arc[], low 32 bits = number of its arcs (see gfa_arc_a()/gfa_arc_n())
101
+ } gfa_t;
102
+
103
+ typedef struct {
104
+ const char *seq;
105
+ int32_t len;
106
+ } gfa_edseq_t;
107
+
108
+ // graph augmentation
109
+
110
+ typedef struct {
111
+ uint32_t v[2];
112
+ int32_t voff[2];
113
+ int32_t coff[2], ctg;
114
+ } gfa_ins_t;
115
+
116
+ extern int gfa_verbose;
117
+ extern unsigned char gfa_comp_table[256];
118
+
119
+ #ifdef __cplusplus
120
+ extern "C" {
121
+ #endif
122
+
123
+ gfa_t *gfa_init(void);
124
+ void gfa_destroy(gfa_t *g);
125
+ gfa_t *gfa_read(const char *fn);
126
+ void gfa_print(const gfa_t *g, FILE *fp, int M_only);
127
+
128
+ gfa_edseq_t *gfa_edseq_init(const gfa_t *g);
129
+ void gfa_edseq_destroy(int32_t n_seg, gfa_edseq_t *es);
130
+
131
+ int32_t gfa_name2id(const gfa_t *g, const char *name);
132
+ uint8_t *gfa_aux_get(int l_data, const uint8_t *data, const char tag[2]);
133
+ int gfa_aux_del(int l_data, uint8_t *data, uint8_t *s);
134
+
135
+ #ifdef __cplusplus
136
+ }
137
+ #endif
138
+
139
+ #ifndef kroundup32
140
+ #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
141
+ #endif
142
+
143
+ static inline void gfa_arc_del(gfa_t *g, uint32_t v, uint32_t w, int del)
144
+ {
145
+ uint32_t i, nv = gfa_arc_n(g, v);
146
+ gfa_arc_t *av = gfa_arc_a(g, v);
147
+ for (i = 0; i < nv; ++i)
148
+ if (av[i].w == w) av[i].del = !!del;
149
+ }
150
+
151
+ static inline void gfa_seg_del(gfa_t *g, uint32_t s)
152
+ {
153
+ uint32_t k;
154
+ g->seg[s].del = 1;
155
+ for (k = 0; k < 2; ++k) {
156
+ uint32_t i, v = s<<1 | k;
157
+ uint32_t nv = gfa_arc_n(g, v);
158
+ gfa_arc_t *av = gfa_arc_a(g, v);
159
+ for (i = 0; i < nv; ++i) {
160
+ av[i].del = 1;
161
+ gfa_arc_del(g, av[i].w^1, v^1, 1);
162
+ }
163
+ }
164
+ }
165
+
166
+ #endif
@@ -0,0 +1,182 @@
1
+ #include <stdlib.h>
2
+ #include <string.h>
3
+ #include <assert.h>
4
+ #include <ctype.h>
5
+ #include "kthread.h"
6
+ #include "kalloc.h"
7
+ #include "sys.h"
8
+ #include "bseq.h"
9
+ #include "ggen.h"
10
+ #include "mgpriv.h"
11
+ #include "gfa-priv.h"
12
+
13
+ typedef struct { // mapping results for one input file; built by ggen_map(), released by mg_free_maprst()
14
+ int n_seq; // number of sequences returned by mg_bseq_read()
15
+ mg_bseq1_t *seq; // the sequences themselves, n_seq entries
16
+ mg_gchains_t **gcs; // gcs[i] holds the mg_map() result for seq[i]
17
+ } maprst_t;
18
+
19
+ typedef struct { // shared state handed to worker_for() through kt_for()
20
+ const mg_mapopt_t *opt; // mapping options forwarded to mg_map()
21
+ const mg_idx_t *gi; // graph index the sequences are mapped against
22
+ mg_tbuf_t **buf; // one buffer per thread, indexed by the tid given to worker_for()
23
+ maprst_t *r; // input sequences and the slots receiving their results
24
+ } step_t;
25
+
26
+ static void worker_for(void *_data, long i, int tid) // kt_for() callback
27
+ {
28
+ step_t *s = (step_t*)_data;
29
+ if (mg_dbg_flag & MG_DBG_QNAME)
30
+ fprintf(stderr, "QR\t%s\t%d\t%d\n", s->r->seq[i].name, tid, s->r->seq[i].l_seq);
31
+ if ((s->opt->flag & MG_M_SKIP_GCHECK) == 0 && mg_verbose >= 2) {
32
+ if (gfa_sseq_get(s->gi->g, s->r->seq[i].name) >= 0)
33
+ fprintf(stderr, "[W::%s] stable sequence \"%s\" already present in the graph. This will lead to inconsistent rGFA.\n",
34
+ __func__, s->r->seq[i].name);
35
+ }
36
+ s->r->gcs[i] = mg_map(s->gi, s->r->seq[i].l_seq, s->r->seq[i].seq, s->buf[tid], s->opt, s->r->seq[i].name);
37
+ }
38
+
39
+ static maprst_t *ggen_map(const mg_idx_t *gi, const mg_mapopt_t *opt, const char *fn, int n_threads)
40
+ {
41
+ mg_bseq_file_t *fp;
42
+ maprst_t *r;
43
+ step_t s;
44
+ int i;
45
+
46
+ fp = mg_bseq_open(fn);
47
+ if (fp == 0) return 0;
48
+
49
+ KCALLOC(0, r, 1);
50
+ r->seq = mg_bseq_read(fp, 1ULL<<62, 0, 0, 0, &r->n_seq);
51
+ mg_bseq_close(fp);
52
+ if (mg_verbose >= 3)
53
+ fprintf(stderr, "[M::%s::%.3f*%.2f] loaded file \"%s\"\n", __func__,
54
+ realtime() - mg_realtime0, cputime() / (realtime() - mg_realtime0), fn);
55
+ for (i = 0; i < r->n_seq; ++i) {
56
+ r->seq[i].rid = i;
57
+ mg_toupper(r->seq[i].l_seq, r->seq[i].seq);
58
+ }
59
+ KCALLOC(0, r->gcs, r->n_seq);
60
+
61
+ s.gi = gi, s.opt = opt, s.r = r;
62
+ KCALLOC(0, s.buf, n_threads);
63
+ for (i = 0; i < n_threads; ++i) s.buf[i] = mg_tbuf_init();
64
+ kt_for(n_threads, worker_for, &s, r->n_seq);
65
+ if (mg_verbose >= 3)
66
+ fprintf(stderr, "[M::%s::%.3f*%.2f] mapped %d sequence(s) to the graph\n", __func__,
67
+ realtime() - mg_realtime0, cputime() / (realtime() - mg_realtime0), r->n_seq);
68
+ for (i = 0; i < n_threads; ++i) mg_tbuf_destroy(s.buf[i]);
69
+ free(s.buf);
70
+ return r;
71
+ }
72
+
73
+ static void mg_free_maprst(maprst_t *r)
74
+ {
75
+ int i;
76
+ for (i = 0; i < r->n_seq; ++i) {
77
+ mg_gchain_free(r->gcs[i]);
78
+ free(r->seq[i].seq); free(r->seq[i].name);
79
+ }
80
+ free(r->gcs); free(r->seq);
81
+ free(r);
82
+ }
83
+
84
+ int mg_ggen_aug(gfa_t *g, int32_t n_fn, const char **fn, const mg_idxopt_t *ipt, const mg_mapopt_t *opt0, const mg_ggopt_t *go, int n_threads)
85
+ {
86
+ int i;
87
+ mg_mapopt_t opt = *opt0;
88
+ if (g == 0) return -1;
89
+ for (i = 0; i < n_fn; ++i) {
90
+ mg_idx_t *gi;
91
+ maprst_t *r;
92
+ if ((gi = mg_index(g, ipt, n_threads, &opt)) == 0) return -1;
93
+ r = ggen_map(gi, &opt, fn[i], n_threads);
94
+ if (opt0->flag & MG_M_CIGAR)
95
+ mg_ggsimple_cigar(0, go, g, r->n_seq, r->seq, r->gcs);
96
+ else
97
+ mg_ggsimple(0, go, g, r->n_seq, r->seq, r->gcs);
98
+ mg_free_maprst(r);
99
+ mg_idx_destroy(gi);
100
+ }
101
+ return 0;
102
+ }
103
+
104
+ int mg_ggen_cov(gfa_t *g, int32_t n_fn, const char **fn, const mg_idxopt_t *ipt, const mg_mapopt_t *opt0, const mg_ggopt_t *go, int n_threads)
105
+ {
106
+ int32_t i;
107
+ mg_mapopt_t opt = *opt0;
108
+ mg_idx_t *gi;
109
+ double *cov_seg, *cov_link;
110
+ int64_t j;
111
+ if ((gi = mg_index(g, ipt, n_threads, &opt)) == 0) return -1;
112
+ KCALLOC(0, cov_seg, g->n_seg);
113
+ KCALLOC(0, cov_link, g->n_arc);
114
+ for (i = 0; i < n_fn; ++i) {
115
+ maprst_t *r;
116
+ r = ggen_map(gi, &opt, fn[i], n_threads);
117
+ mg_cov_asm(g, r->n_seq, r->gcs, go->min_mapq, go->min_map_len, cov_seg, cov_link);
118
+ mg_free_maprst(r);
119
+ }
120
+ mg_idx_destroy(gi);
121
+ for (j = 0; j < g->n_seg; ++j) cov_seg[j] /= n_fn;
122
+ for (j = 0; j < g->n_arc; ++j) cov_link[j] /= n_fn;
123
+ gfa_aux_update_cv(g, "cf", cov_seg, cov_link);
124
+ free(cov_seg); free(cov_link);
125
+ return 0;
126
+ }
127
+
128
+ int mg_ggen_call(gfa_t *g, const char *fn, const mg_idxopt_t *ipt, const mg_mapopt_t *opt0, const mg_ggopt_t *go, int n_threads)
129
+ {
130
+ mg_mapopt_t opt = *opt0;
131
+ mg_idx_t *gi;
132
+ maprst_t *r;
133
+ if ((gi = mg_index(g, ipt, n_threads, &opt)) == 0) return -1;
134
+ r = ggen_map(gi, &opt, fn, n_threads);
135
+ mg_call_asm(g, r->n_seq, r->seq, r->gcs, go->min_mapq, go->min_map_len);
136
+ mg_free_maprst(r);
137
+ mg_idx_destroy(gi);
138
+ return 0;
139
+ }
140
+
141
+ int mg_ggen(gfa_t *g, int32_t n_fn, const char **fn, const mg_idxopt_t *ipt, const mg_mapopt_t *opt, const mg_ggopt_t *go, int n_threads)
142
+ {
143
+ if (go->flag & MG_G_CALL) return mg_ggen_call(g, fn[0], ipt, opt, go, n_threads);
144
+ else if (go->flag & MG_G_CAL_COV) return mg_ggen_cov(g, n_fn, fn, ipt, opt, go, n_threads);
145
+ else return mg_ggen_aug(g, n_fn, fn, ipt, opt, go, n_threads);
146
+ }
147
+
148
+ int32_t mg_path2seq(void *km, const gfa_t *g, const mg_gchains_t *gcs, int32_t ls, int32_t le, int32_t voff[2], char **seq_, int32_t *cap_) // NB: [ls,le] is a CLOSED interval
149
+ {
150
+ extern unsigned char gfa_comp_table[256];
151
+ int32_t i, k, l = 0, cap = *cap_;
152
+ char *seq = *seq_;
153
+ assert(0 <= ls && ls <= le && le < gcs->n_lc);
154
+ for (k = ls; k <= le; ++k) {
155
+ uint32_t v = gcs->lc[k].v, len = g->seg[v>>1].len;
156
+ int32_t st = 0, en = len, tmp;
157
+ if (k == ls) st = voff[0];
158
+ if (k == le) en = voff[1];
159
+ assert(0 <= st && st <= en && en <= len);
160
+ if (en - st + l + 1 > cap) {
161
+ cap = en - st + l + 1;
162
+ kroundup32(cap);
163
+ KREALLOC(km, seq, cap);
164
+ }
165
+ if (v&1) {
166
+ uint8_t *ss = (uint8_t*)g->seg[v>>1].seq;
167
+ tmp = st, st = len - en, en = len - tmp;
168
+ for (i = en - 1; i >= st; --i)
169
+ seq[l++] = gfa_comp_table[ss[i]];
170
+ } else {
171
+ memcpy(&seq[l], &g->seg[v>>1].seq[st], en - st);
172
+ l += en - st;
173
+ }
174
+ }
175
+ if (l == 0 && cap == 0) {
176
+ cap = 8;
177
+ KREALLOC(km, seq, cap);
178
+ }
179
+ seq[l] = 0;
180
+ *seq_ = seq, *cap_ = cap;
181
+ return l;
182
+ }
@@ -0,0 +1,21 @@
1
+ #ifndef MG_GGEN_H
2
+ #define MG_GGEN_H
3
+
4
+ #include "minigraph.h"
5
+ #include "bseq.h"
6
+
7
+ #ifdef __cplusplus
8
+ extern "C" {
9
+ #endif
10
+
11
+ int32_t mg_path2seq(void *km, const gfa_t *g, const mg_gchains_t *gcs, int32_t ls, int32_t le, int32_t voff[2], char **seq_, int32_t *cap_);
12
+ void mg_ggsimple(void *km, const mg_ggopt_t *opt, gfa_t *g, int32_t n_seq, const mg_bseq1_t *seq, mg_gchains_t *const* gcs);
13
+ void mg_ggsimple_cigar(void *km, const mg_ggopt_t *opt, gfa_t *g, int32_t n_seq, const mg_bseq1_t *seq, mg_gchains_t *const* gcs);
14
+
15
+ void mg_call_asm(const gfa_t *g, int32_t n_seq, const mg_bseq1_t *seq, mg_gchains_t *const *gcs, int32_t min_mapq, int32_t min_blen);
16
+
17
+ #ifdef __cplusplus
18
+ }
19
+ #endif
20
+
21
+ #endif