ruby-minigraph 0.0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +62 -0
  4. data/ext/Rakefile +56 -0
  5. data/ext/cmappy/cmappy.c +7 -0
  6. data/ext/cmappy/cmappy.h +8 -0
  7. data/ext/minigraph/LICENSE.txt +23 -0
  8. data/ext/minigraph/Makefile +66 -0
  9. data/ext/minigraph/NEWS.md +317 -0
  10. data/ext/minigraph/README.md +207 -0
  11. data/ext/minigraph/algo.c +194 -0
  12. data/ext/minigraph/algo.h +33 -0
  13. data/ext/minigraph/asm-call.c +147 -0
  14. data/ext/minigraph/bseq.c +133 -0
  15. data/ext/minigraph/bseq.h +76 -0
  16. data/ext/minigraph/cal_cov.c +139 -0
  17. data/ext/minigraph/doc/example1.png +0 -0
  18. data/ext/minigraph/doc/example2.png +0 -0
  19. data/ext/minigraph/doc/examples.graffle +0 -0
  20. data/ext/minigraph/format.c +241 -0
  21. data/ext/minigraph/galign.c +140 -0
  22. data/ext/minigraph/gchain1.c +532 -0
  23. data/ext/minigraph/gcmisc.c +223 -0
  24. data/ext/minigraph/gfa-aug.c +260 -0
  25. data/ext/minigraph/gfa-base.c +526 -0
  26. data/ext/minigraph/gfa-bbl.c +372 -0
  27. data/ext/minigraph/gfa-ed.c +617 -0
  28. data/ext/minigraph/gfa-io.c +395 -0
  29. data/ext/minigraph/gfa-priv.h +154 -0
  30. data/ext/minigraph/gfa.h +166 -0
  31. data/ext/minigraph/ggen.c +182 -0
  32. data/ext/minigraph/ggen.h +21 -0
  33. data/ext/minigraph/ggsimple.c +570 -0
  34. data/ext/minigraph/gmap.c +211 -0
  35. data/ext/minigraph/index.c +230 -0
  36. data/ext/minigraph/kalloc.c +224 -0
  37. data/ext/minigraph/kalloc.h +82 -0
  38. data/ext/minigraph/kavl.h +414 -0
  39. data/ext/minigraph/kdq.h +134 -0
  40. data/ext/minigraph/ketopt.h +116 -0
  41. data/ext/minigraph/khashl.h +348 -0
  42. data/ext/minigraph/krmq.h +474 -0
  43. data/ext/minigraph/kseq.h +256 -0
  44. data/ext/minigraph/ksort.h +164 -0
  45. data/ext/minigraph/kstring.h +165 -0
  46. data/ext/minigraph/kthread.c +159 -0
  47. data/ext/minigraph/kthread.h +15 -0
  48. data/ext/minigraph/kvec-km.h +105 -0
  49. data/ext/minigraph/kvec.h +110 -0
  50. data/ext/minigraph/lchain.c +441 -0
  51. data/ext/minigraph/main.c +301 -0
  52. data/ext/minigraph/map-algo.c +500 -0
  53. data/ext/minigraph/mgpriv.h +128 -0
  54. data/ext/minigraph/minigraph.1 +359 -0
  55. data/ext/minigraph/minigraph.h +176 -0
  56. data/ext/minigraph/miniwfa.c +834 -0
  57. data/ext/minigraph/miniwfa.h +95 -0
  58. data/ext/minigraph/misc/mgutils.js +1451 -0
  59. data/ext/minigraph/misc.c +12 -0
  60. data/ext/minigraph/options.c +134 -0
  61. data/ext/minigraph/shortk.c +251 -0
  62. data/ext/minigraph/sketch.c +109 -0
  63. data/ext/minigraph/sys.c +147 -0
  64. data/ext/minigraph/sys.h +20 -0
  65. data/ext/minigraph/test/MT-chimp.fa +277 -0
  66. data/ext/minigraph/test/MT-human.fa +239 -0
  67. data/ext/minigraph/test/MT-orangA.fa +276 -0
  68. data/ext/minigraph/test/MT.gfa +19 -0
  69. data/ext/minigraph/tex/Makefile +13 -0
  70. data/ext/minigraph/tex/minigraph.bib +676 -0
  71. data/ext/minigraph/tex/minigraph.tex +986 -0
  72. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
  73. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
  74. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
  75. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
  76. data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
  77. data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
  78. data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
  79. data/ext/minigraph/tex/plots/bedutils.js +367 -0
  80. data/ext/minigraph/tex/plots/chr-plot.js +130 -0
  81. data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
  82. data/ext/minigraph.patch +21 -0
  83. data/lib/minigraph/ffi/constants.rb +230 -0
  84. data/lib/minigraph/ffi/functions.rb +70 -0
  85. data/lib/minigraph/ffi/mappy.rb +8 -0
  86. data/lib/minigraph/ffi.rb +27 -0
  87. data/lib/minigraph/version.rb +5 -0
  88. data/lib/minigraph.rb +72 -0
  89. metadata +159 -0
@@ -0,0 +1,166 @@
1
+ #ifndef __GFA_H__
2
+ #define __GFA_H__
3
+
4
+ #include <stdio.h>
5
+ #include <stdint.h>
6
+
7
+ #define GFA_VERSION "0.5-r247-dirty"
8
+
9
+ #define GFA_O_OV_EXT 0x1
10
+ #define GFA_O_NO_SEQ 0x2
11
+
12
+ /*
13
+ A segment is a sequence. A vertex is one side of a segment. In the code,
14
+ segment_id is an integer, and vertex_id=segment_id<<1|orientation. The
15
+ convention is to use variable u, v or w for a vertex, not for a segment. An
16
+ arc is a directed edge between two vertices in the graph. Each arc has a
17
+ complement arc. A link represents an arc and its complement. The following
18
+ diagram shows an arc v->w, and the lengths used in the gfa_arc_t struct:
19
+
20
+ |<--- lv --->|<-- ov -->|
21
+ v: ------------------------>
22
+ ||overlap|||
23
+ w: -------------------------->
24
+ |<-- ow -->|<---- lw ---->|
25
+
26
+ The graph topology is solely represented by an array of gfa_arc_t objects
27
+ (see gfa_t::arc[]), where both an arc and its complement are present. The
28
+ array is sorted by gfa_arc_t::v_lv and indexed by gfa_t::idx[] most of the time.
29
+ gfa_arc_a(g, v), of size gfa_arc_n(g, v), gives the array of arcs that leave
30
+ a vertex v in the graph g.
31
+ */
32
+
33
/* One directed arc v->w. An arc and its complement are stored as two separate entries. */
typedef struct {
	uint64_t v_lv;        // upper 32 bits: vertex_id of v; lower 32 bits: lv; kept in one word so arcs can be sorted on it
	uint32_t w;           // the vertex the arc points to
	int32_t rank;
	int32_t ov;           // overlap length measured on v
	int32_t ow;           // overlap length measured on w
	uint64_t link_id:61;  // the two dual arcs of a link are supposed to carry the same link_id
	uint64_t strong:1;
	uint64_t del:1;       // deletion mark; written by gfa_arc_del() and gfa_seg_del()
	uint64_t comp:1;
} gfa_arc_t;
40
+
41
+ #define gfa_arc_head(a) ((uint32_t)((a).v_lv>>32))
42
+ #define gfa_arc_tail(a) ((a).w)
43
+ #define gfa_arc_len(a) ((uint32_t)(a).v_lv) // different from the original string graph
44
+ #define gfa_arc_lw(g, a) ((g)->seg[(a).w>>1].len - (a).ow)
45
+
46
+ #define gfa_arc_n(g, v) ((uint32_t)(g)->idx[(v)])
47
+ #define gfa_arc_a(g, v) (&(g)->arc[(g)->idx[(v)]>>32])
48
+
49
/* A raw byte buffer holding auxiliary data. */
typedef struct {
	uint32_t m_aux;  // presumably the allocated size of aux (naming convention; confirm against the allocator code)
	uint32_t l_aux;  // presumably the number of bytes in use (confirm)
	uint8_t *aux;
} gfa_aux_t;
53
+
54
/* Unitig information attached to a segment. */
typedef struct {
	uint32_t start;     // starting vertex in the string graph
	uint32_t end;       // ending vertex in the string graph
	uint32_t len_comp;  // length of the complement unitig
	uint32_t dummy;
	uint32_t m;
	uint32_t n;         // number of reads
	uint64_t *a;        // list of reads
	uint64_t *r;        // start and end on each read
	char **name;
} gfa_utg_t;
62
+
63
+ typedef struct {
64
+ int32_t len;
65
+ uint32_t del:16, circ:16;
66
+ int32_t snid; // stable name ID
67
+ int32_t soff; // stable start position
68
+ int32_t rank; // stable rank
69
+ char *name, *seq;
70
+ gfa_utg_t *utg;
71
+ gfa_aux_t aux;
72
+ } gfa_seg_t;
73
+
74
/* A sequence with its stable coordinates (snid/soff/rank mirror the fields of the same names in gfa_seg_t). */
typedef struct {
	int32_t len;
	int32_t snid;
	int32_t soff;
	int32_t rank;
	uint64_t end[2];
	char *seq;
} gfa_sfa_t;
79
+
80
/* One persistent (stable) sequence name; see gfa_t::sseq. */
typedef struct {
	char *name;
	int32_t min;
	int32_t max;
	int32_t rank;
} gfa_sseq_t;
84
+
85
+ #define gfa_n_vtx(g) ((g)->n_seg << 1)
86
+
87
+ typedef struct {
88
+ // segments
89
+ uint32_t m_seg, n_seg, max_rank;
90
+ gfa_seg_t *seg;
91
+ void *h_names;
92
+ // persistent names
93
+ uint32_t m_sseq, n_sseq;
94
+ gfa_sseq_t *sseq;
95
+ void *h_snames;
96
+ // links
97
+ uint64_t m_arc, n_arc;
98
+ gfa_arc_t *arc;
99
+ gfa_aux_t *link_aux;
100
+ uint64_t *idx;
101
+ } gfa_t;
102
+
103
/* A read-only sequence together with its length. */
typedef struct {
	const char *seq;
	int32_t len;
} gfa_edseq_t;
107
+
108
+ // graph augmentation
109
+
110
/* Description of one insertion; each two-element array holds one value per end. */
typedef struct {
	uint32_t v[2];     // presumably the two graph vertices involved (confirm against the augmentation code)
	int32_t voff[2];   // presumably offsets on v[0] and v[1] (confirm)
	int32_t coff[2];   // presumably offsets on the contig ctg (confirm)
	int32_t ctg;
} gfa_ins_t;
115
+
116
+ extern int gfa_verbose;
117
+ extern unsigned char gfa_comp_table[256];
118
+
119
+ #ifdef __cplusplus
120
+ extern "C" {
121
+ #endif
122
+
123
+ gfa_t *gfa_init(void);
124
+ void gfa_destroy(gfa_t *g);
125
+ gfa_t *gfa_read(const char *fn);
126
+ void gfa_print(const gfa_t *g, FILE *fp, int M_only);
127
+
128
+ gfa_edseq_t *gfa_edseq_init(const gfa_t *g);
129
+ void gfa_edseq_destroy(int32_t n_seg, gfa_edseq_t *es);
130
+
131
+ int32_t gfa_name2id(const gfa_t *g, const char *name);
132
+ uint8_t *gfa_aux_get(int l_data, const uint8_t *data, const char tag[2]);
133
+ int gfa_aux_del(int l_data, uint8_t *data, uint8_t *s);
134
+
135
+ #ifdef __cplusplus
136
+ }
137
+ #endif
138
+
139
+ #ifndef kroundup32
140
+ #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
141
+ #endif
142
+
143
+ static inline void gfa_arc_del(gfa_t *g, uint32_t v, uint32_t w, int del)
144
+ {
145
+ uint32_t i, nv = gfa_arc_n(g, v);
146
+ gfa_arc_t *av = gfa_arc_a(g, v);
147
+ for (i = 0; i < nv; ++i)
148
+ if (av[i].w == w) av[i].del = !!del;
149
+ }
150
+
151
+ static inline void gfa_seg_del(gfa_t *g, uint32_t s)
152
+ {
153
+ uint32_t k;
154
+ g->seg[s].del = 1;
155
+ for (k = 0; k < 2; ++k) {
156
+ uint32_t i, v = s<<1 | k;
157
+ uint32_t nv = gfa_arc_n(g, v);
158
+ gfa_arc_t *av = gfa_arc_a(g, v);
159
+ for (i = 0; i < nv; ++i) {
160
+ av[i].del = 1;
161
+ gfa_arc_del(g, av[i].w^1, v^1, 1);
162
+ }
163
+ }
164
+ }
165
+
166
+ #endif
@@ -0,0 +1,182 @@
1
+ #include <stdlib.h>
2
+ #include <string.h>
3
+ #include <assert.h>
4
+ #include <ctype.h>
5
+ #include "kthread.h"
6
+ #include "kalloc.h"
7
+ #include "sys.h"
8
+ #include "bseq.h"
9
+ #include "ggen.h"
10
+ #include "mgpriv.h"
11
+ #include "gfa-priv.h"
12
+
13
+ typedef struct {
14
+ int n_seq;
15
+ mg_bseq1_t *seq;
16
+ mg_gchains_t **gcs;
17
+ } maprst_t;
18
+
19
+ typedef struct {
20
+ const mg_mapopt_t *opt;
21
+ const mg_idx_t *gi;
22
+ mg_tbuf_t **buf;
23
+ maprst_t *r;
24
+ } step_t;
25
+
26
+ static void worker_for(void *_data, long i, int tid) // kt_for() callback
27
+ {
28
+ step_t *s = (step_t*)_data;
29
+ if (mg_dbg_flag & MG_DBG_QNAME)
30
+ fprintf(stderr, "QR\t%s\t%d\t%d\n", s->r->seq[i].name, tid, s->r->seq[i].l_seq);
31
+ if ((s->opt->flag & MG_M_SKIP_GCHECK) == 0 && mg_verbose >= 2) {
32
+ if (gfa_sseq_get(s->gi->g, s->r->seq[i].name) >= 0)
33
+ fprintf(stderr, "[W::%s] stable sequence \"%s\" already present in the graph. This will lead to inconsistent rGFA.\n",
34
+ __func__, s->r->seq[i].name);
35
+ }
36
+ s->r->gcs[i] = mg_map(s->gi, s->r->seq[i].l_seq, s->r->seq[i].seq, s->buf[tid], s->opt, s->r->seq[i].name);
37
+ }
38
+
39
+ static maprst_t *ggen_map(const mg_idx_t *gi, const mg_mapopt_t *opt, const char *fn, int n_threads)
40
+ {
41
+ mg_bseq_file_t *fp;
42
+ maprst_t *r;
43
+ step_t s;
44
+ int i;
45
+
46
+ fp = mg_bseq_open(fn);
47
+ if (fp == 0) return 0;
48
+
49
+ KCALLOC(0, r, 1);
50
+ r->seq = mg_bseq_read(fp, 1ULL<<62, 0, 0, 0, &r->n_seq);
51
+ mg_bseq_close(fp);
52
+ if (mg_verbose >= 3)
53
+ fprintf(stderr, "[M::%s::%.3f*%.2f] loaded file \"%s\"\n", __func__,
54
+ realtime() - mg_realtime0, cputime() / (realtime() - mg_realtime0), fn);
55
+ for (i = 0; i < r->n_seq; ++i) {
56
+ r->seq[i].rid = i;
57
+ mg_toupper(r->seq[i].l_seq, r->seq[i].seq);
58
+ }
59
+ KCALLOC(0, r->gcs, r->n_seq);
60
+
61
+ s.gi = gi, s.opt = opt, s.r = r;
62
+ KCALLOC(0, s.buf, n_threads);
63
+ for (i = 0; i < n_threads; ++i) s.buf[i] = mg_tbuf_init();
64
+ kt_for(n_threads, worker_for, &s, r->n_seq);
65
+ if (mg_verbose >= 3)
66
+ fprintf(stderr, "[M::%s::%.3f*%.2f] mapped %d sequence(s) to the graph\n", __func__,
67
+ realtime() - mg_realtime0, cputime() / (realtime() - mg_realtime0), r->n_seq);
68
+ for (i = 0; i < n_threads; ++i) mg_tbuf_destroy(s.buf[i]);
69
+ free(s.buf);
70
+ return r;
71
+ }
72
+
73
+ static void mg_free_maprst(maprst_t *r)
74
+ {
75
+ int i;
76
+ for (i = 0; i < r->n_seq; ++i) {
77
+ mg_gchain_free(r->gcs[i]);
78
+ free(r->seq[i].seq); free(r->seq[i].name);
79
+ }
80
+ free(r->gcs); free(r->seq);
81
+ free(r);
82
+ }
83
+
84
+ int mg_ggen_aug(gfa_t *g, int32_t n_fn, const char **fn, const mg_idxopt_t *ipt, const mg_mapopt_t *opt0, const mg_ggopt_t *go, int n_threads)
85
+ {
86
+ int i;
87
+ mg_mapopt_t opt = *opt0;
88
+ if (g == 0) return -1;
89
+ for (i = 0; i < n_fn; ++i) {
90
+ mg_idx_t *gi;
91
+ maprst_t *r;
92
+ if ((gi = mg_index(g, ipt, n_threads, &opt)) == 0) return -1;
93
+ r = ggen_map(gi, &opt, fn[i], n_threads);
94
+ if (opt0->flag & MG_M_CIGAR)
95
+ mg_ggsimple_cigar(0, go, g, r->n_seq, r->seq, r->gcs);
96
+ else
97
+ mg_ggsimple(0, go, g, r->n_seq, r->seq, r->gcs);
98
+ mg_free_maprst(r);
99
+ mg_idx_destroy(gi);
100
+ }
101
+ return 0;
102
+ }
103
+
104
+ int mg_ggen_cov(gfa_t *g, int32_t n_fn, const char **fn, const mg_idxopt_t *ipt, const mg_mapopt_t *opt0, const mg_ggopt_t *go, int n_threads)
105
+ {
106
+ int32_t i;
107
+ mg_mapopt_t opt = *opt0;
108
+ mg_idx_t *gi;
109
+ double *cov_seg, *cov_link;
110
+ int64_t j;
111
+ if ((gi = mg_index(g, ipt, n_threads, &opt)) == 0) return -1;
112
+ KCALLOC(0, cov_seg, g->n_seg);
113
+ KCALLOC(0, cov_link, g->n_arc);
114
+ for (i = 0; i < n_fn; ++i) {
115
+ maprst_t *r;
116
+ r = ggen_map(gi, &opt, fn[i], n_threads);
117
+ mg_cov_asm(g, r->n_seq, r->gcs, go->min_mapq, go->min_map_len, cov_seg, cov_link);
118
+ mg_free_maprst(r);
119
+ }
120
+ mg_idx_destroy(gi);
121
+ for (j = 0; j < g->n_seg; ++j) cov_seg[j] /= n_fn;
122
+ for (j = 0; j < g->n_arc; ++j) cov_link[j] /= n_fn;
123
+ gfa_aux_update_cv(g, "cf", cov_seg, cov_link);
124
+ free(cov_seg); free(cov_link);
125
+ return 0;
126
+ }
127
+
128
+ int mg_ggen_call(gfa_t *g, const char *fn, const mg_idxopt_t *ipt, const mg_mapopt_t *opt0, const mg_ggopt_t *go, int n_threads)
129
+ {
130
+ mg_mapopt_t opt = *opt0;
131
+ mg_idx_t *gi;
132
+ maprst_t *r;
133
+ if ((gi = mg_index(g, ipt, n_threads, &opt)) == 0) return -1;
134
+ r = ggen_map(gi, &opt, fn, n_threads);
135
+ mg_call_asm(g, r->n_seq, r->seq, r->gcs, go->min_mapq, go->min_map_len);
136
+ mg_free_maprst(r);
137
+ mg_idx_destroy(gi);
138
+ return 0;
139
+ }
140
+
141
+ int mg_ggen(gfa_t *g, int32_t n_fn, const char **fn, const mg_idxopt_t *ipt, const mg_mapopt_t *opt, const mg_ggopt_t *go, int n_threads)
142
+ {
143
+ if (go->flag & MG_G_CALL) return mg_ggen_call(g, fn[0], ipt, opt, go, n_threads);
144
+ else if (go->flag & MG_G_CAL_COV) return mg_ggen_cov(g, n_fn, fn, ipt, opt, go, n_threads);
145
+ else return mg_ggen_aug(g, n_fn, fn, ipt, opt, go, n_threads);
146
+ }
147
+
148
+ int32_t mg_path2seq(void *km, const gfa_t *g, const mg_gchains_t *gcs, int32_t ls, int32_t le, int32_t voff[2], char **seq_, int32_t *cap_) // NB: [ls,le] is a CLOSED interval
149
+ {
150
+ extern unsigned char gfa_comp_table[256];
151
+ int32_t i, k, l = 0, cap = *cap_;
152
+ char *seq = *seq_;
153
+ assert(0 <= ls && ls <= le && le < gcs->n_lc);
154
+ for (k = ls; k <= le; ++k) {
155
+ uint32_t v = gcs->lc[k].v, len = g->seg[v>>1].len;
156
+ int32_t st = 0, en = len, tmp;
157
+ if (k == ls) st = voff[0];
158
+ if (k == le) en = voff[1];
159
+ assert(0 <= st && st <= en && en <= len);
160
+ if (en - st + l + 1 > cap) {
161
+ cap = en - st + l + 1;
162
+ kroundup32(cap);
163
+ KREALLOC(km, seq, cap);
164
+ }
165
+ if (v&1) {
166
+ uint8_t *ss = (uint8_t*)g->seg[v>>1].seq;
167
+ tmp = st, st = len - en, en = len - tmp;
168
+ for (i = en - 1; i >= st; --i)
169
+ seq[l++] = gfa_comp_table[ss[i]];
170
+ } else {
171
+ memcpy(&seq[l], &g->seg[v>>1].seq[st], en - st);
172
+ l += en - st;
173
+ }
174
+ }
175
+ if (l == 0 && cap == 0) {
176
+ cap = 8;
177
+ KREALLOC(km, seq, cap);
178
+ }
179
+ seq[l] = 0;
180
+ *seq_ = seq, *cap_ = cap;
181
+ return l;
182
+ }
@@ -0,0 +1,21 @@
1
+ #ifndef MG_GGEN_H
2
+ #define MG_GGEN_H
3
+
4
+ #include "minigraph.h"
5
+ #include "bseq.h"
6
+
7
+ #ifdef __cplusplus
8
+ extern "C" {
9
+ #endif
10
+
11
+ int32_t mg_path2seq(void *km, const gfa_t *g, const mg_gchains_t *gcs, int32_t ls, int32_t le, int32_t voff[2], char **seq_, int32_t *cap_);
12
+ void mg_ggsimple(void *km, const mg_ggopt_t *opt, gfa_t *g, int32_t n_seq, const mg_bseq1_t *seq, mg_gchains_t *const* gcs);
13
+ void mg_ggsimple_cigar(void *km, const mg_ggopt_t *opt, gfa_t *g, int32_t n_seq, const mg_bseq1_t *seq, mg_gchains_t *const* gcs);
14
+
15
+ void mg_call_asm(const gfa_t *g, int32_t n_seq, const mg_bseq1_t *seq, mg_gchains_t *const *gcs, int32_t min_mapq, int32_t min_blen);
16
+
17
+ #ifdef __cplusplus
18
+ }
19
+ #endif
20
+
21
+ #endif