ruby-minigraph 0.0.20.0

Files changed (89)
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +62 -0
  4. data/ext/Rakefile +56 -0
  5. data/ext/cmappy/cmappy.c +7 -0
  6. data/ext/cmappy/cmappy.h +8 -0
  7. data/ext/minigraph/LICENSE.txt +23 -0
  8. data/ext/minigraph/Makefile +66 -0
  9. data/ext/minigraph/NEWS.md +317 -0
  10. data/ext/minigraph/README.md +207 -0
  11. data/ext/minigraph/algo.c +194 -0
  12. data/ext/minigraph/algo.h +33 -0
  13. data/ext/minigraph/asm-call.c +147 -0
  14. data/ext/minigraph/bseq.c +133 -0
  15. data/ext/minigraph/bseq.h +76 -0
  16. data/ext/minigraph/cal_cov.c +139 -0
  17. data/ext/minigraph/doc/example1.png +0 -0
  18. data/ext/minigraph/doc/example2.png +0 -0
  19. data/ext/minigraph/doc/examples.graffle +0 -0
  20. data/ext/minigraph/format.c +241 -0
  21. data/ext/minigraph/galign.c +140 -0
  22. data/ext/minigraph/gchain1.c +532 -0
  23. data/ext/minigraph/gcmisc.c +223 -0
  24. data/ext/minigraph/gfa-aug.c +260 -0
  25. data/ext/minigraph/gfa-base.c +526 -0
  26. data/ext/minigraph/gfa-bbl.c +372 -0
  27. data/ext/minigraph/gfa-ed.c +617 -0
  28. data/ext/minigraph/gfa-io.c +395 -0
  29. data/ext/minigraph/gfa-priv.h +154 -0
  30. data/ext/minigraph/gfa.h +166 -0
  31. data/ext/minigraph/ggen.c +182 -0
  32. data/ext/minigraph/ggen.h +21 -0
  33. data/ext/minigraph/ggsimple.c +570 -0
  34. data/ext/minigraph/gmap.c +211 -0
  35. data/ext/minigraph/index.c +230 -0
  36. data/ext/minigraph/kalloc.c +224 -0
  37. data/ext/minigraph/kalloc.h +82 -0
  38. data/ext/minigraph/kavl.h +414 -0
  39. data/ext/minigraph/kdq.h +134 -0
  40. data/ext/minigraph/ketopt.h +116 -0
  41. data/ext/minigraph/khashl.h +348 -0
  42. data/ext/minigraph/krmq.h +474 -0
  43. data/ext/minigraph/kseq.h +256 -0
  44. data/ext/minigraph/ksort.h +164 -0
  45. data/ext/minigraph/kstring.h +165 -0
  46. data/ext/minigraph/kthread.c +159 -0
  47. data/ext/minigraph/kthread.h +15 -0
  48. data/ext/minigraph/kvec-km.h +105 -0
  49. data/ext/minigraph/kvec.h +110 -0
  50. data/ext/minigraph/lchain.c +441 -0
  51. data/ext/minigraph/main.c +301 -0
  52. data/ext/minigraph/map-algo.c +500 -0
  53. data/ext/minigraph/mgpriv.h +128 -0
  54. data/ext/minigraph/minigraph.1 +359 -0
  55. data/ext/minigraph/minigraph.h +176 -0
  56. data/ext/minigraph/miniwfa.c +834 -0
  57. data/ext/minigraph/miniwfa.h +95 -0
  58. data/ext/minigraph/misc/mgutils.js +1451 -0
  59. data/ext/minigraph/misc.c +12 -0
  60. data/ext/minigraph/options.c +134 -0
  61. data/ext/minigraph/shortk.c +251 -0
  62. data/ext/minigraph/sketch.c +109 -0
  63. data/ext/minigraph/sys.c +147 -0
  64. data/ext/minigraph/sys.h +20 -0
  65. data/ext/minigraph/test/MT-chimp.fa +277 -0
  66. data/ext/minigraph/test/MT-human.fa +239 -0
  67. data/ext/minigraph/test/MT-orangA.fa +276 -0
  68. data/ext/minigraph/test/MT.gfa +19 -0
  69. data/ext/minigraph/tex/Makefile +13 -0
  70. data/ext/minigraph/tex/minigraph.bib +676 -0
  71. data/ext/minigraph/tex/minigraph.tex +986 -0
  72. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
  73. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
  74. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
  75. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
  76. data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
  77. data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
  78. data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
  79. data/ext/minigraph/tex/plots/bedutils.js +367 -0
  80. data/ext/minigraph/tex/plots/chr-plot.js +130 -0
  81. data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
  82. data/ext/minigraph.patch +21 -0
  83. data/lib/minigraph/ffi/constants.rb +230 -0
  84. data/lib/minigraph/ffi/functions.rb +70 -0
  85. data/lib/minigraph/ffi/mappy.rb +8 -0
  86. data/lib/minigraph/ffi.rb +27 -0
  87. data/lib/minigraph/version.rb +5 -0
  88. data/lib/minigraph.rb +72 -0
  89. metadata +159 -0
data/ext/minigraph/gmap.c
@@ -0,0 +1,211 @@
+ #include <stdlib.h>
+ #include <assert.h>
+ #include "kthread.h"
+ #include "kalloc.h"
+ #include "bseq.h"
+ #include "sys.h"
+ #include "mgpriv.h"
+ #include "gfa-priv.h"
+
+ typedef struct {
+     int64_t mini_batch_size;
+     int n_processed, n_threads, n_fp;
+     const mg_mapopt_t *opt;
+     mg_bseq_file_t **fp;
+     const mg_idx_t *gi;
+     kstring_t str;
+     double *c_seg, *c_link;
+ } pipeline_t;
+
+ typedef struct {
+     const pipeline_t *p;
+     int n_seq, n_frag;
+     mg_bseq1_t *seq;
+     int *seg_off, *n_seg;
+     mg_gchains_t **gcs;
+     mg_tbuf_t **buf;
+ } step_t;
+
+ static void worker_for(void *_data, long i, int tid) // kt_for() callback
+ {
+     step_t *s = (step_t*)_data;
+     int qlens[MG_MAX_SEG], j, off = s->seg_off[i], pe_ori = s->p->opt->pe_ori;
+     const char *qseqs[MG_MAX_SEG];
+     mg_tbuf_t *b = s->buf[tid];
+     assert(s->n_seg[i] <= MG_MAX_SEG);
+     if (mg_dbg_flag & MG_DBG_QNAME)
+         fprintf(stderr, "QR\t%s\t%d\t%d\n", s->seq[off].name, tid, s->seq[off].l_seq);
+     for (j = 0; j < s->n_seg[i]; ++j) {
+         if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1))))
+             mg_revcomp_bseq(&s->seq[off + j]);
+         qlens[j] = s->seq[off + j].l_seq;
+         qseqs[j] = s->seq[off + j].seq;
+     }
+     if (s->p->opt->flag & MG_M_INDEPEND_SEG) {
+         for (j = 0; j < s->n_seg[i]; ++j)
+             mg_map_frag(s->p->gi, 1, &qlens[j], &qseqs[j], &s->gcs[off+j], b, s->p->opt, s->seq[off+j].name);
+     } else {
+         mg_map_frag(s->p->gi, s->n_seg[i], qlens, qseqs, &s->gcs[off], b, s->p->opt, s->seq[off].name);
+     }
+ #if 0 // for paired-end reads
+     for (j = 0; j < s->n_seg[i]; ++j) // flip the query strand and coordinate to the original read strand
+         if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1)))) {
+             int k, t;
+             mg_revcomp_bseq(&s->seq[off + j]);
+             for (k = 0; k < s->n_reg[off + j]; ++k) {
+                 mg_lchain_t *r = &s->reg[off + j][k];
+                 t = r->qs;
+                 r->qs = qlens[j] - r->qe;
+                 r->qe = qlens[j] - t;
+                 r->v ^= 1;
+             }
+         }
+ #endif
+ }
+
+ static void *worker_pipeline(void *shared, int step, void *in)
+ {
+     int i, j, k;
+     pipeline_t *p = (pipeline_t*)shared;
+     if (step == 0) { // step 0: read sequences
+         int with_qual = !(p->opt->flag & MG_M_NO_QUAL);
+         int with_comment = !!(p->opt->flag & MG_M_COPY_COMMENT);
+         int frag_mode = (p->n_fp > 1 || !!(p->opt->flag & MG_M_FRAG_MODE));
+         step_t *s;
+         s = (step_t*)calloc(1, sizeof(step_t));
+         if (p->n_fp > 1) s->seq = mg_bseq_read_frag(p->n_fp, p->fp, p->mini_batch_size, with_qual, with_comment, &s->n_seq);
+         else s->seq = mg_bseq_read(p->fp[0], p->mini_batch_size, with_qual, with_comment, frag_mode, &s->n_seq);
+         if (s->seq) {
+             s->p = p;
+             for (i = 0; i < s->n_seq; ++i)
+                 mg_toupper(s->seq[i].l_seq, s->seq[i].seq);
+             for (i = 0; i < s->n_seq; ++i)
+                 s->seq[i].rid = p->n_processed++;
+             s->buf = (mg_tbuf_t**)calloc(p->n_threads, sizeof(mg_tbuf_t*));
+             for (i = 0; i < p->n_threads; ++i)
+                 s->buf[i] = mg_tbuf_init();
+             s->seg_off = (int*)calloc(2 * s->n_seq, sizeof(int));
+             s->n_seg = s->seg_off + s->n_seq; // n_seg, rep_len and frag_gap are allocated together with seg_off
+             KCALLOC(0, s->gcs, s->n_seq);
+             for (i = 1, j = 0; i <= s->n_seq; ++i)
+                 if (i == s->n_seq || !frag_mode || !mg_qname_same(s->seq[i-1].name, s->seq[i].name)) {
+                     s->n_seg[s->n_frag] = i - j;
+                     s->seg_off[s->n_frag++] = j;
+                     j = i;
+                 }
+             return s;
+         } else free(s);
+     } else if (step == 1) { // step 1: map
+         kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_frag);
+         return in;
+     } else if (step == 2) { // step 2: output
+         void *km = 0;
+         step_t *s = (step_t*)in;
+         for (i = 0; i < p->n_threads; ++i) mg_tbuf_destroy(s->buf[i]);
+         free(s->buf);
+         if (!(mg_dbg_flag & MG_DBG_NO_KALLOC)) km = km_init();
+         for (k = 0; k < s->n_frag; ++k) {
+             int seg_st = s->seg_off[k], seg_en = s->seg_off[k] + s->n_seg[k];
+             if ((p->opt->flag & MG_M_FRAG_MODE) && (p->opt->flag & MG_M_FRAG_MERGE)) {
+                 mg_bseq1_t *t = &s->seq[seg_st];
+                 int32_t *qlens;
+                 KMALLOC(km, qlens, seg_en - seg_st); // TODO: if this is an issue (quite unlikely), preallocate
+                 for (i = seg_st; i < seg_en; ++i)
+                     qlens[i - seg_st] = s->seq[i].l_seq;
+                 if (p->opt->flag & MG_M_CAL_COV)
+                     mg_cov_map(p->gi->g, s->gcs[seg_st], p->opt->min_cov_mapq, p->opt->min_cov_blen, p->c_seg, p->c_link, t->name);
+                 else mg_write_gaf(&p->str, p->gi->g, s->gcs[seg_st], seg_en - seg_st, qlens, t->name, p->opt->flag, km);
+                 kfree(km, qlens);
+                 if (p->str.l) mg_err_fputs(p->str.s, stdout);
+             } else {
+                 for (i = seg_st; i < seg_en; ++i) {
+                     mg_bseq1_t *t = &s->seq[i];
+                     if (p->opt->flag & MG_M_CAL_COV)
+                         mg_cov_map(p->gi->g, s->gcs[i], p->opt->min_cov_mapq, p->opt->min_cov_blen, p->c_seg, p->c_link, t->name);
+                     else mg_write_gaf(&p->str, p->gi->g, s->gcs[i], 1, &t->l_seq, t->name, p->opt->flag, km);
+                     if (p->str.l) mg_err_fputs(p->str.s, stdout);
+                 }
+             }
+             for (i = seg_st; i < seg_en; ++i) {
+                 mg_gchain_free(s->gcs[i]);
+                 free(s->seq[i].seq); free(s->seq[i].name);
+                 if (s->seq[i].qual) free(s->seq[i].qual);
+                 if (s->seq[i].comment) free(s->seq[i].comment);
+             }
+         }
+         free(s->gcs); free(s->seg_off); free(s->seq); // n_seg, rep_len and frag_gap were allocated with seg_off; no memory leak here
+         if (km) km_destroy(km);
+         if (mg_verbose >= 3)
+             fprintf(stderr, "[M::%s::%.3f*%.2f] mapped %d sequences\n", __func__, realtime() - mg_realtime0, cputime() / (realtime() - mg_realtime0), s->n_seq);
+         free(s);
+     }
+     return 0;
+ }
+
+ static mg_bseq_file_t **open_bseqs(int n, const char **fn)
+ {
+     mg_bseq_file_t **fp;
+     int i, j;
+     fp = (mg_bseq_file_t**)calloc(n, sizeof(mg_bseq_file_t*));
+     for (i = 0; i < n; ++i) {
+         if ((fp[i] = mg_bseq_open(fn[i])) == 0) {
+             if (mg_verbose >= 1)
+                 fprintf(stderr, "ERROR: failed to open file '%s'\n", fn[i]);
+             for (j = 0; j < i; ++j)
+                 mg_bseq_close(fp[j]);
+             free(fp);
+             return 0;
+         }
+     }
+     return fp;
+ }
+
+ int mg_map_file_frag(const mg_idx_t *idx, int n_segs, const char **fn, const mg_mapopt_t *opt, int n_threads, double *c_seg, double *c_link)
+ {
+     int i, pl_threads;
+     pipeline_t pl;
+     if (n_segs < 1) return -1;
+     memset(&pl, 0, sizeof(pipeline_t));
+     pl.n_fp = n_segs;
+     pl.fp = open_bseqs(pl.n_fp, fn);
+     if (pl.fp == 0) return -1;
+     pl.opt = opt, pl.gi = idx;
+     pl.n_threads = n_threads > 1? n_threads : 1;
+     pl.mini_batch_size = opt->mini_batch_size;
+     pl.c_seg = c_seg, pl.c_link = c_link;
+     pl_threads = n_threads == 1? 1 : (opt->flag&MG_M_2_IO_THREADS)? 3 : 2;
+     kt_pipeline(pl_threads, worker_pipeline, &pl, 3);
+
+     free(pl.str.s);
+     for (i = 0; i < pl.n_fp; ++i)
+         mg_bseq_close(pl.fp[i]);
+     free(pl.fp);
+     return 0;
+ }
+
+ int mg_map_files(gfa_t *g, int n_fn, const char **fn, const mg_idxopt_t *ipt, const mg_mapopt_t *opt0, int n_threads)
+ {
+     mg_mapopt_t opt = *opt0;
+     mg_idx_t *gi;
+     int i, ret = 0;
+     double *cov_seg = 0, *cov_link = 0;
+     if ((gi = mg_index(g, ipt, n_threads, &opt)) == 0) return -1;
+     if (opt.flag & MG_M_CAL_COV) {
+         KCALLOC(0, cov_seg, g->n_seg);
+         KCALLOC(0, cov_link, g->n_arc);
+     }
+     if (opt.flag & MG_M_FRAG_MODE) {
+         ret = mg_map_file_frag(gi, n_fn, fn, &opt, n_threads, cov_seg, cov_link);
+     } else {
+         for (i = 0; i < n_fn; ++i) {
+             ret = mg_map_file_frag(gi, 1, &fn[i], &opt, n_threads, cov_seg, cov_link);
+             if (ret != 0) break;
+         }
+     }
+     if (opt.flag & MG_M_CAL_COV) {
+         gfa_aux_update_cv(g, "dc", cov_seg, cov_link);
+         free(cov_seg); free(cov_link);
+     }
+     mg_idx_destroy(gi);
+     return ret;
+ }
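
Note on the structure above: gmap.c drives everything through kthread's three-step kt_pipeline() protocol. Step 0 reads a mini-batch of sequences, step 1 maps each fragment in parallel with kt_for(), and step 2 prints GAF (or accumulates coverage) and frees the batch; kt_pipeline() moves batches through the steps in input order. The toy program below is a minimal sketch of the same contract, compiled against the bundled kthread.h/kthread.c. The shared_t/batch_t types and the integer "mapping" are invented for illustration and are not part of minigraph.

#include <stdio.h>
#include <stdlib.h>
#include "kthread.h" // bundled under data/ext/minigraph

typedef struct { // analogous to pipeline_t: state shared by all steps
    const int *in;
    int n_in, pos, batch_size;
} shared_t;

typedef struct { // analogous to step_t: one in-flight mini-batch
    shared_t *p;
    int n;
    long *val;
} batch_t;

static void square_one(void *data, long i, int tid) // kt_for() callback, like worker_for()
{
    batch_t *b = (batch_t*)data;
    (void)tid;
    b->val[i] *= b->val[i]; // the toy stand-in for mapping one fragment
}

static void *toy_pipeline(void *shared, int step, void *in)
{
    shared_t *p = (shared_t*)shared;
    if (step == 0) { // step 0: read a mini-batch
        int i;
        batch_t *b;
        if (p->pos >= p->n_in) return 0; // no more input: the pipeline drains
        b = (batch_t*)calloc(1, sizeof(batch_t));
        b->p = p;
        b->n = p->n_in - p->pos < p->batch_size? p->n_in - p->pos : p->batch_size;
        b->val = (long*)malloc(b->n * sizeof(long));
        for (i = 0; i < b->n; ++i) b->val[i] = p->in[p->pos + i];
        p->pos += b->n;
        return b;
    } else if (step == 1) { // step 1: process the batch in parallel
        kt_for(2, square_one, in, ((batch_t*)in)->n);
        return in;
    } else if (step == 2) { // step 2: output; batches arrive here in input order
        batch_t *b = (batch_t*)in;
        int i;
        for (i = 0; i < b->n; ++i) printf("%ld\n", b->val[i]);
        free(b->val); free(b);
    }
    return 0;
}

int main(void)
{
    int data[7] = {1, 2, 3, 4, 5, 6, 7};
    shared_t p = { data, 7, 0, 3 };
    kt_pipeline(2, toy_pipeline, &p, 3); // 2 pipeline threads, 3 steps
    return 0;
}

The same shape explains pl_threads in mg_map_file_frag(): one pipeline thread serializes everything, two overlap I/O with mapping, and three (with MG_M_2_IO_THREADS) let reading, mapping and writing all overlap.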
data/ext/minigraph/index.c
@@ -0,0 +1,230 @@
+ #include <assert.h>
+ #include "mgpriv.h"
+ #include "khashl.h"
+ #include "kthread.h"
+ #include "kvec-km.h"
+ #include "sys.h"
+
+ #define idx_hash(a) ((a)>>1)
+ #define idx_eq(a, b) ((a)>>1 == (b)>>1)
+ KHASHL_MAP_INIT(KH_LOCAL, idxhash_t, mg_hidx, uint64_t, uint64_t, idx_hash, idx_eq)
+
+ typedef struct mg_idx_bucket_s {
+     mg128_v a; // (minimizer, position) array
+     int32_t n; // size of the _p_ array
+     uint64_t *p; // position array for minimizers appearing >1 times
+     void *h; // hash table indexing _p_ and minimizers appearing once
+ } mg_idx_bucket_t;
+
+ mg_idx_t *mg_idx_init(int k, int w, int b)
+ {
+     mg_idx_t *gi;
+     if (k*2 < b) b = k * 2;
+     if (w < 1) w = 1;
+     KCALLOC(0, gi, 1);
+     gi->w = w, gi->k = k, gi->b = b;
+     KCALLOC(0, gi->B, 1<<b);
+     return gi;
+ }
+
+ void mg_idx_destroy(mg_idx_t *gi)
+ {
+     uint32_t i;
+     if (gi == 0) return;
+     if (gi->B) {
+         for (i = 0; i < 1U<<gi->b; ++i) {
+             free(gi->B[i].p);
+             free(gi->B[i].a.a);
+             mg_hidx_destroy((idxhash_t*)gi->B[i].h);
+         }
+         free(gi->B);
+     }
+     gfa_edseq_destroy(gi->n_seg, gi->es);
+     free(gi);
+ }
+
+ /****************
+  * Index access *
+  ****************/
+
+ const uint64_t *mg_idx_hget(const void *h_, const uint64_t *q, int suflen, uint64_t minier, int *n)
+ {
+     khint_t k;
+     const idxhash_t *h = (const idxhash_t*)h_;
+     *n = 0;
+     if (h == 0) return 0;
+     k = mg_hidx_get(h, minier>>suflen<<1);
+     if (k == kh_end(h)) return 0;
+     if (kh_key(h, k)&1) { // special casing when there is only one k-mer
+         *n = 1;
+         return &kh_val(h, k);
+     } else {
+         *n = (uint32_t)kh_val(h, k);
+         return &q[kh_val(h, k)>>32];
+     }
+ }
+
+ const uint64_t *mg_idx_get(const mg_idx_t *gi, uint64_t minier, int *n)
+ {
+     int mask = (1<<gi->b) - 1;
+     mg_idx_bucket_t *b = &gi->B[minier&mask];
+     return mg_idx_hget(b->h, b->p, gi->b, minier, n);
+ }
+
+ void mg_idx_cal_quantile(const mg_idx_t *gi, int32_t m, float f[], int32_t q[])
+ {
+     int32_t i;
+     uint64_t n = 0;
+     khint_t *a, k;
+     for (i = 0; i < 1<<gi->b; ++i)
+         if (gi->B[i].h) n += kh_size((idxhash_t*)gi->B[i].h);
+     a = (uint32_t*)malloc(n * 4);
+     for (i = 0, n = 0; i < 1<<gi->b; ++i) {
+         idxhash_t *h = (idxhash_t*)gi->B[i].h;
+         if (h == 0) continue;
+         for (k = 0; k < kh_end(h); ++k) {
+             if (!kh_exist(h, k)) continue;
+             a[n++] = kh_key(h, k)&1? 1 : (uint32_t)kh_val(h, k);
+         }
+     }
+     for (i = 0; i < m; ++i)
+         q[i] = ks_ksmall_uint32_t(n, a, (size_t)((1.0 - (double)f[i]) * n));
+     free(a);
+ }
+
+ /***************
+  * Index build *
+  ***************/
+
+ static void mg_idx_add(mg_idx_t *gi, int n, const mg128_t *a)
+ {
+     int i, mask = (1<<gi->b) - 1;
+     for (i = 0; i < n; ++i) {
+         mg128_v *p = &gi->B[a[i].x>>8&mask].a;
+         kv_push(mg128_t, 0, *p, a[i]);
+     }
+ }
+
+ void mg_idx_hfree(void *h_)
+ {
+     idxhash_t *h = (idxhash_t*)h_;
+     if (h == 0) return;
+     mg_hidx_destroy(h);
+ }
+
+ void *mg_idx_a2h(void *km, int32_t n_a, mg128_t *a, int suflen, uint64_t **q_, int32_t *n_)
+ {
+     int32_t N, n, n_keys;
+     int32_t j, start_a, start_q;
+     idxhash_t *h;
+     uint64_t *q;
+
+     *q_ = 0, *n_ = 0;
+     if (n_a == 0) return 0;
+
+     // sort by minimizer
+     radix_sort_128x(a, a + n_a);
+
+     // count and preallocate
+     for (j = 1, n = 1, n_keys = 0, N = 0; j <= n_a; ++j) {
+         if (j == n_a || a[j].x>>8 != a[j-1].x>>8) {
+             ++n_keys;
+             if (n > 1) N += n;
+             n = 1;
+         } else ++n;
+     }
+     h = mg_hidx_init2(km);
+     mg_hidx_resize(h, n_keys);
+     KCALLOC(km, q, N);
+     *q_ = q, *n_ = N;
+
+     // create the hash table
+     for (j = 1, n = 1, start_a = start_q = 0; j <= n_a; ++j) {
+         if (j == n_a || a[j].x>>8 != a[j-1].x>>8) {
+             khint_t itr;
+             int absent;
+             mg128_t *p = &a[j-1];
+             itr = mg_hidx_put(h, p->x>>8>>suflen<<1, &absent);
+             assert(absent && j == start_a + n);
+             if (n == 1) {
+                 kh_key(h, itr) |= 1;
+                 kh_val(h, itr) = p->y;
+             } else {
+                 int k;
+                 for (k = 0; k < n; ++k)
+                     q[start_q + k] = a[start_a + k].y;
+                 radix_sort_gfa64(&q[start_q], &q[start_q + n]); // sort by position; needed as in-place radix_sort_128x() is not stable
+                 kh_val(h, itr) = (uint64_t)start_q<<32 | n;
+                 start_q += n;
+             }
+             start_a = j, n = 1;
+         } else ++n;
+     }
+     assert(N == start_q);
+     return h;
+ }
+
+ static void worker_post(void *g, long i, int tid)
+ {
+     mg_idx_t *gi = (mg_idx_t*)g;
+     mg_idx_bucket_t *b = &gi->B[i];
+     if (b->a.n == 0) return;
+     b->h = (idxhash_t*)mg_idx_a2h(0, b->a.n, b->a.a, gi->b, &b->p, &b->n);
+     kfree(0, b->a.a);
+     b->a.n = b->a.m = 0, b->a.a = 0;
+ }
+
+ int mg_gfa_overlap(const gfa_t *g)
+ {
+     int64_t i;
+     for (i = 0; i < g->n_arc; ++i) // non-zero overlap
+         if (g->arc[i].ov != 0 || g->arc[i].ow != 0)
+             return 1;
+     return 0;
+ }
+
+ mg_idx_t *mg_index_core(gfa_t *g, int k, int w, int b, int n_threads)
+ {
+     mg_idx_t *gi;
+     mg128_v a = {0,0,0};
+     int i;
+
+     if (mg_gfa_overlap(g)) {
+         if (mg_verbose >= 1)
+             fprintf(stderr, "[E::%s] minigraph doesn't work with graphs containing overlapping segments\n", __func__);
+         return 0;
+     }
+     gi = mg_idx_init(k, w, b);
+     gi->g = g;
+
+     for (i = 0; i < g->n_seg; ++i) {
+         gfa_seg_t *s = &g->seg[i];
+         a.n = 0;
+         mg_sketch(0, s->seq, s->len, w, k, i, &a); // TODO: this can be parallelized
+         mg_idx_add(gi, a.n, a.a);
+     }
+     free(a.a);
+     kt_for(n_threads, worker_post, gi, 1<<gi->b);
+     return gi;
+ }
+
+ mg_idx_t *mg_index(gfa_t *g, const mg_idxopt_t *io, int n_threads, mg_mapopt_t *mo)
+ {
+     int32_t i, j;
+     mg_idx_t *gi;
+     for (i = 0; i < g->n_seg; ++i) { // uppercase
+         gfa_seg_t *s = &g->seg[i];
+         for (j = 0; j < s->len; ++j)
+             if (s->seq[j] >= 'a' && s->seq[j] <= 'z')
+                 s->seq[j] -= 32;
+     }
+     gi = mg_index_core(g, io->k, io->w, io->bucket_bits, n_threads);
+     if (gi == 0) return 0;
+     gi->es = gfa_edseq_init(gi->g);
+     gi->n_seg = g->n_seg;
+     if (mg_verbose >= 3)
+         fprintf(stderr, "[M::%s::%.3f*%.2f] indexed the graph\n", __func__,
+                 realtime() - mg_realtime0, cputime() / (realtime() - mg_realtime0));
+     if (mo) mg_opt_update(gi, mo, 0);
+     return gi;
+ }
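
Note on the hash layout above: mg_idx_a2h() reserves the lowest bit of each hash key as an occurrence flag. A minimizer seen exactly once gets the bit set and its position stored directly in the value (kh_key(h, itr) |= 1; kh_val(h, itr) = p->y); a minimizer seen n > 1 times keeps the bit clear, and the value packs the offset into the bucket's position array in the high 32 bits and the count in the low 32 bits, which mg_idx_hget() unpacks. The snippet below is a self-contained sketch of just that packing arithmetic; the key values, positions and offsets are made up for illustration.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t p[16]; // stand-in for a bucket's position array (mg_idx_bucket_t::p)
    uint64_t key, val;
    uint32_t n, i;

    // Case 1: a minimizer seen once. The low key bit is set and the value
    // holds the position itself.
    key = (0xbeefULL << 1) | 1;
    val = 4242; // the single position
    if (key & 1)
        printf("singleton, pos=%llu\n", (unsigned long long)val);

    // Case 2: a minimizer seen n > 1 times. The low key bit is clear and the
    // value is (uint64_t)start_q<<32 | n, pointing into the position array.
    p[3] = 100, p[4] = 200, p[5] = 300;
    key = 0xcafeULL << 1;        // low bit clear: multiple occurrences
    val = (uint64_t)3 << 32 | 3; // start_q = 3, n = 3
    if (!(key & 1)) {
        const uint64_t *occ = &p[val >> 32]; // same unpacking as mg_idx_hget()
        n = (uint32_t)val;
        for (i = 0; i < n; ++i)
            printf("pos[%u]=%llu\n", i, (unsigned long long)occ[i]);
    }
    return 0;
}

Because idx_hash() and idx_eq() both shift the flag bit away, the singleton and multi-occurrence encodings of the same minimizer hash identically, so one lookup finds either form.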
data/ext/minigraph/kalloc.c
@@ -0,0 +1,224 @@
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include "kalloc.h"
+
+ /* In kalloc, a *core* is a large chunk of contiguous memory. Each core is
+  * associated with a master header, which keeps the size of the current core
+  * and the pointer to next core. Kalloc allocates small *blocks* of memory from
+  * the cores and organizes free memory blocks in a circular single-linked list.
+  *
+  * In the following diagram, "@" stands for the header of a free block (of type
+  * header_t), "#" for the header of an allocated block (of type size_t), "-"
+  * for free memory, and "+" for allocated memory.
+  *
+  * master               This region is core 1.          master            This region is core 2.
+  *      |                                                     |
+  *   *@-------#++++++#++++++++++++@--------               *@----------#++++++++++++#+++++++@------------
+  *    |                           |                        |                               |
+  *    p=p->ptr->ptr->ptr->ptr     p->ptr                   p->ptr->ptr                     p->ptr->ptr->ptr
+  */
+ typedef struct header_t {
+     size_t size;
+     struct header_t *ptr;
+ } header_t;
+
+ typedef struct {
+     void *par;
+     size_t min_core_size;
+     header_t base, *loop_head, *core_head; /* base is a zero-sized block always kept in the loop */
+ } kmem_t;
+
+ static void panic(const char *s)
+ {
+     fprintf(stderr, "%s\n", s);
+     abort();
+ }
+
+ void *km_init2(void *km_par, size_t min_core_size)
+ {
+     kmem_t *km;
+     km = (kmem_t*)kcalloc(km_par, 1, sizeof(kmem_t));
+     km->par = km_par;
+     if (km_par) km->min_core_size = min_core_size > 0? min_core_size : ((kmem_t*)km_par)->min_core_size - 2;
+     else km->min_core_size = min_core_size > 0? min_core_size : 0x80000;
+     return (void*)km;
+ }
+
+ void *km_init(void) { return km_init2(0, 0); }
+
+ void km_destroy(void *_km)
+ {
+     kmem_t *km = (kmem_t*)_km;
+     void *km_par;
+     header_t *p, *q;
+     if (km == NULL) return;
+     km_par = km->par;
+     for (p = km->core_head; p != NULL;) {
+         q = p->ptr;
+         kfree(km_par, p);
+         p = q;
+     }
+     kfree(km_par, km);
+ }
+
+ static header_t *morecore(kmem_t *km, size_t nu)
+ {
+     header_t *q;
+     size_t bytes, *p;
+     nu = (nu + 1 + (km->min_core_size - 1)) / km->min_core_size * km->min_core_size; /* the first +1 for core header */
+     bytes = nu * sizeof(header_t);
+     q = (header_t*)kmalloc(km->par, bytes);
+     if (!q) panic("[morecore] insufficient memory");
+     q->ptr = km->core_head, q->size = nu, km->core_head = q;
+     p = (size_t*)(q + 1);
+     *p = nu - 1; /* the size of the free block; -1 because the first unit is used for the core header */
+     kfree(km, p + 1); /* initialize the new "core"; NB: the core header is not looped. */
+     return km->loop_head;
+ }
+
+ void kfree(void *_km, void *ap) /* kfree() also adds a new core to the circular list */
+ {
+     header_t *p, *q;
+     kmem_t *km = (kmem_t*)_km;
+
+     if (!ap) return;
+     if (km == NULL) {
+         free(ap);
+         return;
+     }
+     p = (header_t*)((size_t*)ap - 1);
+     p->size = *((size_t*)ap - 1);
+     /* Find the pointer that points to the block to be freed. The following loop can stop on two conditions:
+      *
+      * a) "p>q && p<q->ptr":    @------#++++++++#+++++++@-------       @---------------#+++++++@-------
+      *    (can also be in       |      |        |                 ->   |               |
+      *    two cores)            q      p        q->ptr                 q               q->ptr
+      *
+      *                          @--------  #+++++++++@--------         @--------  @------------------
+      *                          |          |         |             ->  |          |
+      *                          q          p         q->ptr            q          q->ptr
+      *
+      * b) "q>=q->ptr && (p>q || p<q->ptr)": @-------#+++++   @--------#+++++++      @-------#+++++   @----------------
+      *                                              |        |        |          ->         |        |
+      *                                              q->ptr   q        p                     q->ptr   q
+      *
+      *                                      #+++++++@-----   #++++++++@-------      @-------------   #++++++++@-------
+      *                                      |       |                 |          -> |                         |
+      *                                      p       q->ptr            q             q->ptr                    q
+      */
+     for (q = km->loop_head; !(p > q && p < q->ptr); q = q->ptr)
+         if (q >= q->ptr && (p > q || p < q->ptr)) break;
+     if (p + p->size == q->ptr) { /* two adjacent blocks, merge p and q->ptr (the 2nd and 4th cases) */
+         p->size += q->ptr->size;
+         p->ptr = q->ptr->ptr;
+     } else if (p + p->size > q->ptr && q->ptr >= p) {
+         panic("[kfree] The end of the allocated block enters a free block.");
+     } else p->ptr = q->ptr; /* backup q->ptr */
+
+     if (q + q->size == p) { /* two adjacent blocks, merge q and p (the other two cases) */
+         q->size += p->size;
+         q->ptr = p->ptr;
+         km->loop_head = q;
+     } else if (q + q->size > p && p >= q) {
+         panic("[kfree] The end of a free block enters the allocated block.");
+     } else km->loop_head = p, q->ptr = p; /* in two cores, cannot be merged; create a new block in the list */
+ }
+
+ void *kmalloc(void *_km, size_t n_bytes)
+ {
+     kmem_t *km = (kmem_t*)_km;
+     size_t n_units;
+     header_t *p, *q;
+
+     if (n_bytes == 0) return 0;
+     if (km == NULL) return malloc(n_bytes);
+     n_units = (n_bytes + sizeof(size_t) + sizeof(header_t) - 1) / sizeof(header_t); /* header+n_bytes requires at least this number of units */
+
+     if (!(q = km->loop_head)) /* the first time when kmalloc() is called, initialize it */
+         q = km->loop_head = km->base.ptr = &km->base;
+     for (p = q->ptr;; q = p, p = p->ptr) { /* search for a suitable block */
+         if (p->size >= n_units) { /* p->size is the size of the current block. This line means the current block is large enough. */
+             if (p->size == n_units) q->ptr = p->ptr; /* no need to split the block */
+             else { /* split the block. NB: memory is allocated at the end of the block! */
+                 p->size -= n_units; /* reduce the size of the free block */
+                 p += p->size; /* p points to the allocated block */
+                 *(size_t*)p = n_units; /* set the size */
+             }
+             km->loop_head = q; /* set the end of chain */
+             return (size_t*)p + 1;
+         }
+         if (p == km->loop_head) { /* then ask for more "cores" */
+             if ((p = morecore(km, n_units)) == 0) return 0;
+         }
+     }
+ }
+
+ void *kcalloc(void *_km, size_t count, size_t size)
+ {
+     kmem_t *km = (kmem_t*)_km;
+     void *p;
+     if (size == 0 || count == 0) return 0;
+     if (km == NULL) return calloc(count, size);
+     p = kmalloc(km, count * size);
+     memset(p, 0, count * size);
+     return p;
+ }
+
+ void *krealloc(void *_km, void *ap, size_t n_bytes) // TODO: this can be made more efficient in principle
+ {
+     kmem_t *km = (kmem_t*)_km;
+     size_t cap, *p, *q;
+
+     if (n_bytes == 0) {
+         kfree(km, ap); return 0;
+     }
+     if (km == NULL) return realloc(ap, n_bytes);
+     if (ap == NULL) return kmalloc(km, n_bytes);
+     p = (size_t*)ap - 1;
+     cap = (*p) * sizeof(header_t) - sizeof(size_t);
+     if (cap >= n_bytes) return ap; /* TODO: this prevents shrinking */
+     q = (size_t*)kmalloc(km, n_bytes);
+     memcpy(q, ap, cap);
+     kfree(km, ap);
+     return q;
+ }
+
+ void *krelocate(void *km, void *ap, size_t n_bytes)
+ {
+     void *p;
+     if (km == 0 || ap == 0) return ap;
+     p = kmalloc(km, n_bytes);
+     memcpy(p, ap, n_bytes);
+     kfree(km, ap);
+     return p;
+ }
+
+ void km_stat(const void *_km, km_stat_t *s)
+ {
+     kmem_t *km = (kmem_t*)_km;
+     header_t *p;
+     memset(s, 0, sizeof(km_stat_t));
+     if (km == NULL || km->loop_head == NULL) return;
+     for (p = km->loop_head;; p = p->ptr) {
+         s->available += p->size * sizeof(header_t);
+         if (p->size != 0) ++s->n_blocks; /* &kmem_t::base is always one of the cores. It is zero-sized. */
+         if (p->ptr > p && p + p->size > p->ptr)
+             panic("[km_stat] The end of a free block enters another free block.");
+         if (p->ptr == km->loop_head) break;
+     }
+     for (p = km->core_head; p != NULL; p = p->ptr) {
+         size_t size = p->size * sizeof(header_t);
+         ++s->n_cores;
+         s->capacity += size;
+         s->largest = s->largest > size? s->largest : size;
+     }
+ }
+
+ void km_stat_print(const void *km)
+ {
+     km_stat_t st;
+     km_stat(km, &st);
+     fprintf(stderr, "[km_stat] cap=%ld, avail=%ld, largest=%ld, n_core=%ld, n_block=%ld\n",
+             st.capacity, st.available, st.largest, st.n_cores, st.n_blocks);
+ }
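
Note on usage: kalloc is an arena-style allocator in which every block belongs to a kmem_t and km_destroy() releases all cores at once, which is why step 2 of the mapping pipeline can km_init()/km_destroy() a scratch arena per output pass. A short sketch of how the API is driven, assuming compilation against the bundled kalloc.c/kalloc.h; the sizes and the loop below are invented, and a NULL arena falls through to libc malloc/free, as the code above shows.

#include <stdio.h>
#include "kalloc.h" // bundled under data/ext/minigraph

int main(void)
{
    int i;
    void *km = km_init(); // one arena; everything in it dies with km_destroy()
    int *a = (int*)kmalloc(km, 100 * sizeof(int));
    int *b = (int*)kcalloc(km, 50, sizeof(int)); // zero-initialized
    for (i = 0; i < 100; ++i) a[i] = i;
    a = (int*)krealloc(km, a, 200 * sizeof(int)); // grows; the block may move
    kfree(km, b);      // returns the block to the arena's circular free list
    km_stat_print(km); // reports cap/avail/largest/n_core/n_block to stderr
    km_destroy(km);    // frees every core at once; a need not be kfree()d first
    return 0;
}

The per-arena design also explains km_init2(km_par, ...): a child arena carves its cores out of a parent arena, so nested scratch spaces can be discarded wholesale without walking individual blocks.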