ruby-minigraph 0.0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +62 -0
  4. data/ext/Rakefile +56 -0
  5. data/ext/cmappy/cmappy.c +7 -0
  6. data/ext/cmappy/cmappy.h +8 -0
  7. data/ext/minigraph/LICENSE.txt +23 -0
  8. data/ext/minigraph/Makefile +66 -0
  9. data/ext/minigraph/NEWS.md +317 -0
  10. data/ext/minigraph/README.md +207 -0
  11. data/ext/minigraph/algo.c +194 -0
  12. data/ext/minigraph/algo.h +33 -0
  13. data/ext/minigraph/asm-call.c +147 -0
  14. data/ext/minigraph/bseq.c +133 -0
  15. data/ext/minigraph/bseq.h +76 -0
  16. data/ext/minigraph/cal_cov.c +139 -0
  17. data/ext/minigraph/doc/example1.png +0 -0
  18. data/ext/minigraph/doc/example2.png +0 -0
  19. data/ext/minigraph/doc/examples.graffle +0 -0
  20. data/ext/minigraph/format.c +241 -0
  21. data/ext/minigraph/galign.c +140 -0
  22. data/ext/minigraph/gchain1.c +532 -0
  23. data/ext/minigraph/gcmisc.c +223 -0
  24. data/ext/minigraph/gfa-aug.c +260 -0
  25. data/ext/minigraph/gfa-base.c +526 -0
  26. data/ext/minigraph/gfa-bbl.c +372 -0
  27. data/ext/minigraph/gfa-ed.c +617 -0
  28. data/ext/minigraph/gfa-io.c +395 -0
  29. data/ext/minigraph/gfa-priv.h +154 -0
  30. data/ext/minigraph/gfa.h +166 -0
  31. data/ext/minigraph/ggen.c +182 -0
  32. data/ext/minigraph/ggen.h +21 -0
  33. data/ext/minigraph/ggsimple.c +570 -0
  34. data/ext/minigraph/gmap.c +211 -0
  35. data/ext/minigraph/index.c +230 -0
  36. data/ext/minigraph/kalloc.c +224 -0
  37. data/ext/minigraph/kalloc.h +82 -0
  38. data/ext/minigraph/kavl.h +414 -0
  39. data/ext/minigraph/kdq.h +134 -0
  40. data/ext/minigraph/ketopt.h +116 -0
  41. data/ext/minigraph/khashl.h +348 -0
  42. data/ext/minigraph/krmq.h +474 -0
  43. data/ext/minigraph/kseq.h +256 -0
  44. data/ext/minigraph/ksort.h +164 -0
  45. data/ext/minigraph/kstring.h +165 -0
  46. data/ext/minigraph/kthread.c +159 -0
  47. data/ext/minigraph/kthread.h +15 -0
  48. data/ext/minigraph/kvec-km.h +105 -0
  49. data/ext/minigraph/kvec.h +110 -0
  50. data/ext/minigraph/lchain.c +441 -0
  51. data/ext/minigraph/main.c +301 -0
  52. data/ext/minigraph/map-algo.c +500 -0
  53. data/ext/minigraph/mgpriv.h +128 -0
  54. data/ext/minigraph/minigraph.1 +359 -0
  55. data/ext/minigraph/minigraph.h +176 -0
  56. data/ext/minigraph/miniwfa.c +834 -0
  57. data/ext/minigraph/miniwfa.h +95 -0
  58. data/ext/minigraph/misc/mgutils.js +1451 -0
  59. data/ext/minigraph/misc.c +12 -0
  60. data/ext/minigraph/options.c +134 -0
  61. data/ext/minigraph/shortk.c +251 -0
  62. data/ext/minigraph/sketch.c +109 -0
  63. data/ext/minigraph/sys.c +147 -0
  64. data/ext/minigraph/sys.h +20 -0
  65. data/ext/minigraph/test/MT-chimp.fa +277 -0
  66. data/ext/minigraph/test/MT-human.fa +239 -0
  67. data/ext/minigraph/test/MT-orangA.fa +276 -0
  68. data/ext/minigraph/test/MT.gfa +19 -0
  69. data/ext/minigraph/tex/Makefile +13 -0
  70. data/ext/minigraph/tex/minigraph.bib +676 -0
  71. data/ext/minigraph/tex/minigraph.tex +986 -0
  72. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
  73. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
  74. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
  75. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
  76. data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
  77. data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
  78. data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
  79. data/ext/minigraph/tex/plots/bedutils.js +367 -0
  80. data/ext/minigraph/tex/plots/chr-plot.js +130 -0
  81. data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
  82. data/ext/minigraph.patch +21 -0
  83. data/lib/minigraph/ffi/constants.rb +230 -0
  84. data/lib/minigraph/ffi/functions.rb +70 -0
  85. data/lib/minigraph/ffi/mappy.rb +8 -0
  86. data/lib/minigraph/ffi.rb +27 -0
  87. data/lib/minigraph/version.rb +5 -0
  88. data/lib/minigraph.rb +72 -0
  89. metadata +159 -0
@@ -0,0 +1,211 @@
1
+ #include <stdlib.h>
2
+ #include <assert.h>
3
+ #include "kthread.h"
4
+ #include "kalloc.h"
5
+ #include "bseq.h"
6
+ #include "sys.h"
7
+ #include "mgpriv.h"
8
+ #include "gfa-priv.h"
9
+
10
// Shared state of the 3-step read/map/write pipeline driven by kt_pipeline().
typedef struct {
	int64_t mini_batch_size;          // number of query bases to read per pipeline batch
	int n_processed, n_threads, n_fp; // reads assigned an rid so far; worker threads; number of input files
	const mg_mapopt_t *opt;           // mapping options (read-only)
	mg_bseq_file_t **fp;              // n_fp open sequence files
	const mg_idx_t *gi;               // graph index queried by the mapping workers
	kstring_t str;                    // reusable buffer for GAF output lines
	double *c_seg, *c_link;           // coverage accumulators passed to mg_cov_map(); may be NULL
} pipeline_t;
19
+
20
// Per-batch state: one of these flows through the three pipeline steps.
typedef struct {
	const pipeline_t *p;  // owning pipeline
	int n_seq, n_frag;    // sequences in this batch; fragments (same-name reads grouped)
	mg_bseq1_t *seq;      // array of n_seq sequences
	int *seg_off, *n_seg; // per fragment: offset into seq[] and segment count (one allocation, see worker_pipeline)
	mg_gchains_t **gcs;   // per-sequence mapping result
	mg_tbuf_t **buf;      // one thread-local buffer per worker thread
} step_t;
28
+
29
static void worker_for(void *_data, long i, int tid) // kt_for() callback: map fragment i on thread tid
{
	step_t *s = (step_t*)_data;
	int qlens[MG_MAX_SEG], j, off = s->seg_off[i], pe_ori = s->p->opt->pe_ori;
	const char *qseqs[MG_MAX_SEG];
	mg_tbuf_t *b = s->buf[tid]; // this thread's scratch buffer
	assert(s->n_seg[i] <= MG_MAX_SEG);
	if (mg_dbg_flag & MG_DBG_QNAME)
		fprintf(stderr, "QR\t%s\t%d\t%d\n", s->seq[off].name, tid, s->seq[off].l_seq);
	for (j = 0; j < s->n_seg[i]; ++j) {
		// for a read pair, reverse-complement the end(s) selected by pe_ori so both ends face the same way
		if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1))))
			mg_revcomp_bseq(&s->seq[off + j]);
		qlens[j] = s->seq[off + j].l_seq;
		qseqs[j] = s->seq[off + j].seq;
	}
	if (s->p->opt->flag & MG_M_INDEPEND_SEG) {
		for (j = 0; j < s->n_seg[i]; ++j) // map each segment independently
			mg_map_frag(s->p->gi, 1, &qlens[j], &qseqs[j], &s->gcs[off+j], b, s->p->opt, s->seq[off+j].name);
	} else { // map all segments of the fragment jointly
		mg_map_frag(s->p->gi, s->n_seg[i], qlens, qseqs, &s->gcs[off], b, s->p->opt, s->seq[off].name);
	}
#if 0 // for paired-end reads
	for (j = 0; j < s->n_seg[i]; ++j) // flip the query strand and coordinate to the original read strand
		if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1)))) {
			int k, t;
			mg_revcomp_bseq(&s->seq[off + j]);
			for (k = 0; k < s->n_reg[off + j]; ++k) {
				mg_lchain_t *r = &s->reg[off + j][k];
				t = r->qs;
				r->qs = qlens[j] - r->qe;
				r->qe = qlens[j] - t;
				r->v ^= 1;
			}
		}
#endif
}
65
+
66
// kt_pipeline() callback. step 0 reads a batch of sequences and groups them
// into fragments; step 1 maps every fragment with kt_for(); step 2 writes the
// results (GAF or coverage) and frees all per-batch resources.
static void *worker_pipeline(void *shared, int step, void *in)
{
	int i, j, k;
	pipeline_t *p = (pipeline_t*)shared;
	if (step == 0) { // step 0: read sequences
		int with_qual = !(p->opt->flag & MG_M_NO_QUAL);
		int with_comment = !!(p->opt->flag & MG_M_COPY_COMMENT);
		int frag_mode = (p->n_fp > 1 || !!(p->opt->flag & MG_M_FRAG_MODE));
		step_t *s;
		s = (step_t*)calloc(1, sizeof(step_t));
		if (p->n_fp > 1) s->seq = mg_bseq_read_frag(p->n_fp, p->fp, p->mini_batch_size, with_qual, with_comment, &s->n_seq);
		else s->seq = mg_bseq_read(p->fp[0], p->mini_batch_size, with_qual, with_comment, frag_mode, &s->n_seq);
		if (s->seq) {
			s->p = p;
			for (i = 0; i < s->n_seq; ++i) // sketching assumes uppercase bases
				mg_toupper(s->seq[i].l_seq, s->seq[i].seq);
			for (i = 0; i < s->n_seq; ++i) // assign global read IDs in input order
				s->seq[i].rid = p->n_processed++;
			s->buf = (mg_tbuf_t**)calloc(p->n_threads, sizeof(mg_tbuf_t*));
			for (i = 0; i < p->n_threads; ++i)
				s->buf[i] = mg_tbuf_init();
			s->seg_off = (int*)calloc(2 * s->n_seq, sizeof(int));
			s->n_seg = s->seg_off + s->n_seq; // n_seg, rep_len and frag_gap are allocated together with seg_off
			KCALLOC(0, s->gcs, s->n_seq);
			// group consecutive same-name reads into one fragment (frag_mode only)
			for (i = 1, j = 0; i <= s->n_seq; ++i)
				if (i == s->n_seq || !frag_mode || !mg_qname_same(s->seq[i-1].name, s->seq[i].name)) {
					s->n_seg[s->n_frag] = i - j;
					s->seg_off[s->n_frag++] = j;
					j = i;
				}
			return s;
		} else free(s); // no more input: returning NULL ends the pipeline
	} else if (step == 1) { // step 1: map
		kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_frag);
		return in;
	} else if (step == 2) { // step 2: output
		void *km = 0;
		step_t *s = (step_t*)in;
		for (i = 0; i < p->n_threads; ++i) mg_tbuf_destroy(s->buf[i]);
		free(s->buf);
		if (!(mg_dbg_flag & MG_DBG_NO_KALLOC)) km = km_init();
		for (k = 0; k < s->n_frag; ++k) {
			int seg_st = s->seg_off[k], seg_en = s->seg_off[k] + s->n_seg[k];
			if ((p->opt->flag & MG_M_FRAG_MODE) && (p->opt->flag & MG_M_FRAG_MERGE)) {
				// merged output: one record for the whole fragment
				mg_bseq1_t *t = &s->seq[seg_st];
				int32_t *qlens;
				KMALLOC(km, qlens, seg_en - seg_st); // TODO: if this is an issue (quite unlikely), preallocate
				for (i = seg_st; i < seg_en; ++i)
					qlens[i - seg_st] = s->seq[i].l_seq;
				if (p->opt->flag & MG_M_CAL_COV)
					mg_cov_map(p->gi->g, s->gcs[seg_st], p->opt->min_cov_mapq, p->opt->min_cov_blen, p->c_seg, p->c_link, t->name);
				else mg_write_gaf(&p->str, p->gi->g, s->gcs[seg_st], seg_en - seg_st, qlens, t->name, p->opt->flag, km);
				kfree(km, qlens);
				if (p->str.l) mg_err_fputs(p->str.s, stdout);
			} else {
				// per-segment output
				for (i = seg_st; i < seg_en; ++i) {
					mg_bseq1_t *t = &s->seq[i];
					if (p->opt->flag & MG_M_CAL_COV)
						mg_cov_map(p->gi->g, s->gcs[i], p->opt->min_cov_mapq, p->opt->min_cov_blen, p->c_seg, p->c_link, t->name);
					else mg_write_gaf(&p->str, p->gi->g, s->gcs[i], 1, &t->l_seq, t->name, p->opt->flag, km);
					if (p->str.l) mg_err_fputs(p->str.s, stdout);
				}
			}
			for (i = seg_st; i < seg_en; ++i) { // release this fragment's sequences and chains
				mg_gchain_free(s->gcs[i]);
				free(s->seq[i].seq); free(s->seq[i].name);
				if (s->seq[i].qual) free(s->seq[i].qual);
				if (s->seq[i].comment) free(s->seq[i].comment);
			}
		}
		free(s->gcs); free(s->seg_off); free(s->seq); // n_seg, rep_len and frag_gap were allocated with seg_off; no memory leak here
		if (km) km_destroy(km);
		if (mg_verbose >= 3)
			fprintf(stderr, "[M::%s::%.3f*%.2f] mapped %d sequences\n", __func__, realtime() - mg_realtime0, cputime() / (realtime() - mg_realtime0), s->n_seq);
		free(s);
	}
	return 0;
}
144
+
145
+ static mg_bseq_file_t **open_bseqs(int n, const char **fn)
146
+ {
147
+ mg_bseq_file_t **fp;
148
+ int i, j;
149
+ fp = (mg_bseq_file_t**)calloc(n, sizeof(mg_bseq_file_t*));
150
+ for (i = 0; i < n; ++i) {
151
+ if ((fp[i] = mg_bseq_open(fn[i])) == 0) {
152
+ if (mg_verbose >= 1)
153
+ fprintf(stderr, "ERROR: failed to open file '%s'\n", fn[i]);
154
+ for (j = 0; j < i; ++j)
155
+ mg_bseq_close(fp[j]);
156
+ free(fp);
157
+ return 0;
158
+ }
159
+ }
160
+ return fp;
161
+ }
162
+
163
// Map the n_segs input files in fn[] (treated as one multi-segment input,
// e.g. read pairs) against index idx. c_seg/c_link, if non-NULL, receive
// coverage counts. Returns 0 on success, -1 if the files cannot be opened.
int mg_map_file_frag(const mg_idx_t *idx, int n_segs, const char **fn, const mg_mapopt_t *opt, int n_threads, double *c_seg, double *c_link)
{
	int i, pl_threads;
	pipeline_t pl;
	if (n_segs < 1) return -1;
	memset(&pl, 0, sizeof(pipeline_t));
	pl.n_fp = n_segs;
	pl.fp = open_bseqs(pl.n_fp, fn);
	if (pl.fp == 0) return -1;
	pl.opt = opt, pl.gi = idx;
	pl.n_threads = n_threads > 1? n_threads : 1;
	pl.mini_batch_size = opt->mini_batch_size;
	pl.c_seg = c_seg, pl.c_link = c_link;
	// pipeline threads: 1 single-threaded, otherwise 2, or 3 with an extra I/O thread
	pl_threads = n_threads == 1? 1 : (opt->flag&MG_M_2_IO_THREADS)? 3 : 2;
	kt_pipeline(pl_threads, worker_pipeline, &pl, 3);

	free(pl.str.s); // output buffer owned by the pipeline
	for (i = 0; i < pl.n_fp; ++i)
		mg_bseq_close(pl.fp[i]);
	free(pl.fp);
	return 0;
}
185
+
186
// Top-level entry: index graph g, then map every input file. In fragment
// mode all files are mapped together (paired input); otherwise one file at a
// time. With MG_M_CAL_COV, coverage is accumulated and written back into g.
// Returns 0 on success or a negative value on error.
int mg_map_files(gfa_t *g, int n_fn, const char **fn, const mg_idxopt_t *ipt, const mg_mapopt_t *opt0, int n_threads)
{
	mg_mapopt_t opt = *opt0; // local copy; mg_index() may adjust options
	mg_idx_t *gi;
	int i, ret = 0;
	double *cov_seg = 0, *cov_link = 0;
	if ((gi = mg_index(g, ipt, n_threads, &opt)) == 0) return -1;
	if (opt.flag & MG_M_CAL_COV) { // one counter per segment and per arc
		KCALLOC(0, cov_seg, g->n_seg);
		KCALLOC(0, cov_link, g->n_arc);
	}
	if (opt.flag & MG_M_FRAG_MODE) {
		ret = mg_map_file_frag(gi, n_fn, fn, &opt, n_threads, cov_seg, cov_link);
	} else {
		for (i = 0; i < n_fn; ++i) {
			ret = mg_map_file_frag(gi, 1, &fn[i], &opt, n_threads, cov_seg, cov_link);
			if (ret != 0) break;
		}
	}
	if (opt.flag & MG_M_CAL_COV) { // attach coverage ("dc" tags) to the graph
		gfa_aux_update_cv(g, "dc", cov_seg, cov_link);
		free(cov_seg); free(cov_link);
	}
	mg_idx_destroy(gi);
	return ret;
}
@@ -0,0 +1,230 @@
1
+ #include <assert.h>
2
+ #include "mgpriv.h"
3
+ #include "khashl.h"
4
+ #include "kthread.h"
5
+ #include "kvec-km.h"
6
+ #include "sys.h"
7
+
8
+ #define idx_hash(a) ((a)>>1)
9
+ #define idx_eq(a, b) ((a)>>1 == (b)>>1)
10
+ KHASHL_MAP_INIT(KH_LOCAL, idxhash_t, mg_hidx, uint64_t, uint64_t, idx_hash, idx_eq)
11
+
12
// One of the 2^b index buckets. While building, minimizers are collected in
// a; worker_post() then converts the bucket into the hash table h plus the
// shared position array p, and a is released.
typedef struct mg_idx_bucket_s {
	mg128_v a;   // (minimizer, position) array; emptied after conversion
	int32_t n;   // size of the _p_ array
	uint64_t *p; // position array for minimizers appearing >1 times
	void *h;     // hash table indexing _p_ and minimizers appearing once
} mg_idx_bucket_t;
18
+
19
+ mg_idx_t *mg_idx_init(int k, int w, int b)
20
+ {
21
+ mg_idx_t *gi;
22
+ if (k*2 < b) b = k * 2;
23
+ if (w < 1) w = 1;
24
+ KCALLOC(0, gi, 1);
25
+ gi->w = w, gi->k = k, gi->b = b;
26
+ KCALLOC(0, gi->B, 1<<b);
27
+ return gi;
28
+ }
29
+
30
+ void mg_idx_destroy(mg_idx_t *gi)
31
+ {
32
+ uint32_t i;
33
+ if (gi == 0) return;
34
+ if (gi->B) {
35
+ for (i = 0; i < 1U<<gi->b; ++i) {
36
+ free(gi->B[i].p);
37
+ free(gi->B[i].a.a);
38
+ mg_hidx_destroy((idxhash_t*)gi->B[i].h);
39
+ }
40
+ free(gi->B);
41
+ }
42
+ gfa_edseq_destroy(gi->n_seg, gi->es);
43
+ free(gi);
44
+ }
45
+
46
+ /****************
47
+ * Index access *
48
+ ****************/
49
+
50
/* Look up a minimizer in one bucket. h_ is the bucket's hash table, q its
 * position array, suflen the number of low key bits dropped (the bucket
 * bits). On return *n holds the number of occurrences; the return value
 * points at them, or is NULL if the minimizer is absent. */
const uint64_t *mg_idx_hget(const void *h_, const uint64_t *q, int suflen, uint64_t minier, int *n)
{
	khint_t k;
	const idxhash_t *h = (const idxhash_t*)h_;
	*n = 0;
	if (h == 0) return 0;
	k = mg_hidx_get(h, minier>>suflen<<1); // bit 0 of a stored key flags the single-occurrence case
	if (k == kh_end(h)) return 0;
	if (kh_key(h, k)&1) { // special casing when there is only one k-mer
		*n = 1;
		return &kh_val(h, k); // the value itself is the sole position
	} else {
		*n = (uint32_t)kh_val(h, k);  // low 32 bits: occurrence count
		return &q[kh_val(h, k)>>32];  // high 32 bits: offset into q[]
	}
}
66
+
67
+ const uint64_t *mg_idx_get(const mg_idx_t *gi, uint64_t minier, int *n)
68
+ {
69
+ int mask = (1<<gi->b) - 1;
70
+ mg_idx_bucket_t *b = &gi->B[minier&mask];
71
+ return mg_idx_hget(b->h, b->p, gi->b, minier, n);
72
+ }
73
+
74
+ void mg_idx_cal_quantile(const mg_idx_t *gi, int32_t m, float f[], int32_t q[])
75
+ {
76
+ int32_t i;
77
+ uint64_t n = 0;
78
+ khint_t *a, k;
79
+ for (i = 0; i < 1<<gi->b; ++i)
80
+ if (gi->B[i].h) n += kh_size((idxhash_t*)gi->B[i].h);
81
+ a = (uint32_t*)malloc(n * 4);
82
+ for (i = 0, n = 0; i < 1<<gi->b; ++i) {
83
+ idxhash_t *h = (idxhash_t*)gi->B[i].h;
84
+ if (h == 0) continue;
85
+ for (k = 0; k < kh_end(h); ++k) {
86
+ if (!kh_exist(h, k)) continue;
87
+ a[n++] = kh_key(h, k)&1? 1 : (uint32_t)kh_val(h, k);
88
+ }
89
+ }
90
+ for (i = 0; i < m; ++i)
91
+ q[i] = ks_ksmall_uint32_t(n, a, (size_t)((1.0 - (double)f[i]) * n));
92
+ free(a);
93
+ }
94
+
95
+ /***************
96
+ * Index build *
97
+ ***************/
98
+
99
+ static void mg_idx_add(mg_idx_t *gi, int n, const mg128_t *a)
100
+ {
101
+ int i, mask = (1<<gi->b) - 1;
102
+ for (i = 0; i < n; ++i) {
103
+ mg128_v *p = &gi->B[a[i].x>>8&mask].a;
104
+ kv_push(mg128_t, 0, *p, a[i]);
105
+ }
106
+ }
107
+
108
+ void mg_idx_hfree(void *h_)
109
+ {
110
+ idxhash_t *h = (idxhash_t*)h_;
111
+ if (h == 0) return;
112
+ mg_hidx_destroy(h);
113
+ }
114
+
115
/* Convert a bucket's raw (minimizer,position) array a[0..n_a-1] into a hash
 * table. Keys are the minimizer with the low suflen bits dropped; a key with
 * bit 0 set stores its single position directly in the value, otherwise the
 * value packs (offset<<32 | count) into the position array *q_ (of size *n_).
 * Returns the hash table, or NULL when n_a == 0. Allocates from km. */
void *mg_idx_a2h(void *km, int32_t n_a, mg128_t *a, int suflen, uint64_t **q_, int32_t *n_)
{
	int32_t N, n, n_keys;
	int32_t j, start_a, start_q;
	idxhash_t *h;
	uint64_t *q;

	*q_ = 0, *n_ = 0;
	if (n_a == 0) return 0;

	// sort by minimizer
	radix_sort_128x(a, a + n_a);

	// count and preallocate: n_keys distinct minimizers; N positions of
	// minimizers occurring more than once (these go into q[])
	for (j = 1, n = 1, n_keys = 0, N = 0; j <= n_a; ++j) {
		if (j == n_a || a[j].x>>8 != a[j-1].x>>8) { // run of equal minimizers ends
			++n_keys;
			if (n > 1) N += n;
			n = 1;
		} else ++n;
	}
	h = mg_hidx_init2(km);
	mg_hidx_resize(h, n_keys);
	KCALLOC(km, q, N);
	*q_ = q, *n_ = N;

	// create the hash table
	for (j = 1, n = 1, start_a = start_q = 0; j <= n_a; ++j) {
		if (j == n_a || a[j].x>>8 != a[j-1].x>>8) { // process the run a[start_a..j-1]
			khint_t itr;
			int absent;
			mg128_t *p = &a[j-1];
			itr = mg_hidx_put(h, p->x>>8>>suflen<<1, &absent);
			assert(absent && j == start_a + n);
			if (n == 1) { // single occurrence: store the position in the value
				kh_key(h, itr) |= 1;
				kh_val(h, itr) = p->y;
			} else { // multiple occurrences: copy positions into q[]
				int k;
				for (k = 0; k < n; ++k)
					q[start_q + k] = a[start_a + k].y;
				radix_sort_gfa64(&q[start_q], &q[start_q + n]); // sort by position; needed as in-place radix_sort_128x() is not stable
				kh_val(h, itr) = (uint64_t)start_q<<32 | n;
				start_q += n;
			}
			start_a = j, n = 1;
		} else ++n;
	}
	assert(N == start_q);
	return h;
}
166
+
167
+ static void worker_post(void *g, long i, int tid)
168
+ {
169
+ mg_idx_t *gi = (mg_idx_t*)g;
170
+ mg_idx_bucket_t *b = &gi->B[i];
171
+ if (b->a.n == 0) return;
172
+ b->h = (idxhash_t*)mg_idx_a2h(0, b->a.n, b->a.a, gi->b, &b->p, &b->n);
173
+ kfree(0, b->a.a);
174
+ b->a.n = b->a.m = 0, b->a.a = 0;
175
+ }
176
+
177
+ int mg_gfa_overlap(const gfa_t *g)
178
+ {
179
+ int64_t i;
180
+ for (i = 0; i < g->n_arc; ++i) // non-zero overlap
181
+ if (g->arc[i].ov != 0 || g->arc[i].ow != 0)
182
+ return 1;
183
+ return 0;
184
+ }
185
+
186
// Build the minimizer index for graph g: sketch every segment, distribute
// minimizers into buckets, then finalize each bucket's hash table with
// n_threads workers. Returns NULL if g contains overlapping segments.
mg_idx_t *mg_index_core(gfa_t *g, int k, int w, int b, int n_threads)
{
	mg_idx_t *gi;
	mg128_v a = {0,0,0};
	int i;

	if (mg_gfa_overlap(g)) { // the indexing scheme requires blunt-ended arcs
		if (mg_verbose >= 1)
			fprintf(stderr, "[E::%s] minigraph doesn't work with graphs containing overlapping segments\n", __func__);
		return 0;
	}
	gi = mg_idx_init(k, w, b);
	gi->g = g;

	for (i = 0; i < g->n_seg; ++i) {
		gfa_seg_t *s = &g->seg[i];
		a.n = 0; // reuse the sketch buffer across segments
		mg_sketch(0, s->seq, s->len, w, k, i, &a); // TODO: this can be parallelized
		mg_idx_add(gi, a.n, a.a);
	}
	free(a.a);
	kt_for(n_threads, worker_post, gi, 1<<gi->b); // finalize buckets in parallel
	return gi;
}
210
+
211
// Public indexing entry: uppercase the segment sequences in place, build the
// minimizer index, attach edge sequences, and (if mo is given) update the
// mapping options against the index. Returns NULL on failure.
mg_idx_t *mg_index(gfa_t *g, const mg_idxopt_t *io, int n_threads, mg_mapopt_t *mo)
{
	int32_t i, j;
	mg_idx_t *gi;
	for (i = 0; i < g->n_seg; ++i) { // uppercase
		gfa_seg_t *s = &g->seg[i];
		for (j = 0; j < s->len; ++j)
			if (s->seq[j] >= 'a' && s->seq[j] <= 'z')
				s->seq[j] -= 32; // ASCII lower -> upper
	}
	gi = mg_index_core(g, io->k, io->w, io->bucket_bits, n_threads);
	if (gi == 0) return 0;
	gi->es = gfa_edseq_init(gi->g);
	gi->n_seg = g->n_seg;
	if (mg_verbose >= 3)
		fprintf(stderr, "[M::%s::%.3f*%.2f] indexed the graph\n", __func__,
				realtime() - mg_realtime0, cputime() / (realtime() - mg_realtime0));
	if (mo) mg_opt_update(gi, mo, 0);
	return gi;
}
@@ -0,0 +1,224 @@
1
+ #include <stdio.h>
2
+ #include <stdlib.h>
3
+ #include <string.h>
4
+ #include "kalloc.h"
5
+
6
+ /* In kalloc, a *core* is a large chunk of contiguous memory. Each core is
7
+ * associated with a master header, which keeps the size of the current core
8
+ * and the pointer to next core. Kalloc allocates small *blocks* of memory from
9
+ * the cores and organizes free memory blocks in a circular single-linked list.
10
+ *
11
+ * In the following diagram, "@" stands for the header of a free block (of type
12
+ * header_t), "#" for the header of an allocated block (of type size_t), "-"
13
+ * for free memory, and "+" for allocated memory.
14
+ *
15
+ * master This region is core 1. master This region is core 2.
16
+ * | |
17
+ * *@-------#++++++#++++++++++++@-------- *@----------#++++++++++++#+++++++@------------
18
+ * | | | |
19
+ * p=p->ptr->ptr->ptr->ptr p->ptr p->ptr->ptr p->ptr->ptr->ptr
20
+ */
21
// Header of a free block (and of a core). Sizes are measured in units of
// sizeof(header_t); an allocated block keeps only a size_t size word.
typedef struct header_t {
	size_t size;          // block size in header_t units, including this header
	struct header_t *ptr; // next block in the circular free list, or next core
} header_t;
25
+
26
// Allocator state.
typedef struct {
	void *par;            // parent kalloc allocator; NULL means plain libc malloc/free
	size_t min_core_size; // minimum core size, in header_t units
	header_t base, *loop_head, *core_head; /* base is a zero-sized block always kept in the loop;
	                                          loop_head is the free-list entry point, core_head the core list */
} kmem_t;
31
+
32
/* Report a fatal allocator error on stderr and abort. Never returns. */
static void panic(const char *s)
{
	fputs(s, stderr);
	fputc('\n', stderr);
	abort();
}
37
+
38
/* Create an allocator. km_par, if non-NULL, is a parent kalloc allocator the
 * new one draws its cores from. min_core_size is in header_t units; 0 picks
 * a default (0x80000 units for a top-level allocator). */
void *km_init2(void *km_par, size_t min_core_size)
{
	kmem_t *km;
	km = (kmem_t*)kcalloc(km_par, 1, sizeof(kmem_t));
	km->par = km_par;
	// NOTE(review): the "-2" presumably keeps a child core (plus its header)
	// within one parent core allocation -- TODO confirm
	if (km_par) km->min_core_size = min_core_size > 0? min_core_size : ((kmem_t*)km_par)->min_core_size - 2;
	else km->min_core_size = min_core_size > 0? min_core_size : 0x80000;
	return (void*)km;
}
47
+
48
/* Create a top-level allocator with the default core size. */
void *km_init(void)
{
	return km_init2(0, 0);
}
49
+
50
+ void km_destroy(void *_km)
51
+ {
52
+ kmem_t *km = (kmem_t*)_km;
53
+ void *km_par;
54
+ header_t *p, *q;
55
+ if (km == NULL) return;
56
+ km_par = km->par;
57
+ for (p = km->core_head; p != NULL;) {
58
+ q = p->ptr;
59
+ kfree(km_par, p);
60
+ p = q;
61
+ }
62
+ kfree(km_par, km);
63
+ }
64
+
65
/* Grab a new core of at least nu units from the parent allocator, push it on
 * the core list and donate its usable part to the free list via kfree().
 * Aborts on allocation failure. Returns the (possibly updated) loop head. */
static header_t *morecore(kmem_t *km, size_t nu)
{
	header_t *q;
	size_t bytes, *p;
	// round up to a multiple of min_core_size
	nu = (nu + 1 + (km->min_core_size - 1)) / km->min_core_size * km->min_core_size; /* the first +1 for core header */
	bytes = nu * sizeof(header_t);
	q = (header_t*)kmalloc(km->par, bytes);
	if (!q) panic("[morecore] insufficient memory");
	q->ptr = km->core_head, q->size = nu, km->core_head = q; // push onto the core list
	p = (size_t*)(q + 1); // the free block starts right after the core header
	*p = nu - 1; /* the size of the free block; -1 because the first unit is used for the core header */
	kfree(km, p + 1); /* initialize the new "core"; NB: the core header is not looped. */
	return km->loop_head;
}
79
+
80
/* Return block ap to allocator _km's free list, merging with adjacent free
 * blocks where possible. With _km == NULL, falls back to libc free(). */
void kfree(void *_km, void *ap) /* kfree() also adds a new core to the circular list */
{
	header_t *p, *q;
	kmem_t *km = (kmem_t*)_km;

	if (!ap) return; // freeing NULL is a no-op
	if (km == NULL) { // no allocator: pointer came from libc malloc()
		free(ap);
		return;
	}
	p = (header_t*)((size_t*)ap - 1); // step back to the block header
	p->size = *((size_t*)ap - 1); // an allocated header is only a size_t; fill in header_t::size before linking
	/* Find the pointer that points to the block to be freed. The following loop can stop on two conditions:
	 *
	 * a) "p>q && p<q->ptr": @------#++++++++#+++++++@------- @---------------#+++++++@-------
	 *    (can also be in    |      |        |           ->   |               |
	 *     two cores)        q      p        q->ptr           q               q->ptr
	 *
	 *                       @-------- #+++++++++@--------    @--------    @------------------
	 *                       |         |         |        ->  |           |
	 *                       q         p         q->ptr       q           q->ptr
	 *
	 * b) "q>=q->ptr && (p>q || p<q->ptr)": @-------#+++++   @--------#+++++++    @-------#+++++   @----------------
	 *                                      |       |        |            ->      |       |
	 *                                      q->ptr  q        p                    q->ptr  q
	 *
	 *                                      #+++++++@-----   #++++++++@-------    @-------------   #++++++++@-------
	 *                                      |       |                 |    ->     |                |
	 *                                      p       q->ptr            q           q->ptr           q
	 */
	for (q = km->loop_head; !(p > q && p < q->ptr); q = q->ptr)
		if (q >= q->ptr && (p > q || p < q->ptr)) break;
	if (p + p->size == q->ptr) { /* two adjacent blocks, merge p and q->ptr (the 2nd and 4th cases) */
		p->size += q->ptr->size;
		p->ptr = q->ptr->ptr;
	} else if (p + p->size > q->ptr && q->ptr >= p) {
		panic("[kfree] The end of the allocated block enters a free block.");
	} else p->ptr = q->ptr; /* backup q->ptr */

	if (q + q->size == p) { /* two adjacent blocks, merge q and p (the other two cases) */
		q->size += p->size;
		q->ptr = p->ptr;
		km->loop_head = q;
	} else if (q + q->size > p && p >= q) {
		panic("[kfree] The end of a free block enters the allocated block.");
	} else km->loop_head = p, q->ptr = p; /* in two cores, cannot be merged; create a new block in the list */
}
127
+
128
/* Allocate n_bytes from allocator _km using first-fit over the circular free
 * list, growing via morecore() when needed. With _km == NULL, falls back to
 * libc malloc(). Returns NULL for n_bytes == 0. Aborts on OOM (in morecore). */
void *kmalloc(void *_km, size_t n_bytes)
{
	kmem_t *km = (kmem_t*)_km;
	size_t n_units;
	header_t *p, *q;

	if (n_bytes == 0) return 0;
	if (km == NULL) return malloc(n_bytes);
	n_units = (n_bytes + sizeof(size_t) + sizeof(header_t) - 1) / sizeof(header_t); /* header+n_bytes requires at least this number of units */

	if (!(q = km->loop_head)) /* the first time when kmalloc() is called, initialize it */
		q = km->loop_head = km->base.ptr = &km->base;
	for (p = q->ptr;; q = p, p = p->ptr) { /* search for a suitable block */
		if (p->size >= n_units) { /* p->size is the size of the current block. This line means the current block is large enough. */
			if (p->size == n_units) q->ptr = p->ptr; /* no need to split the block */
			else { /* split the block. NB: memory is allocated at the end of the block! */
				p->size -= n_units; /* reduce the size of the free block */
				p += p->size; /* p points to the allocated block */
				*(size_t*)p = n_units; /* set the size */
			}
			km->loop_head = q; /* set the end of chain */
			return (size_t*)p + 1; /* payload starts after the size word */
		}
		if (p == km->loop_head) { /* then ask for more "cores" */
			if ((p = morecore(km, n_units)) == 0) return 0;
		}
	}
}
156
+
157
+ void *kcalloc(void *_km, size_t count, size_t size)
158
+ {
159
+ kmem_t *km = (kmem_t*)_km;
160
+ void *p;
161
+ if (size == 0 || count == 0) return 0;
162
+ if (km == NULL) return calloc(count, size);
163
+ p = kmalloc(km, count * size);
164
+ memset(p, 0, count * size);
165
+ return p;
166
+ }
167
+
168
/* Resize block ap to n_bytes: free it when n_bytes == 0, allocate afresh when
 * ap == NULL, keep the block if it is already big enough, otherwise allocate,
 * copy and free. With _km == NULL, falls back to libc realloc(). */
void *krealloc(void *_km, void *ap, size_t n_bytes) // TODO: this can be made more efficient in principle
{
	kmem_t *km = (kmem_t*)_km;
	size_t cap, *p, *q;

	if (n_bytes == 0) {
		kfree(km, ap); return 0;
	}
	if (km == NULL) return realloc(ap, n_bytes);
	if (ap == NULL) return kmalloc(km, n_bytes);
	p = (size_t*)ap - 1; // block header: unit count including the size word
	cap = (*p) * sizeof(header_t) - sizeof(size_t); // usable bytes in the current block
	if (cap >= n_bytes) return ap; /* TODO: this prevents shrinking */
	q = (size_t*)kmalloc(km, n_bytes);
	memcpy(q, ap, cap);
	kfree(km, ap);
	return q;
}
186
+
187
/* Copy the first n_bytes of ap into a freshly allocated block from km and
 * free the original. No-op (returns ap) when km or ap is NULL. */
void *krelocate(void *km, void *ap, size_t n_bytes)
{
	void *copy;
	if (km == 0 || ap == 0) return ap;
	copy = kmalloc(km, n_bytes);
	memcpy(copy, ap, n_bytes);
	kfree(km, ap);
	return copy;
}
196
+
197
/* Fill *s with allocator statistics: free bytes and free-block count from the
 * circular free list; core count, total capacity and largest core from the
 * core list. Also sanity-checks the free list for overlapping blocks. */
void km_stat(const void *_km, km_stat_t *s)
{
	kmem_t *km = (kmem_t*)_km;
	header_t *p;
	memset(s, 0, sizeof(km_stat_t));
	if (km == NULL || km->loop_head == NULL) return;
	for (p = km->loop_head;; p = p->ptr) { // walk the circular free list once
		s->available += p->size * sizeof(header_t);
		if (p->size != 0) ++s->n_blocks; /* &kmem_t::base is always one of the cores. It is zero-sized. */
		if (p->ptr > p && p + p->size > p->ptr)
			panic("[km_stat] The end of a free block enters another free block.");
		if (p->ptr == km->loop_head) break;
	}
	for (p = km->core_head; p != NULL; p = p->ptr) { // walk the core list
		size_t size = p->size * sizeof(header_t);
		++s->n_cores;
		s->capacity += size;
		s->largest = s->largest > size? s->largest : size;
	}
}
217
+
218
+ void km_stat_print(const void *km)
219
+ {
220
+ km_stat_t st;
221
+ km_stat(km, &st);
222
+ fprintf(stderr, "[km_stat] cap=%ld, avail=%ld, largest=%ld, n_core=%ld, n_block=%ld\n",
223
+ st.capacity, st.available, st.largest, st.n_blocks, st.n_cores);
224
+ }