ruby-minigraph 0.0.20.0
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +62 -0
- data/ext/Rakefile +56 -0
- data/ext/cmappy/cmappy.c +7 -0
- data/ext/cmappy/cmappy.h +8 -0
- data/ext/minigraph/LICENSE.txt +23 -0
- data/ext/minigraph/Makefile +66 -0
- data/ext/minigraph/NEWS.md +317 -0
- data/ext/minigraph/README.md +207 -0
- data/ext/minigraph/algo.c +194 -0
- data/ext/minigraph/algo.h +33 -0
- data/ext/minigraph/asm-call.c +147 -0
- data/ext/minigraph/bseq.c +133 -0
- data/ext/minigraph/bseq.h +76 -0
- data/ext/minigraph/cal_cov.c +139 -0
- data/ext/minigraph/doc/example1.png +0 -0
- data/ext/minigraph/doc/example2.png +0 -0
- data/ext/minigraph/doc/examples.graffle +0 -0
- data/ext/minigraph/format.c +241 -0
- data/ext/minigraph/galign.c +140 -0
- data/ext/minigraph/gchain1.c +532 -0
- data/ext/minigraph/gcmisc.c +223 -0
- data/ext/minigraph/gfa-aug.c +260 -0
- data/ext/minigraph/gfa-base.c +526 -0
- data/ext/minigraph/gfa-bbl.c +372 -0
- data/ext/minigraph/gfa-ed.c +617 -0
- data/ext/minigraph/gfa-io.c +395 -0
- data/ext/minigraph/gfa-priv.h +154 -0
- data/ext/minigraph/gfa.h +166 -0
- data/ext/minigraph/ggen.c +182 -0
- data/ext/minigraph/ggen.h +21 -0
- data/ext/minigraph/ggsimple.c +570 -0
- data/ext/minigraph/gmap.c +211 -0
- data/ext/minigraph/index.c +230 -0
- data/ext/minigraph/kalloc.c +224 -0
- data/ext/minigraph/kalloc.h +82 -0
- data/ext/minigraph/kavl.h +414 -0
- data/ext/minigraph/kdq.h +134 -0
- data/ext/minigraph/ketopt.h +116 -0
- data/ext/minigraph/khashl.h +348 -0
- data/ext/minigraph/krmq.h +474 -0
- data/ext/minigraph/kseq.h +256 -0
- data/ext/minigraph/ksort.h +164 -0
- data/ext/minigraph/kstring.h +165 -0
- data/ext/minigraph/kthread.c +159 -0
- data/ext/minigraph/kthread.h +15 -0
- data/ext/minigraph/kvec-km.h +105 -0
- data/ext/minigraph/kvec.h +110 -0
- data/ext/minigraph/lchain.c +441 -0
- data/ext/minigraph/main.c +301 -0
- data/ext/minigraph/map-algo.c +500 -0
- data/ext/minigraph/mgpriv.h +128 -0
- data/ext/minigraph/minigraph.1 +359 -0
- data/ext/minigraph/minigraph.h +176 -0
- data/ext/minigraph/miniwfa.c +834 -0
- data/ext/minigraph/miniwfa.h +95 -0
- data/ext/minigraph/misc/mgutils.js +1451 -0
- data/ext/minigraph/misc.c +12 -0
- data/ext/minigraph/options.c +134 -0
- data/ext/minigraph/shortk.c +251 -0
- data/ext/minigraph/sketch.c +109 -0
- data/ext/minigraph/sys.c +147 -0
- data/ext/minigraph/sys.h +20 -0
- data/ext/minigraph/test/MT-chimp.fa +277 -0
- data/ext/minigraph/test/MT-human.fa +239 -0
- data/ext/minigraph/test/MT-orangA.fa +276 -0
- data/ext/minigraph/test/MT.gfa +19 -0
- data/ext/minigraph/tex/Makefile +13 -0
- data/ext/minigraph/tex/minigraph.bib +676 -0
- data/ext/minigraph/tex/minigraph.tex +986 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
- data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
- data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
- data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
- data/ext/minigraph/tex/plots/bedutils.js +367 -0
- data/ext/minigraph/tex/plots/chr-plot.js +130 -0
- data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
- data/ext/minigraph.patch +21 -0
- data/lib/minigraph/ffi/constants.rb +230 -0
- data/lib/minigraph/ffi/functions.rb +70 -0
- data/lib/minigraph/ffi/mappy.rb +8 -0
- data/lib/minigraph/ffi.rb +27 -0
- data/lib/minigraph/version.rb +5 -0
- data/lib/minigraph.rb +72 -0
- metadata +159 -0
data/ext/minigraph/gmap.c
@@ -0,0 +1,211 @@
#include <stdlib.h>
#include <assert.h>
#include "kthread.h"
#include "kalloc.h"
#include "bseq.h"
#include "sys.h"
#include "mgpriv.h"
#include "gfa-priv.h"

typedef struct {
	int64_t mini_batch_size;
	int n_processed, n_threads, n_fp;
	const mg_mapopt_t *opt;
	mg_bseq_file_t **fp;
	const mg_idx_t *gi;
	kstring_t str;
	double *c_seg, *c_link;
} pipeline_t;

typedef struct {
	const pipeline_t *p;
	int n_seq, n_frag;
	mg_bseq1_t *seq;
	int *seg_off, *n_seg;
	mg_gchains_t **gcs;
	mg_tbuf_t **buf;
} step_t;

static void worker_for(void *_data, long i, int tid) // kt_for() callback
{
	step_t *s = (step_t*)_data;
	int qlens[MG_MAX_SEG], j, off = s->seg_off[i], pe_ori = s->p->opt->pe_ori;
	const char *qseqs[MG_MAX_SEG];
	mg_tbuf_t *b = s->buf[tid];
	assert(s->n_seg[i] <= MG_MAX_SEG);
	if (mg_dbg_flag & MG_DBG_QNAME)
		fprintf(stderr, "QR\t%s\t%d\t%d\n", s->seq[off].name, tid, s->seq[off].l_seq);
	for (j = 0; j < s->n_seg[i]; ++j) {
		if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1))))
			mg_revcomp_bseq(&s->seq[off + j]);
		qlens[j] = s->seq[off + j].l_seq;
		qseqs[j] = s->seq[off + j].seq;
	}
	if (s->p->opt->flag & MG_M_INDEPEND_SEG) {
		for (j = 0; j < s->n_seg[i]; ++j)
			mg_map_frag(s->p->gi, 1, &qlens[j], &qseqs[j], &s->gcs[off+j], b, s->p->opt, s->seq[off+j].name);
	} else {
		mg_map_frag(s->p->gi, s->n_seg[i], qlens, qseqs, &s->gcs[off], b, s->p->opt, s->seq[off].name);
	}
#if 0 // for paired-end reads
	for (j = 0; j < s->n_seg[i]; ++j) // flip the query strand and coordinate to the original read strand
		if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1)))) {
			int k, t;
			mg_revcomp_bseq(&s->seq[off + j]);
			for (k = 0; k < s->n_reg[off + j]; ++k) {
				mg_lchain_t *r = &s->reg[off + j][k];
				t = r->qs;
				r->qs = qlens[j] - r->qe;
				r->qe = qlens[j] - t;
				r->v ^= 1;
			}
		}
#endif
}

static void *worker_pipeline(void *shared, int step, void *in)
{
	int i, j, k;
	pipeline_t *p = (pipeline_t*)shared;
	if (step == 0) { // step 0: read sequences
		int with_qual = !(p->opt->flag & MG_M_NO_QUAL);
		int with_comment = !!(p->opt->flag & MG_M_COPY_COMMENT);
		int frag_mode = (p->n_fp > 1 || !!(p->opt->flag & MG_M_FRAG_MODE));
		step_t *s;
		s = (step_t*)calloc(1, sizeof(step_t));
		if (p->n_fp > 1) s->seq = mg_bseq_read_frag(p->n_fp, p->fp, p->mini_batch_size, with_qual, with_comment, &s->n_seq);
		else s->seq = mg_bseq_read(p->fp[0], p->mini_batch_size, with_qual, with_comment, frag_mode, &s->n_seq);
		if (s->seq) {
			s->p = p;
			for (i = 0; i < s->n_seq; ++i)
				mg_toupper(s->seq[i].l_seq, s->seq[i].seq);
			for (i = 0; i < s->n_seq; ++i)
				s->seq[i].rid = p->n_processed++;
			s->buf = (mg_tbuf_t**)calloc(p->n_threads, sizeof(mg_tbuf_t*));
			for (i = 0; i < p->n_threads; ++i)
				s->buf[i] = mg_tbuf_init();
			s->seg_off = (int*)calloc(2 * s->n_seq, sizeof(int));
			s->n_seg = s->seg_off + s->n_seq; // n_seg, rep_len and frag_gap are allocated together with seg_off
			KCALLOC(0, s->gcs, s->n_seq);
			for (i = 1, j = 0; i <= s->n_seq; ++i)
				if (i == s->n_seq || !frag_mode || !mg_qname_same(s->seq[i-1].name, s->seq[i].name)) {
					s->n_seg[s->n_frag] = i - j;
					s->seg_off[s->n_frag++] = j;
					j = i;
				}
			return s;
		} else free(s);
	} else if (step == 1) { // step 1: map
		kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_frag);
		return in;
	} else if (step == 2) { // step 2: output
		void *km = 0;
		step_t *s = (step_t*)in;
		for (i = 0; i < p->n_threads; ++i) mg_tbuf_destroy(s->buf[i]);
		free(s->buf);
		if (!(mg_dbg_flag & MG_DBG_NO_KALLOC)) km = km_init();
		for (k = 0; k < s->n_frag; ++k) {
			int seg_st = s->seg_off[k], seg_en = s->seg_off[k] + s->n_seg[k];
			if ((p->opt->flag & MG_M_FRAG_MODE) && (p->opt->flag & MG_M_FRAG_MERGE)) {
				mg_bseq1_t *t = &s->seq[seg_st];
				int32_t *qlens;
				KMALLOC(km, qlens, seg_en - seg_st); // TODO: if this is an issue (quite unlikely), preallocate
				for (i = seg_st; i < seg_en; ++i)
					qlens[i - seg_st] = s->seq[i].l_seq;
				if (p->opt->flag & MG_M_CAL_COV)
					mg_cov_map(p->gi->g, s->gcs[seg_st], p->opt->min_cov_mapq, p->opt->min_cov_blen, p->c_seg, p->c_link, t->name);
				else mg_write_gaf(&p->str, p->gi->g, s->gcs[seg_st], seg_en - seg_st, qlens, t->name, p->opt->flag, km);
				kfree(km, qlens);
				if (p->str.l) mg_err_fputs(p->str.s, stdout);
			} else {
				for (i = seg_st; i < seg_en; ++i) {
					mg_bseq1_t *t = &s->seq[i];
					if (p->opt->flag & MG_M_CAL_COV)
						mg_cov_map(p->gi->g, s->gcs[i], p->opt->min_cov_mapq, p->opt->min_cov_blen, p->c_seg, p->c_link, t->name);
					else mg_write_gaf(&p->str, p->gi->g, s->gcs[i], 1, &t->l_seq, t->name, p->opt->flag, km);
					if (p->str.l) mg_err_fputs(p->str.s, stdout);
				}
			}
			for (i = seg_st; i < seg_en; ++i) {
				mg_gchain_free(s->gcs[i]);
				free(s->seq[i].seq); free(s->seq[i].name);
				if (s->seq[i].qual) free(s->seq[i].qual);
				if (s->seq[i].comment) free(s->seq[i].comment);
			}
		}
		free(s->gcs); free(s->seg_off); free(s->seq); // n_seg, rep_len and frag_gap were allocated with seg_off; no memory leak here
		if (km) km_destroy(km);
		if (mg_verbose >= 3)
			fprintf(stderr, "[M::%s::%.3f*%.2f] mapped %d sequences\n", __func__, realtime() - mg_realtime0, cputime() / (realtime() - mg_realtime0), s->n_seq);
		free(s);
	}
	return 0;
}

static mg_bseq_file_t **open_bseqs(int n, const char **fn)
{
	mg_bseq_file_t **fp;
	int i, j;
	fp = (mg_bseq_file_t**)calloc(n, sizeof(mg_bseq_file_t*));
	for (i = 0; i < n; ++i) {
		if ((fp[i] = mg_bseq_open(fn[i])) == 0) {
			if (mg_verbose >= 1)
				fprintf(stderr, "ERROR: failed to open file '%s'\n", fn[i]);
			for (j = 0; j < i; ++j)
				mg_bseq_close(fp[j]);
			free(fp);
			return 0;
		}
	}
	return fp;
}

int mg_map_file_frag(const mg_idx_t *idx, int n_segs, const char **fn, const mg_mapopt_t *opt, int n_threads, double *c_seg, double *c_link)
{
	int i, pl_threads;
	pipeline_t pl;
	if (n_segs < 1) return -1;
	memset(&pl, 0, sizeof(pipeline_t));
	pl.n_fp = n_segs;
	pl.fp = open_bseqs(pl.n_fp, fn);
	if (pl.fp == 0) return -1;
	pl.opt = opt, pl.gi = idx;
	pl.n_threads = n_threads > 1? n_threads : 1;
	pl.mini_batch_size = opt->mini_batch_size;
	pl.c_seg = c_seg, pl.c_link = c_link;
	pl_threads = n_threads == 1? 1 : (opt->flag&MG_M_2_IO_THREADS)? 3 : 2;
	kt_pipeline(pl_threads, worker_pipeline, &pl, 3);

	free(pl.str.s);
	for (i = 0; i < pl.n_fp; ++i)
		mg_bseq_close(pl.fp[i]);
	free(pl.fp);
	return 0;
}

int mg_map_files(gfa_t *g, int n_fn, const char **fn, const mg_idxopt_t *ipt, const mg_mapopt_t *opt0, int n_threads)
{
	mg_mapopt_t opt = *opt0;
	mg_idx_t *gi;
	int i, ret = 0;
	double *cov_seg = 0, *cov_link = 0;
	if ((gi = mg_index(g, ipt, n_threads, &opt)) == 0) return -1;
	if (opt.flag & MG_M_CAL_COV) {
		KCALLOC(0, cov_seg, g->n_seg);
		KCALLOC(0, cov_link, g->n_arc);
	}
	if (opt.flag & MG_M_FRAG_MODE) {
		ret = mg_map_file_frag(gi, n_fn, fn, &opt, n_threads, cov_seg, cov_link);
	} else {
		for (i = 0; i < n_fn; ++i) {
			ret = mg_map_file_frag(gi, 1, &fn[i], &opt, n_threads, cov_seg, cov_link);
			if (ret != 0) break;
		}
	}
	if (opt.flag & MG_M_CAL_COV) {
		gfa_aux_update_cv(g, "dc", cov_seg, cov_link);
		free(cov_seg); free(cov_link);
	}
	mg_idx_destroy(gi);
	return ret;
}
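gmap.c above drives a three-step kt_pipeline(): step 0 reads a mini-batch of sequences, step 1 maps each fragment across threads via the kt_for() callback worker_for(), and step 2 writes GAF (or accumulates coverage) and frees the batch. The entry point is mg_map_files(). A minimal driver sketch follows, assuming the public declarations in minigraph.h and gfa.h (mg_opt_set, gfa_read, gfa_destroy); the thread count and usage string are illustrative and error handling is elided.

#include <stdio.h>
#include "minigraph.h"
#include "gfa.h"

int main(int argc, char *argv[])
{
	mg_idxopt_t iopt;
	mg_mapopt_t mopt;
	mg_ggopt_t gopt;
	gfa_t *g;
	int ret;
	if (argc < 3) {
		fprintf(stderr, "Usage: map-sketch <graph.gfa> <query.fa> [...]\n");
		return 1;
	}
	mg_opt_set(0, &iopt, &mopt, &gopt);  // load default options (0 = no preset)
	g = gfa_read(argv[1]);               // parse the reference graph
	if (g == 0) return 1;
	// index the graph and map every query file; GAF goes to stdout
	ret = mg_map_files(g, argc - 2, (const char**)&argv[2], &iopt, &mopt, 4);
	gfa_destroy(g);
	return ret == 0? 0 : 1;
}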
data/ext/minigraph/index.c
@@ -0,0 +1,230 @@
#include <assert.h>
#include "mgpriv.h"
#include "khashl.h"
#include "kthread.h"
#include "kvec-km.h"
#include "sys.h"

#define idx_hash(a) ((a)>>1)
#define idx_eq(a, b) ((a)>>1 == (b)>>1)
KHASHL_MAP_INIT(KH_LOCAL, idxhash_t, mg_hidx, uint64_t, uint64_t, idx_hash, idx_eq)

typedef struct mg_idx_bucket_s {
	mg128_v a; // (minimizer, position) array
	int32_t n; // size of the _p_ array
	uint64_t *p; // position array for minimizers appearing >1 times
	void *h; // hash table indexing _p_ and minimizers appearing once
} mg_idx_bucket_t;

mg_idx_t *mg_idx_init(int k, int w, int b)
{
	mg_idx_t *gi;
	if (k*2 < b) b = k * 2;
	if (w < 1) w = 1;
	KCALLOC(0, gi, 1);
	gi->w = w, gi->k = k, gi->b = b;
	KCALLOC(0, gi->B, 1<<b);
	return gi;
}

void mg_idx_destroy(mg_idx_t *gi)
{
	uint32_t i;
	if (gi == 0) return;
	if (gi->B) {
		for (i = 0; i < 1U<<gi->b; ++i) {
			free(gi->B[i].p);
			free(gi->B[i].a.a);
			mg_hidx_destroy((idxhash_t*)gi->B[i].h);
		}
		free(gi->B);
	}
	gfa_edseq_destroy(gi->n_seg, gi->es);
	free(gi);
}

/****************
 * Index access *
 ****************/

const uint64_t *mg_idx_hget(const void *h_, const uint64_t *q, int suflen, uint64_t minier, int *n)
{
	khint_t k;
	const idxhash_t *h = (const idxhash_t*)h_;
	*n = 0;
	if (h == 0) return 0;
	k = mg_hidx_get(h, minier>>suflen<<1);
	if (k == kh_end(h)) return 0;
	if (kh_key(h, k)&1) { // special casing when there is only one k-mer
		*n = 1;
		return &kh_val(h, k);
	} else {
		*n = (uint32_t)kh_val(h, k);
		return &q[kh_val(h, k)>>32];
	}
}

const uint64_t *mg_idx_get(const mg_idx_t *gi, uint64_t minier, int *n)
{
	int mask = (1<<gi->b) - 1;
	mg_idx_bucket_t *b = &gi->B[minier&mask];
	return mg_idx_hget(b->h, b->p, gi->b, minier, n);
}

void mg_idx_cal_quantile(const mg_idx_t *gi, int32_t m, float f[], int32_t q[])
{
	int32_t i;
	uint64_t n = 0;
	khint_t *a, k;
	for (i = 0; i < 1<<gi->b; ++i)
		if (gi->B[i].h) n += kh_size((idxhash_t*)gi->B[i].h);
	a = (uint32_t*)malloc(n * 4);
	for (i = 0, n = 0; i < 1<<gi->b; ++i) {
		idxhash_t *h = (idxhash_t*)gi->B[i].h;
		if (h == 0) continue;
		for (k = 0; k < kh_end(h); ++k) {
			if (!kh_exist(h, k)) continue;
			a[n++] = kh_key(h, k)&1? 1 : (uint32_t)kh_val(h, k);
		}
	}
	for (i = 0; i < m; ++i)
		q[i] = ks_ksmall_uint32_t(n, a, (size_t)((1.0 - (double)f[i]) * n));
	free(a);
}

/***************
 * Index build *
 ***************/

static void mg_idx_add(mg_idx_t *gi, int n, const mg128_t *a)
{
	int i, mask = (1<<gi->b) - 1;
	for (i = 0; i < n; ++i) {
		mg128_v *p = &gi->B[a[i].x>>8&mask].a;
		kv_push(mg128_t, 0, *p, a[i]);
	}
}

void mg_idx_hfree(void *h_)
{
	idxhash_t *h = (idxhash_t*)h_;
	if (h == 0) return;
	mg_hidx_destroy(h);
}

void *mg_idx_a2h(void *km, int32_t n_a, mg128_t *a, int suflen, uint64_t **q_, int32_t *n_)
{
	int32_t N, n, n_keys;
	int32_t j, start_a, start_q;
	idxhash_t *h;
	uint64_t *q;

	*q_ = 0, *n_ = 0;
	if (n_a == 0) return 0;

	// sort by minimizer
	radix_sort_128x(a, a + n_a);

	// count and preallocate
	for (j = 1, n = 1, n_keys = 0, N = 0; j <= n_a; ++j) {
		if (j == n_a || a[j].x>>8 != a[j-1].x>>8) {
			++n_keys;
			if (n > 1) N += n;
			n = 1;
		} else ++n;
	}
	h = mg_hidx_init2(km);
	mg_hidx_resize(h, n_keys);
	KCALLOC(km, q, N);
	*q_ = q, *n_ = N;

	// create the hash table
	for (j = 1, n = 1, start_a = start_q = 0; j <= n_a; ++j) {
		if (j == n_a || a[j].x>>8 != a[j-1].x>>8) {
			khint_t itr;
			int absent;
			mg128_t *p = &a[j-1];
			itr = mg_hidx_put(h, p->x>>8>>suflen<<1, &absent);
			assert(absent && j == start_a + n);
			if (n == 1) {
				kh_key(h, itr) |= 1;
				kh_val(h, itr) = p->y;
			} else {
				int k;
				for (k = 0; k < n; ++k)
					q[start_q + k] = a[start_a + k].y;
				radix_sort_gfa64(&q[start_q], &q[start_q + n]); // sort by position; needed as in-place radix_sort_128x() is not stable
				kh_val(h, itr) = (uint64_t)start_q<<32 | n;
				start_q += n;
			}
			start_a = j, n = 1;
		} else ++n;
	}
	assert(N == start_q);
	return h;
}

static void worker_post(void *g, long i, int tid)
{
	mg_idx_t *gi = (mg_idx_t*)g;
	mg_idx_bucket_t *b = &gi->B[i];
	if (b->a.n == 0) return;
	b->h = (idxhash_t*)mg_idx_a2h(0, b->a.n, b->a.a, gi->b, &b->p, &b->n);
	kfree(0, b->a.a);
	b->a.n = b->a.m = 0, b->a.a = 0;
}

int mg_gfa_overlap(const gfa_t *g)
{
	int64_t i;
	for (i = 0; i < g->n_arc; ++i) // non-zero overlap
		if (g->arc[i].ov != 0 || g->arc[i].ow != 0)
			return 1;
	return 0;
}

mg_idx_t *mg_index_core(gfa_t *g, int k, int w, int b, int n_threads)
{
	mg_idx_t *gi;
	mg128_v a = {0,0,0};
	int i;

	if (mg_gfa_overlap(g)) {
		if (mg_verbose >= 1)
			fprintf(stderr, "[E::%s] minigraph doesn't work with graphs containing overlapping segments\n", __func__);
		return 0;
	}
	gi = mg_idx_init(k, w, b);
	gi->g = g;

	for (i = 0; i < g->n_seg; ++i) {
		gfa_seg_t *s = &g->seg[i];
		a.n = 0;
		mg_sketch(0, s->seq, s->len, w, k, i, &a); // TODO: this can be parallelized
		mg_idx_add(gi, a.n, a.a);
	}
	free(a.a);
	kt_for(n_threads, worker_post, gi, 1<<gi->b);
	return gi;
}

mg_idx_t *mg_index(gfa_t *g, const mg_idxopt_t *io, int n_threads, mg_mapopt_t *mo)
{
	int32_t i, j;
	mg_idx_t *gi;
	for (i = 0; i < g->n_seg; ++i) { // uppercase
		gfa_seg_t *s = &g->seg[i];
		for (j = 0; j < s->len; ++j)
			if (s->seq[j] >= 'a' && s->seq[j] <= 'z')
				s->seq[j] -= 32;
	}
	gi = mg_index_core(g, io->k, io->w, io->bucket_bits, n_threads);
	if (gi == 0) return 0;
	gi->es = gfa_edseq_init(gi->g);
	gi->n_seg = g->n_seg;
	if (mg_verbose >= 3)
		fprintf(stderr, "[M::%s::%.3f*%.2f] indexed the graph\n", __func__,
				realtime() - mg_realtime0, cputime() / (realtime() - mg_realtime0));
	if (mo) mg_opt_update(gi, mo, 0);
	return gi;
}
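A note on the encoding index.c builds: mg_idx_a2h() stores each minimizer key shifted left by one bit. For a minimizer that occurs once, the key's low bit is set and the hash value is the packed position itself; for a repeated minimizer the value packs the offset into the position array p[] in the high 32 bits and the occurrence count in the low 32 bits, which is exactly what mg_idx_hget() decodes. The self-contained toy program below mirrors that decode; toy_entry_t and toy_get() are illustrative names, not part of the minigraph API.

#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t key, val; } toy_entry_t;

// Mirror of the mg_idx_hget() branch: low bit of the key set means a single
// occurrence and val is the position; otherwise val = (offset<<32) | count.
static const uint64_t *toy_get(const toy_entry_t *e, const uint64_t *p, int *n)
{
	if (e->key & 1) { *n = 1; return &e->val; }
	*n = (uint32_t)e->val;
	return &p[e->val >> 32];
}

int main(void)
{
	uint64_t p[] = { 101, 205, 309 };                  // positions of a repeated minimizer
	toy_entry_t once = { (42ULL << 1) | 1, 777 };      // occurs once: flag bit set, val = position
	toy_entry_t many = { 43ULL << 1, 0ULL << 32 | 3 }; // occurs 3 times, starting at p[0]
	int n;
	const uint64_t *r = toy_get(&once, p, &n);
	printf("once: n=%d pos=%llu\n", n, (unsigned long long)r[0]);
	r = toy_get(&many, p, &n);
	printf("many: n=%d first=%llu\n", n, (unsigned long long)r[0]);
	return 0;
}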
data/ext/minigraph/kalloc.c
@@ -0,0 +1,224 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "kalloc.h"

/* In kalloc, a *core* is a large chunk of contiguous memory. Each core is
 * associated with a master header, which keeps the size of the current core
 * and the pointer to next core. Kalloc allocates small *blocks* of memory from
 * the cores and organizes free memory blocks in a circular single-linked list.
 *
 * In the following diagram, "@" stands for the header of a free block (of type
 * header_t), "#" for the header of an allocated block (of type size_t), "-"
 * for free memory, and "+" for allocated memory.
 *
 * master                 This region is core 1.      master                 This region is core 2.
 *      |                                                   |
 *      *@-------#++++++#++++++++++++@--------              *@----------#++++++++++++#+++++++@------------
 *       |                           |                       |                               |
 *       p=p->ptr->ptr->ptr->ptr     p->ptr                  p->ptr->ptr                     p->ptr->ptr->ptr
 */
typedef struct header_t {
	size_t size;
	struct header_t *ptr;
} header_t;

typedef struct {
	void *par;
	size_t min_core_size;
	header_t base, *loop_head, *core_head; /* base is a zero-sized block always kept in the loop */
} kmem_t;

static void panic(const char *s)
{
	fprintf(stderr, "%s\n", s);
	abort();
}

void *km_init2(void *km_par, size_t min_core_size)
{
	kmem_t *km;
	km = (kmem_t*)kcalloc(km_par, 1, sizeof(kmem_t));
	km->par = km_par;
	if (km_par) km->min_core_size = min_core_size > 0? min_core_size : ((kmem_t*)km_par)->min_core_size - 2;
	else km->min_core_size = min_core_size > 0? min_core_size : 0x80000;
	return (void*)km;
}

void *km_init(void) { return km_init2(0, 0); }

void km_destroy(void *_km)
{
	kmem_t *km = (kmem_t*)_km;
	void *km_par;
	header_t *p, *q;
	if (km == NULL) return;
	km_par = km->par;
	for (p = km->core_head; p != NULL;) {
		q = p->ptr;
		kfree(km_par, p);
		p = q;
	}
	kfree(km_par, km);
}

static header_t *morecore(kmem_t *km, size_t nu)
{
	header_t *q;
	size_t bytes, *p;
	nu = (nu + 1 + (km->min_core_size - 1)) / km->min_core_size * km->min_core_size; /* the first +1 for core header */
	bytes = nu * sizeof(header_t);
	q = (header_t*)kmalloc(km->par, bytes);
	if (!q) panic("[morecore] insufficient memory");
	q->ptr = km->core_head, q->size = nu, km->core_head = q;
	p = (size_t*)(q + 1);
	*p = nu - 1; /* the size of the free block; -1 because the first unit is used for the core header */
	kfree(km, p + 1); /* initialize the new "core"; NB: the core header is not looped. */
	return km->loop_head;
}

void kfree(void *_km, void *ap) /* kfree() also adds a new core to the circular list */
{
	header_t *p, *q;
	kmem_t *km = (kmem_t*)_km;

	if (!ap) return;
	if (km == NULL) {
		free(ap);
		return;
	}
	p = (header_t*)((size_t*)ap - 1);
	p->size = *((size_t*)ap - 1);
	/* Find the pointer that points to the block to be freed. The following loop can stop on two conditions:
	 *
	 * a) "p>q && p<q->ptr": @------#++++++++#+++++++@--------        @---------------#+++++++@--------
	 *    (can also be in    |      |                |             ->  |                      |
	 *     two cores)        q      p           q->ptr                 q                 q->ptr
	 *
	 *                       @--------  #+++++++++@--------            @--------  @------------------
	 *                       |          |         |                ->  |          |
	 *                       q          p         q->ptr               q          q->ptr
	 *
	 * b) "q>=q->ptr && (p>q || p<q->ptr)": @-------#+++++  @--------#+++++++      @-------#+++++  @----------------
	 *                                      |               |        |          ->  |              |
	 *                                      q->ptr          q        p              q->ptr         q
	 *
	 *                                      #+++++++@-----  #++++++++@--------      @-------------  #++++++++@--------
	 *                                      |       |                |          ->  |                        |
	 *                                      p       q->ptr           q              q->ptr                   q
	 */
	for (q = km->loop_head; !(p > q && p < q->ptr); q = q->ptr)
		if (q >= q->ptr && (p > q || p < q->ptr)) break;
	if (p + p->size == q->ptr) { /* two adjacent blocks, merge p and q->ptr (the 2nd and 4th cases) */
		p->size += q->ptr->size;
		p->ptr = q->ptr->ptr;
	} else if (p + p->size > q->ptr && q->ptr >= p) {
		panic("[kfree] The end of the allocated block enters a free block.");
	} else p->ptr = q->ptr; /* backup q->ptr */

	if (q + q->size == p) { /* two adjacent blocks, merge q and p (the other two cases) */
		q->size += p->size;
		q->ptr = p->ptr;
		km->loop_head = q;
	} else if (q + q->size > p && p >= q) {
		panic("[kfree] The end of a free block enters the allocated block.");
	} else km->loop_head = p, q->ptr = p; /* in two cores, cannot be merged; create a new block in the list */
}

void *kmalloc(void *_km, size_t n_bytes)
{
	kmem_t *km = (kmem_t*)_km;
	size_t n_units;
	header_t *p, *q;

	if (n_bytes == 0) return 0;
	if (km == NULL) return malloc(n_bytes);
	n_units = (n_bytes + sizeof(size_t) + sizeof(header_t) - 1) / sizeof(header_t); /* header+n_bytes requires at least this number of units */

	if (!(q = km->loop_head)) /* the first time when kmalloc() is called, initialize it */
		q = km->loop_head = km->base.ptr = &km->base;
	for (p = q->ptr;; q = p, p = p->ptr) { /* search for a suitable block */
		if (p->size >= n_units) { /* p->size is the size of the current block. This line means the current block is large enough. */
			if (p->size == n_units) q->ptr = p->ptr; /* no need to split the block */
			else { /* split the block. NB: memory is allocated at the end of the block! */
				p->size -= n_units; /* reduce the size of the free block */
				p += p->size; /* p points to the allocated block */
				*(size_t*)p = n_units; /* set the size */
			}
			km->loop_head = q; /* set the end of chain */
			return (size_t*)p + 1;
		}
		if (p == km->loop_head) { /* then ask for more "cores" */
			if ((p = morecore(km, n_units)) == 0) return 0;
		}
	}
}

void *kcalloc(void *_km, size_t count, size_t size)
{
	kmem_t *km = (kmem_t*)_km;
	void *p;
	if (size == 0 || count == 0) return 0;
	if (km == NULL) return calloc(count, size);
	p = kmalloc(km, count * size);
	memset(p, 0, count * size);
	return p;
}

void *krealloc(void *_km, void *ap, size_t n_bytes) // TODO: this can be made more efficient in principle
{
	kmem_t *km = (kmem_t*)_km;
	size_t cap, *p, *q;

	if (n_bytes == 0) {
		kfree(km, ap); return 0;
	}
	if (km == NULL) return realloc(ap, n_bytes);
	if (ap == NULL) return kmalloc(km, n_bytes);
	p = (size_t*)ap - 1;
	cap = (*p) * sizeof(header_t) - sizeof(size_t);
	if (cap >= n_bytes) return ap; /* TODO: this prevents shrinking */
	q = (size_t*)kmalloc(km, n_bytes);
	memcpy(q, ap, cap);
	kfree(km, ap);
	return q;
}

void *krelocate(void *km, void *ap, size_t n_bytes)
{
	void *p;
	if (km == 0 || ap == 0) return ap;
	p = kmalloc(km, n_bytes);
	memcpy(p, ap, n_bytes);
	kfree(km, ap);
	return p;
}

void km_stat(const void *_km, km_stat_t *s)
{
	kmem_t *km = (kmem_t*)_km;
	header_t *p;
	memset(s, 0, sizeof(km_stat_t));
	if (km == NULL || km->loop_head == NULL) return;
	for (p = km->loop_head;; p = p->ptr) {
		s->available += p->size * sizeof(header_t);
		if (p->size != 0) ++s->n_blocks; /* &kmem_t::base is always one of the cores. It is zero-sized. */
		if (p->ptr > p && p + p->size > p->ptr)
			panic("[km_stat] The end of a free block enters another free block.");
		if (p->ptr == km->loop_head) break;
	}
	for (p = km->core_head; p != NULL; p = p->ptr) {
		size_t size = p->size * sizeof(header_t);
		++s->n_cores;
		s->capacity += size;
		s->largest = s->largest > size? s->largest : size;
	}
}

void km_stat_print(const void *km)
{
	km_stat_t st;
	km_stat(km, &st);
	fprintf(stderr, "[km_stat] cap=%ld, avail=%ld, largest=%ld, n_core=%ld, n_block=%ld\n",
			st.capacity, st.available, st.largest, st.n_blocks, st.n_cores);
}
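A minimal usage sketch of the kalloc API defined above: allocations are carved from large cores, kfree() returns a block to the allocator's circular free list rather than to libc (passing a NULL allocator falls back to plain malloc/free), and km_destroy() releases every core at once. Only functions shown in kalloc.c are used.

#include "kalloc.h"

int main(void)
{
	void *km = km_init();  // create an allocator; cores are grown on demand
	int i, *a = (int*)kmalloc(km, 100 * sizeof(int));
	for (i = 0; i < 100; ++i) a[i] = i;
	a = (int*)krealloc(km, a, 200 * sizeof(int)); // grow; existing contents preserved
	kfree(km, a);          // block goes back to km's free list, not to libc
	km_stat_print(km);     // report capacity/availability/core counts to stderr
	km_destroy(km);        // free all cores in one sweep
	return 0;
}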