ruby-minigraph 0.0.20.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +62 -0
- data/ext/Rakefile +56 -0
- data/ext/cmappy/cmappy.c +7 -0
- data/ext/cmappy/cmappy.h +8 -0
- data/ext/minigraph/LICENSE.txt +23 -0
- data/ext/minigraph/Makefile +66 -0
- data/ext/minigraph/NEWS.md +317 -0
- data/ext/minigraph/README.md +207 -0
- data/ext/minigraph/algo.c +194 -0
- data/ext/minigraph/algo.h +33 -0
- data/ext/minigraph/asm-call.c +147 -0
- data/ext/minigraph/bseq.c +133 -0
- data/ext/minigraph/bseq.h +76 -0
- data/ext/minigraph/cal_cov.c +139 -0
- data/ext/minigraph/doc/example1.png +0 -0
- data/ext/minigraph/doc/example2.png +0 -0
- data/ext/minigraph/doc/examples.graffle +0 -0
- data/ext/minigraph/format.c +241 -0
- data/ext/minigraph/galign.c +140 -0
- data/ext/minigraph/gchain1.c +532 -0
- data/ext/minigraph/gcmisc.c +223 -0
- data/ext/minigraph/gfa-aug.c +260 -0
- data/ext/minigraph/gfa-base.c +526 -0
- data/ext/minigraph/gfa-bbl.c +372 -0
- data/ext/minigraph/gfa-ed.c +617 -0
- data/ext/minigraph/gfa-io.c +395 -0
- data/ext/minigraph/gfa-priv.h +154 -0
- data/ext/minigraph/gfa.h +166 -0
- data/ext/minigraph/ggen.c +182 -0
- data/ext/minigraph/ggen.h +21 -0
- data/ext/minigraph/ggsimple.c +570 -0
- data/ext/minigraph/gmap.c +211 -0
- data/ext/minigraph/index.c +230 -0
- data/ext/minigraph/kalloc.c +224 -0
- data/ext/minigraph/kalloc.h +82 -0
- data/ext/minigraph/kavl.h +414 -0
- data/ext/minigraph/kdq.h +134 -0
- data/ext/minigraph/ketopt.h +116 -0
- data/ext/minigraph/khashl.h +348 -0
- data/ext/minigraph/krmq.h +474 -0
- data/ext/minigraph/kseq.h +256 -0
- data/ext/minigraph/ksort.h +164 -0
- data/ext/minigraph/kstring.h +165 -0
- data/ext/minigraph/kthread.c +159 -0
- data/ext/minigraph/kthread.h +15 -0
- data/ext/minigraph/kvec-km.h +105 -0
- data/ext/minigraph/kvec.h +110 -0
- data/ext/minigraph/lchain.c +441 -0
- data/ext/minigraph/main.c +301 -0
- data/ext/minigraph/map-algo.c +500 -0
- data/ext/minigraph/mgpriv.h +128 -0
- data/ext/minigraph/minigraph.1 +359 -0
- data/ext/minigraph/minigraph.h +176 -0
- data/ext/minigraph/miniwfa.c +834 -0
- data/ext/minigraph/miniwfa.h +95 -0
- data/ext/minigraph/misc/mgutils.js +1451 -0
- data/ext/minigraph/misc.c +12 -0
- data/ext/minigraph/options.c +134 -0
- data/ext/minigraph/shortk.c +251 -0
- data/ext/minigraph/sketch.c +109 -0
- data/ext/minigraph/sys.c +147 -0
- data/ext/minigraph/sys.h +20 -0
- data/ext/minigraph/test/MT-chimp.fa +277 -0
- data/ext/minigraph/test/MT-human.fa +239 -0
- data/ext/minigraph/test/MT-orangA.fa +276 -0
- data/ext/minigraph/test/MT.gfa +19 -0
- data/ext/minigraph/tex/Makefile +13 -0
- data/ext/minigraph/tex/minigraph.bib +676 -0
- data/ext/minigraph/tex/minigraph.tex +986 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
- data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
- data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
- data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
- data/ext/minigraph/tex/plots/bedutils.js +367 -0
- data/ext/minigraph/tex/plots/chr-plot.js +130 -0
- data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
- data/ext/minigraph.patch +21 -0
- data/lib/minigraph/ffi/constants.rb +230 -0
- data/lib/minigraph/ffi/functions.rb +70 -0
- data/lib/minigraph/ffi/mappy.rb +8 -0
- data/lib/minigraph/ffi.rb +27 -0
- data/lib/minigraph/version.rb +5 -0
- data/lib/minigraph.rb +72 -0
- metadata +159 -0
@@ -0,0 +1,12 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include "mgpriv.h"
|
3
|
+
#include "ksort.h"
|
4
|
+
|
5
|
+
/* Process-wide settings shared by the minigraph sources in this package. */

/* Message level: mg_opt_check() reports errors when >= 1 and
 * mg_opt_update() prints its summary when >= 3. */
int mg_verbose = 1;

/* Bit set of MG_DBG_* flags; tested by mg_shortest_k(). */
int mg_dbg_flag = 0;

/* Reference wall-clock time; log lines print realtime() - mg_realtime0. */
double mg_realtime0;
|
8
|
+
|
9
|
+
#define sort_key_128x(a) ((a).x)
|
10
|
+
KRADIX_SORT_INIT(128x, mg128_t, sort_key_128x, 8)
|
11
|
+
|
12
|
+
KSORT_INIT_GENERIC(uint32_t)
|
@@ -0,0 +1,134 @@
|
|
1
|
+
#include <string.h>
|
2
|
+
#include "mgpriv.h"
|
3
|
+
#include "sys.h"
|
4
|
+
|
5
|
+
/**
 * Reset an indexing option block to the built-in defaults.
 *
 * @param io  option block to overwrite; every field not listed below is zeroed
 */
void mg_idxopt_init(mg_idxopt_t *io)
{
	memset(io, 0, sizeof(*io));
	io->bucket_bits = 14;
	io->w = 11;
	io->k = 17;
}
|
12
|
+
|
13
|
+
/**
 * Reset a mapping option block to the built-in defaults.
 *
 * @param mo  option block to overwrite; every field not listed below is zeroed
 */
void mg_mapopt_init(mg_mapopt_t *mo)
{
	memset(mo, 0, sizeof(*mo));

	mo->seed = 11;

	/* occurrence thresholds; mg_opt_update() may raise them up to the cap */
	mo->occ_max1 = 50;
	mo->occ_max1_cap = 250;
	mo->occ_max1_frac = 2e-4f;
	mo->lc_max_occ = 2;

	/* gap limits */
	mo->max_gap = 5000;
	mo->max_gap_ref = -1;
	mo->max_gap_pre = 1000;

	/* chaining: skips, iterations and bandwidth */
	mo->max_lc_skip = 25;
	mo->max_gc_skip = 25;
	mo->max_lc_iter = 5000;
	mo->bw = 500;
	mo->bw_long = 20000;

	/* RMQ settings */
	mo->rmq_size_cap = 100000;
	mo->rmq_rescue_size = 1000;
	mo->rmq_rescue_ratio = 0.1f;

	mo->mini_batch_size = 500000000;

	/* chain scoring */
	mo->div = 0.1f;
	mo->chn_pen_gap = 1.0f;
	mo->chn_pen_skip = 0.05f;
	mo->min_lc_cnt = 5;
	mo->min_lc_score = 40;
	mo->min_gc_cnt = 5;
	mo->min_gc_score = 50;

	mo->gdp_max_ed = 10000;
	mo->lc_max_trim = 50;

	/* selection among competing hits */
	mo->mask_level = 0.5f;
	mo->sub_diff = 6;
	mo->best_n = 5;
	mo->pri_ratio = 0.8f;
	mo->ref_bonus = 0;
	mo->pe_ori = 0; // FF

	/* coverage filters */
	mo->min_cov_mapq = 20;
	mo->min_cov_blen = 1000;

	mo->cap_kalloc = 1000000000;
}
|
46
|
+
|
47
|
+
/**
 * Reset a graph-generation option block to the built-in defaults.
 *
 * @param go  option block to overwrite; every field not listed below is zeroed
 */
void mg_ggopt_init(mg_ggopt_t *go)
{
	memset(go, 0, sizeof(*go));

	go->algo = MG_G_NONE;
	go->flag |= MG_G_NO_QOVLP;

	go->min_map_len = 100000;
	go->min_depth_len = 20000;
	go->min_mapq = 5;
	go->min_var_len = 50;
	go->match_pen = 10;

	/* settings used by the "ggs" algorithm */
	go->ggs_shrink_pen = 9;
	go->ggs_min_end_cnt = 10;
	go->ggs_min_end_frac = 0.1f;
	go->ggs_max_iden = 0.80f;
	go->ggs_min_inv_iden = 0.95f;
}
|
64
|
+
|
65
|
+
/**
 * Apply a named preset to the three option blocks.
 *
 * @param preset  NULL to load the built-in defaults into all three blocks, or
 *                one of "lr", "asm", "ggs", "se", "sr". Named presets only
 *                override selected fields; they do not re-initialize the blocks.
 * @param io      indexing options
 * @param mo      mapping options
 * @param go      graph-generation options
 * @return 0 on success; -1 if the preset name is not recognized
 */
int mg_opt_set(const char *preset, mg_idxopt_t *io, mg_mapopt_t *mo, mg_ggopt_t *go)
{
	int is_ggs, is_sr;

	if (preset == 0) { // no preset name: install the defaults
		mg_idxopt_init(io);
		mg_mapopt_init(mo);
		mg_ggopt_init(go);
		return 0;
	}

	if (strcmp(preset, "lr") == 0) // this is the default; nothing to override
		return 0;

	is_ggs = (strcmp(preset, "ggs") == 0);
	if (is_ggs || strcmp(preset, "asm") == 0) {
		io->k = 19;
		io->w = 10;
		mo->flag |= MG_M_RMQ;
		mo->occ_max1 = 10;
		mo->occ_max1_cap = 100;
		mo->bw = 1000;
		mo->bw_long = 150000;
		mo->max_gap = 10000;
		mo->max_gap_pre = 1000;
		mo->min_lc_cnt = 5;
		mo->min_lc_score = 40;
		mo->min_gc_cnt = 5;
		mo->min_gc_score = 1000;
		mo->min_cov_mapq = 5;
		mo->min_cov_blen = 100000;
		mo->max_lc_skip = mo->max_gc_skip = 50;
		mo->div = 0.01f;
		mo->mini_batch_size = 4000000000LL;
		if (is_ggs) { // "ggs" additionally selects the graph-generation algorithm
			go->algo = MG_G_GGSIMPLE;
			mo->best_n = 0;
		}
		return 0;
	}

	is_sr = (strcmp(preset, "sr") == 0);
	if (is_sr || strcmp(preset, "se") == 0) {
		io->k = 21;
		io->w = 10;
		mo->flag |= MG_M_SR | MG_M_HEAP_SORT | MG_M_2_IO_THREADS;
		mo->occ_max1 = 1000;
		mo->occ_max1_cap = 2500;
		mo->max_gap = 100;
		mo->bw = mo->bw_long = 100;
		mo->max_frag_len = 800;
		mo->pri_ratio = 0.5f;
		mo->min_lc_cnt = 2;
		mo->min_lc_score = 25;
		mo->min_gc_cnt = 3;
		mo->min_gc_score = 40;
		mo->mini_batch_size = 50000000;
		mo->min_cov_blen = 50;
		mo->chn_pen_gap = 0.2f;
		mo->ref_bonus = 1;
		if (is_sr) { // "sr" additionally turns on fragment mode
			mo->flag |= MG_M_FRAG_MODE | MG_M_FRAG_MERGE;
			mo->pe_ori = 0<<1|1; // FR
		}
		return 0;
	}

	return -1; // unknown preset
}
|
109
|
+
|
110
|
+
int mg_opt_check(const mg_idxopt_t *io, const mg_mapopt_t *mo, const mg_ggopt_t *go)
|
111
|
+
{
|
112
|
+
if ((mo->flag & MG_M_FRAG_MODE) && !(mo->flag & MG_M_FRAG_MERGE)) {
|
113
|
+
if (mg_verbose >= 1)
|
114
|
+
fprintf(stderr, "[ERROR]\033[1;31m the fragment-without-merge mode is not implemented\033[0m\n");
|
115
|
+
return -1;
|
116
|
+
}
|
117
|
+
return 0;
|
118
|
+
}
|
119
|
+
|
120
|
+
/**
 * Adjust mapping options once an index is available.
 *
 * Queries two quantiles from the index (at fractions 0.1 and
 * mo->occ_max1_frac), raises mo->lc_max_occ and mo->occ_max1 to them when
 * they are larger, clamps both to mo->occ_max1_cap, and makes sure
 * mo->bw_long is not smaller than mo->bw.
 * NOTE(review): the quantiles are presumably of minimizer occurrence counts --
 * confirm against mg_idx_cal_quantile().
 *
 * @param gi  index queried for the quantiles
 * @param mo  mapping options, updated in place
 * @param go  graph-generation options (not used)
 */
void mg_opt_update(const mg_idx_t *gi, mg_mapopt_t *mo, mg_ggopt_t *go)
{
	float frac[2] = { 0.1f, mo->occ_max1_frac };
	int32_t quant[2];

	mg_idx_cal_quantile(gi, 2, frac, quant);

	/* raise to the quantile, then clamp to the cap */
	if (mo->lc_max_occ < quant[0])
		mo->lc_max_occ = quant[0];
	if (mo->lc_max_occ > mo->occ_max1_cap)
		mo->lc_max_occ = mo->occ_max1_cap;

	if (mo->occ_max1 < quant[1])
		mo->occ_max1 = quant[1];
	if (mo->occ_max1 > mo->occ_max1_cap)
		mo->occ_max1 = mo->occ_max1_cap;

	if (mo->bw_long < mo->bw)
		mo->bw_long = mo->bw;

	if (mg_verbose >= 3) {
		fprintf(stderr, "[M::%s::%.3f*%.2f] occ_max1=%d; lc_max_occ=%d\n", __func__,
				realtime() - mg_realtime0, cputime() / (realtime() - mg_realtime0), mo->occ_max1, mo->lc_max_occ);
	}
}
|
@@ -0,0 +1,251 @@
|
|
1
|
+
#include "mgpriv.h"
|
2
|
+
#include "ksort.h"
|
3
|
+
#include "kavl.h"
|
4
|
+
#include "algo.h"
|
5
|
+
#include "khashl.h"
|
6
|
+
|
7
|
+
typedef struct sp_node_s {
|
8
|
+
uint64_t di; // dist<<32 | unique_id
|
9
|
+
uint32_t v;
|
10
|
+
int32_t pre;
|
11
|
+
uint32_t hash;
|
12
|
+
int32_t is_0;
|
13
|
+
KAVL_HEAD(struct sp_node_s) head;
|
14
|
+
} sp_node_t, *sp_node_p;
|
15
|
+
|
16
|
+
#define sp_node_cmp(a, b) (((a)->di > (b)->di) - ((a)->di < (b)->di))
|
17
|
+
KAVL_INIT(sp, sp_node_t, head, sp_node_cmp)
|
18
|
+
|
19
|
+
#define sp_node_lt(a, b) ((a)->di < (b)->di)
|
20
|
+
KSORT_INIT(sp, sp_node_p, sp_node_lt)
|
21
|
+
|
22
|
+
typedef struct {
|
23
|
+
int32_t k;
|
24
|
+
int32_t qs, qe;
|
25
|
+
sp_node_t *p[MG_MAX_SHORT_K]; // this forms a max-heap
|
26
|
+
} sp_topk_t;
|
27
|
+
|
28
|
+
KHASHL_MAP_INIT(KH_LOCAL, kh_sp_t, sp, uint32_t, sp_topk_t, kh_hash_uint32, kh_eq_generic)
|
29
|
+
KHASHL_MAP_INIT(KH_LOCAL, kh_sp2_t, sp2, uint32_t, uint64_t, kh_hash_uint32, kh_eq_generic)
|
30
|
+
|
31
|
+
#define MG_SHORT_K_EXT 1000
|
32
|
+
|
33
|
+
static inline sp_node_t *gen_sp_node(void *km, const gfa_t *g, uint32_t v, int32_t d, int32_t id)
|
34
|
+
{
|
35
|
+
sp_node_t *p;
|
36
|
+
KMALLOC(km, p, 1);
|
37
|
+
p->v = v, p->di = (uint64_t)d<<32 | id, p->pre = -1, p->is_0 = 1;
|
38
|
+
return p;
|
39
|
+
}
|
40
|
+
|
41
|
+
/**
 * Search outward from src, keeping up to max_k shortest paths per vertex,
 * and record for every destination the path that best matches its request.
 *
 * On return, each non-inner dst[i] that was reached has n_path > 0 and its
 * dist, hash, is_0 and path_end fields filled in. path_end indexes the
 * returned array, or is -1 if no path was recorded. Inner destinations
 * (dst[i].inner set) are initialized to dist 0 / n_path 1 and treated as
 * done as soon as their vertex is reached.
 *
 * @param km0       parent memory pool; the returned array is allocated from it
 * @param g         graph
 * @param src       source vertex
 * @param n_dst     number of destinations; <= 0 returns immediately
 * @param dst       destinations, updated in place
 * @param max_dist  arcs that would push the distance beyond this are not followed
 * @param max_k     paths kept per vertex; capped at MG_MAX_SHORT_K
 * @param n_pathv   out: length of the returned array; when NULL no array is built
 * @return backtrack array of *n_pathv elements, each with its vertex, distance
 *         and predecessor index (-1 for the source); NULL when nothing was
 *         found, n_pathv is NULL, or an internal inconsistency was detected
 */
mg_pathv_t *mg_shortest_k(void *km0, const gfa_t *g, uint32_t src, int32_t n_dst, mg_path_dst_t *dst, int32_t max_dist, int32_t max_k, int32_t *n_pathv)
{
	sp_node_t *p, *root = 0, **out;
	sp_topk_t *q;
	kh_sp_t *h;
	kh_sp2_t *h2;
	void *km;
	khint_t k;
	int absent;
	int32_t i, j, n_done, n_found;
	uint32_t id, n_out, m_out;
	int8_t *dst_done;
	mg_pathv_t *ret = 0;
	uint64_t *dst_group, *seeds = 0;
	void *h_seeds = 0;
	mg128_v mini = {0,0,0};

	if (n_pathv) *n_pathv = 0;
	if (n_dst <= 0) return 0;
	for (i = 0; i < n_dst; ++i) { // initialize
		mg_path_dst_t *t = &dst[i];
		if (t->inner)
			t->dist = 0, t->n_path = 1, t->path_end = -1;
		else
			t->dist = -1, t->n_path = 0, t->path_end = -1;
	}
	if (max_k > MG_MAX_SHORT_K) max_k = MG_MAX_SHORT_K;
	km = (mg_dbg_flag&MG_DBG_NO_KALLOC) && (mg_dbg_flag&MG_DBG_SHORTK)? 0 : km_init2(km0, 0x4000);

	KCALLOC(km, dst_done, n_dst);
	KMALLOC(km, dst_group, n_dst);
	for (i = 0; i < n_dst; ++i) // multiple dst[] may have the same dst[].v. We need to group them first.
		dst_group[i] = (uint64_t)dst[i].v<<32 | i;
	radix_sort_gfa64(dst_group, dst_group + n_dst);

	h2 = sp2_init2(km); // this hash table keeps all destinations
	sp2_resize(h2, n_dst * 2);
	for (i = 1, j = 0; i <= n_dst; ++i) {
		if (i == n_dst || dst_group[i]>>32 != dst_group[j]>>32) {
			k = sp2_put(h2, dst_group[j]>>32, &absent);
			kh_val(h2, k) = (uint64_t)j << 32 | (i - j); // offset<<32 | count in dst_group[]
			assert(absent);
			j = i;
		}
	}

	h = sp_init2(km); // this hash table keeps visited vertices
	sp_resize(h, 16);
	m_out = 16, n_out = 0;
	KMALLOC(km, out, m_out);

	id = 0;
	p = gen_sp_node(km, g, src, 0, id++);
	p->hash = kh_hash_uint32(src);
	kavl_insert(sp, &root, p, 0);
	k = sp_put(h, src, &absent);
	q = &kh_val(h, k);
	q->k = 1, q->p[0] = p, q->qs = q->qe = -1;

	n_done = 0;
	while (kavl_size(head, root) > 0) {
		int32_t i, nv;
		gfa_arc_t *av;
		sp_node_t *r;

		r = kavl_erase_first(sp, &root); // take out the closest vertex in the heap (as a binary tree)
		if (n_out == m_out) KEXPAND(km, out, m_out);
		r->di = r->di>>32<<32 | n_out; // lower 32 bits now for position in the out[] array
		out[n_out++] = r;

		k = sp2_get(h2, r->v);
		if (k != kh_end(h2)) { // we have reached one dst vertex
			int32_t j, dist = r->di>>32, off = kh_val(h2, k) >> 32, cnt = (int32_t)kh_val(h2, k);
			for (j = 0; j < cnt; ++j) {
				mg_path_dst_t *t = &dst[(int32_t)dst_group[off + j]];
				int32_t done = 0;
				if (t->inner) {
					done = 1;
				} else {
					int32_t copy = 0;
					if (t->n_path == 0) { // keep the shortest path
						copy = 1;
					} else if (t->target_dist >= 0) { // we have a target distance; choose the closest
						if (dist == t->target_dist && t->check_hash && r->hash == t->target_hash) { // we found the target path
							copy = 1, done = 1;
						} else {
							int32_t d0 = t->dist, d1 = dist;
							d0 = d0 > t->target_dist? d0 - t->target_dist : t->target_dist - d0;
							d1 = d1 > t->target_dist? d1 - t->target_dist : t->target_dist - d1;
							if (d1 < d0) copy = 1;
						}
					}
					if (copy) {
						t->path_end = n_out - 1, t->dist = dist, t->hash = r->hash, t->is_0 = r->is_0;
						if (t->target_dist >= 0) {
							if (dist == t->target_dist && t->check_hash && r->hash == t->target_hash) done = 1;
							else if (dist > t->target_dist + MG_SHORT_K_EXT) done = 1;
						}
					}
					++t->n_path;
					if (t->n_path >= max_k) done = 1;
				}
				if (dst_done[off + j] == 0 && done)
					dst_done[off + j] = 1, ++n_done;
			}
			if (n_done == n_dst) break;
		}

		nv = gfa_arc_n(g, r->v);
		av = gfa_arc_a(g, r->v);
		for (i = 0; i < nv; ++i) { // visit all neighbors
			gfa_arc_t *ai = &av[i];
			int32_t d = (r->di>>32) + (uint32_t)ai->v_lv;
			if (d > max_dist) continue; // don't probe vertices too far away
			k = sp_put(h, ai->w, &absent);
			q = &kh_val(h, k);
			if (absent) { // a new vertex visited
				q->k = 0, q->qs = q->qe = -1;
			}
			if (q->k < max_k) { // enough room: add to the heap
				p = gen_sp_node(km, g, ai->w, d, id++);
				p->pre = n_out - 1;
				p->hash = r->hash + kh_hash_uint32(ai->w);
				p->is_0 = r->is_0;
				if (ai->rank > 0) p->is_0 = 0;
				kavl_insert(sp, &root, p, 0);
				q->p[q->k++] = p;
				ks_heapup_sp(q->k, q->p);
			} else if (q->p[0]->di>>32 > d) { // shorter than the longest path so far: replace the longest
				p = kavl_erase(sp, &root, q->p[0], 0);
				if (p) {
					p->di = (uint64_t)d<<32 | (id++);
					p->pre = n_out - 1;
					p->hash = r->hash + kh_hash_uint32(ai->w);
					p->is_0 = r->is_0;
					if (ai->rank > 0) p->is_0 = 0;
					kavl_insert(sp, &root, p, 0);
					ks_heapdown_sp(0, q->k, q->p);
				} else {
					fprintf(stderr, "Warning: logical bug in gfa_shortest_k(): q->k=%d,q->p[0]->{d,i}={%d,%d},d=%d,src=%u,max_dist=%d,n_dst=%d\n", q->k, (int32_t)(q->p[0]->di>>32), (int32_t)q->p[0]->di, d, src, max_dist, n_dst);
					km_destroy(km);
					return 0;
				}
			} // else: the path is longer than all the existing paths ended at ai->w
		}
	}

	// dst_group is still needed by the backtrack step below; it is freed at the end.
	kfree(km, dst_done);
	sp_destroy(h);
	mg_idx_hfree(h_seeds);
	kfree(km, seeds);
	kfree(km, mini.a);
	// NB: AVL nodes are not deallocated. When km==0, they are memory leaks.

	for (i = 0, n_found = 0; i < n_dst; ++i)
		if (dst[i].n_path > 0) ++n_found;

	if (n_found > 0 && n_pathv) { // then generate the backtrack array
		int32_t n, *trans;
		KCALLOC(km, trans, n_out); // used to squeeze unused elements in out[]
		for (i = 0; i < n_dst; ++i) { // mark dst vertices with a target distance
			mg_path_dst_t *t = &dst[i];
			if (t->n_path > 0 && t->target_dist >= 0 && t->path_end >= 0)
				trans[(int32_t)out[t->path_end]->di] = 1;
		}
		for (i = 0; i < n_out; ++i) { // mark dst vertices without a target distance
			k = sp2_get(h2, out[i]->v);
			if (k != kh_end(h2)) {
				int32_t off = kh_val(h2, k)>>32, cnt = (int32_t)kh_val(h2, k);
				// off/cnt address the sorted dst_group[] array; its low 32 bits
				// hold the original index into dst[].
				for (j = off; j < off + cnt; ++j)
					if (dst[(int32_t)dst_group[j]].target_dist < 0)
						trans[i] = 1;
			}
		}
		for (i = n_out - 1; i >= 0; --i) // mark all predecessors
			if (trans[i] && out[i]->pre >= 0)
				trans[out[i]->pre] = 1;
		for (i = n = 0; i < n_out; ++i) // generate coordinate translations
			if (trans[i]) trans[i] = n++;
			else trans[i] = -1;

		*n_pathv = n;
		KMALLOC(km0, ret, n);
		for (i = 0; i < n_out; ++i) { // generate the backtrack array
			mg_pathv_t *p;
			if (trans[i] < 0) continue;
			p = &ret[trans[i]];
			p->v = out[i]->v, p->d = out[i]->di >> 32;
			p->pre = out[i]->pre < 0? out[i]->pre : trans[out[i]->pre];
		}
		for (i = 0; i < n_dst; ++i) // translate "path_end"
			if (dst[i].path_end >= 0)
				dst[i].path_end = trans[dst[i].path_end];
	}

	kfree(km, dst_group);
	km_destroy(km);
	return ret;
}
|
243
|
+
|
244
|
+
/**
 * Dump a backtrack array, one tab-separated line per element:
 * "[index]", vertex, segment name, distance, predecessor index.
 *
 * @param fp    output stream
 * @param g     graph used to look up segment names (g->seg[v>>1].name)
 * @param n     number of elements in path
 * @param path  array to print
 */
void mg_sub_print_path(FILE *fp, const gfa_t *g, int32_t n, mg_pathv_t *path)
{
	int32_t idx = 0;
	while (idx < n) {
		const mg_pathv_t *e = path + idx;
		fprintf(fp, "[%d]\t%d\t%s\t%d\t%d\n", idx, e->v, g->seg[e->v>>1].name, e->d, e->pre);
		++idx;
	}
}
|
@@ -0,0 +1,109 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <stdlib.h>
|
3
|
+
#include <assert.h>
|
4
|
+
#include <string.h>
|
5
|
+
#define __STDC_LIMIT_MACROS
|
6
|
+
#include "kvec-km.h"
|
7
|
+
#include "mgpriv.h"
|
8
|
+
|
9
|
+
/*
 * Byte -> 2-bit nucleotide code.
 * 'A'/'a' -> 0, 'C'/'c' -> 1, 'G'/'g' -> 2, 'T'/'t'/'U'/'u' -> 3; the raw
 * byte values 0..3 map to themselves; every other byte maps to 4.
 */
unsigned char seq_nt4_table[256] = {
	/* 0x00 */ 0, 1, 2, 3,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x10 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x20 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x30 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x40 */ 4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x50 */ 4, 4, 4, 4,  3, 3, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x60 */ 4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x70 */ 4, 4, 4, 4,  3, 3, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x80 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x90 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xa0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xb0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xc0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xd0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xe0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xf0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
};
|
27
|
+
|
28
|
+
/**
 * Mix the bits of key, keeping the result within mask.
 *
 * Alternates wrap-around multiply-and-mask steps with xor-shift steps.
 *
 * @param key   value to hash
 * @param mask  bit mask applied after every additive step
 * @return hashed value; always satisfies (result & ~mask) == 0
 */
static inline uint64_t hash64(uint64_t key, uint64_t mask)
{
	key = ((key << 21) - key - 1) & mask; // same as ~key + (key << 21)
	key ^= key >> 24;
	key = (key * 265) & mask;             // key + (key << 3) + (key << 8)
	key ^= key >> 14;
	key = (key * 21) & mask;              // key + (key << 2) + (key << 4)
	key ^= key >> 28;
	key = (key + (key << 31)) & mask;
	return key;
}
|
39
|
+
|
40
|
+
/**
|
41
|
+
* Find symmetric (w,k)-minimizers on a DNA sequence
|
42
|
+
*
|
43
|
+
* @param km thread-local memory pool; using NULL falls back to malloc()
|
44
|
+
* @param str DNA sequence
|
45
|
+
* @param len length of $str
|
46
|
+
* @param w find a minimizer for every $w consecutive k-mers
|
47
|
+
* @param k k-mer size
|
48
|
+
* @param rid reference ID; will be copied to the output $p array
|
49
|
+
* @param p minimizers
|
50
|
+
* p->a[i].x = kMer<<8 | kmerSpan
|
51
|
+
* p->a[i].y = rid<<32 | lastPos<<1 | strand
|
52
|
+
* where lastPos is the position of the last base of the i-th minimizer,
|
53
|
+
* and strand indicates whether the minimizer comes from the top or the bottom strand.
|
54
|
+
* Callers may want to set "p->n = 0"; otherwise results are appended to p
|
55
|
+
*/
|
56
|
+
void mg_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, mg128_v *p)
|
57
|
+
{
|
58
|
+
uint64_t shift1 = 2 * (k - 1), mask = (1ULL<<2*k) - 1, kmer[2] = {0,0};
|
59
|
+
int i, j, l, buf_pos, min_pos, kmer_span = 0;
|
60
|
+
mg128_t buf[256], min = { UINT64_MAX, UINT64_MAX };
|
61
|
+
|
62
|
+
assert(len > 0 && (w > 0 && w < 256) && (k > 0 && k <= 28)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice
|
63
|
+
memset(buf, 0xff, w * 16);
|
64
|
+
kv_resize(mg128_t, km, *p, p->n + len/w);
|
65
|
+
|
66
|
+
for (i = l = buf_pos = min_pos = 0; i < len; ++i) {
|
67
|
+
int c = seq_nt4_table[(uint8_t)str[i]];
|
68
|
+
mg128_t info = { UINT64_MAX, UINT64_MAX };
|
69
|
+
if (c < 4) { // not an ambiguous base
|
70
|
+
int z;
|
71
|
+
kmer_span = l + 1 < k? l + 1 : k;
|
72
|
+
kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer
|
73
|
+
kmer[1] = (kmer[1] >> 2) | (3ULL^c) << shift1; // reverse k-mer
|
74
|
+
if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know it strand
|
75
|
+
z = kmer[0] < kmer[1]? 0 : 1; // strand
|
76
|
+
++l;
|
77
|
+
if (l >= k && kmer_span < 256) {
|
78
|
+
info.x = hash64(kmer[z], mask) << 8 | kmer_span;
|
79
|
+
info.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z;
|
80
|
+
}
|
81
|
+
} else l = 0, kmer_span = 0;
|
82
|
+
buf[buf_pos] = info; // need to do this here as appropriate buf_pos and buf[buf_pos] are needed below
|
83
|
+
if (l == w + k - 1 && min.x != UINT64_MAX) { // special case for the first window - because identical k-mers are not stored yet
|
84
|
+
for (j = buf_pos + 1; j < w; ++j)
|
85
|
+
if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mg128_t, km, *p, buf[j]);
|
86
|
+
for (j = 0; j < buf_pos; ++j)
|
87
|
+
if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mg128_t, km, *p, buf[j]);
|
88
|
+
}
|
89
|
+
if (info.x <= min.x) { // a new minimum; then write the old min
|
90
|
+
if (l >= w + k && min.x != UINT64_MAX) kv_push(mg128_t, km, *p, min);
|
91
|
+
min = info, min_pos = buf_pos;
|
92
|
+
} else if (buf_pos == min_pos) { // old min has moved outside the window
|
93
|
+
if (l >= w + k - 1 && min.x != UINT64_MAX) kv_push(mg128_t, km, *p, min);
|
94
|
+
for (j = buf_pos + 1, min.x = UINT64_MAX; j < w; ++j) // the two loops are necessary when there are identical k-mers
|
95
|
+
if (min.x >= buf[j].x) min = buf[j], min_pos = j; // >= is important s.t. min is always the closest k-mer
|
96
|
+
for (j = 0; j <= buf_pos; ++j)
|
97
|
+
if (min.x >= buf[j].x) min = buf[j], min_pos = j;
|
98
|
+
if (l >= w + k - 1 && min.x != UINT64_MAX) { // write identical k-mers
|
99
|
+
for (j = buf_pos + 1; j < w; ++j) // these two loops make sure the output is sorted
|
100
|
+
if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mg128_t, km, *p, buf[j]);
|
101
|
+
for (j = 0; j <= buf_pos; ++j)
|
102
|
+
if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mg128_t, km, *p, buf[j]);
|
103
|
+
}
|
104
|
+
}
|
105
|
+
if (++buf_pos == w) buf_pos = 0;
|
106
|
+
}
|
107
|
+
if (min.x != UINT64_MAX)
|
108
|
+
kv_push(mg128_t, km, *p, min);
|
109
|
+
}
|
data/ext/minigraph/sys.c
ADDED
@@ -0,0 +1,147 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include "sys.h"
|
3
|
+
|
4
|
+
#if defined(WIN32) || defined(_WIN32)
|
5
|
+
#include <windows.h>
|
6
|
+
|
7
|
+
struct timezone
|
8
|
+
{
|
9
|
+
__int32 tz_minuteswest; /* minutes W of Greenwich */
|
10
|
+
int tz_dsttime; /* type of dst correction */
|
11
|
+
};
|
12
|
+
|
13
|
+
/*
|
14
|
+
* gettimeofday.c
|
15
|
+
* Win32 gettimeofday() replacement
|
16
|
+
* taken from PostgreSQL, according to
|
17
|
+
* https://stackoverflow.com/questions/1676036/what-should-i-use-to-replace-gettimeofday-on-windows
|
18
|
+
*
|
19
|
+
* src/port/gettimeofday.c
|
20
|
+
*
|
21
|
+
* Copyright (c) 2003 SRA, Inc.
|
22
|
+
* Copyright (c) 2003 SKC, Inc.
|
23
|
+
*
|
24
|
+
* Permission to use, copy, modify, and distribute this software and
|
25
|
+
* its documentation for any purpose, without fee, and without a
|
26
|
+
* written agreement is hereby granted, provided that the above
|
27
|
+
* copyright notice and this paragraph and the following two
|
28
|
+
* paragraphs appear in all copies.
|
29
|
+
*
|
30
|
+
* IN NO EVENT SHALL THE AUTHOR BE LIABLE TO ANY PARTY FOR DIRECT,
|
31
|
+
* INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
|
32
|
+
* LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
|
33
|
+
* DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED
|
34
|
+
* OF THE POSSIBILITY OF SUCH DAMAGE.
|
35
|
+
*
|
36
|
+
* THE AUTHOR SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT
|
37
|
+
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
38
|
+
* A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS
|
39
|
+
* IS" BASIS, AND THE AUTHOR HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE,
|
40
|
+
* SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
|
41
|
+
*/
|
42
|
+
|
43
|
+
/* FILETIME of Jan 1 1970 00:00:00. */
|
44
|
+
static const unsigned __int64 epoch = ((unsigned __int64) 116444736000000000ULL);
|
45
|
+
|
46
|
+
/*
|
47
|
+
* timezone information is stored outside the kernel so tzp isn't used anymore.
|
48
|
+
*
|
49
|
+
* Note: this function is not for Win32 high precision timing purpose. See
|
50
|
+
* elapsed_time().
|
51
|
+
*/
|
52
|
+
int gettimeofday(struct timeval * tp, struct timezone *tzp)
|
53
|
+
{
|
54
|
+
FILETIME file_time;
|
55
|
+
SYSTEMTIME system_time;
|
56
|
+
ULARGE_INTEGER ularge;
|
57
|
+
|
58
|
+
GetSystemTime(&system_time);
|
59
|
+
SystemTimeToFileTime(&system_time, &file_time);
|
60
|
+
ularge.LowPart = file_time.dwLowDateTime;
|
61
|
+
ularge.HighPart = file_time.dwHighDateTime;
|
62
|
+
|
63
|
+
tp->tv_sec = (long) ((ularge.QuadPart - epoch) / 10000000L);
|
64
|
+
tp->tv_usec = (long) (system_time.wMilliseconds * 1000);
|
65
|
+
|
66
|
+
return 0;
|
67
|
+
}
|
68
|
+
|
69
|
+
// taken from https://stackoverflow.com/questions/5272470/c-get-cpu-usage-on-linux-and-windows
|
70
|
+
double cputime()
|
71
|
+
{
|
72
|
+
HANDLE hProcess = GetCurrentProcess();
|
73
|
+
FILETIME ftCreation, ftExit, ftKernel, ftUser;
|
74
|
+
SYSTEMTIME stKernel;
|
75
|
+
SYSTEMTIME stUser;
|
76
|
+
|
77
|
+
GetProcessTimes(hProcess, &ftCreation, &ftExit, &ftKernel, &ftUser);
|
78
|
+
FileTimeToSystemTime(&ftKernel, &stKernel);
|
79
|
+
FileTimeToSystemTime(&ftUser, &stUser);
|
80
|
+
|
81
|
+
double kernelModeTime = ((stKernel.wHour * 60.) + stKernel.wMinute * 60.) + stKernel.wSecond * 1. + stKernel.wMilliseconds / 1000.;
|
82
|
+
double userModeTime = ((stUser.wHour * 60.) + stUser.wMinute * 60.) + stUser.wSecond * 1. + stUser.wMilliseconds / 1000.;
|
83
|
+
|
84
|
+
return kernelModeTime + userModeTime;
|
85
|
+
}
|
86
|
+
|
87
|
+
/* Peak resident set size is not implemented on Windows; always reports 0. */
long peakrss(void)
{
	return 0;
}
|
88
|
+
#else
|
89
|
+
#include <sys/resource.h>
|
90
|
+
#include <sys/time.h>
|
91
|
+
|
92
|
+
/**
 * CPU time (user + system) consumed by the current process.
 *
 * @return seconds, from getrusage(RUSAGE_SELF)
 */
double cputime(void)
{
	struct rusage usage;
	double whole_secs, micro_secs;

	getrusage(RUSAGE_SELF, &usage);
	whole_secs = usage.ru_utime.tv_sec + usage.ru_stime.tv_sec;
	micro_secs = usage.ru_utime.tv_usec + usage.ru_stime.tv_usec;
	return whole_secs + 1e-6 * micro_secs;
}
|
98
|
+
|
99
|
+
/**
 * Peak resident set size of the current process.
 *
 * @return ru_maxrss from getrusage(RUSAGE_SELF), multiplied by 1024 when
 *         built for Linux and returned unchanged elsewhere
 */
long peakrss(void)
{
	struct rusage usage;
	long scale = 1;
#ifdef __linux__
	scale = 1024;
#endif
	getrusage(RUSAGE_SELF, &usage);
	return usage.ru_maxrss * scale;
}
|
109
|
+
|
110
|
+
#endif /* WIN32 || _WIN32 */
|
111
|
+
|
112
|
+
/**
 * Current wall-clock time.
 *
 * @return seconds (with a microsecond fraction) as reported by gettimeofday()
 */
double realtime(void)
{
	struct timeval now;
	gettimeofday(&now, NULL);
	return (double)now.tv_sec + 1e-6 * now.tv_usec;
}
|
118
|
+
|
119
|
+
/**
 * fputs() wrapper that terminates the program on failure.
 *
 * @param str  NUL-terminated string to write
 * @param fp   output stream
 *
 * If fputs() returns EOF, prints an error to stderr and calls
 * exit(EXIT_FAILURE).
 */
void mg_err_fputs(const char *str, FILE *fp)
{
	if (fputs(str, fp) != EOF)
		return;
	fprintf(stderr, "[ERROR] failed to write the results\n");
	exit(EXIT_FAILURE);
}
|
128
|
+
|
129
|
+
/**
 * fwrite() wrapper that terminates the program on a short write.
 *
 * @param p       data to write
 * @param size    size of one item in bytes
 * @param nitems  number of items
 * @param fp      output stream
 *
 * fwrite() returns the number of items written, never EOF, so a failure
 * shows up as a count smaller than nitems. In that case an error is
 * printed to stderr and the program exits with EXIT_FAILURE.
 */
void mg_err_fwrite(const void *p, size_t size, size_t nitems, FILE *fp)
{
	size_t n_written;

	// With a zero size or count fwrite() returns 0 and writes nothing;
	// that is not an error, so skip the call and the check.
	if (size == 0 || nitems == 0)
		return;

	n_written = fwrite(p, size, nitems, fp);
	if (n_written != nitems) {
		fprintf(stderr, "[ERROR] failed to write data\n");
		exit(EXIT_FAILURE);
	}
}
|
138
|
+
|
139
|
+
/**
 * fread() wrapper that terminates the program on a short read.
 *
 * @param p       destination buffer, at least size * nitems bytes
 * @param size    size of one item in bytes
 * @param nitems  number of items to read
 * @param fp      input stream
 *
 * fread() returns the number of items read, never EOF. Because this
 * function returns nothing, a caller could not notice a partially filled
 * buffer, so any count smaller than nitems (I/O error or premature end of
 * file) is treated as fatal: an error is printed to stderr and the program
 * exits with EXIT_FAILURE.
 */
void mg_err_fread(void *p, size_t size, size_t nitems, FILE *fp)
{
	size_t n_read;

	// With a zero size or count fread() returns 0 and reads nothing;
	// that is not an error, so skip the call and the check.
	if (size == 0 || nitems == 0)
		return;

	n_read = fread(p, size, nitems, fp);
	if (n_read != nitems) {
		fprintf(stderr, "[ERROR] failed to read data\n");
		exit(EXIT_FAILURE);
	}
}
|
data/ext/minigraph/sys.h
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
#ifndef MG_SYS_H
#define MG_SYS_H

#include <stdio.h>

#ifdef __cplusplus
extern "C" {
#endif

/* I/O wrappers: each prints an error to stderr and exits on failure. */
void mg_err_fputs(const char *str, FILE *fp);
void mg_err_fwrite(const void *p, size_t size, size_t nitems, FILE *fp);
void mg_err_fread(void *p, size_t size, size_t nitems, FILE *fp);

double realtime(void); /* wall-clock time in seconds */
double cputime(void);  /* CPU time used by this process, in seconds */
long peakrss(void);    /* peak resident set size; 0 on Windows */

#ifdef __cplusplus
}
#endif

#endif
|