ruby-minigraph 0.0.20.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +62 -0
  4. data/ext/Rakefile +56 -0
  5. data/ext/cmappy/cmappy.c +7 -0
  6. data/ext/cmappy/cmappy.h +8 -0
  7. data/ext/minigraph/LICENSE.txt +23 -0
  8. data/ext/minigraph/Makefile +66 -0
  9. data/ext/minigraph/NEWS.md +317 -0
  10. data/ext/minigraph/README.md +207 -0
  11. data/ext/minigraph/algo.c +194 -0
  12. data/ext/minigraph/algo.h +33 -0
  13. data/ext/minigraph/asm-call.c +147 -0
  14. data/ext/minigraph/bseq.c +133 -0
  15. data/ext/minigraph/bseq.h +76 -0
  16. data/ext/minigraph/cal_cov.c +139 -0
  17. data/ext/minigraph/doc/example1.png +0 -0
  18. data/ext/minigraph/doc/example2.png +0 -0
  19. data/ext/minigraph/doc/examples.graffle +0 -0
  20. data/ext/minigraph/format.c +241 -0
  21. data/ext/minigraph/galign.c +140 -0
  22. data/ext/minigraph/gchain1.c +532 -0
  23. data/ext/minigraph/gcmisc.c +223 -0
  24. data/ext/minigraph/gfa-aug.c +260 -0
  25. data/ext/minigraph/gfa-base.c +526 -0
  26. data/ext/minigraph/gfa-bbl.c +372 -0
  27. data/ext/minigraph/gfa-ed.c +617 -0
  28. data/ext/minigraph/gfa-io.c +395 -0
  29. data/ext/minigraph/gfa-priv.h +154 -0
  30. data/ext/minigraph/gfa.h +166 -0
  31. data/ext/minigraph/ggen.c +182 -0
  32. data/ext/minigraph/ggen.h +21 -0
  33. data/ext/minigraph/ggsimple.c +570 -0
  34. data/ext/minigraph/gmap.c +211 -0
  35. data/ext/minigraph/index.c +230 -0
  36. data/ext/minigraph/kalloc.c +224 -0
  37. data/ext/minigraph/kalloc.h +82 -0
  38. data/ext/minigraph/kavl.h +414 -0
  39. data/ext/minigraph/kdq.h +134 -0
  40. data/ext/minigraph/ketopt.h +116 -0
  41. data/ext/minigraph/khashl.h +348 -0
  42. data/ext/minigraph/krmq.h +474 -0
  43. data/ext/minigraph/kseq.h +256 -0
  44. data/ext/minigraph/ksort.h +164 -0
  45. data/ext/minigraph/kstring.h +165 -0
  46. data/ext/minigraph/kthread.c +159 -0
  47. data/ext/minigraph/kthread.h +15 -0
  48. data/ext/minigraph/kvec-km.h +105 -0
  49. data/ext/minigraph/kvec.h +110 -0
  50. data/ext/minigraph/lchain.c +441 -0
  51. data/ext/minigraph/main.c +301 -0
  52. data/ext/minigraph/map-algo.c +500 -0
  53. data/ext/minigraph/mgpriv.h +128 -0
  54. data/ext/minigraph/minigraph.1 +359 -0
  55. data/ext/minigraph/minigraph.h +176 -0
  56. data/ext/minigraph/miniwfa.c +834 -0
  57. data/ext/minigraph/miniwfa.h +95 -0
  58. data/ext/minigraph/misc/mgutils.js +1451 -0
  59. data/ext/minigraph/misc.c +12 -0
  60. data/ext/minigraph/options.c +134 -0
  61. data/ext/minigraph/shortk.c +251 -0
  62. data/ext/minigraph/sketch.c +109 -0
  63. data/ext/minigraph/sys.c +147 -0
  64. data/ext/minigraph/sys.h +20 -0
  65. data/ext/minigraph/test/MT-chimp.fa +277 -0
  66. data/ext/minigraph/test/MT-human.fa +239 -0
  67. data/ext/minigraph/test/MT-orangA.fa +276 -0
  68. data/ext/minigraph/test/MT.gfa +19 -0
  69. data/ext/minigraph/tex/Makefile +13 -0
  70. data/ext/minigraph/tex/minigraph.bib +676 -0
  71. data/ext/minigraph/tex/minigraph.tex +986 -0
  72. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
  73. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
  74. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
  75. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
  76. data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
  77. data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
  78. data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
  79. data/ext/minigraph/tex/plots/bedutils.js +367 -0
  80. data/ext/minigraph/tex/plots/chr-plot.js +130 -0
  81. data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
  82. data/ext/minigraph.patch +21 -0
  83. data/lib/minigraph/ffi/constants.rb +230 -0
  84. data/lib/minigraph/ffi/functions.rb +70 -0
  85. data/lib/minigraph/ffi/mappy.rb +8 -0
  86. data/lib/minigraph/ffi.rb +27 -0
  87. data/lib/minigraph/version.rb +5 -0
  88. data/lib/minigraph.rb +72 -0
  89. metadata +159 -0
@@ -0,0 +1,12 @@
1
+ #include <stdlib.h>
2
+ #include "mgpriv.h"
3
+ #include "ksort.h"
4
+
5
+ int mg_verbose = 1; // message verbosity; code in this package prints errors at >= 1 and progress lines at >= 3
6
+ int mg_dbg_flag = 0; // bit mask tested against MG_DBG_* constants; 0 turns every debugging switch off
7
+ double mg_realtime0; // wall-clock reference subtracted from realtime() in log lines; zero here, presumably set at start-up - confirm in main.c
8
+
9
+ #define sort_key_128x(a) ((a).x) // radix-sort key: the x field of an element
10
+ KRADIX_SORT_INIT(128x, mg128_t, sort_key_128x, 8) // instantiate the ksort.h radix sort for mg128_t keyed on .x; the 8 is presumably the key size in bytes - confirm against ksort.h
11
+
12
+ KSORT_INIT_GENERIC(uint32_t) // instantiate the generic ksort.h routines for uint32_t
@@ -0,0 +1,134 @@
1
+ #include <string.h>
2
+ #include "mgpriv.h"
3
+ #include "sys.h"
4
+
5
+ void mg_idxopt_init(mg_idxopt_t *io)
6
+ {
7
+ memset(io, 0, sizeof(mg_idxopt_t));
8
+ io->k = 17;
9
+ io->w = 11;
10
+ io->bucket_bits = 14;
11
+ }
12
+
13
+ void mg_mapopt_init(mg_mapopt_t *mo)
14
+ {
15
+ memset(mo, 0, sizeof(mg_mapopt_t));
16
+ mo->seed = 11;
17
+ mo->occ_max1 = 50, mo->occ_max1_cap = 250;
18
+ mo->occ_max1_frac = 2e-4f;
19
+ mo->max_gap = 5000;
20
+ mo->max_gap_ref = -1;
21
+ mo->max_gap_pre = 1000;
22
+ mo->max_lc_skip = 25, mo->max_gc_skip = 25;
23
+ mo->max_lc_iter = 5000;
24
+ mo->bw = 500, mo->bw_long = 20000;
25
+ mo->rmq_size_cap = 100000;
26
+ mo->rmq_rescue_size = 1000;
27
+ mo->rmq_rescue_ratio = 0.1f;
28
+ mo->mini_batch_size = 500000000;
29
+ mo->div = 0.1f;
30
+ mo->chn_pen_gap = 1.0f, mo->chn_pen_skip = 0.05f;
31
+ mo->min_lc_cnt = 5, mo->min_lc_score = 40;
32
+ mo->min_gc_cnt = 5, mo->min_gc_score = 50;
33
+ mo->gdp_max_ed = 10000;
34
+ mo->lc_max_trim = 50;
35
+ mo->lc_max_occ = 2;
36
+ mo->mask_level = 0.5f;
37
+ mo->sub_diff = 6;
38
+ mo->best_n = 5;
39
+ mo->pri_ratio = 0.8f;
40
+ mo->ref_bonus = 0;
41
+ mo->pe_ori = 0; // FF
42
+ mo->min_cov_mapq = 20;
43
+ mo->min_cov_blen = 1000;
44
+ mo->cap_kalloc = 1000000000;
45
+ }
46
+
47
+ void mg_ggopt_init(mg_ggopt_t *go)
48
+ {
49
+ memset(go, 0, sizeof(mg_ggopt_t));
50
+ go->algo = MG_G_NONE;
51
+ go->flag |= MG_G_NO_QOVLP;
52
+ go->min_map_len = 100000;
53
+ go->min_depth_len = 20000;
54
+ go->min_mapq = 5;
55
+ go->min_var_len = 50;
56
+ go->match_pen = 10;
57
+ // for ggs
58
+ go->ggs_shrink_pen = 9;
59
+ go->ggs_min_end_cnt = 10;
60
+ go->ggs_min_end_frac = 0.1f;
61
+ go->ggs_max_iden = 0.80f;
62
+ go->ggs_min_inv_iden = 0.95f;
63
+ }
64
+
65
+ int mg_opt_set(const char *preset, mg_idxopt_t *io, mg_mapopt_t *mo, mg_ggopt_t *go)
66
+ {
67
+ if (preset == 0) {
68
+ mg_idxopt_init(io);
69
+ mg_mapopt_init(mo);
70
+ mg_ggopt_init(go);
71
+ } else if (strcmp(preset, "lr") == 0) { // this is the default
72
+ } else if (strcmp(preset, "asm") == 0 || strcmp(preset, "ggs") == 0) {
73
+ io->k = 19, io->w = 10;
74
+ mo->flag |= MG_M_RMQ;
75
+ mo->occ_max1 = 10, mo->occ_max1_cap = 100;
76
+ mo->bw = 1000, mo->bw_long = 150000;
77
+ mo->max_gap = 10000, mo->max_gap_pre = 1000;
78
+ mo->min_lc_cnt = 5, mo->min_lc_score = 40;
79
+ mo->min_gc_cnt = 5, mo->min_gc_score = 1000;
80
+ mo->min_cov_mapq = 5;
81
+ mo->min_cov_blen = 100000;
82
+ mo->max_lc_skip = mo->max_gc_skip = 50;
83
+ mo->div = 0.01f;
84
+ mo->mini_batch_size = 4000000000LL;
85
+ if (strcmp(preset, "ggs") == 0)
86
+ go->algo = MG_G_GGSIMPLE, mo->best_n = 0;
87
+ } else if (strcmp(preset, "se") == 0 || strcmp(preset, "sr") == 0) {
88
+ io->k = 21, io->w = 10;
89
+ mo->flag |= MG_M_SR | MG_M_HEAP_SORT | MG_M_2_IO_THREADS;
90
+ mo->occ_max1 = 1000;
91
+ mo->occ_max1_cap = 2500;
92
+ mo->max_gap = 100;
93
+ mo->bw = mo->bw_long = 100;
94
+ mo->max_frag_len = 800;
95
+ mo->pri_ratio = 0.5f;
96
+ mo->min_lc_cnt = 2, mo->min_lc_score = 25;
97
+ mo->min_gc_cnt = 3, mo->min_gc_score = 40;
98
+ mo->mini_batch_size = 50000000;
99
+ mo->min_cov_blen = 50;
100
+ mo->chn_pen_gap = 0.2f;
101
+ mo->ref_bonus = 1;
102
+ if (strcmp(preset, "sr") == 0) {
103
+ mo->flag |= MG_M_FRAG_MODE | MG_M_FRAG_MERGE;
104
+ mo->pe_ori = 0<<1|1; // FR
105
+ }
106
+ } else return -1;
107
+ return 0;
108
+ }
109
+
110
+ int mg_opt_check(const mg_idxopt_t *io, const mg_mapopt_t *mo, const mg_ggopt_t *go)
111
+ {
112
+ if ((mo->flag & MG_M_FRAG_MODE) && !(mo->flag & MG_M_FRAG_MERGE)) {
113
+ if (mg_verbose >= 1)
114
+ fprintf(stderr, "[ERROR]\033[1;31m the fragment-without-merge mode is not implemented\033[0m\n");
115
+ return -1;
116
+ }
117
+ return 0;
118
+ }
119
+
120
+ void mg_opt_update(const mg_idx_t *gi, mg_mapopt_t *mo, mg_ggopt_t *go)
121
+ {
122
+ float f[2];
123
+ int32_t q[2];
124
+ f[0] = 0.1f, f[1] = mo->occ_max1_frac;
125
+ mg_idx_cal_quantile(gi, 2, f, q);
126
+ if (q[0] > mo->lc_max_occ) mo->lc_max_occ = q[0];
127
+ if (mo->lc_max_occ > mo->occ_max1_cap) mo->lc_max_occ = mo->occ_max1_cap;
128
+ if (q[1] > mo->occ_max1) mo->occ_max1 = q[1];
129
+ if (mo->occ_max1 > mo->occ_max1_cap) mo->occ_max1 = mo->occ_max1_cap;
130
+ if (mo->bw_long < mo->bw) mo->bw_long = mo->bw;
131
+ if (mg_verbose >= 3)
132
+ fprintf(stderr, "[M::%s::%.3f*%.2f] occ_max1=%d; lc_max_occ=%d\n", __func__,
133
+ realtime() - mg_realtime0, cputime() / (realtime() - mg_realtime0), mo->occ_max1, mo->lc_max_occ);
134
+ }
@@ -0,0 +1,251 @@
1
+ #include "mgpriv.h"
2
+ #include "ksort.h"
3
+ #include "kavl.h"
4
+ #include "algo.h"
5
+ #include "khashl.h"
6
+
7
+ typedef struct sp_node_s { // one search node: a path from the source that ends at vertex v
8
+ uint64_t di; // dist<<32 | unique_id while queued; once popped, the low 32 bits are overwritten with the node's index in out[]
9
+ uint32_t v; // vertex at which this path ends
10
+ int32_t pre; // index in out[] of the predecessor node; -1 for the source node
11
+ uint32_t hash; // sum of kh_hash_uint32() over the vertices on the path; set by the caller, not by gen_sp_node()
12
+ int32_t is_0; // starts at 1 and is cleared as soon as an arc with rank > 0 is taken
13
+ KAVL_HEAD(struct sp_node_s) head; // linkage for the AVL tree instantiated with KAVL_INIT(sp, ...)
14
+ } sp_node_t, *sp_node_p;
15
+
16
+ #define sp_node_cmp(a, b) (((a)->di > (b)->di) - ((a)->di < (b)->di)) // three-way comparison on di: yields -1, 0 or 1
17
+ KAVL_INIT(sp, sp_node_t, head, sp_node_cmp) // AVL tree of nodes ordered by di; backs the kavl_insert/kavl_erase/kavl_erase_first(sp, ...) calls
18
+
19
+ #define sp_node_lt(a, b) ((a)->di < (b)->di) // strict "less than" on di for the heap routines
20
+ KSORT_INIT(sp, sp_node_p, sp_node_lt) // provides ks_heapup_sp() and ks_heapdown_sp() over arrays of node pointers
21
+
22
+ typedef struct { // per-vertex record of the paths currently kept for that vertex
23
+ int32_t k; // number of entries in use in p[]
24
+ int32_t qs, qe; // set to -1 when the record is created; not read anywhere in this file
25
+ sp_node_t *p[MG_MAX_SHORT_K]; // this forms a max-heap on di, so p[0] is the longest path kept
26
+ } sp_topk_t;
27
+
28
+ KHASHL_MAP_INIT(KH_LOCAL, kh_sp_t, sp, uint32_t, sp_topk_t, kh_hash_uint32, kh_eq_generic) // vertex -> record of kept paths (the visited set)
29
+ KHASHL_MAP_INIT(KH_LOCAL, kh_sp2_t, sp2, uint32_t, uint64_t, kh_hash_uint32, kh_eq_generic) // destination vertex -> offset<<32 | count into the sorted dst_group[] array
30
+
31
+ #define MG_SHORT_K_EXT 1000 // a destination is marked done once a recorded path exceeds its target_dist by more than this
32
+
33
+ static inline sp_node_t *gen_sp_node(void *km, const gfa_t *g, uint32_t v, int32_t d, int32_t id)
34
+ {
35
+ sp_node_t *p;
36
+ KMALLOC(km, p, 1);
37
+ p->v = v, p->di = (uint64_t)d<<32 | id, p->pre = -1, p->is_0 = 1;
38
+ return p;
39
+ }
40
+
41
+ mg_pathv_t *mg_shortest_k(void *km0, const gfa_t *g, uint32_t src, int32_t n_dst, mg_path_dst_t *dst, int32_t max_dist, int32_t max_k, int32_t *n_pathv)
42
+ {
43
+ sp_node_t *p, *root = 0, **out;
44
+ sp_topk_t *q;
45
+ kh_sp_t *h;
46
+ kh_sp2_t *h2;
47
+ void *km;
48
+ khint_t k;
49
+ int absent;
50
+ int32_t i, j, n_done, n_found;
51
+ uint32_t id, n_out, m_out;
52
+ int8_t *dst_done;
53
+ mg_pathv_t *ret = 0;
54
+ uint64_t *dst_group, *seeds = 0;
55
+ void *h_seeds = 0;
56
+ mg128_v mini = {0,0,0};
57
+
58
+ if (n_pathv) *n_pathv = 0;
59
+ if (n_dst <= 0) return 0;
60
+ for (i = 0; i < n_dst; ++i) { // initialize
61
+ mg_path_dst_t *t = &dst[i];
62
+ if (t->inner)
63
+ t->dist = 0, t->n_path = 1, t->path_end = -1;
64
+ else
65
+ t->dist = -1, t->n_path = 0, t->path_end = -1;
66
+ }
67
+ if (max_k > MG_MAX_SHORT_K) max_k = MG_MAX_SHORT_K;
68
+ km = (mg_dbg_flag&MG_DBG_NO_KALLOC) && (mg_dbg_flag&MG_DBG_SHORTK)? 0 : km_init2(km0, 0x4000);
69
+
70
+ KCALLOC(km, dst_done, n_dst);
71
+ KMALLOC(km, dst_group, n_dst);
72
+ for (i = 0; i < n_dst; ++i) // multiple dst[] may have the same dst[].v. We need to group them first.
73
+ dst_group[i] = (uint64_t)dst[i].v<<32 | i;
74
+ radix_sort_gfa64(dst_group, dst_group + n_dst);
75
+
76
+ h2 = sp2_init2(km); // this hash table keeps all destinations
77
+ sp2_resize(h2, n_dst * 2);
78
+ for (i = 1, j = 0; i <= n_dst; ++i) {
79
+ if (i == n_dst || dst_group[i]>>32 != dst_group[j]>>32) {
80
+ k = sp2_put(h2, dst_group[j]>>32, &absent);
81
+ kh_val(h2, k) = (uint64_t)j << 32 | (i - j);
82
+ assert(absent);
83
+ j = i;
84
+ }
85
+ }
86
+
87
+ h = sp_init2(km); // this hash table keeps visited vertices
88
+ sp_resize(h, 16);
89
+ m_out = 16, n_out = 0;
90
+ KMALLOC(km, out, m_out);
91
+
92
+ id = 0;
93
+ p = gen_sp_node(km, g, src, 0, id++);
94
+ p->hash = kh_hash_uint32(src);
95
+ kavl_insert(sp, &root, p, 0);
96
+ k = sp_put(h, src, &absent);
97
+ q = &kh_val(h, k);
98
+ q->k = 1, q->p[0] = p, q->qs = q->qe = -1;
99
+
100
+ n_done = 0;
101
+ while (kavl_size(head, root) > 0) {
102
+ int32_t i, nv;
103
+ gfa_arc_t *av;
104
+ sp_node_t *r;
105
+
106
+ r = kavl_erase_first(sp, &root); // take out the closest vertex in the heap (as a binary tree)
107
+ //fprintf(stderr, "XX\t%d\t%d\t%d\t%c%s[%d]\t%d\n", n_out, kavl_size(head, root), n_finished, "><"[(r->v&1)^1], g->seg[r->v>>1].name, r->v, (int32_t)(r->di>>32));
108
+ if (n_out == m_out) KEXPAND(km, out, m_out);
109
+ r->di = r->di>>32<<32 | n_out; // lower 32 bits now for position in the out[] array
110
+ out[n_out++] = r;
111
+
112
+ k = sp2_get(h2, r->v);
113
+ if (k != kh_end(h2)) { // we have reached one dst vertex
114
+ int32_t j, dist = r->di>>32, off = kh_val(h2, k) >> 32, cnt = (int32_t)kh_val(h2, k);
115
+ for (j = 0; j < cnt; ++j) {
116
+ mg_path_dst_t *t = &dst[(int32_t)dst_group[off + j]];
117
+ int32_t done = 0;
118
+ if (t->inner) {
119
+ done = 1;
120
+ } else {
121
+ int32_t copy = 0;
122
+ //if (mg_dbg_flag & MG_DBG_GC1) fprintf(stderr, " src=%c%s[%d],qlen=%d\tdst=%c%s[%d]\ttarget_distx=%d,target_hash=%x\tdistx=%d,hash=%x\n", "><"[src&1], g->seg[src>>1].name, src, ql, "><"[t->v&1], g->seg[t->v>>1].name, t->v, t->target_dist - g->seg[src>>1].len, t->target_hash, dist - g->seg[src>>1].len, r->hash);
123
+ if (t->n_path == 0) { // keep the shortest path
124
+ copy = 1;
125
+ } else if (t->target_dist >= 0) { // we have a target distance; choose the closest
126
+ if (dist == t->target_dist && t->check_hash && r->hash == t->target_hash) { // we found the target path
127
+ copy = 1, done = 1;
128
+ } else {
129
+ int32_t d0 = t->dist, d1 = dist;
130
+ d0 = d0 > t->target_dist? d0 - t->target_dist : t->target_dist - d0;
131
+ d1 = d1 > t->target_dist? d1 - t->target_dist : t->target_dist - d1;
132
+ if (d1 < d0) copy = 1;
133
+ }
134
+ }
135
+ if (copy) {
136
+ t->path_end = n_out - 1, t->dist = dist, t->hash = r->hash, t->is_0 = r->is_0;
137
+ if (t->target_dist >= 0) {
138
+ if (dist == t->target_dist && t->check_hash && r->hash == t->target_hash) done = 1;
139
+ else if (dist > t->target_dist + MG_SHORT_K_EXT) done = 1;
140
+ }
141
+ }
142
+ ++t->n_path;
143
+ if (t->n_path >= max_k) done = 1;
144
+ }
145
+ if (dst_done[off + j] == 0 && done)
146
+ dst_done[off + j] = 1, ++n_done;
147
+ }
148
+ if (n_done == n_dst) break;
149
+ }
150
+
151
+ nv = gfa_arc_n(g, r->v);
152
+ av = gfa_arc_a(g, r->v);
153
+ for (i = 0; i < nv; ++i) { // visit all neighbors
154
+ gfa_arc_t *ai = &av[i];
155
+ int32_t d = (r->di>>32) + (uint32_t)ai->v_lv;
156
+ if (d > max_dist) continue; // don't probe vertices too far away
157
+ k = sp_put(h, ai->w, &absent);
158
+ q = &kh_val(h, k);
159
+ if (absent) { // a new vertex visited
160
+ q->k = 0, q->qs = q->qe = -1;
161
+ //if (ql && qs) fprintf(stderr, "ql=%d,src=%d\tv=%c%s[%d]\n", ql, src, "><"[ai->w&1], g->seg[ai->w>>1].name, ai->w);
162
+ }
163
+ if (q->k < max_k) { // enough room: add to the heap
164
+ p = gen_sp_node(km, g, ai->w, d, id++);
165
+ p->pre = n_out - 1;
166
+ p->hash = r->hash + kh_hash_uint32(ai->w);
167
+ p->is_0 = r->is_0;
168
+ if (ai->rank > 0) p->is_0 = 0;
169
+ kavl_insert(sp, &root, p, 0);
170
+ q->p[q->k++] = p;
171
+ ks_heapup_sp(q->k, q->p);
172
+ } else if (q->p[0]->di>>32 > d) { // shorter than the longest path so far: replace the longest
173
+ p = kavl_erase(sp, &root, q->p[0], 0);
174
+ if (p) {
175
+ p->di = (uint64_t)d<<32 | (id++);
176
+ p->pre = n_out - 1;
177
+ p->hash = r->hash + kh_hash_uint32(ai->w);
178
+ p->is_0 = r->is_0;
179
+ if (ai->rank > 0) p->is_0 = 0;
180
+ kavl_insert(sp, &root, p, 0);
181
+ ks_heapdown_sp(0, q->k, q->p);
182
+ } else {
183
+ fprintf(stderr, "Warning: logical bug in gfa_shortest_k(): q->k=%d,q->p[0]->{d,i}={%d,%d},d=%d,src=%u,max_dist=%d,n_dst=%d\n", q->k, (int32_t)(q->p[0]->di>>32), (int32_t)q->p[0]->di, d, src, max_dist, n_dst);
184
+ km_destroy(km);
185
+ return 0;
186
+ }
187
+ } // else: the path is longer than all the existing paths ended at ai->w
188
+ }
189
+ }
190
+
191
+ kfree(km, dst_group);
192
+ kfree(km, dst_done);
193
+ sp_destroy(h);
194
+ mg_idx_hfree(h_seeds);
195
+ kfree(km, seeds);
196
+ kfree(km, mini.a);
197
+ // NB: AVL nodes are not deallocated. When km==0, they are memory leaks.
198
+
199
+ for (i = 0, n_found = 0; i < n_dst; ++i)
200
+ if (dst[i].n_path > 0) ++n_found;
201
+
202
+ if (n_found > 0 && n_pathv) { // then generate the backtrack array
203
+ int32_t n, *trans;
204
+ KCALLOC(km, trans, n_out); // used to squeeze unused elements in out[]
205
+ for (i = 0; i < n_dst; ++i) { // mark dst vertices with a target distance
206
+ mg_path_dst_t *t = &dst[i];
207
+ if (t->n_path > 0 && t->target_dist >= 0 && t->path_end >= 0)
208
+ trans[(int32_t)out[t->path_end]->di] = 1;
209
+ }
210
+ for (i = 0; i < n_out; ++i) { // mark dst vertices without a target distance
211
+ k = sp2_get(h2, out[i]->v);
212
+ if (k != kh_end(h2)) { // TODO: check if this is correct!
213
+ int32_t off = kh_val(h2, k)>>32, cnt = (int32_t)kh_val(h2, k);
214
+ for (j = off; j < off + cnt; ++j)
215
+ if (dst[j].target_dist < 0)
216
+ trans[i] = 1;
217
+ }
218
+ }
219
+ for (i = n_out - 1; i >= 0; --i) // mark all predecessors
220
+ if (trans[i] && out[i]->pre >= 0)
221
+ trans[out[i]->pre] = 1;
222
+ for (i = n = 0; i < n_out; ++i) // generate coordinate translations
223
+ if (trans[i]) trans[i] = n++;
224
+ else trans[i] = -1;
225
+
226
+ *n_pathv = n;
227
+ KMALLOC(km0, ret, n);
228
+ for (i = 0; i < n_out; ++i) { // generate the backtrack array
229
+ mg_pathv_t *p;
230
+ if (trans[i] < 0) continue;
231
+ p = &ret[trans[i]];
232
+ p->v = out[i]->v, p->d = out[i]->di >> 32;
233
+ p->pre = out[i]->pre < 0? out[i]->pre : trans[out[i]->pre];
234
+ }
235
+ for (i = 0; i < n_dst; ++i) // translate "path_end"
236
+ if (dst[i].path_end >= 0)
237
+ dst[i].path_end = trans[dst[i].path_end];
238
+ }
239
+
240
+ km_destroy(km);
241
+ return ret;
242
+ }
243
+
244
+ void mg_sub_print_path(FILE *fp, const gfa_t *g, int32_t n, mg_pathv_t *path)
245
+ {
246
+ int32_t i;
247
+ for (i = 0; i < n; ++i) {
248
+ mg_pathv_t *p = &path[i];
249
+ fprintf(fp, "[%d]\t%d\t%s\t%d\t%d\n", i, p->v, g->seg[p->v>>1].name, p->d, p->pre);
250
+ }
251
+ }
@@ -0,0 +1,109 @@
1
+ #include <stdio.h>
2
+ #include <stdlib.h>
3
+ #include <assert.h>
4
+ #include <string.h>
5
+ #define __STDC_LIMIT_MACROS
6
+ #include "kvec-km.h"
7
+ #include "mgpriv.h"
8
+
9
+ unsigned char seq_nt4_table[256] = { // byte -> base code: A=0, C=1, G=2, T/U=3 in either case; bytes 0-3 map to themselves; every other byte maps to 4
10
+ 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 0x00-0x0f: values 0..3 pass through unchanged
11
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
12
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
13
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
14
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, // 0x40-0x4f: 'A'=0, 'C'=1, 'G'=2
15
+ 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 0x50-0x5f: 'T'=3, 'U'=3
16
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, // 0x60-0x6f: 'a'=0, 'c'=1, 'g'=2
17
+ 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 0x70-0x7f: 't'=3, 'u'=3
18
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
19
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
20
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
21
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
22
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
23
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
24
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
25
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
26
+ };
27
+
28
/*
 * Mix the bits of key and keep only those selected by mask. The value is
 * masked after every additive step; all arithmetic is unsigned and wraps
 * modulo 2^64.
 */
static inline uint64_t hash64(uint64_t key, uint64_t mask)
{
	uint64_t x = key;

	x = ((x << 21) - x - 1) & mask; // same as ~x + (x << 21)
	x ^= x >> 24;
	x = (x * 265) & mask;           // x + (x << 3) + (x << 8)
	x ^= x >> 14;
	x = (x * 21) & mask;            // x + (x << 2) + (x << 4)
	x ^= x >> 28;
	x = (x + (x << 31)) & mask;
	return x;
}
39
+
40
+ /**
41
+ * Find symmetric (w,k)-minimizers on a DNA sequence
42
+ *
43
+ * @param km thread-local memory pool; using NULL falls back to malloc()
44
+ * @param str DNA sequence
45
+ * @param len length of $str
46
+ * @param w find a minimizer for every $w consecutive k-mers
47
+ * @param k k-mer size
48
+ * @param rid reference ID; will be copied to the output $p array
49
+ * @param p minimizers
50
+ * p->a[i].x = kMer<<8 | kmerSpan
51
+ * p->a[i].y = rid<<32 | lastPos<<1 | strand
52
+ * where lastPos is the position of the last base of the i-th minimizer,
53
+ * and strand indicates whether the minimizer comes from the top or the bottom strand.
54
+ * Callers may want to set "p->n = 0"; otherwise results are appended to p
55
+ */
56
+ void mg_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, mg128_v *p)
57
+ {
58
+ uint64_t shift1 = 2 * (k - 1), mask = (1ULL<<2*k) - 1, kmer[2] = {0,0};
59
+ int i, j, l, buf_pos, min_pos, kmer_span = 0;
60
+ mg128_t buf[256], min = { UINT64_MAX, UINT64_MAX };
61
+
62
+ assert(len > 0 && (w > 0 && w < 256) && (k > 0 && k <= 28)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice
63
+ memset(buf, 0xff, w * 16);
64
+ kv_resize(mg128_t, km, *p, p->n + len/w);
65
+
66
+ for (i = l = buf_pos = min_pos = 0; i < len; ++i) {
67
+ int c = seq_nt4_table[(uint8_t)str[i]];
68
+ mg128_t info = { UINT64_MAX, UINT64_MAX };
69
+ if (c < 4) { // not an ambiguous base
70
+ int z;
71
+ kmer_span = l + 1 < k? l + 1 : k;
72
+ kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer
73
+ kmer[1] = (kmer[1] >> 2) | (3ULL^c) << shift1; // reverse k-mer
74
+ if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know it strand
75
+ z = kmer[0] < kmer[1]? 0 : 1; // strand
76
+ ++l;
77
+ if (l >= k && kmer_span < 256) {
78
+ info.x = hash64(kmer[z], mask) << 8 | kmer_span;
79
+ info.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z;
80
+ }
81
+ } else l = 0, kmer_span = 0;
82
+ buf[buf_pos] = info; // need to do this here as appropriate buf_pos and buf[buf_pos] are needed below
83
+ if (l == w + k - 1 && min.x != UINT64_MAX) { // special case for the first window - because identical k-mers are not stored yet
84
+ for (j = buf_pos + 1; j < w; ++j)
85
+ if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mg128_t, km, *p, buf[j]);
86
+ for (j = 0; j < buf_pos; ++j)
87
+ if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mg128_t, km, *p, buf[j]);
88
+ }
89
+ if (info.x <= min.x) { // a new minimum; then write the old min
90
+ if (l >= w + k && min.x != UINT64_MAX) kv_push(mg128_t, km, *p, min);
91
+ min = info, min_pos = buf_pos;
92
+ } else if (buf_pos == min_pos) { // old min has moved outside the window
93
+ if (l >= w + k - 1 && min.x != UINT64_MAX) kv_push(mg128_t, km, *p, min);
94
+ for (j = buf_pos + 1, min.x = UINT64_MAX; j < w; ++j) // the two loops are necessary when there are identical k-mers
95
+ if (min.x >= buf[j].x) min = buf[j], min_pos = j; // >= is important s.t. min is always the closest k-mer
96
+ for (j = 0; j <= buf_pos; ++j)
97
+ if (min.x >= buf[j].x) min = buf[j], min_pos = j;
98
+ if (l >= w + k - 1 && min.x != UINT64_MAX) { // write identical k-mers
99
+ for (j = buf_pos + 1; j < w; ++j) // these two loops make sure the output is sorted
100
+ if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mg128_t, km, *p, buf[j]);
101
+ for (j = 0; j <= buf_pos; ++j)
102
+ if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mg128_t, km, *p, buf[j]);
103
+ }
104
+ }
105
+ if (++buf_pos == w) buf_pos = 0;
106
+ }
107
+ if (min.x != UINT64_MAX)
108
+ kv_push(mg128_t, km, *p, min);
109
+ }
@@ -0,0 +1,147 @@
1
+ #include <stdlib.h>
2
+ #include "sys.h"
3
+
4
+ #if defined(WIN32) || defined(_WIN32)
5
+ #include <windows.h>
6
+
7
+ struct timezone // minimal definition so the gettimeofday() replacement in this file has a type for its second parameter, which it ignores
8
+ {
9
+ __int32 tz_minuteswest; /* minutes W of Greenwich */
10
+ int tz_dsttime; /* type of dst correction */
11
+ };
12
+
13
+ /*
14
+ * gettimeofday.c
15
+ * Win32 gettimeofday() replacement
16
+ * taken from PostgreSQL, according to
17
+ * https://stackoverflow.com/questions/1676036/what-should-i-use-to-replace-gettimeofday-on-windows
18
+ *
19
+ * src/port/gettimeofday.c
20
+ *
21
+ * Copyright (c) 2003 SRA, Inc.
22
+ * Copyright (c) 2003 SKC, Inc.
23
+ *
24
+ * Permission to use, copy, modify, and distribute this software and
25
+ * its documentation for any purpose, without fee, and without a
26
+ * written agreement is hereby granted, provided that the above
27
+ * copyright notice and this paragraph and the following two
28
+ * paragraphs appear in all copies.
29
+ *
30
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE TO ANY PARTY FOR DIRECT,
31
+ * INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
32
+ * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
33
+ * DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED
34
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
35
+ *
36
+ * THE AUTHOR SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT
37
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
38
+ * A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS
39
+ * IS" BASIS, AND THE AUTHOR HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE,
40
+ * SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
41
+ */
42
+
43
+ /* FILETIME of Jan 1 1970 00:00:00. */
44
+ static const unsigned __int64 epoch = ((unsigned __int64) 116444736000000000ULL);
45
+
46
+ /*
47
+ * timezone information is stored outside the kernel so tzp isn't used anymore.
48
+ *
49
+ * Note: this function is not for Win32 high precision timing purpose. See
50
+ * elapsed_time().
51
+ */
52
+ int gettimeofday(struct timeval * tp, struct timezone *tzp)
53
+ {
54
+ FILETIME file_time;
55
+ SYSTEMTIME system_time;
56
+ ULARGE_INTEGER ularge;
57
+
58
+ GetSystemTime(&system_time);
59
+ SystemTimeToFileTime(&system_time, &file_time);
60
+ ularge.LowPart = file_time.dwLowDateTime;
61
+ ularge.HighPart = file_time.dwHighDateTime;
62
+
63
+ tp->tv_sec = (long) ((ularge.QuadPart - epoch) / 10000000L);
64
+ tp->tv_usec = (long) (system_time.wMilliseconds * 1000);
65
+
66
+ return 0;
67
+ }
68
+
69
+ // taken from https://stackoverflow.com/questions/5272470/c-get-cpu-usage-on-linux-and-windows
70
+ double cputime()
71
+ {
72
+ HANDLE hProcess = GetCurrentProcess();
73
+ FILETIME ftCreation, ftExit, ftKernel, ftUser;
74
+ SYSTEMTIME stKernel;
75
+ SYSTEMTIME stUser;
76
+
77
+ GetProcessTimes(hProcess, &ftCreation, &ftExit, &ftKernel, &ftUser);
78
+ FileTimeToSystemTime(&ftKernel, &stKernel);
79
+ FileTimeToSystemTime(&ftUser, &stUser);
80
+
81
+ double kernelModeTime = ((stKernel.wHour * 60.) + stKernel.wMinute * 60.) + stKernel.wSecond * 1. + stKernel.wMilliseconds / 1000.;
82
+ double userModeTime = ((stUser.wHour * 60.) + stUser.wMinute * 60.) + stUser.wSecond * 1. + stUser.wMilliseconds / 1000.;
83
+
84
+ return kernelModeTime + userModeTime;
85
+ }
86
+
87
/* Peak resident set size is not measured in the Windows build; always 0. */
long peakrss(void)
{
	return 0L;
}
88
+ #else
89
+ #include <sys/resource.h>
90
+ #include <sys/time.h>
91
+
92
/*
 * CPU time (user + system) consumed by the calling process so far, in
 * seconds, as reported by getrusage(RUSAGE_SELF).
 */
double cputime(void)
{
	struct rusage usage;
	double whole, micro;

	getrusage(RUSAGE_SELF, &usage);
	whole = usage.ru_utime.tv_sec + usage.ru_stime.tv_sec;
	micro = usage.ru_utime.tv_usec + usage.ru_stime.tv_usec;
	return whole + 1e-6 * micro;
}
98
+
99
/*
 * Peak resident set size of the calling process, taken from ru_maxrss.
 * On Linux the value is multiplied by 1024 (ru_maxrss is in kilobytes
 * there); on other systems it is returned unchanged.
 */
long peakrss(void)
{
	struct rusage usage;
	long peak;

	getrusage(RUSAGE_SELF, &usage);
	peak = usage.ru_maxrss;
#ifdef __linux__
	peak *= 1024;
#endif
	return peak;
}
109
+
110
+ #endif /* WIN32 || _WIN32 */
111
+
112
/*
 * Current wall-clock time in seconds, with the microsecond part of
 * gettimeofday() folded in as a fraction.
 */
double realtime(void)
{
	struct timeval now;
	double frac;

	gettimeofday(&now, NULL);
	frac = now.tv_usec * 1e-6;
	return now.tv_sec + frac;
}
118
+
119
/*
 * fputs() wrapper that treats a write failure as fatal: when fputs()
 * reports EOF, a message is printed to stderr and the process exits with
 * EXIT_FAILURE.
 */
void mg_err_fputs(const char *str, FILE *fp)
{
	if (fputs(str, fp) != EOF)
		return;
	fprintf(stderr, "[ERROR] failed to write the results\n");
	exit(EXIT_FAILURE);
}
128
+
129
/*
 * fwrite() wrapper that treats an incomplete write as fatal.
 *
 * fwrite() returns the number of items written, never EOF, so the result
 * has to be compared with nitems to notice a failure. On a short write a
 * message is printed to stderr and the process exits with EXIT_FAILURE.
 * A request for zero bytes (size or nitems is 0) is a no-op.
 */
void mg_err_fwrite(const void *p, size_t size, size_t nitems, FILE *fp)
{
	size_t n_written;

	if (size == 0 || nitems == 0)
		return; // fwrite() would return 0 here without this being an error
	n_written = fwrite(p, size, nitems, fp);
	if (n_written != nitems) {
		fprintf(stderr, "[ERROR] failed to write data\n");
		exit(EXIT_FAILURE);
	}
}
138
+
139
/*
 * fread() wrapper that treats an incomplete read as fatal.
 *
 * fread() returns the number of items read, never EOF, so the result has
 * to be compared with nitems; otherwise a truncated input would leave part
 * of the buffer unfilled without any diagnostic. On a short read a message
 * is printed to stderr and the process exits with EXIT_FAILURE. A request
 * for zero bytes (size or nitems is 0) is a no-op.
 */
void mg_err_fread(void *p, size_t size, size_t nitems, FILE *fp)
{
	size_t n_read;

	if (size == 0 || nitems == 0)
		return; // fread() would return 0 here without this being an error
	n_read = fread(p, size, nitems, fp);
	if (n_read != nitems) {
		fprintf(stderr, "[ERROR] failed to read data\n");
		exit(EXIT_FAILURE);
	}
}
@@ -0,0 +1,20 @@
1
#ifndef MG_SYS_H
#define MG_SYS_H

#include <stdio.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Checked stdio wrappers: each prints a message to stderr and terminates
 * the process with EXIT_FAILURE when the underlying call fails.
 */
void mg_err_fputs(const char *str, FILE *fp);
void mg_err_fwrite(const void *p, size_t size, size_t nitems, FILE *fp);
void mg_err_fread(void *p, size_t size, size_t nitems, FILE *fp);

double realtime(void); /* wall-clock time in seconds */
double cputime(void);  /* CPU time used by this process, in seconds */
long peakrss(void);    /* peak resident set size; 0 on Windows */

#ifdef __cplusplus
}
#endif

#endif