ruby-minigraph 0.0.20.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +62 -0
  4. data/ext/Rakefile +56 -0
  5. data/ext/cmappy/cmappy.c +7 -0
  6. data/ext/cmappy/cmappy.h +8 -0
  7. data/ext/minigraph/LICENSE.txt +23 -0
  8. data/ext/minigraph/Makefile +66 -0
  9. data/ext/minigraph/NEWS.md +317 -0
  10. data/ext/minigraph/README.md +207 -0
  11. data/ext/minigraph/algo.c +194 -0
  12. data/ext/minigraph/algo.h +33 -0
  13. data/ext/minigraph/asm-call.c +147 -0
  14. data/ext/minigraph/bseq.c +133 -0
  15. data/ext/minigraph/bseq.h +76 -0
  16. data/ext/minigraph/cal_cov.c +139 -0
  17. data/ext/minigraph/doc/example1.png +0 -0
  18. data/ext/minigraph/doc/example2.png +0 -0
  19. data/ext/minigraph/doc/examples.graffle +0 -0
  20. data/ext/minigraph/format.c +241 -0
  21. data/ext/minigraph/galign.c +140 -0
  22. data/ext/minigraph/gchain1.c +532 -0
  23. data/ext/minigraph/gcmisc.c +223 -0
  24. data/ext/minigraph/gfa-aug.c +260 -0
  25. data/ext/minigraph/gfa-base.c +526 -0
  26. data/ext/minigraph/gfa-bbl.c +372 -0
  27. data/ext/minigraph/gfa-ed.c +617 -0
  28. data/ext/minigraph/gfa-io.c +395 -0
  29. data/ext/minigraph/gfa-priv.h +154 -0
  30. data/ext/minigraph/gfa.h +166 -0
  31. data/ext/minigraph/ggen.c +182 -0
  32. data/ext/minigraph/ggen.h +21 -0
  33. data/ext/minigraph/ggsimple.c +570 -0
  34. data/ext/minigraph/gmap.c +211 -0
  35. data/ext/minigraph/index.c +230 -0
  36. data/ext/minigraph/kalloc.c +224 -0
  37. data/ext/minigraph/kalloc.h +82 -0
  38. data/ext/minigraph/kavl.h +414 -0
  39. data/ext/minigraph/kdq.h +134 -0
  40. data/ext/minigraph/ketopt.h +116 -0
  41. data/ext/minigraph/khashl.h +348 -0
  42. data/ext/minigraph/krmq.h +474 -0
  43. data/ext/minigraph/kseq.h +256 -0
  44. data/ext/minigraph/ksort.h +164 -0
  45. data/ext/minigraph/kstring.h +165 -0
  46. data/ext/minigraph/kthread.c +159 -0
  47. data/ext/minigraph/kthread.h +15 -0
  48. data/ext/minigraph/kvec-km.h +105 -0
  49. data/ext/minigraph/kvec.h +110 -0
  50. data/ext/minigraph/lchain.c +441 -0
  51. data/ext/minigraph/main.c +301 -0
  52. data/ext/minigraph/map-algo.c +500 -0
  53. data/ext/minigraph/mgpriv.h +128 -0
  54. data/ext/minigraph/minigraph.1 +359 -0
  55. data/ext/minigraph/minigraph.h +176 -0
  56. data/ext/minigraph/miniwfa.c +834 -0
  57. data/ext/minigraph/miniwfa.h +95 -0
  58. data/ext/minigraph/misc/mgutils.js +1451 -0
  59. data/ext/minigraph/misc.c +12 -0
  60. data/ext/minigraph/options.c +134 -0
  61. data/ext/minigraph/shortk.c +251 -0
  62. data/ext/minigraph/sketch.c +109 -0
  63. data/ext/minigraph/sys.c +147 -0
  64. data/ext/minigraph/sys.h +20 -0
  65. data/ext/minigraph/test/MT-chimp.fa +277 -0
  66. data/ext/minigraph/test/MT-human.fa +239 -0
  67. data/ext/minigraph/test/MT-orangA.fa +276 -0
  68. data/ext/minigraph/test/MT.gfa +19 -0
  69. data/ext/minigraph/tex/Makefile +13 -0
  70. data/ext/minigraph/tex/minigraph.bib +676 -0
  71. data/ext/minigraph/tex/minigraph.tex +986 -0
  72. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
  73. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
  74. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
  75. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
  76. data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
  77. data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
  78. data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
  79. data/ext/minigraph/tex/plots/bedutils.js +367 -0
  80. data/ext/minigraph/tex/plots/chr-plot.js +130 -0
  81. data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
  82. data/ext/minigraph.patch +21 -0
  83. data/lib/minigraph/ffi/constants.rb +230 -0
  84. data/lib/minigraph/ffi/functions.rb +70 -0
  85. data/lib/minigraph/ffi/mappy.rb +8 -0
  86. data/lib/minigraph/ffi.rb +27 -0
  87. data/lib/minigraph/version.rb +5 -0
  88. data/lib/minigraph.rb +72 -0
  89. metadata +159 -0
@@ -0,0 +1,140 @@
1
+ #include <assert.h>
2
+ #include <string.h>
3
+ #include "mgpriv.h"
4
+ #include "kalloc.h"
5
+ #include "miniwfa.h"
6
+
7
+ static void append_cigar1(void *km, mg64_v *c, int32_t op, int32_t len)
8
+ {
9
+ if (c->n > 0 && (c->a[c->n - 1]&0xf) == op) {
10
+ c->a[c->n - 1] += (uint64_t)len<<4;
11
+ } else {
12
+ if (c->n == c->m) {
13
+ c->m += (c->m>>1) + 16;
14
+ KREALLOC(km, c->a, c->m);
15
+ }
16
+ c->a[c->n++] = (uint64_t)len<<4 | op;
17
+ }
18
+ }
19
+
20
+ static void append_cigar(void *km, mg64_v *c, int32_t n_cigar, const uint32_t *cigar)
21
+ {
22
+ int32_t k;
23
+ if (n_cigar == 0) return;
24
+ append_cigar1(km, c, cigar[0]&0xf, cigar[0]>>4);
25
+ if (c->n + n_cigar - 1 > c->m) {
26
+ c->m = c->n + n_cigar - 1;
27
+ kroundup32(c->m);
28
+ KREALLOC(km, c->a, c->m);
29
+ }
30
+ for (k = 0; k < n_cigar - 1; ++k)
31
+ c->a[c->n + k] = cigar[1 + k];
32
+ c->n += n_cigar - 1;
33
+ }
34
+
35
+ void mg_gchain_cigar(void *km, const gfa_t *g, const gfa_edseq_t *es, const char *qseq, mg_gchains_t *gt, const char *qname)
36
+ {
37
+ int32_t i, l_seq = 0, m_seq = 0;
38
+ char *seq = 0;
39
+ void *km2;
40
+ mg64_v cigar = {0,0,0};
41
+ km2 = km_init2(km, 0);
42
+ for (i = 0; i < gt->n_gc; ++i) {
43
+ mg_gchain_t *gc = &gt->gc[i];
44
+ int32_t l0 = gc->off;
45
+ int32_t off_a0 = gt->lc[l0].off;
46
+ int32_t j, j0 = 0, k, l;
47
+ cigar.n = 0;
48
+ append_cigar1(km, &cigar, 7, gt->a[off_a0].y>>32&0xff);
49
+ for (j = 1; j < gc->n_anchor; ++j) {
50
+ const mg128_t *q, *p = &gt->a[off_a0 + j];
51
+ if ((p->y & MG_SEED_IGNORE) && j != gc->n_anchor - 1) continue;
52
+ q = &gt->a[off_a0 + j0];
53
+ // find the lchain that contains the anchor
54
+ for (l = l0; l < gc->off + gc->cnt; ++l) {
55
+ mg_llchain_t *r = &gt->lc[l];
56
+ if (off_a0 + j >= r->off && off_a0 + j < r->off + r->cnt)
57
+ break;
58
+ }
59
+ assert(l < gc->off + gc->cnt);
60
+ assert((int32_t)q->x < g->seg[gt->lc[l0].v>>1].len);
61
+ // calculate the target sequence length
62
+ if (l == l0) {
63
+ l_seq = (int32_t)p->x - (int32_t)q->x;
64
+ } else {
65
+ l_seq = g->seg[gt->lc[l0].v>>1].len - (int32_t)q->x - 1;
66
+ for (k = l0 + 1; k < l; ++k)
67
+ l_seq += es[gt->lc[k].v].len;
68
+ l_seq += (int32_t)p->x + 1;
69
+ }
70
+ if (l_seq + 1 > m_seq) {
71
+ m_seq = l_seq + 1;
72
+ kroundup32(m_seq);
73
+ KREALLOC(km, seq, m_seq);
74
+ }
75
+ // get the target sequence
76
+ if (l == l0) { // on the same vertex
77
+ memcpy(seq, &es[gt->lc[l0].v].seq[(int32_t)q->x + 1], l_seq);
78
+ } else {
79
+ uint32_t v = gt->lc[l0].v;
80
+ l_seq = g->seg[v>>1].len - (int32_t)q->x - 1;
81
+ memcpy(seq, &es[v].seq[(int32_t)q->x + 1], l_seq);
82
+ for (k = l0 + 1; k < l; ++k) {
83
+ v = gt->lc[k].v;
84
+ memcpy(&seq[l_seq], es[v].seq, es[v].len);
85
+ l_seq += es[v].len;
86
+ }
87
+ memcpy(&seq[l_seq], es[gt->lc[l].v].seq, (int32_t)p->x + 1);
88
+ l_seq += (int32_t)p->x + 1;
89
+ }
90
+ {
91
+ int32_t qlen = (int32_t)p->y - (int32_t)q->y;
92
+ const char *qs = &qseq[(int32_t)q->y + 1];
93
+ assert(l_seq > 0 || qlen > 0);
94
+ if (l_seq == 0) append_cigar1(km, &cigar, 1, qlen);
95
+ else if (qlen == 0) append_cigar1(km, &cigar, 2, l_seq);
96
+ else if (l_seq == qlen && qlen <= (q->y>>32&0xff)) append_cigar1(km, &cigar, 7, qlen);
97
+ else {
98
+ mwf_opt_t opt;
99
+ mwf_rst_t rst;
100
+ mwf_opt_init(&opt);
101
+ opt.flag |= MWF_F_CIGAR;
102
+ mwf_wfa_auto(km2, &opt, l_seq, seq, qlen, qs, &rst);
103
+ append_cigar(km, &cigar, rst.n_cigar, rst.cigar);
104
+ kfree(km2, rst.cigar);
105
+ if ((mg_dbg_flag&MG_DBG_MINIWFA) && l_seq > 5000 && qlen > 5000 && rst.s >= 10000)
106
+ fprintf(stderr, "WL\t%s\t%d\t%d\t%d\t%d\t%d\n", qname, i, (int32_t)q->y + 1, (int32_t)p->y - (int32_t)q->y, l_seq, rst.s);
107
+ if (rst.s >= 10000 && l_seq > 5000 && qlen > 5000) {
108
+ km_destroy(km2);
109
+ km2 = km_init2(km, 0);
110
+ }
111
+ if ((mg_dbg_flag&MG_DBG_MWF_SEQ) && l_seq > 5000 && qlen > 5000 && rst.s >= 10000) {
112
+ char *str;
113
+ str = Kmalloc(km, char, qlen + l_seq + strlen(qname) + 100);
114
+ k = sprintf(str, "WL\t%s\t%d\t%d\t%d\nWT\t%.*s\nWQ\t%.*s\n", qname, i, (int32_t)q->y + 1, rst.s, l_seq, seq, qlen, qs);
115
+ fwrite(str, 1, k, stderr);
116
+ kfree(km, str);
117
+ }
118
+ }
119
+ }
120
+ j0 = j, l0 = l;
121
+ }
122
+ // save the CIGAR to gt->gc[i]
123
+ gc->p = (mg_cigar_t*)kcalloc(gt->km, 1, cigar.n * 8 + sizeof(mg_cigar_t));
124
+ gc->p->ss = (int32_t)gt->a[off_a0].x + 1 - (int32_t)(gt->a[off_a0].y>>32&0xff);
125
+ gc->p->ee = (int32_t)gt->a[off_a0 + gc->n_anchor - 1].x + 1;
126
+ gc->p->n_cigar = cigar.n;
127
+ memcpy(gc->p->cigar, cigar.a, cigar.n * 8);
128
+ for (j = 0, l = 0; j < gc->p->n_cigar; ++j) {
129
+ int32_t op = gc->p->cigar[j]&0xf, len = gc->p->cigar[j]>>4;
130
+ if (op == 7) gc->p->mlen += len, gc->p->blen += len;
131
+ else gc->p->blen += len;
132
+ if (op != 1) gc->p->aplen += len;
133
+ if (op != 2) l += len;
134
+ }
135
+ assert(l == gc->qe - gc->qs && gc->p->aplen == gc->pe - gc->ps);
136
+ }
137
+ km_destroy(km2);
138
+ kfree(km, seq);
139
+ kfree(km, cigar.a);
140
+ }
@@ -0,0 +1,532 @@
1
+ #include <math.h>
2
+ #include <string.h>
3
+ #include "mgpriv.h"
4
+ #include "ksort.h" // for radix sort
5
+ #include "khashl.h" // for kh_hash_uint32()
6
+ #include "gfa-priv.h"
7
+
8
+ typedef struct {
9
+ uint32_t srt;
10
+ int32_t i;
11
+ } gc_frag_t;
12
+
13
+ #define gc_frag_key(p) ((p).srt)
14
+ KRADIX_SORT_INIT(gc, gc_frag_t, gc_frag_key, 4)
15
+
16
+ static int32_t find_max(int32_t n, const gc_frag_t *gf, int32_t x)
17
+ {
18
+ int32_t s = 0, e = n;
19
+ if (n == 0) return -1;
20
+ if (gf[n-1].srt < x) return n - 1;
21
+ if (gf[0].srt >= x) return -1;
22
+ while (e > s) { // TODO: finish this block
23
+ int32_t m = s + (e - s) / 2;
24
+ if (gf[m].srt >= x) e = m;
25
+ else s = m + 1;
26
+ }
27
+ assert(s == e);
28
+ return s;
29
+ }
30
+
31
+ static int32_t mg_target_dist(const gfa_t *g, const mg_lchain_t *l0, const mg_lchain_t *l1)
32
+ {
33
+ // below equals (l1->qs - l0->qe) - min_dist + g->seg[l1->v>>1].len; see mg_gchain1_dp() for the calculation of min_dist
34
+ return (l1->qs - l0->qe) - (g->seg[l0->v>>1].len - l0->re) + (g->seg[l1->v>>1].len - l1->rs);
35
+ // when l0->v == l1->v, the above becomes (l1->qs - l0->qe) - (l1->rs - l0->re), which is what we want
36
+ }
37
+
38
+ static inline int32_t cal_sc(const mg_path_dst_t *dj, const mg_lchain_t *li, const mg_lchain_t *lc, const mg128_t *an, const gc_frag_t *a, const int32_t *f,
39
+ int bw, int ref_bonus, float chn_pen_gap)
40
+ {
41
+ const mg_lchain_t *lj;
42
+ int32_t gap, sc, segi, segj;
43
+ float lin_pen, log_pen;
44
+ if (dj->n_path == 0) return INT32_MIN;
45
+ segi = (an[li->off].y & MG_SEED_SEG_MASK) >> MG_SEED_SEG_SHIFT;
46
+ gap = dj->dist - dj->target_dist;
47
+ lj = &lc[a[dj->meta].i];
48
+ segj = (an[lj->off + lj->cnt - 1].y & MG_SEED_SEG_MASK) >> MG_SEED_SEG_SHIFT;
49
+ if (gap < 0) gap = -gap;
50
+ if (segi == segj && gap > bw) return INT32_MIN;
51
+ if (lj->qe <= li->qs) sc = li->score;
52
+ else sc = (int32_t)((double)(li->qe - lj->qe) / (li->qe - li->qs) * li->score + .499); // dealing with overlap on query
53
+ //sc += dj->mlen; // TODO: is this line the right thing to do?
54
+ if (dj->is_0) sc += ref_bonus;
55
+ lin_pen = chn_pen_gap * (float)gap;
56
+ log_pen = gap >= 2? mg_log2(gap) : 0.0f;
57
+ sc -= (int32_t)(lin_pen + log_pen);
58
+ sc += f[dj->meta];
59
+ return sc;
60
+ }
61
+
62
+ int32_t mg_gchain1_dp(void *km, const gfa_t *g, int32_t *n_lc_, mg_lchain_t *lc, int32_t qlen, int32_t max_dist_g, int32_t max_dist_q, int32_t bw, int32_t max_skip,
63
+ int32_t ref_bonus, float chn_pen_gap, float chn_pen_skip, float mask_level, const mg128_t *an, uint64_t **u_)
64
+ {
65
+ int32_t i, j, k, m_dst, n_dst, n_ext, n_u, n_v, n_lc = *n_lc_;
66
+ int32_t *f, *v, *t;
67
+ int64_t *p;
68
+ uint64_t *u;
69
+ mg_path_dst_t *dst;
70
+ gc_frag_t *a;
71
+ mg_lchain_t *swap;
72
+ char *qs;
73
+
74
+ *u_ = 0;
75
+ if (n_lc == 0) return 0;
76
+
77
+ KMALLOC(km, a, n_lc);
78
+ for (i = n_ext = 0; i < n_lc; ++i) { // a[] is a view of frag[]; for sorting
79
+ mg_lchain_t *r = &lc[i];
80
+ gc_frag_t *ai = &a[i];
81
+ int32_t is_isolated = 0, min_end_dist_g;
82
+ r->dist_pre = -1;
83
+ min_end_dist_g = g->seg[r->v>>1].len - r->re;
84
+ if (r->rs < min_end_dist_g) min_end_dist_g = r->rs;
85
+ if (min_end_dist_g > max_dist_g) is_isolated = 1; // if too far from segment ends
86
+ else if (min_end_dist_g>>3 > r->score) is_isolated = 1; // if the lchain too small relative to distance to the segment ends
87
+ ai->srt = (uint32_t)is_isolated<<31 | r->qe;
88
+ ai->i = i;
89
+ if (!is_isolated) ++n_ext;
90
+ }
91
+ if (n_ext < 2) { // no graph chaining needed; early return
92
+ kfree(km, a);
93
+ KMALLOC(km, u, n_lc);
94
+ for (i = 0; i < n_lc; ++i)
95
+ u[i] = (uint64_t)lc[i].score<<32 | 1;
96
+ *u_ = u;
97
+ return n_lc;
98
+ }
99
+ radix_sort_gc(a, a + n_lc);
100
+
101
+ KMALLOC(km, v, n_lc);
102
+ KMALLOC(km, f, n_ext);
103
+ KMALLOC(km, p, n_ext);
104
+ KCALLOC(km, t, n_ext);
105
+
106
+ KMALLOC(km, qs, max_dist_q + 1);
107
+ m_dst = n_dst = 0, dst = 0;
108
+ for (i = 0; i < n_ext; ++i) { // core loop
109
+ gc_frag_t *ai = &a[i];
110
+ mg_lchain_t *li = &lc[ai->i];
111
+ int32_t segi = (an[li->off].y & MG_SEED_SEG_MASK) >> MG_SEED_SEG_SHIFT;
112
+ { // collect end points potentially reachable from _i_
113
+ int32_t x = li->qs + bw, n_skip = 0;
114
+ if (x > qlen) x = qlen;
115
+ x = find_max(i, a, x);
116
+ n_dst = 0;
117
+ for (j = x; j >= 0; --j) { // collect potential destination vertices
118
+ gc_frag_t *aj = &a[j];
119
+ mg_lchain_t *lj = &lc[aj->i];
120
+ mg_path_dst_t *q;
121
+ int32_t target_dist, segj, dq;
122
+ if (lj->qs >= li->qs) continue; // lj is contained in li on the query coordinate
123
+ if (lj->qe > li->qs) { // test overlap on the query
124
+ int o = lj->qe - li->qs;
125
+ if (o > (lj->qe - lj->qs) * mask_level || o > (li->qe - li->qs) * mask_level)
126
+ continue;
127
+ }
128
+ dq = li->qs - lj->qe;
129
+ segj = (an[lj->off + lj->cnt - 1].y & MG_SEED_SEG_MASK) >> MG_SEED_SEG_SHIFT;
130
+ if (segi == segj) {
131
+ if (dq > max_dist_q) break; // if query gap too large, stop
132
+ } else {
133
+ if (dq > max_dist_g && dq > max_dist_q) break;
134
+ }
135
+ if (li->v != lj->v) { // the two linear chains are on two different segments
136
+ int32_t min_dist = li->rs + (g->seg[lj->v>>1].len - lj->re); // minimal graph gap
137
+ if (min_dist > max_dist_g) continue; // graph gap too large
138
+ if (segi == segj && min_dist - bw > li->qs - lj->qe) continue; // when li->qs < lj->qe, the condition turns to min_dist + (lj->qe - li->qs) > bw, which is desired
139
+ target_dist = mg_target_dist(g, lj, li);
140
+ if (target_dist < 0) continue; // this may happen if the query overlap is far too large
141
+ } else if (lj->rs >= li->rs || lj->re >= li->re) { // not colinear
142
+ continue;
143
+ } else {
144
+ int32_t dr = li->rs - lj->re, w = dr > dq? dr - dq : dq - dr;
145
+ if (segi == segj && w > bw) continue; // test bandwidth
146
+ if (dr > max_dist_g || dr < -max_dist_g) continue;
147
+ if (lj->re > li->rs) { // test overlap on the graph segment
148
+ int o = lj->re - li->rs;
149
+ if (o > (lj->re - lj->rs) * mask_level || o > (li->re - li->rs) * mask_level)
150
+ continue;
151
+ }
152
+ target_dist = mg_target_dist(g, lj, li);
153
+ }
154
+ if (n_dst == m_dst) KEXPAND(km, dst, m_dst); // TODO: watch out the quadratic behavior!
155
+ q = &dst[n_dst++];
156
+ memset(q, 0, sizeof(mg_path_dst_t));
157
+ q->inner = (li->v == lj->v);
158
+ q->v = lj->v^1;
159
+ q->meta = j;
160
+ q->qlen = li->qs - lj->qe;
161
+ q->target_dist = target_dist;
162
+ q->target_hash = 0;
163
+ q->check_hash = 0;
164
+ if (t[j] == i) {
165
+ if (++n_skip > max_skip)
166
+ break;
167
+ }
168
+ if (p[j] >= 0) t[p[j]] = i;
169
+ }
170
+ }
171
+ { // confirm reach-ability
172
+ int32_t k;
173
+ // test reach-ability without sequences
174
+ mg_shortest_k(km, g, li->v^1, n_dst, dst, max_dist_g + (g->seg[li->v>>1].len - li->rs), MG_MAX_SHORT_K, 0);
175
+ // remove unreachable destinations
176
+ for (j = k = 0; j < n_dst; ++j) {
177
+ mg_path_dst_t *dj = &dst[j];
178
+ int32_t sc;
179
+ if (dj->n_path == 0) continue; // not reachable
180
+ sc = cal_sc(dj, li, lc, an, a, f, bw, ref_bonus, chn_pen_gap);
181
+ if (sc == INT32_MIN) continue; // out of band
182
+ if (sc + li->score < 0) continue; // negative score and too low
183
+ dst[k++] = dst[j];
184
+ }
185
+ n_dst = k;
186
+ }
187
+ { // DP
188
+ int32_t max_f = li->score, max_j = -1, max_d = -1, max_inner = 0;
189
+ uint32_t max_hash = 0;
190
+ for (j = 0; j < n_dst; ++j) {
191
+ mg_path_dst_t *dj = &dst[j];
192
+ int32_t sc;
193
+ sc = cal_sc(dj, li, lc, an, a, f, bw, ref_bonus, chn_pen_gap);
194
+ if (sc == INT32_MIN) continue;
195
+ if (mg_dbg_flag & MG_DBG_GC1) {
196
+ mg_lchain_t *lj = &lc[a[dj->meta].i];
197
+ fprintf(stderr, " [dst:%d] dst=%c%s[%d], n_path=%d, target=%d, opt_dist=%d, score=%d, q_intv=[%d,%d), g_intv=[%d,%d)\n", dj->meta, "><"[dj->v&1], g->seg[dj->v>>1].name, dj->v, dj->n_path, dj->target_dist - g->seg[li->v>>1].len, dj->dist - g->seg[li->v>>1].len, sc, lj->qs, lj->qe, lj->rs, lj->re);
198
+ }
199
+ if (sc > max_f) max_f = sc, max_j = dj->meta, max_d = dj->dist, max_hash = dj->hash, max_inner = dj->inner;
200
+ }
201
+ f[i] = max_f, p[i] = max_j;
202
+ li->dist_pre = max_d;
203
+ li->hash_pre = max_hash;
204
+ li->inner_pre = max_inner;
205
+ v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f;
206
+ if (mg_dbg_flag & MG_DBG_GC1) fprintf(stderr, " [opt:%d] opt=%d, max_f=%d\n", ai->i, max_j, max_f);
207
+ }
208
+ }
209
+ kfree(km, dst);
210
+ kfree(km, qs);
211
+ if (mg_dbg_flag & MG_DBG_GC1) {
212
+ int32_t mmax_f = 0, mmax_i = -1;
213
+ for (i = 0; i < n_ext; ++i) if (f[i] > mmax_f) mmax_f = f[i], mmax_i = i;
214
+ i = mmax_i; while (i >= 0) { fprintf(stderr, "[best] i=%d, seg=%s, max_f=%d, chn_pen_gap=%f\n", a[i].i, g->seg[lc[a[i].i].v>>1].name, f[i], chn_pen_gap); i = p[i]; }
215
+ }
216
+
217
+ u = mg_chain_backtrack(km, n_ext, f, p, v, t, 0, 0, INT32_MAX, n_lc - n_ext, &n_u, &n_v);
218
+ kfree(km, f); kfree(km, p); kfree(km, t);
219
+
220
+ for (i = 0; i < n_lc - n_ext; ++i) {
221
+ u[n_u++] = (uint64_t)lc[a[n_ext + i].i].score << 32 | 1;
222
+ v[n_v++] = n_ext + i;
223
+ }
224
+
225
+ KMALLOC(km, swap, n_v);
226
+ for (i = 0, k = 0; i < n_u; ++i) {
227
+ int32_t k0 = k, ni = (int32_t)u[i];
228
+ for (j = 0; j < ni; ++j)
229
+ swap[k++] = lc[a[v[k0 + (ni - j - 1)]].i];
230
+ }
231
+ assert(k == n_v);
232
+ memcpy(lc, swap, n_v * sizeof(mg_lchain_t));
233
+ *n_lc_ = n_v;
234
+ *u_ = u;
235
+
236
+ kfree(km, a);
237
+ kfree(km, swap);
238
+ kfree(km, v);
239
+ return n_u;
240
+ }
241
+
242
+ void mg_gchain_extra(const gfa_t *g, mg_gchains_t *gs)
243
+ {
244
+ int32_t i, j, k;
245
+ for (i = 0; i < gs->n_gc; ++i) { // iterate over gchains
246
+ mg_gchain_t *p = &gs->gc[i];
247
+ const mg_llchain_t *q;
248
+ const mg128_t *last_a;
249
+ int32_t q_span, rest_pl, tmp, n_mini;
250
+
251
+ p->qs = p->qe = p->ps = p->pe = -1, p->plen = p->blen = p->mlen = 0, p->div = -1.0f;
252
+ if (p->cnt == 0) continue;
253
+
254
+ assert(gs->lc[p->off].cnt > 0 && gs->lc[p->off + p->cnt - 1].cnt > 0); // first and last lchains can't be empty
255
+ q = &gs->lc[p->off];
256
+ q_span = (int32_t)(gs->a[q->off].y>>32&0xff);
257
+ p->qs = (int32_t)gs->a[q->off].y + 1 - q_span;
258
+ p->ps = (int32_t)gs->a[q->off].x + 1 - q_span;
259
+ tmp = (int32_t)(gs->a[q->off].x>>32);
260
+ assert(p->qs >= 0 && p->ps >= 0);
261
+ q = &gs->lc[p->off + p->cnt - 1];
262
+ p->qe = (int32_t)gs->a[q->off + q->cnt - 1].y + 1;
263
+ p->pe = g->seg[q->v>>1].len - (int32_t)gs->a[q->off + q->cnt - 1].x - 1; // this is temporary
264
+ n_mini = (int32_t)(gs->a[q->off + q->cnt - 1].x>>32) - tmp + 1;
265
+ assert(p->n_anchor > 0);
266
+
267
+ rest_pl = 0; // this value is never used if the first lchain is not empty (which should always be true)
268
+ last_a = &gs->a[gs->lc[p->off].off];
269
+ for (j = 0; j < p->cnt; ++j) { // iterate over lchains
270
+ const mg_llchain_t *q = &gs->lc[p->off + j];
271
+ int32_t vlen = g->seg[q->v>>1].len;
272
+ p->plen += vlen;
273
+ for (k = 0; k < q->cnt; ++k) { // iterate over anchors
274
+ const mg128_t *r = &gs->a[q->off + k];
275
+ int32_t pl, ql = (int32_t)r->y - (int32_t)last_a->y;
276
+ int32_t span = (int32_t)(r->y>>32&0xff);
277
+ if (j == 0 && k == 0) { // the first anchor on the first lchain
278
+ pl = ql = span;
279
+ } else if (j > 0 && k == 0) { // the first anchor but not on the first lchain
280
+ pl = (int32_t)r->x + 1 + rest_pl;
281
+ } else {
282
+ pl = (int32_t)r->x - (int32_t)last_a->x;
283
+ }
284
+ if (ql < 0) ql = -ql, n_mini += (int32_t)(last_a->x>>32) - (int32_t)(r->x>>32); // dealing with overlapping query at junctions
285
+ p->blen += pl > ql? pl : ql;
286
+ p->mlen += pl > span && ql > span? span : pl < ql? pl : ql;
287
+ last_a = r;
288
+ }
289
+ if (q->cnt == 0) rest_pl += vlen;
290
+ else rest_pl = vlen - (int32_t)gs->a[q->off + q->cnt - 1].x - 1;
291
+ }
292
+ p->pe = p->plen - p->pe;
293
+ assert(p->pe >= p->ps);
294
+ // here n_mini >= p->n_anchor should stand almost all the time
295
+ p->div = n_mini >= p->n_anchor? log((double)n_mini / p->n_anchor) / q_span : log((double)p->n_anchor / n_mini) / q_span;
296
+ }
297
+ }
298
+
299
+ /*
300
+ * Generate graph chains
301
+ */
302
+ typedef struct {
303
+ void *km;
304
+ const gfa_t *g;
305
+ const gfa_edseq_t *es;
306
+ const char *qseq;
307
+ int32_t n_seg, n_llc, m_llc, n_a;
308
+ mg_llchain_t *llc;
309
+ } bridge_aux_t;
310
+
311
+ static inline void copy_lchain(mg_llchain_t *q, const mg_lchain_t *p, int32_t *n_a, mg128_t *a_new, const mg128_t *a_old, int32_t ed)
312
+ {
313
+ q->cnt = p->cnt, q->v = p->v, q->score = p->score, q->ed = ed;
314
+ memcpy(&a_new[*n_a], &a_old[p->off], q->cnt * sizeof(mg128_t));
315
+ q->off = *n_a;
316
+ (*n_a) += q->cnt;
317
+ }
318
+
319
+ static int32_t bridge_shortk(bridge_aux_t *aux, const mg_lchain_t *l0, const mg_lchain_t *l1)
320
+ {
321
+ int32_t s, n_pathv;
322
+ mg_path_dst_t dst;
323
+ mg_pathv_t *p;
324
+ memset(&dst, 0, sizeof(mg_path_dst_t));
325
+ dst.v = l0->v ^ 1;
326
+ assert(l1->dist_pre >= 0);
327
+ dst.target_dist = l1->dist_pre;
328
+ dst.target_hash = l1->hash_pre;
329
+ dst.check_hash = 1;
330
+ p = mg_shortest_k(aux->km, aux->g, l1->v^1, 1, &dst, dst.target_dist, MG_MAX_SHORT_K, &n_pathv);
331
+ if (n_pathv == 0 || dst.target_hash != dst.hash) {
332
+ fprintf(stderr, "[W::%s] %c%s[%d] -> %c%s[%d], dist=%d, target_dist=%d; chain skiped.\n", __func__, "><"[(l1->v^1)&1], aux->g->seg[l1->v>>1].name, l1->v^1, "><"[(l0->v^1)&1],
333
+ aux->g->seg[l0->v>>1].name, l0->v^1, dst.dist, dst.target_dist);
334
+ kfree(aux->km, p);
335
+ return -1;
336
+ }
337
+ for (s = n_pathv - 2; s >= 1; --s) { // path found in a backward way, so we need to reverse it
338
+ mg_llchain_t *q;
339
+ if (aux->n_llc == aux->m_llc) KEXPAND(aux->km, aux->llc, aux->m_llc);
340
+ q = &aux->llc[aux->n_llc++];
341
+ q->off = q->cnt = q->score = 0;
342
+ q->v = p[s].v^1; // when reversing a path, we also need to flip the orientation
343
+ q->ed = -1;
344
+ }
345
+ kfree(aux->km, p);
346
+ return 0;
347
+ }
348
+
349
+ static int32_t bridge_gwfa(bridge_aux_t *aux, int32_t kmer_size, int32_t gdp_max_ed, const mg_lchain_t *l0, const mg_lchain_t *l1, int32_t *ed)
350
+ {
351
+ uint32_t v0 = l0->v, v1 = l1->v;
352
+ int32_t qs = l0->qe - kmer_size, qe = l1->qs + kmer_size, end0, end1, j;
353
+ void *z;
354
+ gfa_edopt_t opt;
355
+ gfa_edrst_t r;
356
+
357
+ *ed = -1;
358
+ end0 = l0->re - kmer_size;
359
+ end1 = l1->rs + kmer_size - 1;
360
+
361
+ gfa_edopt_init(&opt);
362
+ opt.traceback = 1, opt.max_chk = 1000, opt.bw_dyn = 1000, opt.max_lag = gdp_max_ed/2;
363
+ opt.i_term = 500000000LL;
364
+ z = gfa_ed_init(aux->km, &opt, aux->g, aux->es, qe - qs, &aux->qseq[qs], v0, end0);
365
+ gfa_ed_step(z, v1, end1, gdp_max_ed, &r);
366
+ gfa_ed_destroy(z);
367
+ //fprintf(stdout, "qs=%d,qe=%d,v0=%c%s:%d:%d,v1=%c%s:%d,s=%d,nv=%d\n", qs, qe, "><"[v0&1], aux->g->seg[v0>>1].name, end0, aux->g->seg[v0>>1].len - end0 - 1, "><"[v1&1], aux->g->seg[v1>>1].name, end1, r.s, r.nv);
368
+ if (r.s < 0) return 0;
369
+
370
+ for (j = 1; j < r.nv - 1; ++j) {
371
+ mg_llchain_t *q;
372
+ if (aux->n_llc == aux->m_llc) KEXPAND(aux->km, aux->llc, aux->m_llc);
373
+ q = &aux->llc[aux->n_llc++];
374
+ q->off = q->cnt = q->score = 0;
375
+ q->v = r.v[j];
376
+ q->ed = -1;
377
+ }
378
+ kfree(aux->km, r.v);
379
+ *ed = r.s;
380
+ return 1;
381
+ }
382
+
383
+ static int32_t bridge_lchains(mg_gchains_t *gc, bridge_aux_t *aux, int32_t kmer_size, int32_t gdp_max_ed, const mg_lchain_t *l0, const mg_lchain_t *l1, const mg128_t *a)
384
+ {
385
+ if (l1->v != l0->v) { // bridging two segments
386
+ int32_t ed = -1, ret = 0;
387
+ if (aux->n_seg > 1 || !bridge_gwfa(aux, kmer_size, gdp_max_ed, l0, l1, &ed))
388
+ ret = bridge_shortk(aux, l0, l1);
389
+ if (ret < 0) return -1;
390
+ if (aux->n_llc == aux->m_llc) KEXPAND(aux->km, aux->llc, aux->m_llc);
391
+ copy_lchain(&aux->llc[aux->n_llc++], l1, &aux->n_a, gc->a, a, ed);
392
+ } else { // on one segment
393
+ int32_t k;
394
+ mg_llchain_t *t = &aux->llc[aux->n_llc - 1];
395
+ for (k = 0; k < l1->cnt; ++k) { // FIXME: this part is made redundant by resolve_overlap()
396
+ const mg128_t *ak = &a[l1->off + k];
397
+ if ((int32_t)ak->x > l0->re && (int32_t)ak->y > l0->qe)
398
+ break;
399
+ }
400
+ if (k < l1->cnt) { // l1 contained. TODO: check what is happening...
401
+ t->cnt += l1->cnt - k, t->score += l1->score;
402
+ memcpy(&gc->a[aux->n_a], &a[l1->off + k], (l1->cnt - k) * sizeof(mg128_t));
403
+ aux->n_a += l1->cnt - k;
404
+ }
405
+ }
406
+ return 0;
407
+ }
408
+
409
+ static void resolve_overlap(mg_lchain_t *l0, mg_lchain_t *l1, const mg128_t *a)
410
+ {
411
+ int32_t j, x, y, shift0, shift1;
412
+ // check the end of l0
413
+ x = (int32_t)a[l1->off].x;
414
+ y = (int32_t)a[l1->off].y;
415
+ for (j = l0->cnt - 1; j >= 0; --j)
416
+ if ((int32_t)a[l0->off + j].y <= y && (l0->v != l1->v || (int32_t)a[l0->off + j].x <= x))
417
+ break;
418
+ shift0 = l0->cnt - 1 - j;
419
+ // check the start of l1
420
+ x = (int32_t)a[l0->off + l0->cnt - 1].x;
421
+ y = (int32_t)a[l0->off + l0->cnt - 1].y;
422
+ for (j = 0; j < l1->cnt; ++j)
423
+ if ((int32_t)a[l1->off + j].y >= y && (l0->v != l1->v || (int32_t)a[l1->off + j].x >= x))
424
+ break;
425
+ shift1 = j;
426
+ assert(shift1 < l1->cnt); // this should never happen, or it is a bug
427
+ // update
428
+ if (shift0 > 0) {
429
+ l0->cnt -= shift0;
430
+ if (l0->cnt) { // l0->cnt may be 0 as the start of l0 may be changed and go into l1
431
+ l0->qe = (int32_t)a[l0->off + l0->cnt - 1].y + 1;
432
+ l0->re = (int32_t)a[l0->off + l0->cnt - 1].x + 1;
433
+ }
434
+ }
435
+ if (shift1 > 0) {
436
+ l1->off += shift1, l1->cnt -= shift1;
437
+ l1->qs = (int32_t)a[l1->off].y + 1 - (int32_t)(a[l1->off].y>>32&0xff);
438
+ l1->rs = (int32_t)a[l1->off].x + 1 - (int32_t)(a[l1->off].y>>32&0xff);
439
+ }
440
+ if (l0->cnt == 0) l0->qs = l0->qe = l1->qs, l0->rs = l0->re = l1->rs; // this line should have no effect
441
+ }
442
+
443
+ mg_gchains_t *mg_gchain_gen(void *km_dst, void *km, const gfa_t *g, const gfa_edseq_t *es, int32_t n_u, const uint64_t *u,
444
+ mg_lchain_t *lc, const mg128_t *a, uint32_t hash, int32_t min_gc_cnt, int32_t min_gc_score,
445
+ int32_t gdp_max_ed, int32_t n_seg, const char *qseq)
446
+ {
447
+ mg_gchains_t *gc;
448
+ int32_t i, j, k, st, kmer_size;
449
+ bridge_aux_t aux;
450
+
451
+ // preallocate gc->gc and gc->a
452
+ KCALLOC(km_dst, gc, 1);
453
+ for (i = 0, st = 0; i < n_u; ++i) {
454
+ int32_t m = 0, nui = (int32_t)u[i];
455
+ for (j = 0; j < nui; ++j) m += lc[st + j].cnt; // m is the number of anchors in this gchain
456
+ if (m >= min_gc_cnt && u[i]>>32 >= min_gc_score)
457
+ gc->n_gc++, gc->n_a += m;
458
+ st += nui;
459
+ }
460
+ if (gc->n_gc == 0) return gc;
461
+ gc->km = km_dst;
462
+ KCALLOC(km_dst, gc->gc, gc->n_gc);
463
+ KMALLOC(km_dst, gc->a, gc->n_a);
464
+
465
+ // core loop
466
+ memset(&aux, 0, sizeof(aux));
467
+ aux.km = km, aux.g = g, aux.es = es, aux.n_seg = n_seg, aux.qseq = qseq;
468
+ kmer_size = a[0].y>>32&0xff;
469
+ for (i = k = 0, st = 0, aux.n_a = 0; i < n_u; ++i) {
470
+ int32_t n_a0 = aux.n_a, n_llc0 = aux.n_llc, m = 0, nui = (int32_t)u[i];
471
+ for (j = 0; j < nui; ++j) m += lc[st + j].cnt;
472
+ if (m >= min_gc_cnt && u[i]>>32 >= min_gc_score) {
473
+ uint32_t h = hash;
474
+ int32_t j0;
475
+ gc->gc[k].score = u[i]>>32;
476
+ gc->gc[k].off = n_llc0;
477
+ for (j = 0; j < nui; ++j) {
478
+ const mg_lchain_t *p = &lc[st + j];
479
+ h += kh_hash_uint32(p->qs) + kh_hash_uint32(p->re) + kh_hash_uint32(p->v);
480
+ }
481
+ gc->gc[k].hash = kh_hash_uint32(h);
482
+
483
+ for (j = 1; j < nui; ++j)
484
+ resolve_overlap(&lc[st + j - 1], &lc[st + j], a);
485
+
486
+ if (aux.n_llc == aux.m_llc) KEXPAND(aux.km, aux.llc, aux.m_llc);
487
+ copy_lchain(&aux.llc[aux.n_llc++], &lc[st], &aux.n_a, gc->a, a, -1); // copy the first lchain
488
+ for (j0 = 0, j = 1; j < nui; ++j) {
489
+ const mg_lchain_t *l0 = &lc[st + j0], *l1 = &lc[st + j];
490
+ if (l1->cnt > 0) {
491
+ int32_t ret, t;
492
+ ret = bridge_lchains(gc, &aux, kmer_size, gdp_max_ed, l0, l1, a);
493
+ if (ret < 0) {
494
+ for (t = j0; t < j; ++t) {
495
+ ret = bridge_lchains(gc, &aux, kmer_size, gdp_max_ed, &lc[st + t], &lc[st + t + 1], a);
496
+ assert(ret >= 0);
497
+ }
498
+ }
499
+ j0 = j;
500
+ }
501
+ }
502
+
503
+ gc->gc[k].cnt = aux.n_llc - n_llc0;
504
+ gc->gc[k].n_anchor = aux.n_a - n_a0;
505
+ ++k;
506
+ }
507
+ st += nui;
508
+ }
509
+ assert(aux.n_a <= gc->n_a);
510
+
511
+ gc->n_a = aux.n_a;
512
+ gc->n_lc = aux.n_llc;
513
+ KMALLOC(km_dst, gc->lc, aux.n_llc);
514
+ memcpy(gc->lc, aux.llc, aux.n_llc * sizeof(mg_llchain_t));
515
+ kfree(km, aux.llc);
516
+
517
+ mg_gchain_extra(g, gc);
518
+ mg_gchain_sort_by_score(km, gc);
519
+ return gc;
520
+ }
521
+
522
+ void mg_gchain_free(mg_gchains_t *gs)
523
+ {
524
+ void *km;
525
+ int32_t i;
526
+ if (gs == 0) return;
527
+ km = gs->km;
528
+ for (i = 0; i < gs->n_gc; ++i)
529
+ if (gs->gc[i].p) kfree(km, gs->gc[i].p);
530
+ kfree(km, gs->gc); kfree(km, gs->a); kfree(km, gs->lc);
531
+ kfree(km, gs);
532
+ }