ruby-minigraph 0.0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +62 -0
  4. data/ext/Rakefile +56 -0
  5. data/ext/cmappy/cmappy.c +7 -0
  6. data/ext/cmappy/cmappy.h +8 -0
  7. data/ext/minigraph/LICENSE.txt +23 -0
  8. data/ext/minigraph/Makefile +66 -0
  9. data/ext/minigraph/NEWS.md +317 -0
  10. data/ext/minigraph/README.md +207 -0
  11. data/ext/minigraph/algo.c +194 -0
  12. data/ext/minigraph/algo.h +33 -0
  13. data/ext/minigraph/asm-call.c +147 -0
  14. data/ext/minigraph/bseq.c +133 -0
  15. data/ext/minigraph/bseq.h +76 -0
  16. data/ext/minigraph/cal_cov.c +139 -0
  17. data/ext/minigraph/doc/example1.png +0 -0
  18. data/ext/minigraph/doc/example2.png +0 -0
  19. data/ext/minigraph/doc/examples.graffle +0 -0
  20. data/ext/minigraph/format.c +241 -0
  21. data/ext/minigraph/galign.c +140 -0
  22. data/ext/minigraph/gchain1.c +532 -0
  23. data/ext/minigraph/gcmisc.c +223 -0
  24. data/ext/minigraph/gfa-aug.c +260 -0
  25. data/ext/minigraph/gfa-base.c +526 -0
  26. data/ext/minigraph/gfa-bbl.c +372 -0
  27. data/ext/minigraph/gfa-ed.c +617 -0
  28. data/ext/minigraph/gfa-io.c +395 -0
  29. data/ext/minigraph/gfa-priv.h +154 -0
  30. data/ext/minigraph/gfa.h +166 -0
  31. data/ext/minigraph/ggen.c +182 -0
  32. data/ext/minigraph/ggen.h +21 -0
  33. data/ext/minigraph/ggsimple.c +570 -0
  34. data/ext/minigraph/gmap.c +211 -0
  35. data/ext/minigraph/index.c +230 -0
  36. data/ext/minigraph/kalloc.c +224 -0
  37. data/ext/minigraph/kalloc.h +82 -0
  38. data/ext/minigraph/kavl.h +414 -0
  39. data/ext/minigraph/kdq.h +134 -0
  40. data/ext/minigraph/ketopt.h +116 -0
  41. data/ext/minigraph/khashl.h +348 -0
  42. data/ext/minigraph/krmq.h +474 -0
  43. data/ext/minigraph/kseq.h +256 -0
  44. data/ext/minigraph/ksort.h +164 -0
  45. data/ext/minigraph/kstring.h +165 -0
  46. data/ext/minigraph/kthread.c +159 -0
  47. data/ext/minigraph/kthread.h +15 -0
  48. data/ext/minigraph/kvec-km.h +105 -0
  49. data/ext/minigraph/kvec.h +110 -0
  50. data/ext/minigraph/lchain.c +441 -0
  51. data/ext/minigraph/main.c +301 -0
  52. data/ext/minigraph/map-algo.c +500 -0
  53. data/ext/minigraph/mgpriv.h +128 -0
  54. data/ext/minigraph/minigraph.1 +359 -0
  55. data/ext/minigraph/minigraph.h +176 -0
  56. data/ext/minigraph/miniwfa.c +834 -0
  57. data/ext/minigraph/miniwfa.h +95 -0
  58. data/ext/minigraph/misc/mgutils.js +1451 -0
  59. data/ext/minigraph/misc.c +12 -0
  60. data/ext/minigraph/options.c +134 -0
  61. data/ext/minigraph/shortk.c +251 -0
  62. data/ext/minigraph/sketch.c +109 -0
  63. data/ext/minigraph/sys.c +147 -0
  64. data/ext/minigraph/sys.h +20 -0
  65. data/ext/minigraph/test/MT-chimp.fa +277 -0
  66. data/ext/minigraph/test/MT-human.fa +239 -0
  67. data/ext/minigraph/test/MT-orangA.fa +276 -0
  68. data/ext/minigraph/test/MT.gfa +19 -0
  69. data/ext/minigraph/tex/Makefile +13 -0
  70. data/ext/minigraph/tex/minigraph.bib +676 -0
  71. data/ext/minigraph/tex/minigraph.tex +986 -0
  72. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
  73. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
  74. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
  75. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
  76. data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
  77. data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
  78. data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
  79. data/ext/minigraph/tex/plots/bedutils.js +367 -0
  80. data/ext/minigraph/tex/plots/chr-plot.js +130 -0
  81. data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
  82. data/ext/minigraph.patch +21 -0
  83. data/lib/minigraph/ffi/constants.rb +230 -0
  84. data/lib/minigraph/ffi/functions.rb +70 -0
  85. data/lib/minigraph/ffi/mappy.rb +8 -0
  86. data/lib/minigraph/ffi.rb +27 -0
  87. data/lib/minigraph/version.rb +5 -0
  88. data/lib/minigraph.rb +72 -0
  89. metadata +159 -0
@@ -0,0 +1,140 @@
1
+ #include <assert.h>
2
+ #include <string.h>
3
+ #include "mgpriv.h"
4
+ #include "kalloc.h"
5
+ #include "miniwfa.h"
6
+
7
+ static void append_cigar1(void *km, mg64_v *c, int32_t op, int32_t len)
8
+ {
9
+ if (c->n > 0 && (c->a[c->n - 1]&0xf) == op) {
10
+ c->a[c->n - 1] += (uint64_t)len<<4;
11
+ } else {
12
+ if (c->n == c->m) {
13
+ c->m += (c->m>>1) + 16;
14
+ KREALLOC(km, c->a, c->m);
15
+ }
16
+ c->a[c->n++] = (uint64_t)len<<4 | op;
17
+ }
18
+ }
19
+
20
+ static void append_cigar(void *km, mg64_v *c, int32_t n_cigar, const uint32_t *cigar)
21
+ {
22
+ int32_t k;
23
+ if (n_cigar == 0) return;
24
+ append_cigar1(km, c, cigar[0]&0xf, cigar[0]>>4);
25
+ if (c->n + n_cigar - 1 > c->m) {
26
+ c->m = c->n + n_cigar - 1;
27
+ kroundup32(c->m);
28
+ KREALLOC(km, c->a, c->m);
29
+ }
30
+ for (k = 0; k < n_cigar - 1; ++k)
31
+ c->a[c->n + k] = cigar[1 + k];
32
+ c->n += n_cigar - 1;
33
+ }
34
+
35
+ void mg_gchain_cigar(void *km, const gfa_t *g, const gfa_edseq_t *es, const char *qseq, mg_gchains_t *gt, const char *qname)
36
+ {
37
+ int32_t i, l_seq = 0, m_seq = 0;
38
+ char *seq = 0;
39
+ void *km2;
40
+ mg64_v cigar = {0,0,0};
41
+ km2 = km_init2(km, 0);
42
+ for (i = 0; i < gt->n_gc; ++i) {
43
+ mg_gchain_t *gc = &gt->gc[i];
44
+ int32_t l0 = gc->off;
45
+ int32_t off_a0 = gt->lc[l0].off;
46
+ int32_t j, j0 = 0, k, l;
47
+ cigar.n = 0;
48
+ append_cigar1(km, &cigar, 7, gt->a[off_a0].y>>32&0xff);
49
+ for (j = 1; j < gc->n_anchor; ++j) {
50
+ const mg128_t *q, *p = &gt->a[off_a0 + j];
51
+ if ((p->y & MG_SEED_IGNORE) && j != gc->n_anchor - 1) continue;
52
+ q = &gt->a[off_a0 + j0];
53
+ // find the lchain that contains the anchor
54
+ for (l = l0; l < gc->off + gc->cnt; ++l) {
55
+ mg_llchain_t *r = &gt->lc[l];
56
+ if (off_a0 + j >= r->off && off_a0 + j < r->off + r->cnt)
57
+ break;
58
+ }
59
+ assert(l < gc->off + gc->cnt);
60
+ assert((int32_t)q->x < g->seg[gt->lc[l0].v>>1].len);
61
+ // calculate the target sequence length
62
+ if (l == l0) {
63
+ l_seq = (int32_t)p->x - (int32_t)q->x;
64
+ } else {
65
+ l_seq = g->seg[gt->lc[l0].v>>1].len - (int32_t)q->x - 1;
66
+ for (k = l0 + 1; k < l; ++k)
67
+ l_seq += es[gt->lc[k].v].len;
68
+ l_seq += (int32_t)p->x + 1;
69
+ }
70
+ if (l_seq + 1 > m_seq) {
71
+ m_seq = l_seq + 1;
72
+ kroundup32(m_seq);
73
+ KREALLOC(km, seq, m_seq);
74
+ }
75
+ // get the target sequence
76
+ if (l == l0) { // on the same vertex
77
+ memcpy(seq, &es[gt->lc[l0].v].seq[(int32_t)q->x + 1], l_seq);
78
+ } else {
79
+ uint32_t v = gt->lc[l0].v;
80
+ l_seq = g->seg[v>>1].len - (int32_t)q->x - 1;
81
+ memcpy(seq, &es[v].seq[(int32_t)q->x + 1], l_seq);
82
+ for (k = l0 + 1; k < l; ++k) {
83
+ v = gt->lc[k].v;
84
+ memcpy(&seq[l_seq], es[v].seq, es[v].len);
85
+ l_seq += es[v].len;
86
+ }
87
+ memcpy(&seq[l_seq], es[gt->lc[l].v].seq, (int32_t)p->x + 1);
88
+ l_seq += (int32_t)p->x + 1;
89
+ }
90
+ {
91
+ int32_t qlen = (int32_t)p->y - (int32_t)q->y;
92
+ const char *qs = &qseq[(int32_t)q->y + 1];
93
+ assert(l_seq > 0 || qlen > 0);
94
+ if (l_seq == 0) append_cigar1(km, &cigar, 1, qlen);
95
+ else if (qlen == 0) append_cigar1(km, &cigar, 2, l_seq);
96
+ else if (l_seq == qlen && qlen <= (q->y>>32&0xff)) append_cigar1(km, &cigar, 7, qlen);
97
+ else {
98
+ mwf_opt_t opt;
99
+ mwf_rst_t rst;
100
+ mwf_opt_init(&opt);
101
+ opt.flag |= MWF_F_CIGAR;
102
+ mwf_wfa_auto(km2, &opt, l_seq, seq, qlen, qs, &rst);
103
+ append_cigar(km, &cigar, rst.n_cigar, rst.cigar);
104
+ kfree(km2, rst.cigar);
105
+ if ((mg_dbg_flag&MG_DBG_MINIWFA) && l_seq > 5000 && qlen > 5000 && rst.s >= 10000)
106
+ fprintf(stderr, "WL\t%s\t%d\t%d\t%d\t%d\t%d\n", qname, i, (int32_t)q->y + 1, (int32_t)p->y - (int32_t)q->y, l_seq, rst.s);
107
+ if (rst.s >= 10000 && l_seq > 5000 && qlen > 5000) {
108
+ km_destroy(km2);
109
+ km2 = km_init2(km, 0);
110
+ }
111
+ if ((mg_dbg_flag&MG_DBG_MWF_SEQ) && l_seq > 5000 && qlen > 5000 && rst.s >= 10000) {
112
+ char *str;
113
+ str = Kmalloc(km, char, qlen + l_seq + strlen(qname) + 100);
114
+ k = sprintf(str, "WL\t%s\t%d\t%d\t%d\nWT\t%.*s\nWQ\t%.*s\n", qname, i, (int32_t)q->y + 1, rst.s, l_seq, seq, qlen, qs);
115
+ fwrite(str, 1, k, stderr);
116
+ kfree(km, str);
117
+ }
118
+ }
119
+ }
120
+ j0 = j, l0 = l;
121
+ }
122
+ // save the CIGAR to gt->gc[i]
123
+ gc->p = (mg_cigar_t*)kcalloc(gt->km, 1, cigar.n * 8 + sizeof(mg_cigar_t));
124
+ gc->p->ss = (int32_t)gt->a[off_a0].x + 1 - (int32_t)(gt->a[off_a0].y>>32&0xff);
125
+ gc->p->ee = (int32_t)gt->a[off_a0 + gc->n_anchor - 1].x + 1;
126
+ gc->p->n_cigar = cigar.n;
127
+ memcpy(gc->p->cigar, cigar.a, cigar.n * 8);
128
+ for (j = 0, l = 0; j < gc->p->n_cigar; ++j) {
129
+ int32_t op = gc->p->cigar[j]&0xf, len = gc->p->cigar[j]>>4;
130
+ if (op == 7) gc->p->mlen += len, gc->p->blen += len;
131
+ else gc->p->blen += len;
132
+ if (op != 1) gc->p->aplen += len;
133
+ if (op != 2) l += len;
134
+ }
135
+ assert(l == gc->qe - gc->qs && gc->p->aplen == gc->pe - gc->ps);
136
+ }
137
+ km_destroy(km2);
138
+ kfree(km, seq);
139
+ kfree(km, cigar.a);
140
+ }
@@ -0,0 +1,532 @@
1
+ #include <math.h>
2
+ #include <string.h>
3
+ #include "mgpriv.h"
4
+ #include "ksort.h" // for radix sort
5
+ #include "khashl.h" // for kh_hash_uint32()
6
+ #include "gfa-priv.h"
7
+
8
+ typedef struct { // one entry of the sortable view over linear chains built in mg_gchain1_dp()
9
+ uint32_t srt; // sort key: the is_isolated flag in bit 31 OR-ed with the chain's query end (qe)
10
+ int32_t i; // index of the linear chain in lc[]
11
+ } gc_frag_t;
12
+
13
+ #define gc_frag_key(p) ((p).srt) // key accessor handed to the radix sort instantiated below
14
+ KRADIX_SORT_INIT(gc, gc_frag_t, gc_frag_key, 4) // provides radix_sort_gc(); 4 is presumably the key size in bytes - confirm against ksort.h
15
+
16
+ static int32_t find_max(int32_t n, const gc_frag_t *gf, int32_t x)
17
+ {
18
+ int32_t s = 0, e = n;
19
+ if (n == 0) return -1;
20
+ if (gf[n-1].srt < x) return n - 1;
21
+ if (gf[0].srt >= x) return -1;
22
+ while (e > s) { // TODO: finish this block
23
+ int32_t m = s + (e - s) / 2;
24
+ if (gf[m].srt >= x) e = m;
25
+ else s = m + 1;
26
+ }
27
+ assert(s == e);
28
+ return s;
29
+ }
30
+
31
+ static int32_t mg_target_dist(const gfa_t *g, const mg_lchain_t *l0, const mg_lchain_t *l1)
32
+ {
33
+ // below equals (l1->qs - l0->qe) - min_dist + g->seg[l1->v>>1].len; see mg_gchain1_dp() for the calculation of min_dist
34
+ return (l1->qs - l0->qe) - (g->seg[l0->v>>1].len - l0->re) + (g->seg[l1->v>>1].len - l1->rs);
35
+ // when l0->v == l1->v, the above becomes (l1->qs - l0->qe) - (l1->rs - l0->re), which is what we want
36
+ }
37
+
38
+ static inline int32_t cal_sc(const mg_path_dst_t *dj, const mg_lchain_t *li, const mg_lchain_t *lc, const mg128_t *an, const gc_frag_t *a, const int32_t *f,
39
+ int bw, int ref_bonus, float chn_pen_gap)
40
+ {
41
+ const mg_lchain_t *lj;
42
+ int32_t gap, sc, segi, segj;
43
+ float lin_pen, log_pen;
44
+ if (dj->n_path == 0) return INT32_MIN;
45
+ segi = (an[li->off].y & MG_SEED_SEG_MASK) >> MG_SEED_SEG_SHIFT;
46
+ gap = dj->dist - dj->target_dist;
47
+ lj = &lc[a[dj->meta].i];
48
+ segj = (an[lj->off + lj->cnt - 1].y & MG_SEED_SEG_MASK) >> MG_SEED_SEG_SHIFT;
49
+ if (gap < 0) gap = -gap;
50
+ if (segi == segj && gap > bw) return INT32_MIN;
51
+ if (lj->qe <= li->qs) sc = li->score;
52
+ else sc = (int32_t)((double)(li->qe - lj->qe) / (li->qe - li->qs) * li->score + .499); // dealing with overlap on query
53
+ //sc += dj->mlen; // TODO: is this line the right thing to do?
54
+ if (dj->is_0) sc += ref_bonus;
55
+ lin_pen = chn_pen_gap * (float)gap;
56
+ log_pen = gap >= 2? mg_log2(gap) : 0.0f;
57
+ sc -= (int32_t)(lin_pen + log_pen);
58
+ sc += f[dj->meta];
59
+ return sc;
60
+ }
61
+
62
+ int32_t mg_gchain1_dp(void *km, const gfa_t *g, int32_t *n_lc_, mg_lchain_t *lc, int32_t qlen, int32_t max_dist_g, int32_t max_dist_q, int32_t bw, int32_t max_skip,
63
+ int32_t ref_bonus, float chn_pen_gap, float chn_pen_skip, float mask_level, const mg128_t *an, uint64_t **u_)
64
+ {
65
+ int32_t i, j, k, m_dst, n_dst, n_ext, n_u, n_v, n_lc = *n_lc_;
66
+ int32_t *f, *v, *t;
67
+ int64_t *p;
68
+ uint64_t *u;
69
+ mg_path_dst_t *dst;
70
+ gc_frag_t *a;
71
+ mg_lchain_t *swap;
72
+ char *qs;
73
+
74
+ *u_ = 0;
75
+ if (n_lc == 0) return 0;
76
+
77
+ KMALLOC(km, a, n_lc);
78
+ for (i = n_ext = 0; i < n_lc; ++i) { // a[] is a view of frag[]; for sorting
79
+ mg_lchain_t *r = &lc[i];
80
+ gc_frag_t *ai = &a[i];
81
+ int32_t is_isolated = 0, min_end_dist_g;
82
+ r->dist_pre = -1;
83
+ min_end_dist_g = g->seg[r->v>>1].len - r->re;
84
+ if (r->rs < min_end_dist_g) min_end_dist_g = r->rs;
85
+ if (min_end_dist_g > max_dist_g) is_isolated = 1; // if too far from segment ends
86
+ else if (min_end_dist_g>>3 > r->score) is_isolated = 1; // if the lchain too small relative to distance to the segment ends
87
+ ai->srt = (uint32_t)is_isolated<<31 | r->qe;
88
+ ai->i = i;
89
+ if (!is_isolated) ++n_ext;
90
+ }
91
+ if (n_ext < 2) { // no graph chaining needed; early return
92
+ kfree(km, a);
93
+ KMALLOC(km, u, n_lc);
94
+ for (i = 0; i < n_lc; ++i)
95
+ u[i] = (uint64_t)lc[i].score<<32 | 1;
96
+ *u_ = u;
97
+ return n_lc;
98
+ }
99
+ radix_sort_gc(a, a + n_lc);
100
+
101
+ KMALLOC(km, v, n_lc);
102
+ KMALLOC(km, f, n_ext);
103
+ KMALLOC(km, p, n_ext);
104
+ KCALLOC(km, t, n_ext);
105
+
106
+ KMALLOC(km, qs, max_dist_q + 1);
107
+ m_dst = n_dst = 0, dst = 0;
108
+ for (i = 0; i < n_ext; ++i) { // core loop
109
+ gc_frag_t *ai = &a[i];
110
+ mg_lchain_t *li = &lc[ai->i];
111
+ int32_t segi = (an[li->off].y & MG_SEED_SEG_MASK) >> MG_SEED_SEG_SHIFT;
112
+ { // collect end points potentially reachable from _i_
113
+ int32_t x = li->qs + bw, n_skip = 0;
114
+ if (x > qlen) x = qlen;
115
+ x = find_max(i, a, x);
116
+ n_dst = 0;
117
+ for (j = x; j >= 0; --j) { // collect potential destination vertices
118
+ gc_frag_t *aj = &a[j];
119
+ mg_lchain_t *lj = &lc[aj->i];
120
+ mg_path_dst_t *q;
121
+ int32_t target_dist, segj, dq;
122
+ if (lj->qs >= li->qs) continue; // lj is contained in li on the query coordinate
123
+ if (lj->qe > li->qs) { // test overlap on the query
124
+ int o = lj->qe - li->qs;
125
+ if (o > (lj->qe - lj->qs) * mask_level || o > (li->qe - li->qs) * mask_level)
126
+ continue;
127
+ }
128
+ dq = li->qs - lj->qe;
129
+ segj = (an[lj->off + lj->cnt - 1].y & MG_SEED_SEG_MASK) >> MG_SEED_SEG_SHIFT;
130
+ if (segi == segj) {
131
+ if (dq > max_dist_q) break; // if query gap too large, stop
132
+ } else {
133
+ if (dq > max_dist_g && dq > max_dist_q) break;
134
+ }
135
+ if (li->v != lj->v) { // the two linear chains are on two different segments
136
+ int32_t min_dist = li->rs + (g->seg[lj->v>>1].len - lj->re); // minimal graph gap
137
+ if (min_dist > max_dist_g) continue; // graph gap too large
138
+ if (segi == segj && min_dist - bw > li->qs - lj->qe) continue; // when li->qs < lj->qe, the condition turns to min_dist + (lj->qe - li->qs) > bw, which is desired
139
+ target_dist = mg_target_dist(g, lj, li);
140
+ if (target_dist < 0) continue; // this may happen if the query overlap is far too large
141
+ } else if (lj->rs >= li->rs || lj->re >= li->re) { // not colinear
142
+ continue;
143
+ } else {
144
+ int32_t dr = li->rs - lj->re, w = dr > dq? dr - dq : dq - dr;
145
+ if (segi == segj && w > bw) continue; // test bandwidth
146
+ if (dr > max_dist_g || dr < -max_dist_g) continue;
147
+ if (lj->re > li->rs) { // test overlap on the graph segment
148
+ int o = lj->re - li->rs;
149
+ if (o > (lj->re - lj->rs) * mask_level || o > (li->re - li->rs) * mask_level)
150
+ continue;
151
+ }
152
+ target_dist = mg_target_dist(g, lj, li);
153
+ }
154
+ if (n_dst == m_dst) KEXPAND(km, dst, m_dst); // TODO: watch out the quadratic behavior!
155
+ q = &dst[n_dst++];
156
+ memset(q, 0, sizeof(mg_path_dst_t));
157
+ q->inner = (li->v == lj->v);
158
+ q->v = lj->v^1;
159
+ q->meta = j;
160
+ q->qlen = li->qs - lj->qe;
161
+ q->target_dist = target_dist;
162
+ q->target_hash = 0;
163
+ q->check_hash = 0;
164
+ if (t[j] == i) {
165
+ if (++n_skip > max_skip)
166
+ break;
167
+ }
168
+ if (p[j] >= 0) t[p[j]] = i;
169
+ }
170
+ }
171
+ { // confirm reach-ability
172
+ int32_t k;
173
+ // test reach-ability without sequences
174
+ mg_shortest_k(km, g, li->v^1, n_dst, dst, max_dist_g + (g->seg[li->v>>1].len - li->rs), MG_MAX_SHORT_K, 0);
175
+ // remove unreachable destinations
176
+ for (j = k = 0; j < n_dst; ++j) {
177
+ mg_path_dst_t *dj = &dst[j];
178
+ int32_t sc;
179
+ if (dj->n_path == 0) continue; // not reachable
180
+ sc = cal_sc(dj, li, lc, an, a, f, bw, ref_bonus, chn_pen_gap);
181
+ if (sc == INT32_MIN) continue; // out of band
182
+ if (sc + li->score < 0) continue; // negative score and too low
183
+ dst[k++] = dst[j];
184
+ }
185
+ n_dst = k;
186
+ }
187
+ { // DP
188
+ int32_t max_f = li->score, max_j = -1, max_d = -1, max_inner = 0;
189
+ uint32_t max_hash = 0;
190
+ for (j = 0; j < n_dst; ++j) {
191
+ mg_path_dst_t *dj = &dst[j];
192
+ int32_t sc;
193
+ sc = cal_sc(dj, li, lc, an, a, f, bw, ref_bonus, chn_pen_gap);
194
+ if (sc == INT32_MIN) continue;
195
+ if (mg_dbg_flag & MG_DBG_GC1) {
196
+ mg_lchain_t *lj = &lc[a[dj->meta].i];
197
+ fprintf(stderr, " [dst:%d] dst=%c%s[%d], n_path=%d, target=%d, opt_dist=%d, score=%d, q_intv=[%d,%d), g_intv=[%d,%d)\n", dj->meta, "><"[dj->v&1], g->seg[dj->v>>1].name, dj->v, dj->n_path, dj->target_dist - g->seg[li->v>>1].len, dj->dist - g->seg[li->v>>1].len, sc, lj->qs, lj->qe, lj->rs, lj->re);
198
+ }
199
+ if (sc > max_f) max_f = sc, max_j = dj->meta, max_d = dj->dist, max_hash = dj->hash, max_inner = dj->inner;
200
+ }
201
+ f[i] = max_f, p[i] = max_j;
202
+ li->dist_pre = max_d;
203
+ li->hash_pre = max_hash;
204
+ li->inner_pre = max_inner;
205
+ v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f;
206
+ if (mg_dbg_flag & MG_DBG_GC1) fprintf(stderr, " [opt:%d] opt=%d, max_f=%d\n", ai->i, max_j, max_f);
207
+ }
208
+ }
209
+ kfree(km, dst);
210
+ kfree(km, qs);
211
+ if (mg_dbg_flag & MG_DBG_GC1) {
212
+ int32_t mmax_f = 0, mmax_i = -1;
213
+ for (i = 0; i < n_ext; ++i) if (f[i] > mmax_f) mmax_f = f[i], mmax_i = i;
214
+ i = mmax_i; while (i >= 0) { fprintf(stderr, "[best] i=%d, seg=%s, max_f=%d, chn_pen_gap=%f\n", a[i].i, g->seg[lc[a[i].i].v>>1].name, f[i], chn_pen_gap); i = p[i]; }
215
+ }
216
+
217
+ u = mg_chain_backtrack(km, n_ext, f, p, v, t, 0, 0, INT32_MAX, n_lc - n_ext, &n_u, &n_v);
218
+ kfree(km, f); kfree(km, p); kfree(km, t);
219
+
220
+ for (i = 0; i < n_lc - n_ext; ++i) {
221
+ u[n_u++] = (uint64_t)lc[a[n_ext + i].i].score << 32 | 1;
222
+ v[n_v++] = n_ext + i;
223
+ }
224
+
225
+ KMALLOC(km, swap, n_v);
226
+ for (i = 0, k = 0; i < n_u; ++i) {
227
+ int32_t k0 = k, ni = (int32_t)u[i];
228
+ for (j = 0; j < ni; ++j)
229
+ swap[k++] = lc[a[v[k0 + (ni - j - 1)]].i];
230
+ }
231
+ assert(k == n_v);
232
+ memcpy(lc, swap, n_v * sizeof(mg_lchain_t));
233
+ *n_lc_ = n_v;
234
+ *u_ = u;
235
+
236
+ kfree(km, a);
237
+ kfree(km, swap);
238
+ kfree(km, v);
239
+ return n_u;
240
+ }
241
+
242
+ void mg_gchain_extra(const gfa_t *g, mg_gchains_t *gs)
243
+ {
244
+ int32_t i, j, k;
245
+ for (i = 0; i < gs->n_gc; ++i) { // iterate over gchains
246
+ mg_gchain_t *p = &gs->gc[i];
247
+ const mg_llchain_t *q;
248
+ const mg128_t *last_a;
249
+ int32_t q_span, rest_pl, tmp, n_mini;
250
+
251
+ p->qs = p->qe = p->ps = p->pe = -1, p->plen = p->blen = p->mlen = 0, p->div = -1.0f;
252
+ if (p->cnt == 0) continue;
253
+
254
+ assert(gs->lc[p->off].cnt > 0 && gs->lc[p->off + p->cnt - 1].cnt > 0); // first and last lchains can't be empty
255
+ q = &gs->lc[p->off];
256
+ q_span = (int32_t)(gs->a[q->off].y>>32&0xff);
257
+ p->qs = (int32_t)gs->a[q->off].y + 1 - q_span;
258
+ p->ps = (int32_t)gs->a[q->off].x + 1 - q_span;
259
+ tmp = (int32_t)(gs->a[q->off].x>>32);
260
+ assert(p->qs >= 0 && p->ps >= 0);
261
+ q = &gs->lc[p->off + p->cnt - 1];
262
+ p->qe = (int32_t)gs->a[q->off + q->cnt - 1].y + 1;
263
+ p->pe = g->seg[q->v>>1].len - (int32_t)gs->a[q->off + q->cnt - 1].x - 1; // this is temporary
264
+ n_mini = (int32_t)(gs->a[q->off + q->cnt - 1].x>>32) - tmp + 1;
265
+ assert(p->n_anchor > 0);
266
+
267
+ rest_pl = 0; // this value is never used if the first lchain is not empty (which should always be true)
268
+ last_a = &gs->a[gs->lc[p->off].off];
269
+ for (j = 0; j < p->cnt; ++j) { // iterate over lchains
270
+ const mg_llchain_t *q = &gs->lc[p->off + j];
271
+ int32_t vlen = g->seg[q->v>>1].len;
272
+ p->plen += vlen;
273
+ for (k = 0; k < q->cnt; ++k) { // iterate over anchors
274
+ const mg128_t *r = &gs->a[q->off + k];
275
+ int32_t pl, ql = (int32_t)r->y - (int32_t)last_a->y;
276
+ int32_t span = (int32_t)(r->y>>32&0xff);
277
+ if (j == 0 && k == 0) { // the first anchor on the first lchain
278
+ pl = ql = span;
279
+ } else if (j > 0 && k == 0) { // the first anchor but not on the first lchain
280
+ pl = (int32_t)r->x + 1 + rest_pl;
281
+ } else {
282
+ pl = (int32_t)r->x - (int32_t)last_a->x;
283
+ }
284
+ if (ql < 0) ql = -ql, n_mini += (int32_t)(last_a->x>>32) - (int32_t)(r->x>>32); // dealing with overlapping query at junctions
285
+ p->blen += pl > ql? pl : ql;
286
+ p->mlen += pl > span && ql > span? span : pl < ql? pl : ql;
287
+ last_a = r;
288
+ }
289
+ if (q->cnt == 0) rest_pl += vlen;
290
+ else rest_pl = vlen - (int32_t)gs->a[q->off + q->cnt - 1].x - 1;
291
+ }
292
+ p->pe = p->plen - p->pe;
293
+ assert(p->pe >= p->ps);
294
+ // here n_mini >= p->n_anchor should stand almost all the time
295
+ p->div = n_mini >= p->n_anchor? log((double)n_mini / p->n_anchor) / q_span : log((double)p->n_anchor / n_mini) / q_span;
296
+ }
297
+ }
298
+
299
+ /*
300
+ * Generate graph chains
301
+ */
302
+ typedef struct { // working state shared by the bridge_*() helpers while mg_gchain_gen() assembles graph chains
303
+ void *km; // allocator handle used for llc[] and for temporary buffers
304
+ const gfa_t *g; // the graph
305
+ const gfa_edseq_t *es; // per-vertex sequences, handed to gfa_ed_init()
306
+ const char *qseq; // query sequence, indexed by query coordinate in bridge_gwfa()
307
+ int32_t n_seg, n_llc, m_llc, n_a; // n_seg: caller-supplied segment count (bridge_gwfa() is skipped when > 1); n_llc/m_llc: used size and capacity of llc[]; n_a: anchors copied into the output so far
308
+ mg_llchain_t *llc; // growable array of lchain records produced so far
309
+ } bridge_aux_t;
310
+
311
+ static inline void copy_lchain(mg_llchain_t *q, const mg_lchain_t *p, int32_t *n_a, mg128_t *a_new, const mg128_t *a_old, int32_t ed)
312
+ {
313
+ q->cnt = p->cnt, q->v = p->v, q->score = p->score, q->ed = ed;
314
+ memcpy(&a_new[*n_a], &a_old[p->off], q->cnt * sizeof(mg128_t));
315
+ q->off = *n_a;
316
+ (*n_a) += q->cnt;
317
+ }
318
+
319
+ static int32_t bridge_shortk(bridge_aux_t *aux, const mg_lchain_t *l0, const mg_lchain_t *l1)
320
+ {
321
+ int32_t s, n_pathv;
322
+ mg_path_dst_t dst;
323
+ mg_pathv_t *p;
324
+ memset(&dst, 0, sizeof(mg_path_dst_t));
325
+ dst.v = l0->v ^ 1;
326
+ assert(l1->dist_pre >= 0);
327
+ dst.target_dist = l1->dist_pre;
328
+ dst.target_hash = l1->hash_pre;
329
+ dst.check_hash = 1;
330
+ p = mg_shortest_k(aux->km, aux->g, l1->v^1, 1, &dst, dst.target_dist, MG_MAX_SHORT_K, &n_pathv);
331
+ if (n_pathv == 0 || dst.target_hash != dst.hash) {
332
+ fprintf(stderr, "[W::%s] %c%s[%d] -> %c%s[%d], dist=%d, target_dist=%d; chain skiped.\n", __func__, "><"[(l1->v^1)&1], aux->g->seg[l1->v>>1].name, l1->v^1, "><"[(l0->v^1)&1],
333
+ aux->g->seg[l0->v>>1].name, l0->v^1, dst.dist, dst.target_dist);
334
+ kfree(aux->km, p);
335
+ return -1;
336
+ }
337
+ for (s = n_pathv - 2; s >= 1; --s) { // path found in a backward way, so we need to reverse it
338
+ mg_llchain_t *q;
339
+ if (aux->n_llc == aux->m_llc) KEXPAND(aux->km, aux->llc, aux->m_llc);
340
+ q = &aux->llc[aux->n_llc++];
341
+ q->off = q->cnt = q->score = 0;
342
+ q->v = p[s].v^1; // when reversing a path, we also need to flip the orientation
343
+ q->ed = -1;
344
+ }
345
+ kfree(aux->km, p);
346
+ return 0;
347
+ }
348
+
349
+ static int32_t bridge_gwfa(bridge_aux_t *aux, int32_t kmer_size, int32_t gdp_max_ed, const mg_lchain_t *l0, const mg_lchain_t *l1, int32_t *ed)
350
+ {
351
+ uint32_t v0 = l0->v, v1 = l1->v;
352
+ int32_t qs = l0->qe - kmer_size, qe = l1->qs + kmer_size, end0, end1, j;
353
+ void *z;
354
+ gfa_edopt_t opt;
355
+ gfa_edrst_t r;
356
+
357
+ *ed = -1;
358
+ end0 = l0->re - kmer_size;
359
+ end1 = l1->rs + kmer_size - 1;
360
+
361
+ gfa_edopt_init(&opt);
362
+ opt.traceback = 1, opt.max_chk = 1000, opt.bw_dyn = 1000, opt.max_lag = gdp_max_ed/2;
363
+ opt.i_term = 500000000LL;
364
+ z = gfa_ed_init(aux->km, &opt, aux->g, aux->es, qe - qs, &aux->qseq[qs], v0, end0);
365
+ gfa_ed_step(z, v1, end1, gdp_max_ed, &r);
366
+ gfa_ed_destroy(z);
367
+ //fprintf(stdout, "qs=%d,qe=%d,v0=%c%s:%d:%d,v1=%c%s:%d,s=%d,nv=%d\n", qs, qe, "><"[v0&1], aux->g->seg[v0>>1].name, end0, aux->g->seg[v0>>1].len - end0 - 1, "><"[v1&1], aux->g->seg[v1>>1].name, end1, r.s, r.nv);
368
+ if (r.s < 0) return 0;
369
+
370
+ for (j = 1; j < r.nv - 1; ++j) {
371
+ mg_llchain_t *q;
372
+ if (aux->n_llc == aux->m_llc) KEXPAND(aux->km, aux->llc, aux->m_llc);
373
+ q = &aux->llc[aux->n_llc++];
374
+ q->off = q->cnt = q->score = 0;
375
+ q->v = r.v[j];
376
+ q->ed = -1;
377
+ }
378
+ kfree(aux->km, r.v);
379
+ *ed = r.s;
380
+ return 1;
381
+ }
382
+
383
+ static int32_t bridge_lchains(mg_gchains_t *gc, bridge_aux_t *aux, int32_t kmer_size, int32_t gdp_max_ed, const mg_lchain_t *l0, const mg_lchain_t *l1, const mg128_t *a)
384
+ {
385
+ if (l1->v != l0->v) { // bridging two segments
386
+ int32_t ed = -1, ret = 0;
387
+ if (aux->n_seg > 1 || !bridge_gwfa(aux, kmer_size, gdp_max_ed, l0, l1, &ed))
388
+ ret = bridge_shortk(aux, l0, l1);
389
+ if (ret < 0) return -1;
390
+ if (aux->n_llc == aux->m_llc) KEXPAND(aux->km, aux->llc, aux->m_llc);
391
+ copy_lchain(&aux->llc[aux->n_llc++], l1, &aux->n_a, gc->a, a, ed);
392
+ } else { // on one segment
393
+ int32_t k;
394
+ mg_llchain_t *t = &aux->llc[aux->n_llc - 1];
395
+ for (k = 0; k < l1->cnt; ++k) { // FIXME: this part is made redundant by resolve_overlap()
396
+ const mg128_t *ak = &a[l1->off + k];
397
+ if ((int32_t)ak->x > l0->re && (int32_t)ak->y > l0->qe)
398
+ break;
399
+ }
400
+ if (k < l1->cnt) { // l1 contained. TODO: check what is happening...
401
+ t->cnt += l1->cnt - k, t->score += l1->score;
402
+ memcpy(&gc->a[aux->n_a], &a[l1->off + k], (l1->cnt - k) * sizeof(mg128_t));
403
+ aux->n_a += l1->cnt - k;
404
+ }
405
+ }
406
+ return 0;
407
+ }
408
+
409
+ static void resolve_overlap(mg_lchain_t *l0, mg_lchain_t *l1, const mg128_t *a)
410
+ {
411
+ int32_t j, x, y, shift0, shift1;
412
+ // check the end of l0
413
+ x = (int32_t)a[l1->off].x;
414
+ y = (int32_t)a[l1->off].y;
415
+ for (j = l0->cnt - 1; j >= 0; --j)
416
+ if ((int32_t)a[l0->off + j].y <= y && (l0->v != l1->v || (int32_t)a[l0->off + j].x <= x))
417
+ break;
418
+ shift0 = l0->cnt - 1 - j;
419
+ // check the start of l1
420
+ x = (int32_t)a[l0->off + l0->cnt - 1].x;
421
+ y = (int32_t)a[l0->off + l0->cnt - 1].y;
422
+ for (j = 0; j < l1->cnt; ++j)
423
+ if ((int32_t)a[l1->off + j].y >= y && (l0->v != l1->v || (int32_t)a[l1->off + j].x >= x))
424
+ break;
425
+ shift1 = j;
426
+ assert(shift1 < l1->cnt); // this should never happen, or it is a bug
427
+ // update
428
+ if (shift0 > 0) {
429
+ l0->cnt -= shift0;
430
+ if (l0->cnt) { // l0->cnt may be 0 as the start of l0 may be changed and go into l1
431
+ l0->qe = (int32_t)a[l0->off + l0->cnt - 1].y + 1;
432
+ l0->re = (int32_t)a[l0->off + l0->cnt - 1].x + 1;
433
+ }
434
+ }
435
+ if (shift1 > 0) {
436
+ l1->off += shift1, l1->cnt -= shift1;
437
+ l1->qs = (int32_t)a[l1->off].y + 1 - (int32_t)(a[l1->off].y>>32&0xff);
438
+ l1->rs = (int32_t)a[l1->off].x + 1 - (int32_t)(a[l1->off].y>>32&0xff);
439
+ }
440
+ if (l0->cnt == 0) l0->qs = l0->qe = l1->qs, l0->rs = l0->re = l1->rs; // this line should have no effect
441
+ }
442
+
443
+ mg_gchains_t *mg_gchain_gen(void *km_dst, void *km, const gfa_t *g, const gfa_edseq_t *es, int32_t n_u, const uint64_t *u,
444
+ mg_lchain_t *lc, const mg128_t *a, uint32_t hash, int32_t min_gc_cnt, int32_t min_gc_score,
445
+ int32_t gdp_max_ed, int32_t n_seg, const char *qseq)
446
+ {
447
+ mg_gchains_t *gc;
448
+ int32_t i, j, k, st, kmer_size;
449
+ bridge_aux_t aux;
450
+
451
+ // preallocate gc->gc and gc->a
452
+ KCALLOC(km_dst, gc, 1);
453
+ for (i = 0, st = 0; i < n_u; ++i) {
454
+ int32_t m = 0, nui = (int32_t)u[i];
455
+ for (j = 0; j < nui; ++j) m += lc[st + j].cnt; // m is the number of anchors in this gchain
456
+ if (m >= min_gc_cnt && u[i]>>32 >= min_gc_score)
457
+ gc->n_gc++, gc->n_a += m;
458
+ st += nui;
459
+ }
460
+ if (gc->n_gc == 0) return gc;
461
+ gc->km = km_dst;
462
+ KCALLOC(km_dst, gc->gc, gc->n_gc);
463
+ KMALLOC(km_dst, gc->a, gc->n_a);
464
+
465
+ // core loop
466
+ memset(&aux, 0, sizeof(aux));
467
+ aux.km = km, aux.g = g, aux.es = es, aux.n_seg = n_seg, aux.qseq = qseq;
468
+ kmer_size = a[0].y>>32&0xff;
469
+ for (i = k = 0, st = 0, aux.n_a = 0; i < n_u; ++i) {
470
+ int32_t n_a0 = aux.n_a, n_llc0 = aux.n_llc, m = 0, nui = (int32_t)u[i];
471
+ for (j = 0; j < nui; ++j) m += lc[st + j].cnt;
472
+ if (m >= min_gc_cnt && u[i]>>32 >= min_gc_score) {
473
+ uint32_t h = hash;
474
+ int32_t j0;
475
+ gc->gc[k].score = u[i]>>32;
476
+ gc->gc[k].off = n_llc0;
477
+ for (j = 0; j < nui; ++j) {
478
+ const mg_lchain_t *p = &lc[st + j];
479
+ h += kh_hash_uint32(p->qs) + kh_hash_uint32(p->re) + kh_hash_uint32(p->v);
480
+ }
481
+ gc->gc[k].hash = kh_hash_uint32(h);
482
+
483
+ for (j = 1; j < nui; ++j)
484
+ resolve_overlap(&lc[st + j - 1], &lc[st + j], a);
485
+
486
+ if (aux.n_llc == aux.m_llc) KEXPAND(aux.km, aux.llc, aux.m_llc);
487
+ copy_lchain(&aux.llc[aux.n_llc++], &lc[st], &aux.n_a, gc->a, a, -1); // copy the first lchain
488
+ for (j0 = 0, j = 1; j < nui; ++j) {
489
+ const mg_lchain_t *l0 = &lc[st + j0], *l1 = &lc[st + j];
490
+ if (l1->cnt > 0) {
491
+ int32_t ret, t;
492
+ ret = bridge_lchains(gc, &aux, kmer_size, gdp_max_ed, l0, l1, a);
493
+ if (ret < 0) {
494
+ for (t = j0; t < j; ++t) {
495
+ ret = bridge_lchains(gc, &aux, kmer_size, gdp_max_ed, &lc[st + t], &lc[st + t + 1], a);
496
+ assert(ret >= 0);
497
+ }
498
+ }
499
+ j0 = j;
500
+ }
501
+ }
502
+
503
+ gc->gc[k].cnt = aux.n_llc - n_llc0;
504
+ gc->gc[k].n_anchor = aux.n_a - n_a0;
505
+ ++k;
506
+ }
507
+ st += nui;
508
+ }
509
+ assert(aux.n_a <= gc->n_a);
510
+
511
+ gc->n_a = aux.n_a;
512
+ gc->n_lc = aux.n_llc;
513
+ KMALLOC(km_dst, gc->lc, aux.n_llc);
514
+ memcpy(gc->lc, aux.llc, aux.n_llc * sizeof(mg_llchain_t));
515
+ kfree(km, aux.llc);
516
+
517
+ mg_gchain_extra(g, gc);
518
+ mg_gchain_sort_by_score(km, gc);
519
+ return gc;
520
+ }
521
+
522
+ void mg_gchain_free(mg_gchains_t *gs)
523
+ {
524
+ void *km;
525
+ int32_t i;
526
+ if (gs == 0) return;
527
+ km = gs->km;
528
+ for (i = 0; i < gs->n_gc; ++i)
529
+ if (gs->gc[i].p) kfree(km, gs->gc[i].p);
530
+ kfree(km, gs->gc); kfree(km, gs->a); kfree(km, gs->lc);
531
+ kfree(km, gs);
532
+ }