ruby-minigraph 0.0.20.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +62 -0
  4. data/ext/Rakefile +56 -0
  5. data/ext/cmappy/cmappy.c +7 -0
  6. data/ext/cmappy/cmappy.h +8 -0
  7. data/ext/minigraph/LICENSE.txt +23 -0
  8. data/ext/minigraph/Makefile +66 -0
  9. data/ext/minigraph/NEWS.md +317 -0
  10. data/ext/minigraph/README.md +207 -0
  11. data/ext/minigraph/algo.c +194 -0
  12. data/ext/minigraph/algo.h +33 -0
  13. data/ext/minigraph/asm-call.c +147 -0
  14. data/ext/minigraph/bseq.c +133 -0
  15. data/ext/minigraph/bseq.h +76 -0
  16. data/ext/minigraph/cal_cov.c +139 -0
  17. data/ext/minigraph/doc/example1.png +0 -0
  18. data/ext/minigraph/doc/example2.png +0 -0
  19. data/ext/minigraph/doc/examples.graffle +0 -0
  20. data/ext/minigraph/format.c +241 -0
  21. data/ext/minigraph/galign.c +140 -0
  22. data/ext/minigraph/gchain1.c +532 -0
  23. data/ext/minigraph/gcmisc.c +223 -0
  24. data/ext/minigraph/gfa-aug.c +260 -0
  25. data/ext/minigraph/gfa-base.c +526 -0
  26. data/ext/minigraph/gfa-bbl.c +372 -0
  27. data/ext/minigraph/gfa-ed.c +617 -0
  28. data/ext/minigraph/gfa-io.c +395 -0
  29. data/ext/minigraph/gfa-priv.h +154 -0
  30. data/ext/minigraph/gfa.h +166 -0
  31. data/ext/minigraph/ggen.c +182 -0
  32. data/ext/minigraph/ggen.h +21 -0
  33. data/ext/minigraph/ggsimple.c +570 -0
  34. data/ext/minigraph/gmap.c +211 -0
  35. data/ext/minigraph/index.c +230 -0
  36. data/ext/minigraph/kalloc.c +224 -0
  37. data/ext/minigraph/kalloc.h +82 -0
  38. data/ext/minigraph/kavl.h +414 -0
  39. data/ext/minigraph/kdq.h +134 -0
  40. data/ext/minigraph/ketopt.h +116 -0
  41. data/ext/minigraph/khashl.h +348 -0
  42. data/ext/minigraph/krmq.h +474 -0
  43. data/ext/minigraph/kseq.h +256 -0
  44. data/ext/minigraph/ksort.h +164 -0
  45. data/ext/minigraph/kstring.h +165 -0
  46. data/ext/minigraph/kthread.c +159 -0
  47. data/ext/minigraph/kthread.h +15 -0
  48. data/ext/minigraph/kvec-km.h +105 -0
  49. data/ext/minigraph/kvec.h +110 -0
  50. data/ext/minigraph/lchain.c +441 -0
  51. data/ext/minigraph/main.c +301 -0
  52. data/ext/minigraph/map-algo.c +500 -0
  53. data/ext/minigraph/mgpriv.h +128 -0
  54. data/ext/minigraph/minigraph.1 +359 -0
  55. data/ext/minigraph/minigraph.h +176 -0
  56. data/ext/minigraph/miniwfa.c +834 -0
  57. data/ext/minigraph/miniwfa.h +95 -0
  58. data/ext/minigraph/misc/mgutils.js +1451 -0
  59. data/ext/minigraph/misc.c +12 -0
  60. data/ext/minigraph/options.c +134 -0
  61. data/ext/minigraph/shortk.c +251 -0
  62. data/ext/minigraph/sketch.c +109 -0
  63. data/ext/minigraph/sys.c +147 -0
  64. data/ext/minigraph/sys.h +20 -0
  65. data/ext/minigraph/test/MT-chimp.fa +277 -0
  66. data/ext/minigraph/test/MT-human.fa +239 -0
  67. data/ext/minigraph/test/MT-orangA.fa +276 -0
  68. data/ext/minigraph/test/MT.gfa +19 -0
  69. data/ext/minigraph/tex/Makefile +13 -0
  70. data/ext/minigraph/tex/minigraph.bib +676 -0
  71. data/ext/minigraph/tex/minigraph.tex +986 -0
  72. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
  73. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
  74. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
  75. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
  76. data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
  77. data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
  78. data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
  79. data/ext/minigraph/tex/plots/bedutils.js +367 -0
  80. data/ext/minigraph/tex/plots/chr-plot.js +130 -0
  81. data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
  82. data/ext/minigraph.patch +21 -0
  83. data/lib/minigraph/ffi/constants.rb +230 -0
  84. data/lib/minigraph/ffi/functions.rb +70 -0
  85. data/lib/minigraph/ffi/mappy.rb +8 -0
  86. data/lib/minigraph/ffi.rb +27 -0
  87. data/lib/minigraph/version.rb +5 -0
  88. data/lib/minigraph.rb +72 -0
  89. metadata +159 -0
@@ -0,0 +1,223 @@
1
+ #include <math.h>
2
+ #include <assert.h>
3
+ #include <string.h>
4
+ #include "mgpriv.h"
5
+ #include "kalloc.h"
6
+
7
+ // reorder gcs->a[] and gcs->lc[] such that they are in the same order as gcs->gc[]
8
+ void mg_gchain_restore_order(void *km, mg_gchains_t *gcs)
9
+ {
10
+ int32_t i, n_a, n_lc;
11
+ mg_llchain_t *lc;
12
+ mg128_t *a;
13
+ KMALLOC(km, lc, gcs->n_lc);
14
+ KMALLOC(km, a, gcs->n_a);
15
+ for (i = 0, n_a = n_lc = 0; i < gcs->n_gc; ++i) {
16
+ mg_gchain_t *gc = &gcs->gc[i];
17
+ assert(gc->cnt > 0);
18
+ memcpy(&lc[n_lc], &gcs->lc[gc->off], gc->cnt * sizeof(mg_llchain_t));
19
+ memcpy(&a[n_a], &gcs->a[gcs->lc[gc->off].off], gc->n_anchor * sizeof(mg128_t));
20
+ n_lc += gc->cnt, n_a += gc->n_anchor;
21
+ }
22
+ memcpy(gcs->lc, lc, gcs->n_lc * sizeof(mg_llchain_t));
23
+ memcpy(gcs->a, a, gcs->n_a * sizeof(mg128_t));
24
+ kfree(km, lc); kfree(km, a);
25
+ for (i = 0, n_lc = 0; i < gcs->n_gc; ++i) {
26
+ mg_gchain_t *gc = &gcs->gc[i];
27
+ gc->off = n_lc;
28
+ n_lc += gc->cnt;
29
+ }
30
+ for (i = 0, n_a = 0; i < gcs->n_lc; ++i) {
31
+ mg_llchain_t *lc = &gcs->lc[i];
32
+ lc->off = n_a;
33
+ n_a += lc->cnt;
34
+ }
35
+ }
36
+
37
+ // recompute gcs->gc[].{off,n_anchor} and gcs->lc[].off, ASSUMING they are properly ordered (see mg_gchain_restore_order)
38
+ void mg_gchain_restore_offset(mg_gchains_t *gcs)
39
+ {
40
+ int32_t i, j, n_a, n_lc;
41
+ for (i = 0, n_a = n_lc = 0; i < gcs->n_gc; ++i) {
42
+ mg_gchain_t *gc = &gcs->gc[i];
43
+ gc->off = n_lc;
44
+ for (j = 0, gc->n_anchor = 0; j < gc->cnt; ++j) {
45
+ mg_llchain_t *lc = &gcs->lc[n_lc + j];
46
+ lc->off = n_a;
47
+ n_a += lc->cnt;
48
+ gc->n_anchor += lc->cnt;
49
+ }
50
+ n_lc += gc->cnt;
51
+ }
52
+ assert(n_lc == gcs->n_lc && n_a == gcs->n_a);
53
+ }
54
+
55
+ // sort chains by score
56
+ void mg_gchain_sort_by_score(void *km, mg_gchains_t *gcs)
57
+ {
58
+ mg128_t *z;
59
+ mg_gchain_t *gc;
60
+ int32_t i;
61
+ KMALLOC(km, z, gcs->n_gc);
62
+ KMALLOC(km, gc, gcs->n_gc);
63
+ for (i = 0; i < gcs->n_gc; ++i)
64
+ z[i].x = (uint64_t)gcs->gc[i].score << 32 | gcs->gc[i].hash, z[i].y = i;
65
+ radix_sort_128x(z, z + gcs->n_gc);
66
+ for (i = gcs->n_gc - 1; i >= 0; --i)
67
+ gc[gcs->n_gc - 1 - i] = gcs->gc[z[i].y];
68
+ memcpy(gcs->gc, gc, gcs->n_gc * sizeof(mg_gchain_t));
69
+ kfree(km, z); kfree(km, gc);
70
+ mg_gchain_restore_order(km, gcs); // this put gcs in the proper order
71
+ }
72
+
73
+ /* set r[].{id,parent,subsc}, ASSUMING r[] is sorted by score
   Classifies each of the n chains in r[] as primary (parent == own index) or
   secondary (parent == parent of the primary it overlaps on the query).
   r[0] is always primary. A later chain i becomes secondary to primary p when
       overlap(i,p)/min(len_i,len_p) - uncov_len/max(len_i,len_p) > mask_level
   where lengths and overlap are measured on the query interval [qs,qe) and
   uncov_len is the part of chain i not covered by any primary found so far.
   When hard_mask_level is non-zero the uncov_len term is skipped (stays 0).
   For a secondary, the primary's subsc is raised to the secondary's score and
   its n_sub is incremented if the secondary has at least as many linear chains.
   km is the kalloc pool for scratch arrays. sub_diff is not used in this body.
   NOTE(review): r[0].n_sub and r[].subsc are not reset here; presumably the
   caller zero-initializes them -- confirm. */
74
+ void mg_gchain_set_parent(void *km, float mask_level, int n, mg_gchain_t *r, int sub_diff, int hard_mask_level)
75
+ {
76
+ int i, j, k, *w; // w[0..k-1]: indices of the primary chains found so far
77
+ uint64_t *cov; // scratch: clipped overlap intervals packed as start<<32|end
78
+ if (n <= 0) return;
79
+ for (i = 0; i < n; ++i) r[i].id = i;
80
+ cov = (uint64_t*)kmalloc(km, n * sizeof(uint64_t));
81
+ w = (int*)kmalloc(km, n * sizeof(int));
82
+ w[0] = 0, r[0].parent = 0; // the top-scoring chain is the first primary
83
+ for (i = 1, k = 1; i < n; ++i) {
84
+ mg_gchain_t *ri = &r[i];
85
+ int si = ri->qs, ei = ri->qe, n_cov = 0, uncov_len = 0;
86
+ if (hard_mask_level) goto skip_uncov; // leave uncov_len at 0
87
+ for (j = 0; j < k; ++j) { // traverse existing primary hits to find overlapping hits
88
+ mg_gchain_t *rp = &r[w[j]];
89
+ int sj = rp->qs, ej = rp->qe;
90
+ if (ej <= si || sj >= ei) continue;
91
+ if (sj < si) sj = si; // clip the primary's interval to [si,ei)
92
+ if (ej > ei) ej = ei;
93
+ cov[n_cov++] = (uint64_t)sj<<32 | ej;
94
+ }
95
+ if (n_cov == 0) {
96
+ goto set_parent_test; // no overlapping primary hits; then i is a new primary hit (j == k here because the loop above ran to completion)
97
+ } else if (n_cov > 0) { // there are overlapping primary hits; find the length not covered by existing primary hits
98
+ int j, x = si; // inner j shadows the outer one; x is the right edge of the covered region so far
99
+ radix_sort_gfa64(cov, cov + n_cov); // sorts by start because start sits in the high bits
100
+ for (j = 0; j < n_cov; ++j) {
101
+ if ((int)(cov[j]>>32) > x) uncov_len += (cov[j]>>32) - x; // gap before this interval
102
+ x = (int32_t)cov[j] > x? (int32_t)cov[j] : x;
103
+ }
104
+ if (ei > x) uncov_len += ei - x; // uncovered tail
105
+ }
106
+ skip_uncov:
107
+ for (j = 0; j < k; ++j) { // traverse existing primary hits again
108
+ mg_gchain_t *rp = &r[w[j]];
109
+ int sj = rp->qs, ej = rp->qe, min, max, ol;
110
+ if (ej <= si || sj >= ei) continue; // no overlap
111
+ min = ej - sj < ei - si? ej - sj : ei - si; // shorter of the two query spans
112
+ max = ej - sj > ei - si? ej - sj : ei - si; // longer of the two query spans
113
+ ol = si < sj? (ei < sj? 0 : ei < ej? ei - sj : ej - sj) : (ej < si? 0 : ej < ei? ej - si : ei - si); // overlap length; TODO: this can be simplified
114
+ if ((float)ol / min - (float)uncov_len / max > mask_level) { // chain i is mostly masked by this primary
115
+ int cnt_sub = 0;
116
+ ri->parent = rp->parent;
117
+ rp->subsc = rp->subsc > ri->score? rp->subsc : ri->score; // keep the best secondary score
118
+ if (ri->cnt >= rp->cnt) cnt_sub = 1;
119
+ if (cnt_sub) ++rp->n_sub;
120
+ break; // leaves j < k, so the test below does not promote i to primary
121
+ }
122
+ }
123
+ set_parent_test:
124
+ if (j == k) w[k++] = i, ri->parent = i, ri->n_sub = 0; // no primary claimed chain i: it becomes a new primary
125
+ }
126
+ kfree(km, cov);
127
+ kfree(km, w);
128
+ }
129
+
130
+ // set r[].flt, i.e. mark weak suboptimal chains as filtered
131
+ int mg_gchain_flt_sub(float pri_ratio, int min_diff, int best_n, int n, mg_gchain_t *r)
132
+ {
133
+ if (pri_ratio > 0.0f && n > 0) {
134
+ int i, k, n_2nd = 0;
135
+ for (i = k = 0; i < n; ++i) {
136
+ int p = r[i].parent;
137
+ if (p == i) { // primary
138
+ r[i].flt = 0, ++k;
139
+ } else if ((r[i].score >= r[p].score * pri_ratio || r[i].score + min_diff >= r[p].score) && n_2nd < best_n) {
140
+ if (!(r[i].qs == r[p].qs && r[i].qe == r[p].qe && r[i].ps == r[p].ps && r[i].pe == r[p].pe)) // not identical hits; TODO: check path as well
141
+ r[i].flt = 0, ++n_2nd, ++k;
142
+ else r[i].flt = 1;
143
+ } else r[i].flt = 1;
144
+ }
145
+ return k;
146
+ }
147
+ return n;
148
+ }
149
+
150
+ // hard drop filtered chains, ASSUMING gcs is properly ordered
151
+ void mg_gchain_drop_flt(void *km, mg_gchains_t *gcs)
152
+ {
153
+ int32_t i, n_gc, n_lc, n_a, n_lc0, n_a0, *o2n;
154
+ if (gcs->n_gc == 0) return;
155
+ KMALLOC(km, o2n, gcs->n_gc);
156
+ for (i = 0, n_gc = 0; i < gcs->n_gc; ++i) {
157
+ mg_gchain_t *r = &gcs->gc[i];
158
+ o2n[i] = -1;
159
+ if (r->flt || r->cnt == 0) {
160
+ kfree(gcs->km, r->p);
161
+ continue;
162
+ }
163
+ o2n[i] = n_gc++;
164
+ }
165
+ n_gc = n_lc = n_a = 0;
166
+ n_lc0 = n_a0 = 0;
167
+ for (i = 0; i < gcs->n_gc; ++i) {
168
+ mg_gchain_t *r = &gcs->gc[i];
169
+ if (o2n[i] >= 0) {
170
+ memmove(&gcs->a[n_a], &gcs->a[n_a0], r->n_anchor * sizeof(mg128_t));
171
+ memmove(&gcs->lc[n_lc], &gcs->lc[n_lc0], r->cnt * sizeof(mg_llchain_t));
172
+ gcs->gc[n_gc] = *r;
173
+ gcs->gc[n_gc].id = n_gc;
174
+ gcs->gc[n_gc].parent = o2n[gcs->gc[n_gc].parent];
175
+ ++n_gc, n_lc += r->cnt, n_a += r->n_anchor;
176
+ }
177
+ n_lc0 += r->cnt, n_a0 += r->n_anchor;
178
+ }
179
+ assert(n_lc0 == gcs->n_lc && n_a0 == gcs->n_a);
180
+ kfree(km, o2n);
181
+ gcs->n_gc = n_gc, gcs->n_lc = n_lc, gcs->n_a = n_a;
182
+ if (n_a != n_a0) {
183
+ KREALLOC(gcs->km, gcs->a, gcs->n_a);
184
+ KREALLOC(gcs->km, gcs->lc, gcs->n_lc);
185
+ KREALLOC(gcs->km, gcs->gc, gcs->n_gc);
186
+ }
187
+ mg_gchain_restore_offset(gcs);
188
+ }
189
+
190
+ // estimate mapping quality
191
+ void mg_gchain_set_mapq(void *km, mg_gchains_t *gcs, int qlen, int max_mini, int min_gc_score)
192
+ {
193
+ static const float q_coef = 40.0f;
194
+ int64_t sum_sc = 0;
195
+ float uniq_ratio, r_sc, r_cnt;
196
+ int i, t_sc, t_cnt;
197
+ if (gcs == 0 || gcs->n_gc == 0) return;
198
+ t_sc = qlen < 100? qlen : 100;
199
+ t_cnt = max_mini < 10? max_mini : 10;
200
+ if (t_cnt < 5) t_cnt = 5;
201
+ r_sc = 1.0 / t_sc;
202
+ r_cnt = 1.0 / t_cnt;
203
+ for (i = 0; i < gcs->n_gc; ++i)
204
+ if (gcs->gc[i].parent == gcs->gc[i].id)
205
+ sum_sc += gcs->gc[i].score;
206
+ uniq_ratio = (float)sum_sc / (sum_sc + gcs->rep_len);
207
+ for (i = 0; i < gcs->n_gc; ++i) {
208
+ mg_gchain_t *r = &gcs->gc[i];
209
+ if (r->parent == r->id) {
210
+ int mapq, subsc;
211
+ float pen_s1 = (r->score > t_sc? 1.0f : r->score * r_sc) * uniq_ratio;
212
+ float x, pen_cm = r->n_anchor > t_cnt? 1.0f : r->n_anchor * r_cnt;
213
+ pen_cm = pen_s1 < pen_cm? pen_s1 : pen_cm;
214
+ subsc = r->subsc > min_gc_score? r->subsc : min_gc_score;
215
+ x = (float)subsc / r->score;
216
+ mapq = (int)(pen_cm * q_coef * (1.0f - x) * logf(r->score));
217
+ mapq -= (int)(4.343f * logf(r->n_sub + 1) + .499f);
218
+ mapq = mapq > 0? mapq : 0;
219
+ if (r->score > subsc && mapq == 0) mapq = 1;
220
+ r->mapq = mapq < 60? mapq : 60;
221
+ } else r->mapq = 0;
222
+ }
223
+ }
@@ -0,0 +1,260 @@
1
+ #include <assert.h>
2
+ #include <ctype.h>
3
+ #include "gfa-priv.h"
4
+ #include "ksort.h"
5
+
6
+ typedef struct { // one endpoint of an insertion, i.e. a place where an existing segment may need to be cut
7
+ uint32_t side; // (forward-strand offset in the segment)<<1 | side bit, as computed in gfa_augment()
8
+ uint32_t ins:31, end:1; // ins: index into the ins[] array; end: which end of that insertion (0 or 1)
9
+ } gfa_split_t;
10
+ // radix sort on gfa_split_t keyed by .side; instantiates radix_sort_split()
11
+ #define split_key(p) ((p).side)
12
+ KRADIX_SORT_INIT(split, gfa_split_t, split_key, 4)
13
+
14
+ static inline void create_first_arc_semi(gfa_t *g, const gfa_seg_t *seg, uint32_t v, uint32_t w, int32_t rank, uint64_t link_id, int is_comp)
15
+ {
16
+ gfa_arc_t *a;
17
+ if (g->n_arc == g->m_arc) GFA_EXPAND(g->arc, g->m_arc);
18
+ a = &g->arc[g->n_arc++];
19
+ a->v_lv = (uint64_t)v<<32 | seg[v>>1].len;
20
+ a->w = w;
21
+ a->rank = rank;
22
+ a->ov = a->ow = 0;
23
+ a->link_id = link_id;
24
+ a->del = 0;
25
+ a->comp = !!is_comp;
26
+ }
27
+
28
+ static inline void create_first_arc(gfa_t *g, const gfa_seg_t *seg, uint32_t v, uint32_t w, int32_t rank)
29
+ {
30
+ uint64_t link_id = g->n_arc;
31
+ create_first_arc_semi(g, seg, v, w, rank, link_id, 0);
32
+ create_first_arc_semi(g, seg, w^1, v^1, rank, link_id, 1);
33
+ }
34
+ /* Augment graph g with n_ins insertions described by ins[].
   Each insertion links two positions on existing segments (v[0]/voff[0] and
   v[1]/voff[1]) through the contig interval [coff[0],coff[1]) of sequence
   seq[ctg] named name[ctg]; an empty interval is a pure deletion.
   Steps: (1) collect all cut positions per segment; (2) split old segments at
   those positions, renaming every segment "s<index+1>"; (3) remap existing
   arcs onto the split pieces; (4) create one new segment per non-empty insert
   and the arcs attaching it; (5) replace g->seg, bump g->max_rank and rebuild
   the arc index. Does nothing when n_ins <= 0 or n_ctg <= 0.
   Assumes (asserted) that no endpoint falls on an unlinkable segment side;
   see gfa_ins_filter(). */
35
+ void gfa_augment(gfa_t *g, int32_t n_ins, const gfa_ins_t *ins, int32_t n_ctg, const char *const* name, const char *const* seq)
36
+ {
37
+ int32_t i, j, k, *scnt, *soff, n_ctg_seg, n_old_seg, n_seg;
38
+ gfa_split_t *sp; // all insertion endpoints, grouped by the old segment they fall on
39
+ gfa_seg_t *seg; // the new segment array that will replace g->seg
40
+ char buf[16]; // scratch for generated segment names
41
+ uint64_t t, n_old_arc = g->n_arc, *ins_side, *oldcnt;
42
+
43
+ if (n_ins <= 0 || n_ctg <= 0) return;
44
+
45
+ // set soff[]: soff[j] is the start in sp[] of the endpoints on old segment j (prefix sum of per-segment counts)
46
+ GFA_CALLOC(scnt, g->n_seg);
47
+ for (i = 0; i < n_ins; ++i)
48
+ ++scnt[ins[i].v[0]>>1], ++scnt[ins[i].v[1]>>1];
49
+ GFA_MALLOC(soff, g->n_seg + 1);
50
+ for (j = 1, soff[0] = 0; j <= g->n_seg; ++j)
51
+ soff[j] = soff[j-1] + scnt[j-1];
52
+
53
+ // populate sp[]; scnt[] is cleared and reused as the per-segment fill cursor
54
+ GFA_MALLOC(sp, soff[g->n_seg]);
55
+ GFA_BZERO(scnt, g->n_seg);
56
+ for (i = 0, n_ctg_seg = 0; i < n_ins; ++i) {
57
+ const gfa_ins_t *p = &ins[i];
58
+ for (k = 0; k < 2; ++k) {
59
+ uint32_t vlen = g->seg[p->v[k]>>1].len;
60
+ gfa_split_t *q = &sp[soff[p->v[k]>>1] + scnt[p->v[k]>>1]];
61
+ q->ins = i, q->end = k;
62
+ q->side = (p->v[k]&1? vlen - p->voff[k] : p->voff[k]) << 1 | ((p->v[k]&1) ^ k); // offset converted to forward-strand coordinate; low bit = strand bit XOR end index
63
+ assert(q->side != (0<<1|0) && q->side != (vlen<<1|1)); // not possible to link such sides
64
+ ++scnt[p->v[k]>>1];
65
+ }
66
+ if (p->coff[1] > p->coff[0]) // non-empty contig interval: this insert contributes a new segment
67
+ ++n_ctg_seg;
68
+ }
69
+ free(scnt);
70
+
71
+ // sort sp[] within each old segment by side (n_old_seg is reset here but only used from the next loop on)
72
+ for (j = 0, n_old_seg = 0; j < g->n_seg; ++j)
73
+ if (soff[j+1] - soff[j] > 1)
74
+ radix_sort_split(&sp[soff[j]], &sp[soff[j+1]]);
75
+
76
+ // precompute the number of segments after split: each distinct interior offset adds one piece
77
+ for (j = 0, n_old_seg = 0; j < g->n_seg; ++j) {
78
+ int32_t i0;
79
+ for (i0 = soff[j], i = i0 + 1, k = 0; i <= soff[j+1]; ++i)
80
+ if (i == soff[j+1] || sp[i0].side>>1 != sp[i].side>>1) {
81
+ if (sp[i0].side>>1 != 0 && sp[i0].side>>1 != g->seg[j].len) // otherwise no new segment will be created
82
+ ++k;
83
+ i0 = i;
84
+ }
85
+ n_old_seg += k + 1;
86
+ }
87
+
88
+ // compute ins_side[] and split old segments
89
+ n_seg = n_old_seg + n_ctg_seg;
90
+ GFA_CALLOC(seg, n_seg);
91
+ GFA_CALLOC(ins_side, n_ins); // per insert: vertex for end 0 in the high 32 bits, vertex for end 1 in the low 32 bits
92
+ GFA_MALLOC(oldcnt, g->n_seg); // per old segment: (index of its first piece)<<32 | number of pieces
93
+ for (j = 0, k = 0; j < g->n_seg; ++j) {
94
+ int32_t i0, l, off = 0, k0 = k; // off: how much of old segment j has been emitted; k0: index of its first piece
95
+ gfa_seg_t *s = &g->seg[j];
96
+ gfa_seg_t *t = &seg[k]; // this is so far a placeholder (shadows the function-level uint64_t t)
97
+ // create the first half of a new segment
98
+ snprintf(buf, 15, "s%d", k + 1);
99
+ t->name = gfa_strdup(buf);
100
+ t->snid = s->snid, t->soff = s->soff, t->rank = s->rank;
101
+ // iterate over splits, one group of equal offsets at a time
102
+ for (i0 = soff[j], i = i0 + 1; i <= soff[j+1]; ++i) {
103
+ if (i == soff[j+1] || sp[i].side>>1 != sp[i0].side>>1) {
104
+ gfa_split_t *q0 = &sp[i0];
105
+ for (l = i0; l < i; ++l) { // record, for every endpoint at this offset, which new vertex it attaches to
106
+ gfa_split_t *q = &sp[l];
107
+ int32_t shift = q->end == 0? 32 : 0; // first end on the higher 32 bits
108
+ int32_t side = q->side & 1;
109
+ int32_t which = q->side>>1 == 0? 0 : side; // special-casing when q->side==1, because no new segment created in this case
110
+ ins_side[q->ins] |= (uint64_t)((uint32_t)(k + which) << 1 | (side^q->end)) << shift;
111
+ }
112
+ if (q0->side>>1 != 0 && q0->side>>1 != g->seg[j].len) { // create a new segment
113
+ t->len = (q0->side>>1) - off; // close the current piece at the cut position
114
+ GFA_MALLOC(t->seq, t->len + 1);
115
+ memcpy(t->seq, &s->seq[off], t->len);
116
+ t->seq[t->len] = 0;
117
+ off += t->len;
118
+ t = &seg[++k]; // create a new segment
119
+ snprintf(buf, 15, "s%d", k + 1);
120
+ t->name = gfa_strdup(buf);
121
+ t->snid = s->snid, t->soff = s->soff + off, t->rank = s->rank;
122
+ }
123
+ i0 = i;
124
+ }
125
+ }
126
+ // finish the last segment: it takes whatever remains of the old sequence
127
+ t->len = s->len - off;
128
+ GFA_MALLOC(t->seq, t->len + 1);
129
+ memcpy(t->seq, &s->seq[off], t->len);
130
+ t->seq[t->len] = 0;
131
+ ++k;
132
+ oldcnt[j] = (uint64_t)k0 << 32 | (k - k0);
133
+ // add new arcs between newly created segments, chaining consecutive pieces on the forward strand
134
+ for (i = 0; i < k - k0 - 1; ++i)
135
+ create_first_arc(g, seg, (uint32_t)(k0+i)<<1, (uint32_t)(k0+i+1)<<1, s->rank);
136
+ }
137
+ assert(k == n_old_seg);
138
+ free(soff);
139
+ free(sp);
140
+
141
+ // update existing g->arc[]: an arc leaves from the last piece in its direction of travel and enters the first
142
+ for (t = 0; t < n_old_arc; ++t) {
143
+ gfa_arc_t *a = &g->arc[t];
144
+ uint32_t v = a->v_lv >> 32;
145
+ uint32_t off = oldcnt[v>>1]>>32, cnt = (uint32_t)oldcnt[v>>1];
146
+ v = (v&1) == 0? (off+cnt-1)<<1 : off<<1 | 1; // forward v: last piece; reverse v: first piece
147
+ a->v_lv = (uint64_t)v << 32 | seg[v>>1].len;
148
+ off = oldcnt[a->w>>1]>>32, cnt = (uint32_t)oldcnt[a->w>>1];
149
+ a->w = (a->w&1) == 0? off<<1 : (off+cnt-1)<<1 | 1; // forward w: first piece; reverse w: last piece
150
+ }
151
+ free(oldcnt);
152
+
153
+ // create newly inserted segments; they occupy seg[n_old_seg..n_seg-1]
154
+ for (i = 0, k = n_old_seg; i < n_ins; ++i) {
155
+ const gfa_ins_t *p = &ins[i];
156
+ if (p->coff[0] < p->coff[1]) { // not a pure deletion
157
+ gfa_seg_t *t = &seg[k];
158
+ snprintf(buf, 15, "s%d", k + 1);
159
+ t->name = gfa_strdup(buf);
160
+ GFA_MALLOC(t->seq, p->coff[1] - p->coff[0] + 1);
161
+ for (j = 0; j < p->coff[1] - p->coff[0]; ++j) // copy the contig interval [coff[0],coff[1])
162
+ t->seq[j] = seq[p->ctg][p->coff[0] + j];
163
+ t->seq[j] = 0;
164
+ t->len = j;
165
+ t->snid = gfa_sseq_add(g, name[p->ctg]);
166
+ t->soff = p->coff[0];
167
+ t->rank = g->max_rank + 1; // TODO: to deal with SN/SO/SR tags somewhere
168
+ gfa_sseq_update(g, t);
169
+ create_first_arc(g, seg, ins_side[i]>>32, (uint32_t)k<<1, t->rank); // end 0 -> new segment
170
+ create_first_arc(g, seg, (uint32_t)k<<1, (uint32_t)ins_side[i], t->rank); // new segment -> end 1
171
+ ++k;
172
+ } else { // a pure deletion: link the two ends directly
173
+ create_first_arc(g, seg, ins_side[i]>>32, (uint32_t)ins_side[i], g->max_rank + 1);
174
+ }
175
+ }
176
+ free(ins_side);
177
+
178
+ // update *g: release the old segments and install the new array
179
+ for (j = 0; j < g->n_seg; ++j) {
180
+ free(g->seg[j].name);
181
+ free(g->seg[j].seq);
182
+ free(g->seg[j].aux.aux);
183
+ }
184
+ free(g->seg);
185
+ g->seg = seg, g->n_seg = g->m_seg = n_seg;
186
+ ++g->max_rank; // matches the rank given to the arcs/segments added above
187
+ GFA_REALLOC(g->link_aux, g->m_arc);
188
+ GFA_BZERO(&g->link_aux[n_old_arc], g->m_arc - n_old_arc); // entries beyond the pre-existing arcs start out zeroed
189
+ gfa_arc_sort(g);
190
+ gfa_arc_index(g);
191
+ gfa_fix_multi(g);
192
+ // k = gfa_fix_symm(g); assert(k == 0); // for debugging; the graph should be symmetric
193
+ }
194
+
195
+ static int32_t gfa_ins_shrink_semi(const gfa_t *g, int32_t pen, uint32_t v, int32_t voff, int32_t coff, uint32_t vv, int32_t vend, int32_t cend, const char *seq)
196
+ {
197
+ int32_t i, j, l, dir, score, max, max_l;
198
+ if (cend == coff) return 0;
199
+ dir = cend > coff? +1 : -1;
200
+ for (i = coff, j = voff, l = max_l = 0, score = max = 0; i != cend; i += dir, j += dir) {
201
+ int32_t cg, vlen = g->seg[v>>1].len;
202
+ if (j == vlen || j == -1) break;
203
+ if (vv == v && j == vend) break;
204
+ ++l;
205
+ cg = (v&1) == 0? g->seg[v>>1].seq[j] : gfa_comp_table[(uint8_t)g->seg[v>>1].seq[vlen - 1 - j]];
206
+ score += tolower(cg) == tolower(seq[i])? +1 : -pen;
207
+ if (score > max) max = score, max_l = l;
208
+ if (score < max - pen * pen) break; // X-drop
209
+ }
210
+ return max_l;
211
+ }
212
+
213
+ int gfa_ins_adj(const gfa_t *g, int pen, gfa_ins_t *ins, const char *seq) // min_len is NOT used for now
214
+ {
215
+ int32_t l, tot = 0;
216
+ l = gfa_ins_shrink_semi(g, pen, ins->v[0], ins->voff[0], ins->coff[0], ins->v[1], ins->voff[1], ins->coff[1], seq);
217
+ ins->voff[0] += l, ins->coff[0] += l, tot += l;
218
+ l = gfa_ins_shrink_semi(g, pen, ins->v[1], ins->voff[1] - 1, ins->coff[1] - 1, ins->v[0], ins->voff[0] - 1, ins->coff[0] - 1, seq);
219
+ ins->voff[1] -= l, ins->coff[1] -= l, tot += l;
220
+ return tot;
221
+ }
222
+
223
+ static inline int check_multi(const gfa_t *g, const gfa_ins_t *ins)
224
+ {
225
+ if (ins->v[0] != ins->v[1] && ins->coff[1] - ins->coff[0] == 0) {
226
+ const gfa_seg_t *s[2];
227
+ uint32_t v[2];
228
+ s[0] = &g->seg[ins->v[0]>>1];
229
+ s[1] = &g->seg[ins->v[1]>>1];
230
+ if (ins->voff[0] != 0 && ins->voff[0] != s[0]->len) return 0;
231
+ if (ins->voff[1] != 0 && ins->voff[1] != s[1]->len) return 0;
232
+ v[0] = ins->voff[0] == 0? ins->v[0]^1 : ins->v[0];
233
+ v[1] = ins->voff[1] == 0? ins->v[1] : ins->v[1]^1;
234
+ if (gfa_find_arc(g, v[0], v[1]) >= 0) return 1;
235
+ return 0;
236
+ } else return 0;
237
+ }
238
+
239
+ int32_t gfa_ins_filter(const gfa_t *g, int32_t n_ins, gfa_ins_t *ins) // filter out impossible inserts
240
+ {
241
+ int32_t i, k, n;
242
+ for (i = 0, n = 0; i < n_ins; ++i) {
243
+ gfa_ins_t *p = &ins[i];
244
+ for (k = 0; k < 2; ++k) {
245
+ uint32_t vlen = g->seg[p->v[k]>>1].len;
246
+ uint32_t side = (p->v[k]&1? vlen - p->voff[k] : p->voff[k]) << 1 | ((p->v[k]&1) ^ k);
247
+ if (side == (0<<1|0) || side == (vlen<<1|1))
248
+ break;
249
+ }
250
+ if (k != 2 || check_multi(g, p)) { // multi-link may happen due to inconsistency between graph chaining and WFA alignment
251
+ if (gfa_verbose >= 2)
252
+ fprintf(stderr, "[W::%s] %s between %c%s and %c%s derived from the %d-th query at %d-%d\n",
253
+ __func__, k != 2? "impossible insert" : "multi-link",
254
+ "><"[p->v[0]&1], g->seg[p->v[0]>>1].name, "><"[p->v[1]&1], g->seg[p->v[1]>>1].name, p->ctg, p->coff[0], p->coff[1]);
255
+ continue;
256
+ }
257
+ ins[n++] = ins[i];
258
+ }
259
+ return n;
260
+ }