ruby-minigraph 0.0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +62 -0
  4. data/ext/Rakefile +56 -0
  5. data/ext/cmappy/cmappy.c +7 -0
  6. data/ext/cmappy/cmappy.h +8 -0
  7. data/ext/minigraph/LICENSE.txt +23 -0
  8. data/ext/minigraph/Makefile +66 -0
  9. data/ext/minigraph/NEWS.md +317 -0
  10. data/ext/minigraph/README.md +207 -0
  11. data/ext/minigraph/algo.c +194 -0
  12. data/ext/minigraph/algo.h +33 -0
  13. data/ext/minigraph/asm-call.c +147 -0
  14. data/ext/minigraph/bseq.c +133 -0
  15. data/ext/minigraph/bseq.h +76 -0
  16. data/ext/minigraph/cal_cov.c +139 -0
  17. data/ext/minigraph/doc/example1.png +0 -0
  18. data/ext/minigraph/doc/example2.png +0 -0
  19. data/ext/minigraph/doc/examples.graffle +0 -0
  20. data/ext/minigraph/format.c +241 -0
  21. data/ext/minigraph/galign.c +140 -0
  22. data/ext/minigraph/gchain1.c +532 -0
  23. data/ext/minigraph/gcmisc.c +223 -0
  24. data/ext/minigraph/gfa-aug.c +260 -0
  25. data/ext/minigraph/gfa-base.c +526 -0
  26. data/ext/minigraph/gfa-bbl.c +372 -0
  27. data/ext/minigraph/gfa-ed.c +617 -0
  28. data/ext/minigraph/gfa-io.c +395 -0
  29. data/ext/minigraph/gfa-priv.h +154 -0
  30. data/ext/minigraph/gfa.h +166 -0
  31. data/ext/minigraph/ggen.c +182 -0
  32. data/ext/minigraph/ggen.h +21 -0
  33. data/ext/minigraph/ggsimple.c +570 -0
  34. data/ext/minigraph/gmap.c +211 -0
  35. data/ext/minigraph/index.c +230 -0
  36. data/ext/minigraph/kalloc.c +224 -0
  37. data/ext/minigraph/kalloc.h +82 -0
  38. data/ext/minigraph/kavl.h +414 -0
  39. data/ext/minigraph/kdq.h +134 -0
  40. data/ext/minigraph/ketopt.h +116 -0
  41. data/ext/minigraph/khashl.h +348 -0
  42. data/ext/minigraph/krmq.h +474 -0
  43. data/ext/minigraph/kseq.h +256 -0
  44. data/ext/minigraph/ksort.h +164 -0
  45. data/ext/minigraph/kstring.h +165 -0
  46. data/ext/minigraph/kthread.c +159 -0
  47. data/ext/minigraph/kthread.h +15 -0
  48. data/ext/minigraph/kvec-km.h +105 -0
  49. data/ext/minigraph/kvec.h +110 -0
  50. data/ext/minigraph/lchain.c +441 -0
  51. data/ext/minigraph/main.c +301 -0
  52. data/ext/minigraph/map-algo.c +500 -0
  53. data/ext/minigraph/mgpriv.h +128 -0
  54. data/ext/minigraph/minigraph.1 +359 -0
  55. data/ext/minigraph/minigraph.h +176 -0
  56. data/ext/minigraph/miniwfa.c +834 -0
  57. data/ext/minigraph/miniwfa.h +95 -0
  58. data/ext/minigraph/misc/mgutils.js +1451 -0
  59. data/ext/minigraph/misc.c +12 -0
  60. data/ext/minigraph/options.c +134 -0
  61. data/ext/minigraph/shortk.c +251 -0
  62. data/ext/minigraph/sketch.c +109 -0
  63. data/ext/minigraph/sys.c +147 -0
  64. data/ext/minigraph/sys.h +20 -0
  65. data/ext/minigraph/test/MT-chimp.fa +277 -0
  66. data/ext/minigraph/test/MT-human.fa +239 -0
  67. data/ext/minigraph/test/MT-orangA.fa +276 -0
  68. data/ext/minigraph/test/MT.gfa +19 -0
  69. data/ext/minigraph/tex/Makefile +13 -0
  70. data/ext/minigraph/tex/minigraph.bib +676 -0
  71. data/ext/minigraph/tex/minigraph.tex +986 -0
  72. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
  73. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
  74. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
  75. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
  76. data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
  77. data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
  78. data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
  79. data/ext/minigraph/tex/plots/bedutils.js +367 -0
  80. data/ext/minigraph/tex/plots/chr-plot.js +130 -0
  81. data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
  82. data/ext/minigraph.patch +21 -0
  83. data/lib/minigraph/ffi/constants.rb +230 -0
  84. data/lib/minigraph/ffi/functions.rb +70 -0
  85. data/lib/minigraph/ffi/mappy.rb +8 -0
  86. data/lib/minigraph/ffi.rb +27 -0
  87. data/lib/minigraph/version.rb +5 -0
  88. data/lib/minigraph.rb +72 -0
  89. metadata +159 -0
@@ -0,0 +1,223 @@
1
+ #include <math.h>
2
+ #include <assert.h>
3
+ #include <string.h>
4
+ #include "mgpriv.h"
5
+ #include "kalloc.h"
6
+
7
+ // reorder gcs->a[] and gcs->lc[] such that they are in the same order as gcs->gc[]
8
+ void mg_gchain_restore_order(void *km, mg_gchains_t *gcs)
9
+ {
10
+ int32_t i, n_a, n_lc;
11
+ mg_llchain_t *lc;
12
+ mg128_t *a;
13
+ KMALLOC(km, lc, gcs->n_lc);
14
+ KMALLOC(km, a, gcs->n_a);
15
+ for (i = 0, n_a = n_lc = 0; i < gcs->n_gc; ++i) {
16
+ mg_gchain_t *gc = &gcs->gc[i];
17
+ assert(gc->cnt > 0);
18
+ memcpy(&lc[n_lc], &gcs->lc[gc->off], gc->cnt * sizeof(mg_llchain_t));
19
+ memcpy(&a[n_a], &gcs->a[gcs->lc[gc->off].off], gc->n_anchor * sizeof(mg128_t));
20
+ n_lc += gc->cnt, n_a += gc->n_anchor;
21
+ }
22
+ memcpy(gcs->lc, lc, gcs->n_lc * sizeof(mg_llchain_t));
23
+ memcpy(gcs->a, a, gcs->n_a * sizeof(mg128_t));
24
+ kfree(km, lc); kfree(km, a);
25
+ for (i = 0, n_lc = 0; i < gcs->n_gc; ++i) {
26
+ mg_gchain_t *gc = &gcs->gc[i];
27
+ gc->off = n_lc;
28
+ n_lc += gc->cnt;
29
+ }
30
+ for (i = 0, n_a = 0; i < gcs->n_lc; ++i) {
31
+ mg_llchain_t *lc = &gcs->lc[i];
32
+ lc->off = n_a;
33
+ n_a += lc->cnt;
34
+ }
35
+ }
36
+
37
+ // recompute gcs->gc[].{off,n_anchor} and gcs->lc[].off, ASSUMING they are properly ordered (see mg_gchain_restore_order)
38
+ void mg_gchain_restore_offset(mg_gchains_t *gcs)
39
+ {
40
+ int32_t i, j, n_a, n_lc;
41
+ for (i = 0, n_a = n_lc = 0; i < gcs->n_gc; ++i) {
42
+ mg_gchain_t *gc = &gcs->gc[i];
43
+ gc->off = n_lc;
44
+ for (j = 0, gc->n_anchor = 0; j < gc->cnt; ++j) {
45
+ mg_llchain_t *lc = &gcs->lc[n_lc + j];
46
+ lc->off = n_a;
47
+ n_a += lc->cnt;
48
+ gc->n_anchor += lc->cnt;
49
+ }
50
+ n_lc += gc->cnt;
51
+ }
52
+ assert(n_lc == gcs->n_lc && n_a == gcs->n_a);
53
+ }
54
+
55
+ // sort chains by score
56
+ void mg_gchain_sort_by_score(void *km, mg_gchains_t *gcs)
57
+ {
58
+ mg128_t *z;
59
+ mg_gchain_t *gc;
60
+ int32_t i;
61
+ KMALLOC(km, z, gcs->n_gc);
62
+ KMALLOC(km, gc, gcs->n_gc);
63
+ for (i = 0; i < gcs->n_gc; ++i)
64
+ z[i].x = (uint64_t)gcs->gc[i].score << 32 | gcs->gc[i].hash, z[i].y = i;
65
+ radix_sort_128x(z, z + gcs->n_gc);
66
+ for (i = gcs->n_gc - 1; i >= 0; --i)
67
+ gc[gcs->n_gc - 1 - i] = gcs->gc[z[i].y];
68
+ memcpy(gcs->gc, gc, gcs->n_gc * sizeof(mg_gchain_t));
69
+ kfree(km, z); kfree(km, gc);
70
+ mg_gchain_restore_order(km, gcs); // this put gcs in the proper order
71
+ }
72
+
73
+ // set r[].{id,parent,subsc}, ASSUMING r[] is sorted by score
74
+ void mg_gchain_set_parent(void *km, float mask_level, int n, mg_gchain_t *r, int sub_diff, int hard_mask_level)
75
+ {
76
+ int i, j, k, *w;
77
+ uint64_t *cov;
78
+ if (n <= 0) return;
79
+ for (i = 0; i < n; ++i) r[i].id = i;
80
+ cov = (uint64_t*)kmalloc(km, n * sizeof(uint64_t));
81
+ w = (int*)kmalloc(km, n * sizeof(int));
82
+ w[0] = 0, r[0].parent = 0;
83
+ for (i = 1, k = 1; i < n; ++i) {
84
+ mg_gchain_t *ri = &r[i];
85
+ int si = ri->qs, ei = ri->qe, n_cov = 0, uncov_len = 0;
86
+ if (hard_mask_level) goto skip_uncov;
87
+ for (j = 0; j < k; ++j) { // traverse existing primary hits to find overlapping hits
88
+ mg_gchain_t *rp = &r[w[j]];
89
+ int sj = rp->qs, ej = rp->qe;
90
+ if (ej <= si || sj >= ei) continue;
91
+ if (sj < si) sj = si;
92
+ if (ej > ei) ej = ei;
93
+ cov[n_cov++] = (uint64_t)sj<<32 | ej;
94
+ }
95
+ if (n_cov == 0) {
96
+ goto set_parent_test; // no overlapping primary hits; then i is a new primary hit
97
+ } else if (n_cov > 0) { // there are overlapping primary hits; find the length not covered by existing primary hits
98
+ int j, x = si;
99
+ radix_sort_gfa64(cov, cov + n_cov);
100
+ for (j = 0; j < n_cov; ++j) {
101
+ if ((int)(cov[j]>>32) > x) uncov_len += (cov[j]>>32) - x;
102
+ x = (int32_t)cov[j] > x? (int32_t)cov[j] : x;
103
+ }
104
+ if (ei > x) uncov_len += ei - x;
105
+ }
106
+ skip_uncov:
107
+ for (j = 0; j < k; ++j) { // traverse existing primary hits again
108
+ mg_gchain_t *rp = &r[w[j]];
109
+ int sj = rp->qs, ej = rp->qe, min, max, ol;
110
+ if (ej <= si || sj >= ei) continue; // no overlap
111
+ min = ej - sj < ei - si? ej - sj : ei - si;
112
+ max = ej - sj > ei - si? ej - sj : ei - si;
113
+ ol = si < sj? (ei < sj? 0 : ei < ej? ei - sj : ej - sj) : (ej < si? 0 : ej < ei? ej - si : ei - si); // overlap length; TODO: this can be simplified
114
+ if ((float)ol / min - (float)uncov_len / max > mask_level) {
115
+ int cnt_sub = 0;
116
+ ri->parent = rp->parent;
117
+ rp->subsc = rp->subsc > ri->score? rp->subsc : ri->score;
118
+ if (ri->cnt >= rp->cnt) cnt_sub = 1;
119
+ if (cnt_sub) ++rp->n_sub;
120
+ break;
121
+ }
122
+ }
123
+ set_parent_test:
124
+ if (j == k) w[k++] = i, ri->parent = i, ri->n_sub = 0;
125
+ }
126
+ kfree(km, cov);
127
+ kfree(km, w);
128
+ }
129
+
130
+ // set r[].flt, i.e. mark weak suboptimal chains as filtered
131
+ int mg_gchain_flt_sub(float pri_ratio, int min_diff, int best_n, int n, mg_gchain_t *r)
132
+ {
133
+ if (pri_ratio > 0.0f && n > 0) {
134
+ int i, k, n_2nd = 0;
135
+ for (i = k = 0; i < n; ++i) {
136
+ int p = r[i].parent;
137
+ if (p == i) { // primary
138
+ r[i].flt = 0, ++k;
139
+ } else if ((r[i].score >= r[p].score * pri_ratio || r[i].score + min_diff >= r[p].score) && n_2nd < best_n) {
140
+ if (!(r[i].qs == r[p].qs && r[i].qe == r[p].qe && r[i].ps == r[p].ps && r[i].pe == r[p].pe)) // not identical hits; TODO: check path as well
141
+ r[i].flt = 0, ++n_2nd, ++k;
142
+ else r[i].flt = 1;
143
+ } else r[i].flt = 1;
144
+ }
145
+ return k;
146
+ }
147
+ return n;
148
+ }
149
+
150
+ // hard drop filtered chains, ASSUMING gcs is properly ordered
151
+ void mg_gchain_drop_flt(void *km, mg_gchains_t *gcs)
152
+ {
153
+ int32_t i, n_gc, n_lc, n_a, n_lc0, n_a0, *o2n;
154
+ if (gcs->n_gc == 0) return;
155
+ KMALLOC(km, o2n, gcs->n_gc);
156
+ for (i = 0, n_gc = 0; i < gcs->n_gc; ++i) {
157
+ mg_gchain_t *r = &gcs->gc[i];
158
+ o2n[i] = -1;
159
+ if (r->flt || r->cnt == 0) {
160
+ kfree(gcs->km, r->p);
161
+ continue;
162
+ }
163
+ o2n[i] = n_gc++;
164
+ }
165
+ n_gc = n_lc = n_a = 0;
166
+ n_lc0 = n_a0 = 0;
167
+ for (i = 0; i < gcs->n_gc; ++i) {
168
+ mg_gchain_t *r = &gcs->gc[i];
169
+ if (o2n[i] >= 0) {
170
+ memmove(&gcs->a[n_a], &gcs->a[n_a0], r->n_anchor * sizeof(mg128_t));
171
+ memmove(&gcs->lc[n_lc], &gcs->lc[n_lc0], r->cnt * sizeof(mg_llchain_t));
172
+ gcs->gc[n_gc] = *r;
173
+ gcs->gc[n_gc].id = n_gc;
174
+ gcs->gc[n_gc].parent = o2n[gcs->gc[n_gc].parent];
175
+ ++n_gc, n_lc += r->cnt, n_a += r->n_anchor;
176
+ }
177
+ n_lc0 += r->cnt, n_a0 += r->n_anchor;
178
+ }
179
+ assert(n_lc0 == gcs->n_lc && n_a0 == gcs->n_a);
180
+ kfree(km, o2n);
181
+ gcs->n_gc = n_gc, gcs->n_lc = n_lc, gcs->n_a = n_a;
182
+ if (n_a != n_a0) {
183
+ KREALLOC(gcs->km, gcs->a, gcs->n_a);
184
+ KREALLOC(gcs->km, gcs->lc, gcs->n_lc);
185
+ KREALLOC(gcs->km, gcs->gc, gcs->n_gc);
186
+ }
187
+ mg_gchain_restore_offset(gcs);
188
+ }
189
+
190
+ // estimate mapping quality
191
+ void mg_gchain_set_mapq(void *km, mg_gchains_t *gcs, int qlen, int max_mini, int min_gc_score)
192
+ {
193
+ static const float q_coef = 40.0f;
194
+ int64_t sum_sc = 0;
195
+ float uniq_ratio, r_sc, r_cnt;
196
+ int i, t_sc, t_cnt;
197
+ if (gcs == 0 || gcs->n_gc == 0) return;
198
+ t_sc = qlen < 100? qlen : 100;
199
+ t_cnt = max_mini < 10? max_mini : 10;
200
+ if (t_cnt < 5) t_cnt = 5;
201
+ r_sc = 1.0 / t_sc;
202
+ r_cnt = 1.0 / t_cnt;
203
+ for (i = 0; i < gcs->n_gc; ++i)
204
+ if (gcs->gc[i].parent == gcs->gc[i].id)
205
+ sum_sc += gcs->gc[i].score;
206
+ uniq_ratio = (float)sum_sc / (sum_sc + gcs->rep_len);
207
+ for (i = 0; i < gcs->n_gc; ++i) {
208
+ mg_gchain_t *r = &gcs->gc[i];
209
+ if (r->parent == r->id) {
210
+ int mapq, subsc;
211
+ float pen_s1 = (r->score > t_sc? 1.0f : r->score * r_sc) * uniq_ratio;
212
+ float x, pen_cm = r->n_anchor > t_cnt? 1.0f : r->n_anchor * r_cnt;
213
+ pen_cm = pen_s1 < pen_cm? pen_s1 : pen_cm;
214
+ subsc = r->subsc > min_gc_score? r->subsc : min_gc_score;
215
+ x = (float)subsc / r->score;
216
+ mapq = (int)(pen_cm * q_coef * (1.0f - x) * logf(r->score));
217
+ mapq -= (int)(4.343f * logf(r->n_sub + 1) + .499f);
218
+ mapq = mapq > 0? mapq : 0;
219
+ if (r->score > subsc && mapq == 0) mapq = 1;
220
+ r->mapq = mapq < 60? mapq : 60;
221
+ } else r->mapq = 0;
222
+ }
223
+ }
@@ -0,0 +1,260 @@
1
+ #include <assert.h>
2
+ #include <ctype.h>
3
+ #include "gfa-priv.h"
4
+ #include "ksort.h"
5
+
6
+ typedef struct { // one end of one insertion, recorded as a cut point on an existing segment (filled in gfa_augment)
7
+ uint32_t side; // (offset on the segment's forward strand)<<1 | side bit; also the sort key
8
+ uint32_t ins:31, end:1; // ins: index into the ins[] array; end: which end of that insertion (0 or 1)
9
+ } gfa_split_t;
10
+
11
+ #define split_key(p) ((p).side)
12
+ KRADIX_SORT_INIT(split, gfa_split_t, split_key, 4) // presumably defines radix_sort_split(), which gfa_augment() calls to order cut points by side
13
+
14
+ static inline void create_first_arc_semi(gfa_t *g, const gfa_seg_t *seg, uint32_t v, uint32_t w, int32_t rank, uint64_t link_id, int is_comp)
15
+ {
16
+ gfa_arc_t *a;
17
+ if (g->n_arc == g->m_arc) GFA_EXPAND(g->arc, g->m_arc);
18
+ a = &g->arc[g->n_arc++];
19
+ a->v_lv = (uint64_t)v<<32 | seg[v>>1].len;
20
+ a->w = w;
21
+ a->rank = rank;
22
+ a->ov = a->ow = 0;
23
+ a->link_id = link_id;
24
+ a->del = 0;
25
+ a->comp = !!is_comp;
26
+ }
27
+
28
+ static inline void create_first_arc(gfa_t *g, const gfa_seg_t *seg, uint32_t v, uint32_t w, int32_t rank)
29
+ {
30
+ uint64_t link_id = g->n_arc;
31
+ create_first_arc_semi(g, seg, v, w, rank, link_id, 0);
32
+ create_first_arc_semi(g, seg, w^1, v^1, rank, link_id, 1);
33
+ }
34
+
35
// Augment graph g with n_ins insertions/deletions described by ins[].
//   ins[i].v[0..1], voff[0..1]: the two graph-side end points (vertex and offset) of insert i
//   ins[i].ctg, coff[0..1]:     the contig interval [coff[0],coff[1]) supplying the inserted sequence
//   name[], seq[]:              contig names and sequences, indexed by ins[i].ctg (n_ctg of them)
// Steps: (1) bucket all insertion end points per segment and sort them; (2) cut every old
// segment at its interior end points, producing a brand-new segment array with names "s1","s2",...;
// (3) remap the pre-existing arcs onto the outermost pieces; (4) add one new segment per
// non-empty insert plus the arcs that attach it (or a single arc for a pure deletion);
// (5) replace g->seg, bump g->max_rank and rebuild the arc ordering/index.
// Returns immediately, leaving g untouched, when n_ins <= 0 or n_ctg <= 0.
+ void gfa_augment(gfa_t *g, int32_t n_ins, const gfa_ins_t *ins, int32_t n_ctg, const char *const* name, const char *const* seq)
36
+ {
37
+ int32_t i, j, k, *scnt, *soff, n_ctg_seg, n_old_seg, n_seg; // scnt[]: per-segment number of insertion ends; soff[]: prefix sums of scnt[]
38
+ gfa_split_t *sp; // all insertion ends, grouped by segment via soff[]
39
+ gfa_seg_t *seg; // the new segment array that will replace g->seg
40
+ char buf[16]; // scratch for generated segment names
41
+ uint64_t t, n_old_arc = g->n_arc, *ins_side, *oldcnt; // n_old_arc: number of arcs that existed before augmentation
42
+
43
+ if (n_ins <= 0 || n_ctg <= 0) return;
44
+
45
+ // set soff[]: soff[s] is where the cut points of segment s start in sp[]
46
+ GFA_CALLOC(scnt, g->n_seg);
47
+ for (i = 0; i < n_ins; ++i)
48
+ ++scnt[ins[i].v[0]>>1], ++scnt[ins[i].v[1]>>1]; // v>>1 is the segment index of a vertex
49
+ GFA_MALLOC(soff, g->n_seg + 1);
50
+ for (j = 1, soff[0] = 0; j <= g->n_seg; ++j)
51
+ soff[j] = soff[j-1] + scnt[j-1];
52
+
53
+ // populate sp[]
54
+ GFA_MALLOC(sp, soff[g->n_seg]);
55
+ GFA_BZERO(scnt, g->n_seg); // scnt[] is reused as a per-segment fill cursor
56
+ for (i = 0, n_ctg_seg = 0; i < n_ins; ++i) {
57
+ const gfa_ins_t *p = &ins[i];
58
+ for (k = 0; k < 2; ++k) {
59
+ uint32_t vlen = g->seg[p->v[k]>>1].len;
60
+ gfa_split_t *q = &sp[soff[p->v[k]>>1] + scnt[p->v[k]>>1]];
61
+ q->ins = i, q->end = k;
62
+ q->side = (p->v[k]&1? vlen - p->voff[k] : p->voff[k]) << 1 | ((p->v[k]&1) ^ k); // offset converted to the forward strand, shifted left; low bit = strand ^ end
63
+ assert(q->side != (0<<1|0) && q->side != (vlen<<1|1)); // not possible to link such sides
64
+ ++scnt[p->v[k]>>1];
65
+ }
66
+ if (p->coff[1] > p->coff[0]) // a non-empty contig interval needs a new segment
67
+ ++n_ctg_seg;
68
+ }
69
+ free(scnt);
70
+
71
+ // sort sp[] within each segment (only needed when a segment has more than one cut point)
72
+ for (j = 0, n_old_seg = 0; j < g->n_seg; ++j)
73
+ if (soff[j+1] - soff[j] > 1)
74
+ radix_sort_split(&sp[soff[j]], &sp[soff[j+1]]);
75
+
76
+ // precompute the number of segments after split
77
+ for (j = 0, n_old_seg = 0; j < g->n_seg; ++j) {
78
+ int32_t i0;
79
+ for (i0 = soff[j], i = i0 + 1, k = 0; i <= soff[j+1]; ++i) // [i0,i) is a run of cut points sharing one offset
80
+ if (i == soff[j+1] || sp[i0].side>>1 != sp[i].side>>1) {
81
+ if (sp[i0].side>>1 != 0 && sp[i0].side>>1 != g->seg[j].len) // otherwise no new segment will be created
82
+ ++k;
83
+ i0 = i;
84
+ }
85
+ n_old_seg += k + 1; // k interior cut offsets yield k + 1 pieces
86
+ }
87
+
88
+ // compute ins_side[] and split old segments
89
+ n_seg = n_old_seg + n_ctg_seg;
90
+ GFA_CALLOC(seg, n_seg);
91
+ GFA_CALLOC(ins_side, n_ins); // zeroed because entries are built up with |= below
92
+ GFA_MALLOC(oldcnt, g->n_seg);
93
+ for (j = 0, k = 0; j < g->n_seg; ++j) {
94
+ int32_t i0, l, off = 0, k0 = k; // off: bases of s already emitted; k0: index of the first piece of s
95
+ gfa_seg_t *s = &g->seg[j];
96
+ gfa_seg_t *t = &seg[k]; // this is so far a placeholder
97
+ // create the first half of a new segment
98
+ snprintf(buf, 15, "s%d", k + 1); // new names are 1-based positions in seg[]
99
+ t->name = gfa_strdup(buf);
100
+ t->snid = s->snid, t->soff = s->soff, t->rank = s->rank;
101
+ // iterate over splits
102
+ for (i0 = soff[j], i = i0 + 1; i <= soff[j+1]; ++i) {
103
+ if (i == soff[j+1] || sp[i].side>>1 != sp[i0].side>>1) {
104
+ gfa_split_t *q0 = &sp[i0];
105
+ for (l = i0; l < i; ++l) { // record, for every insertion end at this offset, the new vertex it attaches to
106
+ gfa_split_t *q = &sp[l];
107
+ int32_t shift = q->end == 0? 32 : 0; // first end on the higher 32 bits
108
+ int32_t side = q->side & 1;
109
+ int32_t which = q->side>>1 == 0? 0 : side; // special-casing when q->side==1, because no new segment created in this case
110
+ ins_side[q->ins] |= (uint64_t)((uint32_t)(k + which) << 1 | (side^q->end)) << shift; // vertex = piece index<<1 | orientation bit
111
+ }
112
+ if (q0->side>>1 != 0 && q0->side>>1 != g->seg[j].len) { // create a new segment
113
+ t->len = (q0->side>>1) - off;
114
+ GFA_MALLOC(t->seq, t->len + 1);
115
+ memcpy(t->seq, &s->seq[off], t->len);
116
+ t->seq[t->len] = 0;
117
+ off += t->len;
118
+ t = &seg[++k]; // create a new segment
119
+ snprintf(buf, 15, "s%d", k + 1);
120
+ t->name = gfa_strdup(buf);
121
+ t->snid = s->snid, t->soff = s->soff + off, t->rank = s->rank;
122
+ }
123
+ i0 = i;
124
+ }
125
+ }
126
+ // finish the last segment
127
+ t->len = s->len - off;
128
+ GFA_MALLOC(t->seq, t->len + 1);
129
+ memcpy(t->seq, &s->seq[off], t->len);
130
+ t->seq[t->len] = 0;
131
+ ++k;
132
+ oldcnt[j] = (uint64_t)k0 << 32 | (k - k0); // high 32 bits: index of the first piece; low 32 bits: number of pieces
133
+ // add new arcs between newly created segments
134
+ for (i = 0; i < k - k0 - 1; ++i)
135
+ create_first_arc(g, seg, (uint32_t)(k0+i)<<1, (uint32_t)(k0+i+1)<<1, s->rank);
136
+ }
137
+ assert(k == n_old_seg);
138
+ free(soff);
139
+ free(sp);
140
+
141
+ // update existing g->arc[] so that they refer to indices in the new seg[]
142
+ for (t = 0; t < n_old_arc; ++t) {
143
+ gfa_arc_t *a = &g->arc[t];
144
+ uint32_t v = a->v_lv >> 32;
145
+ uint32_t off = oldcnt[v>>1]>>32, cnt = (uint32_t)oldcnt[v>>1];
146
+ v = (v&1) == 0? (off+cnt-1)<<1 : off<<1 | 1; // source: forward -> last piece; reverse -> first piece
147
+ a->v_lv = (uint64_t)v << 32 | seg[v>>1].len;
148
+ off = oldcnt[a->w>>1]>>32, cnt = (uint32_t)oldcnt[a->w>>1];
149
+ a->w = (a->w&1) == 0? off<<1 : (off+cnt-1)<<1 | 1; // target: forward -> first piece; reverse -> last piece
150
+ }
151
+ free(oldcnt);
152
+
153
+ // create newly inserted segments
154
+ for (i = 0, k = n_old_seg; i < n_ins; ++i) {
155
+ const gfa_ins_t *p = &ins[i];
156
+ if (p->coff[0] < p->coff[1]) { // not a pure deletion
157
+ gfa_seg_t *t = &seg[k];
158
+ snprintf(buf, 15, "s%d", k + 1);
159
+ t->name = gfa_strdup(buf);
160
+ GFA_MALLOC(t->seq, p->coff[1] - p->coff[0] + 1);
161
+ for (j = 0; j < p->coff[1] - p->coff[0]; ++j) // copy contig bases [coff[0], coff[1])
162
+ t->seq[j] = seq[p->ctg][p->coff[0] + j];
163
+ t->seq[j] = 0;
164
+ t->len = j;
165
+ t->snid = gfa_sseq_add(g, name[p->ctg]);
166
+ t->soff = p->coff[0];
167
+ t->rank = g->max_rank + 1; // TODO: to deal with SN/SO/SR tags somewhere
168
+ gfa_sseq_update(g, t);
169
+ create_first_arc(g, seg, ins_side[i]>>32, (uint32_t)k<<1, t->rank); // first end -> new segment
170
+ create_first_arc(g, seg, (uint32_t)k<<1, (uint32_t)ins_side[i], t->rank); // new segment -> second end
171
+ ++k;
172
+ } else { // a pure deletion
173
+ create_first_arc(g, seg, ins_side[i]>>32, (uint32_t)ins_side[i], g->max_rank + 1); // link the two ends directly
174
+ }
175
+ }
176
+ free(ins_side);
177
+
178
+ // update *g: release the old segments and install the new array
179
+ for (j = 0; j < g->n_seg; ++j) {
180
+ free(g->seg[j].name);
181
+ free(g->seg[j].seq);
182
+ free(g->seg[j].aux.aux);
183
+ }
184
+ free(g->seg);
185
+ g->seg = seg, g->n_seg = g->m_seg = n_seg;
186
+ ++g->max_rank; // everything added above was given rank max_rank + 1
187
+ GFA_REALLOC(g->link_aux, g->m_arc);
188
+ GFA_BZERO(&g->link_aux[n_old_arc], g->m_arc - n_old_arc); // zero the entries beyond the pre-existing arcs
189
+ gfa_arc_sort(g);
190
+ gfa_arc_index(g);
191
+ gfa_fix_multi(g);
192
+ // k = gfa_fix_symm(g); assert(k == 0); // for debugging; the graph should be symmetric
193
+ }
194
+
195
+ static int32_t gfa_ins_shrink_semi(const gfa_t *g, int32_t pen, uint32_t v, int32_t voff, int32_t coff, uint32_t vv, int32_t vend, int32_t cend, const char *seq)
196
+ {
197
+ int32_t i, j, l, dir, score, max, max_l;
198
+ if (cend == coff) return 0;
199
+ dir = cend > coff? +1 : -1;
200
+ for (i = coff, j = voff, l = max_l = 0, score = max = 0; i != cend; i += dir, j += dir) {
201
+ int32_t cg, vlen = g->seg[v>>1].len;
202
+ if (j == vlen || j == -1) break;
203
+ if (vv == v && j == vend) break;
204
+ ++l;
205
+ cg = (v&1) == 0? g->seg[v>>1].seq[j] : gfa_comp_table[(uint8_t)g->seg[v>>1].seq[vlen - 1 - j]];
206
+ score += tolower(cg) == tolower(seq[i])? +1 : -pen;
207
+ if (score > max) max = score, max_l = l;
208
+ if (score < max - pen * pen) break; // X-drop
209
+ }
210
+ return max_l;
211
+ }
212
+
213
+ int gfa_ins_adj(const gfa_t *g, int pen, gfa_ins_t *ins, const char *seq) // min_len is NOT used for now
214
+ {
215
+ int32_t l, tot = 0;
216
+ l = gfa_ins_shrink_semi(g, pen, ins->v[0], ins->voff[0], ins->coff[0], ins->v[1], ins->voff[1], ins->coff[1], seq);
217
+ ins->voff[0] += l, ins->coff[0] += l, tot += l;
218
+ l = gfa_ins_shrink_semi(g, pen, ins->v[1], ins->voff[1] - 1, ins->coff[1] - 1, ins->v[0], ins->voff[0] - 1, ins->coff[0] - 1, seq);
219
+ ins->voff[1] -= l, ins->coff[1] -= l, tot += l;
220
+ return tot;
221
+ }
222
+
223
+ static inline int check_multi(const gfa_t *g, const gfa_ins_t *ins)
224
+ {
225
+ if (ins->v[0] != ins->v[1] && ins->coff[1] - ins->coff[0] == 0) {
226
+ const gfa_seg_t *s[2];
227
+ uint32_t v[2];
228
+ s[0] = &g->seg[ins->v[0]>>1];
229
+ s[1] = &g->seg[ins->v[1]>>1];
230
+ if (ins->voff[0] != 0 && ins->voff[0] != s[0]->len) return 0;
231
+ if (ins->voff[1] != 0 && ins->voff[1] != s[1]->len) return 0;
232
+ v[0] = ins->voff[0] == 0? ins->v[0]^1 : ins->v[0];
233
+ v[1] = ins->voff[1] == 0? ins->v[1] : ins->v[1]^1;
234
+ if (gfa_find_arc(g, v[0], v[1]) >= 0) return 1;
235
+ return 0;
236
+ } else return 0;
237
+ }
238
+
239
+ int32_t gfa_ins_filter(const gfa_t *g, int32_t n_ins, gfa_ins_t *ins) // filter out impossible inserts
240
+ {
241
+ int32_t i, k, n;
242
+ for (i = 0, n = 0; i < n_ins; ++i) {
243
+ gfa_ins_t *p = &ins[i];
244
+ for (k = 0; k < 2; ++k) {
245
+ uint32_t vlen = g->seg[p->v[k]>>1].len;
246
+ uint32_t side = (p->v[k]&1? vlen - p->voff[k] : p->voff[k]) << 1 | ((p->v[k]&1) ^ k);
247
+ if (side == (0<<1|0) || side == (vlen<<1|1))
248
+ break;
249
+ }
250
+ if (k != 2 || check_multi(g, p)) { // multi-link may happen due to inconsistency between graph chaining and WFA alignment
251
+ if (gfa_verbose >= 2)
252
+ fprintf(stderr, "[W::%s] %s between %c%s and %c%s derived from the %d-th query at %d-%d\n",
253
+ __func__, k != 2? "impossible insert" : "multi-link",
254
+ "><"[p->v[0]&1], g->seg[p->v[0]>>1].name, "><"[p->v[1]&1], g->seg[p->v[1]>>1].name, p->ctg, p->coff[0], p->coff[1]);
255
+ continue;
256
+ }
257
+ ins[n++] = ins[i];
258
+ }
259
+ return n;
260
+ }