ruby-minigraph 0.0.20.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +62 -0
- data/ext/Rakefile +56 -0
- data/ext/cmappy/cmappy.c +7 -0
- data/ext/cmappy/cmappy.h +8 -0
- data/ext/minigraph/LICENSE.txt +23 -0
- data/ext/minigraph/Makefile +66 -0
- data/ext/minigraph/NEWS.md +317 -0
- data/ext/minigraph/README.md +207 -0
- data/ext/minigraph/algo.c +194 -0
- data/ext/minigraph/algo.h +33 -0
- data/ext/minigraph/asm-call.c +147 -0
- data/ext/minigraph/bseq.c +133 -0
- data/ext/minigraph/bseq.h +76 -0
- data/ext/minigraph/cal_cov.c +139 -0
- data/ext/minigraph/doc/example1.png +0 -0
- data/ext/minigraph/doc/example2.png +0 -0
- data/ext/minigraph/doc/examples.graffle +0 -0
- data/ext/minigraph/format.c +241 -0
- data/ext/minigraph/galign.c +140 -0
- data/ext/minigraph/gchain1.c +532 -0
- data/ext/minigraph/gcmisc.c +223 -0
- data/ext/minigraph/gfa-aug.c +260 -0
- data/ext/minigraph/gfa-base.c +526 -0
- data/ext/minigraph/gfa-bbl.c +372 -0
- data/ext/minigraph/gfa-ed.c +617 -0
- data/ext/minigraph/gfa-io.c +395 -0
- data/ext/minigraph/gfa-priv.h +154 -0
- data/ext/minigraph/gfa.h +166 -0
- data/ext/minigraph/ggen.c +182 -0
- data/ext/minigraph/ggen.h +21 -0
- data/ext/minigraph/ggsimple.c +570 -0
- data/ext/minigraph/gmap.c +211 -0
- data/ext/minigraph/index.c +230 -0
- data/ext/minigraph/kalloc.c +224 -0
- data/ext/minigraph/kalloc.h +82 -0
- data/ext/minigraph/kavl.h +414 -0
- data/ext/minigraph/kdq.h +134 -0
- data/ext/minigraph/ketopt.h +116 -0
- data/ext/minigraph/khashl.h +348 -0
- data/ext/minigraph/krmq.h +474 -0
- data/ext/minigraph/kseq.h +256 -0
- data/ext/minigraph/ksort.h +164 -0
- data/ext/minigraph/kstring.h +165 -0
- data/ext/minigraph/kthread.c +159 -0
- data/ext/minigraph/kthread.h +15 -0
- data/ext/minigraph/kvec-km.h +105 -0
- data/ext/minigraph/kvec.h +110 -0
- data/ext/minigraph/lchain.c +441 -0
- data/ext/minigraph/main.c +301 -0
- data/ext/minigraph/map-algo.c +500 -0
- data/ext/minigraph/mgpriv.h +128 -0
- data/ext/minigraph/minigraph.1 +359 -0
- data/ext/minigraph/minigraph.h +176 -0
- data/ext/minigraph/miniwfa.c +834 -0
- data/ext/minigraph/miniwfa.h +95 -0
- data/ext/minigraph/misc/mgutils.js +1451 -0
- data/ext/minigraph/misc.c +12 -0
- data/ext/minigraph/options.c +134 -0
- data/ext/minigraph/shortk.c +251 -0
- data/ext/minigraph/sketch.c +109 -0
- data/ext/minigraph/sys.c +147 -0
- data/ext/minigraph/sys.h +20 -0
- data/ext/minigraph/test/MT-chimp.fa +277 -0
- data/ext/minigraph/test/MT-human.fa +239 -0
- data/ext/minigraph/test/MT-orangA.fa +276 -0
- data/ext/minigraph/test/MT.gfa +19 -0
- data/ext/minigraph/tex/Makefile +13 -0
- data/ext/minigraph/tex/minigraph.bib +676 -0
- data/ext/minigraph/tex/minigraph.tex +986 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
- data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
- data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
- data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
- data/ext/minigraph/tex/plots/bedutils.js +367 -0
- data/ext/minigraph/tex/plots/chr-plot.js +130 -0
- data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
- data/ext/minigraph.patch +21 -0
- data/lib/minigraph/ffi/constants.rb +230 -0
- data/lib/minigraph/ffi/functions.rb +70 -0
- data/lib/minigraph/ffi/mappy.rb +8 -0
- data/lib/minigraph/ffi.rb +27 -0
- data/lib/minigraph/version.rb +5 -0
- data/lib/minigraph.rb +72 -0
- metadata +159 -0
@@ -0,0 +1,441 @@
|
|
1
|
+
#include <stdint.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#include <assert.h>
|
5
|
+
#include "mgpriv.h"
|
6
|
+
#include "kalloc.h"
|
7
|
+
#include "krmq.h"
|
8
|
+
|
9
|
+
static int64_t mg_chain_bk_end(int32_t max_drop, const mg128_t *z, const int32_t *f, const int64_t *p, int32_t *t, int64_t k)
|
10
|
+
{
|
11
|
+
int64_t i = z[k].y, end_i = -1, max_i = i;
|
12
|
+
int32_t max_s = 0;
|
13
|
+
if (i < 0 || t[i] != 0) return i;
|
14
|
+
do {
|
15
|
+
int32_t s;
|
16
|
+
t[i] = 2;
|
17
|
+
end_i = i = p[i];
|
18
|
+
s = i < 0? z[k].x : (int32_t)z[k].x - f[i];
|
19
|
+
if (s > max_s) max_s = s, max_i = i;
|
20
|
+
else if (max_s - s > max_drop) break;
|
21
|
+
} while (i >= 0 && t[i] == 0);
|
22
|
+
for (i = z[k].y; i >= 0 && i != end_i; i = p[i]) // reset modified t[]
|
23
|
+
t[i] = 0;
|
24
|
+
return max_i;
|
25
|
+
}
|
26
|
+
|
27
|
+
uint64_t *mg_chain_backtrack(void *km, int64_t n, const int32_t *f, const int64_t *p, int32_t *v, int32_t *t, int32_t min_cnt, int32_t min_sc, int32_t max_drop,
|
28
|
+
int32_t extra_u, int32_t *n_u_, int32_t *n_v_)
|
29
|
+
{
|
30
|
+
mg128_t *z;
|
31
|
+
uint64_t *u;
|
32
|
+
int64_t i, k, n_z, n_v;
|
33
|
+
int32_t n_u;
|
34
|
+
|
35
|
+
*n_u_ = *n_v_ = 0;
|
36
|
+
for (i = 0, n_z = 0; i < n; ++i) // precompute n_z
|
37
|
+
if (f[i] >= min_sc) ++n_z;
|
38
|
+
if (n_z == 0) return 0;
|
39
|
+
KMALLOC(km, z, n_z);
|
40
|
+
for (i = 0, k = 0; i < n; ++i) // populate z[]
|
41
|
+
if (f[i] >= min_sc) z[k].x = f[i], z[k++].y = i;
|
42
|
+
radix_sort_128x(z, z + n_z);
|
43
|
+
|
44
|
+
memset(t, 0, n * 4);
|
45
|
+
for (k = n_z - 1, n_v = n_u = 0; k >= 0; --k) { // precompute n_u
|
46
|
+
if (t[z[k].y] == 0) {
|
47
|
+
int64_t n_v0 = n_v, end_i;
|
48
|
+
int32_t sc;
|
49
|
+
end_i = mg_chain_bk_end(max_drop, z, f, p, t, k);
|
50
|
+
for (i = z[k].y; i != end_i; i = p[i])
|
51
|
+
++n_v, t[i] = 1;
|
52
|
+
sc = i < 0? z[k].x : (int32_t)z[k].x - f[i];
|
53
|
+
if (sc >= min_sc && n_v > n_v0 && n_v - n_v0 >= min_cnt)
|
54
|
+
++n_u;
|
55
|
+
else n_v = n_v0;
|
56
|
+
}
|
57
|
+
}
|
58
|
+
KMALLOC(km, u, n_u + extra_u);
|
59
|
+
memset(t, 0, n * 4);
|
60
|
+
for (k = n_z - 1, n_v = n_u = 0; k >= 0; --k) { // populate u[]
|
61
|
+
if (t[z[k].y] == 0) {
|
62
|
+
int64_t n_v0 = n_v, end_i;
|
63
|
+
int32_t sc;
|
64
|
+
end_i = mg_chain_bk_end(max_drop, z, f, p, t, k);
|
65
|
+
for (i = z[k].y; i != end_i; i = p[i])
|
66
|
+
v[n_v++] = i, t[i] = 1;
|
67
|
+
sc = i < 0? z[k].x : (int32_t)z[k].x - f[i];
|
68
|
+
if (sc >= min_sc && n_v > n_v0 && n_v - n_v0 >= min_cnt)
|
69
|
+
u[n_u++] = (uint64_t)sc << 32 | (n_v - n_v0);
|
70
|
+
else n_v = n_v0;
|
71
|
+
}
|
72
|
+
}
|
73
|
+
kfree(km, z);
|
74
|
+
assert(n_v < INT32_MAX);
|
75
|
+
*n_u_ = n_u, *n_v_ = n_v;
|
76
|
+
return u;
|
77
|
+
}
|
78
|
+
|
79
|
+
/*
 * Turn the backtrack output into a compact anchor array.
 *
 *   u[0..n_u)  score<<32 | #anchors for each chain (reordered in place)
 *   v[0..n_v)  anchor indices into a[], each chain stored last-anchor-first
 *   a[]        the full anchor array the indices refer to
 *
 * Steps:
 *   1. copy the chained anchors into a new array b[], reversing every chain so
 *      its anchors appear in forward order;
 *   2. order the chains by the .x of their first anchor using
 *      radix_sort_128x(), and rewrite u[] and the anchors in that order.
 *
 * v[] and a[] are freed here.  The returned b[] has n_v elements and is owned
 * by the caller.
 */
static mg128_t *compact_a(void *km, int32_t n_u, uint64_t *u, int32_t n_v, int32_t *v, mg128_t *a)
{
	mg128_t *b, *order;
	uint64_t *u_sorted;
	int64_t c, m, out;

	// step 1: gather anchors chain by chain, undoing the backtrack order
	KMALLOC(km, b, n_v);
	out = 0;
	for (c = 0; c < n_u; ++c) {
		const int32_t base = out;
		const int32_t len = (int32_t)u[c];

		for (m = len - 1; m >= 0; --m)
			b[out++] = a[v[base + m]];
	}
	kfree(km, v);

	// step 2a: sort key = .x of the chain's first anchor; payload = offset in b[] and chain index
	KMALLOC(km, order, n_u);
	out = 0;
	for (c = 0; c < n_u; ++c) {
		order[c].x = b[out].x;
		order[c].y = (uint64_t)out<<32|c;
		out += (int32_t)u[c];
	}
	radix_sort_128x(order, order + n_u);

	// step 2b: lay the chains out in sorted order, using a[] as scratch space
	KMALLOC(km, u_sorted, n_u);
	out = 0;
	for (c = 0; c < n_u; ++c) {
		const int32_t src = (int32_t)order[c].y;
		const int32_t len = (int32_t)u[src];

		u_sorted[c] = u[src];
		memcpy(&a[out], &b[order[c].y>>32], len * sizeof(mg128_t));
		out += len;
	}
	memcpy(u, u_sorted, n_u * 8);

	// a[] may be far larger than needed, so the result is moved to b[] and a[] released
	memcpy(b, a, out * sizeof(mg128_t));
	kfree(km, a);
	kfree(km, order);
	kfree(km, u_sorted);
	return b;
}
|
113
|
+
|
114
|
+
static inline int32_t comput_sc(const mg128_t *ai, const mg128_t *aj, int32_t max_dist_x, int32_t max_dist_y, int32_t bw, float chn_pen_gap, float chn_pen_skip, int is_cdna, int n_seg)
|
115
|
+
{
|
116
|
+
int32_t dq = (int32_t)ai->y - (int32_t)aj->y, dr, dd, dg, q_span, sc;
|
117
|
+
int32_t sidi = (ai->y & MG_SEED_SEG_MASK) >> MG_SEED_SEG_SHIFT;
|
118
|
+
int32_t sidj = (aj->y & MG_SEED_SEG_MASK) >> MG_SEED_SEG_SHIFT;
|
119
|
+
if (dq <= 0 || dq > max_dist_x) return INT32_MIN;
|
120
|
+
dr = (int32_t)(ai->x - aj->x);
|
121
|
+
if (sidi == sidj && (dr == 0 || dq > max_dist_y)) return INT32_MIN;
|
122
|
+
dd = dr > dq? dr - dq : dq - dr;
|
123
|
+
if (sidi == sidj && dd > bw) return INT32_MIN;
|
124
|
+
if (n_seg > 1 && !is_cdna && sidi == sidj && dr > max_dist_y) return INT32_MIN;
|
125
|
+
dg = dr < dq? dr : dq;
|
126
|
+
q_span = aj->y>>32&0xff;
|
127
|
+
sc = q_span < dg? q_span : dg;
|
128
|
+
if (dd || dg > q_span) {
|
129
|
+
float lin_pen, log_pen;
|
130
|
+
lin_pen = chn_pen_gap * (float)dd + chn_pen_skip * (float)dg;
|
131
|
+
log_pen = dd >= 1? mg_log2(dd + 1) : 0.0f; // mg_log2() only works for dd>=2
|
132
|
+
if (is_cdna || sidi != sidj) {
|
133
|
+
if (sidi != sidj && dr == 0) ++sc; // possibly due to overlapping paired ends; give a minor bonus
|
134
|
+
else if (dr > dq || sidi != sidj) sc -= (int)(lin_pen < log_pen? lin_pen : log_pen); // deletion or jump between paired ends
|
135
|
+
else sc -= (int)(lin_pen + .5f * log_pen);
|
136
|
+
} else sc -= (int)(lin_pen + .5f * log_pen);
|
137
|
+
}
|
138
|
+
return sc;
|
139
|
+
}
|
140
|
+
|
141
|
+
/* Input:
|
142
|
+
* a[].x: tid<<33 | rev<<32 | tpos
|
143
|
+
* a[].y: flags<<40 | q_span<<32 | q_pos
|
144
|
+
* Output:
|
145
|
+
* n_u: #chains
|
146
|
+
* u[]: score<<32 | #anchors (sum of lower 32 bits of u[] is the returned length of a[])
|
147
|
+
* input a[] is deallocated on return
|
148
|
+
*/
|
149
|
+
mg128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
|
150
|
+
int is_cdna, int n_seg, int64_t n, mg128_t *a, int *n_u_, uint64_t **_u, void *km)
|
151
|
+
{ // TODO: make sure this works when n has more than 32 bits
|
152
|
+
int32_t *f, *t, *v, n_u, n_v, mmax_f = 0, max_drop = bw;
|
153
|
+
int64_t *p, i, j, max_ii, st = 0, n_iter = 0;
|
154
|
+
uint64_t *u;
|
155
|
+
|
156
|
+
if (_u) *_u = 0, *n_u_ = 0;
|
157
|
+
if (n == 0 || a == 0) {
|
158
|
+
kfree(km, a);
|
159
|
+
return 0;
|
160
|
+
}
|
161
|
+
if (max_dist_x < bw) max_dist_x = bw;
|
162
|
+
if (max_dist_y < bw && !is_cdna) max_dist_y = bw;
|
163
|
+
if (is_cdna) max_drop = INT32_MAX;
|
164
|
+
KMALLOC(km, p, n);
|
165
|
+
KMALLOC(km, f, n);
|
166
|
+
KMALLOC(km, v, n);
|
167
|
+
KCALLOC(km, t, n);
|
168
|
+
|
169
|
+
// fill the score and backtrack arrays
|
170
|
+
for (i = 0, max_ii = -1; i < n; ++i) {
|
171
|
+
int64_t max_j = -1, end_j;
|
172
|
+
int32_t max_f = a[i].y>>32&0xff, n_skip = 0;
|
173
|
+
while (st < i && (a[i].x>>32 != a[st].x>>32 || a[i].x > a[st].x + max_dist_x)) ++st;
|
174
|
+
if (i - st > max_iter) st = i - max_iter;
|
175
|
+
for (j = i - 1; j >= st; --j) {
|
176
|
+
int32_t sc;
|
177
|
+
sc = comput_sc(&a[i], &a[j], max_dist_x, max_dist_y, bw, chn_pen_gap, chn_pen_skip, is_cdna, n_seg);
|
178
|
+
++n_iter;
|
179
|
+
if (sc == INT32_MIN) continue;
|
180
|
+
sc += f[j];
|
181
|
+
if (sc > max_f) {
|
182
|
+
max_f = sc, max_j = j;
|
183
|
+
if (n_skip > 0) --n_skip;
|
184
|
+
} else if (t[j] == (int32_t)i) {
|
185
|
+
if (++n_skip > max_skip)
|
186
|
+
break;
|
187
|
+
}
|
188
|
+
if (p[j] >= 0) t[p[j]] = i;
|
189
|
+
}
|
190
|
+
end_j = j;
|
191
|
+
if (max_ii < 0 || a[i].x - a[max_ii].x > (int64_t)max_dist_x) {
|
192
|
+
int32_t max = INT32_MIN;
|
193
|
+
max_ii = -1;
|
194
|
+
for (j = i - 1; j >= st; --j)
|
195
|
+
if (max < f[j]) max = f[j], max_ii = j;
|
196
|
+
}
|
197
|
+
if (max_ii >= 0 && max_ii < end_j) {
|
198
|
+
int32_t tmp;
|
199
|
+
tmp = comput_sc(&a[i], &a[max_ii], max_dist_x, max_dist_y, bw, chn_pen_gap, chn_pen_skip, is_cdna, n_seg);
|
200
|
+
if (tmp != INT32_MIN && max_f < tmp + f[max_ii])
|
201
|
+
max_f = tmp + f[max_ii], max_j = max_ii;
|
202
|
+
}
|
203
|
+
f[i] = max_f, p[i] = max_j;
|
204
|
+
v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak
|
205
|
+
if (max_ii < 0 || (a[i].x - a[max_ii].x <= (int64_t)max_dist_x && f[max_ii] < f[i]))
|
206
|
+
max_ii = i;
|
207
|
+
if (mmax_f < max_f) mmax_f = max_f;
|
208
|
+
}
|
209
|
+
if (mg_dbg_flag & MG_DBG_LC_PROF) fprintf(stderr, "LP\tn_iter=%ld\tmmax_f=%d\n", (long)n_iter, mmax_f);
|
210
|
+
|
211
|
+
u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, max_drop, 0, &n_u, &n_v);
|
212
|
+
*n_u_ = n_u, *_u = u; // NB: note that u[] may not be sorted by score here
|
213
|
+
kfree(km, p); kfree(km, f); kfree(km, t);
|
214
|
+
if (n_u == 0) {
|
215
|
+
kfree(km, a); kfree(km, v);
|
216
|
+
return 0;
|
217
|
+
}
|
218
|
+
return compact_a(km, n_u, u, n_v, v, a);
|
219
|
+
}
|
220
|
+
|
221
|
+
typedef struct lc_elem_s {
|
222
|
+
int32_t y;
|
223
|
+
int64_t i;
|
224
|
+
double pri;
|
225
|
+
KRMQ_HEAD(struct lc_elem_s) head;
|
226
|
+
} lc_elem_t;
|
227
|
+
|
228
|
+
#define lc_elem_cmp(a, b) ((a)->y < (b)->y? -1 : (a)->y > (b)->y? 1 : ((a)->i > (b)->i) - ((a)->i < (b)->i))
|
229
|
+
#define lc_elem_lt2(a, b) ((a)->pri < (b)->pri)
|
230
|
+
KRMQ_INIT(lc_elem, lc_elem_t, head, lc_elem_cmp, lc_elem_lt2)
|
231
|
+
|
232
|
+
KALLOC_POOL_INIT(rmq, lc_elem_t)
|
233
|
+
|
234
|
+
static inline int32_t comput_sc_simple(const mg128_t *ai, const mg128_t *aj, float chn_pen_gap, float chn_pen_skip, int32_t *exact, int32_t *width)
|
235
|
+
{
|
236
|
+
int32_t dq = (int32_t)ai->y - (int32_t)aj->y, dr, dd, dg, q_span, sc;
|
237
|
+
dr = (int32_t)(ai->x - aj->x);
|
238
|
+
*width = dd = dr > dq? dr - dq : dq - dr;
|
239
|
+
dg = dr < dq? dr : dq;
|
240
|
+
q_span = aj->y>>32&0xff;
|
241
|
+
sc = q_span < dg? q_span : dg;
|
242
|
+
if (exact) *exact = (dd == 0 && dg <= q_span);
|
243
|
+
if (dd || dq > q_span) {
|
244
|
+
float lin_pen, log_pen;
|
245
|
+
lin_pen = chn_pen_gap * (float)dd + chn_pen_skip * (float)dg;
|
246
|
+
log_pen = dd >= 1? mg_log2(dd + 1) : 0.0f; // mg_log2() only works for dd>=2
|
247
|
+
sc -= (int)(lin_pen + .5f * log_pen);
|
248
|
+
}
|
249
|
+
return sc;
|
250
|
+
}
|
251
|
+
|
252
|
+
/*
 * Chain anchors using range-min queries over a balanced tree instead of an
 * explicit scan of all predecessors.
 *
 * a[] uses the same layout as in mg_lchain_dp() and is always deallocated
 * before returning.  On return *n_u_ holds the number of chains and *_u the
 * u[] array (score<<32 | #anchors per chain); the return value is the
 * compacted anchor array, or 0 if there is no chain.
 *
 * Two trees of lc_elem_t nodes are kept, both keyed by (q_pos, anchor index):
 *   root        anchors within max_dist of the current one
 *   root_inner  anchors within max_dist_inner; only used when
 *               0 < max_dist_inner < max_dist
 * Anchors are inserted once an anchor with a different .x is reached, and
 * are evicted when they fall out of range or when the tree holds more than
 * cap_rmq_size nodes.
 *
 * For each anchor, one krmq_rmq() query on `root` gives the candidate with the
 * smallest priority value.  If that link is not exact, the inner tree is
 * walked backwards from the closest smaller q_pos, scoring every candidate with
 * comput_sc_simple(), stopping after more than max_chn_skip unproductive hits.
 *
 * NOTE(review): _u is NULL-checked when the outputs are cleared, but *n_u_ and
 * *_u are written unconditionally after backtracking, so both must be non-NULL.
 */
mg128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
					   int64_t n, mg128_t *a, int *n_u_, uint64_t **_u, void *km)
{
	int32_t *f, *t, *v, n_u, n_v;
	int32_t best_overall = 0, peak_tree_size = 0, max_drop = bw;
	int64_t *p, i, pending = 0, evict = 0, evict_inner = 0, n_iter = 0;
	uint64_t *u;
	lc_elem_t *root = 0, *root_inner = 0;
	void *mem_mp = 0;
	kmp_rmq_t *mp;

	if (_u) {
		*_u = 0;
		*n_u_ = 0;
	}
	if (n == 0 || a == 0) {
		kfree(km, a);
		return 0;
	}
	if (max_dist < bw)
		max_dist = bw;
	if (max_dist_inner <= 0 || max_dist_inner >= max_dist)
		max_dist_inner = 0; // disables the inner tree
	KMALLOC(km, p, n);
	KMALLOC(km, f, n);
	KCALLOC(km, t, n);
	KMALLOC(km, v, n);
	mem_mp = km_init2(km, 0x10000);
	mp = kmp_init_rmq(mem_mp);

	// fill the score and backtrack arrays
	for (i = 0; i < n; ++i) {
		int64_t max_j = -1;
		int32_t q_span = a[i].y>>32&0xff, best_f = q_span;
		lc_elem_t key, *q, *twin, lo, hi;

		// add in-range anchors: everything queued since the last change of .x
		if (pending < i && a[pending].x != a[i].x) {
			int64_t j;

			for (j = pending; j < i; ++j) {
				q = kmp_alloc_rmq(mp);
				q->y = (int32_t)a[j].y;
				q->i = j;
				q->pri = -(f[j] + 0.5 * chn_pen_gap * ((int32_t)a[j].x + (int32_t)a[j].y));
				krmq_insert(lc_elem, &root, q, 0);
				if (max_dist_inner > 0) {
					twin = kmp_alloc_rmq(mp);
					*twin = *q;
					krmq_insert(lc_elem, &root_inner, twin, 0);
				}
			}
			pending = i;
		}

		// get rid of active chains out of range
		while (evict < i && (a[i].x>>32 != a[evict].x>>32 || a[i].x > a[evict].x + max_dist || krmq_size(head, root) > cap_rmq_size)) {
			key.y = (int32_t)a[evict].y;
			key.i = evict;
			q = krmq_find(lc_elem, root, &key, 0);
			if (q != 0) {
				q = krmq_erase(lc_elem, &root, q, 0);
				kmp_free_rmq(mp, q);
			}
			++evict;
		}
		if (max_dist_inner > 0) { // similar to the block above, but applied to the inner tree
			while (evict_inner < i && (a[i].x>>32 != a[evict_inner].x>>32 || a[i].x > a[evict_inner].x + max_dist_inner || krmq_size(head, root_inner) > cap_rmq_size)) {
				key.y = (int32_t)a[evict_inner].y;
				key.i = evict_inner;
				q = krmq_find(lc_elem, root_inner, &key, 0);
				if (q != 0) {
					q = krmq_erase(lc_elem, &root_inner, q, 0);
					kmp_free_rmq(mp, q);
				}
				++evict_inner;
			}
		}

		// RMQ over q_pos in [q_pos(i) - max_dist, q_pos(i) - 1]
		lo.i = INT32_MAX;
		lo.y = (int32_t)a[i].y - max_dist;
		hi.i = 0;
		hi.y = (int32_t)a[i].y - 1;
		q = krmq_rmq(lc_elem, root, &lo, &hi);
		if (q != 0) {
			int32_t sc, exact, width, n_skip = 0;
			int64_t j = q->i;

			assert(q->y >= lo.y && q->y <= hi.y);
			sc = f[j] + comput_sc_simple(&a[i], &a[j], chn_pen_gap, chn_pen_skip, &exact, &width);
			if (width <= bw && sc > best_f) {
				best_f = sc;
				max_j = j;
			}
			if (!exact && root_inner && (int32_t)a[i].y > 0) {
				lc_elem_t *below, *above;

				key.y = (int32_t)a[i].y - 1;
				key.i = n;
				krmq_interval(lc_elem, root_inner, &key, &below, &above);
				if (below) {
					const lc_elem_t *cand;
					int32_t cand_width, n_rmq_iter = 0;
					krmq_itr_t(lc_elem) itr;

					// walk the inner tree backwards from the closest smaller q_pos
					krmq_itr_find(lc_elem, root_inner, below, &itr);
					while ((cand = krmq_at(&itr)) != 0) {
						if (cand->y < (int32_t)a[i].y - max_dist_inner)
							break;
						++n_rmq_iter;
						j = cand->i;
						sc = f[j] + comput_sc_simple(&a[i], &a[j], chn_pen_gap, chn_pen_skip, 0, &cand_width);
						if (cand_width <= bw) {
							if (sc > best_f) {
								best_f = sc;
								max_j = j;
								if (n_skip > 0)
									--n_skip;
							} else if (t[j] == (int32_t)i) {
								if (++n_skip > max_chn_skip)
									break;
							}
							if (p[j] >= 0)
								t[p[j]] = i;
						}
						if (!krmq_itr_prev(lc_elem, &itr))
							break;
					}
					n_iter += n_rmq_iter;
				}
			}
		}

		// set max
		assert(max_j < 0 || (a[max_j].x < a[i].x && (int32_t)a[max_j].y < (int32_t)a[i].y));
		f[i] = best_f;
		p[i] = max_j;
		// v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak
		v[i] = (max_j >= 0 && v[max_j] > best_f) ? v[max_j] : best_f;
		if (best_overall < best_f)
			best_overall = best_f;
		if (peak_tree_size < krmq_size(head, root))
			peak_tree_size = krmq_size(head, root);
	}
	if (mg_dbg_flag & MG_DBG_LC_PROF)
		fprintf(stderr, "LP\tn_iter=%ld\tmmax_f=%d\trmq_size=%d\tmp_max=%ld\n", (long)n_iter, best_overall, peak_tree_size, mp->max);
	km_destroy(mem_mp);

	u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, max_drop, 0, &n_u, &n_v);
	// NB: note that u[] may not be sorted by score here
	*n_u_ = n_u;
	*_u = u;
	kfree(km, p);
	kfree(km, f);
	kfree(km, t);
	if (n_u == 0) {
		kfree(km, a);
		kfree(km, v);
		return 0;
	}
	return compact_a(km, n_u, u, n_v, v, a);
}
|
373
|
+
|
374
|
+
/*
 * Build one mg_lchain_t record per chain.
 *
 *   n_u, u[]  chains as produced by the chaining functions:
 *             u[i] = score<<32 | #anchors, with the anchors of consecutive
 *             chains stored back to back in a[]
 *   a[]       compacted anchor array
 *
 * The records are ordered with radix_sort_128x() on the key
 * (query start << 32 | score).  For each record:
 *   off / cnt  position and number of the chain's anchors in a[]
 *   score      chain score
 *   v          high 32 bits of the first anchor's .x
 *   rs, qs     start on the target / query, derived from the first anchor
 *              and its q_span (rs is clamped at 0)
 *   re, qe     end on the target / query, derived from the last anchor
 *
 * Returns a zero-initialised-then-filled array of n_u records, or 0 when
 * n_u == 0.  `hash` and `qlen` are not used by this function.
 */
mg_lchain_t *mg_lchain_gen(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mg128_t *a)
{
	mg128_t *key;
	mg_lchain_t *r;
	int c, off;

	if (n_u == 0)
		return 0;
	KCALLOC(km, r, n_u);

	// sort by query position
	KMALLOC(km, key, n_u);
	off = 0;
	for (c = 0; c < n_u; ++c) {
		const int32_t cnt = (int32_t)u[c];
		int32_t qs = (int32_t)a[off].y + 1 - (a[off].y>>32 & 0xff);

		key[c].x = (uint64_t)qs << 32 | u[c] >> 32;
		key[c].y = (uint64_t)off << 32 | cnt;
		off += cnt;
	}
	radix_sort_128x(key, key + n_u);

	// populate r[]
	for (c = 0; c < n_u; ++c) {
		mg_lchain_t *lc = &r[c];
		const mg128_t *first, *last;
		int32_t start = key[c].y >> 32, q_span;

		first = &a[start];
		q_span = first->y >> 32 & 0xff;
		lc->off = start;
		lc->cnt = (int32_t)key[c].y;
		lc->score = (uint32_t)key[c].x;
		lc->v = first->x >> 32;
		lc->rs = (int32_t)first->x + 1 > q_span ? (int32_t)first->x + 1 - q_span : 0; // for HPC k-mer
		lc->qs = key[c].x >> 32;
		last = &a[start + lc->cnt - 1];
		lc->re = (int32_t)last->x + 1;
		lc->qe = (int32_t)last->y + 1;
	}
	kfree(km, key);
	return r;
}
|
409
|
+
|
410
|
+
static int32_t get_mini_idx(const mg128_t *a, int32_t n, const int32_t *mini_pos)
|
411
|
+
{
|
412
|
+
int32_t x, L = 0, R = n - 1;
|
413
|
+
x = (int32_t)a->y;
|
414
|
+
while (L <= R) { // binary search
|
415
|
+
int32_t m = ((uint64_t)L + R) >> 1;
|
416
|
+
int32_t y = mini_pos[m];
|
417
|
+
if (y < x) L = m + 1;
|
418
|
+
else if (y > x) R = m - 1;
|
419
|
+
else return m;
|
420
|
+
}
|
421
|
+
return -1;
|
422
|
+
}
|
423
|
+
|
424
|
+
/* Before:
|
425
|
+
* a[].x: tid<<33 | rev<<32 | tpos
|
426
|
+
* a[].y: flags<<40 | q_span<<32 | q_pos
|
427
|
+
* After:
|
428
|
+
* a[].x: mini_pos<<32 | tpos
|
429
|
+
* a[].y: same
|
430
|
+
*/
|
431
|
+
void mg_update_anchors(int32_t n_a, mg128_t *a, int32_t n, const int32_t *mini_pos)
|
432
|
+
{
|
433
|
+
int32_t st, j, k;
|
434
|
+
if (n_a <= 0) return;
|
435
|
+
st = get_mini_idx(&a[0], n, mini_pos);
|
436
|
+
assert(st >= 0);
|
437
|
+
for (k = 0, j = st; j < n && k < n_a; ++j)
|
438
|
+
if ((int32_t)a[k].y == mini_pos[j])
|
439
|
+
a[k].x = (uint64_t)j << 32 | (a[k].x & 0xffffffffU), ++k;
|
440
|
+
assert(k == n_a);
|
441
|
+
}
|