minimap2 0.2.22.0 → 0.2.24.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +60 -76
- data/ext/Rakefile +55 -0
- data/ext/cmappy/cmappy.c +129 -0
- data/ext/cmappy/cmappy.h +44 -0
- data/ext/minimap2/FAQ.md +46 -0
- data/ext/minimap2/LICENSE.txt +24 -0
- data/ext/minimap2/MANIFEST.in +10 -0
- data/ext/minimap2/Makefile +132 -0
- data/ext/minimap2/Makefile.simde +97 -0
- data/ext/minimap2/NEWS.md +821 -0
- data/ext/minimap2/README.md +403 -0
- data/ext/minimap2/align.c +1020 -0
- data/ext/minimap2/bseq.c +169 -0
- data/ext/minimap2/bseq.h +64 -0
- data/ext/minimap2/code_of_conduct.md +30 -0
- data/ext/minimap2/cookbook.md +243 -0
- data/ext/minimap2/esterr.c +64 -0
- data/ext/minimap2/example.c +63 -0
- data/ext/minimap2/format.c +559 -0
- data/ext/minimap2/hit.c +466 -0
- data/ext/minimap2/index.c +775 -0
- data/ext/minimap2/kalloc.c +205 -0
- data/ext/minimap2/kalloc.h +76 -0
- data/ext/minimap2/kdq.h +132 -0
- data/ext/minimap2/ketopt.h +120 -0
- data/ext/minimap2/khash.h +615 -0
- data/ext/minimap2/krmq.h +474 -0
- data/ext/minimap2/kseq.h +256 -0
- data/ext/minimap2/ksort.h +153 -0
- data/ext/minimap2/ksw2.h +184 -0
- data/ext/minimap2/ksw2_dispatch.c +96 -0
- data/ext/minimap2/ksw2_extd2_sse.c +402 -0
- data/ext/minimap2/ksw2_exts2_sse.c +416 -0
- data/ext/minimap2/ksw2_extz2_sse.c +313 -0
- data/ext/minimap2/ksw2_ll_sse.c +152 -0
- data/ext/minimap2/kthread.c +159 -0
- data/ext/minimap2/kthread.h +15 -0
- data/ext/minimap2/kvec.h +105 -0
- data/ext/minimap2/lchain.c +369 -0
- data/ext/minimap2/main.c +459 -0
- data/ext/minimap2/map.c +714 -0
- data/ext/minimap2/minimap.h +410 -0
- data/ext/minimap2/minimap2.1 +725 -0
- data/ext/minimap2/misc/README.md +179 -0
- data/ext/minimap2/misc/mmphase.js +335 -0
- data/ext/minimap2/misc/paftools.js +3149 -0
- data/ext/minimap2/misc.c +162 -0
- data/ext/minimap2/mmpriv.h +132 -0
- data/ext/minimap2/options.c +234 -0
- data/ext/minimap2/pe.c +177 -0
- data/ext/minimap2/python/README.rst +196 -0
- data/ext/minimap2/python/cmappy.h +152 -0
- data/ext/minimap2/python/cmappy.pxd +153 -0
- data/ext/minimap2/python/mappy.pyx +273 -0
- data/ext/minimap2/python/minimap2.py +39 -0
- data/ext/minimap2/sdust.c +213 -0
- data/ext/minimap2/sdust.h +25 -0
- data/ext/minimap2/seed.c +131 -0
- data/ext/minimap2/setup.py +55 -0
- data/ext/minimap2/sketch.c +143 -0
- data/ext/minimap2/splitidx.c +84 -0
- data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
- data/ext/minimap2/test/MT-human.fa +278 -0
- data/ext/minimap2/test/MT-orang.fa +276 -0
- data/ext/minimap2/test/q-inv.fa +4 -0
- data/ext/minimap2/test/q2.fa +2 -0
- data/ext/minimap2/test/t-inv.fa +127 -0
- data/ext/minimap2/test/t2.fa +2 -0
- data/ext/minimap2/tex/Makefile +21 -0
- data/ext/minimap2/tex/bioinfo.cls +930 -0
- data/ext/minimap2/tex/blasr-mc.eval +17 -0
- data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
- data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
- data/ext/minimap2/tex/bwa.eval +55 -0
- data/ext/minimap2/tex/eval2roc.pl +33 -0
- data/ext/minimap2/tex/graphmap.eval +4 -0
- data/ext/minimap2/tex/hs38-simu.sh +10 -0
- data/ext/minimap2/tex/minialign.eval +49 -0
- data/ext/minimap2/tex/minimap2.bib +460 -0
- data/ext/minimap2/tex/minimap2.tex +724 -0
- data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
- data/ext/minimap2/tex/mm2-update.tex +240 -0
- data/ext/minimap2/tex/mm2.approx.eval +12 -0
- data/ext/minimap2/tex/mm2.eval +13 -0
- data/ext/minimap2/tex/natbib.bst +1288 -0
- data/ext/minimap2/tex/natbib.sty +803 -0
- data/ext/minimap2/tex/ngmlr.eval +38 -0
- data/ext/minimap2/tex/roc.gp +60 -0
- data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
- data/ext/minimap2.patch +19 -0
- data/lib/minimap2/aligner.rb +4 -4
- data/lib/minimap2/alignment.rb +11 -11
- data/lib/minimap2/ffi/constants.rb +20 -16
- data/lib/minimap2/ffi/functions.rb +5 -0
- data/lib/minimap2/ffi.rb +4 -5
- data/lib/minimap2/version.rb +2 -2
- data/lib/minimap2.rb +51 -15
- metadata +97 -79
- data/lib/minimap2/ffi_helper.rb +0 -53
- data/vendor/libminimap2.so +0 -0
data/ext/minimap2/hit.c
ADDED
@@ -0,0 +1,466 @@
|
|
1
|
+
#include <string.h>
|
2
|
+
#include <stdlib.h>
|
3
|
+
#include <math.h>
|
4
|
+
#include "mmpriv.h"
|
5
|
+
#include "kalloc.h"
|
6
|
+
#include "khash.h"
|
7
|
+
|
8
|
+
static inline void mm_cal_fuzzy_len(mm_reg1_t *r, const mm128_t *a)
|
9
|
+
{
|
10
|
+
int i;
|
11
|
+
r->mlen = r->blen = 0;
|
12
|
+
if (r->cnt <= 0) return;
|
13
|
+
r->mlen = r->blen = a[r->as].y>>32&0xff;
|
14
|
+
for (i = r->as + 1; i < r->as + r->cnt; ++i) {
|
15
|
+
int span = a[i].y>>32&0xff;
|
16
|
+
int tl = (int32_t)a[i].x - (int32_t)a[i-1].x;
|
17
|
+
int ql = (int32_t)a[i].y - (int32_t)a[i-1].y;
|
18
|
+
r->blen += tl > ql? tl : ql;
|
19
|
+
r->mlen += tl > span && ql > span? span : tl < ql? tl : ql;
|
20
|
+
}
|
21
|
+
}
|
22
|
+
|
23
|
+
static inline void mm_reg_set_coor(mm_reg1_t *r, int32_t qlen, const mm128_t *a, int is_qstrand)
|
24
|
+
{ // NB: r->as and r->cnt MUST BE set correctly for this function to work
|
25
|
+
int32_t k = r->as, q_span = (int32_t)(a[k].y>>32&0xff);
|
26
|
+
r->rev = a[k].x>>63;
|
27
|
+
r->rid = a[k].x<<1>>33;
|
28
|
+
r->rs = (int32_t)a[k].x + 1 > q_span? (int32_t)a[k].x + 1 - q_span : 0; // NB: target span may be shorter, so this test is necessary
|
29
|
+
r->re = (int32_t)a[k + r->cnt - 1].x + 1;
|
30
|
+
if (!r->rev || is_qstrand) {
|
31
|
+
r->qs = (int32_t)a[k].y + 1 - q_span;
|
32
|
+
r->qe = (int32_t)a[k + r->cnt - 1].y + 1;
|
33
|
+
} else {
|
34
|
+
r->qs = qlen - ((int32_t)a[k + r->cnt - 1].y + 1);
|
35
|
+
r->qe = qlen - ((int32_t)a[k].y + 1 - q_span);
|
36
|
+
}
|
37
|
+
mm_cal_fuzzy_len(r, a);
|
38
|
+
}
|
39
|
+
|
40
|
+
static inline uint64_t hash64(uint64_t key)
|
41
|
+
{
|
42
|
+
key = (~key + (key << 21));
|
43
|
+
key = key ^ key >> 24;
|
44
|
+
key = ((key + (key << 3)) + (key << 8));
|
45
|
+
key = key ^ key >> 14;
|
46
|
+
key = ((key + (key << 2)) + (key << 4));
|
47
|
+
key = key ^ key >> 28;
|
48
|
+
key = (key + (key << 31));
|
49
|
+
return key;
|
50
|
+
}
|
51
|
+
|
52
|
+
mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a, int is_qstrand) // convert chains to hits
|
53
|
+
{
|
54
|
+
mm128_t *z, tmp;
|
55
|
+
mm_reg1_t *r;
|
56
|
+
int i, k;
|
57
|
+
|
58
|
+
if (n_u == 0) return 0;
|
59
|
+
|
60
|
+
// sort by score
|
61
|
+
z = (mm128_t*)kmalloc(km, n_u * 16);
|
62
|
+
for (i = k = 0; i < n_u; ++i) {
|
63
|
+
uint32_t h;
|
64
|
+
h = (uint32_t)hash64((hash64(a[k].x) + hash64(a[k].y)) ^ hash);
|
65
|
+
z[i].x = u[i] ^ h; // u[i] -- higher 32 bits: chain score; lower 32 bits: number of seeds in the chain
|
66
|
+
z[i].y = (uint64_t)k << 32 | (int32_t)u[i];
|
67
|
+
k += (int32_t)u[i];
|
68
|
+
}
|
69
|
+
radix_sort_128x(z, z + n_u);
|
70
|
+
for (i = 0; i < n_u>>1; ++i) // reverse, s.t. larger score first
|
71
|
+
tmp = z[i], z[i] = z[n_u-1-i], z[n_u-1-i] = tmp;
|
72
|
+
|
73
|
+
// populate r[]
|
74
|
+
r = (mm_reg1_t*)calloc(n_u, sizeof(mm_reg1_t));
|
75
|
+
for (i = 0; i < n_u; ++i) {
|
76
|
+
mm_reg1_t *ri = &r[i];
|
77
|
+
ri->id = i;
|
78
|
+
ri->parent = MM_PARENT_UNSET;
|
79
|
+
ri->score = ri->score0 = z[i].x >> 32;
|
80
|
+
ri->hash = (uint32_t)z[i].x;
|
81
|
+
ri->cnt = (int32_t)z[i].y;
|
82
|
+
ri->as = z[i].y >> 32;
|
83
|
+
ri->div = -1.0f;
|
84
|
+
mm_reg_set_coor(ri, qlen, a, is_qstrand);
|
85
|
+
}
|
86
|
+
kfree(km, z);
|
87
|
+
return r;
|
88
|
+
}
|
89
|
+
|
90
|
+
void mm_mark_alt(const mm_idx_t *mi, int n, mm_reg1_t *r)
|
91
|
+
{
|
92
|
+
int i;
|
93
|
+
if (mi->n_alt == 0) return;
|
94
|
+
for (i = 0; i < n; ++i)
|
95
|
+
if (mi->seq[r[i].rid].is_alt)
|
96
|
+
r[i].is_alt = 1;
|
97
|
+
}
|
98
|
+
|
99
|
+
static inline int mm_alt_score(int score, float alt_diff_frac)
|
100
|
+
{
|
101
|
+
if (score < 0) return score;
|
102
|
+
score = (int)(score * (1.0 - alt_diff_frac) + .499);
|
103
|
+
return score > 0? score : 1;
|
104
|
+
}
|
105
|
+
|
106
|
+
void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a, int is_qstrand)
|
107
|
+
{
|
108
|
+
if (n <= 0 || n >= r->cnt) return;
|
109
|
+
*r2 = *r;
|
110
|
+
r2->id = -1;
|
111
|
+
r2->sam_pri = 0;
|
112
|
+
r2->p = 0;
|
113
|
+
r2->split_inv = 0;
|
114
|
+
r2->cnt = r->cnt - n;
|
115
|
+
r2->score = (int32_t)(r->score * ((float)r2->cnt / r->cnt) + .499);
|
116
|
+
r2->as = r->as + n;
|
117
|
+
if (r->parent == r->id) r2->parent = MM_PARENT_TMP_PRI;
|
118
|
+
mm_reg_set_coor(r2, qlen, a, is_qstrand);
|
119
|
+
r->cnt -= r2->cnt;
|
120
|
+
r->score -= r2->score;
|
121
|
+
mm_reg_set_coor(r, qlen, a, is_qstrand);
|
122
|
+
r->split |= 1, r2->split |= 2;
|
123
|
+
}
|
124
|
+
|
125
|
+
void mm_set_parent(void *km, float mask_level, int mask_len, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac) // and compute mm_reg1_t::subsc
|
126
|
+
{
|
127
|
+
int i, j, k, *w;
|
128
|
+
uint64_t *cov;
|
129
|
+
if (n <= 0) return;
|
130
|
+
for (i = 0; i < n; ++i) r[i].id = i;
|
131
|
+
cov = (uint64_t*)kmalloc(km, n * sizeof(uint64_t));
|
132
|
+
w = (int*)kmalloc(km, n * sizeof(int));
|
133
|
+
w[0] = 0, r[0].parent = 0;
|
134
|
+
for (i = 1, k = 1; i < n; ++i) {
|
135
|
+
mm_reg1_t *ri = &r[i];
|
136
|
+
int si = ri->qs, ei = ri->qe, n_cov = 0, uncov_len = 0;
|
137
|
+
if (hard_mask_level) goto skip_uncov;
|
138
|
+
for (j = 0; j < k; ++j) { // traverse existing primary hits to find overlapping hits
|
139
|
+
mm_reg1_t *rp = &r[w[j]];
|
140
|
+
int sj = rp->qs, ej = rp->qe;
|
141
|
+
if (ej <= si || sj >= ei) continue;
|
142
|
+
if (sj < si) sj = si;
|
143
|
+
if (ej > ei) ej = ei;
|
144
|
+
cov[n_cov++] = (uint64_t)sj<<32 | ej;
|
145
|
+
}
|
146
|
+
if (n_cov == 0) {
|
147
|
+
goto set_parent_test; // no overlapping primary hits; then i is a new primary hit
|
148
|
+
} else if (n_cov > 0) { // there are overlapping primary hits; find the length not covered by existing primary hits
|
149
|
+
int j, x = si;
|
150
|
+
radix_sort_64(cov, cov + n_cov);
|
151
|
+
for (j = 0; j < n_cov; ++j) {
|
152
|
+
if ((int)(cov[j]>>32) > x) uncov_len += (cov[j]>>32) - x;
|
153
|
+
x = (int32_t)cov[j] > x? (int32_t)cov[j] : x;
|
154
|
+
}
|
155
|
+
if (ei > x) uncov_len += ei - x;
|
156
|
+
}
|
157
|
+
skip_uncov:
|
158
|
+
for (j = 0; j < k; ++j) { // traverse existing primary hits again
|
159
|
+
mm_reg1_t *rp = &r[w[j]];
|
160
|
+
int sj = rp->qs, ej = rp->qe, min, max, ol;
|
161
|
+
if (ej <= si || sj >= ei) continue; // no overlap
|
162
|
+
min = ej - sj < ei - si? ej - sj : ei - si;
|
163
|
+
max = ej - sj > ei - si? ej - sj : ei - si;
|
164
|
+
ol = si < sj? (ei < sj? 0 : ei < ej? ei - sj : ej - sj) : (ej < si? 0 : ej < ei? ej - si : ei - si); // overlap length; TODO: this can be simplified
|
165
|
+
if ((float)ol / min - (float)uncov_len / max > mask_level && uncov_len <= mask_len) { // then this is a secondary hit
|
166
|
+
int cnt_sub = 0, sci = ri->score;
|
167
|
+
ri->parent = rp->parent;
|
168
|
+
if (!rp->is_alt && ri->is_alt) sci = mm_alt_score(sci, alt_diff_frac);
|
169
|
+
rp->subsc = rp->subsc > sci? rp->subsc : sci;
|
170
|
+
if (ri->cnt >= rp->cnt) cnt_sub = 1;
|
171
|
+
if (rp->p && ri->p && (rp->rid != ri->rid || rp->rs != ri->rs || rp->re != ri->re || ol != min)) { // the last condition excludes identical hits after DP
|
172
|
+
sci = ri->p->dp_max;
|
173
|
+
if (!rp->is_alt && ri->is_alt) sci = mm_alt_score(sci, alt_diff_frac);
|
174
|
+
rp->p->dp_max2 = rp->p->dp_max2 > sci? rp->p->dp_max2 : sci;
|
175
|
+
if (rp->p->dp_max - ri->p->dp_max <= sub_diff) cnt_sub = 1;
|
176
|
+
}
|
177
|
+
if (cnt_sub) ++rp->n_sub;
|
178
|
+
break;
|
179
|
+
}
|
180
|
+
}
|
181
|
+
set_parent_test:
|
182
|
+
if (j == k) w[k++] = i, ri->parent = i, ri->n_sub = 0;
|
183
|
+
}
|
184
|
+
kfree(km, cov);
|
185
|
+
kfree(km, w);
|
186
|
+
}
|
187
|
+
|
188
|
+
void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r, float alt_diff_frac)
|
189
|
+
{
|
190
|
+
int32_t i, n_aux, n = *n_regs, has_cigar = 0, no_cigar = 0;
|
191
|
+
mm128_t *aux;
|
192
|
+
mm_reg1_t *t;
|
193
|
+
|
194
|
+
if (n <= 1) return;
|
195
|
+
aux = (mm128_t*)kmalloc(km, n * 16);
|
196
|
+
t = (mm_reg1_t*)kmalloc(km, n * sizeof(mm_reg1_t));
|
197
|
+
for (i = n_aux = 0; i < n; ++i) {
|
198
|
+
if (r[i].inv || r[i].cnt > 0) { // squeeze out elements with cnt==0 (soft deleted)
|
199
|
+
int score;
|
200
|
+
if (r[i].p) score = r[i].p->dp_max, has_cigar = 1;
|
201
|
+
else score = r[i].score, no_cigar = 1;
|
202
|
+
if (r[i].is_alt) score = mm_alt_score(score, alt_diff_frac);
|
203
|
+
aux[n_aux].x = (uint64_t)score << 32 | r[i].hash;
|
204
|
+
aux[n_aux++].y = i;
|
205
|
+
} else if (r[i].p) {
|
206
|
+
free(r[i].p);
|
207
|
+
r[i].p = 0;
|
208
|
+
}
|
209
|
+
}
|
210
|
+
assert(has_cigar + no_cigar == 1);
|
211
|
+
radix_sort_128x(aux, aux + n_aux);
|
212
|
+
for (i = n_aux - 1; i >= 0; --i)
|
213
|
+
t[n_aux - 1 - i] = r[aux[i].y];
|
214
|
+
memcpy(r, t, sizeof(mm_reg1_t) * n_aux);
|
215
|
+
*n_regs = n_aux;
|
216
|
+
kfree(km, aux);
|
217
|
+
kfree(km, t);
|
218
|
+
}
|
219
|
+
|
220
|
+
int mm_set_sam_pri(int n, mm_reg1_t *r)
|
221
|
+
{
|
222
|
+
int i, n_pri = 0;
|
223
|
+
for (i = 0; i < n; ++i)
|
224
|
+
if (r[i].id == r[i].parent) {
|
225
|
+
++n_pri;
|
226
|
+
r[i].sam_pri = (n_pri == 1);
|
227
|
+
} else r[i].sam_pri = 0;
|
228
|
+
return n_pri;
|
229
|
+
}
|
230
|
+
|
231
|
+
void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs) // keep mm_reg1_t::{id,parent} in sync; also reset id
|
232
|
+
{
|
233
|
+
int *tmp, i, max_id = -1, n_tmp;
|
234
|
+
if (n_regs <= 0) return;
|
235
|
+
for (i = 0; i < n_regs; ++i) // NB: doesn't work if mm_reg1_t::id is negative
|
236
|
+
max_id = max_id > regs[i].id? max_id : regs[i].id;
|
237
|
+
n_tmp = max_id + 1;
|
238
|
+
tmp = (int*)kmalloc(km, n_tmp * sizeof(int));
|
239
|
+
for (i = 0; i < n_tmp; ++i) tmp[i] = -1;
|
240
|
+
for (i = 0; i < n_regs; ++i)
|
241
|
+
if (regs[i].id >= 0) tmp[regs[i].id] = i;
|
242
|
+
for (i = 0; i < n_regs; ++i) {
|
243
|
+
mm_reg1_t *r = ®s[i];
|
244
|
+
r->id = i;
|
245
|
+
if (r->parent == MM_PARENT_TMP_PRI)
|
246
|
+
r->parent = i;
|
247
|
+
else if (r->parent >= 0 && tmp[r->parent] >= 0)
|
248
|
+
r->parent = tmp[r->parent];
|
249
|
+
else r->parent = MM_PARENT_UNSET;
|
250
|
+
}
|
251
|
+
kfree(km, tmp);
|
252
|
+
mm_set_sam_pri(n_regs, regs);
|
253
|
+
}
|
254
|
+
|
255
|
+
void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int check_strand, int min_strand_sc, int *n_, mm_reg1_t *r)
|
256
|
+
{
|
257
|
+
if (pri_ratio > 0.0f && *n_ > 0) {
|
258
|
+
int i, k, n = *n_, n_2nd = 0;
|
259
|
+
for (i = k = 0; i < n; ++i) {
|
260
|
+
int p = r[i].parent;
|
261
|
+
if (p == i || r[i].inv) { // primary or inversion
|
262
|
+
r[k++] = r[i];
|
263
|
+
} else if ((r[i].score >= r[p].score * pri_ratio || r[i].score + min_diff >= r[p].score) && n_2nd < best_n) {
|
264
|
+
if (!(r[i].qs == r[p].qs && r[i].qe == r[p].qe && r[i].rid == r[p].rid && r[i].rs == r[p].rs && r[i].re == r[p].re)) // not identical hits
|
265
|
+
r[k++] = r[i], ++n_2nd;
|
266
|
+
else if (r[i].p) free(r[i].p);
|
267
|
+
} else if (check_strand && n_2nd < best_n && r[i].score > min_strand_sc && r[i].rev != r[p].rev) {
|
268
|
+
r[i].strand_retained = 1;
|
269
|
+
r[k++] = r[i], ++n_2nd;
|
270
|
+
} else if (r[i].p) free(r[i].p);
|
271
|
+
}
|
272
|
+
if (k != n) mm_sync_regs(km, k, r); // removing hits requires sync()
|
273
|
+
*n_ = k;
|
274
|
+
}
|
275
|
+
}
|
276
|
+
|
277
|
+
int mm_filter_strand_retained(int n_regs, mm_reg1_t *r)
|
278
|
+
{
|
279
|
+
int i, k;
|
280
|
+
for (i = k = 0; i < n_regs; ++i) {
|
281
|
+
int p = r[i].parent;
|
282
|
+
if (!r[i].strand_retained || r[i].div < r[p].div * 5.0f || r[i].div < 0.01f) {
|
283
|
+
if (k < i) r[k++] = r[i];
|
284
|
+
else ++k;
|
285
|
+
}
|
286
|
+
}
|
287
|
+
return k;
|
288
|
+
}
|
289
|
+
|
290
|
+
void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs)
|
291
|
+
{ // NB: after this call, mm_reg1_t::parent can be -1 if its parent filtered out
|
292
|
+
int i, k;
|
293
|
+
for (i = k = 0; i < *n_regs; ++i) {
|
294
|
+
mm_reg1_t *r = ®s[i];
|
295
|
+
int flt = 0;
|
296
|
+
if (!r->inv && !r->seg_split && r->cnt < opt->min_cnt) flt = 1;
|
297
|
+
if (r->p) { // these filters are only applied when base-alignment is available
|
298
|
+
if (r->mlen < opt->min_chain_score) flt = 1;
|
299
|
+
else if (r->p->dp_max < opt->min_dp_max) flt = 1;
|
300
|
+
else if (r->qs > qlen * opt->max_clip_ratio && qlen - r->qe > qlen * opt->max_clip_ratio) flt = 1;
|
301
|
+
if (flt) free(r->p);
|
302
|
+
}
|
303
|
+
if (!flt) {
|
304
|
+
if (k < i) regs[k++] = regs[i];
|
305
|
+
else ++k;
|
306
|
+
}
|
307
|
+
}
|
308
|
+
*n_regs = k;
|
309
|
+
}
|
310
|
+
|
311
|
+
int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a)
|
312
|
+
{ // squeeze out regions in a[] that are not referenced by regs[]
|
313
|
+
int i, as = 0;
|
314
|
+
uint64_t *aux;
|
315
|
+
aux = (uint64_t*)kmalloc(km, n_regs * 8);
|
316
|
+
for (i = 0; i < n_regs; ++i)
|
317
|
+
aux[i] = (uint64_t)regs[i].as << 32 | i;
|
318
|
+
radix_sort_64(aux, aux + n_regs);
|
319
|
+
for (i = 0; i < n_regs; ++i) {
|
320
|
+
mm_reg1_t *r = ®s[(int32_t)aux[i]];
|
321
|
+
if (r->as != as) {
|
322
|
+
memmove(&a[as], &a[r->as], r->cnt * 16);
|
323
|
+
r->as = as;
|
324
|
+
}
|
325
|
+
as += r->cnt;
|
326
|
+
}
|
327
|
+
kfree(km, aux);
|
328
|
+
return as;
|
329
|
+
}
|
330
|
+
|
331
|
+
mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a)
|
332
|
+
{
|
333
|
+
int s, i, j, acc_qlen[MM_MAX_SEG+1], qlen_sum = 0;
|
334
|
+
mm_seg_t *seg;
|
335
|
+
|
336
|
+
assert(n_segs <= MM_MAX_SEG);
|
337
|
+
for (s = 1, acc_qlen[0] = 0; s < n_segs; ++s)
|
338
|
+
acc_qlen[s] = acc_qlen[s-1] + qlens[s-1];
|
339
|
+
qlen_sum = acc_qlen[n_segs - 1] + qlens[n_segs - 1];
|
340
|
+
|
341
|
+
seg = (mm_seg_t*)kcalloc(km, n_segs, sizeof(mm_seg_t));
|
342
|
+
for (s = 0; s < n_segs; ++s) {
|
343
|
+
seg[s].u = (uint64_t*)kmalloc(km, n_regs0 * 8);
|
344
|
+
for (i = 0; i < n_regs0; ++i)
|
345
|
+
seg[s].u[i] = (uint64_t)regs0[i].score << 32;
|
346
|
+
}
|
347
|
+
for (i = 0; i < n_regs0; ++i) {
|
348
|
+
const mm_reg1_t *r = ®s0[i];
|
349
|
+
for (j = 0; j < r->cnt; ++j) {
|
350
|
+
int sid = (a[r->as + j].y&MM_SEED_SEG_MASK)>>MM_SEED_SEG_SHIFT;
|
351
|
+
++seg[sid].u[i];
|
352
|
+
++seg[sid].n_a;
|
353
|
+
}
|
354
|
+
}
|
355
|
+
for (s = 0; s < n_segs; ++s) {
|
356
|
+
mm_seg_t *sr = &seg[s];
|
357
|
+
for (i = 0, sr->n_u = 0; i < n_regs0; ++i) // squeeze out zero-length per-segment chains
|
358
|
+
if ((int32_t)sr->u[i] != 0)
|
359
|
+
sr->u[sr->n_u++] = sr->u[i];
|
360
|
+
sr->a = (mm128_t*)kmalloc(km, sr->n_a * sizeof(mm128_t));
|
361
|
+
sr->n_a = 0;
|
362
|
+
}
|
363
|
+
|
364
|
+
for (i = 0; i < n_regs0; ++i) {
|
365
|
+
const mm_reg1_t *r = ®s0[i];
|
366
|
+
for (j = 0; j < r->cnt; ++j) {
|
367
|
+
int sid = (a[r->as + j].y&MM_SEED_SEG_MASK)>>MM_SEED_SEG_SHIFT;
|
368
|
+
mm128_t a1 = a[r->as + j];
|
369
|
+
// on reverse strand, the segment position is:
|
370
|
+
// x_for_cat = qlen_sum - 1 - (int32_t)a1.y - 1 + q_span
|
371
|
+
// (int32_t)new_a1.y = qlens[sid] - (x_for_cat - acc_qlen[sid] + 1 - q_span) - 1 = (int32_t)a1.y - (qlen_sum - (qlens[sid] + acc_qlen[sid]))
|
372
|
+
a1.y -= a1.x>>63? qlen_sum - (qlens[sid] + acc_qlen[sid]) : acc_qlen[sid];
|
373
|
+
seg[sid].a[seg[sid].n_a++] = a1;
|
374
|
+
}
|
375
|
+
}
|
376
|
+
for (s = 0; s < n_segs; ++s) {
|
377
|
+
regs[s] = mm_gen_regs(km, hash, qlens[s], seg[s].n_u, seg[s].u, seg[s].a, 0);
|
378
|
+
n_regs[s] = seg[s].n_u;
|
379
|
+
for (i = 0; i < n_regs[s]; ++i) {
|
380
|
+
regs[s][i].seg_split = 1;
|
381
|
+
regs[s][i].seg_id = s;
|
382
|
+
}
|
383
|
+
}
|
384
|
+
return seg;
|
385
|
+
}
|
386
|
+
|
387
|
+
void mm_seg_free(void *km, int n_segs, mm_seg_t *segs)
|
388
|
+
{
|
389
|
+
int i;
|
390
|
+
for (i = 0; i < n_segs; ++i) kfree(km, segs[i].u);
|
391
|
+
for (i = 0; i < n_segs; ++i) kfree(km, segs[i].a);
|
392
|
+
kfree(km, segs);
|
393
|
+
}
|
394
|
+
|
395
|
+
static void mm_set_inv_mapq(void *km, int n_regs, mm_reg1_t *regs)
|
396
|
+
{
|
397
|
+
int i, n_aux;
|
398
|
+
mm128_t *aux;
|
399
|
+
if (n_regs < 3) return;
|
400
|
+
for (i = 0; i < n_regs; ++i)
|
401
|
+
if (regs[i].inv) break;
|
402
|
+
if (i == n_regs) return; // no inversion hits
|
403
|
+
|
404
|
+
aux = (mm128_t*)kmalloc(km, n_regs * 16);
|
405
|
+
for (i = n_aux = 0; i < n_regs; ++i)
|
406
|
+
if (regs[i].parent == i || regs[i].parent < 0)
|
407
|
+
aux[n_aux].y = i, aux[n_aux++].x = (uint64_t)regs[i].rid << 32 | regs[i].rs;
|
408
|
+
radix_sort_128x(aux, aux + n_aux);
|
409
|
+
|
410
|
+
for (i = 1; i < n_aux - 1; ++i) {
|
411
|
+
mm_reg1_t *inv = ®s[aux[i].y];
|
412
|
+
if (inv->inv) {
|
413
|
+
mm_reg1_t *l = ®s[aux[i-1].y];
|
414
|
+
mm_reg1_t *r = ®s[aux[i+1].y];
|
415
|
+
inv->mapq = l->mapq < r->mapq? l->mapq : r->mapq;
|
416
|
+
}
|
417
|
+
}
|
418
|
+
kfree(km, aux);
|
419
|
+
}
|
420
|
+
|
421
|
+
void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr)
|
422
|
+
{
|
423
|
+
static const float q_coef = 40.0f;
|
424
|
+
int64_t sum_sc = 0;
|
425
|
+
float uniq_ratio;
|
426
|
+
int i;
|
427
|
+
if (n_regs == 0) return;
|
428
|
+
for (i = 0; i < n_regs; ++i)
|
429
|
+
if (regs[i].parent == regs[i].id)
|
430
|
+
sum_sc += regs[i].score;
|
431
|
+
uniq_ratio = (float)sum_sc / (sum_sc + rep_len);
|
432
|
+
for (i = 0; i < n_regs; ++i) {
|
433
|
+
mm_reg1_t *r = ®s[i];
|
434
|
+
if (r->inv) {
|
435
|
+
r->mapq = 0;
|
436
|
+
} else if (r->parent == r->id) {
|
437
|
+
int mapq, subsc;
|
438
|
+
float pen_s1 = (r->score > 100? 1.0f : 0.01f * r->score) * uniq_ratio;
|
439
|
+
float pen_cm = r->cnt > 10? 1.0f : 0.1f * r->cnt;
|
440
|
+
pen_cm = pen_s1 < pen_cm? pen_s1 : pen_cm;
|
441
|
+
subsc = r->subsc > min_chain_sc? r->subsc : min_chain_sc;
|
442
|
+
if (r->p && r->p->dp_max2 > 0 && r->p->dp_max > 0) {
|
443
|
+
float identity = (float)r->mlen / r->blen;
|
444
|
+
float x = (float)r->p->dp_max2 * subsc / r->p->dp_max / r->score0;
|
445
|
+
mapq = (int)(identity * pen_cm * q_coef * (1.0f - x * x) * logf((float)r->p->dp_max / match_sc));
|
446
|
+
if (!is_sr) {
|
447
|
+
int mapq_alt = (int)(6.02f * identity * identity * (r->p->dp_max - r->p->dp_max2) / match_sc + .499f); // BWA-MEM like mapQ, mostly for short reads
|
448
|
+
mapq = mapq < mapq_alt? mapq : mapq_alt; // in case the long-read heuristic fails
|
449
|
+
}
|
450
|
+
} else {
|
451
|
+
float x = (float)subsc / r->score0;
|
452
|
+
if (r->p) {
|
453
|
+
float identity = (float)r->mlen / r->blen;
|
454
|
+
mapq = (int)(identity * pen_cm * q_coef * (1.0f - x) * logf((float)r->p->dp_max / match_sc));
|
455
|
+
} else {
|
456
|
+
mapq = (int)(pen_cm * q_coef * (1.0f - x) * logf(r->score));
|
457
|
+
}
|
458
|
+
}
|
459
|
+
mapq -= (int)(4.343f * logf(r->n_sub + 1) + .499f);
|
460
|
+
mapq = mapq > 0? mapq : 0;
|
461
|
+
r->mapq = mapq < 60? mapq : 60;
|
462
|
+
if (r->p && r->p->dp_max > r->p->dp_max2 && r->mapq == 0) r->mapq = 1;
|
463
|
+
} else r->mapq = 0;
|
464
|
+
}
|
465
|
+
mm_set_inv_mapq(km, n_regs, regs);
|
466
|
+
}
|