minimap2 0.2.22.0 → 0.2.24.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +60 -76
- data/ext/Rakefile +55 -0
- data/ext/cmappy/cmappy.c +129 -0
- data/ext/cmappy/cmappy.h +44 -0
- data/ext/minimap2/FAQ.md +46 -0
- data/ext/minimap2/LICENSE.txt +24 -0
- data/ext/minimap2/MANIFEST.in +10 -0
- data/ext/minimap2/Makefile +132 -0
- data/ext/minimap2/Makefile.simde +97 -0
- data/ext/minimap2/NEWS.md +821 -0
- data/ext/minimap2/README.md +403 -0
- data/ext/minimap2/align.c +1020 -0
- data/ext/minimap2/bseq.c +169 -0
- data/ext/minimap2/bseq.h +64 -0
- data/ext/minimap2/code_of_conduct.md +30 -0
- data/ext/minimap2/cookbook.md +243 -0
- data/ext/minimap2/esterr.c +64 -0
- data/ext/minimap2/example.c +63 -0
- data/ext/minimap2/format.c +559 -0
- data/ext/minimap2/hit.c +466 -0
- data/ext/minimap2/index.c +775 -0
- data/ext/minimap2/kalloc.c +205 -0
- data/ext/minimap2/kalloc.h +76 -0
- data/ext/minimap2/kdq.h +132 -0
- data/ext/minimap2/ketopt.h +120 -0
- data/ext/minimap2/khash.h +615 -0
- data/ext/minimap2/krmq.h +474 -0
- data/ext/minimap2/kseq.h +256 -0
- data/ext/minimap2/ksort.h +153 -0
- data/ext/minimap2/ksw2.h +184 -0
- data/ext/minimap2/ksw2_dispatch.c +96 -0
- data/ext/minimap2/ksw2_extd2_sse.c +402 -0
- data/ext/minimap2/ksw2_exts2_sse.c +416 -0
- data/ext/minimap2/ksw2_extz2_sse.c +313 -0
- data/ext/minimap2/ksw2_ll_sse.c +152 -0
- data/ext/minimap2/kthread.c +159 -0
- data/ext/minimap2/kthread.h +15 -0
- data/ext/minimap2/kvec.h +105 -0
- data/ext/minimap2/lchain.c +369 -0
- data/ext/minimap2/main.c +459 -0
- data/ext/minimap2/map.c +714 -0
- data/ext/minimap2/minimap.h +410 -0
- data/ext/minimap2/minimap2.1 +725 -0
- data/ext/minimap2/misc/README.md +179 -0
- data/ext/minimap2/misc/mmphase.js +335 -0
- data/ext/minimap2/misc/paftools.js +3149 -0
- data/ext/minimap2/misc.c +162 -0
- data/ext/minimap2/mmpriv.h +132 -0
- data/ext/minimap2/options.c +234 -0
- data/ext/minimap2/pe.c +177 -0
- data/ext/minimap2/python/README.rst +196 -0
- data/ext/minimap2/python/cmappy.h +152 -0
- data/ext/minimap2/python/cmappy.pxd +153 -0
- data/ext/minimap2/python/mappy.pyx +273 -0
- data/ext/minimap2/python/minimap2.py +39 -0
- data/ext/minimap2/sdust.c +213 -0
- data/ext/minimap2/sdust.h +25 -0
- data/ext/minimap2/seed.c +131 -0
- data/ext/minimap2/setup.py +55 -0
- data/ext/minimap2/sketch.c +143 -0
- data/ext/minimap2/splitidx.c +84 -0
- data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
- data/ext/minimap2/test/MT-human.fa +278 -0
- data/ext/minimap2/test/MT-orang.fa +276 -0
- data/ext/minimap2/test/q-inv.fa +4 -0
- data/ext/minimap2/test/q2.fa +2 -0
- data/ext/minimap2/test/t-inv.fa +127 -0
- data/ext/minimap2/test/t2.fa +2 -0
- data/ext/minimap2/tex/Makefile +21 -0
- data/ext/minimap2/tex/bioinfo.cls +930 -0
- data/ext/minimap2/tex/blasr-mc.eval +17 -0
- data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
- data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
- data/ext/minimap2/tex/bwa.eval +55 -0
- data/ext/minimap2/tex/eval2roc.pl +33 -0
- data/ext/minimap2/tex/graphmap.eval +4 -0
- data/ext/minimap2/tex/hs38-simu.sh +10 -0
- data/ext/minimap2/tex/minialign.eval +49 -0
- data/ext/minimap2/tex/minimap2.bib +460 -0
- data/ext/minimap2/tex/minimap2.tex +724 -0
- data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
- data/ext/minimap2/tex/mm2-update.tex +240 -0
- data/ext/minimap2/tex/mm2.approx.eval +12 -0
- data/ext/minimap2/tex/mm2.eval +13 -0
- data/ext/minimap2/tex/natbib.bst +1288 -0
- data/ext/minimap2/tex/natbib.sty +803 -0
- data/ext/minimap2/tex/ngmlr.eval +38 -0
- data/ext/minimap2/tex/roc.gp +60 -0
- data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
- data/ext/minimap2.patch +19 -0
- data/lib/minimap2/aligner.rb +4 -4
- data/lib/minimap2/alignment.rb +11 -11
- data/lib/minimap2/ffi/constants.rb +20 -16
- data/lib/minimap2/ffi/functions.rb +5 -0
- data/lib/minimap2/ffi.rb +4 -5
- data/lib/minimap2/version.rb +2 -2
- data/lib/minimap2.rb +51 -15
- metadata +97 -79
- data/lib/minimap2/ffi_helper.rb +0 -53
- data/vendor/libminimap2.so +0 -0
data/ext/minimap2/hit.c
ADDED
@@ -0,0 +1,466 @@
|
|
1
|
+
#include <string.h>
|
2
|
+
#include <stdlib.h>
|
3
|
+
#include <math.h>
|
4
|
+
#include "mmpriv.h"
|
5
|
+
#include "kalloc.h"
|
6
|
+
#include "khash.h"
|
7
|
+
|
8
|
+
/*
 * Fill in r->mlen and r->blen for the chain stored in
 * a[r->as .. r->as + r->cnt - 1]. Both are set to zero for an empty chain.
 * The seed span is read from bits 32..39 of a[].y; the low 32 bits of
 * a[].x and a[].y are used as the two coordinates of each seed.
 */
static inline void mm_cal_fuzzy_len(mm_reg1_t *r, const mm128_t *a)
{
	const mm128_t *prev, *cur, *end;

	r->mlen = r->blen = 0;
	if (r->cnt <= 0)
		return;

	prev = &a[r->as];
	end = prev + r->cnt;
	r->mlen = r->blen = prev->y >> 32 & 0xff; // the first seed contributes its full span
	for (cur = prev + 1; cur < end; prev = cur++) {
		int span = cur->y >> 32 & 0xff;
		int x_gap = (int32_t)cur->x - (int32_t)prev->x;
		int y_gap = (int32_t)cur->y - (int32_t)prev->y;
		int longer = x_gap > y_gap ? x_gap : y_gap;
		int shorter = x_gap < y_gap ? x_gap : y_gap;

		r->blen += longer;
		if (x_gap > span && y_gap > span)
			r->mlen += span; // seeds do not overlap on either axis: count the whole span
		else
			r->mlen += shorter;
	}
}
|
22
|
+
|
23
|
+
/*
 * Derive strand, reference id and the reference/query intervals of a hit from
 * the first and last seed of its chain, then refresh mlen/blen.
 * NB: r->as and r->cnt MUST BE set correctly for this function to work.
 *
 * qlen is only used to flip query coordinates when the hit is on the reverse
 * strand and is_qstrand is zero.
 */
static inline void mm_reg_set_coor(mm_reg1_t *r, int32_t qlen, const mm128_t *a, int is_qstrand)
{
	const int32_t first = r->as;
	const int32_t last = first + r->cnt - 1;
	const int32_t q_span = (int32_t)(a[first].y >> 32 & 0xff);
	const int32_t r_end0 = (int32_t)a[first].x + 1; // one past the end of the first seed
	int32_t q_lo, q_hi;

	r->rev = a[first].x >> 63;
	r->rid = a[first].x << 1 >> 33;

	// NB: target span may be shorter, so this test is necessary
	if (r_end0 > q_span)
		r->rs = r_end0 - q_span;
	else
		r->rs = 0;
	r->re = (int32_t)a[last].x + 1;

	q_lo = (int32_t)a[first].y + 1 - q_span;
	q_hi = (int32_t)a[last].y + 1;
	if (r->rev && !is_qstrand) { // mirror the interval within the query
		r->qs = qlen - q_hi;
		r->qe = qlen - q_lo;
	} else {
		r->qs = q_lo;
		r->qe = q_hi;
	}
	mm_cal_fuzzy_len(r, a);
}
|
39
|
+
|
40
|
+
/* Mix a 64-bit key with a fixed sequence of shift/add/xor steps (all arithmetic wraps modulo 2^64). */
static inline uint64_t hash64(uint64_t key)
{
	key = ~key + (key << 21);
	key ^= key >> 24;
	key += (key << 3) + (key << 8);
	key ^= key >> 14;
	key += (key << 2) + (key << 4);
	key ^= key >> 28;
	key += key << 31;
	return key;
}
|
51
|
+
|
52
|
+
/*
 * Convert chains to hits.
 *
 * u[i] packs one chain: higher 32 bits are the chain score, lower 32 bits the
 * number of seeds in the chain. The seeds of consecutive chains are laid out
 * back to back in a[]. The returned array holds n_u hits ordered by
 * decreasing (score, hash) key; it is allocated with calloc(). Scratch space
 * comes from km and is released before returning. Returns 0 when n_u is 0.
 */
mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a, int is_qstrand)
{
	mm128_t *order;
	mm_reg1_t *regs;
	int idx, lo, hi, seed_off = 0;

	if (n_u == 0)
		return 0;

	// build sort keys: (score, per-chain hash) in x, (seed offset, seed count) in y
	order = (mm128_t*)kmalloc(km, n_u * 16);
	for (idx = 0; idx < n_u; ++idx) {
		const int32_t n_seeds = (int32_t)u[idx];
		const uint32_t tie = (uint32_t)hash64((hash64(a[seed_off].x) + hash64(a[seed_off].y)) ^ hash);

		order[idx].x = u[idx] ^ tie;
		order[idx].y = (uint64_t)seed_off << 32 | n_seeds;
		seed_off += n_seeds;
	}
	radix_sort_128x(order, order + n_u);

	// reverse in place, s.t. larger score first
	for (lo = 0, hi = n_u - 1; lo < hi; ++lo, --hi) {
		mm128_t swap = order[lo];
		order[lo] = order[hi];
		order[hi] = swap;
	}

	// populate the hits
	regs = (mm_reg1_t*)calloc(n_u, sizeof(mm_reg1_t));
	for (idx = 0; idx < n_u; ++idx) {
		mm_reg1_t *hit = &regs[idx];
		const mm128_t *key = &order[idx];

		hit->id = idx;
		hit->parent = MM_PARENT_UNSET;
		hit->score = hit->score0 = key->x >> 32;
		hit->hash = (uint32_t)key->x;
		hit->cnt = (int32_t)key->y;
		hit->as = key->y >> 32;
		hit->div = -1.0f;
		mm_reg_set_coor(hit, qlen, a, is_qstrand);
	}
	kfree(km, order);
	return regs;
}
|
89
|
+
|
90
|
+
/*
 * Set is_alt on every hit whose reference sequence is flagged is_alt in the
 * index. Does nothing when the index reports no ALT sequences (n_alt == 0).
 */
void mm_mark_alt(const mm_idx_t *mi, int n, mm_reg1_t *r)
{
	int idx = 0;

	if (!mi->n_alt)
		return;
	while (idx < n) {
		mm_reg1_t *hit = &r[idx++];
		if (mi->seq[hit->rid].is_alt)
			hit->is_alt = 1;
	}
}
|
98
|
+
|
99
|
+
/*
 * Scale a non-negative score by (1 - alt_diff_frac), rounding via +.499 and
 * truncation; the result is never below 1. Negative scores are returned
 * unchanged.
 */
static inline int mm_alt_score(int score, float alt_diff_frac)
{
	int scaled;

	if (score < 0)
		return score;
	scaled = (int)(score * (1.0 - alt_diff_frac) + .499);
	if (scaled <= 0)
		scaled = 1;
	return scaled;
}
|
105
|
+
|
106
|
+
/*
 * Split hit r after its first n seeds: r keeps the first n seeds and *r2
 * receives the remaining r->cnt - n. The score is divided in proportion to
 * the seed counts and both coordinate sets are recomputed. r2 starts as a
 * copy of r with id = -1 and sam_pri, p and split_inv cleared. Nothing is
 * done unless 0 < n < r->cnt.
 */
void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a, int is_qstrand)
{
	if (!(n > 0 && n < r->cnt))
		return;

	*r2 = *r;
	r2->id = -1;
	r2->sam_pri = 0;
	r2->p = 0;
	r2->split_inv = 0;

	// the tail half
	r2->cnt = r->cnt - n;
	r2->as = r->as + n;
	r2->score = (int32_t)(r->score * ((float)r2->cnt / r->cnt) + .499);
	if (r->parent == r->id)
		r2->parent = MM_PARENT_TMP_PRI; // r was primary: tag the tail with the temporary-primary marker
	mm_reg_set_coor(r2, qlen, a, is_qstrand);

	// the head half keeps what is left
	r->cnt -= r2->cnt;
	r->score -= r2->score;
	mm_reg_set_coor(r, qlen, a, is_qstrand);

	r->split |= 1;
	r2->split |= 2;
}
|
124
|
+
|
125
|
+
/*
 * Assign mm_reg1_t::parent for every hit and compute mm_reg1_t::subsc.
 *
 * Hits are visited in array order; r[0] is always primary. A later hit
 * becomes secondary to the first existing primary hit it overlaps on the
 * query by more than mask_level (after discounting, unless hard_mask_level is
 * set, the part of the hit not covered by any primary hit; that part must
 * also be no longer than mask_len). Otherwise it becomes a new primary hit.
 * For each secondary hit the parent's subsc, n_sub and, when both have
 * base-level alignment, p->dp_max2 are updated. ids are reset to array
 * indices. Scratch memory is taken from km.
 */
void mm_set_parent(void *km, float mask_level, int mask_len, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac)
{
	int i, n_pri, *pri;
	uint64_t *cov;

	if (n <= 0)
		return;
	for (i = 0; i < n; ++i)
		r[i].id = i;
	cov = (uint64_t*)kmalloc(km, n * sizeof(uint64_t));
	pri = (int*)kmalloc(km, n * sizeof(int));
	pri[0] = 0;
	r[0].parent = 0;
	n_pri = 1;

	for (i = 1; i < n; ++i) {
		mm_reg1_t *ri = &r[i];
		const int si = ri->qs, ei = ri->qe;
		int j, n_cov = 0, uncov_len = 0, has_parent = 0;

		if (!hard_mask_level) {
			// collect the parts of [si,ei) covered by existing primary hits
			for (j = 0; j < n_pri; ++j) {
				const mm_reg1_t *rp = &r[pri[j]];
				int sj = rp->qs, ej = rp->qe;
				if (ej > si && sj < ei) {
					if (sj < si) sj = si;
					if (ej > ei) ej = ei;
					cov[n_cov++] = (uint64_t)sj<<32 | ej;
				}
			}
			if (n_cov > 0) { // find the length not covered by existing primary hits
				int c, x = si;
				radix_sort_64(cov, cov + n_cov);
				for (c = 0; c < n_cov; ++c) {
					const int c_end = (int32_t)cov[c];
					if ((int)(cov[c]>>32) > x)
						uncov_len += (cov[c]>>32) - x;
					if (c_end > x)
						x = c_end;
				}
				if (ei > x)
					uncov_len += ei - x;
			}
		}

		// with no overlapping primary hit there is nothing to test: i is a new primary hit
		if (hard_mask_level || n_cov > 0) {
			for (j = 0; j < n_pri; ++j) { // traverse existing primary hits again
				mm_reg1_t *rp = &r[pri[j]];
				const int sj = rp->qs, ej = rp->qe;
				int len_i, len_j, min, max, ol, cnt_sub, sci, alt_vs_ref;

				if (ej <= si || sj >= ei)
					continue; // no overlap
				len_i = ei - si;
				len_j = ej - sj;
				min = len_j < len_i ? len_j : len_i;
				max = len_j > len_i ? len_j : len_i;
				ol = (ei < ej ? ei : ej) - (si < sj ? sj : si); // overlap length
				if (!((float)ol / min - (float)uncov_len / max > mask_level && uncov_len <= mask_len))
					continue;

				// then this is a secondary hit
				cnt_sub = 0;
				sci = ri->score;
				alt_vs_ref = !rp->is_alt && ri->is_alt;
				ri->parent = rp->parent;
				if (alt_vs_ref)
					sci = mm_alt_score(sci, alt_diff_frac);
				if (rp->subsc < sci)
					rp->subsc = sci;
				if (ri->cnt >= rp->cnt)
					cnt_sub = 1;
				// the last condition excludes identical hits after DP
				if (rp->p && ri->p && (rp->rid != ri->rid || rp->rs != ri->rs || rp->re != ri->re || ol != min)) {
					sci = ri->p->dp_max;
					if (alt_vs_ref)
						sci = mm_alt_score(sci, alt_diff_frac);
					if (rp->p->dp_max2 < sci)
						rp->p->dp_max2 = sci;
					if (rp->p->dp_max - ri->p->dp_max <= sub_diff)
						cnt_sub = 1;
				}
				if (cnt_sub)
					++rp->n_sub;
				has_parent = 1;
				break;
			}
		}

		if (!has_parent) { // register i as a new primary hit
			pri[n_pri++] = i;
			ri->parent = i;
			ri->n_sub = 0;
		}
	}
	kfree(km, cov);
	kfree(km, pri);
}
|
187
|
+
|
188
|
+
/*
 * Sort hits in place by decreasing score and drop soft-deleted ones.
 *
 * A hit is kept when it is an inversion or has cnt > 0; any other hit has its
 * p freed and is squeezed out. The sort key is p->dp_max when p is set and
 * score otherwise (passed through mm_alt_score() for ALT hits), with the hit
 * hash as tie-breaker. *n_regs is updated to the number of hits kept.
 * Nothing is done when there are fewer than two hits.
 */
void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r, float alt_diff_frac)
{
	int32_t src, dst, n_kept = 0, n = *n_regs, has_cigar = 0, no_cigar = 0;
	mm128_t *keys;
	mm_reg1_t *sorted;

	if (n <= 1)
		return;
	keys = (mm128_t*)kmalloc(km, n * 16);
	sorted = (mm_reg1_t*)kmalloc(km, n * sizeof(mm_reg1_t));

	for (src = 0; src < n; ++src) {
		mm_reg1_t *hit = &r[src];
		int score;

		if (!hit->inv && hit->cnt <= 0) { // soft deleted: release its alignment, if any
			if (hit->p) {
				free(hit->p);
				hit->p = 0;
			}
			continue;
		}
		if (hit->p) {
			score = hit->p->dp_max;
			has_cigar = 1;
		} else {
			score = hit->score;
			no_cigar = 1;
		}
		if (hit->is_alt)
			score = mm_alt_score(score, alt_diff_frac);
		keys[n_kept].x = (uint64_t)score << 32 | hit->hash;
		keys[n_kept].y = src;
		++n_kept;
	}
	assert(has_cigar + no_cigar == 1); // kept hits must either all have p set or all have it unset

	radix_sort_128x(keys, keys + n_kept);
	for (dst = 0; dst < n_kept; ++dst) // read the ascending keys backwards: larger score first
		sorted[dst] = r[keys[n_kept - 1 - dst].y];
	memcpy(r, sorted, sizeof(mm_reg1_t) * n_kept);
	*n_regs = n_kept;

	kfree(km, keys);
	kfree(km, sorted);
}
|
219
|
+
|
220
|
+
/*
 * Set sam_pri to 1 on the first hit whose id equals its parent and to 0 on
 * every other hit. Returns the number of hits with id == parent.
 */
int mm_set_sam_pri(int n, mm_reg1_t *r)
{
	int idx, n_pri = 0;

	for (idx = 0; idx < n; ++idx) {
		mm_reg1_t *hit = &r[idx];
		if (hit->id != hit->parent) {
			hit->sam_pri = 0;
			continue;
		}
		++n_pri;
		hit->sam_pri = (n_pri == 1);
	}
	return n_pri;
}
|
230
|
+
|
231
|
+
/*
 * Keep mm_reg1_t::{id,parent} in sync after hits were removed or reordered;
 * also reset id to the array index and recompute sam_pri.
 *
 * A table mapping each old non-negative id to its new index is built first.
 * Then, for every hit:
 *   - parent == MM_PARENT_TMP_PRI  -> the hit becomes its own parent;
 *   - parent is an old id that still exists -> parent is remapped;
 *   - otherwise                    -> parent = MM_PARENT_UNSET.
 * Hits with a negative id are not entered into the table, so nothing can be
 * remapped to them.
 */
void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs)
{
	int *tmp, i, max_id = -1, n_tmp;
	if (n_regs <= 0) return;
	for (i = 0; i < n_regs; ++i)
		max_id = max_id > regs[i].id? max_id : regs[i].id;
	n_tmp = max_id + 1; // 0 when every id is negative
	tmp = (int*)kmalloc(km, n_tmp * sizeof(int));
	for (i = 0; i < n_tmp; ++i) tmp[i] = -1;
	for (i = 0; i < n_regs; ++i)
		if (regs[i].id >= 0) tmp[regs[i].id] = i;
	for (i = 0; i < n_regs; ++i) {
		mm_reg1_t *r = &regs[i];
		r->id = i;
		if (r->parent == MM_PARENT_TMP_PRI)
			r->parent = i;
		// a parent id beyond the largest surviving id has no slot in tmp[]; treat it as removed
		else if (r->parent >= 0 && r->parent < n_tmp && tmp[r->parent] >= 0)
			r->parent = tmp[r->parent];
		else r->parent = MM_PARENT_UNSET;
	}
	kfree(km, tmp);
	mm_set_sam_pri(n_regs, regs);
}
|
254
|
+
|
255
|
+
/*
 * Drop weak secondary hits in place and update *n_.
 *
 * Primary hits (parent == own index) and inversions are always kept. A
 * secondary hit is kept while fewer than best_n secondaries have been kept and
 * either
 *   - its score is at least pri_ratio times the parent's score, or within
 *     min_diff of it, and it is not identical to the parent in query and
 *     reference coordinates; or
 *   - check_strand is set, its score exceeds min_strand_sc and it is on the
 *     opposite strand of its parent (such hits get strand_retained = 1).
 * Dropped hits have their p freed. If anything was dropped, ids and parents
 * are re-synchronised with mm_sync_regs(). No-op unless pri_ratio > 0 and
 * *n_ > 0.
 */
void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int check_strand, int min_strand_sc, int *n_, mm_reg1_t *r)
{
	int src, n_out = 0, n_total, n_2nd = 0;

	if (!(pri_ratio > 0.0f) || *n_ <= 0)
		return;
	n_total = *n_;

	for (src = 0; src < n_total; ++src) {
		mm_reg1_t *hit = &r[src];
		const mm_reg1_t *par;
		int keep = 0;

		if (hit->parent == src || hit->inv) { // primary or inversion
			r[n_out++] = *hit;
			continue;
		}
		par = &r[hit->parent];
		if ((hit->score >= par->score * pri_ratio || hit->score + min_diff >= par->score) && n_2nd < best_n) {
			const int identical = hit->qs == par->qs && hit->qe == par->qe && hit->rid == par->rid
								  && hit->rs == par->rs && hit->re == par->re;
			keep = !identical;
		} else if (check_strand && n_2nd < best_n && hit->score > min_strand_sc && hit->rev != par->rev) {
			hit->strand_retained = 1;
			keep = 1;
		}

		if (keep) {
			r[n_out++] = *hit;
			++n_2nd;
		} else if (hit->p) {
			free(hit->p);
		}
	}
	if (n_out != n_total)
		mm_sync_regs(km, n_out, r); // removing hits requires sync()
	*n_ = n_out;
}
|
276
|
+
|
277
|
+
/*
 * Compact r[] in place, removing hits that have strand_retained set unless
 * their div is below five times the parent's div or below 0.01. Hits without
 * strand_retained are always kept. Returns the number of hits kept.
 */
int mm_filter_strand_retained(int n_regs, mm_reg1_t *r)
{
	int src, n_out = 0;

	for (src = 0; src < n_regs; ++src) {
		const mm_reg1_t *hit = &r[src];

		if (hit->strand_retained && !(hit->div < r[hit->parent].div * 5.0f) && !(hit->div < 0.01f))
			continue; // drop this hit
		if (n_out < src)
			r[n_out] = r[src];
		++n_out;
	}
	return n_out;
}
|
289
|
+
|
290
|
+
/*
 * Remove hits that fail the thresholds in opt, compacting regs[] in place and
 * updating *n_regs.
 *
 * A hit is filtered when it is neither an inversion nor a segment split and
 * has fewer than opt->min_cnt seeds. When base-level alignment is available
 * (r->p set), it is also filtered if mlen < opt->min_chain_score, if
 * p->dp_max < opt->min_dp_max, or if both ends of the query are clipped by
 * more than qlen * opt->max_clip_ratio; the alignment of a filtered hit is
 * freed.
 *
 * NB: after this call, mm_reg1_t::parent can be -1 if its parent filtered out
 */
void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs)
{
	int i, k;
	for (i = k = 0; i < *n_regs; ++i) {
		mm_reg1_t *r = &regs[i];
		int flt = 0;
		if (!r->inv && !r->seg_split && r->cnt < opt->min_cnt) flt = 1;
		if (r->p) { // these filters are only applied when base-alignment is available
			if (r->mlen < opt->min_chain_score) flt = 1;
			else if (r->p->dp_max < opt->min_dp_max) flt = 1;
			else if (r->qs > qlen * opt->max_clip_ratio && qlen - r->qe > qlen * opt->max_clip_ratio) flt = 1;
			if (flt) free(r->p);
		}
		if (!flt) {
			if (k < i) regs[k++] = regs[i];
			else ++k;
		}
	}
	*n_regs = k;
}
|
310
|
+
|
311
|
+
/*
 * Squeeze out regions in a[] that are not referenced by regs[].
 *
 * Hits are visited in increasing order of their seed offset (as); each hit's
 * seeds are moved down so that the referenced runs become contiguous from
 * a[0], and the hit's as is updated to the new offset. Returns the number of
 * seeds remaining at the front of a[].
 */
int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a)
{
	int i, as = 0;
	uint64_t *aux;
	aux = (uint64_t*)kmalloc(km, n_regs * 8);
	for (i = 0; i < n_regs; ++i)
		aux[i] = (uint64_t)regs[i].as << 32 | i; // sort key: seed offset; payload: hit index
	radix_sort_64(aux, aux + n_regs);
	for (i = 0; i < n_regs; ++i) {
		mm_reg1_t *r = &regs[(int32_t)aux[i]];
		if (r->as != as) {
			memmove(&a[as], &a[r->as], r->cnt * 16); // source and destination may overlap
			r->as = as;
		}
		as += r->cnt;
	}
	kfree(km, aux);
	return as;
}
|
330
|
+
|
331
|
+
/*
 * Split hits computed on concatenated query segments into per-segment hits.
 *
 * For each of the n_segs segments this builds a chain list (seg[s].u) and a
 * seed array (seg[s].a) holding only the seeds whose segment id, taken from
 * a[].y via MM_SEED_SEG_MASK/MM_SEED_SEG_SHIFT, equals s. Query positions are
 * shifted so they are relative to the segment. mm_gen_regs() is then run per
 * segment; results are stored in regs[s] and n_regs[s], and every generated
 * hit is tagged with seg_split = 1 and seg_id = s.
 *
 * The returned array and its members are allocated from km; release them
 * with mm_seg_free(). n_segs must not exceed MM_MAX_SEG.
 */
mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a)
{
	int s, i, j, acc_qlen[MM_MAX_SEG+1], qlen_sum = 0;
	mm_seg_t *seg;

	assert(n_segs <= MM_MAX_SEG);
	// acc_qlen[s]: total length of the segments preceding segment s
	for (s = 1, acc_qlen[0] = 0; s < n_segs; ++s)
		acc_qlen[s] = acc_qlen[s-1] + qlens[s-1];
	qlen_sum = acc_qlen[n_segs - 1] + qlens[n_segs - 1];

	seg = (mm_seg_t*)kcalloc(km, n_segs, sizeof(mm_seg_t));
	for (s = 0; s < n_segs; ++s) {
		seg[s].u = (uint64_t*)kmalloc(km, n_regs0 * 8);
		for (i = 0; i < n_regs0; ++i)
			seg[s].u[i] = (uint64_t)regs0[i].score << 32; // score in the high half; seed count starts at 0
	}
	// first pass: count seeds per (segment, chain) and per segment
	for (i = 0; i < n_regs0; ++i) {
		const mm_reg1_t *r = &regs0[i];
		for (j = 0; j < r->cnt; ++j) {
			int sid = (a[r->as + j].y&MM_SEED_SEG_MASK)>>MM_SEED_SEG_SHIFT;
			++seg[sid].u[i];
			++seg[sid].n_a;
		}
	}
	for (s = 0; s < n_segs; ++s) {
		mm_seg_t *sr = &seg[s];
		for (i = 0, sr->n_u = 0; i < n_regs0; ++i) // squeeze out zero-length per-segment chains
			if ((int32_t)sr->u[i] != 0)
				sr->u[sr->n_u++] = sr->u[i];
		sr->a = (mm128_t*)kmalloc(km, sr->n_a * sizeof(mm128_t));
		sr->n_a = 0; // reused below as the fill cursor
	}

	// second pass: copy each seed into its segment with a segment-relative query position
	for (i = 0; i < n_regs0; ++i) {
		const mm_reg1_t *r = &regs0[i];
		for (j = 0; j < r->cnt; ++j) {
			int sid = (a[r->as + j].y&MM_SEED_SEG_MASK)>>MM_SEED_SEG_SHIFT;
			mm128_t a1 = a[r->as + j];
			// on reverse strand, the segment position is:
			//   x_for_cat = qlen_sum - 1 - (int32_t)a1.y - 1 + q_span
			//   (int32_t)new_a1.y = qlens[sid] - (x_for_cat - acc_qlen[sid] + 1 - q_span) - 1 = (int32_t)a1.y - (qlen_sum - (qlens[sid] + acc_qlen[sid]))
			a1.y -= a1.x>>63? qlen_sum - (qlens[sid] + acc_qlen[sid]) : acc_qlen[sid];
			seg[sid].a[seg[sid].n_a++] = a1;
		}
	}
	for (s = 0; s < n_segs; ++s) {
		regs[s] = mm_gen_regs(km, hash, qlens[s], seg[s].n_u, seg[s].u, seg[s].a, 0);
		n_regs[s] = seg[s].n_u;
		for (i = 0; i < n_regs[s]; ++i) {
			regs[s][i].seg_split = 1;
			regs[s][i].seg_id = s;
		}
	}
	return seg;
}
|
386
|
+
|
387
|
+
/*
 * Release the u and a arrays of each of the n_segs entries and then the
 * segs array itself, all through kfree() on km.
 */
void mm_seg_free(void *km, int n_segs, mm_seg_t *segs)
{
	int s;

	s = 0;
	while (s < n_segs) {
		kfree(km, segs[s].u);
		++s;
	}
	s = 0;
	while (s < n_segs) {
		kfree(km, segs[s].a);
		++s;
	}
	kfree(km, segs);
}
|
394
|
+
|
395
|
+
/*
 * Give each inversion hit the smaller mapq of its two neighbours.
 *
 * Hits that are their own parent or have a negative parent are sorted by
 * (rid, rs). An inversion hit that is neither first nor last in that order
 * takes the minimum mapq of the hits immediately before and after it.
 * Nothing is done with fewer than three hits or when no hit is an inversion.
 */
static void mm_set_inv_mapq(void *km, int n_regs, mm_reg1_t *regs)
{
	int i, n_aux;
	mm128_t *aux;
	if (n_regs < 3) return;
	for (i = 0; i < n_regs; ++i)
		if (regs[i].inv) break;
	if (i == n_regs) return; // no inversion hits

	aux = (mm128_t*)kmalloc(km, n_regs * 16);
	for (i = n_aux = 0; i < n_regs; ++i)
		if (regs[i].parent == i || regs[i].parent < 0)
			aux[n_aux].y = i, aux[n_aux++].x = (uint64_t)regs[i].rid << 32 | regs[i].rs;
	radix_sort_128x(aux, aux + n_aux);

	for (i = 1; i < n_aux - 1; ++i) { // skip both ends: they lack one neighbour
		mm_reg1_t *inv = &regs[aux[i].y];
		if (inv->inv) {
			mm_reg1_t *l = &regs[aux[i-1].y];
			mm_reg1_t *r = &regs[aux[i+1].y];
			inv->mapq = l->mapq < r->mapq? l->mapq : r->mapq;
		}
	}
	kfree(km, aux);
}
|
420
|
+
|
421
|
+
/*
 * Compute mm_reg1_t::mapq for every hit.
 *
 * Inversions and hits whose parent differs from their id get mapq 0 here;
 * inversions are then revisited by mm_set_inv_mapq(). For a hit that is its
 * own parent, mapq is derived from its score and seed count, the best
 * competing score (subsc, floored at min_chain_sc) and, when base-level
 * alignment is present, from p->dp_max, p->dp_max2 and the mlen/blen ratio.
 * The result is reduced according to n_sub and clamped to [0, 60]; it is
 * raised from 0 to 1 when p->dp_max is strictly greater than p->dp_max2.
 *
 * rep_len enters through uniq_ratio = sum_sc / (sum_sc + rep_len), where
 * sum_sc is the summed score of all hits with parent == id.
 * When is_sr is zero, mapq is additionally capped by a BWA-MEM like estimate.
 */
void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr)
{
	static const float q_coef = 40.0f;
	int64_t sum_sc = 0;
	float uniq_ratio;
	int i;
	if (n_regs == 0) return;
	for (i = 0; i < n_regs; ++i)
		if (regs[i].parent == regs[i].id)
			sum_sc += regs[i].score;
	uniq_ratio = (float)sum_sc / (sum_sc + rep_len);
	for (i = 0; i < n_regs; ++i) {
		mm_reg1_t *r = &regs[i];
		if (r->inv) {
			r->mapq = 0;
		} else if (r->parent == r->id) {
			int mapq, subsc;
			float pen_s1 = (r->score > 100? 1.0f : 0.01f * r->score) * uniq_ratio; // penalty for low scores
			float pen_cm = r->cnt > 10? 1.0f : 0.1f * r->cnt; // penalty for few seeds
			pen_cm = pen_s1 < pen_cm? pen_s1 : pen_cm;
			subsc = r->subsc > min_chain_sc? r->subsc : min_chain_sc;
			if (r->p && r->p->dp_max2 > 0 && r->p->dp_max > 0) {
				float identity = (float)r->mlen / r->blen;
				float x = (float)r->p->dp_max2 * subsc / r->p->dp_max / r->score0;
				mapq = (int)(identity * pen_cm * q_coef * (1.0f - x * x) * logf((float)r->p->dp_max / match_sc));
				if (!is_sr) {
					int mapq_alt = (int)(6.02f * identity * identity * (r->p->dp_max - r->p->dp_max2) / match_sc + .499f); // BWA-MEM like mapQ, mostly for short reads
					mapq = mapq < mapq_alt? mapq : mapq_alt; // in case the long-read heuristic fails
				}
			} else {
				float x = (float)subsc / r->score0;
				if (r->p) {
					float identity = (float)r->mlen / r->blen;
					mapq = (int)(identity * pen_cm * q_coef * (1.0f - x) * logf((float)r->p->dp_max / match_sc));
				} else {
					mapq = (int)(pen_cm * q_coef * (1.0f - x) * logf(r->score));
				}
			}
			mapq -= (int)(4.343f * logf(r->n_sub + 1) + .499f);
			mapq = mapq > 0? mapq : 0;
			r->mapq = mapq < 60? mapq : 60;
			if (r->p && r->p->dp_max > r->p->dp_max2 && r->mapq == 0) r->mapq = 1;
		} else r->mapq = 0;
	}
	mm_set_inv_mapq(km, n_regs, regs);
}
|