minimap2 0.2.22.0 → 0.2.24.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +60 -76
- data/ext/Rakefile +55 -0
- data/ext/cmappy/cmappy.c +129 -0
- data/ext/cmappy/cmappy.h +44 -0
- data/ext/minimap2/FAQ.md +46 -0
- data/ext/minimap2/LICENSE.txt +24 -0
- data/ext/minimap2/MANIFEST.in +10 -0
- data/ext/minimap2/Makefile +132 -0
- data/ext/minimap2/Makefile.simde +97 -0
- data/ext/minimap2/NEWS.md +821 -0
- data/ext/minimap2/README.md +403 -0
- data/ext/minimap2/align.c +1020 -0
- data/ext/minimap2/bseq.c +169 -0
- data/ext/minimap2/bseq.h +64 -0
- data/ext/minimap2/code_of_conduct.md +30 -0
- data/ext/minimap2/cookbook.md +243 -0
- data/ext/minimap2/esterr.c +64 -0
- data/ext/minimap2/example.c +63 -0
- data/ext/minimap2/format.c +559 -0
- data/ext/minimap2/hit.c +466 -0
- data/ext/minimap2/index.c +775 -0
- data/ext/minimap2/kalloc.c +205 -0
- data/ext/minimap2/kalloc.h +76 -0
- data/ext/minimap2/kdq.h +132 -0
- data/ext/minimap2/ketopt.h +120 -0
- data/ext/minimap2/khash.h +615 -0
- data/ext/minimap2/krmq.h +474 -0
- data/ext/minimap2/kseq.h +256 -0
- data/ext/minimap2/ksort.h +153 -0
- data/ext/minimap2/ksw2.h +184 -0
- data/ext/minimap2/ksw2_dispatch.c +96 -0
- data/ext/minimap2/ksw2_extd2_sse.c +402 -0
- data/ext/minimap2/ksw2_exts2_sse.c +416 -0
- data/ext/minimap2/ksw2_extz2_sse.c +313 -0
- data/ext/minimap2/ksw2_ll_sse.c +152 -0
- data/ext/minimap2/kthread.c +159 -0
- data/ext/minimap2/kthread.h +15 -0
- data/ext/minimap2/kvec.h +105 -0
- data/ext/minimap2/lchain.c +369 -0
- data/ext/minimap2/main.c +459 -0
- data/ext/minimap2/map.c +714 -0
- data/ext/minimap2/minimap.h +410 -0
- data/ext/minimap2/minimap2.1 +725 -0
- data/ext/minimap2/misc/README.md +179 -0
- data/ext/minimap2/misc/mmphase.js +335 -0
- data/ext/minimap2/misc/paftools.js +3149 -0
- data/ext/minimap2/misc.c +162 -0
- data/ext/minimap2/mmpriv.h +132 -0
- data/ext/minimap2/options.c +234 -0
- data/ext/minimap2/pe.c +177 -0
- data/ext/minimap2/python/README.rst +196 -0
- data/ext/minimap2/python/cmappy.h +152 -0
- data/ext/minimap2/python/cmappy.pxd +153 -0
- data/ext/minimap2/python/mappy.pyx +273 -0
- data/ext/minimap2/python/minimap2.py +39 -0
- data/ext/minimap2/sdust.c +213 -0
- data/ext/minimap2/sdust.h +25 -0
- data/ext/minimap2/seed.c +131 -0
- data/ext/minimap2/setup.py +55 -0
- data/ext/minimap2/sketch.c +143 -0
- data/ext/minimap2/splitidx.c +84 -0
- data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
- data/ext/minimap2/test/MT-human.fa +278 -0
- data/ext/minimap2/test/MT-orang.fa +276 -0
- data/ext/minimap2/test/q-inv.fa +4 -0
- data/ext/minimap2/test/q2.fa +2 -0
- data/ext/minimap2/test/t-inv.fa +127 -0
- data/ext/minimap2/test/t2.fa +2 -0
- data/ext/minimap2/tex/Makefile +21 -0
- data/ext/minimap2/tex/bioinfo.cls +930 -0
- data/ext/minimap2/tex/blasr-mc.eval +17 -0
- data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
- data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
- data/ext/minimap2/tex/bwa.eval +55 -0
- data/ext/minimap2/tex/eval2roc.pl +33 -0
- data/ext/minimap2/tex/graphmap.eval +4 -0
- data/ext/minimap2/tex/hs38-simu.sh +10 -0
- data/ext/minimap2/tex/minialign.eval +49 -0
- data/ext/minimap2/tex/minimap2.bib +460 -0
- data/ext/minimap2/tex/minimap2.tex +724 -0
- data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
- data/ext/minimap2/tex/mm2-update.tex +240 -0
- data/ext/minimap2/tex/mm2.approx.eval +12 -0
- data/ext/minimap2/tex/mm2.eval +13 -0
- data/ext/minimap2/tex/natbib.bst +1288 -0
- data/ext/minimap2/tex/natbib.sty +803 -0
- data/ext/minimap2/tex/ngmlr.eval +38 -0
- data/ext/minimap2/tex/roc.gp +60 -0
- data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
- data/ext/minimap2.patch +19 -0
- data/lib/minimap2/aligner.rb +4 -4
- data/lib/minimap2/alignment.rb +11 -11
- data/lib/minimap2/ffi/constants.rb +20 -16
- data/lib/minimap2/ffi/functions.rb +5 -0
- data/lib/minimap2/ffi.rb +4 -5
- data/lib/minimap2/version.rb +2 -2
- data/lib/minimap2.rb +51 -15
- metadata +97 -79
- data/lib/minimap2/ffi_helper.rb +0 -53
- data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,1020 @@
|
|
1
|
+
#include <assert.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <math.h>
|
5
|
+
#include "minimap.h"
|
6
|
+
#include "mmpriv.h"
|
7
|
+
#include "ksw2.h"
|
8
|
+
|
9
|
+
/*
 * Fill the m-by-m scoring matrix mat[] (row-major).
 *
 * The signs of the inputs are normalized first: the match score `a` is made
 * non-negative, the mismatch score `b` and the ambiguous-base score `sc_ambi`
 * are made non-positive. Cells on the diagonal of the leading (m-1)x(m-1)
 * sub-matrix get `a`, the other cells of that sub-matrix get `b`, and the
 * whole last row and last column get `sc_ambi`.
 */
static void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t sc_ambi)
{
	const int last = m - 1;
	int row, col;

	if (a < 0) a = -a;
	if (b > 0) b = -b;
	if (sc_ambi > 0) sc_ambi = -sc_ambi;

	for (row = 0; row < last; ++row) {
		int8_t *cells = mat + row * m;
		for (col = 0; col < last; ++col)
			cells[col] = (col == row)? a : b;
		cells[last] = sc_ambi; // last column of this row
	}
	for (col = 0; col < m; ++col) // last row
		mat[last * m + col] = sc_ambi;
}
|
23
|
+
|
24
|
+
/* Reverse the first `len` bytes of seq[] in place (no complementing). */
static inline void mm_seq_rev(uint32_t len, uint8_t *seq)
{
	const uint32_t half = len >> 1;
	uint32_t lo;

	for (lo = 0; lo < half; ++lo) {
		const uint32_t hi = len - 1 - lo;
		const uint8_t tmp = seq[lo];
		seq[lo] = seq[hi];
		seq[hi] = tmp;
	}
}
|
31
|
+
|
32
|
+
/*
 * Track the running maximum score and the largest drop below it.
 *
 * If `score` is at least *max, (i, j) becomes the new maximum position.
 * Otherwise the drop is computed as
 *     *max - score - |(i - *max_i) - (j - *max_j)| * e
 * and, when it exceeds *max_zdrop, it is recorded together with the region
 * pos[0] = {*max_i, i} and pos[1] = {*max_j, j}.
 */
static inline void update_max_zdrop(int32_t score, int i, int j, int32_t *max, int *max_i, int *max_j, int e, int *max_zdrop, int pos[2][2])
{
	int di, dj, off_diag, drop;

	if (score >= *max) { // new (or equal) maximum: just move the anchor
		*max = score;
		*max_i = i;
		*max_j = j;
		return;
	}
	di = i - *max_i;
	dj = j - *max_j;
	off_diag = di > dj? di - dj : dj - di;
	drop = *max - score - off_diag * e;
	if (drop <= *max_zdrop) return;
	*max_zdrop = drop;
	pos[0][0] = *max_i;
	pos[0][1] = i;
	pos[1][0] = *max_j;
	pos[1][1] = j;
}
|
46
|
+
|
47
|
+
static int mm_test_zdrop(void *km, const mm_mapopt_t *opt, const uint8_t *qseq, const uint8_t *tseq, uint32_t n_cigar, uint32_t *cigar, const int8_t *mat)
|
48
|
+
{
|
49
|
+
uint32_t k;
|
50
|
+
int32_t score = 0, max = INT32_MIN, max_i = -1, max_j = -1, i = 0, j = 0, max_zdrop = 0;
|
51
|
+
int pos[2][2] = {{-1, -1}, {-1, -1}}, q_len, t_len;
|
52
|
+
|
53
|
+
// find the score and the region where score drops most along diagonal
|
54
|
+
for (k = 0, score = 0; k < n_cigar; ++k) {
|
55
|
+
uint32_t l, op = cigar[k]&0xf, len = cigar[k]>>4;
|
56
|
+
if (op == MM_CIGAR_MATCH) {
|
57
|
+
for (l = 0; l < len; ++l) {
|
58
|
+
score += mat[tseq[i + l] * 5 + qseq[j + l]];
|
59
|
+
update_max_zdrop(score, i+l, j+l, &max, &max_i, &max_j, opt->e, &max_zdrop, pos);
|
60
|
+
}
|
61
|
+
i += len, j += len;
|
62
|
+
} else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL || op == MM_CIGAR_N_SKIP) {
|
63
|
+
score -= opt->q + opt->e * len;
|
64
|
+
if (op == MM_CIGAR_INS) j += len;
|
65
|
+
else i += len;
|
66
|
+
update_max_zdrop(score, i, j, &max, &max_i, &max_j, opt->e, &max_zdrop, pos);
|
67
|
+
}
|
68
|
+
}
|
69
|
+
|
70
|
+
// test if there is an inversion in the most dropped region
|
71
|
+
q_len = pos[1][1] - pos[1][0], t_len = pos[0][1] - pos[0][0];
|
72
|
+
if (!(opt->flag&(MM_F_SPLICE|MM_F_SR|MM_F_FOR_ONLY|MM_F_REV_ONLY)) && max_zdrop > opt->zdrop_inv && q_len < opt->max_gap && t_len < opt->max_gap) {
|
73
|
+
uint8_t *qseq2;
|
74
|
+
void *qp;
|
75
|
+
int q_off, t_off;
|
76
|
+
qseq2 = (uint8_t*)kmalloc(km, q_len);
|
77
|
+
for (i = 0; i < q_len; ++i) {
|
78
|
+
int c = qseq[pos[1][1] - i - 1];
|
79
|
+
qseq2[i] = c >= 4? 4 : 3 - c;
|
80
|
+
}
|
81
|
+
qp = ksw_ll_qinit(km, 2, q_len, qseq2, 5, mat);
|
82
|
+
score = ksw_ll_i16(qp, t_len, tseq + pos[0][0], opt->q, opt->e, &q_off, &t_off);
|
83
|
+
kfree(km, qseq2);
|
84
|
+
kfree(km, qp);
|
85
|
+
if (score >= opt->min_chain_score * opt->a && score >= opt->min_dp_max)
|
86
|
+
return 2; // there is a potential inversion
|
87
|
+
}
|
88
|
+
return max_zdrop > opt->zdrop? 1 : 0;
|
89
|
+
}
|
90
|
+
|
91
|
+
/*
 * Normalize the CIGAR stored in r->p. Each CIGAR word holds the operation
 * in its low 4 bits and the length in the remaining bits.
 *
 * Passes, in order:
 *   1. left-align every I/D that sits between two M operations;
 *   2. collapse a run of adjacent I and D operations (e.g. 5I6D7I) into a
 *      single I followed by a single D;
 *   3. if needed, squeeze out zero-length operations and merge adjacent
 *      operations of the same type;
 *   4. remove a leading I or D and move the alignment start accordingly.
 *
 * qseq / tseq: query and target bases, indexed from the first CIGAR column.
 * *qshift / *tshift: set to the number of query / target bases dropped by
 * pass 4 (0 when nothing was dropped). Nothing is done when the CIGAR has
 * at most one operation.
 */
static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, int *qshift, int *tshift)
{
	mm_extra_t *p = r->p;
	int32_t toff = 0, qoff = 0, to_shrink = 0;
	uint32_t k;
	*qshift = *tshift = 0;
	if (p->n_cigar <= 1) return;
	for (k = 0; k < p->n_cigar; ++k) { // indel left alignment
		uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4;
		if (len == 0) to_shrink = 1;
		if (op == MM_CIGAR_MATCH) {
			toff += len, qoff += len;
		} else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) {
			if (k > 0 && k < p->n_cigar - 1 && (p->cigar[k-1]&0xf) == 0 && (p->cigar[k+1]&0xf) == 0) {
				int l, prev_len = p->cigar[k-1] >> 4;
				// count how many bases the gap can slide left without changing the gapped sequence
				if (op == MM_CIGAR_INS) {
					for (l = 0; l < prev_len; ++l)
						if (qseq[qoff - 1 - l] != qseq[qoff + len - 1 - l])
							break;
				} else {
					for (l = 0; l < prev_len; ++l)
						if (tseq[toff - 1 - l] != tseq[toff + len - 1 - l])
							break;
				}
				if (l > 0) // move l columns from the preceding M to the following M
					p->cigar[k-1] -= l<<4, p->cigar[k+1] += l<<4, qoff -= l, toff -= l;
				if (l == prev_len) to_shrink = 1; // the preceding M became empty
			}
			if (op == MM_CIGAR_INS) qoff += len;
			else toff += len;
		} else if (op == MM_CIGAR_N_SKIP) {
			toff += len;
		}
	}
	assert(qoff == r->qe - r->qs && toff == r->re - r->rs);
	for (k = 0; k < p->n_cigar - 2; ++k) { // fix CIGAR like 5I6D7I
		if ((p->cigar[k]&0xf) > 0 && (p->cigar[k]&0xf) + (p->cigar[k+1]&0xf) == 3) {
			uint32_t l, s[3] = {0,0,0};
			for (l = k; l < p->n_cigar; ++l) { // count number of adjacent I and D
				uint32_t op = p->cigar[l]&0xf;
				if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL)
					s[op] += p->cigar[l] >> 4;
				else if (p->cigar[l]>>4 != 0)
					break;
				// a zero-length operation of any other type contributes nothing and is
				// skipped; it must not index s[], which only has slots for codes 0..2
			}
			if (s[1] > 0 && s[2] > 0 && l - k > 2) { // turn to a single I and a single D
				p->cigar[k] = s[1]<<4|MM_CIGAR_INS;
				p->cigar[k+1] = s[2]<<4|MM_CIGAR_DEL;
				for (k += 2; k < l; ++k) // zero the length of the absorbed operations
					p->cigar[k] &= 0xf;
				to_shrink = 1;
			}
			k = l;
		}
	}
	if (to_shrink) { // squeeze out zero-length operations
		int32_t l = 0;
		for (k = 0; k < p->n_cigar; ++k) // squeeze out zero-length operations
			if (p->cigar[k]>>4 != 0)
				p->cigar[l++] = p->cigar[k];
		p->n_cigar = l;
		for (k = l = 0; k < p->n_cigar; ++k) // merge two adjacent operations if they are the same
			if (k == p->n_cigar - 1 || (p->cigar[k]&0xf) != (p->cigar[k+1]&0xf))
				p->cigar[l++] = p->cigar[k];
			else p->cigar[k+1] += p->cigar[k]>>4<<4; // add length to the next CIGAR operator
		p->n_cigar = l;
	}
	// get rid of leading I or D; skipped when squeezing left no operation at all,
	// otherwise cigar[0] would be stale and --n_cigar would wrap around
	if (p->n_cigar > 0 && ((p->cigar[0]&0xf) == MM_CIGAR_INS || (p->cigar[0]&0xf) == MM_CIGAR_DEL)) {
		int32_t l = p->cigar[0] >> 4;
		if ((p->cigar[0]&0xf) == MM_CIGAR_INS) {
			if (r->rev) r->qe -= l;
			else r->qs += l;
			*qshift = l;
		} else r->rs += l, *tshift = l;
		--p->n_cigar;
		memmove(p->cigar, p->cigar + 1, p->n_cigar * 4);
	}
}
|
168
|
+
|
169
|
+
/*
 * Rewrite every M operation of r->p->cigar as runs of "=" (query base equals
 * target base) and "X" (bases differ). Other operations are kept unchanged.
 *
 * qseq / tseq: query and target bases, indexed from the first CIGAR column.
 *
 * If no M operation needs to be split, the CIGAR is updated in place and
 * each M becomes "=" or "X" according to its bases. Otherwise a new
 * mm_extra_t is allocated with the libc calloc(), the header is copied, the
 * expanded CIGAR is written, and the old r->p is released with free().
 * Equal ambiguous codes on both sides are reported as "=".
 */
static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq) // written by @armintoepfer
{
	uint32_t n_EQX = 0;
	uint32_t k, l, m, cap, toff = 0, qoff = 0, n_M = 0;
	int need_split = 0;
	mm_extra_t *p;
	if (r->p == 0) return;
	// pass 1: count the "="/"X" runs each M operation expands to
	for (k = 0; k < r->p->n_cigar; ++k) {
		uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
		if (op == MM_CIGAR_MATCH) {
			uint32_t n_run = 0;
			while (len > 0) {
				for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {} // run of "="; TODO: N<=>N is converted to "="
				if (l > 0) { ++n_run; len -= l; toff += l; qoff += l; }

				for (l = 0; l < len && qseq[qoff + l] != tseq[toff + l]; ++l) {} // run of "X"
				if (l > 0) { ++n_run; len -= l; toff += l; qoff += l; }
			}
			if (n_run > 1) need_split = 1;
			n_EQX += n_run;
			++n_M;
		} else if (op == MM_CIGAR_INS) {
			qoff += len;
		} else if (op == MM_CIGAR_DEL) {
			toff += len;
		} else if (op == MM_CIGAR_N_SKIP) {
			toff += len;
		}
	}
	// update in-place if we can: every M is a single run, so its first column
	// tells whether the whole operation is "=" or "X"
	if (!need_split) {
		toff = qoff = 0;
		for (k = 0; k < r->p->n_cigar; ++k) {
			uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
			if (op == MM_CIGAR_MATCH) {
				// len == 0 is tested first so that no base is read for an empty operation
				uint32_t new_op = (len == 0 || qseq[qoff] == tseq[toff])? MM_CIGAR_EQ_MATCH : MM_CIGAR_X_MISMATCH;
				r->p->cigar[k] = len << 4 | new_op;
				toff += len, qoff += len;
			} else if (op == MM_CIGAR_INS) {
				qoff += len;
			} else if (op == MM_CIGAR_DEL) {
				toff += len;
			} else if (op == MM_CIGAR_N_SKIP) {
				toff += len;
			}
		}
		return;
	}
	// allocate new storage; cap counts 4-byte words. Note that the header term is
	// sizeof(mm_extra_t) here, not sizeof(mm_extra_t)/4 as in mm_append_cigar(),
	// so this reserves more room than strictly needed.
	cap = r->p->n_cigar + (n_EQX - n_M) + sizeof(mm_extra_t);
	kroundup32(cap);
	p = (mm_extra_t*)calloc(cap, 4);
	memcpy(p, r->p, sizeof(mm_extra_t));
	p->capacity = cap;
	// update cigar while copying
	toff = qoff = m = 0;
	for (k = 0; k < r->p->n_cigar; ++k) {
		uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
		if (op == MM_CIGAR_MATCH) {
			while (len > 0) {
				// match
				for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {}
				if (l > 0) p->cigar[m++] = l << 4 | MM_CIGAR_EQ_MATCH;
				len -= l;
				toff += l, qoff += l;
				// mismatch
				for (l = 0; l < len && qseq[qoff + l] != tseq[toff + l]; ++l) {}
				if (l > 0) p->cigar[m++] = l << 4 | MM_CIGAR_X_MISMATCH;
				len -= l;
				toff += l, qoff += l;
			}
			continue; // the M itself is not copied
		} else if (op == MM_CIGAR_INS) {
			qoff += len;
		} else if (op == MM_CIGAR_DEL) {
			toff += len;
		} else if (op == MM_CIGAR_N_SKIP) {
			toff += len;
		}
		p->cigar[m++] = r->p->cigar[k];
	}
	p->n_cigar = m;
	free(r->p);
	r->p = p;
}
|
239
|
+
|
240
|
+
/*
 * Finalize the per-alignment statistics kept in r and r->p.
 *
 * The CIGAR is first normalized with mm_fix_cigar(); qseq/tseq are advanced
 * by the shifts it reports. The CIGAR is then walked once to compute:
 *   r->blen    - columns in M/I/D operations that involve no base code > 3;
 *   r->mlen    - M columns where both bases are <= 3 and equal;
 *   p->n_ambi  - incremented by the number of columns with a base code > 3;
 *   p->dp_max  - maximum of a running score that is reset to 0 whenever it
 *                turns negative, rounded to an integer.
 * A gap costs q + e * mg_log2(1 + len) when log_gap is set and q + e
 * otherwise. When is_eqx is set, M operations are finally rewritten as =/X.
 * Does nothing when r->p is NULL.
 */
static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int is_eqx, int log_gap)
{
	uint32_t k, l;
	int32_t qshift, tshift, toff = 0, qoff = 0;
	double s = 0.0, max = 0.0;
	mm_extra_t *p = r->p;

	if (p == 0) return;
	mm_fix_cigar(r, qseq, tseq, &qshift, &tshift);
	// qseq and tseq may be shifted due to the removal of leading I/D
	qseq += qshift;
	tseq += tshift;
	r->blen = r->mlen = 0;
	for (k = 0; k < p->n_cigar; ++k) {
		const uint32_t op = p->cigar[k] & 0xf, len = p->cigar[k] >> 4;
		if (op == MM_CIGAR_MATCH) {
			int n_ambi = 0, n_diff = 0;
			for (l = 0; l < len; ++l) {
				const int cq = qseq[qoff + l], ct = tseq[toff + l];
				if (ct > 3 || cq > 3) ++n_ambi;
				else if (ct != cq) ++n_diff;
				s += mat[ct * 5 + cq];
				if (s < 0) s = 0;
				else if (s > max) max = s;
			}
			r->blen += len - n_ambi;
			r->mlen += len - (n_ambi + n_diff);
			p->n_ambi += n_ambi;
			toff += len;
			qoff += len;
		} else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) {
			const int is_ins = (op == MM_CIGAR_INS);
			// an insertion spans query bases, a deletion spans target bases
			const uint8_t *gap_seq = is_ins? qseq + qoff : tseq + toff;
			int n_ambi = 0;
			for (l = 0; l < len; ++l)
				if (gap_seq[l] > 3) ++n_ambi;
			r->blen += len - n_ambi;
			p->n_ambi += n_ambi;
			if (log_gap) s -= q + (double)e * mg_log2(1.0 + len);
			else s -= q + e;
			if (s < 0) s = 0;
			if (is_ins) qoff += len;
			else toff += len;
		} else if (op == MM_CIGAR_N_SKIP) {
			toff += len; // skipped target bases carry no score
		}
	}
	p->dp_max = (int32_t)(max + .499);
	assert(qoff == r->qe - r->qs && toff == r->re - r->rs);
	// NB: this has to be called here as changes to qseq and tseq are not returned
	if (is_eqx) mm_update_cigar_eqx(r, qseq, tseq);
}
|
290
|
+
|
291
|
+
/*
 * Append n_cigar CIGAR words to r->p, creating or growing r->p as needed.
 *
 * r->p is allocated with the libc calloc() when absent and grown with the
 * libc realloc() when the existing capacity (counted in 4-byte words,
 * header included) is too small. If the first appended operation has the
 * same type as the last stored one, the two are merged by adding lengths.
 * Allocation results are not checked. Nothing happens when n_cigar is 0.
 */
static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) // TODO: this calls the libc realloc()
{
	mm_extra_t *p;
	uint32_t *src = cigar, n_src = n_cigar;

	if (n_cigar == 0) return;

	if (r->p == 0) { // first use: allocate zeroed storage
		uint32_t capacity = n_cigar + sizeof(mm_extra_t)/4;
		kroundup32(capacity);
		r->p = (mm_extra_t*)calloc(capacity, 4);
		r->p->capacity = capacity;
	} else if (r->p->n_cigar + n_cigar + sizeof(mm_extra_t)/4 > r->p->capacity) { // grow
		r->p->capacity = r->p->n_cigar + n_cigar + sizeof(mm_extra_t)/4;
		kroundup32(r->p->capacity);
		r->p = (mm_extra_t*)realloc(r->p, r->p->capacity * 4);
	}

	p = r->p;
	if (p->n_cigar > 0 && (p->cigar[p->n_cigar-1]&0xf) == (cigar[0]&0xf)) {
		// same CIGAR op at the boundary: fold its length into the last stored word
		p->cigar[p->n_cigar-1] += cigar[0]>>4<<4;
		++src;
		--n_src;
	}
	if (n_src > 0)
		memcpy(p->cigar + p->n_cigar, src, n_src * 4);
	p->n_cigar += n_src;
}
|
315
|
+
|
316
|
+
/*
 * Align qseq[0..qlen) against tseq[0..tlen) and store the result in *ez.
 *
 * Which aligner is called depends on opt:
 *   - if opt->max_sw_mat > 0 and tlen * qlen exceeds it, no alignment is
 *     run: *ez is reset and marked as z-dropped;
 *   - with MM_F_SPLICE set, ksw_exts2_sse() is used (it receives junc);
 *   - if q == q2 and e == e2, ksw_extz2_sse() is used;
 *   - otherwise ksw_extd2_sse() is used.
 * w, end_bonus, zdrop and flag are forwarded to the aligner.
 *
 * When MM_DBG_PRINT_ALN_SEQ is set in mm_dbg_flag, the parameters, both
 * sequences and the resulting score and CIGAR are written to stderr. The
 * trace reports the `zdrop` argument, i.e. the value actually given to the
 * aligner.
 */
static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint8_t *qseq, int tlen, const uint8_t *tseq, const uint8_t *junc, const int8_t *mat, int w, int end_bonus, int zdrop, int flag, ksw_extz_t *ez)
{
	if (mm_dbg_flag & MM_DBG_PRINT_ALN_SEQ) {
		int i;
		fprintf(stderr, "===> q=(%d,%d), e=(%d,%d), bw=%d, flag=%d, zdrop=%d <===\n", opt->q, opt->q2, opt->e, opt->e2, w, flag, zdrop);
		for (i = 0; i < tlen; ++i) fputc("ACGTN"[tseq[i]], stderr);
		fputc('\n', stderr);
		for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr);
		fputc('\n', stderr);
	}
	if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) {
		// the DP matrix would be too large: give up and report a z-drop
		ksw_reset_extz(ez);
		ez->zdropped = 1;
	} else if (opt->flag & MM_F_SPLICE)
		ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, zdrop, opt->junc_bonus, flag, junc, ez);
	else if (opt->q == opt->q2 && opt->e == opt->e2)
		ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, zdrop, end_bonus, flag, ez);
	else
		ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, w, zdrop, end_bonus, flag, ez);
	if (mm_dbg_flag & MM_DBG_PRINT_ALN_SEQ) {
		int i;
		fprintf(stderr, "score=%d, cigar=", ez->score);
		for (i = 0; i < ez->n_cigar; ++i)
			fprintf(stderr, "%d%c", ez->cigar[i]>>4, MM_CIGAR_STR[ez->cigar[i]&0xf]);
		fprintf(stderr, "\n");
	}
}
|
343
|
+
|
344
|
+
static inline int mm_get_hplen_back(const mm_idx_t *mi, uint32_t rid, uint32_t x)
|
345
|
+
{
|
346
|
+
int64_t i, off0 = mi->seq[rid].offset, off = off0 + x;
|
347
|
+
int c = mm_seq4_get(mi->S, off);
|
348
|
+
for (i = off - 1; i >= off0; --i)
|
349
|
+
if (mm_seq4_get(mi->S, i) != c) break;
|
350
|
+
return (int)(off - i);
|
351
|
+
}
|
352
|
+
|
353
|
+
/*
 * Derive a reference coordinate *r and a query coordinate *q from anchor a.
 *
 * Without MM_I_HPC, both are the low 32 bits of a->x / a->y minus half the
 * k-mer length. With MM_I_HPC, each coordinate is moved back to the start of
 * the run of identical bases that ends at the anchor position.
 */
static inline void mm_adjust_minier(const mm_idx_t *mi, uint8_t *const qseq0[2], mm128_t *a, int32_t *r, int32_t *q)
{
	const uint8_t *qseq;
	int i, c;

	if (!(mi->flag & MM_I_HPC)) {
		const int half_k = mi->k >> 1;
		*r = (int32_t)a->x - half_k;
		*q = (int32_t)a->y - half_k;
		return;
	}

	qseq = qseq0[a->x>>63]; // top bit of a->x selects which query array to read
	*q = (int32_t)a->y;
	c = qseq[*q];
	// NOTE(review): the scan stops at i == 0 without testing qseq[0], unlike
	// mm_get_hplen_back() which scans down to the first base - confirm intended.
	for (i = *q - 1; i > 0; --i)
		if (qseq[i] != c) break;
	*q = i + 1;
	c = mm_get_hplen_back(mi, a->x<<1>>33, (int32_t)a->x);
	*r = (int32_t)a->x + 1 - c;
}
|
369
|
+
|
370
|
+
/*
 * Collect the anchors a[as1+1 .. as1+cnt1-1] whose gap to the previous
 * anchor is longer than min_gap in absolute value. The gap is the difference
 * between the step in y and the step in x of two consecutive anchors.
 *
 * Returns an array of indices relative to as1, allocated with kmalloc(km)
 * (to be released with kfree), and stores its size in *n_. When fewer than
 * two such gaps exist, returns NULL and *n_ is 0.
 */
static int *collect_long_gaps(void *km, int as1, int cnt1, mm128_t *a, int min_gap, int *n_)
{
	const mm128_t *s = a + as1;
	int i, n_long = 0, n_kept = 0, *K;

	*n_ = 0;
	for (i = 1; i < cnt1; ++i) { // first pass: count
		int gap = ((int32_t)s[i].y - s[i - 1].y) - ((int32_t)s[i].x - s[i - 1].x);
		if (gap < -min_gap || gap > min_gap) ++n_long;
	}
	if (n_long <= 1) return 0;

	K = (int*)kmalloc(km, n_long * sizeof(int));
	for (i = 1; i < cnt1; ++i) { // second pass: record positions
		int gap = ((int32_t)s[i].y - s[i - 1].y) - ((int32_t)s[i].x - s[i - 1].x);
		if (gap < -min_gap || gap > min_gap)
			K[n_kept++] = i;
	}
	*n_ = n_kept;
	return K;
}
|
388
|
+
|
389
|
+
/*
 * Flag anchors lying between nearby long gaps that largely cancel out.
 *
 * Long gaps (|gap| > min_gap) are collected first. For each long gap k, up
 * to max_ext_cnt following long gaps within max_ext_len of it (on both
 * coordinates) are accumulated as inserted and deleted lengths; the amount
 * that cancels is n_ins + n_del - |n_ins - n_del|. When the largest such
 * amount exceeds diff_thres, the span with the largest amount is remembered,
 * and the anchors from its first long gap up to (not including) its last one
 * get MM_SEED_IGNORE set in .y once the scan reaches the end of that span.
 */
static void mm_filter_bad_seeds(void *km, int as1, int cnt1, mm128_t *a, int min_gap, int diff_thres, int max_ext_len, int max_ext_cnt)
{
	mm128_t *s = a + as1;
	int best = 0, best_st = -1, best_en = -1, n, i, k, *K;

	K = collect_long_gaps(km, as1, cnt1, a, min_gap, &n);
	if (K == 0) return;

	for (k = 0;; ++k) { // traverse long gaps
		int gap, l, n_ins = 0, n_del = 0, qs, rs, top = 0, top_l = -1;

		if (k == n || k >= best_en) { // reached the end of the pending span (or of the list)
			if (best_en > 0) {
				for (i = K[best_st]; i < K[best_en]; ++i)
					s[i].y |= MM_SEED_IGNORE;
			}
			best = 0;
			best_st = best_en = -1;
			if (k == n) break;
		}

		i = K[k];
		gap = ((int32_t)s[i].y - (int32_t)s[i - 1].y) - (int32_t)(s[i].x - s[i - 1].x);
		if (gap > 0) n_ins += gap;
		else n_del += -gap;
		qs = (int32_t)s[i - 1].y;
		rs = (int32_t)s[i - 1].x;

		for (l = k + 1; l < n && l <= k + max_ext_cnt; ++l) { // extend over the following long gaps
			int j = K[l], diff;
			if ((int32_t)s[j].y - qs > max_ext_len || (int32_t)s[j].x - rs > max_ext_len) break;
			gap = ((int32_t)s[j].y - (int32_t)s[j - 1].y) - (int32_t)(s[j].x - s[j - 1].x);
			if (gap > 0) n_ins += gap;
			else n_del += -gap;
			diff = n_ins + n_del - abs(n_ins - n_del);
			if (top < diff) {
				top = diff;
				top_l = l;
			}
		}
		if (top > diff_thres && top > best) {
			best = top;
			best_st = k;
			best_en = top_l;
		}
	}
	kfree(km, K);
}
|
425
|
+
|
426
|
+
/*
 * Alternative filter for anchors between nearby long gaps.
 *
 * Starting from long gap k, following long gaps are chained as long as each
 * lies within max_ext of the previous one on both coordinates and the
 * distance from the previous gap to the anchor preceding the next gap (plus
 * that anchor's span) does not exceed the sum of the two gap sizes. If at
 * least one gap was chained, anchors from the first gap up to (not
 * including) the last chained gap get MM_SEED_IGNORE and the last chained
 * gap's anchor gets MM_SEED_LONG_JOIN.
 */
static void mm_filter_bad_seeds_alt(void *km, int as1, int cnt1, mm128_t *a, int min_gap, int max_ext)
{
	mm128_t *s = a + as1;
	int n, k, *K;

	K = collect_long_gaps(km, as1, cnt1, a, min_gap, &n);
	if (K == 0) return;

	k = 0;
	while (k < n) {
		const int i = K[k];
		int l;
		int gap1 = ((int32_t)s[i].y - (int32_t)s[i - 1].y) - ((int32_t)s[i].x - (int32_t)s[i - 1].x);
		int re1 = (int32_t)s[i].x;
		int qe1 = (int32_t)s[i].y;

		if (gap1 <= 0) gap1 = -gap1;
		for (l = k + 1; l < n; ++l) {
			const int j = K[l];
			int gap2, q_span_pre, rs2, qs2, m;

			if ((int32_t)s[j].y - qe1 > max_ext || (int32_t)s[j].x - re1 > max_ext) break;
			gap2 = ((int32_t)s[j].y - (int32_t)s[j - 1].y) - (int32_t)(s[j].x - s[j - 1].x);
			q_span_pre = s[j - 1].y >> 32 & 0xff;
			rs2 = (int32_t)s[j - 1].x + q_span_pre;
			qs2 = (int32_t)s[j - 1].y + q_span_pre;
			m = rs2 - re1 < qs2 - qe1? rs2 - re1 : qs2 - qe1; // shorter of the two distances
			if (gap2 <= 0) gap2 = -gap2;
			if (m > gap1 + gap2) break;
			re1 = (int32_t)s[j].x;
			qe1 = (int32_t)s[j].y;
			gap1 = gap2;
		}
		if (l > k + 1) { // at least one following gap was chained
			const int end = K[l - 1];
			int j;
			for (j = K[k]; j < end; ++j)
				s[j].y |= MM_SEED_IGNORE;
			s[end].y |= MM_SEED_LONG_JOIN;
		}
		k = l;
	}
	kfree(km, K);
}
|
461
|
+
|
462
|
+
static void mm_fix_bad_ends(const mm_reg1_t *r, const mm128_t *a, int bw, int min_match, int32_t *as, int32_t *cnt)
|
463
|
+
{
|
464
|
+
int32_t i, l, m;
|
465
|
+
*as = r->as, *cnt = r->cnt;
|
466
|
+
if (r->cnt < 3) return;
|
467
|
+
m = l = a[r->as].y >> 32 & 0xff;
|
468
|
+
for (i = r->as + 1; i < r->as + r->cnt - 1; ++i) {
|
469
|
+
int32_t lq, lr, min, max;
|
470
|
+
int32_t q_span = a[i].y >> 32 & 0xff;
|
471
|
+
if (a[i].y & MM_SEED_LONG_JOIN) break;
|
472
|
+
lr = (int32_t)a[i].x - (int32_t)a[i-1].x;
|
473
|
+
lq = (int32_t)a[i].y - (int32_t)a[i-1].y;
|
474
|
+
min = lr < lq? lr : lq;
|
475
|
+
max = lr > lq? lr : lq;
|
476
|
+
if (max - min > l >> 1) *as = i;
|
477
|
+
l += min;
|
478
|
+
m += min < q_span? min : q_span;
|
479
|
+
if (l >= bw << 1 || (m >= min_match && m >= bw) || m >= r->mlen >> 1) break;
|
480
|
+
}
|
481
|
+
*cnt = r->as + r->cnt - *as;
|
482
|
+
m = l = a[r->as + r->cnt - 1].y >> 32 & 0xff;
|
483
|
+
for (i = r->as + r->cnt - 2; i > *as; --i) {
|
484
|
+
int32_t lq, lr, min, max;
|
485
|
+
int32_t q_span = a[i+1].y >> 32 & 0xff;
|
486
|
+
if (a[i+1].y & MM_SEED_LONG_JOIN) break;
|
487
|
+
lr = (int32_t)a[i+1].x - (int32_t)a[i].x;
|
488
|
+
lq = (int32_t)a[i+1].y - (int32_t)a[i].y;
|
489
|
+
min = lr < lq? lr : lq;
|
490
|
+
max = lr > lq? lr : lq;
|
491
|
+
if (max - min > l >> 1) *cnt = i + 1 - *as;
|
492
|
+
l += min;
|
493
|
+
m += min < q_span? min : q_span;
|
494
|
+
if (l >= bw << 1 || (m >= min_match && m >= bw) || m >= r->mlen >> 1) break;
|
495
|
+
}
|
496
|
+
}
|
497
|
+
|
498
|
+
static void mm_max_stretch(const mm_reg1_t *r, const mm128_t *a, int32_t *as, int32_t *cnt)
|
499
|
+
{
|
500
|
+
int32_t i, score, max_score, len, max_i, max_len;
|
501
|
+
|
502
|
+
*as = r->as, *cnt = r->cnt;
|
503
|
+
if (r->cnt < 2) return;
|
504
|
+
|
505
|
+
max_score = -1, max_i = -1, max_len = 0;
|
506
|
+
score = a[r->as].y >> 32 & 0xff, len = 1;
|
507
|
+
for (i = r->as + 1; i < r->as + r->cnt; ++i) {
|
508
|
+
int32_t lq, lr, q_span;
|
509
|
+
q_span = a[i].y >> 32 & 0xff;
|
510
|
+
lr = (int32_t)a[i].x - (int32_t)a[i-1].x;
|
511
|
+
lq = (int32_t)a[i].y - (int32_t)a[i-1].y;
|
512
|
+
if (lq == lr) {
|
513
|
+
score += lq < q_span? lq : q_span;
|
514
|
+
++len;
|
515
|
+
} else {
|
516
|
+
if (score > max_score)
|
517
|
+
max_score = score, max_len = len, max_i = i - len;
|
518
|
+
score = q_span, len = 1;
|
519
|
+
}
|
520
|
+
}
|
521
|
+
if (score > max_score)
|
522
|
+
max_score = score, max_len = len, max_i = i - len;
|
523
|
+
*as = max_i, *cnt = max_len;
|
524
|
+
}
|
525
|
+
|
526
|
+
static int mm_seed_ext_score(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, const int8_t mat[25], int qlen, uint8_t *qseq0[2], const mm128_t *a)
|
527
|
+
{
|
528
|
+
uint8_t *qseq, *tseq;
|
529
|
+
int q_span = a->y>>32&0xff, qs, qe, rs, re, rid, score, q_off, t_off, ext_len = opt->anchor_ext_len;
|
530
|
+
void *qp;
|
531
|
+
rid = a->x<<1>>33;
|
532
|
+
re = (uint32_t)a->x + 1, rs = re - q_span;
|
533
|
+
qe = (uint32_t)a->y + 1, qs = qe - q_span;
|
534
|
+
rs = rs - ext_len > 0? rs - ext_len : 0;
|
535
|
+
qs = qs - ext_len > 0? qs - ext_len : 0;
|
536
|
+
re = re + ext_len < (int32_t)mi->seq[rid].len? re + ext_len : mi->seq[rid].len;
|
537
|
+
qe = qe + ext_len < qlen? qe + ext_len : qlen;
|
538
|
+
tseq = (uint8_t*)kmalloc(km, re - rs);
|
539
|
+
if (opt->flag & MM_F_QSTRAND) {
|
540
|
+
qseq = qseq0[0] + qs;
|
541
|
+
mm_idx_getseq2(mi, a->x>>63, rid, rs, re, tseq);
|
542
|
+
} else {
|
543
|
+
qseq = qseq0[a->x>>63] + qs;
|
544
|
+
mm_idx_getseq(mi, rid, rs, re, tseq);
|
545
|
+
}
|
546
|
+
qp = ksw_ll_qinit(km, 2, qe - qs, qseq, 5, mat);
|
547
|
+
score = ksw_ll_i16(qp, re - rs, tseq, opt->q, opt->e, &q_off, &t_off);
|
548
|
+
kfree(km, tseq);
|
549
|
+
kfree(km, qp);
|
550
|
+
return score;
|
551
|
+
}
|
552
|
+
|
553
|
+
static void mm_fix_bad_ends_splice(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, const mm_reg1_t *r, const int8_t mat[25], int qlen, uint8_t *qseq0[2], const mm128_t *a, int *as1, int *cnt1)
|
554
|
+
{ // this assumes a very crude k-mer based mode; it is not necessary to use a good model just for filtering bounary exons
|
555
|
+
int score;
|
556
|
+
double log_gap;
|
557
|
+
*as1 = r->as, *cnt1 = r->cnt;
|
558
|
+
if (r->cnt < 3) return;
|
559
|
+
log_gap = log((int32_t)a[r->as + 1].x - (int32_t)a[r->as].x);
|
560
|
+
if ((a[r->as].y>>32&0xff) < log_gap + opt->anchor_ext_shift) {
|
561
|
+
score = mm_seed_ext_score(km, opt, mi, mat, qlen, qseq0, &a[r->as]);
|
562
|
+
if ((double)score / mat[0] < log_gap + opt->anchor_ext_shift) // a more exact format is "score < log_4(gap) + shift"
|
563
|
+
++(*as1), --(*cnt1);
|
564
|
+
}
|
565
|
+
log_gap = log((int32_t)a[r->as + r->cnt - 1].x - (int32_t)a[r->as + r->cnt - 2].x);
|
566
|
+
if ((a[r->as + r->cnt - 1].y>>32&0xff) < log_gap + opt->anchor_ext_shift) {
|
567
|
+
score = mm_seed_ext_score(km, opt, mi, mat, qlen, qseq0, &a[r->as + r->cnt - 1]);
|
568
|
+
if ((double)score / mat[0] < log_gap + opt->anchor_ext_shift)
|
569
|
+
--(*cnt1);
|
570
|
+
}
|
571
|
+
}
|
572
|
+
|
573
|
+
static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], mm_reg1_t *r, mm_reg1_t *r2, int n_a, mm128_t *a, ksw_extz_t *ez, int splice_flag)
|
574
|
+
{
|
575
|
+
int is_sr = !!(opt->flag & MM_F_SR), is_splice = !!(opt->flag & MM_F_SPLICE);
|
576
|
+
int32_t rid = a[r->as].x<<1>>33, rev = a[r->as].x>>63, as1, cnt1;
|
577
|
+
uint8_t *tseq, *qseq, *junc;
|
578
|
+
int32_t i, l, bw, bw_long, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0;
|
579
|
+
int32_t rs, re, qs, qe;
|
580
|
+
int32_t rs1, qs1, re1, qe1;
|
581
|
+
int8_t mat[25];
|
582
|
+
|
583
|
+
if (is_sr) assert(!(mi->flag & MM_I_HPC)); // HPC won't work with SR because with HPC we can't easily tell if there is a gap
|
584
|
+
|
585
|
+
r2->cnt = 0;
|
586
|
+
if (r->cnt == 0) return;
|
587
|
+
ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
|
588
|
+
bw = (int)(opt->bw * 1.5 + 1.);
|
589
|
+
bw_long = (int)(opt->bw_long * 1.5 + 1.);
|
590
|
+
if (bw_long < bw) bw_long = bw;
|
591
|
+
|
592
|
+
if (is_sr && !(mi->flag & MM_I_HPC)) {
|
593
|
+
mm_max_stretch(r, a, &as1, &cnt1);
|
594
|
+
rs = (int32_t)a[as1].x + 1 - (int32_t)(a[as1].y>>32&0xff);
|
595
|
+
qs = (int32_t)a[as1].y + 1 - (int32_t)(a[as1].y>>32&0xff);
|
596
|
+
re = (int32_t)a[as1+cnt1-1].x + 1;
|
597
|
+
qe = (int32_t)a[as1+cnt1-1].y + 1;
|
598
|
+
} else {
|
599
|
+
if (!(opt->flag & MM_F_NO_END_FLT)) {
|
600
|
+
if (is_splice)
|
601
|
+
mm_fix_bad_ends_splice(km, opt, mi, r, mat, qlen, qseq0, a, &as1, &cnt1);
|
602
|
+
else
|
603
|
+
mm_fix_bad_ends(r, a, opt->bw, opt->min_chain_score * 2, &as1, &cnt1);
|
604
|
+
} else as1 = r->as, cnt1 = r->cnt;
|
605
|
+
mm_filter_bad_seeds(km, as1, cnt1, a, 10, 40, opt->max_gap>>1, 10);
|
606
|
+
mm_filter_bad_seeds_alt(km, as1, cnt1, a, 30, opt->max_gap>>1);
|
607
|
+
mm_adjust_minier(mi, qseq0, &a[as1], &rs, &qs);
|
608
|
+
mm_adjust_minier(mi, qseq0, &a[as1 + cnt1 - 1], &re, &qe);
|
609
|
+
}
|
610
|
+
assert(cnt1 > 0);
|
611
|
+
|
612
|
+
if (is_splice) {
|
613
|
+
if (splice_flag & MM_F_SPLICE_FOR) extra_flag |= rev? KSW_EZ_SPLICE_REV : KSW_EZ_SPLICE_FOR;
|
614
|
+
if (splice_flag & MM_F_SPLICE_REV) extra_flag |= rev? KSW_EZ_SPLICE_FOR : KSW_EZ_SPLICE_REV;
|
615
|
+
if (opt->flag & MM_F_SPLICE_FLANK) extra_flag |= KSW_EZ_SPLICE_FLANK;
|
616
|
+
}
|
617
|
+
|
618
|
+
/* Look for the start and end of regions to perform DP. This sounds easy
|
619
|
+
* but is in fact tricky. Excessively small regions lead to unnecessary
|
620
|
+
* clippings and lose alignable sequences. Excessively large regions
|
621
|
+
* occasionally lead to large overlaps between two chains and may cause
|
622
|
+
* loss of alignments in corner cases. */
|
623
|
+
if (is_sr) {
|
624
|
+
qs0 = 0, qe0 = qlen;
|
625
|
+
l = qs;
|
626
|
+
l += l * opt->a + opt->end_bonus > opt->q? (l * opt->a + opt->end_bonus - opt->q) / opt->e : 0;
|
627
|
+
rs0 = rs - l > 0? rs - l : 0;
|
628
|
+
l = qlen - qe;
|
629
|
+
l += l * opt->a + opt->end_bonus > opt->q? (l * opt->a + opt->end_bonus - opt->q) / opt->e : 0;
|
630
|
+
re0 = re + l < (int32_t)mi->seq[rid].len? re + l : mi->seq[rid].len;
|
631
|
+
} else {
|
632
|
+
// compute rs0 and qs0
|
633
|
+
rs0 = (int32_t)a[r->as].x + 1 - (int32_t)(a[r->as].y>>32&0xff);
|
634
|
+
qs0 = (int32_t)a[r->as].y + 1 - (int32_t)(a[r->as].y>>32&0xff);
|
635
|
+
if (rs0 < 0) rs0 = 0; // this may happen when HPC is in use
|
636
|
+
assert(qs0 >= 0); // this should never happen, or it is logic error
|
637
|
+
rs1 = qs1 = 0;
|
638
|
+
for (i = r->as - 1, l = 0; i >= 0 && a[i].x>>32 == a[r->as].x>>32; --i) { // inspect nearby seeds
|
639
|
+
int32_t x = (int32_t)a[i].x + 1 - (int32_t)(a[i].y>>32&0xff);
|
640
|
+
int32_t y = (int32_t)a[i].y + 1 - (int32_t)(a[i].y>>32&0xff);
|
641
|
+
if (x < rs0 && y < qs0) {
|
642
|
+
if (++l > opt->min_cnt) {
|
643
|
+
l = rs0 - x > qs0 - y? rs0 - x : qs0 - y;
|
644
|
+
rs1 = rs0 - l, qs1 = qs0 - l;
|
645
|
+
if (rs1 < 0) rs1 = 0; // not strictly necessary; better have this guard for explicit
|
646
|
+
break;
|
647
|
+
}
|
648
|
+
}
|
649
|
+
}
|
650
|
+
if (qs > 0 && rs > 0) {
|
651
|
+
l = qs < opt->max_gap? qs : opt->max_gap;
|
652
|
+
qs1 = qs1 > qs - l? qs1 : qs - l;
|
653
|
+
qs0 = qs0 < qs1? qs0 : qs1; // at least include qs0
|
654
|
+
l += l * opt->a > opt->q? (l * opt->a - opt->q) / opt->e : 0;
|
655
|
+
l = l < opt->max_gap? l : opt->max_gap;
|
656
|
+
l = l < rs? l : rs;
|
657
|
+
rs1 = rs1 > rs - l? rs1 : rs - l;
|
658
|
+
rs0 = rs0 < rs1? rs0 : rs1;
|
659
|
+
rs0 = rs0 < rs? rs0 : rs;
|
660
|
+
} else rs0 = rs, qs0 = qs;
|
661
|
+
// compute re0 and qe0
|
662
|
+
re0 = (int32_t)a[r->as + r->cnt - 1].x + 1;
|
663
|
+
qe0 = (int32_t)a[r->as + r->cnt - 1].y + 1;
|
664
|
+
re1 = mi->seq[rid].len, qe1 = qlen;
|
665
|
+
for (i = r->as + r->cnt, l = 0; i < n_a && a[i].x>>32 == a[r->as].x>>32; ++i) { // inspect nearby seeds
|
666
|
+
int32_t x = (int32_t)a[i].x + 1;
|
667
|
+
int32_t y = (int32_t)a[i].y + 1;
|
668
|
+
if (x > re0 && y > qe0) {
|
669
|
+
if (++l > opt->min_cnt) {
|
670
|
+
l = x - re0 > y - qe0? x - re0 : y - qe0;
|
671
|
+
re1 = re0 + l, qe1 = qe0 + l;
|
672
|
+
break;
|
673
|
+
}
|
674
|
+
}
|
675
|
+
}
|
676
|
+
if (qe < qlen && re < (int32_t)mi->seq[rid].len) {
|
677
|
+
l = qlen - qe < opt->max_gap? qlen - qe : opt->max_gap;
|
678
|
+
qe1 = qe1 < qe + l? qe1 : qe + l;
|
679
|
+
qe0 = qe0 > qe1? qe0 : qe1; // at least include qe0
|
680
|
+
l += l * opt->a > opt->q? (l * opt->a - opt->q) / opt->e : 0;
|
681
|
+
l = l < opt->max_gap? l : opt->max_gap;
|
682
|
+
l = l < (int32_t)mi->seq[rid].len - re? l : mi->seq[rid].len - re;
|
683
|
+
re1 = re1 < re + l? re1 : re + l;
|
684
|
+
re0 = re0 > re1? re0 : re1;
|
685
|
+
} else re0 = re, qe0 = qe;
|
686
|
+
}
|
687
|
+
if (a[r->as].y & MM_SEED_SELF) {
|
688
|
+
int max_ext = r->qs > r->rs? r->qs - r->rs : r->rs - r->qs;
|
689
|
+
if (r->rs - rs0 > max_ext) rs0 = r->rs - max_ext;
|
690
|
+
if (r->qs - qs0 > max_ext) qs0 = r->qs - max_ext;
|
691
|
+
max_ext = r->qe > r->re? r->qe - r->re : r->re - r->qe;
|
692
|
+
if (re0 - r->re > max_ext) re0 = r->re + max_ext;
|
693
|
+
if (qe0 - r->qe > max_ext) qe0 = r->qe + max_ext;
|
694
|
+
}
|
695
|
+
|
696
|
+
assert(re0 > rs0);
|
697
|
+
tseq = (uint8_t*)kmalloc(km, re0 - rs0);
|
698
|
+
junc = (uint8_t*)kmalloc(km, re0 - rs0);
|
699
|
+
|
700
|
+
if (qs > 0 && rs > 0) { // left extension; probably the condition can be changed to "qs > qs0 && rs > rs0"
|
701
|
+
if (opt->flag & MM_F_QSTRAND) {
|
702
|
+
qseq = &qseq0[0][qs0];
|
703
|
+
mm_idx_getseq2(mi, rev, rid, rs0, rs, tseq);
|
704
|
+
} else {
|
705
|
+
qseq = &qseq0[rev][qs0];
|
706
|
+
mm_idx_getseq(mi, rid, rs0, rs, tseq);
|
707
|
+
}
|
708
|
+
mm_idx_bed_junc(mi, rid, rs0, rs, junc);
|
709
|
+
mm_seq_rev(qs - qs0, qseq);
|
710
|
+
mm_seq_rev(rs - rs0, tseq);
|
711
|
+
mm_seq_rev(rs - rs0, junc);
|
712
|
+
mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, junc, mat, bw, opt->end_bonus, r->split_inv? opt->zdrop_inv : opt->zdrop, extra_flag|KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez);
|
713
|
+
if (ez->n_cigar > 0) {
|
714
|
+
mm_append_cigar(r, ez->n_cigar, ez->cigar);
|
715
|
+
r->p->dp_score += ez->max;
|
716
|
+
}
|
717
|
+
rs1 = rs - (ez->reach_end? ez->mqe_t + 1 : ez->max_t + 1);
|
718
|
+
qs1 = qs - (ez->reach_end? qs - qs0 : ez->max_q + 1);
|
719
|
+
mm_seq_rev(qs - qs0, qseq);
|
720
|
+
} else rs1 = rs, qs1 = qs;
|
721
|
+
re1 = rs, qe1 = qs;
|
722
|
+
assert(qs1 >= 0 && rs1 >= 0);
|
723
|
+
|
724
|
+
for (i = is_sr? cnt1 - 1 : 1; i < cnt1; ++i) { // gap filling
|
725
|
+
if ((a[as1+i].y & (MM_SEED_IGNORE|MM_SEED_TANDEM)) && i != cnt1 - 1) continue;
|
726
|
+
if (is_sr && !(mi->flag & MM_I_HPC)) {
|
727
|
+
re = (int32_t)a[as1 + i].x + 1;
|
728
|
+
qe = (int32_t)a[as1 + i].y + 1;
|
729
|
+
} else mm_adjust_minier(mi, qseq0, &a[as1 + i], &re, &qe);
|
730
|
+
re1 = re, qe1 = qe;
|
731
|
+
if (i == cnt1 - 1 || (a[as1+i].y&MM_SEED_LONG_JOIN) || (qe - qs >= opt->min_ksw_len && re - rs >= opt->min_ksw_len)) {
|
732
|
+
int j, bw1 = bw_long, zdrop_code;
|
733
|
+
if (a[as1+i].y & MM_SEED_LONG_JOIN)
|
734
|
+
bw1 = qe - qs > re - rs? qe - qs : re - rs;
|
735
|
+
// perform alignment
|
736
|
+
if (opt->flag & MM_F_QSTRAND) {
|
737
|
+
qseq = &qseq0[0][qs];
|
738
|
+
mm_idx_getseq2(mi, rev, rid, rs, re, tseq);
|
739
|
+
} else {
|
740
|
+
qseq = &qseq0[rev][qs];
|
741
|
+
mm_idx_getseq(mi, rid, rs, re, tseq);
|
742
|
+
}
|
743
|
+
mm_idx_bed_junc(mi, rid, rs, re, junc);
|
744
|
+
if (is_sr) { // perform ungapped alignment
|
745
|
+
assert(qe - qs == re - rs);
|
746
|
+
ksw_reset_extz(ez);
|
747
|
+
for (j = 0, ez->score = 0; j < qe - qs; ++j) {
|
748
|
+
if (qseq[j] >= 4 || tseq[j] >= 4) ez->score += opt->e2;
|
749
|
+
else ez->score += qseq[j] == tseq[j]? opt->a : -opt->b;
|
750
|
+
}
|
751
|
+
ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, MM_CIGAR_MATCH, qe - qs);
|
752
|
+
} else { // perform normal gapped alignment
|
753
|
+
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, junc, mat, bw1, -1, opt->zdrop, extra_flag|KSW_EZ_APPROX_MAX, ez); // first pass: with approximate Z-drop
|
754
|
+
}
|
755
|
+
// test Z-drop and inversion Z-drop
|
756
|
+
if ((zdrop_code = mm_test_zdrop(km, opt, qseq, tseq, ez->n_cigar, ez->cigar, mat)) != 0)
|
757
|
+
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, junc, mat, bw1, -1, zdrop_code == 2? opt->zdrop_inv : opt->zdrop, extra_flag, ez); // second pass: lift approximate
|
758
|
+
// update CIGAR
|
759
|
+
if (ez->n_cigar > 0)
|
760
|
+
mm_append_cigar(r, ez->n_cigar, ez->cigar);
|
761
|
+
if (ez->zdropped) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle.
|
762
|
+
if (!r->p) {
|
763
|
+
assert(ez->n_cigar == 0);
|
764
|
+
uint32_t capacity = sizeof(mm_extra_t)/4;
|
765
|
+
kroundup32(capacity);
|
766
|
+
r->p = (mm_extra_t*)calloc(capacity, 4);
|
767
|
+
r->p->capacity = capacity;
|
768
|
+
}
|
769
|
+
for (j = i - 1; j >= 0; --j)
|
770
|
+
if ((int32_t)a[as1 + j].x <= rs + ez->max_t)
|
771
|
+
break;
|
772
|
+
dropped = 1;
|
773
|
+
if (j < 0) j = 0;
|
774
|
+
r->p->dp_score += ez->max;
|
775
|
+
re1 = rs + (ez->max_t + 1);
|
776
|
+
qe1 = qs + (ez->max_q + 1);
|
777
|
+
if (cnt1 - (j + 1) >= opt->min_cnt) {
|
778
|
+
mm_split_reg(r, r2, as1 + j + 1 - r->as, qlen, a, !!(opt->flag&MM_F_QSTRAND));
|
779
|
+
if (zdrop_code == 2) r2->split_inv = 1;
|
780
|
+
}
|
781
|
+
break;
|
782
|
+
} else r->p->dp_score += ez->score;
|
783
|
+
rs = re, qs = qe;
|
784
|
+
}
|
785
|
+
}
|
786
|
+
|
787
|
+
if (!dropped && qe < qe0 && re < re0) { // right extension
|
788
|
+
if (opt->flag & MM_F_QSTRAND) {
|
789
|
+
qseq = &qseq0[0][qe];
|
790
|
+
mm_idx_getseq2(mi, rev, rid, re, re0, tseq);
|
791
|
+
} else {
|
792
|
+
qseq = &qseq0[rev][qe];
|
793
|
+
mm_idx_getseq(mi, rid, re, re0, tseq);
|
794
|
+
}
|
795
|
+
mm_idx_bed_junc(mi, rid, re, re0, junc);
|
796
|
+
mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, junc, mat, bw, opt->end_bonus, opt->zdrop, extra_flag|KSW_EZ_EXTZ_ONLY, ez);
|
797
|
+
if (ez->n_cigar > 0) {
|
798
|
+
mm_append_cigar(r, ez->n_cigar, ez->cigar);
|
799
|
+
r->p->dp_score += ez->max;
|
800
|
+
}
|
801
|
+
re1 = re + (ez->reach_end? ez->mqe_t + 1 : ez->max_t + 1);
|
802
|
+
qe1 = qe + (ez->reach_end? qe0 - qe : ez->max_q + 1);
|
803
|
+
}
|
804
|
+
assert(qe1 <= qlen);
|
805
|
+
|
806
|
+
r->rs = rs1, r->re = re1;
|
807
|
+
if (!rev || (opt->flag & MM_F_QSTRAND)) r->qs = qs1, r->qe = qe1;
|
808
|
+
else r->qs = qlen - qe1, r->qe = qlen - qs1;
|
809
|
+
|
810
|
+
assert(re1 - rs1 <= re0 - rs0);
|
811
|
+
if (r->p) {
|
812
|
+
if (opt->flag & MM_F_QSTRAND) {
|
813
|
+
mm_idx_getseq2(mi, r->rev, rid, rs1, re1, tseq);
|
814
|
+
qseq = &qseq0[0][qs1];
|
815
|
+
} else {
|
816
|
+
mm_idx_getseq(mi, rid, rs1, re1, tseq);
|
817
|
+
qseq = &qseq0[r->rev][qs1];
|
818
|
+
}
|
819
|
+
mm_update_extra(r, qseq, tseq, mat, opt->q, opt->e, opt->flag & MM_F_EQX, !(opt->flag & MM_F_SR));
|
820
|
+
if (rev && r->p->trans_strand)
|
821
|
+
r->p->trans_strand ^= 3; // flip to the read strand
|
822
|
+
}
|
823
|
+
|
824
|
+
kfree(km, tseq);
|
825
|
+
kfree(km, junc);
|
826
|
+
}
|
827
|
+
|
828
|
+
/*
 * Try to align the stretch between two adjacent split pieces, r1 and r2, on
 * the opposite strand and record the result in r_inv.
 *
 * r_inv is always zeroed first. Returns 1 when r_inv has been filled with an
 * alignment (CIGAR, coordinates, inv flag), 0 otherwise.
 *
 * NB: this doesn't work with the qstrand mode
 */
static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], const mm_reg1_t *r1, const mm_reg1_t *r2, mm_reg1_t *r_inv, ksw_extz_t *ez)
{
	int8_t mat[25];
	uint8_t *tseq, *qseq;
	void *qp;
	int ql, tl, q_off, t_off, score, ret = 0;

	memset(r_inv, 0, sizeof(mm_reg1_t));

	// r1 must carry split bit 1 and r2 split bit 2
	if ((r1->split & 1) == 0 || (r2->split & 2) == 0) return 0;
	// each piece must be its own parent or carry the temporary-primary marker
	if (r1->id != r1->parent && r1->parent != MM_PARENT_TMP_PRI) return 0;
	if (r2->id != r2->parent && r2->parent != MM_PARENT_TMP_PRI) return 0;
	// same reference sequence and same strand
	if (r1->rid != r2->rid || r1->rev != r2->rev) return 0;

	// lengths of the unaligned stretch on the query (ql) and the target (tl)
	if (r1->rev) ql = r1->qs - r2->qe;
	else ql = r2->qs - r1->qe;
	tl = r2->rs - r1->re;
	if (ql < opt->min_chain_score || ql > opt->max_gap || tl < opt->min_chain_score || tl > opt->max_gap)
		return 0;

	ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
	tseq = (uint8_t*)kmalloc(km, tl);
	mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq);
	// take the query stretch from the strand opposite to r1/r2
	if (r1->rev) qseq = &qseq0[0][r2->qe];
	else qseq = &qseq0[1][qlen - r2->qs];

	// score the reversed sequences, then restore both buffers in place
	mm_seq_rev(ql, qseq);
	mm_seq_rev(tl, tseq);
	qp = ksw_ll_qinit(km, 2, ql, qseq, 5, mat);
	score = ksw_ll_i16(qp, tl, tseq, opt->q, opt->e, &q_off, &t_off);
	kfree(km, qp);
	mm_seq_rev(ql, qseq);
	mm_seq_rev(tl, tseq);

	if (score >= opt->min_dp_max) {
		// translate offsets found on the reversed sequences to forward offsets
		q_off = ql - (q_off + 1);
		t_off = tl - (t_off + 1);
		mm_align_pair(km, opt, ql - q_off, qseq + q_off, tl - t_off, tseq + t_off, 0, mat, (int)(opt->bw * 1.5), -1, opt->zdrop, KSW_EZ_EXTZ_ONLY, ez);
		if (ez->n_cigar != 0) { // an empty CIGAR should never happen here
			mm_append_cigar(r_inv, ez->n_cigar, ez->cigar);
			r_inv->p->dp_score = ez->max;
			r_inv->id = -1;
			r_inv->parent = MM_PARENT_UNSET;
			r_inv->inv = 1;
			r_inv->rev = !r1->rev;
			r_inv->rid = r1->rid;
			r_inv->div = -1.0f;
			if (r_inv->rev) {
				r_inv->qe = r2->qs - q_off;
				r_inv->qs = r_inv->qe - (ez->max_q + 1);
			} else {
				r_inv->qs = r2->qe + q_off;
				r_inv->qe = r_inv->qs + ez->max_q + 1;
			}
			r_inv->rs = r1->re + t_off;
			r_inv->re = r_inv->rs + ez->max_t + 1;
			mm_update_extra(r_inv, &qseq[q_off], &tseq[t_off], mat, opt->q, opt->e, opt->flag & MM_F_EQX, !(opt->flag & MM_F_SR));
			ret = 1;
		}
	}

	kfree(km, tseq);
	return ret;
}
|
884
|
+
|
885
|
+
/*
 * Insert a copy of *r right after position i in the array regs, which
 * currently holds *n_regs elements.
 *
 * The array is grown by one element with realloc(), so the caller must use
 * the returned pointer in place of regs. *n_regs is incremented.
 */
static inline mm_reg1_t *mm_insert_reg(const mm_reg1_t *r, int i, int *n_regs, mm_reg1_t *regs)
{
	regs = (mm_reg1_t*)realloc(regs, (*n_regs + 1) * sizeof(mm_reg1_t));
	if (i + 1 != *n_regs) // not appending at the end: shift the tail up by one slot
		memmove(&regs[i + 2], &regs[i + 1], sizeof(mm_reg1_t) * (*n_regs - i - 1));
	regs[i + 1] = *r;
	++*n_regs;
	return regs;
}
|
894
|
+
|
895
|
+
static inline void mm_count_gaps(const mm_reg1_t *r, int32_t *n_gap_, int32_t *n_gapo_)
|
896
|
+
{
|
897
|
+
uint32_t i;
|
898
|
+
int32_t n_gapo = 0, n_gap = 0;
|
899
|
+
*n_gap_ = *n_gapo_ = -1;
|
900
|
+
if (r->p == 0) return;
|
901
|
+
for (i = 0; i < r->p->n_cigar; ++i) {
|
902
|
+
int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4;
|
903
|
+
if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL)
|
904
|
+
++n_gapo, n_gap += len;
|
905
|
+
}
|
906
|
+
*n_gap_ = n_gap, *n_gapo_ = n_gapo;
|
907
|
+
}
|
908
|
+
|
909
|
+
double mm_event_identity(const mm_reg1_t *r)
|
910
|
+
{
|
911
|
+
int32_t n_gap, n_gapo;
|
912
|
+
if (r->p == 0) return -1.0f;
|
913
|
+
mm_count_gaps(r, &n_gap, &n_gapo);
|
914
|
+
return (double)r->mlen / (r->blen + r->p->n_ambi - n_gap + n_gapo);
|
915
|
+
}
|
916
|
+
|
917
|
+
static int32_t mm_recal_max_dp(const mm_reg1_t *r, double b2, int32_t match_sc)
|
918
|
+
{
|
919
|
+
uint32_t i;
|
920
|
+
int32_t n_gap = 0, n_gapo = 0, n_mis;
|
921
|
+
double gap_cost = 0.0;
|
922
|
+
if (r->p == 0) return -1;
|
923
|
+
for (i = 0; i < r->p->n_cigar; ++i) {
|
924
|
+
int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4;
|
925
|
+
if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) {
|
926
|
+
gap_cost += b2 + (double)mg_log2(1.0 + len);
|
927
|
+
++n_gapo, n_gap += len;
|
928
|
+
}
|
929
|
+
}
|
930
|
+
n_mis = r->blen + r->p->n_ambi - r->mlen - n_gap;
|
931
|
+
return (int32_t)(match_sc * (r->mlen - b2 * n_mis - gap_cost) + .499);
|
932
|
+
}
|
933
|
+
|
934
|
+
/*
 * Re-score dp_max of every aligned region with a mismatch weight derived from
 * the divergence of the best region.
 *
 * Nothing is changed unless all of the following hold:
 *   - there are at least two regions, and at least two of them have alignment
 *     data with non-negative dp_max;
 *   - the best region spans at least qlen * frac on the query;
 *   - the second best dp_max is at least frac times the best dp_max.
 *
 * qlen    query length
 * n_regs  number of entries in regs
 * regs    regions; entries with a null p are ignored
 * frac    fraction used in both thresholds above
 * a, b    match score and mismatch penalty
 */
void mm_update_dp_max(int qlen, int n_regs, mm_reg1_t *regs, float frac, int a, int b)
{
	int32_t max = -1, max2 = -1, i, max_i = -1;
	double div, b2;
	if (n_regs < 2) return;
	// find the best and the second best dp_max among regions with alignments
	for (i = 0; i < n_regs; ++i) {
		mm_reg1_t *r = &regs[i];
		if (r->p == 0) continue;
		if (r->p->dp_max > max) max2 = max, max = r->p->dp_max, max_i = i;
		else if (r->p->dp_max > max2) max2 = r->p->dp_max;
	}
	if (max_i < 0 || max < 0 || max2 < 0) return;
	if (regs[max_i].qe - regs[max_i].qs < (double)qlen * frac) return;
	if (max2 < (double)max * frac) return;
	div = 1. - mm_event_identity(&regs[max_i]);
	if (div < 0.02) div = 0.02;
	b2 = 0.5 / div; // max value: 25
	// NOTE(review): b/a (not a/b) would make b2 * a equal b here; confirm the intended floor
	if (b2 * a < b) b2 = (double)a / b;
	for (i = 0; i < n_regs; ++i) {
		mm_reg1_t *r = &regs[i];
		if (r->p == 0) continue;
		r->p->dp_max = mm_recal_max_dp(r, b2, a);
		if (r->p->dp_max < 0) r->p->dp_max = 0; // clamp negative scores to zero
	}
}
|
959
|
+
|
960
|
+
/*
 * Run base-level alignment for every region in regs.
 *
 * km       kalloc handle passed through to all temporary allocations
 * opt      mapping options
 * mi       index
 * qlen     query length
 * qstr     query sequence as characters; encoded here via seq_nt4_table
 * n_regs_  in: number of regions; out: number of regions after splitting,
 *          inversion insertion, filtering and sorting
 * regs     regions to align; may be reallocated by mm_insert_reg()
 * a        seed array, squeezed in place by mm_squeeze_a()
 *
 * Returns the (possibly reallocated) region array; the caller must use it in
 * place of regs.
 */
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a)
{
	extern unsigned char seq_nt4_table[256];
	int32_t i, n_regs = *n_regs_, n_a;
	uint8_t *qseq0[2];
	ksw_extz_t ez;

	// encode the query sequence: qseq0[0] is the forward strand and qseq0[1]
	// the reversed strand with codes below 4 replaced by 3 - code
	qseq0[0] = (uint8_t*)kmalloc(km, qlen * 2);
	qseq0[1] = qseq0[0] + qlen;
	for (i = 0; i < qlen; ++i) {
		qseq0[0][i] = seq_nt4_table[(uint8_t)qstr[i]];
		qseq0[1][qlen - 1 - i] = qseq0[0][i] < 4? 3 - qseq0[0][i] : 4;
	}

	// align through seed hits
	n_a = mm_squeeze_a(km, n_regs, regs, a);
	memset(&ez, 0, sizeof(ksw_extz_t));
	for (i = 0; i < n_regs; ++i) {
		mm_reg1_t r2;
		if ((opt->flag&MM_F_SPLICE) && (opt->flag&MM_F_SPLICE_FOR) && (opt->flag&MM_F_SPLICE_REV)) { // then do two rounds of alignments for both strands
			mm_reg1_t s[2], s2[2];
			int which, trans_strand;
			s[0] = s[1] = regs[i];
			mm_align1(km, opt, mi, qlen, qseq0, &s[0], &s2[0], n_a, a, &ez, MM_F_SPLICE_FOR);
			mm_align1(km, opt, mi, qlen, qseq0, &s[1], &s2[1], n_a, a, &ez, MM_F_SPLICE_REV);
			if (s[0].p->dp_score > s[1].p->dp_score) which = 0, trans_strand = 1;
			else if (s[0].p->dp_score < s[1].p->dp_score) which = 1, trans_strand = 2;
			else trans_strand = 3, which = (qlen + s[0].p->dp_score) & 1; // randomly choose a strand, effectively
			// keep the chosen round and release the alignment data of the other
			if (which == 0) {
				regs[i] = s[0], r2 = s2[0];
				free(s[1].p);
			} else {
				regs[i] = s[1], r2 = s2[1];
				free(s[0].p);
			}
			regs[i].p->trans_strand = trans_strand;
		} else { // one round of alignment
			mm_align1(km, opt, mi, qlen, qseq0, &regs[i], &r2, n_a, a, &ez, opt->flag);
			if (opt->flag&MM_F_SPLICE)
				regs[i].p->trans_strand = opt->flag&MM_F_SPLICE_FOR? 1 : 2;
		}
		// a non-empty r2 is the split-off remainder; queue it right after regs[i]
		if (r2.cnt > 0) regs = mm_insert_reg(&r2, i, &n_regs, regs);
		if (i > 0 && regs[i].split_inv && !(opt->flag & MM_F_NO_INV)) {
			if (mm_align1_inv(km, opt, mi, qlen, qseq0, &regs[i-1], &regs[i], &r2, &ez)) {
				regs = mm_insert_reg(&r2, i, &n_regs, regs);
				++i; // skip the inserted INV alignment
			}
		}
	}
	*n_regs_ = n_regs;
	kfree(km, qseq0[0]);
	kfree(km, ez.cigar);
	mm_filter_regs(opt, qlen, n_regs_, regs);
	if (!(opt->flag&MM_F_SR) && !opt->split_prefix && qlen >= opt->rank_min_len) {
		mm_update_dp_max(qlen, *n_regs_, regs, opt->rank_frac, opt->a, opt->b);
		mm_filter_regs(opt, qlen, n_regs_, regs);
	}
	mm_hit_sort(km, n_regs_, regs, opt->alt_drop);
	return regs;
}
|