minimap2 0.0.4 → 0.2.23.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +113 -98
  3. data/ext/Rakefile +41 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +807 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +344 -0
  41. data/ext/minimap2/main.c +455 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +409 -0
  44. data/ext/minimap2/minimap2.1 +722 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +131 -0
  50. data/ext/minimap2/options.c +233 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/ext/vendor/libminimap2.so +0 -0
  93. data/lib/minimap2/aligner.rb +16 -5
  94. data/lib/minimap2/alignment.rb +6 -2
  95. data/lib/minimap2/ffi/constants.rb +74 -53
  96. data/lib/minimap2/ffi/functions.rb +5 -0
  97. data/lib/minimap2/ffi.rb +1 -2
  98. data/lib/minimap2/version.rb +2 -1
  99. data/lib/minimap2.rb +67 -22
  100. metadata +98 -64
  101. data/lib/minimap2/ffi_helper.rb +0 -53
@@ -0,0 +1,1020 @@
1
+ #include <assert.h>
2
+ #include <string.h>
3
+ #include <stdlib.h>
4
+ #include <math.h>
5
+ #include "minimap.h"
6
+ #include "mmpriv.h"
7
+ #include "ksw2.h"
8
+
9
/*
 * Fill an m-by-m substitution matrix (row-major in mat).
 *
 * The last row and the last column are set to the ambiguous-base score; of
 * the remaining cells, the diagonal gets the match score and every other cell
 * the mismatch score. Signs are normalised first: the match score `a` is made
 * non-negative, while `b` and `sc_ambi` are made non-positive.
 */
static void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t sc_ambi)
{
	int row, col;
	const int last = m - 1; // index of the ambiguous-base row/column

	if (a < 0) a = -a;
	if (b > 0) b = -b;
	if (sc_ambi > 0) sc_ambi = -sc_ambi;

	for (row = 0; row < last; ++row) {
		int8_t *line = mat + row * m;
		for (col = 0; col < last; ++col)
			line[col] = row == col ? a : b;
		line[last] = sc_ambi;
	}
	for (col = 0; col < m; ++col) // the whole last row is ambiguous
		mat[last * m + col] = sc_ambi;
}
23
+
24
/* Reverse the first len bytes of seq in place. */
static inline void mm_seq_rev(uint32_t len, uint8_t *seq)
{
	uint32_t lo, hi;
	if (len < 2) return; // nothing to swap; also keeps len - 1 from wrapping
	for (lo = 0, hi = len - 1; lo < hi; ++lo, --hi) {
		uint8_t tmp = seq[lo];
		seq[lo] = seq[hi];
		seq[hi] = tmp;
	}
}
31
+
32
/*
 * Track the running maximum score and the largest drop from it.
 *
 * If score is not below *max, (i, j) becomes the new maximum position.
 * Otherwise the drop is *max - score minus e times the distance off the
 * diagonal between (i, j) and the maximum position; when that exceeds
 * *max_zdrop, the drop and the two end points are recorded: pos[0] holds the
 * i-coordinates {start, end}, pos[1] the j-coordinates.
 */
static inline void update_max_zdrop(int32_t score, int i, int j, int32_t *max, int *max_i, int *max_j, int e, int *max_zdrop, int pos[2][2])
{
	int di, dj, off_diag, drop;

	if (score >= *max) { // new running maximum
		*max = score;
		*max_i = i;
		*max_j = j;
		return;
	}
	di = i - *max_i;
	dj = j - *max_j;
	off_diag = di > dj ? di - dj : dj - di;
	drop = *max - score - off_diag * e;
	if (drop > *max_zdrop) {
		*max_zdrop = drop;
		pos[0][0] = *max_i;
		pos[0][1] = i;
		pos[1][0] = *max_j;
		pos[1][1] = j;
	}
}
46
+
47
+ static int mm_test_zdrop(void *km, const mm_mapopt_t *opt, const uint8_t *qseq, const uint8_t *tseq, uint32_t n_cigar, uint32_t *cigar, const int8_t *mat)
48
+ {
49
+ uint32_t k;
50
+ int32_t score = 0, max = INT32_MIN, max_i = -1, max_j = -1, i = 0, j = 0, max_zdrop = 0;
51
+ int pos[2][2] = {{-1, -1}, {-1, -1}}, q_len, t_len;
52
+
53
+ // find the score and the region where score drops most along diagonal
54
+ for (k = 0, score = 0; k < n_cigar; ++k) {
55
+ uint32_t l, op = cigar[k]&0xf, len = cigar[k]>>4;
56
+ if (op == MM_CIGAR_MATCH) {
57
+ for (l = 0; l < len; ++l) {
58
+ score += mat[tseq[i + l] * 5 + qseq[j + l]];
59
+ update_max_zdrop(score, i+l, j+l, &max, &max_i, &max_j, opt->e, &max_zdrop, pos);
60
+ }
61
+ i += len, j += len;
62
+ } else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL || op == MM_CIGAR_N_SKIP) {
63
+ score -= opt->q + opt->e * len;
64
+ if (op == MM_CIGAR_INS) j += len;
65
+ else i += len;
66
+ update_max_zdrop(score, i, j, &max, &max_i, &max_j, opt->e, &max_zdrop, pos);
67
+ }
68
+ }
69
+
70
+ // test if there is an inversion in the most dropped region
71
+ q_len = pos[1][1] - pos[1][0], t_len = pos[0][1] - pos[0][0];
72
+ if (!(opt->flag&(MM_F_SPLICE|MM_F_SR|MM_F_FOR_ONLY|MM_F_REV_ONLY)) && max_zdrop > opt->zdrop_inv && q_len < opt->max_gap && t_len < opt->max_gap) {
73
+ uint8_t *qseq2;
74
+ void *qp;
75
+ int q_off, t_off;
76
+ qseq2 = (uint8_t*)kmalloc(km, q_len);
77
+ for (i = 0; i < q_len; ++i) {
78
+ int c = qseq[pos[1][1] - i - 1];
79
+ qseq2[i] = c >= 4? 4 : 3 - c;
80
+ }
81
+ qp = ksw_ll_qinit(km, 2, q_len, qseq2, 5, mat);
82
+ score = ksw_ll_i16(qp, t_len, tseq + pos[0][0], opt->q, opt->e, &q_off, &t_off);
83
+ kfree(km, qseq2);
84
+ kfree(km, qp);
85
+ if (score >= opt->min_chain_score * opt->a && score >= opt->min_dp_max)
86
+ return 2; // there is a potential inversion
87
+ }
88
+ return max_zdrop > opt->zdrop? 1 : 0;
89
+ }
90
+
91
+ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, int *qshift, int *tshift)
92
+ {
93
+ mm_extra_t *p = r->p;
94
+ int32_t toff = 0, qoff = 0, to_shrink = 0;
95
+ uint32_t k;
96
+ *qshift = *tshift = 0;
97
+ if (p->n_cigar <= 1) return;
98
+ for (k = 0; k < p->n_cigar; ++k) { // indel left alignment
99
+ uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4;
100
+ if (len == 0) to_shrink = 1;
101
+ if (op == MM_CIGAR_MATCH) {
102
+ toff += len, qoff += len;
103
+ } else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) {
104
+ if (k > 0 && k < p->n_cigar - 1 && (p->cigar[k-1]&0xf) == 0 && (p->cigar[k+1]&0xf) == 0) {
105
+ int l, prev_len = p->cigar[k-1] >> 4;
106
+ if (op == MM_CIGAR_INS) {
107
+ for (l = 0; l < prev_len; ++l)
108
+ if (qseq[qoff - 1 - l] != qseq[qoff + len - 1 - l])
109
+ break;
110
+ } else {
111
+ for (l = 0; l < prev_len; ++l)
112
+ if (tseq[toff - 1 - l] != tseq[toff + len - 1 - l])
113
+ break;
114
+ }
115
+ if (l > 0)
116
+ p->cigar[k-1] -= l<<4, p->cigar[k+1] += l<<4, qoff -= l, toff -= l;
117
+ if (l == prev_len) to_shrink = 1;
118
+ }
119
+ if (op == MM_CIGAR_INS) qoff += len;
120
+ else toff += len;
121
+ } else if (op == MM_CIGAR_N_SKIP) {
122
+ toff += len;
123
+ }
124
+ }
125
+ assert(qoff == r->qe - r->qs && toff == r->re - r->rs);
126
+ for (k = 0; k < p->n_cigar - 2; ++k) { // fix CIGAR like 5I6D7I
127
+ if ((p->cigar[k]&0xf) > 0 && (p->cigar[k]&0xf) + (p->cigar[k+1]&0xf) == 3) {
128
+ uint32_t l, s[3] = {0,0,0};
129
+ for (l = k; l < p->n_cigar; ++l) { // count number of adjacent I and D
130
+ uint32_t op = p->cigar[l]&0xf;
131
+ if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL || p->cigar[l]>>4 == 0)
132
+ s[op] += p->cigar[l] >> 4;
133
+ else break;
134
+ }
135
+ if (s[1] > 0 && s[2] > 0 && l - k > 2) { // turn to a single I and a single D
136
+ p->cigar[k] = s[1]<<4|MM_CIGAR_INS;
137
+ p->cigar[k+1] = s[2]<<4|MM_CIGAR_DEL;
138
+ for (k += 2; k < l; ++k)
139
+ p->cigar[k] &= 0xf;
140
+ to_shrink = 1;
141
+ }
142
+ k = l;
143
+ }
144
+ }
145
+ if (to_shrink) { // squeeze out zero-length operations
146
+ int32_t l = 0;
147
+ for (k = 0; k < p->n_cigar; ++k) // squeeze out zero-length operations
148
+ if (p->cigar[k]>>4 != 0)
149
+ p->cigar[l++] = p->cigar[k];
150
+ p->n_cigar = l;
151
+ for (k = l = 0; k < p->n_cigar; ++k) // merge two adjacent operations if they are the same
152
+ if (k == p->n_cigar - 1 || (p->cigar[k]&0xf) != (p->cigar[k+1]&0xf))
153
+ p->cigar[l++] = p->cigar[k];
154
+ else p->cigar[k+1] += p->cigar[k]>>4<<4; // add length to the next CIGAR operator
155
+ p->n_cigar = l;
156
+ }
157
+ if ((p->cigar[0]&0xf) == MM_CIGAR_INS || (p->cigar[0]&0xf) == MM_CIGAR_DEL) { // get rid of leading I or D
158
+ int32_t l = p->cigar[0] >> 4;
159
+ if ((p->cigar[0]&0xf) == MM_CIGAR_INS) {
160
+ if (r->rev) r->qe -= l;
161
+ else r->qs += l;
162
+ *qshift = l;
163
+ } else r->rs += l, *tshift = l;
164
+ --p->n_cigar;
165
+ memmove(p->cigar, p->cigar + 1, p->n_cigar * 4);
166
+ }
167
+ }
168
+
169
+ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq) // written by @armintoepfer
170
+ {
171
+ uint32_t n_EQX = 0;
172
+ uint32_t k, l, m, cap, toff = 0, qoff = 0, n_M = 0;
173
+ mm_extra_t *p;
174
+ if (r->p == 0) return;
175
+ for (k = 0; k < r->p->n_cigar; ++k) {
176
+ uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
177
+ if (op == MM_CIGAR_MATCH) {
178
+ while (len > 0) {
179
+ for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {} // run of "="; TODO: N<=>N is converted to "="
180
+ if (l > 0) { ++n_EQX; len -= l; toff += l; qoff += l; }
181
+
182
+ for (l = 0; l < len && qseq[qoff + l] != tseq[toff + l]; ++l) {} // run of "X"
183
+ if (l > 0) { ++n_EQX; len -= l; toff += l; qoff += l; }
184
+ }
185
+ ++n_M;
186
+ } else if (op == MM_CIGAR_INS) {
187
+ qoff += len;
188
+ } else if (op == MM_CIGAR_DEL) {
189
+ toff += len;
190
+ } else if (op == MM_CIGAR_N_SKIP) {
191
+ toff += len;
192
+ }
193
+ }
194
+ // update in-place if we can
195
+ if (n_EQX == n_M) {
196
+ for (k = 0; k < r->p->n_cigar; ++k) {
197
+ uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
198
+ if (op == MM_CIGAR_MATCH) r->p->cigar[k] = len << 4 | MM_CIGAR_EQ_MATCH;
199
+ }
200
+ return;
201
+ }
202
+ // allocate new storage
203
+ cap = r->p->n_cigar + (n_EQX - n_M) + sizeof(mm_extra_t);
204
+ kroundup32(cap);
205
+ p = (mm_extra_t*)calloc(cap, 4);
206
+ memcpy(p, r->p, sizeof(mm_extra_t));
207
+ p->capacity = cap;
208
+ // update cigar while copying
209
+ toff = qoff = m = 0;
210
+ for (k = 0; k < r->p->n_cigar; ++k) {
211
+ uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
212
+ if (op == MM_CIGAR_MATCH) {
213
+ while (len > 0) {
214
+ // match
215
+ for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {}
216
+ if (l > 0) p->cigar[m++] = l << 4 | MM_CIGAR_EQ_MATCH;
217
+ len -= l;
218
+ toff += l, qoff += l;
219
+ // mismatch
220
+ for (l = 0; l < len && qseq[qoff + l] != tseq[toff + l]; ++l) {}
221
+ if (l > 0) p->cigar[m++] = l << 4 | MM_CIGAR_X_MISMATCH;
222
+ len -= l;
223
+ toff += l, qoff += l;
224
+ }
225
+ continue;
226
+ } else if (op == MM_CIGAR_INS) {
227
+ qoff += len;
228
+ } else if (op == MM_CIGAR_DEL) {
229
+ toff += len;
230
+ } else if (op == MM_CIGAR_N_SKIP) {
231
+ toff += len;
232
+ }
233
+ p->cigar[m++] = r->p->cigar[k];
234
+ }
235
+ p->n_cigar = m;
236
+ free(r->p);
237
+ r->p = p;
238
+ }
239
+
240
+ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int is_eqx, int log_gap)
241
+ {
242
+ uint32_t k, l;
243
+ int32_t qshift, tshift, toff = 0, qoff = 0;
244
+ double s = 0.0, max = 0.0;
245
+ mm_extra_t *p = r->p;
246
+ if (p == 0) return;
247
+ mm_fix_cigar(r, qseq, tseq, &qshift, &tshift);
248
+ qseq += qshift, tseq += tshift; // qseq and tseq may be shifted due to the removal of leading I/D
249
+ r->blen = r->mlen = 0;
250
+ for (k = 0; k < p->n_cigar; ++k) {
251
+ uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4;
252
+ if (op == MM_CIGAR_MATCH) {
253
+ int n_ambi = 0, n_diff = 0;
254
+ for (l = 0; l < len; ++l) {
255
+ int cq = qseq[qoff + l], ct = tseq[toff + l];
256
+ if (ct > 3 || cq > 3) ++n_ambi;
257
+ else if (ct != cq) ++n_diff;
258
+ s += mat[ct * 5 + cq];
259
+ if (s < 0) s = 0;
260
+ else max = max > s? max : s;
261
+ }
262
+ r->blen += len - n_ambi, r->mlen += len - (n_ambi + n_diff), p->n_ambi += n_ambi;
263
+ toff += len, qoff += len;
264
+ } else if (op == MM_CIGAR_INS) {
265
+ int n_ambi = 0;
266
+ for (l = 0; l < len; ++l)
267
+ if (qseq[qoff + l] > 3) ++n_ambi;
268
+ r->blen += len - n_ambi, p->n_ambi += n_ambi;
269
+ if (log_gap) s -= q + (double)e * mg_log2(1.0 + len);
270
+ else s -= q + e;
271
+ if (s < 0) s = 0;
272
+ qoff += len;
273
+ } else if (op == MM_CIGAR_DEL) {
274
+ int n_ambi = 0;
275
+ for (l = 0; l < len; ++l)
276
+ if (tseq[toff + l] > 3) ++n_ambi;
277
+ r->blen += len - n_ambi, p->n_ambi += n_ambi;
278
+ if (log_gap) s -= q + (double)e * mg_log2(1.0 + len);
279
+ else s -= q + e;
280
+ if (s < 0) s = 0;
281
+ toff += len;
282
+ } else if (op == MM_CIGAR_N_SKIP) {
283
+ toff += len;
284
+ }
285
+ }
286
+ p->dp_max = (int32_t)(max + .499);
287
+ assert(qoff == r->qe - r->qs && toff == r->re - r->rs);
288
+ if (is_eqx) mm_update_cigar_eqx(r, qseq, tseq); // NB: it has to be called here as changes to qseq and tseq are not returned
289
+ }
290
+
291
+ static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) // TODO: this calls the libc realloc()
292
+ {
293
+ mm_extra_t *p;
294
+ if (n_cigar == 0) return;
295
+ if (r->p == 0) {
296
+ uint32_t capacity = n_cigar + sizeof(mm_extra_t)/4;
297
+ kroundup32(capacity);
298
+ r->p = (mm_extra_t*)calloc(capacity, 4);
299
+ r->p->capacity = capacity;
300
+ } else if (r->p->n_cigar + n_cigar + sizeof(mm_extra_t)/4 > r->p->capacity) {
301
+ r->p->capacity = r->p->n_cigar + n_cigar + sizeof(mm_extra_t)/4;
302
+ kroundup32(r->p->capacity);
303
+ r->p = (mm_extra_t*)realloc(r->p, r->p->capacity * 4);
304
+ }
305
+ p = r->p;
306
+ if (p->n_cigar > 0 && (p->cigar[p->n_cigar-1]&0xf) == (cigar[0]&0xf)) { // same CIGAR op at the boundary
307
+ p->cigar[p->n_cigar-1] += cigar[0]>>4<<4;
308
+ if (n_cigar > 1) memcpy(p->cigar + p->n_cigar, cigar + 1, (n_cigar - 1) * 4);
309
+ p->n_cigar += n_cigar - 1;
310
+ } else {
311
+ memcpy(p->cigar + p->n_cigar, cigar, n_cigar * 4);
312
+ p->n_cigar += n_cigar;
313
+ }
314
+ }
315
+
316
+ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint8_t *qseq, int tlen, const uint8_t *tseq, const uint8_t *junc, const int8_t *mat, int w, int end_bonus, int zdrop, int flag, ksw_extz_t *ez)
317
+ {
318
+ if (mm_dbg_flag & MM_DBG_PRINT_ALN_SEQ) {
319
+ int i;
320
+ fprintf(stderr, "===> q=(%d,%d), e=(%d,%d), bw=%d, flag=%d, zdrop=%d <===\n", opt->q, opt->q2, opt->e, opt->e2, w, flag, opt->zdrop);
321
+ for (i = 0; i < tlen; ++i) fputc("ACGTN"[tseq[i]], stderr);
322
+ fputc('\n', stderr);
323
+ for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr);
324
+ fputc('\n', stderr);
325
+ }
326
+ if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) {
327
+ ksw_reset_extz(ez);
328
+ ez->zdropped = 1;
329
+ } else if (opt->flag & MM_F_SPLICE)
330
+ ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, zdrop, opt->junc_bonus, flag, junc, ez);
331
+ else if (opt->q == opt->q2 && opt->e == opt->e2)
332
+ ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, zdrop, end_bonus, flag, ez);
333
+ else
334
+ ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, w, zdrop, end_bonus, flag, ez);
335
+ if (mm_dbg_flag & MM_DBG_PRINT_ALN_SEQ) {
336
+ int i;
337
+ fprintf(stderr, "score=%d, cigar=", ez->score);
338
+ for (i = 0; i < ez->n_cigar; ++i)
339
+ fprintf(stderr, "%d%c", ez->cigar[i]>>4, MM_CIGAR_STR[ez->cigar[i]&0xf]);
340
+ fprintf(stderr, "\n");
341
+ }
342
+ }
343
+
344
+ static inline int mm_get_hplen_back(const mm_idx_t *mi, uint32_t rid, uint32_t x)
345
+ {
346
+ int64_t i, off0 = mi->seq[rid].offset, off = off0 + x;
347
+ int c = mm_seq4_get(mi->S, off);
348
+ for (i = off - 1; i >= off0; --i)
349
+ if (mm_seq4_get(mi->S, i) != c) break;
350
+ return (int)(off - i);
351
+ }
352
+
353
+ static inline void mm_adjust_minier(const mm_idx_t *mi, uint8_t *const qseq0[2], mm128_t *a, int32_t *r, int32_t *q)
354
+ {
355
+ if (mi->flag & MM_I_HPC) {
356
+ const uint8_t *qseq = qseq0[a->x>>63];
357
+ int i, c;
358
+ *q = (int32_t)a->y;
359
+ for (i = *q - 1, c = qseq[*q]; i > 0; --i)
360
+ if (qseq[i] != c) break;
361
+ *q = i + 1;
362
+ c = mm_get_hplen_back(mi, a->x<<1>>33, (int32_t)a->x);
363
+ *r = (int32_t)a->x + 1 - c;
364
+ } else {
365
+ *r = (int32_t)a->x - (mi->k>>1);
366
+ *q = (int32_t)a->y - (mi->k>>1);
367
+ }
368
+ }
369
+
370
+ static int *collect_long_gaps(void *km, int as1, int cnt1, mm128_t *a, int min_gap, int *n_)
371
+ {
372
+ int i, n, *K;
373
+ *n_ = 0;
374
+ for (i = 1, n = 0; i < cnt1; ++i) { // count the number of gaps longer than min_gap
375
+ int gap = ((int32_t)a[as1 + i].y - a[as1 + i - 1].y) - ((int32_t)a[as1 + i].x - a[as1 + i - 1].x);
376
+ if (gap < -min_gap || gap > min_gap) ++n;
377
+ }
378
+ if (n <= 1) return 0;
379
+ K = (int*)kmalloc(km, n * sizeof(int));
380
+ for (i = 1, n = 0; i < cnt1; ++i) { // store the positions of long gaps
381
+ int gap = ((int32_t)a[as1 + i].y - a[as1 + i - 1].y) - ((int32_t)a[as1 + i].x - a[as1 + i - 1].x);
382
+ if (gap < -min_gap || gap > min_gap)
383
+ K[n++] = i;
384
+ }
385
+ *n_ = n;
386
+ return K;
387
+ }
388
+
389
+ static void mm_filter_bad_seeds(void *km, int as1, int cnt1, mm128_t *a, int min_gap, int diff_thres, int max_ext_len, int max_ext_cnt)
390
+ {
391
+ int max_st, max_en, n, i, k, max, *K;
392
+ K = collect_long_gaps(km, as1, cnt1, a, min_gap, &n);
393
+ if (K == 0) return;
394
+ max = 0, max_st = max_en = -1;
395
+ for (k = 0;; ++k) { // traverse long gaps
396
+ int gap, l, n_ins = 0, n_del = 0, qs, rs, max_diff = 0, max_diff_l = -1;
397
+ if (k == n || k >= max_en) {
398
+ if (max_en > 0)
399
+ for (i = K[max_st]; i < K[max_en]; ++i)
400
+ a[as1 + i].y |= MM_SEED_IGNORE;
401
+ max = 0, max_st = max_en = -1;
402
+ if (k == n) break;
403
+ }
404
+ i = K[k];
405
+ gap = ((int32_t)a[as1 + i].y - (int32_t)a[as1 + i - 1].y) - (int32_t)(a[as1 + i].x - a[as1 + i - 1].x);
406
+ if (gap > 0) n_ins += gap;
407
+ else n_del += -gap;
408
+ qs = (int32_t)a[as1 + i - 1].y;
409
+ rs = (int32_t)a[as1 + i - 1].x;
410
+ for (l = k + 1; l < n && l <= k + max_ext_cnt; ++l) {
411
+ int j = K[l], diff;
412
+ if ((int32_t)a[as1 + j].y - qs > max_ext_len || (int32_t)a[as1 + j].x - rs > max_ext_len) break;
413
+ gap = ((int32_t)a[as1 + j].y - (int32_t)a[as1 + j - 1].y) - (int32_t)(a[as1 + j].x - a[as1 + j - 1].x);
414
+ if (gap > 0) n_ins += gap;
415
+ else n_del += -gap;
416
+ diff = n_ins + n_del - abs(n_ins - n_del);
417
+ if (max_diff < diff)
418
+ max_diff = diff, max_diff_l = l;
419
+ }
420
+ if (max_diff > diff_thres && max_diff > max)
421
+ max = max_diff, max_st = k, max_en = max_diff_l;
422
+ }
423
+ kfree(km, K);
424
+ }
425
+
426
+ static void mm_filter_bad_seeds_alt(void *km, int as1, int cnt1, mm128_t *a, int min_gap, int max_ext)
427
+ {
428
+ int n, k, *K;
429
+ K = collect_long_gaps(km, as1, cnt1, a, min_gap, &n);
430
+ if (K == 0) return;
431
+ for (k = 0; k < n;) {
432
+ int i = K[k], l;
433
+ int gap1 = ((int32_t)a[as1 + i].y - (int32_t)a[as1 + i - 1].y) - ((int32_t)a[as1 + i].x - (int32_t)a[as1 + i - 1].x);
434
+ int re1 = (int32_t)a[as1 + i].x;
435
+ int qe1 = (int32_t)a[as1 + i].y;
436
+ gap1 = gap1 > 0? gap1 : -gap1;
437
+ for (l = k + 1; l < n; ++l) {
438
+ int j = K[l], gap2, q_span_pre, rs2, qs2, m;
439
+ if ((int32_t)a[as1 + j].y - qe1 > max_ext || (int32_t)a[as1 + j].x - re1 > max_ext) break;
440
+ gap2 = ((int32_t)a[as1 + j].y - (int32_t)a[as1 + j - 1].y) - (int32_t)(a[as1 + j].x - a[as1 + j - 1].x);
441
+ q_span_pre = a[as1 + j - 1].y >> 32 & 0xff;
442
+ rs2 = (int32_t)a[as1 + j - 1].x + q_span_pre;
443
+ qs2 = (int32_t)a[as1 + j - 1].y + q_span_pre;
444
+ m = rs2 - re1 < qs2 - qe1? rs2 - re1 : qs2 - qe1;
445
+ gap2 = gap2 > 0? gap2 : -gap2;
446
+ if (m > gap1 + gap2) break;
447
+ re1 = (int32_t)a[as1 + j].x;
448
+ qe1 = (int32_t)a[as1 + j].y;
449
+ gap1 = gap2;
450
+ }
451
+ if (l > k + 1) {
452
+ int j, end = K[l - 1];
453
+ for (j = K[k]; j < end; ++j)
454
+ a[as1 + j].y |= MM_SEED_IGNORE;
455
+ a[as1 + end].y |= MM_SEED_LONG_JOIN;
456
+ }
457
+ k = l;
458
+ }
459
+ kfree(km, K);
460
+ }
461
+
462
+ static void mm_fix_bad_ends(const mm_reg1_t *r, const mm128_t *a, int bw, int min_match, int32_t *as, int32_t *cnt)
463
+ {
464
+ int32_t i, l, m;
465
+ *as = r->as, *cnt = r->cnt;
466
+ if (r->cnt < 3) return;
467
+ m = l = a[r->as].y >> 32 & 0xff;
468
+ for (i = r->as + 1; i < r->as + r->cnt - 1; ++i) {
469
+ int32_t lq, lr, min, max;
470
+ int32_t q_span = a[i].y >> 32 & 0xff;
471
+ if (a[i].y & MM_SEED_LONG_JOIN) break;
472
+ lr = (int32_t)a[i].x - (int32_t)a[i-1].x;
473
+ lq = (int32_t)a[i].y - (int32_t)a[i-1].y;
474
+ min = lr < lq? lr : lq;
475
+ max = lr > lq? lr : lq;
476
+ if (max - min > l >> 1) *as = i;
477
+ l += min;
478
+ m += min < q_span? min : q_span;
479
+ if (l >= bw << 1 || (m >= min_match && m >= bw) || m >= r->mlen >> 1) break;
480
+ }
481
+ *cnt = r->as + r->cnt - *as;
482
+ m = l = a[r->as + r->cnt - 1].y >> 32 & 0xff;
483
+ for (i = r->as + r->cnt - 2; i > *as; --i) {
484
+ int32_t lq, lr, min, max;
485
+ int32_t q_span = a[i+1].y >> 32 & 0xff;
486
+ if (a[i+1].y & MM_SEED_LONG_JOIN) break;
487
+ lr = (int32_t)a[i+1].x - (int32_t)a[i].x;
488
+ lq = (int32_t)a[i+1].y - (int32_t)a[i].y;
489
+ min = lr < lq? lr : lq;
490
+ max = lr > lq? lr : lq;
491
+ if (max - min > l >> 1) *cnt = i + 1 - *as;
492
+ l += min;
493
+ m += min < q_span? min : q_span;
494
+ if (l >= bw << 1 || (m >= min_match && m >= bw) || m >= r->mlen >> 1) break;
495
+ }
496
+ }
497
+
498
+ static void mm_max_stretch(const mm_reg1_t *r, const mm128_t *a, int32_t *as, int32_t *cnt)
499
+ {
500
+ int32_t i, score, max_score, len, max_i, max_len;
501
+
502
+ *as = r->as, *cnt = r->cnt;
503
+ if (r->cnt < 2) return;
504
+
505
+ max_score = -1, max_i = -1, max_len = 0;
506
+ score = a[r->as].y >> 32 & 0xff, len = 1;
507
+ for (i = r->as + 1; i < r->as + r->cnt; ++i) {
508
+ int32_t lq, lr, q_span;
509
+ q_span = a[i].y >> 32 & 0xff;
510
+ lr = (int32_t)a[i].x - (int32_t)a[i-1].x;
511
+ lq = (int32_t)a[i].y - (int32_t)a[i-1].y;
512
+ if (lq == lr) {
513
+ score += lq < q_span? lq : q_span;
514
+ ++len;
515
+ } else {
516
+ if (score > max_score)
517
+ max_score = score, max_len = len, max_i = i - len;
518
+ score = q_span, len = 1;
519
+ }
520
+ }
521
+ if (score > max_score)
522
+ max_score = score, max_len = len, max_i = i - len;
523
+ *as = max_i, *cnt = max_len;
524
+ }
525
+
526
+ static int mm_seed_ext_score(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, const int8_t mat[25], int qlen, uint8_t *qseq0[2], const mm128_t *a)
527
+ {
528
+ uint8_t *qseq, *tseq;
529
+ int q_span = a->y>>32&0xff, qs, qe, rs, re, rid, score, q_off, t_off, ext_len = opt->anchor_ext_len;
530
+ void *qp;
531
+ rid = a->x<<1>>33;
532
+ re = (uint32_t)a->x + 1, rs = re - q_span;
533
+ qe = (uint32_t)a->y + 1, qs = qe - q_span;
534
+ rs = rs - ext_len > 0? rs - ext_len : 0;
535
+ qs = qs - ext_len > 0? qs - ext_len : 0;
536
+ re = re + ext_len < (int32_t)mi->seq[rid].len? re + ext_len : mi->seq[rid].len;
537
+ qe = qe + ext_len < qlen? qe + ext_len : qlen;
538
+ tseq = (uint8_t*)kmalloc(km, re - rs);
539
+ if (opt->flag & MM_F_QSTRAND) {
540
+ qseq = qseq0[0] + qs;
541
+ mm_idx_getseq2(mi, a->x>>63, rid, rs, re, tseq);
542
+ } else {
543
+ qseq = qseq0[a->x>>63] + qs;
544
+ mm_idx_getseq(mi, rid, rs, re, tseq);
545
+ }
546
+ qp = ksw_ll_qinit(km, 2, qe - qs, qseq, 5, mat);
547
+ score = ksw_ll_i16(qp, re - rs, tseq, opt->q, opt->e, &q_off, &t_off);
548
+ kfree(km, tseq);
549
+ kfree(km, qp);
550
+ return score;
551
+ }
552
+
553
+ static void mm_fix_bad_ends_splice(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, const mm_reg1_t *r, const int8_t mat[25], int qlen, uint8_t *qseq0[2], const mm128_t *a, int *as1, int *cnt1)
554
+ { // this assumes a very crude k-mer based mode; it is not necessary to use a good model just for filtering bounary exons
555
+ int score;
556
+ double log_gap;
557
+ *as1 = r->as, *cnt1 = r->cnt;
558
+ if (r->cnt < 3) return;
559
+ log_gap = log((int32_t)a[r->as + 1].x - (int32_t)a[r->as].x);
560
+ if ((a[r->as].y>>32&0xff) < log_gap + opt->anchor_ext_shift) {
561
+ score = mm_seed_ext_score(km, opt, mi, mat, qlen, qseq0, &a[r->as]);
562
+ if ((double)score / mat[0] < log_gap + opt->anchor_ext_shift) // a more exact format is "score < log_4(gap) + shift"
563
+ ++(*as1), --(*cnt1);
564
+ }
565
+ log_gap = log((int32_t)a[r->as + r->cnt - 1].x - (int32_t)a[r->as + r->cnt - 2].x);
566
+ if ((a[r->as + r->cnt - 1].y>>32&0xff) < log_gap + opt->anchor_ext_shift) {
567
+ score = mm_seed_ext_score(km, opt, mi, mat, qlen, qseq0, &a[r->as + r->cnt - 1]);
568
+ if ((double)score / mat[0] < log_gap + opt->anchor_ext_shift)
569
+ --(*cnt1);
570
+ }
571
+ }
572
+
573
+ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], mm_reg1_t *r, mm_reg1_t *r2, int n_a, mm128_t *a, ksw_extz_t *ez, int splice_flag)
574
+ {
575
+ int is_sr = !!(opt->flag & MM_F_SR), is_splice = !!(opt->flag & MM_F_SPLICE);
576
+ int32_t rid = a[r->as].x<<1>>33, rev = a[r->as].x>>63, as1, cnt1;
577
+ uint8_t *tseq, *qseq, *junc;
578
+ int32_t i, l, bw, bw_long, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0;
579
+ int32_t rs, re, qs, qe;
580
+ int32_t rs1, qs1, re1, qe1;
581
+ int8_t mat[25];
582
+
583
+ if (is_sr) assert(!(mi->flag & MM_I_HPC)); // HPC won't work with SR because with HPC we can't easily tell if there is a gap
584
+
585
+ r2->cnt = 0;
586
+ if (r->cnt == 0) return;
587
+ ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
588
+ bw = (int)(opt->bw * 1.5 + 1.);
589
+ bw_long = (int)(opt->bw_long * 1.5 + 1.);
590
+ if (bw_long < bw) bw_long = bw;
591
+
592
+ if (is_sr && !(mi->flag & MM_I_HPC)) {
593
+ mm_max_stretch(r, a, &as1, &cnt1);
594
+ rs = (int32_t)a[as1].x + 1 - (int32_t)(a[as1].y>>32&0xff);
595
+ qs = (int32_t)a[as1].y + 1 - (int32_t)(a[as1].y>>32&0xff);
596
+ re = (int32_t)a[as1+cnt1-1].x + 1;
597
+ qe = (int32_t)a[as1+cnt1-1].y + 1;
598
+ } else {
599
+ if (!(opt->flag & MM_F_NO_END_FLT)) {
600
+ if (is_splice)
601
+ mm_fix_bad_ends_splice(km, opt, mi, r, mat, qlen, qseq0, a, &as1, &cnt1);
602
+ else
603
+ mm_fix_bad_ends(r, a, opt->bw, opt->min_chain_score * 2, &as1, &cnt1);
604
+ } else as1 = r->as, cnt1 = r->cnt;
605
+ mm_filter_bad_seeds(km, as1, cnt1, a, 10, 40, opt->max_gap>>1, 10);
606
+ mm_filter_bad_seeds_alt(km, as1, cnt1, a, 30, opt->max_gap>>1);
607
+ mm_adjust_minier(mi, qseq0, &a[as1], &rs, &qs);
608
+ mm_adjust_minier(mi, qseq0, &a[as1 + cnt1 - 1], &re, &qe);
609
+ }
610
+ assert(cnt1 > 0);
611
+
612
+ if (is_splice) {
613
+ if (splice_flag & MM_F_SPLICE_FOR) extra_flag |= rev? KSW_EZ_SPLICE_REV : KSW_EZ_SPLICE_FOR;
614
+ if (splice_flag & MM_F_SPLICE_REV) extra_flag |= rev? KSW_EZ_SPLICE_FOR : KSW_EZ_SPLICE_REV;
615
+ if (opt->flag & MM_F_SPLICE_FLANK) extra_flag |= KSW_EZ_SPLICE_FLANK;
616
+ }
617
+
618
+ /* Look for the start and end of regions to perform DP. This sounds easy
619
+ * but is in fact tricky. Excessively small regions lead to unnecessary
620
+ * clippings and lose alignable sequences. Excessively large regions
621
+ * occasionally lead to large overlaps between two chains and may cause
622
+ * loss of alignments in corner cases. */
623
+ if (is_sr) {
624
+ qs0 = 0, qe0 = qlen;
625
+ l = qs;
626
+ l += l * opt->a + opt->end_bonus > opt->q? (l * opt->a + opt->end_bonus - opt->q) / opt->e : 0;
627
+ rs0 = rs - l > 0? rs - l : 0;
628
+ l = qlen - qe;
629
+ l += l * opt->a + opt->end_bonus > opt->q? (l * opt->a + opt->end_bonus - opt->q) / opt->e : 0;
630
+ re0 = re + l < (int32_t)mi->seq[rid].len? re + l : mi->seq[rid].len;
631
+ } else {
632
+ // compute rs0 and qs0
633
+ rs0 = (int32_t)a[r->as].x + 1 - (int32_t)(a[r->as].y>>32&0xff);
634
+ qs0 = (int32_t)a[r->as].y + 1 - (int32_t)(a[r->as].y>>32&0xff);
635
+ if (rs0 < 0) rs0 = 0; // this may happen when HPC is in use
636
+ assert(qs0 >= 0); // this should never happen, or it is logic error
637
+ rs1 = qs1 = 0;
638
+ for (i = r->as - 1, l = 0; i >= 0 && a[i].x>>32 == a[r->as].x>>32; --i) { // inspect nearby seeds
639
+ int32_t x = (int32_t)a[i].x + 1 - (int32_t)(a[i].y>>32&0xff);
640
+ int32_t y = (int32_t)a[i].y + 1 - (int32_t)(a[i].y>>32&0xff);
641
+ if (x < rs0 && y < qs0) {
642
+ if (++l > opt->min_cnt) {
643
+ l = rs0 - x > qs0 - y? rs0 - x : qs0 - y;
644
+ rs1 = rs0 - l, qs1 = qs0 - l;
645
+ if (rs1 < 0) rs1 = 0; // not strictly necessary; better have this guard for explicit
646
+ break;
647
+ }
648
+ }
649
+ }
650
+ if (qs > 0 && rs > 0) {
651
+ l = qs < opt->max_gap? qs : opt->max_gap;
652
+ qs1 = qs1 > qs - l? qs1 : qs - l;
653
+ qs0 = qs0 < qs1? qs0 : qs1; // at least include qs0
654
+ l += l * opt->a > opt->q? (l * opt->a - opt->q) / opt->e : 0;
655
+ l = l < opt->max_gap? l : opt->max_gap;
656
+ l = l < rs? l : rs;
657
+ rs1 = rs1 > rs - l? rs1 : rs - l;
658
+ rs0 = rs0 < rs1? rs0 : rs1;
659
+ rs0 = rs0 < rs? rs0 : rs;
660
+ } else rs0 = rs, qs0 = qs;
661
+ // compute re0 and qe0
662
+ re0 = (int32_t)a[r->as + r->cnt - 1].x + 1;
663
+ qe0 = (int32_t)a[r->as + r->cnt - 1].y + 1;
664
+ re1 = mi->seq[rid].len, qe1 = qlen;
665
+ for (i = r->as + r->cnt, l = 0; i < n_a && a[i].x>>32 == a[r->as].x>>32; ++i) { // inspect nearby seeds
666
+ int32_t x = (int32_t)a[i].x + 1;
667
+ int32_t y = (int32_t)a[i].y + 1;
668
+ if (x > re0 && y > qe0) {
669
+ if (++l > opt->min_cnt) {
670
+ l = x - re0 > y - qe0? x - re0 : y - qe0;
671
+ re1 = re0 + l, qe1 = qe0 + l;
672
+ break;
673
+ }
674
+ }
675
+ }
676
+ if (qe < qlen && re < (int32_t)mi->seq[rid].len) {
677
+ l = qlen - qe < opt->max_gap? qlen - qe : opt->max_gap;
678
+ qe1 = qe1 < qe + l? qe1 : qe + l;
679
+ qe0 = qe0 > qe1? qe0 : qe1; // at least include qe0
680
+ l += l * opt->a > opt->q? (l * opt->a - opt->q) / opt->e : 0;
681
+ l = l < opt->max_gap? l : opt->max_gap;
682
+ l = l < (int32_t)mi->seq[rid].len - re? l : mi->seq[rid].len - re;
683
+ re1 = re1 < re + l? re1 : re + l;
684
+ re0 = re0 > re1? re0 : re1;
685
+ } else re0 = re, qe0 = qe;
686
+ }
687
+ if (a[r->as].y & MM_SEED_SELF) {
688
+ int max_ext = r->qs > r->rs? r->qs - r->rs : r->rs - r->qs;
689
+ if (r->rs - rs0 > max_ext) rs0 = r->rs - max_ext;
690
+ if (r->qs - qs0 > max_ext) qs0 = r->qs - max_ext;
691
+ max_ext = r->qe > r->re? r->qe - r->re : r->re - r->qe;
692
+ if (re0 - r->re > max_ext) re0 = r->re + max_ext;
693
+ if (qe0 - r->qe > max_ext) qe0 = r->qe + max_ext;
694
+ }
695
+
696
+ assert(re0 > rs0);
697
+ tseq = (uint8_t*)kmalloc(km, re0 - rs0);
698
+ junc = (uint8_t*)kmalloc(km, re0 - rs0);
699
+
700
+ if (qs > 0 && rs > 0) { // left extension; probably the condition can be changed to "qs > qs0 && rs > rs0"
701
+ if (opt->flag & MM_F_QSTRAND) {
702
+ qseq = &qseq0[0][qs0];
703
+ mm_idx_getseq2(mi, rev, rid, rs0, rs, tseq);
704
+ } else {
705
+ qseq = &qseq0[rev][qs0];
706
+ mm_idx_getseq(mi, rid, rs0, rs, tseq);
707
+ }
708
+ mm_idx_bed_junc(mi, rid, rs0, rs, junc);
709
+ mm_seq_rev(qs - qs0, qseq);
710
+ mm_seq_rev(rs - rs0, tseq);
711
+ mm_seq_rev(rs - rs0, junc);
712
+ mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, junc, mat, bw, opt->end_bonus, r->split_inv? opt->zdrop_inv : opt->zdrop, extra_flag|KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez);
713
+ if (ez->n_cigar > 0) {
714
+ mm_append_cigar(r, ez->n_cigar, ez->cigar);
715
+ r->p->dp_score += ez->max;
716
+ }
717
+ rs1 = rs - (ez->reach_end? ez->mqe_t + 1 : ez->max_t + 1);
718
+ qs1 = qs - (ez->reach_end? qs - qs0 : ez->max_q + 1);
719
+ mm_seq_rev(qs - qs0, qseq);
720
+ } else rs1 = rs, qs1 = qs;
721
+ re1 = rs, qe1 = qs;
722
+ assert(qs1 >= 0 && rs1 >= 0);
723
+
724
+ for (i = is_sr? cnt1 - 1 : 1; i < cnt1; ++i) { // gap filling
725
+ if ((a[as1+i].y & (MM_SEED_IGNORE|MM_SEED_TANDEM)) && i != cnt1 - 1) continue;
726
+ if (is_sr && !(mi->flag & MM_I_HPC)) {
727
+ re = (int32_t)a[as1 + i].x + 1;
728
+ qe = (int32_t)a[as1 + i].y + 1;
729
+ } else mm_adjust_minier(mi, qseq0, &a[as1 + i], &re, &qe);
730
+ re1 = re, qe1 = qe;
731
+ if (i == cnt1 - 1 || (a[as1+i].y&MM_SEED_LONG_JOIN) || (qe - qs >= opt->min_ksw_len && re - rs >= opt->min_ksw_len)) {
732
+ int j, bw1 = bw_long, zdrop_code;
733
+ if (a[as1+i].y & MM_SEED_LONG_JOIN)
734
+ bw1 = qe - qs > re - rs? qe - qs : re - rs;
735
+ // perform alignment
736
+ if (opt->flag & MM_F_QSTRAND) {
737
+ qseq = &qseq0[0][qs];
738
+ mm_idx_getseq2(mi, rev, rid, rs, re, tseq);
739
+ } else {
740
+ qseq = &qseq0[rev][qs];
741
+ mm_idx_getseq(mi, rid, rs, re, tseq);
742
+ }
743
+ mm_idx_bed_junc(mi, rid, rs, re, junc);
744
+ if (is_sr) { // perform ungapped alignment
745
+ assert(qe - qs == re - rs);
746
+ ksw_reset_extz(ez);
747
+ for (j = 0, ez->score = 0; j < qe - qs; ++j) {
748
+ if (qseq[j] >= 4 || tseq[j] >= 4) ez->score += opt->e2;
749
+ else ez->score += qseq[j] == tseq[j]? opt->a : -opt->b;
750
+ }
751
+ ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, MM_CIGAR_MATCH, qe - qs);
752
+ } else { // perform normal gapped alignment
753
+ mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, junc, mat, bw1, -1, opt->zdrop, extra_flag|KSW_EZ_APPROX_MAX, ez); // first pass: with approximate Z-drop
754
+ }
755
+ // test Z-drop and inversion Z-drop
756
+ if ((zdrop_code = mm_test_zdrop(km, opt, qseq, tseq, ez->n_cigar, ez->cigar, mat)) != 0)
757
+ mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, junc, mat, bw1, -1, zdrop_code == 2? opt->zdrop_inv : opt->zdrop, extra_flag, ez); // second pass: lift approximate
758
+ // update CIGAR
759
+ if (ez->n_cigar > 0)
760
+ mm_append_cigar(r, ez->n_cigar, ez->cigar);
761
+ if (ez->zdropped) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle.
762
+ if (!r->p) {
763
+ assert(ez->n_cigar == 0);
764
+ uint32_t capacity = sizeof(mm_extra_t)/4;
765
+ kroundup32(capacity);
766
+ r->p = (mm_extra_t*)calloc(capacity, 4);
767
+ r->p->capacity = capacity;
768
+ }
769
+ for (j = i - 1; j >= 0; --j)
770
+ if ((int32_t)a[as1 + j].x <= rs + ez->max_t)
771
+ break;
772
+ dropped = 1;
773
+ if (j < 0) j = 0;
774
+ r->p->dp_score += ez->max;
775
+ re1 = rs + (ez->max_t + 1);
776
+ qe1 = qs + (ez->max_q + 1);
777
+ if (cnt1 - (j + 1) >= opt->min_cnt) {
778
+ mm_split_reg(r, r2, as1 + j + 1 - r->as, qlen, a, !!(opt->flag&MM_F_QSTRAND));
779
+ if (zdrop_code == 2) r2->split_inv = 1;
780
+ }
781
+ break;
782
+ } else r->p->dp_score += ez->score;
783
+ rs = re, qs = qe;
784
+ }
785
+ }
786
+
787
+ if (!dropped && qe < qe0 && re < re0) { // right extension
788
+ if (opt->flag & MM_F_QSTRAND) {
789
+ qseq = &qseq0[0][qe];
790
+ mm_idx_getseq2(mi, rev, rid, re, re0, tseq);
791
+ } else {
792
+ qseq = &qseq0[rev][qe];
793
+ mm_idx_getseq(mi, rid, re, re0, tseq);
794
+ }
795
+ mm_idx_bed_junc(mi, rid, re, re0, junc);
796
+ mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, junc, mat, bw, opt->end_bonus, opt->zdrop, extra_flag|KSW_EZ_EXTZ_ONLY, ez);
797
+ if (ez->n_cigar > 0) {
798
+ mm_append_cigar(r, ez->n_cigar, ez->cigar);
799
+ r->p->dp_score += ez->max;
800
+ }
801
+ re1 = re + (ez->reach_end? ez->mqe_t + 1 : ez->max_t + 1);
802
+ qe1 = qe + (ez->reach_end? qe0 - qe : ez->max_q + 1);
803
+ }
804
+ assert(qe1 <= qlen);
805
+
806
+ r->rs = rs1, r->re = re1;
807
+ if (!rev || (opt->flag & MM_F_QSTRAND)) r->qs = qs1, r->qe = qe1;
808
+ else r->qs = qlen - qe1, r->qe = qlen - qs1;
809
+
810
+ assert(re1 - rs1 <= re0 - rs0);
811
+ if (r->p) {
812
+ if (opt->flag & MM_F_QSTRAND) {
813
+ mm_idx_getseq2(mi, r->rev, rid, rs1, re1, tseq);
814
+ qseq = &qseq0[0][qs1];
815
+ } else {
816
+ mm_idx_getseq(mi, rid, rs1, re1, tseq);
817
+ qseq = &qseq0[r->rev][qs1];
818
+ }
819
+ mm_update_extra(r, qseq, tseq, mat, opt->q, opt->e, opt->flag & MM_F_EQX, !(opt->flag & MM_F_SR));
820
+ if (rev && r->p->trans_strand)
821
+ r->p->trans_strand ^= 3; // flip to the read strand
822
+ }
823
+
824
+ kfree(km, tseq);
825
+ kfree(km, junc);
826
+ }
827
+
828
+ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], const mm_reg1_t *r1, const mm_reg1_t *r2, mm_reg1_t *r_inv, ksw_extz_t *ez)
829
+ { // NB: this doesn't work with the qstrand mode
830
+ int tl, ql, score, ret = 0, q_off, t_off;
831
+ uint8_t *tseq, *qseq;
832
+ int8_t mat[25];
833
+ void *qp;
834
+
835
+ memset(r_inv, 0, sizeof(mm_reg1_t));
836
+ if (!(r1->split&1) || !(r2->split&2)) return 0;
837
+ if (r1->id != r1->parent && r1->parent != MM_PARENT_TMP_PRI) return 0;
838
+ if (r2->id != r2->parent && r2->parent != MM_PARENT_TMP_PRI) return 0;
839
+ if (r1->rid != r2->rid || r1->rev != r2->rev) return 0;
840
+ ql = r1->rev? r1->qs - r2->qe : r2->qs - r1->qe;
841
+ tl = r2->rs - r1->re;
842
+ if (ql < opt->min_chain_score || ql > opt->max_gap) return 0;
843
+ if (tl < opt->min_chain_score || tl > opt->max_gap) return 0;
844
+
845
+ ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
846
+ tseq = (uint8_t*)kmalloc(km, tl);
847
+ mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq);
848
+ qseq = r1->rev? &qseq0[0][r2->qe] : &qseq0[1][qlen - r2->qs];
849
+
850
+ mm_seq_rev(ql, qseq);
851
+ mm_seq_rev(tl, tseq);
852
+ qp = ksw_ll_qinit(km, 2, ql, qseq, 5, mat);
853
+ score = ksw_ll_i16(qp, tl, tseq, opt->q, opt->e, &q_off, &t_off);
854
+ kfree(km, qp);
855
+ mm_seq_rev(ql, qseq);
856
+ mm_seq_rev(tl, tseq);
857
+ if (score < opt->min_dp_max) goto end_align1_inv;
858
+ q_off = ql - (q_off + 1), t_off = tl - (t_off + 1);
859
+ mm_align_pair(km, opt, ql - q_off, qseq + q_off, tl - t_off, tseq + t_off, 0, mat, (int)(opt->bw * 1.5), -1, opt->zdrop, KSW_EZ_EXTZ_ONLY, ez);
860
+ if (ez->n_cigar == 0) goto end_align1_inv; // should never be here
861
+ mm_append_cigar(r_inv, ez->n_cigar, ez->cigar);
862
+ r_inv->p->dp_score = ez->max;
863
+ r_inv->id = -1;
864
+ r_inv->parent = MM_PARENT_UNSET;
865
+ r_inv->inv = 1;
866
+ r_inv->rev = !r1->rev;
867
+ r_inv->rid = r1->rid;
868
+ r_inv->div = -1.0f;
869
+ if (r_inv->rev == 0) {
870
+ r_inv->qs = r2->qe + q_off;
871
+ r_inv->qe = r_inv->qs + ez->max_q + 1;
872
+ } else {
873
+ r_inv->qe = r2->qs - q_off;
874
+ r_inv->qs = r_inv->qe - (ez->max_q + 1);
875
+ }
876
+ r_inv->rs = r1->re + t_off;
877
+ r_inv->re = r_inv->rs + ez->max_t + 1;
878
+ mm_update_extra(r_inv, &qseq[q_off], &tseq[t_off], mat, opt->q, opt->e, opt->flag & MM_F_EQX, !(opt->flag & MM_F_SR));
879
+ ret = 1;
880
+ end_align1_inv:
881
+ kfree(km, tseq);
882
+ return ret;
883
+ }
884
+
885
+ static inline mm_reg1_t *mm_insert_reg(const mm_reg1_t *r, int i, int *n_regs, mm_reg1_t *regs)
886
+ {
887
+ regs = (mm_reg1_t*)realloc(regs, (*n_regs + 1) * sizeof(mm_reg1_t));
888
+ if (i + 1 != *n_regs)
889
+ memmove(&regs[i + 2], &regs[i + 1], sizeof(mm_reg1_t) * (*n_regs - i - 1));
890
+ regs[i + 1] = *r;
891
+ ++*n_regs;
892
+ return regs;
893
+ }
894
+
895
+ static inline void mm_count_gaps(const mm_reg1_t *r, int32_t *n_gap_, int32_t *n_gapo_)
896
+ {
897
+ uint32_t i;
898
+ int32_t n_gapo = 0, n_gap = 0;
899
+ *n_gap_ = *n_gapo_ = -1;
900
+ if (r->p == 0) return;
901
+ for (i = 0; i < r->p->n_cigar; ++i) {
902
+ int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4;
903
+ if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL)
904
+ ++n_gapo, n_gap += len;
905
+ }
906
+ *n_gap_ = n_gap, *n_gapo_ = n_gapo;
907
+ }
908
+
909
+ double mm_event_identity(const mm_reg1_t *r)
910
+ {
911
+ int32_t n_gap, n_gapo;
912
+ if (r->p == 0) return -1.0f;
913
+ mm_count_gaps(r, &n_gap, &n_gapo);
914
+ return (double)r->mlen / (r->blen + r->p->n_ambi - n_gap + n_gapo);
915
+ }
916
+
917
+ static int32_t mm_recal_max_dp(const mm_reg1_t *r, double b2, int32_t match_sc)
918
+ {
919
+ uint32_t i;
920
+ int32_t n_gap = 0, n_gapo = 0, n_mis;
921
+ double gap_cost = 0.0;
922
+ if (r->p == 0) return -1;
923
+ for (i = 0; i < r->p->n_cigar; ++i) {
924
+ int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4;
925
+ if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) {
926
+ gap_cost += b2 + (double)mg_log2(1.0 + len);
927
+ ++n_gapo, n_gap += len;
928
+ }
929
+ }
930
+ n_mis = r->blen + r->p->n_ambi - r->mlen - n_gap;
931
+ return (int32_t)(match_sc * (r->mlen - b2 * n_mis - gap_cost) + .499);
932
+ }
933
+
934
+ void mm_update_dp_max(int qlen, int n_regs, mm_reg1_t *regs, float frac, int a, int b)
935
+ {
936
+ int32_t max = -1, max2 = -1, i, max_i = -1;
937
+ double div, b2;
938
+ if (n_regs < 2) return;
939
+ for (i = 0; i < n_regs; ++i) {
940
+ mm_reg1_t *r = &regs[i];
941
+ if (r->p == 0) continue;
942
+ if (r->p->dp_max > max) max2 = max, max = r->p->dp_max, max_i = i;
943
+ else if (r->p->dp_max > max2) max2 = r->p->dp_max;
944
+ }
945
+ if (max_i < 0 || max < 0 || max2 < 0) return;
946
+ if (regs[max_i].qe - regs[max_i].qs < (double)qlen * frac) return;
947
+ if (max2 < (double)max * frac) return;
948
+ div = 1. - mm_event_identity(&regs[max_i]);
949
+ if (div < 0.02) div = 0.02;
950
+ b2 = 0.5 / div; // max value: 25
951
+ if (b2 * a < b) b2 = (double)a / b;
952
+ for (i = 0; i < n_regs; ++i) {
953
+ mm_reg1_t *r = &regs[i];
954
+ if (r->p == 0) continue;
955
+ r->p->dp_max = mm_recal_max_dp(r, b2, a);
956
+ if (r->p->dp_max < 0) r->p->dp_max = 0;
957
+ }
958
+ }
959
+
960
+ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a)
961
+ {
962
+ extern unsigned char seq_nt4_table[256];
963
+ int32_t i, n_regs = *n_regs_, n_a;
964
+ uint8_t *qseq0[2];
965
+ ksw_extz_t ez;
966
+
967
+ // encode the query sequence
968
+ qseq0[0] = (uint8_t*)kmalloc(km, qlen * 2);
969
+ qseq0[1] = qseq0[0] + qlen;
970
+ for (i = 0; i < qlen; ++i) {
971
+ qseq0[0][i] = seq_nt4_table[(uint8_t)qstr[i]];
972
+ qseq0[1][qlen - 1 - i] = qseq0[0][i] < 4? 3 - qseq0[0][i] : 4;
973
+ }
974
+
975
+ // align through seed hits
976
+ n_a = mm_squeeze_a(km, n_regs, regs, a);
977
+ memset(&ez, 0, sizeof(ksw_extz_t));
978
+ for (i = 0; i < n_regs; ++i) {
979
+ mm_reg1_t r2;
980
+ if ((opt->flag&MM_F_SPLICE) && (opt->flag&MM_F_SPLICE_FOR) && (opt->flag&MM_F_SPLICE_REV)) { // then do two rounds of alignments for both strands
981
+ mm_reg1_t s[2], s2[2];
982
+ int which, trans_strand;
983
+ s[0] = s[1] = regs[i];
984
+ mm_align1(km, opt, mi, qlen, qseq0, &s[0], &s2[0], n_a, a, &ez, MM_F_SPLICE_FOR);
985
+ mm_align1(km, opt, mi, qlen, qseq0, &s[1], &s2[1], n_a, a, &ez, MM_F_SPLICE_REV);
986
+ if (s[0].p->dp_score > s[1].p->dp_score) which = 0, trans_strand = 1;
987
+ else if (s[0].p->dp_score < s[1].p->dp_score) which = 1, trans_strand = 2;
988
+ else trans_strand = 3, which = (qlen + s[0].p->dp_score) & 1; // randomly choose a strand, effectively
989
+ if (which == 0) {
990
+ regs[i] = s[0], r2 = s2[0];
991
+ free(s[1].p);
992
+ } else {
993
+ regs[i] = s[1], r2 = s2[1];
994
+ free(s[0].p);
995
+ }
996
+ regs[i].p->trans_strand = trans_strand;
997
+ } else { // one round of alignment
998
+ mm_align1(km, opt, mi, qlen, qseq0, &regs[i], &r2, n_a, a, &ez, opt->flag);
999
+ if (opt->flag&MM_F_SPLICE)
1000
+ regs[i].p->trans_strand = opt->flag&MM_F_SPLICE_FOR? 1 : 2;
1001
+ }
1002
+ if (r2.cnt > 0) regs = mm_insert_reg(&r2, i, &n_regs, regs);
1003
+ if (i > 0 && regs[i].split_inv && !(opt->flag & MM_F_NO_INV)) {
1004
+ if (mm_align1_inv(km, opt, mi, qlen, qseq0, &regs[i-1], &regs[i], &r2, &ez)) {
1005
+ regs = mm_insert_reg(&r2, i, &n_regs, regs);
1006
+ ++i; // skip the inserted INV alignment
1007
+ }
1008
+ }
1009
+ }
1010
+ *n_regs_ = n_regs;
1011
+ kfree(km, qseq0[0]);
1012
+ kfree(km, ez.cigar);
1013
+ mm_filter_regs(opt, qlen, n_regs_, regs);
1014
+ if (!(opt->flag&MM_F_SR) && !opt->split_prefix && qlen >= opt->rank_min_len) {
1015
+ mm_update_dp_max(qlen, *n_regs_, regs, opt->rank_frac, opt->a, opt->b);
1016
+ mm_filter_regs(opt, qlen, n_regs_, regs);
1017
+ }
1018
+ mm_hit_sort(km, n_regs_, regs, opt->alt_drop);
1019
+ return regs;
1020
+ }