minimap2 0.2.22.0 → 0.2.24.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +55 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +821 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +369 -0
  41. data/ext/minimap2/main.c +459 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +410 -0
  44. data/ext/minimap2/minimap2.1 +725 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +132 -0
  50. data/ext/minimap2/options.c +234 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/lib/minimap2/aligner.rb +4 -4
  93. data/lib/minimap2/alignment.rb +11 -11
  94. data/lib/minimap2/ffi/constants.rb +20 -16
  95. data/lib/minimap2/ffi/functions.rb +5 -0
  96. data/lib/minimap2/ffi.rb +4 -5
  97. data/lib/minimap2/version.rb +2 -2
  98. data/lib/minimap2.rb +51 -15
  99. metadata +97 -79
  100. data/lib/minimap2/ffi_helper.rb +0 -53
  101. data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,1020 @@
1
+ #include <assert.h>
2
+ #include <string.h>
3
+ #include <stdlib.h>
4
+ #include <math.h>
5
+ #include "minimap.h"
6
+ #include "mmpriv.h"
7
+ #include "ksw2.h"
8
+
9
/*
 * Fill an m-by-m substitution matrix (row-major in `mat`).
 *
 * The last row and the last column are set to the ambiguity score; in the
 * remaining (m-1)x(m-1) part the diagonal gets the match score and every
 * other cell gets the mismatch score.  Signs are normalised first: the match
 * score is made non-negative, the mismatch and ambiguity scores non-positive.
 */
static void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t sc_ambi)
{
	const int last = m - 1;
	int8_t *row = mat;
	int r, c;

	if (a < 0) a = -a;                   /* match score: force >= 0 */
	if (b > 0) b = -b;                   /* mismatch score: force <= 0 */
	if (sc_ambi > 0) sc_ambi = -sc_ambi; /* ambiguity score: force <= 0 */

	for (r = 0; r < last; ++r) {
		for (c = 0; c < last; ++c)
			row[c] = (r == c) ? a : b;
		row[last] = sc_ambi; /* last column of a regular row */
		row += m;
	}
	for (c = 0; c < m; ++c) /* the whole last row */
		mat[last * m + c] = sc_ambi;
}
23
+
24
/* Reverse the first `len` bytes of `seq` in place. */
static inline void mm_seq_rev(uint32_t len, uint8_t *seq)
{
	uint32_t lo, hi;

	if (len < 2) return; /* nothing to swap */
	for (lo = 0, hi = len - 1; lo < hi; ++lo, --hi) {
		const uint8_t tmp = seq[lo];
		seq[lo] = seq[hi];
		seq[hi] = tmp;
	}
}
31
+
32
/*
 * Track the running maximum score and the largest drop below it.
 *
 * If `score` is not below *max, the maximum and its coordinates are replaced
 * by (score, i, j).  Otherwise the drop relative to the maximum is computed,
 * reduced by e times the diagonal offset |(i - *max_i) - (j - *max_j)|, and
 * if it exceeds *max_zdrop it is stored together with the coordinates:
 * pos[0] = {*max_i, i} and pos[1] = {*max_j, j}.
 */
static inline void update_max_zdrop(int32_t score, int i, int j, int32_t *max, int *max_i, int *max_j, int e, int *max_zdrop, int pos[2][2])
{
	int di, dj, off_diag, drop;

	if (score >= *max) { /* new (or equal) maximum: just move the reference point */
		*max = score;
		*max_i = i;
		*max_j = j;
		return;
	}
	di = i - *max_i;
	dj = j - *max_j;
	off_diag = di > dj ? di - dj : dj - di;
	drop = *max - score - off_diag * e;
	if (drop > *max_zdrop) {
		*max_zdrop = drop;
		pos[0][0] = *max_i;
		pos[0][1] = i;
		pos[1][0] = *max_j;
		pos[1][1] = j;
	}
}
46
+
47
+ static int mm_test_zdrop(void *km, const mm_mapopt_t *opt, const uint8_t *qseq, const uint8_t *tseq, uint32_t n_cigar, uint32_t *cigar, const int8_t *mat)
48
+ {
49
+ uint32_t k;
50
+ int32_t score = 0, max = INT32_MIN, max_i = -1, max_j = -1, i = 0, j = 0, max_zdrop = 0;
51
+ int pos[2][2] = {{-1, -1}, {-1, -1}}, q_len, t_len;
52
+
53
+ // find the score and the region where score drops most along diagonal
54
+ for (k = 0, score = 0; k < n_cigar; ++k) {
55
+ uint32_t l, op = cigar[k]&0xf, len = cigar[k]>>4;
56
+ if (op == MM_CIGAR_MATCH) {
57
+ for (l = 0; l < len; ++l) {
58
+ score += mat[tseq[i + l] * 5 + qseq[j + l]];
59
+ update_max_zdrop(score, i+l, j+l, &max, &max_i, &max_j, opt->e, &max_zdrop, pos);
60
+ }
61
+ i += len, j += len;
62
+ } else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL || op == MM_CIGAR_N_SKIP) {
63
+ score -= opt->q + opt->e * len;
64
+ if (op == MM_CIGAR_INS) j += len;
65
+ else i += len;
66
+ update_max_zdrop(score, i, j, &max, &max_i, &max_j, opt->e, &max_zdrop, pos);
67
+ }
68
+ }
69
+
70
+ // test if there is an inversion in the most dropped region
71
+ q_len = pos[1][1] - pos[1][0], t_len = pos[0][1] - pos[0][0];
72
+ if (!(opt->flag&(MM_F_SPLICE|MM_F_SR|MM_F_FOR_ONLY|MM_F_REV_ONLY)) && max_zdrop > opt->zdrop_inv && q_len < opt->max_gap && t_len < opt->max_gap) {
73
+ uint8_t *qseq2;
74
+ void *qp;
75
+ int q_off, t_off;
76
+ qseq2 = (uint8_t*)kmalloc(km, q_len);
77
+ for (i = 0; i < q_len; ++i) {
78
+ int c = qseq[pos[1][1] - i - 1];
79
+ qseq2[i] = c >= 4? 4 : 3 - c;
80
+ }
81
+ qp = ksw_ll_qinit(km, 2, q_len, qseq2, 5, mat);
82
+ score = ksw_ll_i16(qp, t_len, tseq + pos[0][0], opt->q, opt->e, &q_off, &t_off);
83
+ kfree(km, qseq2);
84
+ kfree(km, qp);
85
+ if (score >= opt->min_chain_score * opt->a && score >= opt->min_dp_max)
86
+ return 2; // there is a potential inversion
87
+ }
88
+ return max_zdrop > opt->zdrop? 1 : 0;
89
+ }
90
+
91
/*
 * mm_fix_cigar: clean up the CIGAR stored in r->p, in place.
 *
 * Steps, in order:
 *   1. left-align every I/D that sits between two operations whose op code is 0;
 *   2. collapse a run of adjacent I/D operations into one I followed by one D;
 *   3. if anything became zero-length, squeeze those out and merge equal neighbours;
 *   4. drop a leading I or D, moving r->qs / r->qe / r->rs accordingly.
 *
 * qseq/tseq are the query and target bases covered by the alignment.
 * *qshift / *tshift receive the number of leading query / target bases removed
 * in step 4 (both are 0 when nothing was removed).
 */
+ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, int *qshift, int *tshift)
92
+ {
93
+ mm_extra_t *p = r->p;
94
+ int32_t toff = 0, qoff = 0, to_shrink = 0;
95
+ uint32_t k;
96
+ *qshift = *tshift = 0;
97
+ if (p->n_cigar <= 1) return; // a CIGAR with fewer than two operations is left untouched
98
+ for (k = 0; k < p->n_cigar; ++k) { // indel left alignment
99
+ uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4;
100
+ if (len == 0) to_shrink = 1; // remember to squeeze out zero-length operations later
101
+ if (op == MM_CIGAR_MATCH) {
102
+ toff += len, qoff += len;
103
+ } else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) {
104
+ if (k > 0 && k < p->n_cigar - 1 && (p->cigar[k-1]&0xf) == 0 && (p->cigar[k+1]&0xf) == 0) { // gap flanked by op code 0 on both sides (presumably MM_CIGAR_MATCH == 0 -- confirm in minimap.h)
105
+ int l, prev_len = p->cigar[k-1] >> 4;
106
+ if (op == MM_CIGAR_INS) {
107
+ for (l = 0; l < prev_len; ++l) // l = how far the insertion can slide left: base before the gap equals the base len positions later
108
+ if (qseq[qoff - 1 - l] != qseq[qoff + len - 1 - l])
109
+ break;
110
+ } else {
111
+ for (l = 0; l < prev_len; ++l) // same test on the target for a deletion
112
+ if (tseq[toff - 1 - l] != tseq[toff + len - 1 - l])
113
+ break;
114
+ }
115
+ if (l > 0) // move l bases from the preceding operation to the following one and rewind both offsets
116
+ p->cigar[k-1] -= l<<4, p->cigar[k+1] += l<<4, qoff -= l, toff -= l;
117
+ if (l == prev_len) to_shrink = 1; // the preceding operation is now zero-length
118
+ }
119
+ if (op == MM_CIGAR_INS) qoff += len;
120
+ else toff += len;
121
+ } else if (op == MM_CIGAR_N_SKIP) {
122
+ toff += len;
123
+ }
124
+ }
125
+ assert(qoff == r->qe - r->qs && toff == r->re - r->rs); // the CIGAR must span exactly the query and reference intervals of r
126
+ for (k = 0; k < p->n_cigar - 2; ++k) { // fix CIGAR like 5I6D7I
127
+ if ((p->cigar[k]&0xf) > 0 && (p->cigar[k]&0xf) + (p->cigar[k+1]&0xf) == 3) { // op codes sum to 3 with the first non-zero: I next to D if INS == 1 and DEL == 2 (assumed by s[1]/s[2] below)
128
+ uint32_t l, s[3] = {0,0,0};
129
+ for (l = k; l < p->n_cigar; ++l) { // count number of adjacent I and D
130
+ uint32_t op = p->cigar[l]&0xf;
131
+ if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL || p->cigar[l]>>4 == 0)
132
+ s[op] += p->cigar[l] >> 4; // NOTE(review): a zero-length operation with op code >= 3 would index past s[3] here -- confirm it cannot occur
133
+ else break;
134
+ }
135
+ if (s[1] > 0 && s[2] > 0 && l - k > 2) { // turn to a single I and a single D
136
+ p->cigar[k] = s[1]<<4|MM_CIGAR_INS;
137
+ p->cigar[k+1] = s[2]<<4|MM_CIGAR_DEL;
138
+ for (k += 2; k < l; ++k) // zero the length of the remaining operations in the run; they are squeezed out below
139
+ p->cigar[k] &= 0xf;
140
+ to_shrink = 1;
141
+ }
142
+ k = l; // resume after the run (the loop's ++k skips the operation that ended it)
143
+ }
144
+ }
145
+ if (to_shrink) { // squeeze out zero-length operations
146
+ int32_t l = 0;
147
+ for (k = 0; k < p->n_cigar; ++k) // squeeze out zero-length operations
148
+ if (p->cigar[k]>>4 != 0)
149
+ p->cigar[l++] = p->cigar[k];
150
+ p->n_cigar = l;
151
+ for (k = l = 0; k < p->n_cigar; ++k) // merge two adjacent operations if they are the same
152
+ if (k == p->n_cigar - 1 || (p->cigar[k]&0xf) != (p->cigar[k+1]&0xf))
153
+ p->cigar[l++] = p->cigar[k];
154
+ else p->cigar[k+1] += p->cigar[k]>>4<<4; // add length to the next CIGAR operator
155
+ p->n_cigar = l;
156
+ }
157
+ if ((p->cigar[0]&0xf) == MM_CIGAR_INS || (p->cigar[0]&0xf) == MM_CIGAR_DEL) { // get rid of leading I or D
158
+ int32_t l = p->cigar[0] >> 4;
159
+ if ((p->cigar[0]&0xf) == MM_CIGAR_INS) {
160
+ if (r->rev) r->qe -= l; // on the reverse strand the query end is trimmed instead of the query start
161
+ else r->qs += l;
162
+ *qshift = l;
163
+ } else r->rs += l, *tshift = l;
164
+ --p->n_cigar;
165
+ memmove(p->cigar, p->cigar + 1, p->n_cigar * 4); // shift the remaining operations down (4 bytes each)
166
+ }
167
+ }
168
+
169
+ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq) // written by @armintoepfer
170
+ {
171
+ uint32_t n_EQX = 0;
172
+ uint32_t k, l, m, cap, toff = 0, qoff = 0, n_M = 0;
173
+ mm_extra_t *p;
174
+ if (r->p == 0) return;
175
+ for (k = 0; k < r->p->n_cigar; ++k) {
176
+ uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
177
+ if (op == MM_CIGAR_MATCH) {
178
+ while (len > 0) {
179
+ for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {} // run of "="; TODO: N<=>N is converted to "="
180
+ if (l > 0) { ++n_EQX; len -= l; toff += l; qoff += l; }
181
+
182
+ for (l = 0; l < len && qseq[qoff + l] != tseq[toff + l]; ++l) {} // run of "X"
183
+ if (l > 0) { ++n_EQX; len -= l; toff += l; qoff += l; }
184
+ }
185
+ ++n_M;
186
+ } else if (op == MM_CIGAR_INS) {
187
+ qoff += len;
188
+ } else if (op == MM_CIGAR_DEL) {
189
+ toff += len;
190
+ } else if (op == MM_CIGAR_N_SKIP) {
191
+ toff += len;
192
+ }
193
+ }
194
+ // update in-place if we can
195
+ if (n_EQX == n_M) {
196
+ for (k = 0; k < r->p->n_cigar; ++k) {
197
+ uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
198
+ if (op == MM_CIGAR_MATCH) r->p->cigar[k] = len << 4 | MM_CIGAR_EQ_MATCH;
199
+ }
200
+ return;
201
+ }
202
+ // allocate new storage
203
+ cap = r->p->n_cigar + (n_EQX - n_M) + sizeof(mm_extra_t);
204
+ kroundup32(cap);
205
+ p = (mm_extra_t*)calloc(cap, 4);
206
+ memcpy(p, r->p, sizeof(mm_extra_t));
207
+ p->capacity = cap;
208
+ // update cigar while copying
209
+ toff = qoff = m = 0;
210
+ for (k = 0; k < r->p->n_cigar; ++k) {
211
+ uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
212
+ if (op == MM_CIGAR_MATCH) {
213
+ while (len > 0) {
214
+ // match
215
+ for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {}
216
+ if (l > 0) p->cigar[m++] = l << 4 | MM_CIGAR_EQ_MATCH;
217
+ len -= l;
218
+ toff += l, qoff += l;
219
+ // mismatch
220
+ for (l = 0; l < len && qseq[qoff + l] != tseq[toff + l]; ++l) {}
221
+ if (l > 0) p->cigar[m++] = l << 4 | MM_CIGAR_X_MISMATCH;
222
+ len -= l;
223
+ toff += l, qoff += l;
224
+ }
225
+ continue;
226
+ } else if (op == MM_CIGAR_INS) {
227
+ qoff += len;
228
+ } else if (op == MM_CIGAR_DEL) {
229
+ toff += len;
230
+ } else if (op == MM_CIGAR_N_SKIP) {
231
+ toff += len;
232
+ }
233
+ p->cigar[m++] = r->p->cigar[k];
234
+ }
235
+ p->n_cigar = m;
236
+ free(r->p);
237
+ r->p = p;
238
+ }
239
+
240
+ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int is_eqx, int log_gap)
241
+ {
242
+ uint32_t k, l;
243
+ int32_t qshift, tshift, toff = 0, qoff = 0;
244
+ double s = 0.0, max = 0.0;
245
+ mm_extra_t *p = r->p;
246
+ if (p == 0) return;
247
+ mm_fix_cigar(r, qseq, tseq, &qshift, &tshift);
248
+ qseq += qshift, tseq += tshift; // qseq and tseq may be shifted due to the removal of leading I/D
249
+ r->blen = r->mlen = 0;
250
+ for (k = 0; k < p->n_cigar; ++k) {
251
+ uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4;
252
+ if (op == MM_CIGAR_MATCH) {
253
+ int n_ambi = 0, n_diff = 0;
254
+ for (l = 0; l < len; ++l) {
255
+ int cq = qseq[qoff + l], ct = tseq[toff + l];
256
+ if (ct > 3 || cq > 3) ++n_ambi;
257
+ else if (ct != cq) ++n_diff;
258
+ s += mat[ct * 5 + cq];
259
+ if (s < 0) s = 0;
260
+ else max = max > s? max : s;
261
+ }
262
+ r->blen += len - n_ambi, r->mlen += len - (n_ambi + n_diff), p->n_ambi += n_ambi;
263
+ toff += len, qoff += len;
264
+ } else if (op == MM_CIGAR_INS) {
265
+ int n_ambi = 0;
266
+ for (l = 0; l < len; ++l)
267
+ if (qseq[qoff + l] > 3) ++n_ambi;
268
+ r->blen += len - n_ambi, p->n_ambi += n_ambi;
269
+ if (log_gap) s -= q + (double)e * mg_log2(1.0 + len);
270
+ else s -= q + e;
271
+ if (s < 0) s = 0;
272
+ qoff += len;
273
+ } else if (op == MM_CIGAR_DEL) {
274
+ int n_ambi = 0;
275
+ for (l = 0; l < len; ++l)
276
+ if (tseq[toff + l] > 3) ++n_ambi;
277
+ r->blen += len - n_ambi, p->n_ambi += n_ambi;
278
+ if (log_gap) s -= q + (double)e * mg_log2(1.0 + len);
279
+ else s -= q + e;
280
+ if (s < 0) s = 0;
281
+ toff += len;
282
+ } else if (op == MM_CIGAR_N_SKIP) {
283
+ toff += len;
284
+ }
285
+ }
286
+ p->dp_max = (int32_t)(max + .499);
287
+ assert(qoff == r->qe - r->qs && toff == r->re - r->rs);
288
+ if (is_eqx) mm_update_cigar_eqx(r, qseq, tseq); // NB: it has to be called here as changes to qseq and tseq are not returned
289
+ }
290
+
291
+ static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) // TODO: this calls the libc realloc()
292
+ {
293
+ mm_extra_t *p;
294
+ if (n_cigar == 0) return;
295
+ if (r->p == 0) {
296
+ uint32_t capacity = n_cigar + sizeof(mm_extra_t)/4;
297
+ kroundup32(capacity);
298
+ r->p = (mm_extra_t*)calloc(capacity, 4);
299
+ r->p->capacity = capacity;
300
+ } else if (r->p->n_cigar + n_cigar + sizeof(mm_extra_t)/4 > r->p->capacity) {
301
+ r->p->capacity = r->p->n_cigar + n_cigar + sizeof(mm_extra_t)/4;
302
+ kroundup32(r->p->capacity);
303
+ r->p = (mm_extra_t*)realloc(r->p, r->p->capacity * 4);
304
+ }
305
+ p = r->p;
306
+ if (p->n_cigar > 0 && (p->cigar[p->n_cigar-1]&0xf) == (cigar[0]&0xf)) { // same CIGAR op at the boundary
307
+ p->cigar[p->n_cigar-1] += cigar[0]>>4<<4;
308
+ if (n_cigar > 1) memcpy(p->cigar + p->n_cigar, cigar + 1, (n_cigar - 1) * 4);
309
+ p->n_cigar += n_cigar - 1;
310
+ } else {
311
+ memcpy(p->cigar + p->n_cigar, cigar, n_cigar * 4);
312
+ p->n_cigar += n_cigar;
313
+ }
314
+ }
315
+
316
+ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint8_t *qseq, int tlen, const uint8_t *tseq, const uint8_t *junc, const int8_t *mat, int w, int end_bonus, int zdrop, int flag, ksw_extz_t *ez)
317
+ {
318
+ if (mm_dbg_flag & MM_DBG_PRINT_ALN_SEQ) {
319
+ int i;
320
+ fprintf(stderr, "===> q=(%d,%d), e=(%d,%d), bw=%d, flag=%d, zdrop=%d <===\n", opt->q, opt->q2, opt->e, opt->e2, w, flag, opt->zdrop);
321
+ for (i = 0; i < tlen; ++i) fputc("ACGTN"[tseq[i]], stderr);
322
+ fputc('\n', stderr);
323
+ for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr);
324
+ fputc('\n', stderr);
325
+ }
326
+ if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) {
327
+ ksw_reset_extz(ez);
328
+ ez->zdropped = 1;
329
+ } else if (opt->flag & MM_F_SPLICE)
330
+ ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, zdrop, opt->junc_bonus, flag, junc, ez);
331
+ else if (opt->q == opt->q2 && opt->e == opt->e2)
332
+ ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, zdrop, end_bonus, flag, ez);
333
+ else
334
+ ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, w, zdrop, end_bonus, flag, ez);
335
+ if (mm_dbg_flag & MM_DBG_PRINT_ALN_SEQ) {
336
+ int i;
337
+ fprintf(stderr, "score=%d, cigar=", ez->score);
338
+ for (i = 0; i < ez->n_cigar; ++i)
339
+ fprintf(stderr, "%d%c", ez->cigar[i]>>4, MM_CIGAR_STR[ez->cigar[i]&0xf]);
340
+ fprintf(stderr, "\n");
341
+ }
342
+ }
343
+
344
+ static inline int mm_get_hplen_back(const mm_idx_t *mi, uint32_t rid, uint32_t x)
345
+ {
346
+ int64_t i, off0 = mi->seq[rid].offset, off = off0 + x;
347
+ int c = mm_seq4_get(mi->S, off);
348
+ for (i = off - 1; i >= off0; --i)
349
+ if (mm_seq4_get(mi->S, i) != c) break;
350
+ return (int)(off - i);
351
+ }
352
+
353
+ static inline void mm_adjust_minier(const mm_idx_t *mi, uint8_t *const qseq0[2], mm128_t *a, int32_t *r, int32_t *q)
354
+ {
355
+ if (mi->flag & MM_I_HPC) {
356
+ const uint8_t *qseq = qseq0[a->x>>63];
357
+ int i, c;
358
+ *q = (int32_t)a->y;
359
+ for (i = *q - 1, c = qseq[*q]; i > 0; --i)
360
+ if (qseq[i] != c) break;
361
+ *q = i + 1;
362
+ c = mm_get_hplen_back(mi, a->x<<1>>33, (int32_t)a->x);
363
+ *r = (int32_t)a->x + 1 - c;
364
+ } else {
365
+ *r = (int32_t)a->x - (mi->k>>1);
366
+ *q = (int32_t)a->y - (mi->k>>1);
367
+ }
368
+ }
369
+
370
+ static int *collect_long_gaps(void *km, int as1, int cnt1, mm128_t *a, int min_gap, int *n_)
371
+ {
372
+ int i, n, *K;
373
+ *n_ = 0;
374
+ for (i = 1, n = 0; i < cnt1; ++i) { // count the number of gaps longer than min_gap
375
+ int gap = ((int32_t)a[as1 + i].y - a[as1 + i - 1].y) - ((int32_t)a[as1 + i].x - a[as1 + i - 1].x);
376
+ if (gap < -min_gap || gap > min_gap) ++n;
377
+ }
378
+ if (n <= 1) return 0;
379
+ K = (int*)kmalloc(km, n * sizeof(int));
380
+ for (i = 1, n = 0; i < cnt1; ++i) { // store the positions of long gaps
381
+ int gap = ((int32_t)a[as1 + i].y - a[as1 + i - 1].y) - ((int32_t)a[as1 + i].x - a[as1 + i - 1].x);
382
+ if (gap < -min_gap || gap > min_gap)
383
+ K[n++] = i;
384
+ }
385
+ *n_ = n;
386
+ return K;
387
+ }
388
+
389
/*
 * mm_filter_bad_seeds: flag seeds that lie between nearby long gaps whose
 * insertions and deletions largely cancel out.
 *
 * Long gaps (size above min_gap) come from collect_long_gaps(); nothing is
 * done if it returns no array.  For each long gap k, the following long gaps
 * (at most max_ext_cnt of them, and only while they stay within max_ext_len of
 * the seed preceding gap k on both query and reference) are accumulated into
 * n_ins / n_del.  The cancelled amount is n_ins + n_del - |n_ins - n_del|.
 * The window [k, l] with the largest cancelled amount above diff_thres is
 * remembered, and once the scan reaches the end of that window the seeds
 * K[max_st] .. K[max_en]-1 get MM_SEED_IGNORE set in their y field.
 */
+ static void mm_filter_bad_seeds(void *km, int as1, int cnt1, mm128_t *a, int min_gap, int diff_thres, int max_ext_len, int max_ext_cnt)
390
+ {
391
+ int max_st, max_en, n, i, k, max, *K;
392
+ K = collect_long_gaps(km, as1, cnt1, a, min_gap, &n);
393
+ if (K == 0) return; // fewer than two long gaps: nothing to filter
394
+ max = 0, max_st = max_en = -1;
395
+ for (k = 0;; ++k) { // traverse long gaps
396
+ int gap, l, n_ins = 0, n_del = 0, qs, rs, max_diff = 0, max_diff_l = -1;
397
+ if (k == n || k >= max_en) { // end of the list, or no window pending / pending window fully scanned
398
+ if (max_en > 0) // a window was recorded: flag every seed from its first long gap up to, not including, its last
399
+ for (i = K[max_st]; i < K[max_en]; ++i)
400
+ a[as1 + i].y |= MM_SEED_IGNORE;
401
+ max = 0, max_st = max_en = -1; // start looking for a new window
402
+ if (k == n) break;
403
+ }
404
+ i = K[k];
405
+ gap = ((int32_t)a[as1 + i].y - (int32_t)a[as1 + i - 1].y) - (int32_t)(a[as1 + i].x - a[as1 + i - 1].x); // query step minus reference step
406
+ if (gap > 0) n_ins += gap;
407
+ else n_del += -gap;
408
+ qs = (int32_t)a[as1 + i - 1].y; // coordinates of the seed just before long gap k
409
+ rs = (int32_t)a[as1 + i - 1].x;
410
+ for (l = k + 1; l < n && l <= k + max_ext_cnt; ++l) { // look ahead over at most max_ext_cnt further long gaps
411
+ int j = K[l], diff;
412
+ if ((int32_t)a[as1 + j].y - qs > max_ext_len || (int32_t)a[as1 + j].x - rs > max_ext_len) break; // too far from the starting seed
413
+ gap = ((int32_t)a[as1 + j].y - (int32_t)a[as1 + j - 1].y) - (int32_t)(a[as1 + j].x - a[as1 + j - 1].x);
414
+ if (gap > 0) n_ins += gap;
415
+ else n_del += -gap;
416
+ diff = n_ins + n_del - abs(n_ins - n_del); // twice the smaller of n_ins and n_del, i.e. the amount that cancels out
417
+ if (max_diff < diff)
418
+ max_diff = diff, max_diff_l = l;
419
+ }
420
+ if (max_diff > diff_thres && max_diff > max) // best window so far: remember where it starts and ends
421
+ max = max_diff, max_st = k, max_en = max_diff_l;
422
+ }
423
+ kfree(km, K);
424
+ }
425
+
426
/*
 * mm_filter_bad_seeds_alt: join clusters of closely spaced long gaps.
 *
 * Long gaps (size above min_gap) come from collect_long_gaps(); nothing is
 * done if it returns no array.  Starting from long gap k, following long gaps
 * are chained while each one lies within max_ext of the current end on both
 * query and reference and the stretch between the two gaps (m) is no longer
 * than the sum of the two gap sizes.  When at least one gap was chained, the
 * seeds from K[k] up to, not including, the last chained gap get
 * MM_SEED_IGNORE, and the seed at the last chained gap gets MM_SEED_LONG_JOIN.
 */
+ static void mm_filter_bad_seeds_alt(void *km, int as1, int cnt1, mm128_t *a, int min_gap, int max_ext)
427
+ {
428
+ int n, k, *K;
429
+ K = collect_long_gaps(km, as1, cnt1, a, min_gap, &n);
430
+ if (K == 0) return; // fewer than two long gaps: nothing to join
431
+ for (k = 0; k < n;) {
432
+ int i = K[k], l;
433
+ int gap1 = ((int32_t)a[as1 + i].y - (int32_t)a[as1 + i - 1].y) - ((int32_t)a[as1 + i].x - (int32_t)a[as1 + i - 1].x);
434
+ int re1 = (int32_t)a[as1 + i].x; // current end of the cluster on the reference
435
+ int qe1 = (int32_t)a[as1 + i].y; // current end of the cluster on the query
436
+ gap1 = gap1 > 0? gap1 : -gap1; // absolute gap size
437
+ for (l = k + 1; l < n; ++l) { // try to chain the following long gaps
438
+ int j = K[l], gap2, q_span_pre, rs2, qs2, m;
439
+ if ((int32_t)a[as1 + j].y - qe1 > max_ext || (int32_t)a[as1 + j].x - re1 > max_ext) break; // next long gap is too far away
440
+ gap2 = ((int32_t)a[as1 + j].y - (int32_t)a[as1 + j - 1].y) - (int32_t)(a[as1 + j].x - a[as1 + j - 1].x);
441
+ q_span_pre = a[as1 + j - 1].y >> 32 & 0xff; // bits 32..39 of y of the seed before gap l (used as that seed's span)
442
+ rs2 = (int32_t)a[as1 + j - 1].x + q_span_pre;
443
+ qs2 = (int32_t)a[as1 + j - 1].y + q_span_pre;
444
+ m = rs2 - re1 < qs2 - qe1? rs2 - re1 : qs2 - qe1; // shorter of the reference and query stretches between the two gaps
445
+ gap2 = gap2 > 0? gap2 : -gap2;
446
+ if (m > gap1 + gap2) break; // the stretch in between is longer than both gaps together: do not join
447
+ re1 = (int32_t)a[as1 + j].x; // extend the cluster to this gap
448
+ qe1 = (int32_t)a[as1 + j].y;
449
+ gap1 = gap2;
450
+ }
451
+ if (l > k + 1) { // at least one further gap was chained
452
+ int j, end = K[l - 1];
453
+ for (j = K[k]; j < end; ++j)
454
+ a[as1 + j].y |= MM_SEED_IGNORE;
455
+ a[as1 + end].y |= MM_SEED_LONG_JOIN;
456
+ }
457
+ k = l; // continue with the first long gap that was not chained
458
+ }
459
+ kfree(km, K);
460
+ }
461
+
462
/*
 * mm_fix_bad_ends: choose a sub-range [*as, *as + *cnt) of the chain of `r`
 * that drops poorly supported seeds at either end.
 *
 * Chains with fewer than 3 seeds are returned unchanged.  From the start, seed
 * pairs are scanned while accumulating l (sum of the smaller of the reference
 * and query steps) and m (same, but each step capped by the seed's span).
 * Whenever a pair's reference and query steps differ by more than l/2, the
 * start is moved to that seed.  The scan stops at a seed flagged
 * MM_SEED_LONG_JOIN, or once l >= 2*bw, or m >= max(min_match, bw), or
 * m >= r->mlen/2.  The same scan is then run backwards from the end to set *cnt.
 */
+ static void mm_fix_bad_ends(const mm_reg1_t *r, const mm128_t *a, int bw, int min_match, int32_t *as, int32_t *cnt)
463
+ {
464
+ int32_t i, l, m;
465
+ *as = r->as, *cnt = r->cnt;
466
+ if (r->cnt < 3) return; // too few seeds to trim
467
+ m = l = a[r->as].y >> 32 & 0xff; // both counters start at the span of the first seed (bits 32..39 of y)
468
+ for (i = r->as + 1; i < r->as + r->cnt - 1; ++i) { // forward scan; the last seed is never examined here
469
+ int32_t lq, lr, min, max;
470
+ int32_t q_span = a[i].y >> 32 & 0xff;
471
+ if (a[i].y & MM_SEED_LONG_JOIN) break;
472
+ lr = (int32_t)a[i].x - (int32_t)a[i-1].x; // step on the reference
473
+ lq = (int32_t)a[i].y - (int32_t)a[i-1].y; // step on the query
474
+ min = lr < lq? lr : lq;
475
+ max = lr > lq? lr : lq;
476
+ if (max - min > l >> 1) *as = i; // gap larger than half the length covered so far: restart the chain here
477
+ l += min;
478
+ m += min < q_span? min : q_span;
479
+ if (l >= bw << 1 || (m >= min_match && m >= bw) || m >= r->mlen >> 1) break; // enough support accumulated
480
+ }
481
+ *cnt = r->as + r->cnt - *as; // seeds remaining after trimming the start
482
+ m = l = a[r->as + r->cnt - 1].y >> 32 & 0xff; // restart the counters from the span of the last seed
483
+ for (i = r->as + r->cnt - 2; i > *as; --i) { // backward scan, never crossing the new start
484
+ int32_t lq, lr, min, max;
485
+ int32_t q_span = a[i+1].y >> 32 & 0xff;
486
+ if (a[i+1].y & MM_SEED_LONG_JOIN) break;
487
+ lr = (int32_t)a[i+1].x - (int32_t)a[i].x;
488
+ lq = (int32_t)a[i+1].y - (int32_t)a[i].y;
489
+ min = lr < lq? lr : lq;
490
+ max = lr > lq? lr : lq;
491
+ if (max - min > l >> 1) *cnt = i + 1 - *as; // end the chain at seed i
492
+ l += min;
493
+ m += min < q_span? min : q_span;
494
+ if (l >= bw << 1 || (m >= min_match && m >= bw) || m >= r->mlen >> 1) break;
495
+ }
496
+ }
497
+
498
+ static void mm_max_stretch(const mm_reg1_t *r, const mm128_t *a, int32_t *as, int32_t *cnt)
499
+ {
500
+ int32_t i, score, max_score, len, max_i, max_len;
501
+
502
+ *as = r->as, *cnt = r->cnt;
503
+ if (r->cnt < 2) return;
504
+
505
+ max_score = -1, max_i = -1, max_len = 0;
506
+ score = a[r->as].y >> 32 & 0xff, len = 1;
507
+ for (i = r->as + 1; i < r->as + r->cnt; ++i) {
508
+ int32_t lq, lr, q_span;
509
+ q_span = a[i].y >> 32 & 0xff;
510
+ lr = (int32_t)a[i].x - (int32_t)a[i-1].x;
511
+ lq = (int32_t)a[i].y - (int32_t)a[i-1].y;
512
+ if (lq == lr) {
513
+ score += lq < q_span? lq : q_span;
514
+ ++len;
515
+ } else {
516
+ if (score > max_score)
517
+ max_score = score, max_len = len, max_i = i - len;
518
+ score = q_span, len = 1;
519
+ }
520
+ }
521
+ if (score > max_score)
522
+ max_score = score, max_len = len, max_i = i - len;
523
+ *as = max_i, *cnt = max_len;
524
+ }
525
+
526
+ static int mm_seed_ext_score(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, const int8_t mat[25], int qlen, uint8_t *qseq0[2], const mm128_t *a)
527
+ {
528
+ uint8_t *qseq, *tseq;
529
+ int q_span = a->y>>32&0xff, qs, qe, rs, re, rid, score, q_off, t_off, ext_len = opt->anchor_ext_len;
530
+ void *qp;
531
+ rid = a->x<<1>>33;
532
+ re = (uint32_t)a->x + 1, rs = re - q_span;
533
+ qe = (uint32_t)a->y + 1, qs = qe - q_span;
534
+ rs = rs - ext_len > 0? rs - ext_len : 0;
535
+ qs = qs - ext_len > 0? qs - ext_len : 0;
536
+ re = re + ext_len < (int32_t)mi->seq[rid].len? re + ext_len : mi->seq[rid].len;
537
+ qe = qe + ext_len < qlen? qe + ext_len : qlen;
538
+ tseq = (uint8_t*)kmalloc(km, re - rs);
539
+ if (opt->flag & MM_F_QSTRAND) {
540
+ qseq = qseq0[0] + qs;
541
+ mm_idx_getseq2(mi, a->x>>63, rid, rs, re, tseq);
542
+ } else {
543
+ qseq = qseq0[a->x>>63] + qs;
544
+ mm_idx_getseq(mi, rid, rs, re, tseq);
545
+ }
546
+ qp = ksw_ll_qinit(km, 2, qe - qs, qseq, 5, mat);
547
+ score = ksw_ll_i16(qp, re - rs, tseq, opt->q, opt->e, &q_off, &t_off);
548
+ kfree(km, tseq);
549
+ kfree(km, qp);
550
+ return score;
551
+ }
552
+
553
+ static void mm_fix_bad_ends_splice(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, const mm_reg1_t *r, const int8_t mat[25], int qlen, uint8_t *qseq0[2], const mm128_t *a, int *as1, int *cnt1)
554
+ { // this assumes a very crude k-mer based mode; it is not necessary to use a good model just for filtering bounary exons
555
+ int score;
556
+ double log_gap;
557
+ *as1 = r->as, *cnt1 = r->cnt;
558
+ if (r->cnt < 3) return;
559
+ log_gap = log((int32_t)a[r->as + 1].x - (int32_t)a[r->as].x);
560
+ if ((a[r->as].y>>32&0xff) < log_gap + opt->anchor_ext_shift) {
561
+ score = mm_seed_ext_score(km, opt, mi, mat, qlen, qseq0, &a[r->as]);
562
+ if ((double)score / mat[0] < log_gap + opt->anchor_ext_shift) // a more exact format is "score < log_4(gap) + shift"
563
+ ++(*as1), --(*cnt1);
564
+ }
565
+ log_gap = log((int32_t)a[r->as + r->cnt - 1].x - (int32_t)a[r->as + r->cnt - 2].x);
566
+ if ((a[r->as + r->cnt - 1].y>>32&0xff) < log_gap + opt->anchor_ext_shift) {
567
+ score = mm_seed_ext_score(km, opt, mi, mat, qlen, qseq0, &a[r->as + r->cnt - 1]);
568
+ if ((double)score / mat[0] < log_gap + opt->anchor_ext_shift)
569
+ --(*cnt1);
570
+ }
571
+ }
572
+
573
+ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], mm_reg1_t *r, mm_reg1_t *r2, int n_a, mm128_t *a, ksw_extz_t *ez, int splice_flag)
574
+ {
575
+ int is_sr = !!(opt->flag & MM_F_SR), is_splice = !!(opt->flag & MM_F_SPLICE);
576
+ int32_t rid = a[r->as].x<<1>>33, rev = a[r->as].x>>63, as1, cnt1;
577
+ uint8_t *tseq, *qseq, *junc;
578
+ int32_t i, l, bw, bw_long, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0;
579
+ int32_t rs, re, qs, qe;
580
+ int32_t rs1, qs1, re1, qe1;
581
+ int8_t mat[25];
582
+
583
+ if (is_sr) assert(!(mi->flag & MM_I_HPC)); // HPC won't work with SR because with HPC we can't easily tell if there is a gap
584
+
585
+ r2->cnt = 0;
586
+ if (r->cnt == 0) return;
587
+ ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
588
+ bw = (int)(opt->bw * 1.5 + 1.);
589
+ bw_long = (int)(opt->bw_long * 1.5 + 1.);
590
+ if (bw_long < bw) bw_long = bw;
591
+
592
+ if (is_sr && !(mi->flag & MM_I_HPC)) {
593
+ mm_max_stretch(r, a, &as1, &cnt1);
594
+ rs = (int32_t)a[as1].x + 1 - (int32_t)(a[as1].y>>32&0xff);
595
+ qs = (int32_t)a[as1].y + 1 - (int32_t)(a[as1].y>>32&0xff);
596
+ re = (int32_t)a[as1+cnt1-1].x + 1;
597
+ qe = (int32_t)a[as1+cnt1-1].y + 1;
598
+ } else {
599
+ if (!(opt->flag & MM_F_NO_END_FLT)) {
600
+ if (is_splice)
601
+ mm_fix_bad_ends_splice(km, opt, mi, r, mat, qlen, qseq0, a, &as1, &cnt1);
602
+ else
603
+ mm_fix_bad_ends(r, a, opt->bw, opt->min_chain_score * 2, &as1, &cnt1);
604
+ } else as1 = r->as, cnt1 = r->cnt;
605
+ mm_filter_bad_seeds(km, as1, cnt1, a, 10, 40, opt->max_gap>>1, 10);
606
+ mm_filter_bad_seeds_alt(km, as1, cnt1, a, 30, opt->max_gap>>1);
607
+ mm_adjust_minier(mi, qseq0, &a[as1], &rs, &qs);
608
+ mm_adjust_minier(mi, qseq0, &a[as1 + cnt1 - 1], &re, &qe);
609
+ }
610
+ assert(cnt1 > 0);
611
+
612
+ if (is_splice) {
613
+ if (splice_flag & MM_F_SPLICE_FOR) extra_flag |= rev? KSW_EZ_SPLICE_REV : KSW_EZ_SPLICE_FOR;
614
+ if (splice_flag & MM_F_SPLICE_REV) extra_flag |= rev? KSW_EZ_SPLICE_FOR : KSW_EZ_SPLICE_REV;
615
+ if (opt->flag & MM_F_SPLICE_FLANK) extra_flag |= KSW_EZ_SPLICE_FLANK;
616
+ }
617
+
618
+ /* Look for the start and end of regions to perform DP. This sounds easy
619
+ * but is in fact tricky. Excessively small regions lead to unnecessary
620
+ * clippings and lose alignable sequences. Excessively large regions
621
+ * occasionally lead to large overlaps between two chains and may cause
622
+ * loss of alignments in corner cases. */
623
+ if (is_sr) {
624
+ qs0 = 0, qe0 = qlen;
625
+ l = qs;
626
+ l += l * opt->a + opt->end_bonus > opt->q? (l * opt->a + opt->end_bonus - opt->q) / opt->e : 0;
627
+ rs0 = rs - l > 0? rs - l : 0;
628
+ l = qlen - qe;
629
+ l += l * opt->a + opt->end_bonus > opt->q? (l * opt->a + opt->end_bonus - opt->q) / opt->e : 0;
630
+ re0 = re + l < (int32_t)mi->seq[rid].len? re + l : mi->seq[rid].len;
631
+ } else {
632
+ // compute rs0 and qs0
633
+ rs0 = (int32_t)a[r->as].x + 1 - (int32_t)(a[r->as].y>>32&0xff);
634
+ qs0 = (int32_t)a[r->as].y + 1 - (int32_t)(a[r->as].y>>32&0xff);
635
+ if (rs0 < 0) rs0 = 0; // this may happen when HPC is in use
636
+ assert(qs0 >= 0); // this should never happen, or it is logic error
637
+ rs1 = qs1 = 0;
638
+ for (i = r->as - 1, l = 0; i >= 0 && a[i].x>>32 == a[r->as].x>>32; --i) { // inspect nearby seeds
639
+ int32_t x = (int32_t)a[i].x + 1 - (int32_t)(a[i].y>>32&0xff);
640
+ int32_t y = (int32_t)a[i].y + 1 - (int32_t)(a[i].y>>32&0xff);
641
+ if (x < rs0 && y < qs0) {
642
+ if (++l > opt->min_cnt) {
643
+ l = rs0 - x > qs0 - y? rs0 - x : qs0 - y;
644
+ rs1 = rs0 - l, qs1 = qs0 - l;
645
+ if (rs1 < 0) rs1 = 0; // not strictly necessary; better have this guard for explicit
646
+ break;
647
+ }
648
+ }
649
+ }
650
+ if (qs > 0 && rs > 0) {
651
+ l = qs < opt->max_gap? qs : opt->max_gap;
652
+ qs1 = qs1 > qs - l? qs1 : qs - l;
653
+ qs0 = qs0 < qs1? qs0 : qs1; // at least include qs0
654
+ l += l * opt->a > opt->q? (l * opt->a - opt->q) / opt->e : 0;
655
+ l = l < opt->max_gap? l : opt->max_gap;
656
+ l = l < rs? l : rs;
657
+ rs1 = rs1 > rs - l? rs1 : rs - l;
658
+ rs0 = rs0 < rs1? rs0 : rs1;
659
+ rs0 = rs0 < rs? rs0 : rs;
660
+ } else rs0 = rs, qs0 = qs;
661
+ // compute re0 and qe0
662
+ re0 = (int32_t)a[r->as + r->cnt - 1].x + 1;
663
+ qe0 = (int32_t)a[r->as + r->cnt - 1].y + 1;
664
+ re1 = mi->seq[rid].len, qe1 = qlen;
665
+ for (i = r->as + r->cnt, l = 0; i < n_a && a[i].x>>32 == a[r->as].x>>32; ++i) { // inspect nearby seeds
666
+ int32_t x = (int32_t)a[i].x + 1;
667
+ int32_t y = (int32_t)a[i].y + 1;
668
+ if (x > re0 && y > qe0) {
669
+ if (++l > opt->min_cnt) {
670
+ l = x - re0 > y - qe0? x - re0 : y - qe0;
671
+ re1 = re0 + l, qe1 = qe0 + l;
672
+ break;
673
+ }
674
+ }
675
+ }
676
+ if (qe < qlen && re < (int32_t)mi->seq[rid].len) {
677
+ l = qlen - qe < opt->max_gap? qlen - qe : opt->max_gap;
678
+ qe1 = qe1 < qe + l? qe1 : qe + l;
679
+ qe0 = qe0 > qe1? qe0 : qe1; // at least include qe0
680
+ l += l * opt->a > opt->q? (l * opt->a - opt->q) / opt->e : 0;
681
+ l = l < opt->max_gap? l : opt->max_gap;
682
+ l = l < (int32_t)mi->seq[rid].len - re? l : mi->seq[rid].len - re;
683
+ re1 = re1 < re + l? re1 : re + l;
684
+ re0 = re0 > re1? re0 : re1;
685
+ } else re0 = re, qe0 = qe;
686
+ }
687
+ if (a[r->as].y & MM_SEED_SELF) {
688
+ int max_ext = r->qs > r->rs? r->qs - r->rs : r->rs - r->qs;
689
+ if (r->rs - rs0 > max_ext) rs0 = r->rs - max_ext;
690
+ if (r->qs - qs0 > max_ext) qs0 = r->qs - max_ext;
691
+ max_ext = r->qe > r->re? r->qe - r->re : r->re - r->qe;
692
+ if (re0 - r->re > max_ext) re0 = r->re + max_ext;
693
+ if (qe0 - r->qe > max_ext) qe0 = r->qe + max_ext;
694
+ }
695
+
696
+ assert(re0 > rs0);
697
+ tseq = (uint8_t*)kmalloc(km, re0 - rs0);
698
+ junc = (uint8_t*)kmalloc(km, re0 - rs0);
699
+
700
+ if (qs > 0 && rs > 0) { // left extension; probably the condition can be changed to "qs > qs0 && rs > rs0"
701
+ if (opt->flag & MM_F_QSTRAND) {
702
+ qseq = &qseq0[0][qs0];
703
+ mm_idx_getseq2(mi, rev, rid, rs0, rs, tseq);
704
+ } else {
705
+ qseq = &qseq0[rev][qs0];
706
+ mm_idx_getseq(mi, rid, rs0, rs, tseq);
707
+ }
708
+ mm_idx_bed_junc(mi, rid, rs0, rs, junc);
709
+ mm_seq_rev(qs - qs0, qseq);
710
+ mm_seq_rev(rs - rs0, tseq);
711
+ mm_seq_rev(rs - rs0, junc);
712
+ mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, junc, mat, bw, opt->end_bonus, r->split_inv? opt->zdrop_inv : opt->zdrop, extra_flag|KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez);
713
+ if (ez->n_cigar > 0) {
714
+ mm_append_cigar(r, ez->n_cigar, ez->cigar);
715
+ r->p->dp_score += ez->max;
716
+ }
717
+ rs1 = rs - (ez->reach_end? ez->mqe_t + 1 : ez->max_t + 1);
718
+ qs1 = qs - (ez->reach_end? qs - qs0 : ez->max_q + 1);
719
+ mm_seq_rev(qs - qs0, qseq);
720
+ } else rs1 = rs, qs1 = qs;
721
+ re1 = rs, qe1 = qs;
722
+ assert(qs1 >= 0 && rs1 >= 0);
723
+
724
+ for (i = is_sr? cnt1 - 1 : 1; i < cnt1; ++i) { // gap filling
725
+ if ((a[as1+i].y & (MM_SEED_IGNORE|MM_SEED_TANDEM)) && i != cnt1 - 1) continue;
726
+ if (is_sr && !(mi->flag & MM_I_HPC)) {
727
+ re = (int32_t)a[as1 + i].x + 1;
728
+ qe = (int32_t)a[as1 + i].y + 1;
729
+ } else mm_adjust_minier(mi, qseq0, &a[as1 + i], &re, &qe);
730
+ re1 = re, qe1 = qe;
731
+ if (i == cnt1 - 1 || (a[as1+i].y&MM_SEED_LONG_JOIN) || (qe - qs >= opt->min_ksw_len && re - rs >= opt->min_ksw_len)) {
732
+ int j, bw1 = bw_long, zdrop_code;
733
+ if (a[as1+i].y & MM_SEED_LONG_JOIN)
734
+ bw1 = qe - qs > re - rs? qe - qs : re - rs;
735
+ // perform alignment
736
+ if (opt->flag & MM_F_QSTRAND) {
737
+ qseq = &qseq0[0][qs];
738
+ mm_idx_getseq2(mi, rev, rid, rs, re, tseq);
739
+ } else {
740
+ qseq = &qseq0[rev][qs];
741
+ mm_idx_getseq(mi, rid, rs, re, tseq);
742
+ }
743
+ mm_idx_bed_junc(mi, rid, rs, re, junc);
744
+ if (is_sr) { // perform ungapped alignment
745
+ assert(qe - qs == re - rs);
746
+ ksw_reset_extz(ez);
747
+ for (j = 0, ez->score = 0; j < qe - qs; ++j) {
748
+ if (qseq[j] >= 4 || tseq[j] >= 4) ez->score += opt->e2;
749
+ else ez->score += qseq[j] == tseq[j]? opt->a : -opt->b;
750
+ }
751
+ ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, MM_CIGAR_MATCH, qe - qs);
752
+ } else { // perform normal gapped alignment
753
+ mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, junc, mat, bw1, -1, opt->zdrop, extra_flag|KSW_EZ_APPROX_MAX, ez); // first pass: with approximate Z-drop
754
+ }
755
+ // test Z-drop and inversion Z-drop
756
+ if ((zdrop_code = mm_test_zdrop(km, opt, qseq, tseq, ez->n_cigar, ez->cigar, mat)) != 0)
757
+ mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, junc, mat, bw1, -1, zdrop_code == 2? opt->zdrop_inv : opt->zdrop, extra_flag, ez); // second pass: lift approximate
758
+ // update CIGAR
759
+ if (ez->n_cigar > 0)
760
+ mm_append_cigar(r, ez->n_cigar, ez->cigar);
761
+ if (ez->zdropped) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle.
762
+ if (!r->p) {
763
+ assert(ez->n_cigar == 0);
764
+ uint32_t capacity = sizeof(mm_extra_t)/4;
765
+ kroundup32(capacity);
766
+ r->p = (mm_extra_t*)calloc(capacity, 4);
767
+ r->p->capacity = capacity;
768
+ }
769
+ for (j = i - 1; j >= 0; --j)
770
+ if ((int32_t)a[as1 + j].x <= rs + ez->max_t)
771
+ break;
772
+ dropped = 1;
773
+ if (j < 0) j = 0;
774
+ r->p->dp_score += ez->max;
775
+ re1 = rs + (ez->max_t + 1);
776
+ qe1 = qs + (ez->max_q + 1);
777
+ if (cnt1 - (j + 1) >= opt->min_cnt) {
778
+ mm_split_reg(r, r2, as1 + j + 1 - r->as, qlen, a, !!(opt->flag&MM_F_QSTRAND));
779
+ if (zdrop_code == 2) r2->split_inv = 1;
780
+ }
781
+ break;
782
+ } else r->p->dp_score += ez->score;
783
+ rs = re, qs = qe;
784
+ }
785
+ }
786
+
787
+ if (!dropped && qe < qe0 && re < re0) { // right extension
788
+ if (opt->flag & MM_F_QSTRAND) {
789
+ qseq = &qseq0[0][qe];
790
+ mm_idx_getseq2(mi, rev, rid, re, re0, tseq);
791
+ } else {
792
+ qseq = &qseq0[rev][qe];
793
+ mm_idx_getseq(mi, rid, re, re0, tseq);
794
+ }
795
+ mm_idx_bed_junc(mi, rid, re, re0, junc);
796
+ mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, junc, mat, bw, opt->end_bonus, opt->zdrop, extra_flag|KSW_EZ_EXTZ_ONLY, ez);
797
+ if (ez->n_cigar > 0) {
798
+ mm_append_cigar(r, ez->n_cigar, ez->cigar);
799
+ r->p->dp_score += ez->max;
800
+ }
801
+ re1 = re + (ez->reach_end? ez->mqe_t + 1 : ez->max_t + 1);
802
+ qe1 = qe + (ez->reach_end? qe0 - qe : ez->max_q + 1);
803
+ }
804
+ assert(qe1 <= qlen);
805
+
806
+ r->rs = rs1, r->re = re1;
807
+ if (!rev || (opt->flag & MM_F_QSTRAND)) r->qs = qs1, r->qe = qe1;
808
+ else r->qs = qlen - qe1, r->qe = qlen - qs1;
809
+
810
+ assert(re1 - rs1 <= re0 - rs0);
811
+ if (r->p) {
812
+ if (opt->flag & MM_F_QSTRAND) {
813
+ mm_idx_getseq2(mi, r->rev, rid, rs1, re1, tseq);
814
+ qseq = &qseq0[0][qs1];
815
+ } else {
816
+ mm_idx_getseq(mi, rid, rs1, re1, tseq);
817
+ qseq = &qseq0[r->rev][qs1];
818
+ }
819
+ mm_update_extra(r, qseq, tseq, mat, opt->q, opt->e, opt->flag & MM_F_EQX, !(opt->flag & MM_F_SR));
820
+ if (rev && r->p->trans_strand)
821
+ r->p->trans_strand ^= 3; // flip to the read strand
822
+ }
823
+
824
+ kfree(km, tseq);
825
+ kfree(km, junc);
826
+ }
827
+
828
+ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], const mm_reg1_t *r1, const mm_reg1_t *r2, mm_reg1_t *r_inv, ksw_extz_t *ez)
829
+ { // Try to align the gap between r1 and r2 on the opposite strand; fills *r_inv and returns 1 on success, 0 otherwise. NB: this doesn't work with the qstrand mode
830
+ int tl, ql, score, ret = 0, q_off, t_off; // tl/ql: target/query gap lengths; q_off/t_off: alignment offsets inside the gap
831
+ uint8_t *tseq, *qseq;
832
+ int8_t mat[25]; // 5x5 scoring matrix filled by ksw_gen_simple_mat() below
833
+ void *qp;
834
+
835
+ memset(r_inv, 0, sizeof(mm_reg1_t)); // *r_inv stays all-zero on every early return below
836
+ if (!(r1->split&1) || !(r2->split&2)) return 0; // r1 must carry split bit 1 and r2 split bit 2
837
+ if (r1->id != r1->parent && r1->parent != MM_PARENT_TMP_PRI) return 0; // r1 must be its own parent or be tagged MM_PARENT_TMP_PRI
838
+ if (r2->id != r2->parent && r2->parent != MM_PARENT_TMP_PRI) return 0; // same requirement for r2
839
+ if (r1->rid != r2->rid || r1->rev != r2->rev) return 0; // both regions must be on the same reference sequence and strand
840
+ ql = r1->rev? r1->qs - r2->qe : r2->qs - r1->qe; // query gap between the two regions; which ends are used depends on the strand
841
+ tl = r2->rs - r1->re; // reference gap between the two regions
842
+ if (ql < opt->min_chain_score || ql > opt->max_gap) return 0; // query gap must lie within [min_chain_score, max_gap]
843
+ if (tl < opt->min_chain_score || tl > opt->max_gap) return 0; // same bounds for the reference gap
844
+
845
+ ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
846
+ tseq = (uint8_t*)kmalloc(km, tl); // released at end_align1_inv
847
+ mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq); // reference bases for the interval r1->re..r2->rs
848
+ qseq = r1->rev? &qseq0[0][r2->qe] : &qseq0[1][qlen - r2->qs]; // query gap taken from the strand opposite to r1/r2; points into the caller's buffer
849
+
850
+ mm_seq_rev(ql, qseq); // reverse both sequences so the end offsets reported below map to start offsets (see the q_off/t_off conversion)
851
+ mm_seq_rev(tl, tseq);
852
+ qp = ksw_ll_qinit(km, 2, ql, qseq, 5, mat);
853
+ score = ksw_ll_i16(qp, tl, tseq, opt->q, opt->e, &q_off, &t_off); // score plus offsets in the reversed sequences
854
+ kfree(km, qp);
855
+ mm_seq_rev(ql, qseq); // undo the in-place reversal; qseq aliases qseq0, which the caller keeps using
856
+ mm_seq_rev(tl, tseq);
857
+ if (score < opt->min_dp_max) goto end_align1_inv; // too weak: report no inversion
858
+ q_off = ql - (q_off + 1), t_off = tl - (t_off + 1); // convert offsets in the reversed sequences to offsets in the restored ones
859
+ mm_align_pair(km, opt, ql - q_off, qseq + q_off, tl - t_off, tseq + t_off, 0, mat, (int)(opt->bw * 1.5), -1, opt->zdrop, KSW_EZ_EXTZ_ONLY, ez); // extension-only alignment from the offsets, 1.5x bandwidth, no junction array
860
+ if (ez->n_cigar == 0) goto end_align1_inv; // should never be here
861
+ mm_append_cigar(r_inv, ez->n_cigar, ez->cigar); // presumably allocates r_inv->p, which is NULL after the memset -- TODO confirm
862
+ r_inv->p->dp_score = ez->max;
863
+ r_inv->id = -1;
864
+ r_inv->parent = MM_PARENT_UNSET;
865
+ r_inv->inv = 1; // flag the record as an inversion
866
+ r_inv->rev = !r1->rev; // opposite strand to the flanking regions
867
+ r_inv->rid = r1->rid;
868
+ r_inv->div = -1.0f;
869
+ if (r_inv->rev == 0) { // query coordinates are derived differently for the two strands
870
+ r_inv->qs = r2->qe + q_off;
871
+ r_inv->qe = r_inv->qs + ez->max_q + 1;
872
+ } else {
873
+ r_inv->qe = r2->qs - q_off;
874
+ r_inv->qs = r_inv->qe - (ez->max_q + 1);
875
+ }
876
+ r_inv->rs = r1->re + t_off; // reference interval starts t_off bases into the gap
877
+ r_inv->re = r_inv->rs + ez->max_t + 1;
878
+ mm_update_extra(r_inv, &qseq[q_off], &tseq[t_off], mat, opt->q, opt->e, opt->flag & MM_F_EQX, !(opt->flag & MM_F_SR));
879
+ ret = 1;
880
+ end_align1_inv: // common exit: release the reference buffer
881
+ kfree(km, tseq);
882
+ return ret;
883
+ }
884
+
885
+ static inline mm_reg1_t *mm_insert_reg(const mm_reg1_t *r, int i, int *n_regs, mm_reg1_t *regs)
886
+ {
887
+ regs = (mm_reg1_t*)realloc(regs, (*n_regs + 1) * sizeof(mm_reg1_t));
888
+ if (i + 1 != *n_regs)
889
+ memmove(&regs[i + 2], &regs[i + 1], sizeof(mm_reg1_t) * (*n_regs - i - 1));
890
+ regs[i + 1] = *r;
891
+ ++*n_regs;
892
+ return regs;
893
+ }
894
+
895
+ static inline void mm_count_gaps(const mm_reg1_t *r, int32_t *n_gap_, int32_t *n_gapo_)
896
+ {
897
+ uint32_t i;
898
+ int32_t n_gapo = 0, n_gap = 0;
899
+ *n_gap_ = *n_gapo_ = -1;
900
+ if (r->p == 0) return;
901
+ for (i = 0; i < r->p->n_cigar; ++i) {
902
+ int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4;
903
+ if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL)
904
+ ++n_gapo, n_gap += len;
905
+ }
906
+ *n_gap_ = n_gap, *n_gapo_ = n_gapo;
907
+ }
908
+
909
+ double mm_event_identity(const mm_reg1_t *r)
910
+ {
911
+ int32_t n_gap, n_gapo;
912
+ if (r->p == 0) return -1.0f;
913
+ mm_count_gaps(r, &n_gap, &n_gapo);
914
+ return (double)r->mlen / (r->blen + r->p->n_ambi - n_gap + n_gapo);
915
+ }
916
+
917
+ static int32_t mm_recal_max_dp(const mm_reg1_t *r, double b2, int32_t match_sc)
918
+ {
919
+ uint32_t i;
920
+ int32_t n_gap = 0, n_gapo = 0, n_mis;
921
+ double gap_cost = 0.0;
922
+ if (r->p == 0) return -1;
923
+ for (i = 0; i < r->p->n_cigar; ++i) {
924
+ int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4;
925
+ if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) {
926
+ gap_cost += b2 + (double)mg_log2(1.0 + len);
927
+ ++n_gapo, n_gap += len;
928
+ }
929
+ }
930
+ n_mis = r->blen + r->p->n_ambi - r->mlen - n_gap;
931
+ return (int32_t)(match_sc * (r->mlen - b2 * n_mis - gap_cost) + .499);
932
+ }
933
+
934
+ void mm_update_dp_max(int qlen, int n_regs, mm_reg1_t *regs, float frac, int a, int b)
935
+ {
936
+ int32_t max = -1, max2 = -1, i, max_i = -1;
937
+ double div, b2;
938
+ if (n_regs < 2) return;
939
+ for (i = 0; i < n_regs; ++i) {
940
+ mm_reg1_t *r = &regs[i];
941
+ if (r->p == 0) continue;
942
+ if (r->p->dp_max > max) max2 = max, max = r->p->dp_max, max_i = i;
943
+ else if (r->p->dp_max > max2) max2 = r->p->dp_max;
944
+ }
945
+ if (max_i < 0 || max < 0 || max2 < 0) return;
946
+ if (regs[max_i].qe - regs[max_i].qs < (double)qlen * frac) return;
947
+ if (max2 < (double)max * frac) return;
948
+ div = 1. - mm_event_identity(&regs[max_i]);
949
+ if (div < 0.02) div = 0.02;
950
+ b2 = 0.5 / div; // max value: 25
951
+ if (b2 * a < b) b2 = (double)a / b;
952
+ for (i = 0; i < n_regs; ++i) {
953
+ mm_reg1_t *r = &regs[i];
954
+ if (r->p == 0) continue;
955
+ r->p->dp_max = mm_recal_max_dp(r, b2, a);
956
+ if (r->p->dp_max < 0) r->p->dp_max = 0;
957
+ }
958
+ }
959
+
960
+ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a)
961
+ { // Align every region in regs[] (the array may grow via splits and inversions), then filter and sort; returns the possibly reallocated array and stores the final count in *n_regs_
962
+ extern unsigned char seq_nt4_table[256];
963
+ int32_t i, n_regs = *n_regs_, n_a; // n_a: anchor count returned by mm_squeeze_a()
964
+ uint8_t *qseq0[2]; // [0]: encoded query, [1]: its reverse complement
965
+ ksw_extz_t ez;
966
+
967
+ // encode the query sequence
968
+ qseq0[0] = (uint8_t*)kmalloc(km, qlen * 2); // one buffer holds both strands
969
+ qseq0[1] = qseq0[0] + qlen;
970
+ for (i = 0; i < qlen; ++i) {
971
+ qseq0[0][i] = seq_nt4_table[(uint8_t)qstr[i]];
972
+ qseq0[1][qlen - 1 - i] = qseq0[0][i] < 4? 3 - qseq0[0][i] : 4; // codes below 4 are complemented as 3 - c; anything else becomes 4
973
+ }
974
+
975
+ // align through seed hits
976
+ n_a = mm_squeeze_a(km, n_regs, regs, a);
977
+ memset(&ez, 0, sizeof(ksw_extz_t)); // ez.cigar starts NULL; it is released with kfree() after the loop
978
+ for (i = 0; i < n_regs; ++i) { // n_regs can grow inside the loop, so inserted regions are visited too
979
+ mm_reg1_t r2; // receives a region split off by mm_align1(), later reused for an inversion record
980
+ if ((opt->flag&MM_F_SPLICE) && (opt->flag&MM_F_SPLICE_FOR) && (opt->flag&MM_F_SPLICE_REV)) { // then do two rounds of alignments for both strands
981
+ mm_reg1_t s[2], s2[2]; // s: two working copies of regs[i]; s2: their split-off regions
982
+ int which, trans_strand;
983
+ s[0] = s[1] = regs[i];
984
+ mm_align1(km, opt, mi, qlen, qseq0, &s[0], &s2[0], n_a, a, &ez, MM_F_SPLICE_FOR);
985
+ mm_align1(km, opt, mi, qlen, qseq0, &s[1], &s2[1], n_a, a, &ez, MM_F_SPLICE_REV);
986
+ if (s[0].p->dp_score > s[1].p->dp_score) which = 0, trans_strand = 1; // keep whichever round scored higher
987
+ else if (s[0].p->dp_score < s[1].p->dp_score) which = 1, trans_strand = 2;
988
+ else trans_strand = 3, which = (qlen + s[0].p->dp_score) & 1; // randomly choose a strand, effectively
989
+ if (which == 0) {
990
+ regs[i] = s[0], r2 = s2[0];
991
+ free(s[1].p); // discard the extra block of the losing round
992
+ } else {
993
+ regs[i] = s[1], r2 = s2[1];
994
+ free(s[0].p); // discard the extra block of the losing round
995
+ }
996
+ regs[i].p->trans_strand = trans_strand; // 1: first round won, 2: second round won, 3: tie
997
+ } else { // one round of alignment
998
+ mm_align1(km, opt, mi, qlen, qseq0, &regs[i], &r2, n_a, a, &ez, opt->flag);
999
+ if (opt->flag&MM_F_SPLICE)
1000
+ regs[i].p->trans_strand = opt->flag&MM_F_SPLICE_FOR? 1 : 2;
1001
+ }
1002
+ if (r2.cnt > 0) regs = mm_insert_reg(&r2, i, &n_regs, regs); // a split-off region goes right after regs[i]
1003
+ if (i > 0 && regs[i].split_inv && !(opt->flag & MM_F_NO_INV)) { // look for an inversion between the previous region and this one
1004
+ if (mm_align1_inv(km, opt, mi, qlen, qseq0, &regs[i-1], &regs[i], &r2, &ez)) {
1005
+ regs = mm_insert_reg(&r2, i, &n_regs, regs);
1006
+ ++i; // skip the inserted INV alignment
1007
+ }
1008
+ }
1009
+ }
1010
+ *n_regs_ = n_regs;
1011
+ kfree(km, qseq0[0]); // frees both strands (single allocation)
1012
+ kfree(km, ez.cigar);
1013
+ mm_filter_regs(opt, qlen, n_regs_, regs);
1014
+ if (!(opt->flag&MM_F_SR) && !opt->split_prefix && qlen >= opt->rank_min_len) { // re-rank only without MM_F_SR, without a split prefix, and for queries of at least rank_min_len
1015
+ mm_update_dp_max(qlen, *n_regs_, regs, opt->rank_frac, opt->a, opt->b);
1016
+ mm_filter_regs(opt, qlen, n_regs_, regs); // filter again after dp_max has been updated
1017
+ }
1018
+ mm_hit_sort(km, n_regs_, regs, opt->alt_drop);
1019
+ return regs;
1020
+ }