minimap2 0.2.23.0 → 0.2.23.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +41 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +807 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +344 -0
  41. data/ext/minimap2/main.c +455 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +409 -0
  44. data/ext/minimap2/minimap2.1 +722 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +131 -0
  50. data/ext/minimap2/options.c +233 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/{vendor → ext/vendor}/libminimap2.so +0 -0
  93. data/lib/minimap2/ffi/functions.rb +5 -0
  94. data/lib/minimap2/version.rb +1 -1
  95. data/lib/minimap2.rb +32 -0
  96. metadata +94 -4
@@ -0,0 +1,466 @@
1
+ #include <string.h>
2
+ #include <stdlib.h>
3
+ #include <math.h>
4
+ #include "mmpriv.h"
5
+ #include "kalloc.h"
6
+ #include "khash.h"
7
+
8
+ static inline void mm_cal_fuzzy_len(mm_reg1_t *r, const mm128_t *a)
9
+ {
10
+ int i;
11
+ r->mlen = r->blen = 0;
12
+ if (r->cnt <= 0) return;
13
+ r->mlen = r->blen = a[r->as].y>>32&0xff;
14
+ for (i = r->as + 1; i < r->as + r->cnt; ++i) {
15
+ int span = a[i].y>>32&0xff;
16
+ int tl = (int32_t)a[i].x - (int32_t)a[i-1].x;
17
+ int ql = (int32_t)a[i].y - (int32_t)a[i-1].y;
18
+ r->blen += tl > ql? tl : ql;
19
+ r->mlen += tl > span && ql > span? span : tl < ql? tl : ql;
20
+ }
21
+ }
22
+
23
+ static inline void mm_reg_set_coor(mm_reg1_t *r, int32_t qlen, const mm128_t *a, int is_qstrand)
24
+ { // NB: r->as and r->cnt MUST BE set correctly for this function to work
25
+ int32_t k = r->as, q_span = (int32_t)(a[k].y>>32&0xff);
26
+ r->rev = a[k].x>>63;
27
+ r->rid = a[k].x<<1>>33;
28
+ r->rs = (int32_t)a[k].x + 1 > q_span? (int32_t)a[k].x + 1 - q_span : 0; // NB: target span may be shorter, so this test is necessary
29
+ r->re = (int32_t)a[k + r->cnt - 1].x + 1;
30
+ if (!r->rev || is_qstrand) {
31
+ r->qs = (int32_t)a[k].y + 1 - q_span;
32
+ r->qe = (int32_t)a[k + r->cnt - 1].y + 1;
33
+ } else {
34
+ r->qs = qlen - ((int32_t)a[k + r->cnt - 1].y + 1);
35
+ r->qe = qlen - ((int32_t)a[k].y + 1 - q_span);
36
+ }
37
+ mm_cal_fuzzy_len(r, a);
38
+ }
39
+
40
/*
 * Scramble a 64-bit key with a fixed sequence of shift/add/xor steps.
 * All arithmetic is unsigned and therefore wraps modulo 2^64, which is
 * what lets the shift-and-add steps be written as multiplications.
 */
static inline uint64_t hash64(uint64_t key)
{
	key = (key << 21) + ~key;
	key ^= key >> 24;
	key *= 265; // key + (key << 3) + (key << 8)
	key ^= key >> 14;
	key *= 21; // key + (key << 2) + (key << 4)
	key ^= key >> 28;
	key += key << 31;
	return key;
}
51
+
52
+ mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a, int is_qstrand) // convert chains to hits
53
+ {
54
+ mm128_t *z, tmp;
55
+ mm_reg1_t *r;
56
+ int i, k;
57
+
58
+ if (n_u == 0) return 0;
59
+
60
+ // sort by score
61
+ z = (mm128_t*)kmalloc(km, n_u * 16);
62
+ for (i = k = 0; i < n_u; ++i) {
63
+ uint32_t h;
64
+ h = (uint32_t)hash64((hash64(a[k].x) + hash64(a[k].y)) ^ hash);
65
+ z[i].x = u[i] ^ h; // u[i] -- higher 32 bits: chain score; lower 32 bits: number of seeds in the chain
66
+ z[i].y = (uint64_t)k << 32 | (int32_t)u[i];
67
+ k += (int32_t)u[i];
68
+ }
69
+ radix_sort_128x(z, z + n_u);
70
+ for (i = 0; i < n_u>>1; ++i) // reverse, s.t. larger score first
71
+ tmp = z[i], z[i] = z[n_u-1-i], z[n_u-1-i] = tmp;
72
+
73
+ // populate r[]
74
+ r = (mm_reg1_t*)calloc(n_u, sizeof(mm_reg1_t));
75
+ for (i = 0; i < n_u; ++i) {
76
+ mm_reg1_t *ri = &r[i];
77
+ ri->id = i;
78
+ ri->parent = MM_PARENT_UNSET;
79
+ ri->score = ri->score0 = z[i].x >> 32;
80
+ ri->hash = (uint32_t)z[i].x;
81
+ ri->cnt = (int32_t)z[i].y;
82
+ ri->as = z[i].y >> 32;
83
+ ri->div = -1.0f;
84
+ mm_reg_set_coor(ri, qlen, a, is_qstrand);
85
+ }
86
+ kfree(km, z);
87
+ return r;
88
+ }
89
+
90
+ void mm_mark_alt(const mm_idx_t *mi, int n, mm_reg1_t *r)
91
+ {
92
+ int i;
93
+ if (mi->n_alt == 0) return;
94
+ for (i = 0; i < n; ++i)
95
+ if (mi->seq[r[i].rid].is_alt)
96
+ r[i].is_alt = 1;
97
+ }
98
+
99
/*
 * Scale a non-negative score by (1 - alt_diff_frac), rounding with a
 * +.499 bias and never returning less than 1. Negative scores are
 * returned unchanged.
 */
static inline int mm_alt_score(int score, float alt_diff_frac)
{
	int scaled;
	if (score < 0) return score;
	scaled = (int)(score * (1.0 - alt_diff_frac) + .499);
	if (scaled > 0) return scaled;
	return 1; // clamp so a scaled score never drops to zero
}
105
+
106
+ void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a, int is_qstrand)
107
+ {
108
+ if (n <= 0 || n >= r->cnt) return;
109
+ *r2 = *r;
110
+ r2->id = -1;
111
+ r2->sam_pri = 0;
112
+ r2->p = 0;
113
+ r2->split_inv = 0;
114
+ r2->cnt = r->cnt - n;
115
+ r2->score = (int32_t)(r->score * ((float)r2->cnt / r->cnt) + .499);
116
+ r2->as = r->as + n;
117
+ if (r->parent == r->id) r2->parent = MM_PARENT_TMP_PRI;
118
+ mm_reg_set_coor(r2, qlen, a, is_qstrand);
119
+ r->cnt -= r2->cnt;
120
+ r->score -= r2->score;
121
+ mm_reg_set_coor(r, qlen, a, is_qstrand);
122
+ r->split |= 1, r2->split |= 2;
123
+ }
124
+
125
+ void mm_set_parent(void *km, float mask_level, int mask_len, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac) // and compute mm_reg1_t::subsc
126
+ {
127
+ int i, j, k, *w;
128
+ uint64_t *cov;
129
+ if (n <= 0) return;
130
+ for (i = 0; i < n; ++i) r[i].id = i;
131
+ cov = (uint64_t*)kmalloc(km, n * sizeof(uint64_t));
132
+ w = (int*)kmalloc(km, n * sizeof(int));
133
+ w[0] = 0, r[0].parent = 0;
134
+ for (i = 1, k = 1; i < n; ++i) {
135
+ mm_reg1_t *ri = &r[i];
136
+ int si = ri->qs, ei = ri->qe, n_cov = 0, uncov_len = 0;
137
+ if (hard_mask_level) goto skip_uncov;
138
+ for (j = 0; j < k; ++j) { // traverse existing primary hits to find overlapping hits
139
+ mm_reg1_t *rp = &r[w[j]];
140
+ int sj = rp->qs, ej = rp->qe;
141
+ if (ej <= si || sj >= ei) continue;
142
+ if (sj < si) sj = si;
143
+ if (ej > ei) ej = ei;
144
+ cov[n_cov++] = (uint64_t)sj<<32 | ej;
145
+ }
146
+ if (n_cov == 0) {
147
+ goto set_parent_test; // no overlapping primary hits; then i is a new primary hit
148
+ } else if (n_cov > 0) { // there are overlapping primary hits; find the length not covered by existing primary hits
149
+ int j, x = si;
150
+ radix_sort_64(cov, cov + n_cov);
151
+ for (j = 0; j < n_cov; ++j) {
152
+ if ((int)(cov[j]>>32) > x) uncov_len += (cov[j]>>32) - x;
153
+ x = (int32_t)cov[j] > x? (int32_t)cov[j] : x;
154
+ }
155
+ if (ei > x) uncov_len += ei - x;
156
+ }
157
+ skip_uncov:
158
+ for (j = 0; j < k; ++j) { // traverse existing primary hits again
159
+ mm_reg1_t *rp = &r[w[j]];
160
+ int sj = rp->qs, ej = rp->qe, min, max, ol;
161
+ if (ej <= si || sj >= ei) continue; // no overlap
162
+ min = ej - sj < ei - si? ej - sj : ei - si;
163
+ max = ej - sj > ei - si? ej - sj : ei - si;
164
+ ol = si < sj? (ei < sj? 0 : ei < ej? ei - sj : ej - sj) : (ej < si? 0 : ej < ei? ej - si : ei - si); // overlap length; TODO: this can be simplified
165
+ if ((float)ol / min - (float)uncov_len / max > mask_level && uncov_len <= mask_len) { // then this is a secondary hit
166
+ int cnt_sub = 0, sci = ri->score;
167
+ ri->parent = rp->parent;
168
+ if (!rp->is_alt && ri->is_alt) sci = mm_alt_score(sci, alt_diff_frac);
169
+ rp->subsc = rp->subsc > sci? rp->subsc : sci;
170
+ if (ri->cnt >= rp->cnt) cnt_sub = 1;
171
+ if (rp->p && ri->p && (rp->rid != ri->rid || rp->rs != ri->rs || rp->re != ri->re || ol != min)) { // the last condition excludes identical hits after DP
172
+ sci = ri->p->dp_max;
173
+ if (!rp->is_alt && ri->is_alt) sci = mm_alt_score(sci, alt_diff_frac);
174
+ rp->p->dp_max2 = rp->p->dp_max2 > sci? rp->p->dp_max2 : sci;
175
+ if (rp->p->dp_max - ri->p->dp_max <= sub_diff) cnt_sub = 1;
176
+ }
177
+ if (cnt_sub) ++rp->n_sub;
178
+ break;
179
+ }
180
+ }
181
+ set_parent_test:
182
+ if (j == k) w[k++] = i, ri->parent = i, ri->n_sub = 0;
183
+ }
184
+ kfree(km, cov);
185
+ kfree(km, w);
186
+ }
187
+
188
+ void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r, float alt_diff_frac)
189
+ {
190
+ int32_t i, n_aux, n = *n_regs, has_cigar = 0, no_cigar = 0;
191
+ mm128_t *aux;
192
+ mm_reg1_t *t;
193
+
194
+ if (n <= 1) return;
195
+ aux = (mm128_t*)kmalloc(km, n * 16);
196
+ t = (mm_reg1_t*)kmalloc(km, n * sizeof(mm_reg1_t));
197
+ for (i = n_aux = 0; i < n; ++i) {
198
+ if (r[i].inv || r[i].cnt > 0) { // squeeze out elements with cnt==0 (soft deleted)
199
+ int score;
200
+ if (r[i].p) score = r[i].p->dp_max, has_cigar = 1;
201
+ else score = r[i].score, no_cigar = 1;
202
+ if (r[i].is_alt) score = mm_alt_score(score, alt_diff_frac);
203
+ aux[n_aux].x = (uint64_t)score << 32 | r[i].hash;
204
+ aux[n_aux++].y = i;
205
+ } else if (r[i].p) {
206
+ free(r[i].p);
207
+ r[i].p = 0;
208
+ }
209
+ }
210
+ assert(has_cigar + no_cigar == 1);
211
+ radix_sort_128x(aux, aux + n_aux);
212
+ for (i = n_aux - 1; i >= 0; --i)
213
+ t[n_aux - 1 - i] = r[aux[i].y];
214
+ memcpy(r, t, sizeof(mm_reg1_t) * n_aux);
215
+ *n_regs = n_aux;
216
+ kfree(km, aux);
217
+ kfree(km, t);
218
+ }
219
+
220
+ int mm_set_sam_pri(int n, mm_reg1_t *r)
221
+ {
222
+ int i, n_pri = 0;
223
+ for (i = 0; i < n; ++i)
224
+ if (r[i].id == r[i].parent) {
225
+ ++n_pri;
226
+ r[i].sam_pri = (n_pri == 1);
227
+ } else r[i].sam_pri = 0;
228
+ return n_pri;
229
+ }
230
+
231
+ void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs) // keep mm_reg1_t::{id,parent} in sync; also reset id
232
+ {
233
+ int *tmp, i, max_id = -1, n_tmp;
234
+ if (n_regs <= 0) return;
235
+ for (i = 0; i < n_regs; ++i) // NB: doesn't work if mm_reg1_t::id is negative
236
+ max_id = max_id > regs[i].id? max_id : regs[i].id;
237
+ n_tmp = max_id + 1;
238
+ tmp = (int*)kmalloc(km, n_tmp * sizeof(int));
239
+ for (i = 0; i < n_tmp; ++i) tmp[i] = -1;
240
+ for (i = 0; i < n_regs; ++i)
241
+ if (regs[i].id >= 0) tmp[regs[i].id] = i;
242
+ for (i = 0; i < n_regs; ++i) {
243
+ mm_reg1_t *r = &regs[i];
244
+ r->id = i;
245
+ if (r->parent == MM_PARENT_TMP_PRI)
246
+ r->parent = i;
247
+ else if (r->parent >= 0 && tmp[r->parent] >= 0)
248
+ r->parent = tmp[r->parent];
249
+ else r->parent = MM_PARENT_UNSET;
250
+ }
251
+ kfree(km, tmp);
252
+ mm_set_sam_pri(n_regs, regs);
253
+ }
254
+
255
+ void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int check_strand, int min_strand_sc, int *n_, mm_reg1_t *r)
256
+ {
257
+ if (pri_ratio > 0.0f && *n_ > 0) {
258
+ int i, k, n = *n_, n_2nd = 0;
259
+ for (i = k = 0; i < n; ++i) {
260
+ int p = r[i].parent;
261
+ if (p == i || r[i].inv) { // primary or inversion
262
+ r[k++] = r[i];
263
+ } else if ((r[i].score >= r[p].score * pri_ratio || r[i].score + min_diff >= r[p].score) && n_2nd < best_n) {
264
+ if (!(r[i].qs == r[p].qs && r[i].qe == r[p].qe && r[i].rid == r[p].rid && r[i].rs == r[p].rs && r[i].re == r[p].re)) // not identical hits
265
+ r[k++] = r[i], ++n_2nd;
266
+ else if (r[i].p) free(r[i].p);
267
+ } else if (check_strand && n_2nd < best_n && r[i].score > min_strand_sc && r[i].rev != r[p].rev) {
268
+ r[i].strand_retained = 1;
269
+ r[k++] = r[i], ++n_2nd;
270
+ } else if (r[i].p) free(r[i].p);
271
+ }
272
+ if (k != n) mm_sync_regs(km, k, r); // removing hits requires sync()
273
+ *n_ = k;
274
+ }
275
+ }
276
+
277
+ int mm_filter_strand_retained(int n_regs, mm_reg1_t *r)
278
+ {
279
+ int i, k;
280
+ for (i = k = 0; i < n_regs; ++i) {
281
+ int p = r[i].parent;
282
+ if (!r[i].strand_retained || r[i].div < r[p].div * 5.0f) {
283
+ if (k < i) r[k++] = r[i];
284
+ else ++k;
285
+ }
286
+ }
287
+ return k;
288
+ }
289
+
290
+ void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs)
291
+ { // NB: after this call, mm_reg1_t::parent can be -1 if its parent filtered out
292
+ int i, k;
293
+ for (i = k = 0; i < *n_regs; ++i) {
294
+ mm_reg1_t *r = &regs[i];
295
+ int flt = 0;
296
+ if (!r->inv && !r->seg_split && r->cnt < opt->min_cnt) flt = 1;
297
+ if (r->p) { // these filters are only applied when base-alignment is available
298
+ if (r->mlen < opt->min_chain_score) flt = 1;
299
+ else if (r->p->dp_max < opt->min_dp_max) flt = 1;
300
+ else if (r->qs > qlen * opt->max_clip_ratio && qlen - r->qe > qlen * opt->max_clip_ratio) flt = 1;
301
+ if (flt) free(r->p);
302
+ }
303
+ if (!flt) {
304
+ if (k < i) regs[k++] = regs[i];
305
+ else ++k;
306
+ }
307
+ }
308
+ *n_regs = k;
309
+ }
310
+
311
+ int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a)
312
+ { // squeeze out regions in a[] that are not referenced by regs[]
313
+ int i, as = 0;
314
+ uint64_t *aux;
315
+ aux = (uint64_t*)kmalloc(km, n_regs * 8);
316
+ for (i = 0; i < n_regs; ++i)
317
+ aux[i] = (uint64_t)regs[i].as << 32 | i;
318
+ radix_sort_64(aux, aux + n_regs);
319
+ for (i = 0; i < n_regs; ++i) {
320
+ mm_reg1_t *r = &regs[(int32_t)aux[i]];
321
+ if (r->as != as) {
322
+ memmove(&a[as], &a[r->as], r->cnt * 16);
323
+ r->as = as;
324
+ }
325
+ as += r->cnt;
326
+ }
327
+ kfree(km, aux);
328
+ return as;
329
+ }
330
+
331
+ mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a)
332
+ {
333
+ int s, i, j, acc_qlen[MM_MAX_SEG+1], qlen_sum = 0;
334
+ mm_seg_t *seg;
335
+
336
+ assert(n_segs <= MM_MAX_SEG);
337
+ for (s = 1, acc_qlen[0] = 0; s < n_segs; ++s)
338
+ acc_qlen[s] = acc_qlen[s-1] + qlens[s-1];
339
+ qlen_sum = acc_qlen[n_segs - 1] + qlens[n_segs - 1];
340
+
341
+ seg = (mm_seg_t*)kcalloc(km, n_segs, sizeof(mm_seg_t));
342
+ for (s = 0; s < n_segs; ++s) {
343
+ seg[s].u = (uint64_t*)kmalloc(km, n_regs0 * 8);
344
+ for (i = 0; i < n_regs0; ++i)
345
+ seg[s].u[i] = (uint64_t)regs0[i].score << 32;
346
+ }
347
+ for (i = 0; i < n_regs0; ++i) {
348
+ const mm_reg1_t *r = &regs0[i];
349
+ for (j = 0; j < r->cnt; ++j) {
350
+ int sid = (a[r->as + j].y&MM_SEED_SEG_MASK)>>MM_SEED_SEG_SHIFT;
351
+ ++seg[sid].u[i];
352
+ ++seg[sid].n_a;
353
+ }
354
+ }
355
+ for (s = 0; s < n_segs; ++s) {
356
+ mm_seg_t *sr = &seg[s];
357
+ for (i = 0, sr->n_u = 0; i < n_regs0; ++i) // squeeze out zero-length per-segment chains
358
+ if ((int32_t)sr->u[i] != 0)
359
+ sr->u[sr->n_u++] = sr->u[i];
360
+ sr->a = (mm128_t*)kmalloc(km, sr->n_a * sizeof(mm128_t));
361
+ sr->n_a = 0;
362
+ }
363
+
364
+ for (i = 0; i < n_regs0; ++i) {
365
+ const mm_reg1_t *r = &regs0[i];
366
+ for (j = 0; j < r->cnt; ++j) {
367
+ int sid = (a[r->as + j].y&MM_SEED_SEG_MASK)>>MM_SEED_SEG_SHIFT;
368
+ mm128_t a1 = a[r->as + j];
369
+ // on reverse strand, the segment position is:
370
+ // x_for_cat = qlen_sum - 1 - (int32_t)a1.y - 1 + q_span
371
+ // (int32_t)new_a1.y = qlens[sid] - (x_for_cat - acc_qlen[sid] + 1 - q_span) - 1 = (int32_t)a1.y - (qlen_sum - (qlens[sid] + acc_qlen[sid]))
372
+ a1.y -= a1.x>>63? qlen_sum - (qlens[sid] + acc_qlen[sid]) : acc_qlen[sid];
373
+ seg[sid].a[seg[sid].n_a++] = a1;
374
+ }
375
+ }
376
+ for (s = 0; s < n_segs; ++s) {
377
+ regs[s] = mm_gen_regs(km, hash, qlens[s], seg[s].n_u, seg[s].u, seg[s].a, 0);
378
+ n_regs[s] = seg[s].n_u;
379
+ for (i = 0; i < n_regs[s]; ++i) {
380
+ regs[s][i].seg_split = 1;
381
+ regs[s][i].seg_id = s;
382
+ }
383
+ }
384
+ return seg;
385
+ }
386
+
387
+ void mm_seg_free(void *km, int n_segs, mm_seg_t *segs)
388
+ {
389
+ int i;
390
+ for (i = 0; i < n_segs; ++i) kfree(km, segs[i].u);
391
+ for (i = 0; i < n_segs; ++i) kfree(km, segs[i].a);
392
+ kfree(km, segs);
393
+ }
394
+
395
+ static void mm_set_inv_mapq(void *km, int n_regs, mm_reg1_t *regs)
396
+ {
397
+ int i, n_aux;
398
+ mm128_t *aux;
399
+ if (n_regs < 3) return;
400
+ for (i = 0; i < n_regs; ++i)
401
+ if (regs[i].inv) break;
402
+ if (i == n_regs) return; // no inversion hits
403
+
404
+ aux = (mm128_t*)kmalloc(km, n_regs * 16);
405
+ for (i = n_aux = 0; i < n_regs; ++i)
406
+ if (regs[i].parent == i || regs[i].parent < 0)
407
+ aux[n_aux].y = i, aux[n_aux++].x = (uint64_t)regs[i].rid << 32 | regs[i].rs;
408
+ radix_sort_128x(aux, aux + n_aux);
409
+
410
+ for (i = 1; i < n_aux - 1; ++i) {
411
+ mm_reg1_t *inv = &regs[aux[i].y];
412
+ if (inv->inv) {
413
+ mm_reg1_t *l = &regs[aux[i-1].y];
414
+ mm_reg1_t *r = &regs[aux[i+1].y];
415
+ inv->mapq = l->mapq < r->mapq? l->mapq : r->mapq;
416
+ }
417
+ }
418
+ kfree(km, aux);
419
+ }
420
+
421
+ void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr)
422
+ {
423
+ static const float q_coef = 40.0f;
424
+ int64_t sum_sc = 0;
425
+ float uniq_ratio;
426
+ int i;
427
+ if (n_regs == 0) return;
428
+ for (i = 0; i < n_regs; ++i)
429
+ if (regs[i].parent == regs[i].id)
430
+ sum_sc += regs[i].score;
431
+ uniq_ratio = (float)sum_sc / (sum_sc + rep_len);
432
+ for (i = 0; i < n_regs; ++i) {
433
+ mm_reg1_t *r = &regs[i];
434
+ if (r->inv) {
435
+ r->mapq = 0;
436
+ } else if (r->parent == r->id) {
437
+ int mapq, subsc;
438
+ float pen_s1 = (r->score > 100? 1.0f : 0.01f * r->score) * uniq_ratio;
439
+ float pen_cm = r->cnt > 10? 1.0f : 0.1f * r->cnt;
440
+ pen_cm = pen_s1 < pen_cm? pen_s1 : pen_cm;
441
+ subsc = r->subsc > min_chain_sc? r->subsc : min_chain_sc;
442
+ if (r->p && r->p->dp_max2 > 0 && r->p->dp_max > 0) {
443
+ float identity = (float)r->mlen / r->blen;
444
+ float x = (float)r->p->dp_max2 * subsc / r->p->dp_max / r->score0;
445
+ mapq = (int)(identity * pen_cm * q_coef * (1.0f - x * x) * logf((float)r->p->dp_max / match_sc));
446
+ if (!is_sr) {
447
+ int mapq_alt = (int)(6.02f * identity * identity * (r->p->dp_max - r->p->dp_max2) / match_sc + .499f); // BWA-MEM like mapQ, mostly for short reads
448
+ mapq = mapq < mapq_alt? mapq : mapq_alt; // in case the long-read heuristic fails
449
+ }
450
+ } else {
451
+ float x = (float)subsc / r->score0;
452
+ if (r->p) {
453
+ float identity = (float)r->mlen / r->blen;
454
+ mapq = (int)(identity * pen_cm * q_coef * (1.0f - x) * logf((float)r->p->dp_max / match_sc));
455
+ } else {
456
+ mapq = (int)(pen_cm * q_coef * (1.0f - x) * logf(r->score));
457
+ }
458
+ }
459
+ mapq -= (int)(4.343f * logf(r->n_sub + 1) + .499f);
460
+ mapq = mapq > 0? mapq : 0;
461
+ r->mapq = mapq < 60? mapq : 60;
462
+ if (r->p && r->p->dp_max > r->p->dp_max2 && r->mapq == 0) r->mapq = 1;
463
+ } else r->mapq = 0;
464
+ }
465
+ mm_set_inv_mapq(km, n_regs, regs);
466
+ }