minimap2 0.2.22.0 → 0.2.24.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +55 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +821 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +369 -0
  41. data/ext/minimap2/main.c +459 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +410 -0
  44. data/ext/minimap2/minimap2.1 +725 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +132 -0
  50. data/ext/minimap2/options.c +234 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/lib/minimap2/aligner.rb +4 -4
  93. data/lib/minimap2/alignment.rb +11 -11
  94. data/lib/minimap2/ffi/constants.rb +20 -16
  95. data/lib/minimap2/ffi/functions.rb +5 -0
  96. data/lib/minimap2/ffi.rb +4 -5
  97. data/lib/minimap2/version.rb +2 -2
  98. data/lib/minimap2.rb +51 -15
  99. metadata +97 -79
  100. data/lib/minimap2/ffi_helper.rb +0 -53
  101. data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,466 @@
1
+ #include <string.h>
2
+ #include <stdlib.h>
3
+ #include <math.h>
4
+ #include "mmpriv.h"
5
+ #include "kalloc.h"
6
+ #include "khash.h"
7
+
8
+ static inline void mm_cal_fuzzy_len(mm_reg1_t *r, const mm128_t *a)
9
+ {
10
+ int i;
11
+ r->mlen = r->blen = 0;
12
+ if (r->cnt <= 0) return;
13
+ r->mlen = r->blen = a[r->as].y>>32&0xff;
14
+ for (i = r->as + 1; i < r->as + r->cnt; ++i) {
15
+ int span = a[i].y>>32&0xff;
16
+ int tl = (int32_t)a[i].x - (int32_t)a[i-1].x;
17
+ int ql = (int32_t)a[i].y - (int32_t)a[i-1].y;
18
+ r->blen += tl > ql? tl : ql;
19
+ r->mlen += tl > span && ql > span? span : tl < ql? tl : ql;
20
+ }
21
+ }
22
+
23
+ static inline void mm_reg_set_coor(mm_reg1_t *r, int32_t qlen, const mm128_t *a, int is_qstrand)
24
+ { // NB: r->as and r->cnt MUST BE set correctly for this function to work
25
+ int32_t k = r->as, q_span = (int32_t)(a[k].y>>32&0xff);
26
+ r->rev = a[k].x>>63;
27
+ r->rid = a[k].x<<1>>33;
28
+ r->rs = (int32_t)a[k].x + 1 > q_span? (int32_t)a[k].x + 1 - q_span : 0; // NB: target span may be shorter, so this test is necessary
29
+ r->re = (int32_t)a[k + r->cnt - 1].x + 1;
30
+ if (!r->rev || is_qstrand) {
31
+ r->qs = (int32_t)a[k].y + 1 - q_span;
32
+ r->qe = (int32_t)a[k + r->cnt - 1].y + 1;
33
+ } else {
34
+ r->qs = qlen - ((int32_t)a[k + r->cnt - 1].y + 1);
35
+ r->qe = qlen - ((int32_t)a[k].y + 1 - q_span);
36
+ }
37
+ mm_cal_fuzzy_len(r, a);
38
+ }
39
+
40
/*
 * Mix a 64-bit key through a fixed sequence of shift/add and shift/xor steps.
 * All arithmetic is unsigned and therefore wraps modulo 2^64.
 */
static inline uint64_t hash64(uint64_t key)
{
	uint64_t h = ~key + (key << 21);

	h ^= h >> 24;
	h += (h << 3) + (h << 8);
	h ^= h >> 14;
	h += (h << 2) + (h << 4);
	h ^= h >> 28;
	h += h << 31;
	return h;
}
51
+
52
+ mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a, int is_qstrand) // convert chains to hits
53
+ {
54
+ mm128_t *z, tmp;
55
+ mm_reg1_t *r;
56
+ int i, k;
57
+
58
+ if (n_u == 0) return 0;
59
+
60
+ // sort by score
61
+ z = (mm128_t*)kmalloc(km, n_u * 16);
62
+ for (i = k = 0; i < n_u; ++i) {
63
+ uint32_t h;
64
+ h = (uint32_t)hash64((hash64(a[k].x) + hash64(a[k].y)) ^ hash);
65
+ z[i].x = u[i] ^ h; // u[i] -- higher 32 bits: chain score; lower 32 bits: number of seeds in the chain
66
+ z[i].y = (uint64_t)k << 32 | (int32_t)u[i];
67
+ k += (int32_t)u[i];
68
+ }
69
+ radix_sort_128x(z, z + n_u);
70
+ for (i = 0; i < n_u>>1; ++i) // reverse, s.t. larger score first
71
+ tmp = z[i], z[i] = z[n_u-1-i], z[n_u-1-i] = tmp;
72
+
73
+ // populate r[]
74
+ r = (mm_reg1_t*)calloc(n_u, sizeof(mm_reg1_t));
75
+ for (i = 0; i < n_u; ++i) {
76
+ mm_reg1_t *ri = &r[i];
77
+ ri->id = i;
78
+ ri->parent = MM_PARENT_UNSET;
79
+ ri->score = ri->score0 = z[i].x >> 32;
80
+ ri->hash = (uint32_t)z[i].x;
81
+ ri->cnt = (int32_t)z[i].y;
82
+ ri->as = z[i].y >> 32;
83
+ ri->div = -1.0f;
84
+ mm_reg_set_coor(ri, qlen, a, is_qstrand);
85
+ }
86
+ kfree(km, z);
87
+ return r;
88
+ }
89
+
90
+ void mm_mark_alt(const mm_idx_t *mi, int n, mm_reg1_t *r)
91
+ {
92
+ int i;
93
+ if (mi->n_alt == 0) return;
94
+ for (i = 0; i < n; ++i)
95
+ if (mi->seq[r[i].rid].is_alt)
96
+ r[i].is_alt = 1;
97
+ }
98
+
99
/*
 * Scale a non-negative score by (1 - alt_diff_frac), adding .499 before
 * truncation; the result is never below 1. Negative scores are returned unchanged.
 */
static inline int mm_alt_score(int score, float alt_diff_frac)
{
	int scaled;

	if (score < 0)
		return score;
	scaled = (int)(score * (1.0 - alt_diff_frac) + .499);
	if (scaled > 0)
		return scaled;
	return 1;
}
105
+
106
+ void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a, int is_qstrand)
107
+ {
108
+ if (n <= 0 || n >= r->cnt) return;
109
+ *r2 = *r;
110
+ r2->id = -1;
111
+ r2->sam_pri = 0;
112
+ r2->p = 0;
113
+ r2->split_inv = 0;
114
+ r2->cnt = r->cnt - n;
115
+ r2->score = (int32_t)(r->score * ((float)r2->cnt / r->cnt) + .499);
116
+ r2->as = r->as + n;
117
+ if (r->parent == r->id) r2->parent = MM_PARENT_TMP_PRI;
118
+ mm_reg_set_coor(r2, qlen, a, is_qstrand);
119
+ r->cnt -= r2->cnt;
120
+ r->score -= r2->score;
121
+ mm_reg_set_coor(r, qlen, a, is_qstrand);
122
+ r->split |= 1, r2->split |= 2;
123
+ }
124
+
125
+ void mm_set_parent(void *km, float mask_level, int mask_len, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac) // and compute mm_reg1_t::subsc
126
+ {
127
+ int i, j, k, *w;
128
+ uint64_t *cov;
129
+ if (n <= 0) return;
130
+ for (i = 0; i < n; ++i) r[i].id = i;
131
+ cov = (uint64_t*)kmalloc(km, n * sizeof(uint64_t));
132
+ w = (int*)kmalloc(km, n * sizeof(int));
133
+ w[0] = 0, r[0].parent = 0;
134
+ for (i = 1, k = 1; i < n; ++i) {
135
+ mm_reg1_t *ri = &r[i];
136
+ int si = ri->qs, ei = ri->qe, n_cov = 0, uncov_len = 0;
137
+ if (hard_mask_level) goto skip_uncov;
138
+ for (j = 0; j < k; ++j) { // traverse existing primary hits to find overlapping hits
139
+ mm_reg1_t *rp = &r[w[j]];
140
+ int sj = rp->qs, ej = rp->qe;
141
+ if (ej <= si || sj >= ei) continue;
142
+ if (sj < si) sj = si;
143
+ if (ej > ei) ej = ei;
144
+ cov[n_cov++] = (uint64_t)sj<<32 | ej;
145
+ }
146
+ if (n_cov == 0) {
147
+ goto set_parent_test; // no overlapping primary hits; then i is a new primary hit
148
+ } else if (n_cov > 0) { // there are overlapping primary hits; find the length not covered by existing primary hits
149
+ int j, x = si;
150
+ radix_sort_64(cov, cov + n_cov);
151
+ for (j = 0; j < n_cov; ++j) {
152
+ if ((int)(cov[j]>>32) > x) uncov_len += (cov[j]>>32) - x;
153
+ x = (int32_t)cov[j] > x? (int32_t)cov[j] : x;
154
+ }
155
+ if (ei > x) uncov_len += ei - x;
156
+ }
157
+ skip_uncov:
158
+ for (j = 0; j < k; ++j) { // traverse existing primary hits again
159
+ mm_reg1_t *rp = &r[w[j]];
160
+ int sj = rp->qs, ej = rp->qe, min, max, ol;
161
+ if (ej <= si || sj >= ei) continue; // no overlap
162
+ min = ej - sj < ei - si? ej - sj : ei - si;
163
+ max = ej - sj > ei - si? ej - sj : ei - si;
164
+ ol = si < sj? (ei < sj? 0 : ei < ej? ei - sj : ej - sj) : (ej < si? 0 : ej < ei? ej - si : ei - si); // overlap length; TODO: this can be simplified
165
+ if ((float)ol / min - (float)uncov_len / max > mask_level && uncov_len <= mask_len) { // then this is a secondary hit
166
+ int cnt_sub = 0, sci = ri->score;
167
+ ri->parent = rp->parent;
168
+ if (!rp->is_alt && ri->is_alt) sci = mm_alt_score(sci, alt_diff_frac);
169
+ rp->subsc = rp->subsc > sci? rp->subsc : sci;
170
+ if (ri->cnt >= rp->cnt) cnt_sub = 1;
171
+ if (rp->p && ri->p && (rp->rid != ri->rid || rp->rs != ri->rs || rp->re != ri->re || ol != min)) { // the last condition excludes identical hits after DP
172
+ sci = ri->p->dp_max;
173
+ if (!rp->is_alt && ri->is_alt) sci = mm_alt_score(sci, alt_diff_frac);
174
+ rp->p->dp_max2 = rp->p->dp_max2 > sci? rp->p->dp_max2 : sci;
175
+ if (rp->p->dp_max - ri->p->dp_max <= sub_diff) cnt_sub = 1;
176
+ }
177
+ if (cnt_sub) ++rp->n_sub;
178
+ break;
179
+ }
180
+ }
181
+ set_parent_test:
182
+ if (j == k) w[k++] = i, ri->parent = i, ri->n_sub = 0;
183
+ }
184
+ kfree(km, cov);
185
+ kfree(km, w);
186
+ }
187
+
188
+ void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r, float alt_diff_frac)
189
+ {
190
+ int32_t i, n_aux, n = *n_regs, has_cigar = 0, no_cigar = 0;
191
+ mm128_t *aux;
192
+ mm_reg1_t *t;
193
+
194
+ if (n <= 1) return;
195
+ aux = (mm128_t*)kmalloc(km, n * 16);
196
+ t = (mm_reg1_t*)kmalloc(km, n * sizeof(mm_reg1_t));
197
+ for (i = n_aux = 0; i < n; ++i) {
198
+ if (r[i].inv || r[i].cnt > 0) { // squeeze out elements with cnt==0 (soft deleted)
199
+ int score;
200
+ if (r[i].p) score = r[i].p->dp_max, has_cigar = 1;
201
+ else score = r[i].score, no_cigar = 1;
202
+ if (r[i].is_alt) score = mm_alt_score(score, alt_diff_frac);
203
+ aux[n_aux].x = (uint64_t)score << 32 | r[i].hash;
204
+ aux[n_aux++].y = i;
205
+ } else if (r[i].p) {
206
+ free(r[i].p);
207
+ r[i].p = 0;
208
+ }
209
+ }
210
+ assert(has_cigar + no_cigar == 1);
211
+ radix_sort_128x(aux, aux + n_aux);
212
+ for (i = n_aux - 1; i >= 0; --i)
213
+ t[n_aux - 1 - i] = r[aux[i].y];
214
+ memcpy(r, t, sizeof(mm_reg1_t) * n_aux);
215
+ *n_regs = n_aux;
216
+ kfree(km, aux);
217
+ kfree(km, t);
218
+ }
219
+
220
+ int mm_set_sam_pri(int n, mm_reg1_t *r)
221
+ {
222
+ int i, n_pri = 0;
223
+ for (i = 0; i < n; ++i)
224
+ if (r[i].id == r[i].parent) {
225
+ ++n_pri;
226
+ r[i].sam_pri = (n_pri == 1);
227
+ } else r[i].sam_pri = 0;
228
+ return n_pri;
229
+ }
230
+
231
+ void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs) // keep mm_reg1_t::{id,parent} in sync; also reset id
232
+ {
233
+ int *tmp, i, max_id = -1, n_tmp;
234
+ if (n_regs <= 0) return;
235
+ for (i = 0; i < n_regs; ++i) // NB: doesn't work if mm_reg1_t::id is negative
236
+ max_id = max_id > regs[i].id? max_id : regs[i].id;
237
+ n_tmp = max_id + 1;
238
+ tmp = (int*)kmalloc(km, n_tmp * sizeof(int));
239
+ for (i = 0; i < n_tmp; ++i) tmp[i] = -1;
240
+ for (i = 0; i < n_regs; ++i)
241
+ if (regs[i].id >= 0) tmp[regs[i].id] = i;
242
+ for (i = 0; i < n_regs; ++i) {
243
+ mm_reg1_t *r = &regs[i];
244
+ r->id = i;
245
+ if (r->parent == MM_PARENT_TMP_PRI)
246
+ r->parent = i;
247
+ else if (r->parent >= 0 && tmp[r->parent] >= 0)
248
+ r->parent = tmp[r->parent];
249
+ else r->parent = MM_PARENT_UNSET;
250
+ }
251
+ kfree(km, tmp);
252
+ mm_set_sam_pri(n_regs, regs);
253
+ }
254
+
255
+ void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int check_strand, int min_strand_sc, int *n_, mm_reg1_t *r)
256
+ {
257
+ if (pri_ratio > 0.0f && *n_ > 0) {
258
+ int i, k, n = *n_, n_2nd = 0;
259
+ for (i = k = 0; i < n; ++i) {
260
+ int p = r[i].parent;
261
+ if (p == i || r[i].inv) { // primary or inversion
262
+ r[k++] = r[i];
263
+ } else if ((r[i].score >= r[p].score * pri_ratio || r[i].score + min_diff >= r[p].score) && n_2nd < best_n) {
264
+ if (!(r[i].qs == r[p].qs && r[i].qe == r[p].qe && r[i].rid == r[p].rid && r[i].rs == r[p].rs && r[i].re == r[p].re)) // not identical hits
265
+ r[k++] = r[i], ++n_2nd;
266
+ else if (r[i].p) free(r[i].p);
267
+ } else if (check_strand && n_2nd < best_n && r[i].score > min_strand_sc && r[i].rev != r[p].rev) {
268
+ r[i].strand_retained = 1;
269
+ r[k++] = r[i], ++n_2nd;
270
+ } else if (r[i].p) free(r[i].p);
271
+ }
272
+ if (k != n) mm_sync_regs(km, k, r); // removing hits requires sync()
273
+ *n_ = k;
274
+ }
275
+ }
276
+
277
+ int mm_filter_strand_retained(int n_regs, mm_reg1_t *r)
278
+ {
279
+ int i, k;
280
+ for (i = k = 0; i < n_regs; ++i) {
281
+ int p = r[i].parent;
282
+ if (!r[i].strand_retained || r[i].div < r[p].div * 5.0f || r[i].div < 0.01f) {
283
+ if (k < i) r[k++] = r[i];
284
+ else ++k;
285
+ }
286
+ }
287
+ return k;
288
+ }
289
+
290
+ void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs)
291
+ { // NB: after this call, mm_reg1_t::parent can be -1 if its parent filtered out
292
+ int i, k;
293
+ for (i = k = 0; i < *n_regs; ++i) {
294
+ mm_reg1_t *r = &regs[i];
295
+ int flt = 0;
296
+ if (!r->inv && !r->seg_split && r->cnt < opt->min_cnt) flt = 1;
297
+ if (r->p) { // these filters are only applied when base-alignment is available
298
+ if (r->mlen < opt->min_chain_score) flt = 1;
299
+ else if (r->p->dp_max < opt->min_dp_max) flt = 1;
300
+ else if (r->qs > qlen * opt->max_clip_ratio && qlen - r->qe > qlen * opt->max_clip_ratio) flt = 1;
301
+ if (flt) free(r->p);
302
+ }
303
+ if (!flt) {
304
+ if (k < i) regs[k++] = regs[i];
305
+ else ++k;
306
+ }
307
+ }
308
+ *n_regs = k;
309
+ }
310
+
311
+ int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a)
312
+ { // squeeze out regions in a[] that are not referenced by regs[]
313
+ int i, as = 0;
314
+ uint64_t *aux;
315
+ aux = (uint64_t*)kmalloc(km, n_regs * 8);
316
+ for (i = 0; i < n_regs; ++i)
317
+ aux[i] = (uint64_t)regs[i].as << 32 | i;
318
+ radix_sort_64(aux, aux + n_regs);
319
+ for (i = 0; i < n_regs; ++i) {
320
+ mm_reg1_t *r = &regs[(int32_t)aux[i]];
321
+ if (r->as != as) {
322
+ memmove(&a[as], &a[r->as], r->cnt * 16);
323
+ r->as = as;
324
+ }
325
+ as += r->cnt;
326
+ }
327
+ kfree(km, aux);
328
+ return as;
329
+ }
330
+
331
+ mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a)
332
+ {
333
+ int s, i, j, acc_qlen[MM_MAX_SEG+1], qlen_sum = 0;
334
+ mm_seg_t *seg;
335
+
336
+ assert(n_segs <= MM_MAX_SEG);
337
+ for (s = 1, acc_qlen[0] = 0; s < n_segs; ++s)
338
+ acc_qlen[s] = acc_qlen[s-1] + qlens[s-1];
339
+ qlen_sum = acc_qlen[n_segs - 1] + qlens[n_segs - 1];
340
+
341
+ seg = (mm_seg_t*)kcalloc(km, n_segs, sizeof(mm_seg_t));
342
+ for (s = 0; s < n_segs; ++s) {
343
+ seg[s].u = (uint64_t*)kmalloc(km, n_regs0 * 8);
344
+ for (i = 0; i < n_regs0; ++i)
345
+ seg[s].u[i] = (uint64_t)regs0[i].score << 32;
346
+ }
347
+ for (i = 0; i < n_regs0; ++i) {
348
+ const mm_reg1_t *r = &regs0[i];
349
+ for (j = 0; j < r->cnt; ++j) {
350
+ int sid = (a[r->as + j].y&MM_SEED_SEG_MASK)>>MM_SEED_SEG_SHIFT;
351
+ ++seg[sid].u[i];
352
+ ++seg[sid].n_a;
353
+ }
354
+ }
355
+ for (s = 0; s < n_segs; ++s) {
356
+ mm_seg_t *sr = &seg[s];
357
+ for (i = 0, sr->n_u = 0; i < n_regs0; ++i) // squeeze out zero-length per-segment chains
358
+ if ((int32_t)sr->u[i] != 0)
359
+ sr->u[sr->n_u++] = sr->u[i];
360
+ sr->a = (mm128_t*)kmalloc(km, sr->n_a * sizeof(mm128_t));
361
+ sr->n_a = 0;
362
+ }
363
+
364
+ for (i = 0; i < n_regs0; ++i) {
365
+ const mm_reg1_t *r = &regs0[i];
366
+ for (j = 0; j < r->cnt; ++j) {
367
+ int sid = (a[r->as + j].y&MM_SEED_SEG_MASK)>>MM_SEED_SEG_SHIFT;
368
+ mm128_t a1 = a[r->as + j];
369
+ // on reverse strand, the segment position is:
370
+ // x_for_cat = qlen_sum - 1 - (int32_t)a1.y - 1 + q_span
371
+ // (int32_t)new_a1.y = qlens[sid] - (x_for_cat - acc_qlen[sid] + 1 - q_span) - 1 = (int32_t)a1.y - (qlen_sum - (qlens[sid] + acc_qlen[sid]))
372
+ a1.y -= a1.x>>63? qlen_sum - (qlens[sid] + acc_qlen[sid]) : acc_qlen[sid];
373
+ seg[sid].a[seg[sid].n_a++] = a1;
374
+ }
375
+ }
376
+ for (s = 0; s < n_segs; ++s) {
377
+ regs[s] = mm_gen_regs(km, hash, qlens[s], seg[s].n_u, seg[s].u, seg[s].a, 0);
378
+ n_regs[s] = seg[s].n_u;
379
+ for (i = 0; i < n_regs[s]; ++i) {
380
+ regs[s][i].seg_split = 1;
381
+ regs[s][i].seg_id = s;
382
+ }
383
+ }
384
+ return seg;
385
+ }
386
+
387
+ void mm_seg_free(void *km, int n_segs, mm_seg_t *segs)
388
+ {
389
+ int i;
390
+ for (i = 0; i < n_segs; ++i) kfree(km, segs[i].u);
391
+ for (i = 0; i < n_segs; ++i) kfree(km, segs[i].a);
392
+ kfree(km, segs);
393
+ }
394
+
395
+ static void mm_set_inv_mapq(void *km, int n_regs, mm_reg1_t *regs)
396
+ {
397
+ int i, n_aux;
398
+ mm128_t *aux;
399
+ if (n_regs < 3) return;
400
+ for (i = 0; i < n_regs; ++i)
401
+ if (regs[i].inv) break;
402
+ if (i == n_regs) return; // no inversion hits
403
+
404
+ aux = (mm128_t*)kmalloc(km, n_regs * 16);
405
+ for (i = n_aux = 0; i < n_regs; ++i)
406
+ if (regs[i].parent == i || regs[i].parent < 0)
407
+ aux[n_aux].y = i, aux[n_aux++].x = (uint64_t)regs[i].rid << 32 | regs[i].rs;
408
+ radix_sort_128x(aux, aux + n_aux);
409
+
410
+ for (i = 1; i < n_aux - 1; ++i) {
411
+ mm_reg1_t *inv = &regs[aux[i].y];
412
+ if (inv->inv) {
413
+ mm_reg1_t *l = &regs[aux[i-1].y];
414
+ mm_reg1_t *r = &regs[aux[i+1].y];
415
+ inv->mapq = l->mapq < r->mapq? l->mapq : r->mapq;
416
+ }
417
+ }
418
+ kfree(km, aux);
419
+ }
420
+
421
+ void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr)
422
+ {
423
+ static const float q_coef = 40.0f;
424
+ int64_t sum_sc = 0;
425
+ float uniq_ratio;
426
+ int i;
427
+ if (n_regs == 0) return;
428
+ for (i = 0; i < n_regs; ++i)
429
+ if (regs[i].parent == regs[i].id)
430
+ sum_sc += regs[i].score;
431
+ uniq_ratio = (float)sum_sc / (sum_sc + rep_len);
432
+ for (i = 0; i < n_regs; ++i) {
433
+ mm_reg1_t *r = &regs[i];
434
+ if (r->inv) {
435
+ r->mapq = 0;
436
+ } else if (r->parent == r->id) {
437
+ int mapq, subsc;
438
+ float pen_s1 = (r->score > 100? 1.0f : 0.01f * r->score) * uniq_ratio;
439
+ float pen_cm = r->cnt > 10? 1.0f : 0.1f * r->cnt;
440
+ pen_cm = pen_s1 < pen_cm? pen_s1 : pen_cm;
441
+ subsc = r->subsc > min_chain_sc? r->subsc : min_chain_sc;
442
+ if (r->p && r->p->dp_max2 > 0 && r->p->dp_max > 0) {
443
+ float identity = (float)r->mlen / r->blen;
444
+ float x = (float)r->p->dp_max2 * subsc / r->p->dp_max / r->score0;
445
+ mapq = (int)(identity * pen_cm * q_coef * (1.0f - x * x) * logf((float)r->p->dp_max / match_sc));
446
+ if (!is_sr) {
447
+ int mapq_alt = (int)(6.02f * identity * identity * (r->p->dp_max - r->p->dp_max2) / match_sc + .499f); // BWA-MEM like mapQ, mostly for short reads
448
+ mapq = mapq < mapq_alt? mapq : mapq_alt; // in case the long-read heuristic fails
449
+ }
450
+ } else {
451
+ float x = (float)subsc / r->score0;
452
+ if (r->p) {
453
+ float identity = (float)r->mlen / r->blen;
454
+ mapq = (int)(identity * pen_cm * q_coef * (1.0f - x) * logf((float)r->p->dp_max / match_sc));
455
+ } else {
456
+ mapq = (int)(pen_cm * q_coef * (1.0f - x) * logf(r->score));
457
+ }
458
+ }
459
+ mapq -= (int)(4.343f * logf(r->n_sub + 1) + .499f);
460
+ mapq = mapq > 0? mapq : 0;
461
+ r->mapq = mapq < 60? mapq : 60;
462
+ if (r->p && r->p->dp_max > r->p->dp_max2 && r->mapq == 0) r->mapq = 1;
463
+ } else r->mapq = 0;
464
+ }
465
+ mm_set_inv_mapq(km, n_regs, regs);
466
+ }