minimap2 0.2.22.0 → 0.2.24.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +55 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +821 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +369 -0
  41. data/ext/minimap2/main.c +459 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +410 -0
  44. data/ext/minimap2/minimap2.1 +725 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +132 -0
  50. data/ext/minimap2/options.c +234 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/lib/minimap2/aligner.rb +4 -4
  93. data/lib/minimap2/alignment.rb +11 -11
  94. data/lib/minimap2/ffi/constants.rb +20 -16
  95. data/lib/minimap2/ffi/functions.rb +5 -0
  96. data/lib/minimap2/ffi.rb +4 -5
  97. data/lib/minimap2/version.rb +2 -2
  98. data/lib/minimap2.rb +51 -15
  99. metadata +97 -79
  100. data/lib/minimap2/ffi_helper.rb +0 -53
  101. data/vendor/libminimap2.so +0 -0
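
Judging by the file list, this release switches the gem from shipping a prebuilt data/vendor/libminimap2.so (now removed, along with data/lib/minimap2/ffi_helper.rb) to vendoring the minimap2 C sources under data/ext/minimap2/ and building them at install time via the new data/ext/Rakefile and the cmappy shim in data/ext/cmappy/. The FFI bindings in data/lib/minimap2/ffi/ wrap the minimap2 C API, and the bundled data/ext/minimap2/example.c demonstrates the typical call sequence. The sketch below is a condensed version of that example, not code from the gem itself; error handling is omitted, and the two-argument usage (a reference FASTA plus a raw query string) is an assumption made for brevity.

// Condensed from minimap2's bundled example.c: build an index from a FASTA
// file and map a single 0-terminated query string with the C API that the
// gem's FFI layer binds. Compile and link against the vendored minimap2.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "minimap.h"

int main(int argc, char *argv[])
{
    mm_idxopt_t iopt;
    mm_mapopt_t mopt;

    if (argc < 3) {
        fprintf(stderr, "Usage: demo <target.fa> <query-sequence>\n");
        return 1;
    }
    mm_verbose = 2;              // print mapping progress, not debug output
    mm_set_opt(0, &iopt, &mopt); // initialize options to the defaults
    mopt.flag |= MM_F_CIGAR;     // request base-level alignment (CIGAR)

    // open the index reader; for a FASTA input the index is built on the fly
    mm_idx_reader_t *r = mm_idx_reader_open(argv[1], &iopt, 0);
    mm_idx_t *mi;
    while ((mi = mm_idx_reader_read(r, 3)) != 0) { // 3 indexing threads
        mm_mapopt_update(&mopt, mi);      // derive index-dependent parameters
        mm_tbuf_t *tbuf = mm_tbuf_init(); // per-thread alignment buffer
        int j, n_reg;
        mm_reg1_t *reg = mm_map(mi, strlen(argv[2]), argv[2], &n_reg, tbuf, &mopt, 0);
        for (j = 0; j < n_reg; ++j) { // print target name, coords and strand
            mm_reg1_t *h = &reg[j];
            printf("%s\t%d\t%d\t%c\n", mi->seq[h->rid].name, h->rs, h->re, "+-"[h->rev]);
            free(h->p);               // per-hit base-level alignment details
        }
        free(reg);
        mm_tbuf_destroy(tbuf);
        mm_idx_destroy(mi);
    }
    mm_idx_reader_close(r);
    return 0;
}

Of the vendored sources, the diff below shows one file in full: ksw2_exts2_sse.c.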
data/ext/minimap2/ksw2_exts2_sse.c
@@ -0,0 +1,416 @@
+ #include <string.h>
+ #include <stdio.h>
+ #include <assert.h>
+ #include "ksw2.h"
+
+ #ifdef __SSE2__
+ #ifdef USE_SIMDE
+ #include <simde/x86/sse2.h>
+ #else
+ #include <emmintrin.h>
+ #endif
+ #ifdef KSW_SSE2_ONLY
+ #undef __SSE4_1__
+ #endif
+
+ #ifdef __SSE4_1__
+ #ifdef USE_SIMDE
+ #include <simde/x86/sse4.1.h>
+ #else
+ #include <smmintrin.h>
+ #endif
+ #endif
+
+ #ifdef KSW_CPU_DISPATCH
+ #ifdef __SSE4_1__
+ void ksw_exts2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
+     int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez)
+ #else
+ void ksw_exts2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
+     int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez)
+ #endif
+ #else
+ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
+     int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez)
+ #endif // ~KSW_CPU_DISPATCH
+ {
+ #define __dp_code_block1 \
+     z = _mm_load_si128(&s[t]); \
+     xt1 = _mm_load_si128(&x[t]); /* xt1 <- x[r-1][t..t+15] */ \
+     tmp = _mm_srli_si128(xt1, 15); /* tmp <- x[r-1][t+15] */ \
+     xt1 = _mm_or_si128(_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \
+     x1_ = tmp; \
+     vt1 = _mm_load_si128(&v[t]); /* vt1 <- v[r-1][t..t+15] */ \
+     tmp = _mm_srli_si128(vt1, 15); /* tmp <- v[r-1][t+15] */ \
+     vt1 = _mm_or_si128(_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \
+     v1_ = tmp; \
+     a = _mm_add_epi8(xt1, vt1); /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \
+     ut = _mm_load_si128(&u[t]); /* ut <- u[t..t+15] */ \
+     b = _mm_add_epi8(_mm_load_si128(&y[t]), ut); /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */ \
+     x2t1= _mm_load_si128(&x2[t]); \
+     tmp = _mm_srli_si128(x2t1, 15); \
+     x2t1= _mm_or_si128(_mm_slli_si128(x2t1, 1), x21_); \
+     x21_= tmp; \
+     a2 = _mm_add_epi8(x2t1, vt1); \
+     a2a = _mm_add_epi8(a2, _mm_load_si128(&acceptor[t]));
+
+ #define __dp_code_block2 \
+     _mm_store_si128(&u[t], _mm_sub_epi8(z, vt1)); /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \
+     _mm_store_si128(&v[t], _mm_sub_epi8(z, ut)); /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \
+     tmp = _mm_sub_epi8(z, q_); \
+     a = _mm_sub_epi8(a, tmp); \
+     b = _mm_sub_epi8(b, tmp); \
+     a2= _mm_sub_epi8(a2, _mm_sub_epi8(z, q2_));
+
+     int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, max_sc, min_sc, long_thres, long_diff;
+     int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX);
+     int32_t *H = 0, H0 = 0, last_H0_t = 0;
+     uint8_t *qr, *sf, *mem, *mem2 = 0;
+     __m128i q_, q2_, qe_, zero_, sc_mch_, sc_mis_, sc_N_, m1_;
+     __m128i *u, *v, *x, *y, *x2, *s, *p = 0, *donor, *acceptor;
+
+     ksw_reset_extz(ez);
+     if (m <= 1 || qlen <= 0 || tlen <= 0 || q2 <= q + e) return;
+
+     zero_ = _mm_set1_epi8(0);
+     q_ = _mm_set1_epi8(q);
+     q2_ = _mm_set1_epi8(q2);
+     qe_ = _mm_set1_epi8(q + e);
+     sc_mch_ = _mm_set1_epi8(mat[0]);
+     sc_mis_ = _mm_set1_epi8(mat[1]);
+     sc_N_ = mat[m*m-1] == 0? _mm_set1_epi8(-e) : _mm_set1_epi8(mat[m*m-1]);
+     m1_ = _mm_set1_epi8(m - 1); // wildcard
+
+     tlen_ = (tlen + 15) / 16;
+     n_col_ = ((qlen < tlen? qlen : tlen) + 15) / 16 + 1;
+     qlen_ = (qlen + 15) / 16;
+     for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) {
+         max_sc = max_sc > mat[t]? max_sc : mat[t];
+         min_sc = min_sc < mat[t]? min_sc : mat[t];
+     }
+     if (-min_sc > 2 * (q + e)) return; // otherwise, we won't see any mismatches
+
+     long_thres = (q2 - q) / e - 1;
+     if (q2 > q + e + long_thres * e)
+         ++long_thres;
+     long_diff = long_thres * e - (q2 - q);
+
+     mem = (uint8_t*)kcalloc(km, tlen_ * 9 + qlen_ + 1, 16);
+     u = (__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned
+     v = u + tlen_, x = v + tlen_, y = x + tlen_, x2 = y + tlen_;
+     donor = x2 + tlen_, acceptor = donor + tlen_;
+     s = acceptor + tlen_, sf = (uint8_t*)(s + tlen_), qr = sf + tlen_ * 16;
+     memset(u, -q - e, tlen_ * 16 * 4); // this set u, v, x, y (because they are in the same array)
+     memset(x2, -q2, tlen_ * 16);
+     if (!approx_max) {
+         H = (int32_t*)kmalloc(km, tlen_ * 16 * 4);
+         for (t = 0; t < tlen_ * 16; ++t) H[t] = KSW_NEG_INF;
+     }
+     if (with_cigar) {
+         mem2 = (uint8_t*)kmalloc(km, ((size_t)(qlen + tlen - 1) * n_col_ + 1) * 16);
+         p = (__m128i*)(((size_t)mem2 + 15) >> 4 << 4);
+         off = (int*)kmalloc(km, (qlen + tlen - 1) * sizeof(int) * 2);
+         off_end = off + qlen + tlen - 1;
+     }
+
+     for (t = 0; t < qlen; ++t) qr[t] = query[qlen - 1 - t];
+     memcpy(sf, target, tlen);
+
+     // set the donor and acceptor arrays. TODO: this assumes 0/1/2/3 encoding!
+     if (flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) {
+         int semi_cost = flag&KSW_EZ_SPLICE_FLANK? -noncan/2 : 0; // GTr or yAG is worth 0.5 bit; see PMID:18688272
+         memset(donor, -noncan, tlen_ * 16);
+         memset(acceptor, -noncan, tlen_ * 16);
+         if (!(flag & KSW_EZ_REV_CIGAR)) {
+             for (t = 0; t < tlen - 4; ++t) {
+                 int can_type = 0; // type of canonical site: 0=none, 1=GT/AG only, 2=GTr/yAG
+                 if ((flag & KSW_EZ_SPLICE_FOR) && target[t+1] == 2 && target[t+2] == 3) can_type = 1; // GTr...
+                 if ((flag & KSW_EZ_SPLICE_REV) && target[t+1] == 1 && target[t+2] == 3) can_type = 1; // CTr...
+                 if (can_type && (target[t+3] == 0 || target[t+3] == 2)) can_type = 2;
+                 if (can_type) ((int8_t*)donor)[t] = can_type == 2? 0 : semi_cost;
+             }
+             if (junc)
+                 for (t = 0; t < tlen - 1; ++t)
+                     if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t+1]&1)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t+1]&8)))
+                         ((int8_t*)donor)[t] += junc_bonus;
+             for (t = 2; t < tlen; ++t) {
+                 int can_type = 0;
+                 if ((flag & KSW_EZ_SPLICE_FOR) && target[t-1] == 0 && target[t] == 2) can_type = 1; // ...yAG
+                 if ((flag & KSW_EZ_SPLICE_REV) && target[t-1] == 0 && target[t] == 1) can_type = 1; // ...yAC
+                 if (can_type && (target[t-2] == 1 || target[t-2] == 3)) can_type = 2;
+                 if (can_type) ((int8_t*)acceptor)[t] = can_type == 2? 0 : semi_cost;
+             }
+             if (junc)
+                 for (t = 0; t < tlen; ++t)
+                     if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t]&2)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t]&4)))
+                         ((int8_t*)acceptor)[t] += junc_bonus;
+         } else {
+             for (t = 0; t < tlen - 4; ++t) {
+                 int can_type = 0; // type of canonical site: 0=none, 1=GT/AG only, 2=GTr/yAG
+                 if ((flag & KSW_EZ_SPLICE_FOR) && target[t+1] == 2 && target[t+2] == 0) can_type = 1; // GAy...
+                 if ((flag & KSW_EZ_SPLICE_REV) && target[t+1] == 1 && target[t+2] == 0) can_type = 1; // CAy...
+                 if (can_type && (target[t+3] == 1 || target[t+3] == 3)) can_type = 2;
+                 if (can_type) ((int8_t*)donor)[t] = can_type == 2? 0 : semi_cost;
+             }
+             if (junc)
+                 for (t = 0; t < tlen - 1; ++t)
+                     if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t+1]&2)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t+1]&4)))
+                         ((int8_t*)donor)[t] += junc_bonus;
+             for (t = 2; t < tlen; ++t) {
+                 int can_type = 0;
+                 if ((flag & KSW_EZ_SPLICE_FOR) && target[t-1] == 3 && target[t] == 2) can_type = 1; // ...rTG
+                 if ((flag & KSW_EZ_SPLICE_REV) && target[t-1] == 3 && target[t] == 1) can_type = 1; // ...rTC
+                 if (can_type && (target[t-2] == 0 || target[t-2] == 2)) can_type = 2;
+                 if (can_type) ((int8_t*)acceptor)[t] = can_type == 2? 0 : semi_cost;
+             }
+             if (junc)
+                 for (t = 0; t < tlen; ++t)
+                     if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t]&1)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t]&8)))
+                         ((int8_t*)acceptor)[t] += junc_bonus;
+         }
+     }
+
+     for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) {
+         int st = 0, en = tlen - 1, st0, en0, st_, en_;
+         int8_t x1, x21, v1, *u8 = (int8_t*)u, *v8 = (int8_t*)v;
+         uint8_t *qrr = qr + (qlen - 1 - r);
+         __m128i x1_, x21_, v1_;
+         // find the boundaries
+         if (st < r - qlen + 1) st = r - qlen + 1;
+         if (en > r) en = r;
+         st0 = st, en0 = en;
+         st = st / 16 * 16, en = (en + 16) / 16 * 16 - 1;
+         // set boundary conditions
+         if (st > 0) {
+             if (st - 1 >= last_st && st - 1 <= last_en)
+                 x1 = ((int8_t*)x)[st - 1], x21 = ((int8_t*)x2)[st - 1], v1 = v8[st - 1]; // (r-1,s-1) calculated in the last round
+             else x1 = -q - e, x21 = -q2, v1 = -q - e;
+         } else {
+             x1 = -q - e, x21 = -q2;
+             v1 = r == 0? -q - e : r < long_thres? -e : r == long_thres? long_diff : 0;
+         }
+         if (en >= r) {
+             ((int8_t*)y)[r] = -q - e;
+             u8[r] = r == 0? -q - e : r < long_thres? -e : r == long_thres? long_diff : 0;
+         }
+         // loop fission: set scores first
+         if (!(flag & KSW_EZ_GENERIC_SC)) {
+             for (t = st0; t <= en0; t += 16) {
+                 __m128i sq, st, tmp, mask;
+                 sq = _mm_loadu_si128((__m128i*)&sf[t]);
+                 st = _mm_loadu_si128((__m128i*)&qrr[t]);
+                 mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_));
+                 tmp = _mm_cmpeq_epi8(sq, st);
+ #ifdef __SSE4_1__
+                 tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
+                 tmp = _mm_blendv_epi8(tmp, sc_N_, mask);
+ #else
+                 tmp = _mm_or_si128(_mm_andnot_si128(tmp, sc_mis_), _mm_and_si128(tmp, sc_mch_));
+                 tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp), _mm_and_si128(mask, sc_N_));
+ #endif
+                 _mm_storeu_si128((__m128i*)((int8_t*)s + t), tmp);
+             }
+         } else {
+             for (t = st0; t <= en0; ++t)
+                 ((uint8_t*)s)[t] = mat[sf[t] * m + qrr[t]];
+         }
+         // core loop
+         x1_ = _mm_cvtsi32_si128((uint8_t)x1);
+         x21_ = _mm_cvtsi32_si128((uint8_t)x21);
+         v1_ = _mm_cvtsi32_si128((uint8_t)v1);
+         st_ = st / 16, en_ = en / 16;
+         assert(en_ - st_ + 1 <= n_col_);
+         if (!with_cigar) { // score only
+             for (t = st_; t <= en_; ++t) {
+                 __m128i z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp;
+                 __dp_code_block1;
+ #ifdef __SSE4_1__
+                 z = _mm_max_epi8(z, a);
+                 z = _mm_max_epi8(z, b);
+                 z = _mm_max_epi8(z, a2a);
+                 __dp_code_block2; // save u[] and v[]; update a, b and a2
+                 _mm_store_si128(&x[t], _mm_sub_epi8(_mm_max_epi8(a, zero_), qe_));
+                 _mm_store_si128(&y[t], _mm_sub_epi8(_mm_max_epi8(b, zero_), qe_));
+                 tmp = _mm_load_si128(&donor[t]);
+                 _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_max_epi8(a2, tmp), q2_));
+ #else
+                 tmp = _mm_cmpgt_epi8(a, z);
+                 z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
+                 tmp = _mm_cmpgt_epi8(b, z);
+                 z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
+                 tmp = _mm_cmpgt_epi8(a2a, z);
+                 z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2a));
+                 __dp_code_block2;
+                 tmp = _mm_cmpgt_epi8(a, zero_);
+                 _mm_store_si128(&x[t], _mm_sub_epi8(_mm_and_si128(tmp, a), qe_));
+                 tmp = _mm_cmpgt_epi8(b, zero_);
+                 _mm_store_si128(&y[t], _mm_sub_epi8(_mm_and_si128(tmp, b), qe_));
+                 tmp = _mm_load_si128(&donor[t]); // TODO: check if this is correct
+                 tmp = _mm_cmpgt_epi8(a2, tmp);
+                 tmp = _mm_or_si128(_mm_andnot_si128(tmp, tmp), _mm_and_si128(tmp, a2));
+                 _mm_store_si128(&x2[t], _mm_sub_epi8(tmp, q2_));
+ #endif
+             }
+         } else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
+             __m128i *pr = p + r * n_col_ - st_;
+             off[r] = st, off_end[r] = en;
+             for (t = st_; t <= en_; ++t) {
+                 __m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2;
+                 __dp_code_block1;
+ #ifdef __SSE4_1__
+                 d = _mm_and_si128(_mm_cmpgt_epi8(a, z), _mm_set1_epi8(1)); // d = a > z? 1 : 0
+                 z = _mm_max_epi8(z, a);
+                 d = _mm_blendv_epi8(d, _mm_set1_epi8(2), _mm_cmpgt_epi8(b, z)); // d = b > z? 2 : d
+                 z = _mm_max_epi8(z, b);
+                 d = _mm_blendv_epi8(d, _mm_set1_epi8(3), _mm_cmpgt_epi8(a2a, z)); // d = a2 > z? 3 : d
+                 z = _mm_max_epi8(z, a2a);
+ #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
+                 tmp = _mm_cmpgt_epi8(a, z);
+                 d = _mm_and_si128(tmp, _mm_set1_epi8(1));
+                 z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
+                 tmp = _mm_cmpgt_epi8(b, z);
+                 d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(2)));
+                 z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
+                 tmp = _mm_cmpgt_epi8(a2a, z);
+                 d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(3)));
+                 z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2a));
+ #endif
+                 __dp_code_block2;
+                 tmp = _mm_cmpgt_epi8(a, zero_);
+                 _mm_store_si128(&x[t], _mm_sub_epi8(_mm_and_si128(tmp, a), qe_));
+                 d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0
+                 tmp = _mm_cmpgt_epi8(b, zero_);
+                 _mm_store_si128(&y[t], _mm_sub_epi8(_mm_and_si128(tmp, b), qe_));
+                 d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0
+
+                 tmp2 = _mm_load_si128(&donor[t]);
+                 tmp = _mm_cmpgt_epi8(a2, tmp2);
+ #ifdef __SSE4_1__
+                 tmp2 = _mm_max_epi8(a2, tmp2);
+ #else
+                 tmp2 = _mm_or_si128(_mm_andnot_si128(tmp, tmp2), _mm_and_si128(tmp, a2));
+ #endif
+                 _mm_store_si128(&x2[t], _mm_sub_epi8(tmp2, q2_));
+                 d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x20)));
+                 _mm_store_si128(&pr[t], d);
+             }
+         } else { // gap right-alignment
+             __m128i *pr = p + r * n_col_ - st_;
+             off[r] = st, off_end[r] = en;
+             for (t = st_; t <= en_; ++t) {
+                 __m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2;
+                 __dp_code_block1;
+ #ifdef __SSE4_1__
+                 d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), _mm_set1_epi8(1)); // d = z > a? 0 : 1
+                 z = _mm_max_epi8(z, a);
+                 d = _mm_blendv_epi8(_mm_set1_epi8(2), d, _mm_cmpgt_epi8(z, b)); // d = z > b? d : 2
+                 z = _mm_max_epi8(z, b);
+                 d = _mm_blendv_epi8(_mm_set1_epi8(3), d, _mm_cmpgt_epi8(z, a2a)); // d = z > a2? d : 3
+                 z = _mm_max_epi8(z, a2a);
+ #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
+                 tmp = _mm_cmpgt_epi8(z, a);
+                 d = _mm_andnot_si128(tmp, _mm_set1_epi8(1));
+                 z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a));
+                 tmp = _mm_cmpgt_epi8(z, b);
+                 d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(2)));
+                 z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b));
+                 tmp = _mm_cmpgt_epi8(z, a2a);
+                 d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(3)));
+                 z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a2a));
+ #endif
+                 __dp_code_block2;
+                 tmp = _mm_cmpgt_epi8(zero_, a);
+                 _mm_store_si128(&x[t], _mm_sub_epi8(_mm_andnot_si128(tmp, a), qe_));
+                 d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0
+                 tmp = _mm_cmpgt_epi8(zero_, b);
+                 _mm_store_si128(&y[t], _mm_sub_epi8(_mm_andnot_si128(tmp, b), qe_));
+                 d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0
+
+                 tmp2 = _mm_load_si128(&donor[t]);
+                 tmp = _mm_cmpgt_epi8(tmp2, a2);
+ #ifdef __SSE4_1__
+                 tmp2 = _mm_max_epi8(tmp2, a2);
+ #else
+                 tmp2 = _mm_or_si128(_mm_andnot_si128(tmp, a2), _mm_and_si128(tmp, tmp2));
+ #endif
+                 _mm_store_si128(&x2[t], _mm_sub_epi8(tmp2, q2_));
+                 d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0
+                 _mm_store_si128(&pr[t], d);
+             }
+         }
+         if (!approx_max) { // find the exact max with a 32-bit score array
+             int32_t max_H, max_t;
+             // compute H[], max_H and max_t
+             if (r > 0) {
+                 int32_t HH[4], tt[4], en1 = st0 + (en0 - st0) / 4 * 4, i;
+                 __m128i max_H_, max_t_;
+                 max_H = H[en0] = en0 > 0? H[en0-1] + u8[en0] : H[en0] + v8[en0]; // special casing the last element
+                 max_t = en0;
+                 max_H_ = _mm_set1_epi32(max_H);
+                 max_t_ = _mm_set1_epi32(max_t);
+                 for (t = st0; t < en1; t += 4) { // this implements: H[t]+=v8[t]-qe; if(H[t]>max_H) max_H=H[t],max_t=t;
+                     __m128i H1, tmp, t_;
+                     H1 = _mm_loadu_si128((__m128i*)&H[t]);
+                     t_ = _mm_setr_epi32(v8[t], v8[t+1], v8[t+2], v8[t+3]);
+                     H1 = _mm_add_epi32(H1, t_);
+                     _mm_storeu_si128((__m128i*)&H[t], H1);
+                     t_ = _mm_set1_epi32(t);
+                     tmp = _mm_cmpgt_epi32(H1, max_H_);
+ #ifdef __SSE4_1__
+                     max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
+                     max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
+ #else
+                     max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
+                     max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
+ #endif
+                 }
+                 _mm_storeu_si128((__m128i*)HH, max_H_);
+                 _mm_storeu_si128((__m128i*)tt, max_t_);
+                 for (i = 0; i < 4; ++i)
+                     if (max_H < HH[i]) max_H = HH[i], max_t = tt[i] + i;
+                 for (; t < en0; ++t) { // for the rest of values that haven't been computed with SSE
+                     H[t] += (int32_t)v8[t];
+                     if (H[t] > max_H)
+                         max_H = H[t], max_t = t;
+                 }
+             } else H[0] = v8[0] - qe, max_H = H[0], max_t = 0; // special casing r==0
+             // update ez
+             if (en0 == tlen - 1 && H[en0] > ez->mte)
+                 ez->mte = H[en0], ez->mte_q = r - en;
+             if (r - st0 == qlen - 1 && H[st0] > ez->mqe)
+                 ez->mqe = H[st0], ez->mqe_t = st0;
+             if (ksw_apply_zdrop(ez, 1, max_H, r, max_t, zdrop, 0)) break;
+             if (r == qlen + tlen - 2 && en0 == tlen - 1)
+                 ez->score = H[tlen - 1];
+         } else { // find approximate max; Z-drop might be inaccurate, too.
+             if (r > 0) {
+                 if (last_H0_t >= st0 && last_H0_t <= en0 && last_H0_t + 1 >= st0 && last_H0_t + 1 <= en0) {
+                     int32_t d0 = v8[last_H0_t];
+                     int32_t d1 = u8[last_H0_t + 1];
+                     if (d0 > d1) H0 += d0;
+                     else H0 += d1, ++last_H0_t;
+                 } else if (last_H0_t >= st0 && last_H0_t <= en0) {
+                     H0 += v8[last_H0_t];
+                 } else {
+                     ++last_H0_t, H0 += u8[last_H0_t];
+                 }
+             } else H0 = v8[0] - qe, last_H0_t = 0;
+             if ((flag & KSW_EZ_APPROX_DROP) && ksw_apply_zdrop(ez, 1, H0, r, last_H0_t, zdrop, 0)) break;
+             if (r == qlen + tlen - 2 && en0 == tlen - 1)
+                 ez->score = H0;
+         }
+         last_st = st, last_en = en;
+         //for (t = st0; t <= en0; ++t) printf("(%d,%d)\t(%d,%d,%d,%d)\t%d\n", r, t, ((int8_t*)u)[t], ((int8_t*)v)[t], ((int8_t*)x)[t], ((int8_t*)y)[t], H[t]); // for debugging
+     }
+     kfree(km, mem);
+     if (!approx_max) kfree(km, H);
+     if (with_cigar) { // backtrack
+         int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
+         if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY))
+             ksw_backtrack(km, 1, rev_cigar, long_thres, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
+         else if (ez->max_t >= 0 && ez->max_q >= 0)
+             ksw_backtrack(km, 1, rev_cigar, long_thres, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
+         kfree(km, mem2); kfree(km, off);
+     }
+ }
+ #endif // __SSE2__
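
The hunk above adds ksw2_exts2_sse.c: the 8-bit SSE2/SSE4.1 kernel minimap2 uses for spliced ("exts") extension alignment with a two-piece gap cost (q/e for ordinary gaps, q2 for long-gap/intron opens) plus donor/acceptor splice-site scoring. In practice it is reached through mm_map(), but it can also be called directly. The sketch below is a minimal, hypothetical caller, assuming a build without KSW_CPU_DISPATCH so the plain ksw_exts2_sse symbol is emitted (with dispatch you get the _sse41/_sse2 variants instead); link it with ksw2_exts2_sse.c and kalloc.c from the vendored tree. Note the preconditions visible in the code: sequences must be 0/1/2/3-encoded, and q2 must exceed q + e or the function returns immediately. The scoring values below are arbitrary.

// Hypothetical direct use of the spliced-extension kernel added above.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "ksw2.h"

// Fill an m x m scoring matrix (match +a, mismatch -b, last row/column 0 for
// the ambiguous base), in the style of ksw2's matrix helper.
static void gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b)
{
    int i, j;
    a = a < 0? -a : a;
    b = b > 0? -b : b;
    for (i = 0; i < m - 1; ++i) {
        for (j = 0; j < m - 1; ++j)
            mat[i * m + j] = i == j? a : b;
        mat[i * m + m - 1] = 0;
    }
    for (j = 0; j < m; ++j)
        mat[(m - 1) * m + j] = 0;
}

int main(void)
{
    // 0/1/2/3-encoded A/C/G/T; the kernel assumes this encoding (see the
    // "TODO: this assumes 0/1/2/3 encoding!" comment in the source above).
    uint8_t query[]  = { 0, 1, 2, 3, 0, 1, 2, 3 };
    uint8_t target[] = { 0, 1, 2, 3, 3, 0, 1, 2, 3 };
    int8_t mat[25];
    ksw_extz_t ez;
    int i;

    memset(&ez, 0, sizeof(ksw_extz_t)); // zero-init so cigar starts empty
    gen_simple_mat(5, mat, 2, 4);       // match +2, mismatch -4
    // q/e: open/extend for short gaps; q2: the long-gap (intron) open cost.
    // The kernel bails out early unless q2 > q + e.
    ksw_exts2_sse(0, 8, query, 9, target, 5, mat,
                  4, 2, 24,          // q, e, q2
                  0,                 // noncan: non-canonical splice penalty
                  200,               // zdrop
                  0,                 // junc_bonus (no junction annotation)
                  KSW_EZ_SPLICE_FOR, // score GT..AG on the forward strand
                  0, &ez);           // junc == NULL: no annotated junctions
    printf("score=%d\tCIGAR=", ez.score);
    for (i = 0; i < ez.n_cigar; ++i)   // op 3 ('N') marks an intron
        printf("%d%c", ez.cigar[i] >> 4, "MIDN"[ez.cigar[i] & 0xf]);
    putchar('\n');
    free(ez.cigar); // km == 0 above, so ksw2's kalloc fell back to malloc()
    return 0;
}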