minimap2 0.2.22.0 → 0.2.24.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +55 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +821 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +369 -0
  41. data/ext/minimap2/main.c +459 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +410 -0
  44. data/ext/minimap2/minimap2.1 +725 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +132 -0
  50. data/ext/minimap2/options.c +234 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/lib/minimap2/aligner.rb +4 -4
  93. data/lib/minimap2/alignment.rb +11 -11
  94. data/lib/minimap2/ffi/constants.rb +20 -16
  95. data/lib/minimap2/ffi/functions.rb +5 -0
  96. data/lib/minimap2/ffi.rb +4 -5
  97. data/lib/minimap2/version.rb +2 -2
  98. data/lib/minimap2.rb +51 -15
  99. metadata +97 -79
  100. data/lib/minimap2/ffi_helper.rb +0 -53
  101. data/vendor/libminimap2.so +0 -0
data/ext/minimap2/ksw2_extz2_sse.c
@@ -0,0 +1,313 @@
+ #include <string.h>
+ #include <assert.h>
+ #include "ksw2.h"
+
+ #ifdef __SSE2__
+ #ifdef USE_SIMDE
+ #include <simde/x86/sse2.h>
+ #else
+ #include <emmintrin.h>
+ #endif
+
+ #ifdef KSW_SSE2_ONLY
+ #undef __SSE4_1__
+ #endif
+
+ #ifdef __SSE4_1__
+ #ifdef USE_SIMDE
+ #include <simde/x86/sse4.1.h>
+ #else
+ #include <smmintrin.h>
+ #endif
+ #endif
+
+ #ifdef KSW_CPU_DISPATCH
+ #ifdef __SSE4_1__
+ void ksw_extz2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
+ #else
+ void ksw_extz2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
+ #endif
+ #else
+ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
+ #endif // ~KSW_CPU_DISPATCH
+ {
+ #define __dp_code_block1 \
+ 	z = _mm_add_epi8(_mm_load_si128(&s[t]), qe2_); \
+ 	xt1 = _mm_load_si128(&x[t]); /* xt1 <- x[r-1][t..t+15] */ \
+ 	tmp = _mm_srli_si128(xt1, 15); /* tmp <- x[r-1][t+15] */ \
+ 	xt1 = _mm_or_si128(_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \
+ 	x1_ = tmp; \
+ 	vt1 = _mm_load_si128(&v[t]); /* vt1 <- v[r-1][t..t+15] */ \
+ 	tmp = _mm_srli_si128(vt1, 15); /* tmp <- v[r-1][t+15] */ \
+ 	vt1 = _mm_or_si128(_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \
+ 	v1_ = tmp; \
+ 	a = _mm_add_epi8(xt1, vt1); /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \
+ 	ut = _mm_load_si128(&u[t]); /* ut <- u[t..t+15] */ \
+ 	b = _mm_add_epi8(_mm_load_si128(&y[t]), ut); /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */
+
+ #define __dp_code_block2 \
+ 	z = _mm_max_epu8(z, b); /* z = max(z, b); this works because both are non-negative */ \
+ 	z = _mm_min_epu8(z, max_sc_); \
+ 	_mm_store_si128(&u[t], _mm_sub_epi8(z, vt1)); /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \
+ 	_mm_store_si128(&v[t], _mm_sub_epi8(z, ut)); /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \
+ 	z = _mm_sub_epi8(z, q_); \
+ 	a = _mm_sub_epi8(a, z); \
+ 	b = _mm_sub_epi8(b, z);
+
+ 	int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, min_sc;
+ 	int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX);
+ 	int32_t *H = 0, H0 = 0, last_H0_t = 0;
+ 	uint8_t *qr, *sf, *mem, *mem2 = 0;
+ 	__m128i q_, qe2_, zero_, flag1_, flag2_, flag8_, flag16_, sc_mch_, sc_mis_, sc_N_, m1_, max_sc_;
+ 	__m128i *u, *v, *x, *y, *s, *p = 0;
+
+ 	ksw_reset_extz(ez);
+ 	if (m <= 0 || qlen <= 0 || tlen <= 0) return;
+
+ 	zero_ = _mm_set1_epi8(0);
+ 	q_ = _mm_set1_epi8(q);
+ 	qe2_ = _mm_set1_epi8((q + e) * 2);
+ 	flag1_ = _mm_set1_epi8(1);
+ 	flag2_ = _mm_set1_epi8(2);
+ 	flag8_ = _mm_set1_epi8(0x08);
+ 	flag16_ = _mm_set1_epi8(0x10);
+ 	sc_mch_ = _mm_set1_epi8(mat[0]);
+ 	sc_mis_ = _mm_set1_epi8(mat[1]);
+ 	sc_N_ = mat[m*m-1] == 0? _mm_set1_epi8(-e) : _mm_set1_epi8(mat[m*m-1]);
+ 	m1_ = _mm_set1_epi8(m - 1); // wildcard
+ 	max_sc_ = _mm_set1_epi8(mat[0] + (q + e) * 2);
+
+ 	if (w < 0) w = tlen > qlen? tlen : qlen;
+ 	wl = wr = w;
+ 	tlen_ = (tlen + 15) / 16;
+ 	n_col_ = qlen < tlen? qlen : tlen;
+ 	n_col_ = ((n_col_ < w + 1? n_col_ : w + 1) + 15) / 16 + 1;
+ 	qlen_ = (qlen + 15) / 16;
+ 	for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) {
+ 		max_sc = max_sc > mat[t]? max_sc : mat[t];
+ 		min_sc = min_sc < mat[t]? min_sc : mat[t];
+ 	}
+ 	if (-min_sc > 2 * (q + e)) return; // otherwise, we won't see any mismatches
+
+ 	mem = (uint8_t*)kcalloc(km, tlen_ * 6 + qlen_ + 1, 16);
+ 	u = (__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned
+ 	v = u + tlen_, x = v + tlen_, y = x + tlen_, s = y + tlen_, sf = (uint8_t*)(s + tlen_), qr = sf + tlen_ * 16;
+ 	if (!approx_max) {
+ 		H = (int32_t*)kmalloc(km, tlen_ * 16 * 4);
+ 		for (t = 0; t < tlen_ * 16; ++t) H[t] = KSW_NEG_INF;
+ 	}
+ 	if (with_cigar) {
+ 		mem2 = (uint8_t*)kmalloc(km, ((size_t)(qlen + tlen - 1) * n_col_ + 1) * 16);
+ 		p = (__m128i*)(((size_t)mem2 + 15) >> 4 << 4);
+ 		off = (int*)kmalloc(km, (qlen + tlen - 1) * sizeof(int) * 2);
+ 		off_end = off + qlen + tlen - 1;
+ 	}
+
+ 	for (t = 0; t < qlen; ++t) qr[t] = query[qlen - 1 - t];
+ 	memcpy(sf, target, tlen);
+
+ 	for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) {
+ 		int st = 0, en = tlen - 1, st0, en0, st_, en_;
+ 		int8_t x1, v1;
+ 		uint8_t *qrr = qr + (qlen - 1 - r), *u8 = (uint8_t*)u, *v8 = (uint8_t*)v;
+ 		__m128i x1_, v1_;
+ 		// find the boundaries
+ 		if (st < r - qlen + 1) st = r - qlen + 1;
+ 		if (en > r) en = r;
+ 		if (st < (r-wr+1)>>1) st = (r-wr+1)>>1; // take the ceil
+ 		if (en > (r+wl)>>1) en = (r+wl)>>1; // take the floor
+ 		if (st > en) {
+ 			ez->zdropped = 1;
+ 			break;
+ 		}
+ 		st0 = st, en0 = en;
+ 		st = st / 16 * 16, en = (en + 16) / 16 * 16 - 1;
+ 		// set boundary conditions
+ 		if (st > 0) {
+ 			if (st - 1 >= last_st && st - 1 <= last_en)
+ 				x1 = ((uint8_t*)x)[st - 1], v1 = v8[st - 1]; // (r-1,s-1) calculated in the last round
+ 			else x1 = v1 = 0; // not calculated; set to zeros
+ 		} else x1 = 0, v1 = r? q : 0;
+ 		if (en >= r) ((uint8_t*)y)[r] = 0, u8[r] = r? q : 0;
+ 		// loop fission: set scores first
+ 		if (!(flag & KSW_EZ_GENERIC_SC)) {
+ 			for (t = st0; t <= en0; t += 16) {
+ 				__m128i sq, st, tmp, mask;
+ 				sq = _mm_loadu_si128((__m128i*)&sf[t]);
+ 				st = _mm_loadu_si128((__m128i*)&qrr[t]);
+ 				mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_));
+ 				tmp = _mm_cmpeq_epi8(sq, st);
+ #ifdef __SSE4_1__
+ 				tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
+ 				tmp = _mm_blendv_epi8(tmp, sc_N_, mask);
+ #else
+ 				tmp = _mm_or_si128(_mm_andnot_si128(tmp, sc_mis_), _mm_and_si128(tmp, sc_mch_));
+ 				tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp), _mm_and_si128(mask, sc_N_));
+ #endif
+ 				_mm_storeu_si128((__m128i*)((uint8_t*)s + t), tmp);
+ 			}
+ 		} else {
+ 			for (t = st0; t <= en0; ++t)
+ 				((uint8_t*)s)[t] = mat[sf[t] * m + qrr[t]];
+ 		}
+ 		// core loop
+ 		x1_ = _mm_cvtsi32_si128(x1);
+ 		v1_ = _mm_cvtsi32_si128(v1);
+ 		st_ = st / 16, en_ = en / 16;
+ 		assert(en_ - st_ + 1 <= n_col_);
+ 		if (!with_cigar) { // score only
+ 			for (t = st_; t <= en_; ++t) {
+ 				__m128i z, a, b, xt1, vt1, ut, tmp;
+ 				__dp_code_block1;
+ #ifdef __SSE4_1__
+ 				z = _mm_max_epi8(z, a); // z = z > a? z : a (signed)
+ #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8()
+ 				z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_)); // z = z > 0? z : 0;
+ 				z = _mm_max_epu8(z, a); // z = max(z, a); this works because both are non-negative
+ #endif
+ 				__dp_code_block2;
+ #ifdef __SSE4_1__
+ 				_mm_store_si128(&x[t], _mm_max_epi8(a, zero_));
+ 				_mm_store_si128(&y[t], _mm_max_epi8(b, zero_));
+ #else
+ 				tmp = _mm_cmpgt_epi8(a, zero_);
+ 				_mm_store_si128(&x[t], _mm_and_si128(a, tmp));
+ 				tmp = _mm_cmpgt_epi8(b, zero_);
+ 				_mm_store_si128(&y[t], _mm_and_si128(b, tmp));
+ #endif
+ 			}
+ 		} else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
+ 			__m128i *pr = p + (size_t)r * n_col_ - st_;
+ 			off[r] = st, off_end[r] = en;
+ 			for (t = st_; t <= en_; ++t) {
+ 				__m128i d, z, a, b, xt1, vt1, ut, tmp;
+ 				__dp_code_block1;
+ 				d = _mm_and_si128(_mm_cmpgt_epi8(a, z), flag1_); // d = a > z? 1 : 0
+ #ifdef __SSE4_1__
+ 				z = _mm_max_epi8(z, a); // z = z > a? z : a (signed)
+ 				tmp = _mm_cmpgt_epi8(b, z);
+ 				d = _mm_blendv_epi8(d, flag2_, tmp); // d = b > z? 2 : d
+ #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
+ 				z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_)); // z = z > 0? z : 0;
+ 				z = _mm_max_epu8(z, a); // z = max(z, a); this works because both are non-negative
+ 				tmp = _mm_cmpgt_epi8(b, z);
+ 				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, flag2_)); // d = b > z? 2 : d; emulating blendv
+ #endif
+ 				__dp_code_block2;
+ 				tmp = _mm_cmpgt_epi8(a, zero_);
+ 				_mm_store_si128(&x[t], _mm_and_si128(tmp, a));
+ 				d = _mm_or_si128(d, _mm_and_si128(tmp, flag8_)); // d = a > 0? 0x08 : 0
+ 				tmp = _mm_cmpgt_epi8(b, zero_);
+ 				_mm_store_si128(&y[t], _mm_and_si128(tmp, b));
+ 				d = _mm_or_si128(d, _mm_and_si128(tmp, flag16_)); // d = b > 0? 0x10 : 0
+ 				_mm_store_si128(&pr[t], d);
+ 			}
+ 		} else { // gap right-alignment
+ 			__m128i *pr = p + (size_t)r * n_col_ - st_;
+ 			off[r] = st, off_end[r] = en;
+ 			for (t = st_; t <= en_; ++t) {
+ 				__m128i d, z, a, b, xt1, vt1, ut, tmp;
+ 				__dp_code_block1;
+ 				d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), flag1_); // d = z > a? 0 : 1
+ #ifdef __SSE4_1__
+ 				z = _mm_max_epi8(z, a); // z = z > a? z : a (signed)
+ 				tmp = _mm_cmpgt_epi8(z, b);
+ 				d = _mm_blendv_epi8(flag2_, d, tmp); // d = z > b? d : 2
+ #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
+ 				z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_)); // z = z > 0? z : 0;
+ 				z = _mm_max_epu8(z, a); // z = max(z, a); this works because both are non-negative
+ 				tmp = _mm_cmpgt_epi8(z, b);
+ 				d = _mm_or_si128(_mm_andnot_si128(tmp, flag2_), _mm_and_si128(tmp, d)); // d = z > b? d : 2; emulating blendv
+ #endif
+ 				__dp_code_block2;
+ 				tmp = _mm_cmpgt_epi8(zero_, a);
+ 				_mm_store_si128(&x[t], _mm_andnot_si128(tmp, a));
+ 				d = _mm_or_si128(d, _mm_andnot_si128(tmp, flag8_)); // d = 0 > a? 0 : 0x08
+ 				tmp = _mm_cmpgt_epi8(zero_, b);
+ 				_mm_store_si128(&y[t], _mm_andnot_si128(tmp, b));
+ 				d = _mm_or_si128(d, _mm_andnot_si128(tmp, flag16_)); // d = 0 > b? 0 : 0x10
+ 				_mm_store_si128(&pr[t], d);
+ 			}
+ 		}
+ 		if (!approx_max) { // find the exact max with a 32-bit score array
+ 			int32_t max_H, max_t;
+ 			// compute H[], max_H and max_t
+ 			if (r > 0) {
+ 				int32_t HH[4], tt[4], en1 = st0 + (en0 - st0) / 4 * 4, i;
+ 				__m128i max_H_, max_t_, qe_;
+ 				max_H = H[en0] = en0 > 0? H[en0-1] + u8[en0] - qe : H[en0] + v8[en0] - qe; // special casing the last element
+ 				max_t = en0;
+ 				max_H_ = _mm_set1_epi32(max_H);
+ 				max_t_ = _mm_set1_epi32(max_t);
+ 				qe_ = _mm_set1_epi32(q + e);
+ 				for (t = st0; t < en1; t += 4) { // this implements: H[t]+=v8[t]-qe; if(H[t]>max_H) max_H=H[t],max_t=t;
+ 					__m128i H1, tmp, t_;
+ 					H1 = _mm_loadu_si128((__m128i*)&H[t]);
+ 					t_ = _mm_setr_epi32(v8[t], v8[t+1], v8[t+2], v8[t+3]);
+ 					H1 = _mm_add_epi32(H1, t_);
+ 					H1 = _mm_sub_epi32(H1, qe_);
+ 					_mm_storeu_si128((__m128i*)&H[t], H1);
+ 					t_ = _mm_set1_epi32(t);
+ 					tmp = _mm_cmpgt_epi32(H1, max_H_);
+ #ifdef __SSE4_1__
+ 					max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
+ 					max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
+ #else
+ 					max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
+ 					max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
+ #endif
+ 				}
+ 				_mm_storeu_si128((__m128i*)HH, max_H_);
+ 				_mm_storeu_si128((__m128i*)tt, max_t_);
+ 				for (i = 0; i < 4; ++i)
+ 					if (max_H < HH[i]) max_H = HH[i], max_t = tt[i] + i;
+ 				for (; t < en0; ++t) { // for the rest of values that haven't been computed with SSE
+ 					H[t] += (int32_t)v8[t] - qe;
+ 					if (H[t] > max_H)
+ 						max_H = H[t], max_t = t;
+ 				}
+ 			} else H[0] = v8[0] - qe - qe, max_H = H[0], max_t = 0; // special casing r==0
+ 			// update ez
+ 			if (en0 == tlen - 1 && H[en0] > ez->mte)
+ 				ez->mte = H[en0], ez->mte_q = r - en;
+ 			if (r - st0 == qlen - 1 && H[st0] > ez->mqe)
+ 				ez->mqe = H[st0], ez->mqe_t = st0;
+ 			if (ksw_apply_zdrop(ez, 1, max_H, r, max_t, zdrop, e)) break;
+ 			if (r == qlen + tlen - 2 && en0 == tlen - 1)
+ 				ez->score = H[tlen - 1];
+ 		} else { // find approximate max; Z-drop might be inaccurate, too.
+ 			if (r > 0) {
+ 				if (last_H0_t >= st0 && last_H0_t <= en0 && last_H0_t + 1 >= st0 && last_H0_t + 1 <= en0) {
+ 					int32_t d0 = v8[last_H0_t] - qe;
+ 					int32_t d1 = u8[last_H0_t + 1] - qe;
+ 					if (d0 > d1) H0 += d0;
+ 					else H0 += d1, ++last_H0_t;
+ 				} else if (last_H0_t >= st0 && last_H0_t <= en0) {
+ 					H0 += v8[last_H0_t] - qe;
+ 				} else {
+ 					++last_H0_t, H0 += u8[last_H0_t] - qe;
+ 				}
+ 				if ((flag & KSW_EZ_APPROX_DROP) && ksw_apply_zdrop(ez, 1, H0, r, last_H0_t, zdrop, e)) break;
+ 			} else H0 = v8[0] - qe - qe, last_H0_t = 0;
+ 			if (r == qlen + tlen - 2 && en0 == tlen - 1)
+ 				ez->score = H0;
+ 		}
+ 		last_st = st, last_en = en;
+ 		//for (t = st0; t <= en0; ++t) printf("(%d,%d)\t(%d,%d,%d,%d)\t%d\n", r, t, ((int8_t*)u)[t], ((int8_t*)v)[t], ((int8_t*)x)[t], ((int8_t*)y)[t], H[t]); // for debugging
+ 	}
+ 	kfree(km, mem);
+ 	if (!approx_max) kfree(km, H);
+ 	if (with_cigar) { // backtrack
+ 		int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
+ 		if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY)) {
+ 			ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
+ 		} else if (!ez->zdropped && (flag&KSW_EZ_EXTZ_ONLY) && ez->mqe + end_bonus > (int)ez->max) {
+ 			ez->reach_end = 1;
+ 			ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->mqe_t, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
+ 		} else if (ez->max_t >= 0 && ez->max_q >= 0) {
+ 			ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
+ 		}
+ 		kfree(km, mem2); kfree(km, off);
+ 	}
+ }
+ #endif // __SSE2__
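
Note: this file is the SSE-vectorized extension kernel of ksw2. It sweeps anti-diagonals r = i + j, stores the DP matrix in difference form (u and v hold row/column differences of H; x and y the gap states, which is why __dp_code_block1/2 only shift, add, and subtract 8-bit lanes), then recovers the 32-bit score exactly or approximately and applies z-drop. A minimal caller sketch, adapted from the ksw2 example; the function name align_extz and the +1/-2 matrix are illustrative, not part of the gem:

#include <stdio.h>
#include <string.h>
#include "ksw2.h"

// Hedged sketch: align two 0/1/2/3-encoded sequences with ksw_extz2_sse().
// With km == NULL the kernel's kmalloc/kfree reduce to plain malloc/free
// (directly, or via kalloc's NULL-pool path when HAVE_KALLOC is defined).
static void align_extz(int qlen, const uint8_t *qs, int tlen, const uint8_t *ts)
{
	// 5x5 matrix: match +1, mismatch -2; last row/column 0 so base 4
	// (ambiguous) is scored as -gap_extend by the kernel (see sc_N_ above)
	static const int8_t mat[25] = {
		1,-2,-2,-2,0, -2,1,-2,-2,0, -2,-2,1,-2,0, -2,-2,-2,1,0, 0,0,0,0,0
	};
	ksw_extz_t ez;
	int i;
	memset(&ez, 0, sizeof(ez)); // zero the cigar buffer fields before first use
	// gap open 2, gap extend 1; w = -1 -> full band; zdrop = -1 -> no z-drop;
	// flag = 0 -> compute the CIGAR
	ksw_extz2_sse(0, qlen, qs, tlen, ts, 5, mat, 2, 1, -1, -1, 0, 0, &ez);
	printf("score=%d ", ez.score);
	for (i = 0; i < ez.n_cigar; ++i)
		printf("%d%c", ez.cigar[i] >> 4, "MID"[ez.cigar[i] & 0xf]);
	putchar('\n');
	kfree(0, ez.cigar); // kfree(NULL, p) is plain free()
}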
data/ext/minimap2/ksw2_ll_sse.c
@@ -0,0 +1,152 @@
+ #include <stdlib.h>
+ #include <stdint.h>
+ #include <string.h>
+ #include "ksw2.h"
+
+ #ifdef USE_SIMDE
+ #include <simde/x86/sse2.h>
+ #else
+ #include <emmintrin.h>
+ #endif
+
+ #ifdef __GNUC__
+ #define LIKELY(x) __builtin_expect((x),1)
+ #define UNLIKELY(x) __builtin_expect((x),0)
+ #else
+ #define LIKELY(x) (x)
+ #define UNLIKELY(x) (x)
+ #endif
+
+ typedef struct {
+ 	int qlen, slen;
+ 	uint8_t shift, mdiff, max, size;
+ 	__m128i *qp, *H0, *H1, *E, *Hmax;
+ } kswq_t;
+
+ /**
+  * Initialize the query data structure
+  *
+  * @param size   Number of bytes used to store a score; valid values are 1 or 2
+  * @param qlen   Length of the query sequence
+  * @param query  Query sequence
+  * @param m      Size of the alphabet
+  * @param mat    Scoring matrix in a one-dimensional array
+  *
+  * @return       Query data structure
+  */
+ void *ksw_ll_qinit(void *km, int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
+ {
+ 	kswq_t *q;
+ 	int slen, a, tmp, p;
+
+ 	size = size > 1? 2 : 1;
+ 	p = 8 * (3 - size); // # values per __m128i
+ 	slen = (qlen + p - 1) / p; // segmented length
+ 	q = (kswq_t*)kmalloc(km, sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
+ 	q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory
+ 	q->H0 = q->qp + slen * m;
+ 	q->H1 = q->H0 + slen;
+ 	q->E = q->H1 + slen;
+ 	q->Hmax = q->E + slen;
+ 	q->slen = slen; q->qlen = qlen; q->size = size;
+ 	// compute shift
+ 	tmp = m * m;
+ 	for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score
+ 		if (mat[a] < (int8_t)q->shift) q->shift = mat[a];
+ 		if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a];
+ 	}
+ 	q->max = q->mdiff;
+ 	q->shift = 256 - q->shift; // NB: q->shift is uint8_t
+ 	q->mdiff += q->shift; // this is the difference between the min and max scores
+ 	// An example: p=8, qlen=19, slen=3 and segmentation:
+ 	//   {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}}
+ 	if (size == 1) {
+ 		int8_t *t = (int8_t*)q->qp;
+ 		for (a = 0; a < m; ++a) {
+ 			int i, k, nlen = slen * p;
+ 			const int8_t *ma = mat + a * m;
+ 			for (i = 0; i < slen; ++i)
+ 				for (k = i; k < nlen; k += slen) // p iterations
+ 					*t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift;
+ 		}
+ 	} else {
+ 		int16_t *t = (int16_t*)q->qp;
+ 		for (a = 0; a < m; ++a) {
+ 			int i, k, nlen = slen * p;
+ 			const int8_t *ma = mat + a * m;
+ 			for (i = 0; i < slen; ++i)
+ 				for (k = i; k < nlen; k += slen) // p iterations
+ 					*t++ = (k >= qlen? 0 : ma[query[k]]);
+ 		}
+ 	}
+ 	return q;
+ }
+
+ int ksw_ll_i16(void *q_, int tlen, const uint8_t *target, int _gapo, int _gape, int *qe, int *te)
+ {
+ 	kswq_t *q = (kswq_t*)q_;
+ 	int slen, i, gmax = 0, qlen8;
+ 	__m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax;
+ 	uint16_t *H8;
+
+ #define __max_8(ret, xx) do { \
+ 		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
+ 		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
+ 		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \
+ 		(ret) = _mm_extract_epi16((xx), 0); \
+ 	} while (0)
+
+ 	// initialization
+ 	*qe = *te = -1;
+ 	zero = _mm_set1_epi32(0);
+ 	gapoe = _mm_set1_epi16(_gapo + _gape);
+ 	gape = _mm_set1_epi16(_gape);
+ 	H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
+ 	slen = q->slen, qlen8 = slen * 8;
+ 	memset(E, 0, slen * sizeof(__m128i));
+ 	memset(H0, 0, slen * sizeof(__m128i));
+ 	memset(Hmax, 0, slen * sizeof(__m128i));
+ 	// the core loop
+ 	for (i = 0; i < tlen; ++i) {
+ 		int j, k, imax;
+ 		__m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
+ 		h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
+ 		h = _mm_slli_si128(h, 2);
+ 		for (j = 0; LIKELY(j < slen); ++j) {
+ 			h = _mm_adds_epi16(h, *S++);
+ 			e = _mm_load_si128(E + j);
+ 			h = _mm_max_epi16(h, e);
+ 			h = _mm_max_epi16(h, f);
+ 			max = _mm_max_epi16(max, h);
+ 			_mm_store_si128(H1 + j, h);
+ 			h = _mm_subs_epu16(h, gapoe);
+ 			e = _mm_subs_epu16(e, gape);
+ 			e = _mm_max_epi16(e, h);
+ 			_mm_store_si128(E + j, e);
+ 			f = _mm_subs_epu16(f, gape);
+ 			f = _mm_max_epi16(f, h);
+ 			h = _mm_load_si128(H0 + j);
+ 		}
+ 		for (k = 0; LIKELY(k < 8); ++k) {
+ 			f = _mm_slli_si128(f, 2);
+ 			for (j = 0; LIKELY(j < slen); ++j) {
+ 				h = _mm_load_si128(H1 + j);
+ 				h = _mm_max_epi16(h, f);
+ 				_mm_store_si128(H1 + j, h);
+ 				h = _mm_subs_epu16(h, gapoe);
+ 				f = _mm_subs_epu16(f, gape);
+ 				if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop_i16;
+ 			}
+ 		}
+ end_loop_i16:
+ 		__max_8(imax, max);
+ 		if (imax >= gmax) {
+ 			gmax = imax; *te = i;
+ 			memcpy(Hmax, H1, slen * sizeof(__m128i));
+ 		}
+ 		S = H1; H1 = H0; H0 = S;
+ 	}
+ 	for (i = 0, H8 = (uint16_t*)Hmax; i < qlen8; ++i)
+ 		if ((int)H8[i] == gmax) *qe = i / 8 + i % 8 * slen;
+ 	return gmax;
+ }
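
Note: this file is the classic striped ("lazy-F") Smith–Waterman scorer. ksw_ll_qinit() lays the query out in slen interleaved segments (the {{0,3,6,...}} example in the comment above) so one __m128i covers every segment at the same offset, and ksw_ll_i16() then scans the target, fixing up the F (horizontal gap) values in the second k-loop only while they can still change. A hedged usage sketch; best_local_score and the score values are illustrative:

#include <stdio.h>
#include "ksw2.h"

// Hedged sketch: score a local alignment with the 16-bit striped kernel.
// Sequences must already be encoded over 0..m-1 (here 0..4 for A,C,G,T,N).
static int best_local_score(int qlen, const uint8_t *qs, int tlen, const uint8_t *ts)
{
	// 5x5 matrix: match +2, mismatch -4, ambiguous base 0
	static const int8_t mat[25] = {
		2,-4,-4,-4,0, -4,2,-4,-4,0, -4,-4,2,-4,0, -4,-4,-4,2,0, 0,0,0,0,0
	};
	int score, qe, te;
	void *prof = ksw_ll_qinit(0, 2, qlen, qs, 5, mat); // size=2 -> 16-bit scores
	// a gap of length L costs _gapo + _gape * L, i.e. 4 + 2*L here
	score = ksw_ll_i16(prof, tlen, ts, 4, 2, &qe, &te);
	printf("score=%d, ends at query %d, target %d\n", score, qe, te);
	kfree(0, prof); // qinit packs everything into one kmalloc'd block
	return score;
}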
data/ext/minimap2/kthread.c
@@ -0,0 +1,159 @@
+ #include <pthread.h>
+ #include <stdlib.h>
+ #include <limits.h>
+ #include <stdint.h>
+ #include "kthread.h"
+
+ #if (defined(WIN32) || defined(_WIN32)) && defined(_MSC_VER)
+ #define __sync_fetch_and_add(ptr, addend) _InterlockedExchangeAdd((void*)ptr, addend)
+ #endif
+
+ /************
+  * kt_for() *
+  ************/
+
+ struct kt_for_t;
+
+ typedef struct {
+ 	struct kt_for_t *t;
+ 	long i;
+ } ktf_worker_t;
+
+ typedef struct kt_for_t {
+ 	int n_threads;
+ 	long n;
+ 	ktf_worker_t *w;
+ 	void (*func)(void*,long,int);
+ 	void *data;
+ } kt_for_t;
+
+ static inline long steal_work(kt_for_t *t)
+ {
+ 	int i, min_i = -1;
+ 	long k, min = LONG_MAX;
+ 	for (i = 0; i < t->n_threads; ++i)
+ 		if (min > t->w[i].i) min = t->w[i].i, min_i = i;
+ 	k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads);
+ 	return k >= t->n? -1 : k;
+ }
+
+ static void *ktf_worker(void *data)
+ {
+ 	ktf_worker_t *w = (ktf_worker_t*)data;
+ 	long i;
+ 	for (;;) {
+ 		i = __sync_fetch_and_add(&w->i, w->t->n_threads);
+ 		if (i >= w->t->n) break;
+ 		w->t->func(w->t->data, i, w - w->t->w);
+ 	}
+ 	while ((i = steal_work(w->t)) >= 0)
+ 		w->t->func(w->t->data, i, w - w->t->w);
+ 	pthread_exit(0);
+ }
+
+ void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n)
+ {
+ 	if (n_threads > 1) {
+ 		int i;
+ 		kt_for_t t;
+ 		pthread_t *tid;
+ 		t.func = func, t.data = data, t.n_threads = n_threads, t.n = n;
+ 		t.w = (ktf_worker_t*)calloc(n_threads, sizeof(ktf_worker_t));
+ 		tid = (pthread_t*)calloc(n_threads, sizeof(pthread_t));
+ 		for (i = 0; i < n_threads; ++i)
+ 			t.w[i].t = &t, t.w[i].i = i;
+ 		for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker, &t.w[i]);
+ 		for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
+ 		free(tid); free(t.w);
+ 	} else {
+ 		long j;
+ 		for (j = 0; j < n; ++j) func(data, j, 0);
+ 	}
+ }
+
+ /*****************
+  * kt_pipeline() *
+  *****************/
+
+ struct ktp_t;
+
+ typedef struct {
+ 	struct ktp_t *pl;
+ 	int64_t index;
+ 	int step;
+ 	void *data;
+ } ktp_worker_t;
+
+ typedef struct ktp_t {
+ 	void *shared;
+ 	void *(*func)(void*, int, void*);
+ 	int64_t index;
+ 	int n_workers, n_steps;
+ 	ktp_worker_t *workers;
+ 	pthread_mutex_t mutex;
+ 	pthread_cond_t cv;
+ } ktp_t;
+
+ static void *ktp_worker(void *data)
+ {
+ 	ktp_worker_t *w = (ktp_worker_t*)data;
+ 	ktp_t *p = w->pl;
+ 	while (w->step < p->n_steps) {
+ 		// test whether we can kick off the job with this worker
+ 		pthread_mutex_lock(&p->mutex);
+ 		for (;;) {
+ 			int i;
+ 			// test whether another worker is doing the same step
+ 			for (i = 0; i < p->n_workers; ++i) {
+ 				if (w == &p->workers[i]) continue; // ignore itself
+ 				if (p->workers[i].step <= w->step && p->workers[i].index < w->index)
+ 					break;
+ 			}
+ 			if (i == p->n_workers) break; // no workers with smaller indices are doing w->step or the previous steps
+ 			pthread_cond_wait(&p->cv, &p->mutex);
+ 		}
+ 		pthread_mutex_unlock(&p->mutex);
+
+ 		// working on w->step
+ 		w->data = p->func(p->shared, w->step, w->step? w->data : 0); // for the first step, input is NULL
+
+ 		// update step and let other workers know
+ 		pthread_mutex_lock(&p->mutex);
+ 		w->step = w->step == p->n_steps - 1 || w->data? (w->step + 1) % p->n_steps : p->n_steps;
+ 		if (w->step == 0) w->index = p->index++;
+ 		pthread_cond_broadcast(&p->cv);
+ 		pthread_mutex_unlock(&p->mutex);
+ 	}
+ 	pthread_exit(0);
+ }
+
+ void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps)
+ {
+ 	ktp_t aux;
+ 	pthread_t *tid;
+ 	int i;
+
+ 	if (n_threads < 1) n_threads = 1;
+ 	aux.n_workers = n_threads;
+ 	aux.n_steps = n_steps;
+ 	aux.func = func;
+ 	aux.shared = shared_data;
+ 	aux.index = 0;
+ 	pthread_mutex_init(&aux.mutex, 0);
+ 	pthread_cond_init(&aux.cv, 0);
+
+ 	aux.workers = (ktp_worker_t*)calloc(n_threads, sizeof(ktp_worker_t));
+ 	for (i = 0; i < n_threads; ++i) {
+ 		ktp_worker_t *w = &aux.workers[i];
+ 		w->step = 0; w->pl = &aux; w->data = 0;
+ 		w->index = aux.index++;
+ 	}
+
+ 	tid = (pthread_t*)calloc(n_threads, sizeof(pthread_t));
+ 	for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktp_worker, &aux.workers[i]);
+ 	for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
+ 	free(tid); free(aux.workers);
+
+ 	pthread_mutex_destroy(&aux.mutex);
+ 	pthread_cond_destroy(&aux.cv);
+ }
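
Note: kthread.c provides minimap2's two concurrency primitives. kt_for() statically assigns iteration i to thread i mod n_threads via atomic fetch-and-add, and a thread that exhausts its own stride steals the lowest pending index from the most-backlogged worker (steal_work()). kt_pipeline() runs n_steps stages over ordered batches: the mutex/condvar handshake lets a worker start step s of batch i only after every earlier batch has passed step s, and a stage function returning NULL at a non-final step (end of input) retires the worker. minimap2 itself uses kt_pipeline() for its read/map/write loop and kt_for() over reads inside the map step. A minimal kt_for() sketch; sq_data_t and square_one are illustrative names:

#include <stdio.h>
#include "kthread.h"

typedef struct { const double *in; double *out; } sq_data_t;

// Worker body: kt_for() calls this once per index i; tid identifies the
// calling thread, useful when each thread needs its own scratch buffer.
static void square_one(void *data, long i, int tid)
{
	sq_data_t *d = (sq_data_t*)data;
	(void)tid;
	d->out[i] = d->in[i] * d->in[i];
}

int main(void)
{
	enum { N = 1 << 20 };
	static double in[N], out[N];
	long i;
	sq_data_t d = { in, out };
	for (i = 0; i < N; ++i) in[i] = (double)i;
	kt_for(4, square_one, &d, N); // 4 threads; returns when all N are done
	printf("%g\n", out[N - 1]);
	return 0;
}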
data/ext/minimap2/kthread.h
@@ -0,0 +1,15 @@
+ #ifndef KTHREAD_H
+ #define KTHREAD_H
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n);
+ void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps);
+
+ #ifdef __cplusplus
+ }
+ #endif
+
+ #endif
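
Note: the header exposes just these two entry points. To round out the kt_for() example above, here is a hedged kt_pipeline() sketch; pl_data_t and pl_step are illustrative, but the callback shape matches how minimap2's map.c wires its own three-step loop:

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include "kthread.h"

// Hedged sketch of a 3-step pipeline: read a line, uppercase it, print it.
typedef struct { FILE *fp; } pl_data_t;

static void *pl_step(void *shared, int step, void *in)
{
	pl_data_t *p = (pl_data_t*)shared;
	if (step == 0) { // step 0: read one batch; return NULL at EOF to retire
		char *line = (char*)malloc(1024);
		if (fgets(line, 1024, p->fp)) return line;
		free(line);
		return 0;
	} else if (step == 1) { // step 1: compute; overlaps with steps 0/2 of other batches
		char *line = (char*)in;
		int i;
		for (i = 0; line[i]; ++i) line[i] = toupper((unsigned char)line[i]);
		return line;
	} else { // step 2: output; the index check serializes this in batch order
		fputs((char*)in, stdout);
		free(in);
		return 0; // the last step's return value is ignored
	}
}

int main(void)
{
	pl_data_t d = { stdin };
	kt_pipeline(2, pl_step, &d, 3); // 2 worker threads, 3 steps
	return 0;
}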