minimap2 0.2.22.0 → 0.2.24.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +60 -76
- data/ext/Rakefile +55 -0
- data/ext/cmappy/cmappy.c +129 -0
- data/ext/cmappy/cmappy.h +44 -0
- data/ext/minimap2/FAQ.md +46 -0
- data/ext/minimap2/LICENSE.txt +24 -0
- data/ext/minimap2/MANIFEST.in +10 -0
- data/ext/minimap2/Makefile +132 -0
- data/ext/minimap2/Makefile.simde +97 -0
- data/ext/minimap2/NEWS.md +821 -0
- data/ext/minimap2/README.md +403 -0
- data/ext/minimap2/align.c +1020 -0
- data/ext/minimap2/bseq.c +169 -0
- data/ext/minimap2/bseq.h +64 -0
- data/ext/minimap2/code_of_conduct.md +30 -0
- data/ext/minimap2/cookbook.md +243 -0
- data/ext/minimap2/esterr.c +64 -0
- data/ext/minimap2/example.c +63 -0
- data/ext/minimap2/format.c +559 -0
- data/ext/minimap2/hit.c +466 -0
- data/ext/minimap2/index.c +775 -0
- data/ext/minimap2/kalloc.c +205 -0
- data/ext/minimap2/kalloc.h +76 -0
- data/ext/minimap2/kdq.h +132 -0
- data/ext/minimap2/ketopt.h +120 -0
- data/ext/minimap2/khash.h +615 -0
- data/ext/minimap2/krmq.h +474 -0
- data/ext/minimap2/kseq.h +256 -0
- data/ext/minimap2/ksort.h +153 -0
- data/ext/minimap2/ksw2.h +184 -0
- data/ext/minimap2/ksw2_dispatch.c +96 -0
- data/ext/minimap2/ksw2_extd2_sse.c +402 -0
- data/ext/minimap2/ksw2_exts2_sse.c +416 -0
- data/ext/minimap2/ksw2_extz2_sse.c +313 -0
- data/ext/minimap2/ksw2_ll_sse.c +152 -0
- data/ext/minimap2/kthread.c +159 -0
- data/ext/minimap2/kthread.h +15 -0
- data/ext/minimap2/kvec.h +105 -0
- data/ext/minimap2/lchain.c +369 -0
- data/ext/minimap2/main.c +459 -0
- data/ext/minimap2/map.c +714 -0
- data/ext/minimap2/minimap.h +410 -0
- data/ext/minimap2/minimap2.1 +725 -0
- data/ext/minimap2/misc/README.md +179 -0
- data/ext/minimap2/misc/mmphase.js +335 -0
- data/ext/minimap2/misc/paftools.js +3149 -0
- data/ext/minimap2/misc.c +162 -0
- data/ext/minimap2/mmpriv.h +132 -0
- data/ext/minimap2/options.c +234 -0
- data/ext/minimap2/pe.c +177 -0
- data/ext/minimap2/python/README.rst +196 -0
- data/ext/minimap2/python/cmappy.h +152 -0
- data/ext/minimap2/python/cmappy.pxd +153 -0
- data/ext/minimap2/python/mappy.pyx +273 -0
- data/ext/minimap2/python/minimap2.py +39 -0
- data/ext/minimap2/sdust.c +213 -0
- data/ext/minimap2/sdust.h +25 -0
- data/ext/minimap2/seed.c +131 -0
- data/ext/minimap2/setup.py +55 -0
- data/ext/minimap2/sketch.c +143 -0
- data/ext/minimap2/splitidx.c +84 -0
- data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
- data/ext/minimap2/test/MT-human.fa +278 -0
- data/ext/minimap2/test/MT-orang.fa +276 -0
- data/ext/minimap2/test/q-inv.fa +4 -0
- data/ext/minimap2/test/q2.fa +2 -0
- data/ext/minimap2/test/t-inv.fa +127 -0
- data/ext/minimap2/test/t2.fa +2 -0
- data/ext/minimap2/tex/Makefile +21 -0
- data/ext/minimap2/tex/bioinfo.cls +930 -0
- data/ext/minimap2/tex/blasr-mc.eval +17 -0
- data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
- data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
- data/ext/minimap2/tex/bwa.eval +55 -0
- data/ext/minimap2/tex/eval2roc.pl +33 -0
- data/ext/minimap2/tex/graphmap.eval +4 -0
- data/ext/minimap2/tex/hs38-simu.sh +10 -0
- data/ext/minimap2/tex/minialign.eval +49 -0
- data/ext/minimap2/tex/minimap2.bib +460 -0
- data/ext/minimap2/tex/minimap2.tex +724 -0
- data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
- data/ext/minimap2/tex/mm2-update.tex +240 -0
- data/ext/minimap2/tex/mm2.approx.eval +12 -0
- data/ext/minimap2/tex/mm2.eval +13 -0
- data/ext/minimap2/tex/natbib.bst +1288 -0
- data/ext/minimap2/tex/natbib.sty +803 -0
- data/ext/minimap2/tex/ngmlr.eval +38 -0
- data/ext/minimap2/tex/roc.gp +60 -0
- data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
- data/ext/minimap2.patch +19 -0
- data/lib/minimap2/aligner.rb +4 -4
- data/lib/minimap2/alignment.rb +11 -11
- data/lib/minimap2/ffi/constants.rb +20 -16
- data/lib/minimap2/ffi/functions.rb +5 -0
- data/lib/minimap2/ffi.rb +4 -5
- data/lib/minimap2/version.rb +2 -2
- data/lib/minimap2.rb +51 -15
- metadata +97 -79
- data/lib/minimap2/ffi_helper.rb +0 -53
- data/vendor/libminimap2.so +0 -0
--- /dev/null
+++ b/data/ext/minimap2/ksw2_extz2_sse.c
@@ -0,0 +1,313 @@
+#include <string.h>
+#include <assert.h>
+#include "ksw2.h"
+
+#ifdef __SSE2__
+#ifdef USE_SIMDE
+#include <simde/x86/sse2.h>
+#else
+#include <emmintrin.h>
+#endif
+
+#ifdef KSW_SSE2_ONLY
+#undef __SSE4_1__
+#endif
+
+#ifdef __SSE4_1__
+#ifdef USE_SIMDE
+#include <simde/x86/sse4.1.h>
+#else
+#include <smmintrin.h>
+#endif
+#endif
+
+#ifdef KSW_CPU_DISPATCH
+#ifdef __SSE4_1__
+void ksw_extz2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
+#else
+void ksw_extz2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
+#endif
+#else
+void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
+#endif // ~KSW_CPU_DISPATCH
+{
+#define __dp_code_block1 \
+    z = _mm_add_epi8(_mm_load_si128(&s[t]), qe2_); \
+    xt1 = _mm_load_si128(&x[t]);                     /* xt1 <- x[r-1][t..t+15] */ \
+    tmp = _mm_srli_si128(xt1, 15);                   /* tmp <- x[r-1][t+15] */ \
+    xt1 = _mm_or_si128(_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \
+    x1_ = tmp; \
+    vt1 = _mm_load_si128(&v[t]);                     /* vt1 <- v[r-1][t..t+15] */ \
+    tmp = _mm_srli_si128(vt1, 15);                   /* tmp <- v[r-1][t+15] */ \
+    vt1 = _mm_or_si128(_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \
+    v1_ = tmp; \
+    a = _mm_add_epi8(xt1, vt1);                      /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \
+    ut = _mm_load_si128(&u[t]);                      /* ut <- u[t..t+15] */ \
+    b = _mm_add_epi8(_mm_load_si128(&y[t]), ut);     /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */
+
+#define __dp_code_block2 \
+    z = _mm_max_epu8(z, b);                          /* z = max(z, b); this works because both are non-negative */ \
+    z = _mm_min_epu8(z, max_sc_); \
+    _mm_store_si128(&u[t], _mm_sub_epi8(z, vt1));    /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \
+    _mm_store_si128(&v[t], _mm_sub_epi8(z, ut));     /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \
+    z = _mm_sub_epi8(z, q_); \
+    a = _mm_sub_epi8(a, z); \
+    b = _mm_sub_epi8(b, z);
+
+    int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, min_sc;
+    int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX);
+    int32_t *H = 0, H0 = 0, last_H0_t = 0;
+    uint8_t *qr, *sf, *mem, *mem2 = 0;
+    __m128i q_, qe2_, zero_, flag1_, flag2_, flag8_, flag16_, sc_mch_, sc_mis_, sc_N_, m1_, max_sc_;
+    __m128i *u, *v, *x, *y, *s, *p = 0;
+
+    ksw_reset_extz(ez);
+    if (m <= 0 || qlen <= 0 || tlen <= 0) return;
+
+    zero_   = _mm_set1_epi8(0);
+    q_      = _mm_set1_epi8(q);
+    qe2_    = _mm_set1_epi8((q + e) * 2);
+    flag1_  = _mm_set1_epi8(1);
+    flag2_  = _mm_set1_epi8(2);
+    flag8_  = _mm_set1_epi8(0x08);
+    flag16_ = _mm_set1_epi8(0x10);
+    sc_mch_ = _mm_set1_epi8(mat[0]);
+    sc_mis_ = _mm_set1_epi8(mat[1]);
+    sc_N_   = mat[m*m-1] == 0? _mm_set1_epi8(-e) : _mm_set1_epi8(mat[m*m-1]);
+    m1_     = _mm_set1_epi8(m - 1); // wildcard
+    max_sc_ = _mm_set1_epi8(mat[0] + (q + e) * 2);
+
+    if (w < 0) w = tlen > qlen? tlen : qlen;
+    wl = wr = w;
+    tlen_ = (tlen + 15) / 16;
+    n_col_ = qlen < tlen? qlen : tlen;
+    n_col_ = ((n_col_ < w + 1? n_col_ : w + 1) + 15) / 16 + 1;
+    qlen_ = (qlen + 15) / 16;
+    for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) {
+        max_sc = max_sc > mat[t]? max_sc : mat[t];
+        min_sc = min_sc < mat[t]? min_sc : mat[t];
+    }
+    if (-min_sc > 2 * (q + e)) return; // otherwise, we won't see any mismatches
+
+    mem = (uint8_t*)kcalloc(km, tlen_ * 6 + qlen_ + 1, 16);
+    u = (__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned
+    v = u + tlen_, x = v + tlen_, y = x + tlen_, s = y + tlen_, sf = (uint8_t*)(s + tlen_), qr = sf + tlen_ * 16;
+    if (!approx_max) {
+        H = (int32_t*)kmalloc(km, tlen_ * 16 * 4);
+        for (t = 0; t < tlen_ * 16; ++t) H[t] = KSW_NEG_INF;
+    }
+    if (with_cigar) {
+        mem2 = (uint8_t*)kmalloc(km, ((size_t)(qlen + tlen - 1) * n_col_ + 1) * 16);
+        p = (__m128i*)(((size_t)mem2 + 15) >> 4 << 4);
+        off = (int*)kmalloc(km, (qlen + tlen - 1) * sizeof(int) * 2);
+        off_end = off + qlen + tlen - 1;
+    }
+
+    for (t = 0; t < qlen; ++t) qr[t] = query[qlen - 1 - t];
+    memcpy(sf, target, tlen);
+
+    for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) {
+        int st = 0, en = tlen - 1, st0, en0, st_, en_;
+        int8_t x1, v1;
+        uint8_t *qrr = qr + (qlen - 1 - r), *u8 = (uint8_t*)u, *v8 = (uint8_t*)v;
+        __m128i x1_, v1_;
+        // find the boundaries
+        if (st < r - qlen + 1) st = r - qlen + 1;
+        if (en > r) en = r;
+        if (st < (r-wr+1)>>1) st = (r-wr+1)>>1; // take the ceil
+        if (en > (r+wl)>>1) en = (r+wl)>>1; // take the floor
+        if (st > en) {
+            ez->zdropped = 1;
+            break;
+        }
+        st0 = st, en0 = en;
+        st = st / 16 * 16, en = (en + 16) / 16 * 16 - 1;
+        // set boundary conditions
+        if (st > 0) {
+            if (st - 1 >= last_st && st - 1 <= last_en)
+                x1 = ((uint8_t*)x)[st - 1], v1 = v8[st - 1]; // (r-1,s-1) calculated in the last round
+            else x1 = v1 = 0; // not calculated; set to zeros
+        } else x1 = 0, v1 = r? q : 0;
+        if (en >= r) ((uint8_t*)y)[r] = 0, u8[r] = r? q : 0;
+        // loop fission: set scores first
+        if (!(flag & KSW_EZ_GENERIC_SC)) {
+            for (t = st0; t <= en0; t += 16) {
+                __m128i sq, st, tmp, mask;
+                sq = _mm_loadu_si128((__m128i*)&sf[t]);
+                st = _mm_loadu_si128((__m128i*)&qrr[t]);
+                mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_));
+                tmp = _mm_cmpeq_epi8(sq, st);
+#ifdef __SSE4_1__
+                tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
+                tmp = _mm_blendv_epi8(tmp,     sc_N_,   mask);
+#else
+                tmp = _mm_or_si128(_mm_andnot_si128(tmp, sc_mis_), _mm_and_si128(tmp, sc_mch_));
+                tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp), _mm_and_si128(mask, sc_N_));
+#endif
+                _mm_storeu_si128((__m128i*)((uint8_t*)s + t), tmp);
+            }
+        } else {
+            for (t = st0; t <= en0; ++t)
+                ((uint8_t*)s)[t] = mat[sf[t] * m + qrr[t]];
+        }
+        // core loop
+        x1_ = _mm_cvtsi32_si128(x1);
+        v1_ = _mm_cvtsi32_si128(v1);
+        st_ = st / 16, en_ = en / 16;
+        assert(en_ - st_ + 1 <= n_col_);
+        if (!with_cigar) { // score only
+            for (t = st_; t <= en_; ++t) {
+                __m128i z, a, b, xt1, vt1, ut, tmp;
+                __dp_code_block1;
+#ifdef __SSE4_1__
+                z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
+#else  // we need to emulate SSE4.1 intrinsics _mm_max_epi8()
+                z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
+                z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
+#endif
+                __dp_code_block2;
+#ifdef __SSE4_1__
+                _mm_store_si128(&x[t], _mm_max_epi8(a, zero_));
+                _mm_store_si128(&y[t], _mm_max_epi8(b, zero_));
+#else
+                tmp = _mm_cmpgt_epi8(a, zero_);
+                _mm_store_si128(&x[t], _mm_and_si128(a, tmp));
+                tmp = _mm_cmpgt_epi8(b, zero_);
+                _mm_store_si128(&y[t], _mm_and_si128(b, tmp));
+#endif
+            }
+        } else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
+            __m128i *pr = p + (size_t)r * n_col_ - st_;
+            off[r] = st, off_end[r] = en;
+            for (t = st_; t <= en_; ++t) {
+                __m128i d, z, a, b, xt1, vt1, ut, tmp;
+                __dp_code_block1;
+                d = _mm_and_si128(_mm_cmpgt_epi8(a, z), flag1_); // d = a > z? 1 : 0
+#ifdef __SSE4_1__
+                z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
+                tmp = _mm_cmpgt_epi8(b, z);
+                d = _mm_blendv_epi8(d, flag2_, tmp);             // d = b > z? 2 : d
+#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
+                z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
+                z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
+                tmp = _mm_cmpgt_epi8(b, z);
+                d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, flag2_)); // d = b > z? 2 : d; emulating blendv
+#endif
+                __dp_code_block2;
+                tmp = _mm_cmpgt_epi8(a, zero_);
+                _mm_store_si128(&x[t], _mm_and_si128(tmp, a));
+                d = _mm_or_si128(d, _mm_and_si128(tmp, flag8_));  // d = a > 0? 0x08 : 0
+                tmp = _mm_cmpgt_epi8(b, zero_);
+                _mm_store_si128(&y[t], _mm_and_si128(tmp, b));
+                d = _mm_or_si128(d, _mm_and_si128(tmp, flag16_)); // d = b > 0? 0x10 : 0
+                _mm_store_si128(&pr[t], d);
+            }
+        } else { // gap right-alignment
+            __m128i *pr = p + (size_t)r * n_col_ - st_;
+            off[r] = st, off_end[r] = en;
+            for (t = st_; t <= en_; ++t) {
+                __m128i d, z, a, b, xt1, vt1, ut, tmp;
+                __dp_code_block1;
+                d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), flag1_); // d = z > a? 0 : 1
+#ifdef __SSE4_1__
+                z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
+                tmp = _mm_cmpgt_epi8(z, b);
+                d = _mm_blendv_epi8(flag2_, d, tmp);             // d = z > b? d : 2
+#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
+                z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
+                z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
+                tmp = _mm_cmpgt_epi8(z, b);
+                d = _mm_or_si128(_mm_andnot_si128(tmp, flag2_), _mm_and_si128(tmp, d)); // d = z > b? d : 2; emulating blendv
+#endif
+                __dp_code_block2;
+                tmp = _mm_cmpgt_epi8(zero_, a);
+                _mm_store_si128(&x[t], _mm_andnot_si128(tmp, a));
+                d = _mm_or_si128(d, _mm_andnot_si128(tmp, flag8_));  // d = 0 > a? 0 : 0x08
+                tmp = _mm_cmpgt_epi8(zero_, b);
+                _mm_store_si128(&y[t], _mm_andnot_si128(tmp, b));
+                d = _mm_or_si128(d, _mm_andnot_si128(tmp, flag16_)); // d = 0 > b? 0 : 0x10
+                _mm_store_si128(&pr[t], d);
+            }
+        }
+        if (!approx_max) { // find the exact max with a 32-bit score array
+            int32_t max_H, max_t;
+            // compute H[], max_H and max_t
+            if (r > 0) {
+                int32_t HH[4], tt[4], en1 = st0 + (en0 - st0) / 4 * 4, i;
+                __m128i max_H_, max_t_, qe_;
+                max_H = H[en0] = en0 > 0? H[en0-1] + u8[en0] - qe : H[en0] + v8[en0] - qe; // special casing the last element
+                max_t = en0;
+                max_H_ = _mm_set1_epi32(max_H);
+                max_t_ = _mm_set1_epi32(max_t);
+                qe_    = _mm_set1_epi32(q + e);
+                for (t = st0; t < en1; t += 4) { // this implements: H[t]+=v8[t]-qe; if(H[t]>max_H) max_H=H[t],max_t=t;
+                    __m128i H1, tmp, t_;
+                    H1 = _mm_loadu_si128((__m128i*)&H[t]);
+                    t_ = _mm_setr_epi32(v8[t], v8[t+1], v8[t+2], v8[t+3]);
+                    H1 = _mm_add_epi32(H1, t_);
+                    H1 = _mm_sub_epi32(H1, qe_);
+                    _mm_storeu_si128((__m128i*)&H[t], H1);
+                    t_ = _mm_set1_epi32(t);
+                    tmp = _mm_cmpgt_epi32(H1, max_H_);
+#ifdef __SSE4_1__
+                    max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
+                    max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
+#else
+                    max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
+                    max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
+#endif
+                }
+                _mm_storeu_si128((__m128i*)HH, max_H_);
+                _mm_storeu_si128((__m128i*)tt, max_t_);
+                for (i = 0; i < 4; ++i)
+                    if (max_H < HH[i]) max_H = HH[i], max_t = tt[i] + i;
+                for (; t < en0; ++t) { // for the rest of values that haven't been computed with SSE
+                    H[t] += (int32_t)v8[t] - qe;
+                    if (H[t] > max_H)
+                        max_H = H[t], max_t = t;
+                }
+            } else H[0] = v8[0] - qe - qe, max_H = H[0], max_t = 0; // special casing r==0
+            // update ez
+            if (en0 == tlen - 1 && H[en0] > ez->mte)
+                ez->mte = H[en0], ez->mte_q = r - en;
+            if (r - st0 == qlen - 1 && H[st0] > ez->mqe)
+                ez->mqe = H[st0], ez->mqe_t = st0;
+            if (ksw_apply_zdrop(ez, 1, max_H, r, max_t, zdrop, e)) break;
+            if (r == qlen + tlen - 2 && en0 == tlen - 1)
+                ez->score = H[tlen - 1];
+        } else { // find approximate max; Z-drop might be inaccurate, too.
+            if (r > 0) {
+                if (last_H0_t >= st0 && last_H0_t <= en0 && last_H0_t + 1 >= st0 && last_H0_t + 1 <= en0) {
+                    int32_t d0 = v8[last_H0_t] - qe;
+                    int32_t d1 = u8[last_H0_t + 1] - qe;
+                    if (d0 > d1) H0 += d0;
+                    else H0 += d1, ++last_H0_t;
+                } else if (last_H0_t >= st0 && last_H0_t <= en0) {
+                    H0 += v8[last_H0_t] - qe;
+                } else {
+                    ++last_H0_t, H0 += u8[last_H0_t] - qe;
+                }
+                if ((flag & KSW_EZ_APPROX_DROP) && ksw_apply_zdrop(ez, 1, H0, r, last_H0_t, zdrop, e)) break;
+            } else H0 = v8[0] - qe - qe, last_H0_t = 0;
+            if (r == qlen + tlen - 2 && en0 == tlen - 1)
+                ez->score = H0;
+        }
+        last_st = st, last_en = en;
+        //for (t = st0; t <= en0; ++t) printf("(%d,%d)\t(%d,%d,%d,%d)\t%d\n", r, t, ((int8_t*)u)[t], ((int8_t*)v)[t], ((int8_t*)x)[t], ((int8_t*)y)[t], H[t]); // for debugging
+    }
+    kfree(km, mem);
+    if (!approx_max) kfree(km, H);
+    if (with_cigar) { // backtrack
+        int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
+        if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY)) {
+            ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
+        } else if (!ez->zdropped && (flag&KSW_EZ_EXTZ_ONLY) && ez->mqe + end_bonus > (int)ez->max) {
+            ez->reach_end = 1;
+            ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->mqe_t, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
+        } else if (ez->max_t >= 0 && ez->max_q >= 0) {
+            ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
+        }
+        kfree(km, mem2); kfree(km, off);
+    }
+}
+#endif // __SSE2__
--- /dev/null
+++ b/data/ext/minimap2/ksw2_ll_sse.c
@@ -0,0 +1,152 @@
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include "ksw2.h"
+
+#ifdef USE_SIMDE
+#include <simde/x86/sse2.h>
+#else
+#include <emmintrin.h>
+#endif
+
+#ifdef __GNUC__
+#define LIKELY(x) __builtin_expect((x),1)
+#define UNLIKELY(x) __builtin_expect((x),0)
+#else
+#define LIKELY(x) (x)
+#define UNLIKELY(x) (x)
+#endif
+
+typedef struct {
+    int qlen, slen;
+    uint8_t shift, mdiff, max, size;
+    __m128i *qp, *H0, *H1, *E, *Hmax;
+} kswq_t;
+
+/**
+ * Initialize the query data structure
+ *
+ * @param size   Number of bytes used to store a score; valid valures are 1 or 2
+ * @param qlen   Length of the query sequence
+ * @param query  Query sequence
+ * @param m      Size of the alphabet
+ * @param mat    Scoring matrix in a one-dimension array
+ *
+ * @return       Query data structure
+ */
+void *ksw_ll_qinit(void *km, int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
+{
+    kswq_t *q;
+    int slen, a, tmp, p;
+
+    size = size > 1? 2 : 1;
+    p = 8 * (3 - size); // # values per __m128i
+    slen = (qlen + p - 1) / p; // segmented length
+    q = (kswq_t*)kmalloc(km, sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
+    q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory
+    q->H0 = q->qp + slen * m;
+    q->H1 = q->H0 + slen;
+    q->E  = q->H1 + slen;
+    q->Hmax = q->E + slen;
+    q->slen = slen; q->qlen = qlen; q->size = size;
+    // compute shift
+    tmp = m * m;
+    for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score
+        if (mat[a] < (int8_t)q->shift) q->shift = mat[a];
+        if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a];
+    }
+    q->max = q->mdiff;
+    q->shift = 256 - q->shift; // NB: q->shift is uint8_t
+    q->mdiff += q->shift; // this is the difference between the min and max scores
+    // An example: p=8, qlen=19, slen=3 and segmentation:
+    //  {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}}
+    if (size == 1) {
+        int8_t *t = (int8_t*)q->qp;
+        for (a = 0; a < m; ++a) {
+            int i, k, nlen = slen * p;
+            const int8_t *ma = mat + a * m;
+            for (i = 0; i < slen; ++i)
+                for (k = i; k < nlen; k += slen) // p iterations
+                    *t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift;
+        }
+    } else {
+        int16_t *t = (int16_t*)q->qp;
+        for (a = 0; a < m; ++a) {
+            int i, k, nlen = slen * p;
+            const int8_t *ma = mat + a * m;
+            for (i = 0; i < slen; ++i)
+                for (k = i; k < nlen; k += slen) // p iterations
+                    *t++ = (k >= qlen? 0 : ma[query[k]]);
+        }
+    }
+    return q;
+}
+
+int ksw_ll_i16(void *q_, int tlen, const uint8_t *target, int _gapo, int _gape, int *qe, int *te)
+{
+    kswq_t *q = (kswq_t*)q_;
+    int slen, i, gmax = 0, qlen8;
+    __m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax;
+    uint16_t *H8;
+
+#define __max_8(ret, xx) do { \
+        (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
+        (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
+        (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \
+        (ret) = _mm_extract_epi16((xx), 0); \
+    } while (0)
+
+    // initialization
+    *qe = *te = -1;
+    zero = _mm_set1_epi32(0);
+    gapoe = _mm_set1_epi16(_gapo + _gape);
+    gape = _mm_set1_epi16(_gape);
+    H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
+    slen = q->slen, qlen8 = slen * 8;
+    memset(E,    0, slen * sizeof(__m128i));
+    memset(H0,   0, slen * sizeof(__m128i));
+    memset(Hmax, 0, slen * sizeof(__m128i));
+    // the core loop
+    for (i = 0; i < tlen; ++i) {
+        int j, k, imax;
+        __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
+        h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
+        h = _mm_slli_si128(h, 2);
+        for (j = 0; LIKELY(j < slen); ++j) {
+            h = _mm_adds_epi16(h, *S++);
+            e = _mm_load_si128(E + j);
+            h = _mm_max_epi16(h, e);
+            h = _mm_max_epi16(h, f);
+            max = _mm_max_epi16(max, h);
+            _mm_store_si128(H1 + j, h);
+            h = _mm_subs_epu16(h, gapoe);
+            e = _mm_subs_epu16(e, gape);
+            e = _mm_max_epi16(e, h);
+            _mm_store_si128(E + j, e);
+            f = _mm_subs_epu16(f, gape);
+            f = _mm_max_epi16(f, h);
+            h = _mm_load_si128(H0 + j);
+        }
+        for (k = 0; LIKELY(k < 8); ++k) {
+            f = _mm_slli_si128(f, 2);
+            for (j = 0; LIKELY(j < slen); ++j) {
+                h = _mm_load_si128(H1 + j);
+                h = _mm_max_epi16(h, f);
+                _mm_store_si128(H1 + j, h);
+                h = _mm_subs_epu16(h, gapoe);
+                f = _mm_subs_epu16(f, gape);
+                if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop_i16;
+            }
+        }
+end_loop_i16:
+        __max_8(imax, max);
+        if (imax >= gmax) {
+            gmax = imax; *te = i;
+            memcpy(Hmax, H1, slen * sizeof(__m128i));
+        }
+        S = H1; H1 = H0; H0 = S;
+    }
+    for (i = 0, H8 = (uint16_t*)Hmax; i < qlen8; ++i)
+        if ((int)H8[i] == gmax) *qe = i / 8 + i % 8 * slen;
+    return gmax;
+}
--- /dev/null
+++ b/data/ext/minimap2/kthread.c
@@ -0,0 +1,159 @@
+#include <pthread.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <stdint.h>
+#include "kthread.h"
+
+#if (defined(WIN32) || defined(_WIN32)) && defined(_MSC_VER)
+#define __sync_fetch_and_add(ptr, addend) _InterlockedExchangeAdd((void*)ptr, addend)
+#endif
+
+/************
+ * kt_for() *
+ ************/
+
+struct kt_for_t;
+
+typedef struct {
+    struct kt_for_t *t;
+    long i;
+} ktf_worker_t;
+
+typedef struct kt_for_t {
+    int n_threads;
+    long n;
+    ktf_worker_t *w;
+    void (*func)(void*,long,int);
+    void *data;
+} kt_for_t;
+
+static inline long steal_work(kt_for_t *t)
+{
+    int i, min_i = -1;
+    long k, min = LONG_MAX;
+    for (i = 0; i < t->n_threads; ++i)
+        if (min > t->w[i].i) min = t->w[i].i, min_i = i;
+    k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads);
+    return k >= t->n? -1 : k;
+}
+
+static void *ktf_worker(void *data)
+{
+    ktf_worker_t *w = (ktf_worker_t*)data;
+    long i;
+    for (;;) {
+        i = __sync_fetch_and_add(&w->i, w->t->n_threads);
+        if (i >= w->t->n) break;
+        w->t->func(w->t->data, i, w - w->t->w);
+    }
+    while ((i = steal_work(w->t)) >= 0)
+        w->t->func(w->t->data, i, w - w->t->w);
+    pthread_exit(0);
+}
+
+void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n)
+{
+    if (n_threads > 1) {
+        int i;
+        kt_for_t t;
+        pthread_t *tid;
+        t.func = func, t.data = data, t.n_threads = n_threads, t.n = n;
+        t.w = (ktf_worker_t*)calloc(n_threads, sizeof(ktf_worker_t));
+        tid = (pthread_t*)calloc(n_threads, sizeof(pthread_t));
+        for (i = 0; i < n_threads; ++i)
+            t.w[i].t = &t, t.w[i].i = i;
+        for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker, &t.w[i]);
+        for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
+        free(tid); free(t.w);
+    } else {
+        long j;
+        for (j = 0; j < n; ++j) func(data, j, 0);
+    }
+}
+
+/*****************
+ * kt_pipeline() *
+ *****************/
+
+struct ktp_t;
+
+typedef struct {
+    struct ktp_t *pl;
+    int64_t index;
+    int step;
+    void *data;
+} ktp_worker_t;
+
+typedef struct ktp_t {
+    void *shared;
+    void *(*func)(void*, int, void*);
+    int64_t index;
+    int n_workers, n_steps;
+    ktp_worker_t *workers;
+    pthread_mutex_t mutex;
+    pthread_cond_t cv;
+} ktp_t;
+
+static void *ktp_worker(void *data)
+{
+    ktp_worker_t *w = (ktp_worker_t*)data;
+    ktp_t *p = w->pl;
+    while (w->step < p->n_steps) {
+        // test whether we can kick off the job with this worker
+        pthread_mutex_lock(&p->mutex);
+        for (;;) {
+            int i;
+            // test whether another worker is doing the same step
+            for (i = 0; i < p->n_workers; ++i) {
+                if (w == &p->workers[i]) continue; // ignore itself
+                if (p->workers[i].step <= w->step && p->workers[i].index < w->index)
+                    break;
+            }
+            if (i == p->n_workers) break; // no workers with smaller indices are doing w->step or the previous steps
+            pthread_cond_wait(&p->cv, &p->mutex);
+        }
+        pthread_mutex_unlock(&p->mutex);
+
+        // working on w->step
+        w->data = p->func(p->shared, w->step, w->step? w->data : 0); // for the first step, input is NULL
+
+        // update step and let other workers know
+        pthread_mutex_lock(&p->mutex);
+        w->step = w->step == p->n_steps - 1 || w->data? (w->step + 1) % p->n_steps : p->n_steps;
+        if (w->step == 0) w->index = p->index++;
+        pthread_cond_broadcast(&p->cv);
+        pthread_mutex_unlock(&p->mutex);
+    }
+    pthread_exit(0);
+}
+
+void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps)
+{
+    ktp_t aux;
+    pthread_t *tid;
+    int i;
+
+    if (n_threads < 1) n_threads = 1;
+    aux.n_workers = n_threads;
+    aux.n_steps = n_steps;
+    aux.func = func;
+    aux.shared = shared_data;
+    aux.index = 0;
+    pthread_mutex_init(&aux.mutex, 0);
+    pthread_cond_init(&aux.cv, 0);
+
+    aux.workers = (ktp_worker_t*)calloc(n_threads, sizeof(ktp_worker_t));
+    for (i = 0; i < n_threads; ++i) {
+        ktp_worker_t *w = &aux.workers[i];
+        w->step = 0; w->pl = &aux; w->data = 0;
+        w->index = aux.index++;
+    }
+
+    tid = (pthread_t*)calloc(n_threads, sizeof(pthread_t));
+    for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktp_worker, &aux.workers[i]);
+    for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
+    free(tid); free(aux.workers);
+
+    pthread_mutex_destroy(&aux.mutex);
+    pthread_cond_destroy(&aux.cv);
+}
--- /dev/null
+++ b/data/ext/minimap2/kthread.h
@@ -0,0 +1,15 @@
+#ifndef KTHREAD_H
+#define KTHREAD_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n);
+void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif