minimap2 0.2.24.3 → 0.2.24.6

Files changed (101)
  1. checksums.yaml +4 -4
  2. data/ext/minimap2/lib/simde/CONTRIBUTING.md +114 -0
  3. data/ext/minimap2/lib/simde/COPYING +20 -0
  4. data/ext/minimap2/lib/simde/README.md +333 -0
  5. data/ext/minimap2/lib/simde/amalgamate.py +58 -0
  6. data/ext/minimap2/lib/simde/meson.build +33 -0
  7. data/ext/minimap2/lib/simde/netlify.toml +20 -0
  8. data/ext/minimap2/lib/simde/simde/arm/neon/float32x2.h +140 -0
  9. data/ext/minimap2/lib/simde/simde/arm/neon/float32x4.h +137 -0
  10. data/ext/minimap2/lib/simde/simde/arm/neon/float64x1.h +142 -0
  11. data/ext/minimap2/lib/simde/simde/arm/neon/float64x2.h +145 -0
  12. data/ext/minimap2/lib/simde/simde/arm/neon/int16x4.h +140 -0
  13. data/ext/minimap2/lib/simde/simde/arm/neon/int16x8.h +145 -0
  14. data/ext/minimap2/lib/simde/simde/arm/neon/int32x2.h +140 -0
  15. data/ext/minimap2/lib/simde/simde/arm/neon/int32x4.h +143 -0
  16. data/ext/minimap2/lib/simde/simde/arm/neon/int64x1.h +137 -0
  17. data/ext/minimap2/lib/simde/simde/arm/neon/int64x2.h +141 -0
  18. data/ext/minimap2/lib/simde/simde/arm/neon/int8x16.h +147 -0
  19. data/ext/minimap2/lib/simde/simde/arm/neon/int8x8.h +141 -0
  20. data/ext/minimap2/lib/simde/simde/arm/neon/uint16x4.h +134 -0
  21. data/ext/minimap2/lib/simde/simde/arm/neon/uint16x8.h +138 -0
  22. data/ext/minimap2/lib/simde/simde/arm/neon/uint32x2.h +134 -0
  23. data/ext/minimap2/lib/simde/simde/arm/neon/uint32x4.h +137 -0
  24. data/ext/minimap2/lib/simde/simde/arm/neon/uint64x1.h +131 -0
  25. data/ext/minimap2/lib/simde/simde/arm/neon/uint64x2.h +135 -0
  26. data/ext/minimap2/lib/simde/simde/arm/neon/uint8x16.h +141 -0
  27. data/ext/minimap2/lib/simde/simde/arm/neon/uint8x8.h +135 -0
  28. data/ext/minimap2/lib/simde/simde/arm/neon.h +97 -0
  29. data/ext/minimap2/lib/simde/simde/check.h +267 -0
  30. data/ext/minimap2/lib/simde/simde/debug-trap.h +83 -0
  31. data/ext/minimap2/lib/simde/simde/hedley.h +1899 -0
  32. data/ext/minimap2/lib/simde/simde/simde-arch.h +445 -0
  33. data/ext/minimap2/lib/simde/simde/simde-common.h +697 -0
  34. data/ext/minimap2/lib/simde/simde/x86/avx.h +5385 -0
  35. data/ext/minimap2/lib/simde/simde/x86/avx2.h +2402 -0
  36. data/ext/minimap2/lib/simde/simde/x86/avx512bw.h +391 -0
  37. data/ext/minimap2/lib/simde/simde/x86/avx512f.h +3389 -0
  38. data/ext/minimap2/lib/simde/simde/x86/avx512vl.h +112 -0
  39. data/ext/minimap2/lib/simde/simde/x86/fma.h +659 -0
  40. data/ext/minimap2/lib/simde/simde/x86/mmx.h +2210 -0
  41. data/ext/minimap2/lib/simde/simde/x86/sse.h +3696 -0
  42. data/ext/minimap2/lib/simde/simde/x86/sse2.h +5991 -0
  43. data/ext/minimap2/lib/simde/simde/x86/sse3.h +343 -0
  44. data/ext/minimap2/lib/simde/simde/x86/sse4.1.h +1783 -0
  45. data/ext/minimap2/lib/simde/simde/x86/sse4.2.h +105 -0
  46. data/ext/minimap2/lib/simde/simde/x86/ssse3.h +1053 -0
  47. data/ext/minimap2/lib/simde/simde/x86/svml.h +543 -0
  48. data/ext/minimap2/lib/simde/test/CMakeLists.txt +166 -0
  49. data/ext/minimap2/lib/simde/test/arm/meson.build +4 -0
  50. data/ext/minimap2/lib/simde/test/arm/neon/meson.build +23 -0
  51. data/ext/minimap2/lib/simde/test/arm/neon/skel.c +871 -0
  52. data/ext/minimap2/lib/simde/test/arm/neon/test-neon-internal.h +134 -0
  53. data/ext/minimap2/lib/simde/test/arm/neon/test-neon.c +39 -0
  54. data/ext/minimap2/lib/simde/test/arm/neon/test-neon.h +10 -0
  55. data/ext/minimap2/lib/simde/test/arm/neon/vadd.c +1260 -0
  56. data/ext/minimap2/lib/simde/test/arm/neon/vdup_n.c +873 -0
  57. data/ext/minimap2/lib/simde/test/arm/neon/vmul.c +1084 -0
  58. data/ext/minimap2/lib/simde/test/arm/neon/vsub.c +1260 -0
  59. data/ext/minimap2/lib/simde/test/arm/test-arm-internal.h +18 -0
  60. data/ext/minimap2/lib/simde/test/arm/test-arm.c +20 -0
  61. data/ext/minimap2/lib/simde/test/arm/test-arm.h +8 -0
  62. data/ext/minimap2/lib/simde/test/cmake/AddCompilerFlags.cmake +171 -0
  63. data/ext/minimap2/lib/simde/test/cmake/ExtraWarningFlags.cmake +68 -0
  64. data/ext/minimap2/lib/simde/test/meson.build +64 -0
  65. data/ext/minimap2/lib/simde/test/munit/COPYING +21 -0
  66. data/ext/minimap2/lib/simde/test/munit/Makefile +55 -0
  67. data/ext/minimap2/lib/simde/test/munit/README.md +54 -0
  68. data/ext/minimap2/lib/simde/test/munit/example.c +351 -0
  69. data/ext/minimap2/lib/simde/test/munit/meson.build +37 -0
  70. data/ext/minimap2/lib/simde/test/munit/munit.c +2055 -0
  71. data/ext/minimap2/lib/simde/test/munit/munit.h +535 -0
  72. data/ext/minimap2/lib/simde/test/run-tests.c +20 -0
  73. data/ext/minimap2/lib/simde/test/run-tests.h +260 -0
  74. data/ext/minimap2/lib/simde/test/x86/avx.c +13752 -0
  75. data/ext/minimap2/lib/simde/test/x86/avx2.c +9977 -0
  76. data/ext/minimap2/lib/simde/test/x86/avx512bw.c +2664 -0
  77. data/ext/minimap2/lib/simde/test/x86/avx512f.c +10416 -0
  78. data/ext/minimap2/lib/simde/test/x86/avx512vl.c +210 -0
  79. data/ext/minimap2/lib/simde/test/x86/fma.c +2557 -0
  80. data/ext/minimap2/lib/simde/test/x86/meson.build +33 -0
  81. data/ext/minimap2/lib/simde/test/x86/mmx.c +2878 -0
  82. data/ext/minimap2/lib/simde/test/x86/skel.c +2984 -0
  83. data/ext/minimap2/lib/simde/test/x86/sse.c +5121 -0
  84. data/ext/minimap2/lib/simde/test/x86/sse2.c +9860 -0
  85. data/ext/minimap2/lib/simde/test/x86/sse3.c +486 -0
  86. data/ext/minimap2/lib/simde/test/x86/sse4.1.c +3446 -0
  87. data/ext/minimap2/lib/simde/test/x86/sse4.2.c +101 -0
  88. data/ext/minimap2/lib/simde/test/x86/ssse3.c +2084 -0
  89. data/ext/minimap2/lib/simde/test/x86/svml.c +1545 -0
  90. data/ext/minimap2/lib/simde/test/x86/test-avx.h +16 -0
  91. data/ext/minimap2/lib/simde/test/x86/test-avx512.h +25 -0
  92. data/ext/minimap2/lib/simde/test/x86/test-mmx.h +13 -0
  93. data/ext/minimap2/lib/simde/test/x86/test-sse.h +13 -0
  94. data/ext/minimap2/lib/simde/test/x86/test-sse2.h +13 -0
  95. data/ext/minimap2/lib/simde/test/x86/test-x86-internal.h +196 -0
  96. data/ext/minimap2/lib/simde/test/x86/test-x86.c +48 -0
  97. data/ext/minimap2/lib/simde/test/x86/test-x86.h +8 -0
  98. data/lib/minimap2/aligner.rb +2 -2
  99. data/lib/minimap2/ffi/constants.rb +3 -0
  100. data/lib/minimap2/version.rb +1 -1
  101. metadata +99 -3
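Most of this release vendors the SIMDe ("SIMD Everywhere") library under data/ext/minimap2/lib/simde: header-only, portable implementations of x86 SSE/AVX intrinsics, including NEON-based fallbacks, presumably so the minimap2 native extension also builds on non-x86 targets. As a minimal sketch of how such a vendored header is typically consumed (hypothetical usage, not code from this gem), a C translation unit includes the portable header and may opt into the familiar _mm256_* names via SIMDE_ENABLE_NATIVE_ALIASES; the alias machinery is visible in the avx2.h hunk below.

  /* Hypothetical consumer of the vendored SIMDe header; not part of this diff. */
  #define SIMDE_ENABLE_NATIVE_ALIASES   /* opt in to _mm256_* alias macros */
  #include "simde/x86/avx2.h"

  #include <stdint.h>
  #include <stdio.h>

  int main(void) {
    simde__m256i a = simde_mm256_set1_epi32(3);
    simde__m256i b = simde_mm256_set1_epi32(4);
    /* Uses native AVX2 when SIMDE_AVX2_NATIVE is detected, a portable fallback otherwise. */
    simde__m256i c = simde_mm256_add_epi32(a, b);
    int32_t out[8];
    simde_mm256_storeu_si256((simde__m256i *) out, c);
    printf("%d\n", (int) out[0]);   /* prints 7 */
    return 0;
  }

The same pattern applies to the other vendored x86 and ARM NEON headers listed above.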
data/ext/minimap2/lib/simde/simde/x86/avx2.h
@@ -0,0 +1,2402 @@
1
+ /* Permission is hereby granted, free of charge, to any person
2
+ * obtaining a copy of this software and associated documentation
3
+ * files (the "Software"), to deal in the Software without
4
+ * restriction, including without limitation the rights to use, copy,
5
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
6
+ * of the Software, and to permit persons to whom the Software is
7
+ * furnished to do so, subject to the following conditions:
8
+ *
9
+ * The above copyright notice and this permission notice shall be
10
+ * included in all copies or substantial portions of the Software.
11
+ *
12
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
13
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
14
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
15
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
16
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
17
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ * SOFTWARE.
20
+ *
21
+ * Copyright:
22
+ * 2018 Evan Nemerson <evan@nemerson.com>
23
+ * 2019 Michael R. Crusoe <michael.crusoe@gmail.com>
24
+ */
25
+
26
+ #include "sse4.1.h"
27
+ #include "sse4.2.h"
28
+ #if !defined(SIMDE__AVX2_H)
29
+ # if !defined(SIMDE__AVX2_H)
30
+ # define SIMDE__AVX2_H
31
+ # endif
32
+ # include "avx.h"
33
+
34
+ HEDLEY_DIAGNOSTIC_PUSH
35
+ SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
36
+
37
+ # if defined(SIMDE_AVX2_NATIVE)
38
+ # undef SIMDE_AVX2_NATIVE
39
+ # endif
40
+ # if defined(SIMDE_ARCH_X86_AVX2) && !defined(SIMDE_AVX2_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
41
+ # define SIMDE_AVX2_NATIVE
42
+ # elif defined(SIMDE_ARCH_ARM_NEON) && !defined(SIMDE_AVX2_NO_NEON) && !defined(SIMDE_NO_NEON)
43
+ # define SIMDE_AVX2_NEON
44
+ # endif
45
+
46
+ # if defined(SIMDE_AVX2_NATIVE) && !defined(SIMDE_AVX_NATIVE)
47
+ # if defined(SIMDE_AVX2_FORCE_NATIVE)
48
+ # error Native AVX2 support requires native AVX support
49
+ # else
50
+ HEDLEY_WARNING("Native AVX2 support requires native AVX support, disabling")
51
+ # undef SIMDE_AVX2_NATIVE
52
+ # endif
53
+ # elif defined(SIMDE_AVX2_NEON) && !defined(SIMDE_AVX_NEON)
54
+ HEDLEY_WARNING("AVX2 NEON support requires AVX NEON support, disabling")
55
+ # undef SIMDE_AVX_NEON
56
+ # endif
57
+
58
+ # if defined(SIMDE_AVX2_NATIVE)
59
+ # include <immintrin.h>
60
+ # endif
61
+
62
+ # if !defined(SIMDE_AVX2_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
63
+ # define SIMDE_AVX2_ENABLE_NATIVE_ALIASES
64
+ # endif
65
+
66
+ # include <stdint.h>
67
+
68
+ SIMDE__BEGIN_DECLS
69
+
70
+ SIMDE__FUNCTION_ATTRIBUTES
71
+ simde__m256i
72
+ simde_mm256_abs_epi8 (simde__m256i a) {
73
+ #if defined(SIMDE_AVX2_NATIVE)
74
+ return _mm256_abs_epi8(a);
75
+ #else
76
+ simde__m256i_private
77
+ r_,
78
+ a_ = simde__m256i_to_private(a);
79
+
80
+ SIMDE__VECTORIZE
81
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
82
+ r_.i8[i] = (a_.i8[i] < INT32_C(0)) ? -a_.i8[i] : a_.i8[i];
83
+ }
84
+
85
+ return simde__m256i_from_private(r_);
86
+ #endif
87
+ }
88
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
89
+ # define _mm256_abs_epi8(a) simde_mm256_abs_epi8(a)
90
+ #endif
91
+
92
+ SIMDE__FUNCTION_ATTRIBUTES
93
+ simde__m256i
94
+ simde_mm256_abs_epi16 (simde__m256i a) {
95
+ #if defined(SIMDE_AVX2_NATIVE)
96
+ return _mm256_abs_epi16(a);
97
+ #else
98
+ simde__m256i_private
99
+ r_,
100
+ a_ = simde__m256i_to_private(a);
101
+
102
+ SIMDE__VECTORIZE
103
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
104
+ r_.i16[i] = (a_.i16[i] < INT32_C(0)) ? -a_.i16[i] : a_.i16[i];
105
+ }
106
+
107
+ return simde__m256i_from_private(r_);
108
+ #endif
109
+ }
110
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
111
+ # define _mm256_abs_epi16(a) simde_mm256_abs_epi16(a)
112
+ #endif
113
+
114
+ SIMDE__FUNCTION_ATTRIBUTES
115
+ simde__m256i
116
+ simde_mm256_abs_epi32(simde__m256i a) {
117
+ #if defined(SIMDE_AVX2_NATIVE)
118
+ return _mm256_abs_epi32(a);
119
+ #else
120
+ simde__m256i_private
121
+ r_,
122
+ a_ = simde__m256i_to_private(a);
123
+
124
+ SIMDE__VECTORIZE
125
+ for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
126
+ r_.i32[i] = (a_.i32[i] < INT32_C(0)) ? -a_.i32[i] : a_.i32[i];
127
+ }
128
+
129
+ return simde__m256i_from_private(r_);
130
+ #endif
131
+ }
132
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
133
+ # define _mm256_abs_epi32(a) simde_mm256_abs_epi32(a)
134
+ #endif
135
+
136
+ SIMDE__FUNCTION_ATTRIBUTES
137
+ simde__m256i
138
+ simde_mm256_add_epi8 (simde__m256i a, simde__m256i b) {
139
+ #if defined(SIMDE_AVX2_NATIVE)
140
+ return _mm256_add_epi8(a, b);
141
+ #else
142
+ simde__m256i_private
143
+ r_,
144
+ a_ = simde__m256i_to_private(a),
145
+ b_ = simde__m256i_to_private(b);
146
+
147
+ #if defined(SIMDE_ARCH_X86_SSE2)
148
+ r_.m128i[0] = simde_mm_add_epi8(a_.m128i[0], b_.m128i[0]);
149
+ r_.m128i[1] = simde_mm_add_epi8(a_.m128i[1], b_.m128i[1]);
150
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
151
+ r_.i8 = a_.i8 + b_.i8;
152
+ #else
153
+ SIMDE__VECTORIZE
154
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
155
+ r_.i8[i] = a_.i8[i] + b_.i8[i];
156
+ }
157
+ #endif
158
+
159
+ return simde__m256i_from_private(r_);
160
+ #endif
161
+ }
162
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
163
+ # define _mm256_add_epi8(a, b) simde_mm256_add_epi8(a, b)
164
+ #endif
165
+
166
+ SIMDE__FUNCTION_ATTRIBUTES
167
+ simde__m256i
168
+ simde_mm256_add_epi16 (simde__m256i a, simde__m256i b) {
169
+ #if defined(SIMDE_AVX2_NATIVE)
170
+ return _mm256_add_epi16(a, b);
171
+ #else
172
+ simde__m256i_private
173
+ r_,
174
+ a_ = simde__m256i_to_private(a),
175
+ b_ = simde__m256i_to_private(b);
176
+
177
+ #if defined(SIMDE_ARCH_X86_SSE2)
178
+ r_.m128i[0] = simde_mm_add_epi16(a_.m128i[0], b_.m128i[0]);
179
+ r_.m128i[1] = simde_mm_add_epi16(a_.m128i[1], b_.m128i[1]);
180
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
181
+ r_.i16 = a_.i16 + b_.i16;
182
+ #else
183
+ SIMDE__VECTORIZE
184
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
185
+ r_.i16[i] = a_.i16[i] + b_.i16[i];
186
+ }
187
+ #endif
188
+
189
+ return simde__m256i_from_private(r_);
190
+ #endif
191
+ }
192
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
193
+ # define _mm256_add_epi16(a, b) simde_mm256_add_epi16(a, b)
194
+ #endif
195
+
196
+ SIMDE__FUNCTION_ATTRIBUTES
197
+ simde__m256i
198
+ simde_mm256_add_epi32 (simde__m256i a, simde__m256i b) {
199
+ #if defined(SIMDE_AVX2_NATIVE)
200
+ return _mm256_add_epi32(a, b);
201
+ #else
202
+ simde__m256i_private
203
+ r_,
204
+ a_ = simde__m256i_to_private(a),
205
+ b_ = simde__m256i_to_private(b);
206
+
207
+ #if defined(SIMDE_ARCH_X86_SSE2)
208
+ r_.m128i[0] = simde_mm_add_epi32(a_.m128i[0], b_.m128i[0]);
209
+ r_.m128i[1] = simde_mm_add_epi32(a_.m128i[1], b_.m128i[1]);
210
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
211
+ r_.i32 = a_.i32 + b_.i32;
212
+ #else
213
+ SIMDE__VECTORIZE
214
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
215
+ r_.i32[i] = a_.i32[i] + b_.i32[i];
216
+ }
217
+ #endif
218
+
219
+ return simde__m256i_from_private(r_);
220
+ #endif
221
+ }
222
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
223
+ # define _mm256_add_epi32(a, b) simde_mm256_add_epi32(a, b)
224
+ #endif
225
+
226
+ SIMDE__FUNCTION_ATTRIBUTES
227
+ simde__m256i
228
+ simde_mm256_add_epi64 (simde__m256i a, simde__m256i b) {
229
+ #if defined(SIMDE_AVX2_NATIVE)
230
+ return _mm256_add_epi64(a, b);
231
+ #else
232
+ simde__m256i_private
233
+ r_,
234
+ a_ = simde__m256i_to_private(a),
235
+ b_ = simde__m256i_to_private(b);
236
+
237
+ #if defined(SIMDE_ARCH_X86_SSE2)
238
+ r_.m128i[0] = simde_mm_add_epi64(a_.m128i[0], b_.m128i[0]);
239
+ r_.m128i[1] = simde_mm_add_epi64(a_.m128i[1], b_.m128i[1]);
240
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
241
+ r_.i64 = a_.i64 + b_.i64;
242
+ #else
243
+ SIMDE__VECTORIZE
244
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
245
+ r_.i64[i] = a_.i64[i] + b_.i64[i];
246
+ }
247
+ #endif
248
+
249
+ return simde__m256i_from_private(r_);
250
+ #endif
251
+ }
252
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
253
+ # define _mm256_add_epi64(a, b) simde_mm256_add_epi64(a, b)
254
+ #endif
255
+
256
+ SIMDE__FUNCTION_ATTRIBUTES
257
+ simde__m256i
258
+ simde_mm256_alignr_epi8 (simde__m256i a, simde__m256i b, int count) {
259
+ simde__m256i_private
260
+ r_,
261
+ a_ = simde__m256i_to_private(a),
262
+ b_ = simde__m256i_to_private(b);
263
+
264
+ if (HEDLEY_UNLIKELY(count > 31))
265
+ return simde_mm256_setzero_si256();
266
+
267
+ for (size_t h = 0 ; h < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; h++) {
268
+ SIMDE__VECTORIZE
269
+ for (size_t i = 0 ; i < (sizeof(r_.m128i_private[h].i8) / sizeof(r_.m128i_private[h].i8[0])) ; i++) {
270
+ const int srcpos = count + HEDLEY_STATIC_CAST(int, i);
271
+ if (srcpos > 31) {
272
+ r_.m128i_private[h].i8[i] = 0;
273
+ } else if (srcpos > 15) {
274
+ r_.m128i_private[h].i8[i] = a_.m128i_private[h].i8[(srcpos) & 15];
275
+ } else {
276
+ r_.m128i_private[h].i8[i] = b_.m128i_private[h].i8[srcpos];
277
+ }
278
+ }
279
+ }
280
+
281
+ return simde__m256i_from_private(r_);
282
+ }
283
+ #if defined(SIMDE_AVX2_NATIVE)
284
+ # define simde_mm256_alignr_epi8(a, b, count) _mm256_alignr_epi8(a, b, count)
285
+ #elif defined(SIMDE_ARCH_X86_SSSE3)
286
+ # define simde_mm256_alignr_epi8(a, b, count) \
287
+ simde_mm256_set_m128i( \
288
+ simde_mm_alignr_epi8(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (count)), \
289
+ simde_mm_alignr_epi8(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (count)))
290
+ #endif
291
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
292
+ # define _mm256_alignr_epi8(a, b, count) simde_mm256_alignr_epi8(a, b, (count))
293
+ #endif
294
+
295
+ SIMDE__FUNCTION_ATTRIBUTES
296
+ simde__m256i
297
+ simde_mm256_and_si256 (simde__m256i a, simde__m256i b) {
298
+ #if defined(SIMDE_AVX2_NATIVE)
299
+ return _mm256_and_si256(a, b);
300
+ #else
301
+ simde__m256i_private
302
+ r_,
303
+ a_ = simde__m256i_to_private(a),
304
+ b_ = simde__m256i_to_private(b);
305
+
306
+ #if defined(SIMDE_ARCH_X86_SSE2)
307
+ r_.m128i[0] = simde_mm_and_si128(a_.m128i[0], b_.m128i[0]);
308
+ r_.m128i[1] = simde_mm_and_si128(a_.m128i[1], b_.m128i[1]);
309
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
310
+ r_.i32f = a_.i32f & b_.i32f;
311
+ #else
312
+ SIMDE__VECTORIZE
313
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
314
+ r_.i64[i] = a_.i64[i] & b_.i64[i];
315
+ }
316
+ #endif
317
+
318
+ return simde__m256i_from_private(r_);
319
+ #endif
320
+ }
321
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
322
+ # define _mm256_and_si256(a, b) simde_mm256_and_si256(a, b)
323
+ #endif
324
+
325
+ SIMDE__FUNCTION_ATTRIBUTES
326
+ simde__m256i
327
+ simde_mm256_andnot_si256 (simde__m256i a, simde__m256i b) {
328
+ #if defined(SIMDE_AVX2_NATIVE)
329
+ return _mm256_andnot_si256(a, b);
330
+ #else
331
+ simde__m256i_private
332
+ r_,
333
+ a_ = simde__m256i_to_private(a),
334
+ b_ = simde__m256i_to_private(b);
335
+
336
+ #if defined(SIMDE_ARCH_X86_SSE2) || defined(SIMDE_SSE2_NEON)
337
+ r_.m128i_private[0] = simde__m128i_to_private(simde_mm_andnot_si128(simde__m128i_from_private(a_.m128i_private[0]), simde__m128i_from_private(b_.m128i_private[0])));
338
+ r_.m128i_private[1] = simde__m128i_to_private(simde_mm_andnot_si128(simde__m128i_from_private(a_.m128i_private[1]), simde__m128i_from_private(b_.m128i_private[1])));
339
+ #else
340
+ SIMDE__VECTORIZE
341
+ for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
342
+ r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i];
343
+ }
344
+ #endif
345
+
346
+ return simde__m256i_from_private(r_);
347
+ #endif
348
+ }
349
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
350
+ # define _mm256_andnot_si256(a, b) simde_mm256_andnot_si256(a, b)
351
+ #endif
352
+
353
+ SIMDE__FUNCTION_ATTRIBUTES
354
+ simde__m256i
355
+ simde_mm256_adds_epi8 (simde__m256i a, simde__m256i b) {
356
+ #if defined(SIMDE_AVX2_NATIVE)
357
+ return _mm256_adds_epi8(a, b);
358
+ #else
359
+ simde__m256i_private
360
+ r_,
361
+ a_ = simde__m256i_to_private(a),
362
+ b_ = simde__m256i_to_private(b);
363
+
364
+ #if defined(SIMDE_ARCH_X86_SSE2) && !defined(HEDLEY_INTEL_VERSION)
365
+ SIMDE__VECTORIZE
366
+ for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
367
+ r_.m128i[i] = simde_mm_adds_epi8(a_.m128i[i], b_.m128i[i]);
368
+ }
369
+ #else
370
+ SIMDE__VECTORIZE
371
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
372
+ const int32_t tmp =
373
+ HEDLEY_STATIC_CAST(int16_t, a_.i8[i]) +
374
+ HEDLEY_STATIC_CAST(int16_t, b_.i8[i]);
375
+ r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, ((tmp < INT8_MAX) ? ((tmp > INT8_MIN) ? tmp : INT8_MIN) : INT8_MAX));
376
+ }
377
+ #endif
378
+
379
+ return simde__m256i_from_private(r_);
380
+ #endif
381
+ }
382
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
383
+ # define _mm256_adds_epi8(a, b) simde_mm256_adds_epi8(a, b)
384
+ #endif
385
+
386
+ SIMDE__FUNCTION_ATTRIBUTES
387
+ simde__m256i
388
+ simde_mm256_adds_epi16(simde__m256i a, simde__m256i b) {
389
+ #if defined(SIMDE_AVX2_NATIVE)
390
+ return _mm256_adds_epi16(a, b);
391
+ #else
392
+ simde__m256i_private
393
+ r_,
394
+ a_ = simde__m256i_to_private(a),
395
+ b_ = simde__m256i_to_private(b);
396
+
397
+ #if defined(SIMDE_ARCH_X86_SSE2) && !defined(HEDLEY_INTEL_VERSION)
398
+ SIMDE__VECTORIZE
399
+ for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
400
+ r_.m128i[i] = simde_mm_adds_epi16(a_.m128i[i], b_.m128i[i]);
401
+ }
402
+ #else
403
+ SIMDE__VECTORIZE
404
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
405
+ const int32_t tmp =
406
+ HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) +
407
+ HEDLEY_STATIC_CAST(int32_t, b_.i16[i]);
408
+ r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, ((tmp < INT16_MAX) ? ((tmp > INT16_MIN) ? tmp : INT16_MIN) : INT16_MAX));
409
+ }
410
+ #endif
411
+
412
+ return simde__m256i_from_private(r_);
413
+ #endif
414
+ }
415
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
416
+ # define _mm256_adds_epi16(a, b) simde_mm256_adds_epi16(a, b)
417
+ #endif
418
+
419
+ SIMDE__FUNCTION_ATTRIBUTES
420
+ simde__m256i
421
+ simde_mm256_adds_epu8 (simde__m256i a, simde__m256i b) {
422
+ #if defined(SIMDE_AVX2_NATIVE)
423
+ return _mm256_adds_epu8(a, b);
424
+ #else
425
+ simde__m256i_private
426
+ r_,
427
+ a_ = simde__m256i_to_private(a),
428
+ b_ = simde__m256i_to_private(b);
429
+
430
+ #if defined(SIMDE_ARCH_X86_SSE2)
431
+ r_.m128i[0] = simde_mm_adds_epu8(a_.m128i[0], b_.m128i[0]);
432
+ r_.m128i[1] = simde_mm_adds_epu8(a_.m128i[1], b_.m128i[1]);
433
+ #else
434
+ SIMDE__VECTORIZE
435
+ for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
436
+ r_.u8[i] = ((UINT8_MAX - a_.u8[i]) > b_.u8[i]) ? (a_.u8[i] + b_.u8[i]) : UINT8_MAX;
437
+ }
438
+ #endif
439
+
440
+ return simde__m256i_from_private(r_);
441
+ #endif
442
+ }
443
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
444
+ # define _mm256_adds_epu8(a, b) simde_mm256_adds_epu8(a, b)
445
+ #endif
446
+
447
+ SIMDE__FUNCTION_ATTRIBUTES
448
+ simde__m256i
449
+ simde_mm256_adds_epu16(simde__m256i a, simde__m256i b) {
450
+ #if defined(SIMDE_AVX2_NATIVE)
451
+ return _mm256_adds_epu16(a, b);
452
+ #else
453
+ simde__m256i_private
454
+ r_,
455
+ a_ = simde__m256i_to_private(a),
456
+ b_ = simde__m256i_to_private(b);
457
+
458
+ #if defined(SIMDE_ARCH_X86_SSE2)
459
+ r_.m128i[0] = simde_mm_adds_epu16(a_.m128i[0], b_.m128i[0]);
460
+ r_.m128i[1] = simde_mm_adds_epu16(a_.m128i[1], b_.m128i[1]);
461
+ #else
462
+ SIMDE__VECTORIZE
463
+ for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
464
+ r_.u16[i] = ((UINT16_MAX - a_.u16[i]) > b_.u16[i]) ? (a_.u16[i] + b_.u16[i]) : UINT16_MAX;
465
+ }
466
+ #endif
467
+
468
+ return simde__m256i_from_private(r_);
469
+ #endif
470
+ }
471
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
472
+ # define _mm256_adds_epu16(a, b) simde_mm256_adds_epu16(a, b)
473
+ #endif
474
+
475
+ SIMDE__FUNCTION_ATTRIBUTES
476
+ simde__m256i
477
+ simde_mm256_avg_epu8 (simde__m256i a, simde__m256i b) {
478
+ #if defined(SIMDE_AVX2_NATIVE)
479
+ return _mm256_avg_epu8(a, b);
480
+ #else
481
+ simde__m256i_private
482
+ r_,
483
+ a_ = simde__m256i_to_private(a),
484
+ b_ = simde__m256i_to_private(b);
485
+
486
+ SIMDE__VECTORIZE
487
+ for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
488
+ r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
489
+ }
490
+
491
+ return simde__m256i_from_private(r_);
492
+ #endif
493
+ }
494
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
495
+ # define _mm256_avg_epu8(a, b) simde_mm256_avg_epu8(a, b)
496
+ #endif
497
+
498
+ SIMDE__FUNCTION_ATTRIBUTES
499
+ simde__m256i
500
+ simde_mm256_avg_epu16 (simde__m256i a, simde__m256i b) {
501
+ #if defined(SIMDE_AVX2_NATIVE)
502
+ return _mm256_avg_epu16(a, b);
503
+ #else
504
+ simde__m256i_private
505
+ r_,
506
+ a_ = simde__m256i_to_private(a),
507
+ b_ = simde__m256i_to_private(b);
508
+
509
+ SIMDE__VECTORIZE
510
+ for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
511
+ r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
512
+ }
513
+
514
+ return simde__m256i_from_private(r_);
515
+ #endif
516
+ }
517
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
518
+ # define _mm256_avg_epu16(a, b) simde_mm256_avg_epu16(a, b)
519
+ #endif
520
+
521
+ SIMDE__FUNCTION_ATTRIBUTES
522
+ simde__m128i
523
+ simde_mm_blend_epi32(simde__m128i a, simde__m128i b, const int imm8)
524
+ HEDLEY_REQUIRE_MSG((imm8 & 0xff) == imm8, "imm8 must be in range [0, 255]") {
525
+ simde__m128i_private
526
+ r_,
527
+ a_ = simde__m128i_to_private(a),
528
+ b_ = simde__m128i_to_private(b);
529
+
530
+ SIMDE__VECTORIZE
531
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
532
+ r_.i32[i] = ((imm8 >> i) & 1) ? b_.i32[i] : a_.i32[i];
533
+ }
534
+
535
+ return simde__m128i_from_private(r_);
536
+ }
537
+ #if defined(SIMDE_AVX2_NATIVE)
538
+ # define simde_mm_blend_epi32(a, b, imm8) _mm_blend_epi32(a, b, imm8);
539
+ #endif
540
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
541
+ # define _mm_blend_epi32(a, b, imm8) simde_mm_blend_epi32(a, b, imm8)
542
+ #endif
543
+
544
+ SIMDE__FUNCTION_ATTRIBUTES
545
+ simde__m256i
546
+ simde_mm256_blend_epi16(simde__m256i a, simde__m256i b, const int imm8)
547
+ HEDLEY_REQUIRE_MSG((imm8 & 0xff) == imm8, "imm8 must be in range [0, 255]") {
548
+ simde__m256i_private
549
+ r_,
550
+ a_ = simde__m256i_to_private(a),
551
+ b_ = simde__m256i_to_private(b);
552
+
553
+ SIMDE__VECTORIZE
554
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
555
+ r_.i16[i] = ((imm8 >> i%8) & 1) ? b_.i16[i] : a_.i16[i];
556
+ }
557
+
558
+ return simde__m256i_from_private(r_);
559
+ }
560
+ #if defined(SIMDE_AVX2_NATIVE)
561
+ # define simde_mm256_blend_epi16(a, b, imm8) _mm256_blend_epi16(a, b, imm8);
562
+ #endif
563
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
564
+ # define _mm256_blend_epi16(a, b, imm8) simde_mm256_blend_epi16(a, b, imm8)
565
+ #endif
566
+
567
+
568
+ SIMDE__FUNCTION_ATTRIBUTES
569
+ simde__m256i
570
+ simde_mm256_blend_epi32(simde__m256i a, simde__m256i b, const int imm8)
571
+ HEDLEY_REQUIRE_MSG((imm8 & 0xff) == imm8, "imm8 must be in range [0, 255]") {
572
+ simde__m256i_private
573
+ r_,
574
+ a_ = simde__m256i_to_private(a),
575
+ b_ = simde__m256i_to_private(b);
576
+
577
+ SIMDE__VECTORIZE
578
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
579
+ r_.i32[i] = ((imm8 >> i) & 1) ? b_.i32[i] : a_.i32[i];
580
+ }
581
+
582
+ return simde__m256i_from_private(r_);
583
+ }
584
+ #if defined(SIMDE_AVX2_NATIVE)
585
+ # define simde_mm256_blend_epi32(a, b, imm8) _mm256_blend_epi32(a, b, imm8);
586
+ #endif
587
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
588
+ # define _mm256_blend_epi32(a, b, imm8) simde_mm256_blend_epi32(a, b, imm8)
589
+ #endif
590
+
591
+
592
+ SIMDE__FUNCTION_ATTRIBUTES
593
+ simde__m256i
594
+ simde_mm256_blendv_epi8(simde__m256i a, simde__m256i b, simde__m256i mask) {
595
+ #if defined(SIMDE_AVX2_NATIVE)
596
+ return _mm256_blendv_epi8(a, b, mask);
597
+ #else
598
+ simde__m256i_private
599
+ r_,
600
+ a_ = simde__m256i_to_private(a),
601
+ b_ = simde__m256i_to_private(b),
602
+ mask_ = simde__m256i_to_private(mask);
603
+
604
+ #if defined(SIMDE_ARCH_X86_SSE4_1)
605
+ r_.m128i_private[0] = simde__m128i_to_private(simde_mm_blendv_epi8(simde__m128i_from_private(a_.m128i_private[0]), simde__m128i_from_private(b_.m128i_private[0]), simde__m128i_from_private(mask_.m128i_private[0])));
606
+ r_.m128i_private[1] = simde__m128i_to_private(simde_mm_blendv_epi8(simde__m128i_from_private(a_.m128i_private[1]), simde__m128i_from_private(b_.m128i_private[1]), simde__m128i_from_private(mask_.m128i_private[1])));
607
+ #else
608
+ SIMDE__VECTORIZE
609
+ for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
610
+ if (mask_.u8[i] & 0x80) {
611
+ r_.u8[i] = b_.u8[i];
612
+ } else {
613
+ r_.u8[i] = a_.u8[i];
614
+ }
615
+ }
616
+ #endif
617
+
618
+ return simde__m256i_from_private(r_);
619
+ #endif
620
+ }
621
+ #if defined(SIMDE_AVX2_NATIVE)
622
+ # define simde_mm256_blendv_epi8(a, b, imm8) _mm256_blendv_epi8(a, b, imm8);
623
+ #endif
624
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
625
+ # define _mm256_blendv_epi8(a, b, mask) simde_mm256_blendv_epi8(a, b, mask)
626
+ #endif
627
+
628
+ SIMDE__FUNCTION_ATTRIBUTES
629
+ simde__m128i
630
+ simde_mm_broadcastb_epi8 (simde__m128i a) {
631
+ #if defined(SIMDE_AVX2_NATIVE)
632
+ return _mm_broadcastb_epi8(a);
633
+ #else
634
+ simde__m128i_private r_;
635
+ simde__m128i_private a_= simde__m128i_to_private(a);
636
+
637
+ SIMDE__VECTORIZE
638
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
639
+ r_.i8[i] = a_.i8[0];
640
+ }
641
+
642
+ return simde__m128i_from_private(r_);
643
+ #endif
644
+ }
645
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
646
+ # define _mm_broadcastb_epi8(a) simde_mm_broadcastb_epi8(a)
647
+ #endif
648
+
649
+ SIMDE__FUNCTION_ATTRIBUTES
650
+ simde__m256i
651
+ simde_mm256_broadcastb_epi8 (simde__m128i a) {
652
+ #if defined(SIMDE_AVX2_NATIVE)
653
+ return _mm256_broadcastb_epi8(a);
654
+ #else
655
+ simde__m256i_private r_;
656
+ simde__m128i_private a_= simde__m128i_to_private(a);
657
+
658
+ SIMDE__VECTORIZE
659
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
660
+ r_.i8[i] = a_.i8[0];
661
+ }
662
+
663
+ return simde__m256i_from_private(r_);
664
+ #endif
665
+ }
666
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
667
+ # define _mm256_broadcastb_epi8(a) simde_mm256_broadcastb_epi8(a)
668
+ #endif
669
+
670
+ SIMDE__FUNCTION_ATTRIBUTES
671
+ simde__m256i
672
+ simde_mm256_broadcastsi128_si256 (simde__m128i a) {
673
+ #if defined(SIMDE_AVX2_NATIVE)
674
+ return _mm256_broadcastsi128_si256(a);
675
+ #else
676
+ simde__m256i_private r_;
677
+ simde__m128i_private a_ = simde__m128i_to_private(a);
678
+
679
+ #if defined(SIMDE_ARCH_X86_SSE2)
680
+ r_.m128i_private[0] = a_;
681
+ r_.m128i_private[1] = a_;
682
+ #else
683
+ r_.i64[0] = a_.i64[0];
684
+ r_.i64[1] = a_.i64[1];
685
+ r_.i64[2] = a_.i64[0];
686
+ r_.i64[3] = a_.i64[1];
687
+ #endif
688
+
689
+ return simde__m256i_from_private(r_);
690
+ #endif
691
+ }
692
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
693
+ # define _mm256_broadcastsi128_si256(a) simde_mm256_broadcastsi128_si256(a)
694
+ #endif
695
+
696
+ SIMDE__FUNCTION_ATTRIBUTES
697
+ simde__m256i
698
+ simde_mm256_cmpeq_epi8 (simde__m256i a, simde__m256i b) {
699
+ #if defined(SIMDE_AVX2_NATIVE)
700
+ return _mm256_cmpeq_epi8(a, b);
701
+ #else
702
+ simde__m256i_private
703
+ r_,
704
+ a_ = simde__m256i_to_private(a),
705
+ b_ = simde__m256i_to_private(b);
706
+
707
+ #if defined(SIMDE_ARCH_X86_SSE2)
708
+ r_.m128i[0] = simde_mm_cmpeq_epi8(a_.m128i[0], b_.m128i[0]);
709
+ r_.m128i[1] = simde_mm_cmpeq_epi8(a_.m128i[1], b_.m128i[1]);
710
+ #else
711
+ SIMDE__VECTORIZE
712
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
713
+ r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
714
+ }
715
+ #endif
716
+
717
+ return simde__m256i_from_private(r_);
718
+ #endif
719
+ }
720
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
721
+ # define _mm256_cmpeq_epi8(a, b) simde_mm256_cmpeq_epi8(a, b)
722
+ #endif
723
+
724
+ SIMDE__FUNCTION_ATTRIBUTES
725
+ simde__m256i
726
+ simde_mm256_cmpeq_epi16 (simde__m256i a, simde__m256i b) {
727
+ #if defined(SIMDE_AVX2_NATIVE)
728
+ return _mm256_cmpeq_epi16(a, b);
729
+ #else
730
+ simde__m256i_private
731
+ r_,
732
+ a_ = simde__m256i_to_private(a),
733
+ b_ = simde__m256i_to_private(b);
734
+
735
+ SIMDE__VECTORIZE
736
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
737
+ r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
738
+ }
739
+
740
+ return simde__m256i_from_private(r_);
741
+ #endif
742
+ }
743
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
744
+ # define _mm256_cmpeq_epi16(a, b) simde_mm256_cmpeq_epi16(a, b)
745
+ #endif
746
+
747
+ SIMDE__FUNCTION_ATTRIBUTES
748
+ simde__m256i
749
+ simde_mm256_cmpeq_epi32 (simde__m256i a, simde__m256i b) {
750
+ #if defined(SIMDE_AVX2_NATIVE)
751
+ return _mm256_cmpeq_epi32(a, b);
752
+ #else
753
+ simde__m256i_private
754
+ r_,
755
+ a_ = simde__m256i_to_private(a),
756
+ b_ = simde__m256i_to_private(b);
757
+
758
+ #if defined(SIMDE_ARCH_X86_SSE2) || defined(SIMDE_SSE2_NEON)
759
+ r_.m128i[0] = simde_mm_cmpeq_epi32(a_.m128i[0], b_.m128i[0]);
760
+ r_.m128i[1] = simde_mm_cmpeq_epi32(a_.m128i[1], b_.m128i[1]);
761
+ #else
762
+ SIMDE__VECTORIZE
763
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
764
+ r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
765
+ }
766
+ #endif
767
+
768
+ return simde__m256i_from_private(r_);
769
+ #endif
770
+ }
771
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
772
+ # define _mm256_cmpeq_epi32(a, b) simde_mm256_cmpeq_epi32(a, b)
773
+ #endif
774
+
775
+ SIMDE__FUNCTION_ATTRIBUTES
776
+ simde__m256i
777
+ simde_mm256_cmpeq_epi64 (simde__m256i a, simde__m256i b) {
778
+ #if defined(SIMDE_AVX2_NATIVE)
779
+ return _mm256_cmpeq_epi64(a, b);
780
+ #else
781
+ simde__m256i_private
782
+ r_,
783
+ a_ = simde__m256i_to_private(a),
784
+ b_ = simde__m256i_to_private(b);
785
+
786
+ #if defined(SIMDE_ARCH_X86_SSE2) || defined(SIMDE_SSE2_NEON)
787
+ r_.m128i[0] = simde_mm_cmpeq_epi64(a_.m128i[0], b_.m128i[0]);
788
+ r_.m128i[1] = simde_mm_cmpeq_epi64(a_.m128i[1], b_.m128i[1]);
789
+ #else
790
+ SIMDE__VECTORIZE
791
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
792
+ r_.i64[i] = (a_.i64[i] == b_.i64[i]) ? ~INT64_C(0) : INT64_C(0);
793
+ }
794
+ #endif
795
+
796
+ return simde__m256i_from_private(r_);
797
+ #endif
798
+ }
799
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
800
+ # define _mm256_cmpeq_epi64(a, b) simde_mm256_cmpeq_epi64(a, b)
801
+ #endif
802
+
803
+ SIMDE__FUNCTION_ATTRIBUTES
804
+ simde__m256i
805
+ simde_mm256_cmpgt_epi8 (simde__m256i a, simde__m256i b) {
806
+ #if defined(SIMDE_AVX2_NATIVE)
807
+ return _mm256_cmpgt_epi8(a, b);
808
+ #else
809
+ simde__m256i_private
810
+ r_,
811
+ a_ = simde__m256i_to_private(a),
812
+ b_ = simde__m256i_to_private(b);
813
+
814
+ #if defined(SIMDE_ARCH_X86_SSE2)
815
+ r_.m128i[0] = simde_mm_cmpgt_epi8(a_.m128i[0], b_.m128i[0]);
816
+ r_.m128i[1] = simde_mm_cmpgt_epi8(a_.m128i[1], b_.m128i[1]);
817
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
818
+ r_.i8 = a_.i8 > b_.i8;
819
+ #else
820
+ SIMDE__VECTORIZE
821
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
822
+ r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
823
+ }
824
+ #endif
825
+
826
+ return simde__m256i_from_private(r_);
827
+ #endif
828
+ }
829
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
830
+ # define _mm256_cmpgt_epi8(a, b) simde_mm256_cmpgt_epi8(a, b)
831
+ #endif
832
+
833
+ SIMDE__FUNCTION_ATTRIBUTES
834
+ simde__m256i
835
+ simde_mm256_cmpgt_epi16 (simde__m256i a, simde__m256i b) {
836
+ #if defined(SIMDE_AVX2_NATIVE)
837
+ return _mm256_cmpgt_epi16(a, b);
838
+ #else
839
+ simde__m256i_private
840
+ r_,
841
+ a_ = simde__m256i_to_private(a),
842
+ b_ = simde__m256i_to_private(b);
843
+
844
+ #if defined(SIMDE_ARCH_X86_SSE2)
845
+ r_.m128i[0] = simde_mm_cmpgt_epi16(a_.m128i[0], b_.m128i[0]);
846
+ r_.m128i[1] = simde_mm_cmpgt_epi16(a_.m128i[1], b_.m128i[1]);
847
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
848
+ r_.i16 = a_.i16 > b_.i16;
849
+ #else
850
+ SIMDE__VECTORIZE
851
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
852
+ r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
853
+ }
854
+ #endif
855
+
856
+ return simde__m256i_from_private(r_);
857
+ #endif
858
+ }
859
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
860
+ # define _mm256_cmpgt_epi16(a, b) simde_mm256_cmpgt_epi16(a, b)
861
+ #endif
862
+
863
+ SIMDE__FUNCTION_ATTRIBUTES
864
+ simde__m256i
865
+ simde_mm256_cmpgt_epi32 (simde__m256i a, simde__m256i b) {
866
+ #if defined(SIMDE_AVX2_NATIVE)
867
+ return _mm256_cmpgt_epi32(a, b);
868
+ #else
869
+ simde__m256i_private
870
+ r_,
871
+ a_ = simde__m256i_to_private(a),
872
+ b_ = simde__m256i_to_private(b);
873
+
874
+ #if defined(SIMDE_ARCH_X86_SSE2)
875
+ r_.m128i[0] = simde_mm_cmpgt_epi32(a_.m128i[0], b_.m128i[0]);
876
+ r_.m128i[1] = simde_mm_cmpgt_epi32(a_.m128i[1], b_.m128i[1]);
877
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
878
+ r_.i32 = a_.i32 > b_.i32;
879
+ #else
880
+ SIMDE__VECTORIZE
881
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
882
+ r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
883
+ }
884
+ #endif
885
+
886
+ return simde__m256i_from_private(r_);
887
+ #endif
888
+ }
889
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
890
+ # define _mm256_cmpgt_epi32(a, b) simde_mm256_cmpgt_epi32(a, b)
891
+ #endif
892
+
893
+ SIMDE__FUNCTION_ATTRIBUTES
894
+ simde__m256i
895
+ simde_mm256_cmpgt_epi64 (simde__m256i a, simde__m256i b) {
896
+ #if defined(SIMDE_AVX2_NATIVE)
897
+ return _mm256_cmpgt_epi64(a, b);
898
+ #else
899
+ simde__m256i_private
900
+ r_,
901
+ a_ = simde__m256i_to_private(a),
902
+ b_ = simde__m256i_to_private(b);
903
+
904
+ #if defined(SIMDE_ARCH_X86_SSE2)
905
+ r_.m128i[0] = simde_mm_cmpgt_epi64(a_.m128i[0], b_.m128i[0]);
906
+ r_.m128i[1] = simde_mm_cmpgt_epi64(a_.m128i[1], b_.m128i[1]);
907
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
908
+ r_.i64 = a_.i64 > b_.i64;
909
+ #else
910
+ SIMDE__VECTORIZE
911
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
912
+ r_.i64[i] = (a_.i64[i] > b_.i64[i]) ? ~INT64_C(0) : INT64_C(0);
913
+ }
914
+ #endif
915
+
916
+ return simde__m256i_from_private(r_);
917
+ #endif
918
+ }
919
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
920
+ # define _mm256_cmpgt_epi64(a, b) simde_mm256_cmpgt_epi64(a, b)
921
+ #endif
922
+
923
+ SIMDE__FUNCTION_ATTRIBUTES
924
+ simde__m256i
925
+ simde_mm256_cvtepi8_epi16 (simde__m128i a) {
926
+ #if defined(SIMDE_AVX2_NATIVE)
927
+ return _mm256_cvtepi8_epi16(a);
928
+ #else
929
+ simde__m256i_private r_;
930
+ simde__m128i_private a_ = simde__m128i_to_private(a);
931
+
932
+ #if defined(SIMDE__CONVERT_VECTOR)
933
+ SIMDE__CONVERT_VECTOR(r_.i16, a_.i8);
934
+ #else
935
+ SIMDE__VECTORIZE
936
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
937
+ r_.i16[i] = a_.i8[i];
938
+ }
939
+ #endif
940
+
941
+ return simde__m256i_from_private(r_);
942
+ #endif
943
+ }
944
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
945
+ # define _mm256_cvtepi8_epi16(a) simde_mm256_cvtepi8_epi16(a)
946
+ #endif
947
+
948
+ SIMDE__FUNCTION_ATTRIBUTES
949
+ simde__m256i
950
+ simde_mm256_cvtepi8_epi32 (simde__m128i a) {
951
+ #if defined(SIMDE_AVX2_NATIVE)
952
+ return _mm256_cvtepi8_epi32(a);
953
+ #else
954
+ simde__m256i_private r_;
955
+ simde__m128i_private a_ = simde__m128i_to_private(a);
956
+
957
+ #if defined(SIMDE__CONVERT_VECTOR)
958
+ SIMDE__CONVERT_VECTOR(r_.i32, a_.m64_private[0].i8);
959
+ #else
960
+ SIMDE__VECTORIZE
961
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
962
+ r_.i32[i] = a_.i8[i];
963
+ }
964
+ #endif
965
+
966
+ return simde__m256i_from_private(r_);
967
+ #endif
968
+ }
969
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
970
+ # define _mm256_cvtepi8_epi32(a) simde_mm256_cvtepi8_epi32(a)
971
+ #endif
972
+
973
+ SIMDE__FUNCTION_ATTRIBUTES
974
+ simde__m256i
975
+ simde_mm256_cvtepi8_epi64 (simde__m128i a) {
976
+ #if defined(SIMDE_AVX2_NATIVE)
977
+ return _mm256_cvtepi8_epi64(a);
978
+ #else
979
+ simde__m256i_private r_;
980
+ simde__m128i_private a_ = simde__m128i_to_private(a);
981
+
982
+ SIMDE__VECTORIZE
983
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
984
+ r_.i64[i] = a_.i8[i];
985
+ }
986
+
987
+ return simde__m256i_from_private(r_);
988
+ #endif
989
+ }
990
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
991
+ # define _mm256_cvtepi8_epi64(a) simde_mm256_cvtepi8_epi64(a)
992
+ #endif
993
+
994
+ SIMDE__FUNCTION_ATTRIBUTES
995
+ simde__m256i
996
+ simde_mm256_cvtepi16_epi32 (simde__m128i a) {
997
+ #if defined(SIMDE_AVX2_NATIVE)
998
+ return _mm256_cvtepi16_epi32(a);
999
+ #else
1000
+ simde__m256i_private r_;
1001
+ simde__m128i_private a_ = simde__m128i_to_private(a);
1002
+
1003
+ #if defined(SIMDE__CONVERT_VECTOR)
1004
+ SIMDE__CONVERT_VECTOR(r_.i32, a_.i16);
1005
+ #else
1006
+ SIMDE__VECTORIZE
1007
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1008
+ r_.i32[i] = a_.i16[i];
1009
+ }
1010
+ #endif
1011
+
1012
+ return simde__m256i_from_private(r_);
1013
+ #endif
1014
+ }
1015
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1016
+ # define _mm256_cvtepi16_epi32(a) simde_mm256_cvtepi16_epi32(a)
1017
+ #endif
1018
+
1019
+ SIMDE__FUNCTION_ATTRIBUTES
1020
+ simde__m256i
1021
+ simde_mm256_cvtepi16_epi64 (simde__m128i a) {
1022
+ #if defined(SIMDE_AVX2_NATIVE)
1023
+ return _mm256_cvtepi16_epi64(a);
1024
+ #else
1025
+ simde__m256i_private r_;
1026
+ simde__m128i_private a_ = simde__m128i_to_private(a);
1027
+
1028
+ #if defined(SIMDE__CONVERT_VECTOR)
1029
+ SIMDE__CONVERT_VECTOR(r_.i64, a_.m64_private[0].i16);
1030
+ #else
1031
+ SIMDE__VECTORIZE
1032
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
1033
+ r_.i64[i] = a_.i16[i];
1034
+ }
1035
+ #endif
1036
+
1037
+ return simde__m256i_from_private(r_);
1038
+ #endif
1039
+ }
1040
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1041
+ # define _mm256_cvtepi16_epi64(a) simde_mm256_cvtepi16_epi64(a)
1042
+ #endif
1043
+
1044
+ SIMDE__FUNCTION_ATTRIBUTES
1045
+ simde__m256i
1046
+ simde_mm256_cvtepi32_epi64 (simde__m128i a) {
1047
+ #if defined(SIMDE_AVX2_NATIVE)
1048
+ return _mm256_cvtepi32_epi64(a);
1049
+ #else
1050
+ simde__m256i_private r_;
1051
+ simde__m128i_private a_ = simde__m128i_to_private(a);
1052
+
1053
+ #if defined(SIMDE__CONVERT_VECTOR)
1054
+ SIMDE__CONVERT_VECTOR(r_.i64, a_.i32);
1055
+ #else
1056
+ SIMDE__VECTORIZE
1057
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
1058
+ r_.i64[i] = a_.i32[i];
1059
+ }
1060
+ #endif
1061
+
1062
+ return simde__m256i_from_private(r_);
1063
+ #endif
1064
+ }
1065
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1066
+ # define _mm256_cvtepi32_epi64(a) simde_mm256_cvtepi32_epi64(a)
1067
+ #endif
1068
+
1069
+ SIMDE__FUNCTION_ATTRIBUTES
1070
+ simde__m256i
1071
+ simde_mm256_cvtepu8_epi16 (simde__m128i a) {
1072
+ #if defined(SIMDE_AVX2_NATIVE)
1073
+ return _mm256_cvtepu8_epi16(a);
1074
+ #else
1075
+ simde__m256i_private r_;
1076
+ simde__m128i_private a_ = simde__m128i_to_private(a);
1077
+
1078
+ #if defined(SIMDE__CONVERT_VECTOR)
1079
+ SIMDE__CONVERT_VECTOR(r_.i16, a_.u8);
1080
+ #else
1081
+ SIMDE__VECTORIZE
1082
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1083
+ r_.i16[i] = a_.u8[i];
1084
+ }
1085
+ #endif
1086
+
1087
+ return simde__m256i_from_private(r_);
1088
+ #endif
1089
+ }
1090
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1091
+ # define _mm256_cvtepu8_epi16(a) simde_mm256_cvtepu8_epi16(a)
1092
+ #endif
1093
+
1094
+ SIMDE__FUNCTION_ATTRIBUTES
1095
+ simde__m256i
1096
+ simde_mm256_cvtepu8_epi32 (simde__m128i a) {
1097
+ #if defined(SIMDE_AVX2_NATIVE)
1098
+ return _mm256_cvtepu8_epi32(a);
1099
+ #else
1100
+ simde__m256i_private r_;
1101
+ simde__m128i_private a_ = simde__m128i_to_private(a);
1102
+
1103
+ #if defined(SIMDE__CONVERT_VECTOR)
1104
+ SIMDE__CONVERT_VECTOR(r_.i32, a_.m64_private[0].u8);
1105
+ #else
1106
+ SIMDE__VECTORIZE
1107
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1108
+ r_.i32[i] = a_.u8[i];
1109
+ }
1110
+ #endif
1111
+
1112
+ return simde__m256i_from_private(r_);
1113
+ #endif
1114
+ }
1115
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1116
+ # define _mm256_cvtepu8_epi32(a) simde_mm256_cvtepu8_epi32(a)
1117
+ #endif
1118
+
1119
+ SIMDE__FUNCTION_ATTRIBUTES
1120
+ simde__m256i
1121
+ simde_mm256_cvtepu8_epi64 (simde__m128i a) {
1122
+ #if defined(SIMDE_AVX2_NATIVE)
1123
+ return _mm256_cvtepu8_epi64(a);
1124
+ #else
1125
+ simde__m256i_private r_;
1126
+ simde__m128i_private a_ = simde__m128i_to_private(a);
1127
+
1128
+ SIMDE__VECTORIZE
1129
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
1130
+ r_.i64[i] = a_.u8[i];
1131
+ }
1132
+
1133
+ return simde__m256i_from_private(r_);
1134
+ #endif
1135
+ }
1136
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1137
+ # define _mm256_cvtepu8_epi64(a) simde_mm256_cvtepu8_epi64(a)
1138
+ #endif
1139
+
1140
+ SIMDE__FUNCTION_ATTRIBUTES
1141
+ simde__m256i
1142
+ simde_mm256_cvtepu16_epi32 (simde__m128i a) {
1143
+ #if defined(SIMDE_AVX2_NATIVE)
1144
+ return _mm256_cvtepu16_epi32(a);
1145
+ #else
1146
+ simde__m256i_private r_;
1147
+ simde__m128i_private a_ = simde__m128i_to_private(a);
1148
+
1149
+ #if defined(SIMDE__CONVERT_VECTOR)
1150
+ SIMDE__CONVERT_VECTOR(r_.i32, a_.u16);
1151
+ #else
1152
+ SIMDE__VECTORIZE
1153
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1154
+ r_.i32[i] = a_.u16[i];
1155
+ }
1156
+ #endif
1157
+
1158
+ return simde__m256i_from_private(r_);
1159
+ #endif
1160
+ }
1161
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1162
+ # define _mm256_cvtepu16_epi32(a) simde_mm256_cvtepu16_epi32(a)
1163
+ #endif
1164
+
1165
+ SIMDE__FUNCTION_ATTRIBUTES
1166
+ simde__m256i
1167
+ simde_mm256_cvtepu16_epi64 (simde__m128i a) {
1168
+ #if defined(SIMDE_AVX2_NATIVE)
1169
+ return _mm256_cvtepu16_epi64(a);
1170
+ #else
1171
+ simde__m256i_private r_;
1172
+ simde__m128i_private a_ = simde__m128i_to_private(a);
1173
+
1174
+ #if defined(SIMDE__CONVERT_VECTOR)
1175
+ SIMDE__CONVERT_VECTOR(r_.i64, a_.m64_private[0].u16);
1176
+ #else
1177
+ SIMDE__VECTORIZE
1178
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
1179
+ r_.i64[i] = a_.u16[i];
1180
+ }
1181
+ #endif
1182
+
1183
+ return simde__m256i_from_private(r_);
1184
+ #endif
1185
+ }
1186
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1187
+ # define _mm256_cvtepu16_epi64(a) simde_mm256_cvtepu16_epi64(a)
1188
+ #endif
1189
+
1190
+ SIMDE__FUNCTION_ATTRIBUTES
1191
+ simde__m256i
1192
+ simde_mm256_cvtepu32_epi64 (simde__m128i a) {
1193
+ #if defined(SIMDE_AVX2_NATIVE)
1194
+ return _mm256_cvtepu32_epi64(a);
1195
+ #else
1196
+ simde__m256i_private r_;
1197
+ simde__m128i_private a_ = simde__m128i_to_private(a);
1198
+
1199
+ #if defined(SIMDE__CONVERT_VECTOR)
1200
+ SIMDE__CONVERT_VECTOR(r_.i64, a_.u32);
1201
+ #else
1202
+ SIMDE__VECTORIZE
1203
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
1204
+ r_.i64[i] = a_.u32[i];
1205
+ }
1206
+ #endif
1207
+
1208
+ return simde__m256i_from_private(r_);
1209
+ #endif
1210
+ }
1211
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1212
+ # define _mm256_cvtepu32_epi64(a) simde_mm256_cvtepu32_epi64(a)
1213
+ #endif
1214
+
1215
+ SIMDE__FUNCTION_ATTRIBUTES
1216
+ int
1217
+ simde_mm256_extract_epi8 (simde__m256i a, const int index)
1218
+ HEDLEY_REQUIRE_MSG((index & 31) == index, "index must be in range [0, 31]"){
1219
+ simde__m256i_private a_ = simde__m256i_to_private(a);
1220
+ return a_.i8[index];
1221
+ }
1222
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1223
+ # define _mm256_extract_epi8(a, index) simde_mm256_extract_epi8(a, index)
1224
+ #endif
1225
+
1226
+ SIMDE__FUNCTION_ATTRIBUTES
1227
+ int
1228
+ simde_mm256_extract_epi16 (simde__m256i a, const int index)
1229
+ HEDLEY_REQUIRE_MSG((index & 0xf) == index, "index must be in range [0, 15]") {
1230
+ simde__m256i_private a_ = simde__m256i_to_private(a);
1231
+ return a_.i16[index];
1232
+ }
1233
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1234
+ # define _mm256_extract_epi16(a, index) simde_mm256_extract_epi16(a, index)
1235
+ #endif
1236
+
1237
+ SIMDE__FUNCTION_ATTRIBUTES
1238
+ simde__m128i
1239
+ simde_mm256_extracti128_si256 (simde__m256i a, const int imm8)
1240
+ HEDLEY_REQUIRE_MSG((imm8 & 1) == imm8, "imm8 must be 0 or 1") {
1241
+ simde__m256i_private a_ = simde__m256i_to_private(a);
1242
+ return a_.m128i[imm8];
1243
+ }
1244
+ #if defined(SIMDE_AVX2_NATIVE)
1245
+ # define simde_mm256_extracti128_si256(a, imm8) _mm256_extracti128_si256(a, imm8)
1246
+ #endif
1247
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1248
+ # define _mm256_extracti128_si256(a, imm8) simde_mm256_extracti128_si256(a, imm8)
1249
+ #endif
1250
+
1251
+ SIMDE__FUNCTION_ATTRIBUTES
1252
+ simde__m256i
1253
+ simde_mm256_madd_epi16 (simde__m256i a, simde__m256i b) {
1254
+ #if defined(SIMDE_AVX2_NATIVE)
1255
+ return _mm256_madd_epi16(a, b);
1256
+ #else
1257
+ simde__m256i_private
1258
+ r_,
1259
+ a_ = simde__m256i_to_private(a),
1260
+ b_ = simde__m256i_to_private(b);
1261
+
1262
+ r_.m128i[0] = simde_mm_madd_epi16(a_.m128i[0], b_.m128i[0]);
1263
+ r_.m128i[1] = simde_mm_madd_epi16(a_.m128i[1], b_.m128i[1]);
1264
+
1265
+ return simde__m256i_from_private(r_);
1266
+ #endif
1267
+ }
1268
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1269
+ # define _mm256_madd_epi16(a, b) simde_mm256_madd_epi16(a, b)
1270
+ #endif
1271
+
1272
+ SIMDE__FUNCTION_ATTRIBUTES
1273
+ simde__m256i
1274
+ simde_mm256_max_epi8 (simde__m256i a, simde__m256i b) {
1275
+ #if defined(SIMDE_AVX2_NATIVE) && !defined(__PGI)
1276
+ return _mm256_max_epi8(a, b);
1277
+ #else
1278
+ simde__m256i_private
1279
+ r_,
1280
+ a_ = simde__m256i_to_private(a),
1281
+ b_ = simde__m256i_to_private(b);
1282
+
1283
+ #if defined(SIMDE_ARCH_X86_SSE4_1)
1284
+ r_.m128i[0] = simde_mm_max_epi8(a_.m128i[0], b_.m128i[0]);
1285
+ r_.m128i[1] = simde_mm_max_epi8(a_.m128i[1], b_.m128i[1]);
1286
+ #else
1287
+ SIMDE__VECTORIZE
1288
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1289
+ r_.i8[i] = a_.i8[i] > b_.i8[i] ? a_.i8[i] : b_.i8[i];
1290
+ }
1291
+ #endif
1292
+
1293
+ return simde__m256i_from_private(r_);
1294
+ #endif
1295
+ }
1296
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1297
+ # define _mm256_max_epi8(a, b) simde_mm256_max_epi8(a, b)
1298
+ #endif
1299
+
1300
+ SIMDE__FUNCTION_ATTRIBUTES
1301
+ simde__m256i
1302
+ simde_mm256_max_epu8 (simde__m256i a, simde__m256i b) {
1303
+ #if defined(SIMDE_AVX2_NATIVE)
1304
+ return _mm256_max_epu8(a, b);
1305
+ #else
1306
+ simde__m256i_private
1307
+ r_,
1308
+ a_ = simde__m256i_to_private(a),
1309
+ b_ = simde__m256i_to_private(b);
1310
+
1311
+ #if defined(SIMDE_ARCH_X86_SSE2) || defined(SIMDE_SSE2_NEON)
1312
+ r_.m128i[0] = simde_mm_max_epu8(a_.m128i[0], b_.m128i[0]);
1313
+ r_.m128i[1] = simde_mm_max_epu8(a_.m128i[1], b_.m128i[1]);
1314
+ #else
1315
+ SIMDE__VECTORIZE
1316
+ for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
1317
+ r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
1318
+ }
1319
+ #endif
1320
+
1321
+ return simde__m256i_from_private(r_);
1322
+ #endif
1323
+ }
1324
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1325
+ # define _mm256_max_epu8(a, b) simde_mm256_max_epu8(a, b)
1326
+ #endif
1327
+
1328
+ SIMDE__FUNCTION_ATTRIBUTES
1329
+ simde__m256i
1330
+ simde_mm256_max_epu16 (simde__m256i a, simde__m256i b) {
1331
+ #if defined(SIMDE_AVX2_NATIVE)
1332
+ return _mm256_max_epu16(a, b);
1333
+ #else
1334
+ simde__m256i_private
1335
+ r_,
1336
+ a_ = simde__m256i_to_private(a),
1337
+ b_ = simde__m256i_to_private(b);
1338
+
1339
+ #if defined(SIMDE_ARCH_X86_SSE2) || defined(SIMDE_SSE2_NEON)
1340
+ r_.m128i[0] = simde_mm_max_epu16(a_.m128i[0], b_.m128i[0]);
1341
+ r_.m128i[1] = simde_mm_max_epu16(a_.m128i[1], b_.m128i[1]);
1342
+ #else
1343
+ SIMDE__VECTORIZE
1344
+ for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1345
+ r_.u16[i] = (a_.u16[i] > b_.u16[i]) ? a_.u16[i] : b_.u16[i];
1346
+ }
1347
+ #endif
1348
+
1349
+ return simde__m256i_from_private(r_);
1350
+ #endif
1351
+ }
1352
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1353
+ # define _mm256_max_epu16(a, b) simde_mm256_max_epu16(a, b)
1354
+ #endif
1355
+
1356
+ SIMDE__FUNCTION_ATTRIBUTES
1357
+ simde__m256i
1358
+ simde_mm256_max_epu32 (simde__m256i a, simde__m256i b) {
1359
+ #if defined(SIMDE_AVX2_NATIVE)
1360
+ return _mm256_max_epu32(a, b);
1361
+ #else
1362
+ simde__m256i_private
1363
+ r_,
1364
+ a_ = simde__m256i_to_private(a),
1365
+ b_ = simde__m256i_to_private(b);
1366
+
1367
+ #if defined(SIMDE_ARCH_X86_SSE2) || defined(SIMDE_SSE2_NEON)
1368
+ r_.m128i[0] = simde_mm_max_epu32(a_.m128i[0], b_.m128i[0]);
1369
+ r_.m128i[1] = simde_mm_max_epu32(a_.m128i[1], b_.m128i[1]);
1370
+ #else
1371
+ SIMDE__VECTORIZE
1372
+ for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
1373
+ r_.u32[i] = (a_.u32[i] > b_.u32[i]) ? a_.u32[i] : b_.u32[i];
1374
+ }
1375
+ #endif
1376
+
1377
+ return simde__m256i_from_private(r_);
1378
+ #endif
1379
+ }
1380
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1381
+ # define _mm256_max_epu32(a, b) simde_mm256_max_epu32(a, b)
1382
+ #endif
1383
+
1384
+ SIMDE__FUNCTION_ATTRIBUTES
1385
+ simde__m256i
1386
+ simde_mm256_max_epi16 (simde__m256i a, simde__m256i b) {
1387
+ #if defined(SIMDE_AVX2_NATIVE)
1388
+ return _mm256_max_epi16(a, b);
1389
+ #else
1390
+ simde__m256i_private
1391
+ r_,
1392
+ a_ = simde__m256i_to_private(a),
1393
+ b_ = simde__m256i_to_private(b);
1394
+
1395
+ #if defined(SIMDE_ARCH_X86_SSE2)
1396
+ r_.m128i[0] = simde_mm_max_epi16(a_.m128i[0], b_.m128i[0]);
1397
+ r_.m128i[1] = simde_mm_max_epi16(a_.m128i[1], b_.m128i[1]);
1398
+ #else
1399
+ SIMDE__VECTORIZE
1400
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1401
+ r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
1402
+ }
1403
+ #endif
1404
+
1405
+ return simde__m256i_from_private(r_);
1406
+ #endif
1407
+ }
1408
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1409
+ # define _mm256_max_epi16(a, b) simde_mm256_max_epi16(a, b)
1410
+ #endif
1411
+
1412
+ SIMDE__FUNCTION_ATTRIBUTES
1413
+ simde__m256i
1414
+ simde_mm256_max_epi32 (simde__m256i a, simde__m256i b) {
1415
+ #if defined(SIMDE_AVX2_NATIVE)
1416
+ return _mm256_max_epi32(a, b);
1417
+ #else
1418
+ simde__m256i_private
1419
+ r_,
1420
+ a_ = simde__m256i_to_private(a),
1421
+ b_ = simde__m256i_to_private(b);
1422
+
1423
+ #if defined(SIMDE_ARCH_X86_SSE4_1)
1424
+ r_.m128i[0] = simde_mm_max_epi32(a_.m128i[0], b_.m128i[0]);
1425
+ r_.m128i[1] = simde_mm_max_epi32(a_.m128i[1], b_.m128i[1]);
1426
+ #else
1427
+ SIMDE__VECTORIZE
1428
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1429
+ r_.i32[i] = a_.i32[i] > b_.i32[i] ? a_.i32[i] : b_.i32[i];
1430
+ }
1431
+ #endif
1432
+
1433
+ return simde__m256i_from_private(r_);
1434
+ #endif
1435
+ }
1436
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1437
+ # define _mm256_max_epi32(a, b) simde_mm256_max_epi32(a, b)
1438
+ #endif
1439
+
1440
+ SIMDE__FUNCTION_ATTRIBUTES
1441
+ simde__m256i
1442
+ simde_mm256_min_epi8 (simde__m256i a, simde__m256i b) {
1443
+ #if defined(SIMDE_AVX2_NATIVE) && !defined(__PGI)
1444
+ return _mm256_min_epi8(a, b);
1445
+ #else
1446
+ simde__m256i_private
1447
+ r_,
1448
+ a_ = simde__m256i_to_private(a),
1449
+ b_ = simde__m256i_to_private(b);
1450
+
1451
+ #if defined(SIMDE_ARCH_X86_SSE4_1)
1452
+ r_.m128i[0] = simde_mm_min_epi8(a_.m128i[0], b_.m128i[0]);
1453
+ r_.m128i[1] = simde_mm_min_epi8(a_.m128i[1], b_.m128i[1]);
1454
+ #else
1455
+ SIMDE__VECTORIZE
1456
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1457
+ r_.i8[i] = a_.i8[i] < b_.i8[i] ? a_.i8[i] : b_.i8[i];
1458
+ }
1459
+ #endif
1460
+
1461
+ return simde__m256i_from_private(r_);
1462
+ #endif
1463
+ }
1464
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1465
+ # define _mm256_min_epi8(a, b) simde_mm256_min_epi8(a, b)
1466
+ #endif
1467
+
1468
+ SIMDE__FUNCTION_ATTRIBUTES
1469
+ simde__m256i
1470
+ simde_mm256_min_epi16 (simde__m256i a, simde__m256i b) {
1471
+ #if defined(SIMDE_AVX2_NATIVE)
1472
+ return _mm256_min_epi16(a, b);
1473
+ #else
1474
+ simde__m256i_private
1475
+ r_,
1476
+ a_ = simde__m256i_to_private(a),
1477
+ b_ = simde__m256i_to_private(b);
1478
+
1479
+ #if defined(SIMDE_ARCH_X86_SSE2)
1480
+ r_.m128i[0] = simde_mm_min_epi16(a_.m128i[0], b_.m128i[0]);
1481
+ r_.m128i[1] = simde_mm_min_epi16(a_.m128i[1], b_.m128i[1]);
1482
+ #else
1483
+ SIMDE__VECTORIZE
1484
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1485
+ r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
1486
+ }
1487
+ #endif
1488
+
1489
+ return simde__m256i_from_private(r_);
1490
+ #endif
1491
+ }
1492
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1493
+ # define _mm256_min_epi16(a, b) simde_mm256_min_epi16(a, b)
1494
+ #endif
1495
+
1496
+ SIMDE__FUNCTION_ATTRIBUTES
1497
+ simde__m256i
1498
+ simde_mm256_min_epi32 (simde__m256i a, simde__m256i b) {
1499
+ #if defined(SIMDE_AVX2_NATIVE)
1500
+ return _mm256_min_epi32(a, b);
1501
+ #else
1502
+ simde__m256i_private
1503
+ r_,
1504
+ a_ = simde__m256i_to_private(a),
1505
+ b_ = simde__m256i_to_private(b);
1506
+
1507
+ #if defined(SIMDE_ARCH_X86_SSE4_1)
1508
+ r_.m128i[0] = simde_mm_min_epi32(a_.m128i[0], b_.m128i[0]);
1509
+ r_.m128i[1] = simde_mm_min_epi32(a_.m128i[1], b_.m128i[1]);
1510
+ #else
1511
+ SIMDE__VECTORIZE
1512
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1513
+ r_.i32[i] = a_.i32[i] < b_.i32[i] ? a_.i32[i] : b_.i32[i];
1514
+ }
1515
+ #endif
1516
+
1517
+ return simde__m256i_from_private(r_);
1518
+ #endif
1519
+ }
1520
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1521
+ # define _mm256_min_epi32(a, b) simde_mm256_min_epi32(a, b)
1522
+ #endif
1523
+
1524
+ SIMDE__FUNCTION_ATTRIBUTES
1525
+ simde__m256i
1526
+ simde_mm256_min_epu8 (simde__m256i a, simde__m256i b) {
1527
+ #if defined(SIMDE_AVX2_NATIVE)
1528
+ return _mm256_min_epu8(a, b);
1529
+ #else
1530
+ simde__m256i_private
1531
+ r_,
1532
+ a_ = simde__m256i_to_private(a),
1533
+ b_ = simde__m256i_to_private(b);
1534
+
1535
+ #if defined(SIMDE_ARCH_X86_SSE2)
1536
+ r_.m128i[0] = simde_mm_min_epu8(a_.m128i[0], b_.m128i[0]);
1537
+ r_.m128i[1] = simde_mm_min_epu8(a_.m128i[1], b_.m128i[1]);
1538
+ #else
1539
+ SIMDE__VECTORIZE
1540
+ for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
1541
+ r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
1542
+ }
1543
+ #endif
1544
+
1545
+ return simde__m256i_from_private(r_);
1546
+ #endif
1547
+ }
1548
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1549
+ # define _mm256_min_epu8(a, b) simde_mm256_min_epu8(a, b)
1550
+ #endif
1551
+
1552
+ SIMDE__FUNCTION_ATTRIBUTES
1553
+ simde__m256i
1554
+ simde_mm256_min_epu16 (simde__m256i a, simde__m256i b) {
1555
+ #if defined(SIMDE_AVX2_NATIVE)
1556
+ return _mm256_min_epu16(a, b);
1557
+ #else
1558
+ simde__m256i_private
1559
+ r_,
1560
+ a_ = simde__m256i_to_private(a),
1561
+ b_ = simde__m256i_to_private(b);
1562
+
1563
+ #if defined(SIMDE_ARCH_X86_SSE2)
1564
+ r_.m128i[0] = simde_mm_min_epu16(a_.m128i[0], b_.m128i[0]);
1565
+ r_.m128i[1] = simde_mm_min_epu16(a_.m128i[1], b_.m128i[1]);
1566
+ #else
1567
+ SIMDE__VECTORIZE
1568
+ for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1569
+ r_.u16[i] = (a_.u16[i] < b_.u16[i]) ? a_.u16[i] : b_.u16[i];
1570
+ }
1571
+ #endif
1572
+
1573
+ return simde__m256i_from_private(r_);
1574
+ #endif
1575
+ }
1576
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1577
+ # define _mm256_min_epu16(a, b) simde_mm256_min_epu16(a, b)
1578
+ #endif
1579
+
1580
+ SIMDE__FUNCTION_ATTRIBUTES
1581
+ simde__m256i
1582
+ simde_mm256_min_epu32 (simde__m256i a, simde__m256i b) {
1583
+ #if defined(SIMDE_AVX2_NATIVE)
1584
+ return _mm256_min_epu32(a, b);
1585
+ #else
1586
+ simde__m256i_private
1587
+ r_,
1588
+ a_ = simde__m256i_to_private(a),
1589
+ b_ = simde__m256i_to_private(b);
1590
+
1591
+ #if defined(SIMDE_ARCH_X86_SSE2)
1592
+ r_.m128i[0] = simde_mm_min_epu32(a_.m128i[0], b_.m128i[0]);
1593
+ r_.m128i[1] = simde_mm_min_epu32(a_.m128i[1], b_.m128i[1]);
1594
+ #else
1595
+ SIMDE__VECTORIZE
1596
+ for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
1597
+ r_.u32[i] = (a_.u32[i] < b_.u32[i]) ? a_.u32[i] : b_.u32[i];
1598
+ }
1599
+ #endif
1600
+
1601
+ return simde__m256i_from_private(r_);
1602
+ #endif
1603
+ }
1604
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1605
+ # define _mm256_min_epu32(a, b) simde_mm256_min_epu32(a, b)
1606
+ #endif
1607
+
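+ /* Usage sketch (illustrative, not part of the upstream header): the
+    element-wise min/max helpers above operate lane by lane, so they can be
+    combined freely. Assuming <simde/x86/avx2.h> is included and SIMDe's AVX
+    set1 helpers are available:
+
+      simde__m256i lo = simde_mm256_set1_epi32(-5);
+      simde__m256i hi = simde_mm256_set1_epi32(10);
+      simde__m256i x  = simde_mm256_set1_epi32(42);
+      // clamp every 32-bit lane of x into [-5, 10]; each lane becomes 10
+      simde__m256i clamped = simde_mm256_min_epi32(simde_mm256_max_epi32(x, lo), hi);
+
+    On AVX2 targets this lowers to the native intrinsics; elsewhere it falls
+    back to the SSE4.1 pair or the scalar loops shown above. */
+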
1608
+ SIMDE__FUNCTION_ATTRIBUTES
1609
+ int32_t
1610
+ simde_mm256_movemask_epi8 (simde__m256i a) {
1611
+ #if defined(SIMDE_AVX2_NATIVE)
1612
+ return _mm256_movemask_epi8(a);
1613
+ #else
1614
+ simde__m256i_private a_ = simde__m256i_to_private(a);
1615
+ int32_t r;
1616
+
1617
+ #if defined(SIMDE_ARCH_X86_SSE2)
1618
+ r = simde_mm_movemask_epi8(a_.m128i[1]);
1619
+ r = (r << 16) | simde_mm_movemask_epi8(a_.m128i[0]);
1620
+ #else
1621
+ r = 0;
1622
+ SIMDE__VECTORIZE_REDUCTION(|:r)
1623
+ for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) {
1624
+ r |= (a_.u8[31 - i] >> 7) << (31 - i);
1625
+ }
1626
+ #endif
1627
+
1628
+ return r;
1629
+ #endif
1630
+ }
1631
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1632
+ # define _mm256_movemask_epi8(a) simde_mm256_movemask_epi8(a)
1633
+ #endif
1634
+
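+ /* Usage sketch (illustrative): simde_mm256_movemask_epi8 packs the sign bit
+    of each of the 32 bytes into an int32_t, matching the native intrinsic.
+    Assuming SIMDe's AVX set1 helper:
+
+      simde__m256i v = simde_mm256_set1_epi8(-1);   // every sign bit set
+      int32_t mask = simde_mm256_movemask_epi8(v);  // all 32 mask bits set, i.e. -1
+
+    A zero vector would instead yield a mask of 0. */
+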
1635
+ SIMDE__FUNCTION_ATTRIBUTES
1636
+ simde__m256i
1637
+ simde_mm256_or_si256 (simde__m256i a, simde__m256i b) {
1638
+ #if defined(SIMDE_AVX2_NATIVE)
1639
+ return _mm256_or_si256(a, b);
1640
+ #else
1641
+ simde__m256i_private
1642
+ r_,
1643
+ a_ = simde__m256i_to_private(a),
1644
+ b_ = simde__m256i_to_private(b);
1645
+
1646
+ #if defined(SIMDE_ARCH_X86_SSE2)
1647
+ r_.m128i[0] = simde_mm_or_si128(a_.m128i[0], b_.m128i[0]);
1648
+ r_.m128i[1] = simde_mm_or_si128(a_.m128i[1], b_.m128i[1]);
1649
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1650
+ r_.i32f = a_.i32f | b_.i32f;
1651
+ #else
1652
+ SIMDE__VECTORIZE
1653
+ for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
1654
+ r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
1655
+ }
1656
+ #endif
1657
+
1658
+ return simde__m256i_from_private(r_);
1659
+ #endif
1660
+ }
1661
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1662
+ # define _mm256_or_si256(a, b) simde_mm256_or_si256(a, b)
1663
+ #endif
1664
+
1665
+ SIMDE__FUNCTION_ATTRIBUTES
1666
+ simde__m256i
1667
+ simde_mm256_packs_epi32 (simde__m256i a, simde__m256i b) {
1668
+ #if defined(SIMDE_AVX2_NATIVE)
1669
+ return _mm256_packs_epi32(a, b);
1670
+ #else
1671
+ simde__m256i_private
1672
+ r_,
1673
+ v_[] = {
1674
+ simde__m256i_to_private(a),
1675
+ simde__m256i_to_private(b)
1676
+ };
1677
+ #if defined(SIMDE_ARCH_X86_SSE2) || defined(SIMDE_SSE2_NEON)
1678
+ r_.m128i_private[0] = simde__m128i_to_private(simde_mm_packs_epi32(simde__m128i_from_private(v_[0].m128i_private[0]), simde__m128i_from_private(v_[1].m128i_private[0])));
1679
+ r_.m128i_private[1] = simde__m128i_to_private(simde_mm_packs_epi32(simde__m128i_from_private(v_[0].m128i_private[1]), simde__m128i_from_private(v_[1].m128i_private[1])));
1680
+ #else
1681
+ SIMDE__VECTORIZE
1682
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1683
+ const int32_t v = v_[(i >> 2) & 1].i32[(i & 11) - ((i & 8) >> 1)];
1684
+ r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, (v > INT16_MAX) ? INT16_MAX : ((v < INT16_MIN) ? INT16_MIN : v));
1685
+ }
1686
+ #endif
1687
+
1688
+ return simde__m256i_from_private(r_);
1689
+ #endif
1690
+ }
1691
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1692
+ # define _mm256_packs_epi32(a, b) simde_mm256_packs_epi32(a, b)
1693
+ #endif
1694
+
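+ /* Usage sketch (illustrative): the pack saturates each 32-bit source lane to
+    the int16_t range and interleaves a and b per 128-bit half, just like the
+    native intrinsic. Assuming SIMDe's AVX set1 helpers:
+
+      simde__m256i a = simde_mm256_set1_epi32(100000);   // above INT16_MAX
+      simde__m256i b = simde_mm256_set1_epi32(-100000);  // below INT16_MIN
+      simde__m256i p = simde_mm256_packs_epi32(a, b);
+      // every 16-bit lane taken from a is 32767, every lane taken from b is -32768
+ */
+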
1695
+ SIMDE__FUNCTION_ATTRIBUTES
1696
+ simde__m256i
1697
+ simde_mm256_permute2x128_si256 (simde__m256i a, simde__m256i b, const int imm8)
1698
+ HEDLEY_REQUIRE_MSG((imm8 & 0xff) == imm8, "imm8 must be in range [0, 255]") {
1699
+ simde__m256i_private
1700
+ r_,
1701
+ a_ = simde__m256i_to_private(a),
1702
+ b_ = simde__m256i_to_private(b);
1703
+
1704
+ r_.m128i_private[0] = (imm8 & 0x08) ? simde__m128i_to_private(simde_mm_setzero_si128()) : ((imm8 & 0x02) ? b_.m128i_private[(imm8 ) & 1] : a_.m128i_private[(imm8 ) & 1]);
1705
+ r_.m128i_private[1] = (imm8 & 0x80) ? simde__m128i_to_private(simde_mm_setzero_si128()) : ((imm8 & 0x20) ? b_.m128i_private[(imm8 >> 4) & 1] : a_.m128i_private[(imm8 >> 4) & 1]);
1706
+
1707
+ return simde__m256i_from_private(r_);
1708
+ }
1709
+ #if defined(SIMDE_AVX2_NATIVE)
1710
+ # define simde_mm256_permute2x128_si256(a, b, imm8) _mm256_permute2x128_si256(a, b, imm8)
1711
+ #endif
1712
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1713
+ # define _mm256_permute2x128_si256(a, b, imm8) simde_mm256_permute2x128_si256(a, b, imm8)
1714
+ #endif
1715
+
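+ /* Usage sketch (illustrative): imm8 selects one 128-bit source lane per
+    output lane (bits 0-1 and 4-5), while bits 3 and 7 zero the corresponding
+    output lane, as implemented above. Given simde__m256i a, b:
+
+      // swap the two 128-bit halves of a (imm8 = 0x01 selects a.hi then a.lo)
+      simde__m256i swapped = simde_mm256_permute2x128_si256(a, a, 0x01);
+      // combine the low half of a with the low half of b (imm8 = 0x20)
+      simde__m256i merged  = simde_mm256_permute2x128_si256(a, b, 0x20);
+ */
+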
1716
+ SIMDE__FUNCTION_ATTRIBUTES
1717
+ simde__m256i
1718
+ simde_mm256_permute4x64_epi64 (simde__m256i a, const int imm8)
1719
+ HEDLEY_REQUIRE_MSG((imm8 & 0xff) == imm8, "imm8 must be in range [0, 255]") {
1720
+ simde__m256i_private
1721
+ r_,
1722
+ a_ = simde__m256i_to_private(a);
1723
+
1724
+ r_.i64[0] = (imm8 & 0x02) ? a_.i64[((imm8 ) & 1)+2] : a_.i64[(imm8 ) & 1];
1725
+ r_.i64[1] = (imm8 & 0x08) ? a_.i64[((imm8 >> 2 ) & 1)+2] : a_.i64[(imm8 >> 2 ) & 1];
1726
+ r_.i64[2] = (imm8 & 0x20) ? a_.i64[((imm8 >> 4 ) & 1)+2] : a_.i64[(imm8 >> 4 ) & 1];
1727
+ r_.i64[3] = (imm8 & 0x80) ? a_.i64[((imm8 >> 6 ) & 1)+2] : a_.i64[(imm8 >> 6 ) & 1];
1728
+
1729
+ return simde__m256i_from_private(r_);
1730
+ }
1731
+ #if defined(SIMDE_AVX2_NATIVE)
1732
+ # define simde_mm256_permute4x64_epi64(a, imm8) _mm256_permute4x64_epi64(a, imm8)
1733
+ #endif
1734
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1735
+ # define _mm256_permute4x64_epi64(a, imm8) simde_mm256_permute4x64_epi64(a, imm8)
1736
+ #endif
1737
+
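+ /* Usage sketch (illustrative): each 2-bit field of imm8 picks a 64-bit
+    source lane for the corresponding output lane, mirroring the four
+    assignments above. Given a simde__m256i a:
+
+      // 0x1B == 0b00011011: output lanes take a[3], a[2], a[1], a[0],
+      // i.e. the four 64-bit lanes are reversed
+      simde__m256i rev = simde_mm256_permute4x64_epi64(a, 0x1B);
+ */
+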
1738
+ SIMDE__FUNCTION_ATTRIBUTES
1739
+ simde__m256i
1740
+ simde_mm256_shuffle_epi8 (simde__m256i a, simde__m256i b) {
1741
+ #if defined(SIMDE_AVX2_NATIVE)
1742
+ return _mm256_shuffle_epi8(a, b);
1743
+ #else
1744
+ simde__m256i_private
1745
+ r_,
1746
+ a_ = simde__m256i_to_private(a),
1747
+ b_ = simde__m256i_to_private(b);
1748
+
1749
+ #if defined(SIMDE_ARCH_X86_SSSE3)
1750
+ r_.m128i[0] = simde_mm_shuffle_epi8(a_.m128i[0], b_.m128i[0]);
1751
+ r_.m128i[1] = simde_mm_shuffle_epi8(a_.m128i[1], b_.m128i[1]);
1752
+ #else
1753
+ SIMDE__VECTORIZE
1754
+ for (size_t i = 0 ; i < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 2) ; i++) {
1755
+ r_.u8[ i ] = (b_.u8[ i ] & 0x80) ? 0 : a_.u8[(b_.u8[ i ] & 0x0f) ];
1756
+ r_.u8[i + 16] = (b_.u8[i + 16] & 0x80) ? 0 : a_.u8[(b_.u8[i + 16] & 0x0f) + 16];
1757
+ }
1758
+ #endif
1759
+
1760
+ return simde__m256i_from_private(r_);
1761
+ #endif
1762
+ }
1763
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1764
+ # define _mm256_shuffle_epi8(a, b) simde_mm256_shuffle_epi8(a, b)
1765
+ #endif
1766
+
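+ /* Usage sketch (illustrative): as with the native intrinsic, the shuffle is
+    performed independently within each 128-bit half, and a control byte with
+    its high bit set zeroes the output byte. Given simde__m256i data:
+
+      simde__m256i idx  = simde_mm256_set1_epi8(0);     // index 0 within each half
+      simde__m256i zero = simde_mm256_set1_epi8(-128);  // high bit set in every control byte
+      simde__m256i bcast   = simde_mm256_shuffle_epi8(data, idx);  // broadcasts byte 0 of each half
+      simde__m256i cleared = simde_mm256_shuffle_epi8(data, zero); // all bytes become 0
+ */
+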
1767
+ SIMDE__FUNCTION_ATTRIBUTES
1768
+ simde__m256i
1769
+ simde_mm256_shuffle_epi32 (simde__m256i a, const int imm8) {
1770
+ simde__m256i_private
1771
+ r_,
1772
+ a_ = simde__m256i_to_private(a);
1773
+
1774
+ for (size_t i = 0 ; i < ((sizeof(r_.i32) / sizeof(r_.i32[0])) / 2) ; i++) {
1775
+ r_.i32[i] = a_.i32[(imm8 >> (i * 2)) & 3];
1776
+ }
1777
+ for (size_t i = 0 ; i < ((sizeof(r_.i32) / sizeof(r_.i32[0])) / 2) ; i++) {
1778
+ r_.i32[i + 4] = a_.i32[((imm8 >> (i * 2)) & 3) + 4];
1779
+ }
1780
+
1781
+ return simde__m256i_from_private(r_);
1782
+ }
1783
+ #if defined(SIMDE_AVX2_NATIVE)
1784
+ # define simde_mm256_shuffle_epi32(a, imm8) _mm256_shuffle_epi32(a, imm8)
1785
+ #elif defined(SIMDE_ARCH_X86_SSE2) && !defined(__PGI)
1786
+ # define simde_mm256_shuffle_epi32(a, imm8) \
1787
+ simde_mm256_set_m128i( \
1788
+ simde_mm_shuffle_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
1789
+ simde_mm_shuffle_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
1790
+ #elif defined(SIMDE__SHUFFLE_VECTOR)
1791
+ # define simde_mm256_shuffle_epi32(a, imm8) (__extension__ ({ \
1792
+ const simde__m256i_private simde__tmp_a_ = simde__m256i_to_private(a); \
1793
+ simde__m256i_from_private((simde__m256i_private) { .i32 = \
1794
+ SIMDE__SHUFFLE_VECTOR(32, 32, \
1795
+ (simde__tmp_a_).i32, \
1796
+ (simde__tmp_a_).i32, \
1797
+ ((imm8) ) & 3, \
1798
+ ((imm8) >> 2) & 3, \
1799
+ ((imm8) >> 4) & 3, \
1800
+ ((imm8) >> 6) & 3, \
1801
+ (((imm8) ) & 3) + 4, \
1802
+ (((imm8) >> 2) & 3) + 4, \
1803
+ (((imm8) >> 4) & 3) + 4, \
1804
+ (((imm8) >> 6) & 3) + 4) }); }))
1805
+ #endif
1806
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1807
+ # define _mm256_shuffle_epi32(a, imm8) simde_mm256_shuffle_epi32(a, imm8)
1808
+ #endif
1809
+
1810
+ #if defined(SIMDE_AVX2_NATIVE)
1811
+ # define simde_mm256_shufflelo_epi16(a, imm8) _mm256_shufflelo_epi16(a, imm8)
1812
+ #elif defined(SIMDE_ARCH_X86_SSE2)
1813
+ # define simde_mm256_shufflelo_epi16(a, imm8) \
1814
+ simde_mm256_set_m128i( \
1815
+ simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
1816
+ simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
1817
+ #elif defined(SIMDE__SHUFFLE_VECTOR)
1818
+ # define simde_mm256_shufflelo_epi16(a, imm8) (__extension__ ({ \
1819
+ const simde__m256i_private simde__tmp_a_ = simde__m256i_to_private(a); \
1820
+ simde__m256i_from_private((simde__m256i_private) { .i16 = \
1821
+ SIMDE__SHUFFLE_VECTOR(16, 32, \
1822
+ (simde__tmp_a_).i16, \
1823
+ (simde__tmp_a_).i16, \
1824
+ (((imm8) ) & 3), \
1825
+ (((imm8) >> 2) & 3), \
1826
+ (((imm8) >> 4) & 3), \
1827
+ (((imm8) >> 6) & 3), \
1828
+ 4, 5, 6, 7, \
1829
+ ((((imm8) ) & 3) + 8), \
1830
+ ((((imm8) >> 2) & 3) + 8), \
1831
+ ((((imm8) >> 4) & 3) + 8), \
1832
+ ((((imm8) >> 6) & 3) + 8), \
1833
+ 12, 13, 14, 15) }); }))
1834
+ #else
1835
+ # define simde_mm256_shufflelo_epi16(a, imm8) \
1836
+ simde_mm256_set_m128i( \
1837
+ simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 1), imm8), \
1838
+ simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 0), imm8))
1839
+ #endif
1840
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1841
+ # define _mm256_shufflelo_epi16(a, imm8) simde_mm256_shufflelo_epi16(a, imm8)
1842
+ #endif
1843
+
1844
+ SIMDE__FUNCTION_ATTRIBUTES
1845
+ simde__m256i
1846
+ simde_mm256_slli_epi16 (simde__m256i a, const int imm8)
1847
+ HEDLEY_REQUIRE_MSG((imm8 & 15) == imm8, "imm8 must be in range [0, 15]") {
1848
+ /* Note: There is no consistency in how compilers handle values outside of
1849
+ the expected range, hence the discrepancy between what we allow and what
1850
+ Intel specifies. Some compilers will return 0, others seem to just mask
1851
+ off everything outside of the range. */
1852
+ simde__m256i_private
1853
+ r_,
1854
+ a_ = simde__m256i_to_private(a);
1855
+
1856
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1857
+ r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, imm8);
1858
+ #else
1859
+ SIMDE__VECTORIZE
1860
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1861
+ r_.i16[i] = a_.i16[i] << (imm8 & 0xff);
1862
+ }
1863
+ #endif
1864
+
1865
+ return simde__m256i_from_private(r_);
1866
+ }
1867
+ #if defined(SIMDE_AVX2_NATIVE)
1868
+ # define simde_mm256_slli_epi16(a, imm8) _mm256_slli_epi16(a, imm8)
1869
+ #elif defined(SIMDE_ARCH_X86_SSE2)
1870
+ # define simde_mm256_slli_epi16(a, imm8) \
1871
+ simde_mm256_set_m128i( \
1872
+ simde_mm_slli_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
1873
+ simde_mm_slli_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
1874
+ #endif
1875
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1876
+ # define _mm256_slli_epi16(a, imm8) simde_mm256_slli_epi16(a, imm8)
1877
+ #endif
1878
+
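+ /* Usage sketch (illustrative): keeping imm8 inside the documented range
+    sidesteps the compiler inconsistency described in the note above.
+
+      simde__m256i v = simde_mm256_set1_epi16(1);
+      simde__m256i s = simde_mm256_slli_epi16(v, 3);   // every 16-bit lane == 8
+ */
+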
1879
+ SIMDE__FUNCTION_ATTRIBUTES
1880
+ simde__m256i
1881
+ simde_mm256_slli_epi32 (simde__m256i a, const int imm8)
1882
+ HEDLEY_REQUIRE_MSG((imm8 & 31) == imm8, "imm8 must be in range [0, 31]") {
1883
+ simde__m256i_private
1884
+ r_,
1885
+ a_ = simde__m256i_to_private(a);
1886
+
1887
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1888
+ r_.i32 = a_.i32 << HEDLEY_STATIC_CAST(int32_t, imm8);
1889
+ #else
1890
+ SIMDE__VECTORIZE
1891
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1892
+ r_.i32[i] = a_.i32[i] << (imm8 & 0xff);
1893
+ }
1894
+ #endif
1895
+
1896
+ return simde__m256i_from_private(r_);
1897
+ }
1898
+ #if defined(SIMDE_AVX2_NATIVE)
1899
+ # define simde_mm256_slli_epi32(a, imm8) _mm256_slli_epi32(a, imm8)
1900
+ #elif defined(SIMDE_ARCH_X86_SSE2)
1901
+ # define simde_mm256_slli_epi32(a, imm8) \
1902
+ simde_mm256_set_m128i( \
1903
+ simde_mm_slli_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
1904
+ simde_mm_slli_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
1905
+ #endif
1906
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1907
+ # define _mm256_slli_epi32(a, imm8) simde_mm256_slli_epi32(a, imm8)
1908
+ #endif
1909
+
1910
+ SIMDE__FUNCTION_ATTRIBUTES
1911
+ simde__m256i
1912
+ simde_mm256_slli_epi64 (simde__m256i a, const int imm8)
1913
+ HEDLEY_REQUIRE_MSG((imm8 & 15) == imm8, "imm8 must be in range [0, 63]") {
1914
+ simde__m256i_private
1915
+ r_,
1916
+ a_ = simde__m256i_to_private(a);
1917
+
1918
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1919
+ r_.i64 = a_.i64 << HEDLEY_STATIC_CAST(int64_t, imm8);
1920
+ #else
1921
+ SIMDE__VECTORIZE
1922
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
1923
+ r_.i64[i] = a_.i64[i] << (imm8 & 0xff);
1924
+ }
1925
+ #endif
1926
+
1927
+ return simde__m256i_from_private(r_);
1928
+ }
1929
+ #if defined(SIMDE_AVX2_NATIVE)
1930
+ # define simde_mm256_slli_epi64(a, imm8) _mm256_slli_epi64(a, imm8)
1931
+ #elif defined(SIMDE_ARCH_X86_SSE2)
1932
+ # define simde_mm256_slli_epi64(a, imm8) \
1933
+ simde_mm256_set_m128i( \
1934
+ simde_mm_slli_epi64(simde_mm256_extracti128_si256(a, 1), (imm8)), \
1935
+ simde_mm_slli_epi64(simde_mm256_extracti128_si256(a, 0), (imm8)))
1936
+ #endif
1937
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1938
+ # define _mm256_slli_epi64(a, imm8) simde_mm256_slli_epi64(a, imm8)
1939
+ #endif
1940
+
1941
+ SIMDE__FUNCTION_ATTRIBUTES
1942
+ simde__m256i
1943
+ simde_mm256_sub_epi8 (simde__m256i a, simde__m256i b) {
1944
+ #if defined(SIMDE_AVX2_NATIVE)
1945
+ return _mm256_sub_epi8(a, b);
1946
+ #else
1947
+ simde__m256i_private
1948
+ r_,
1949
+ a_ = simde__m256i_to_private(a),
1950
+ b_ = simde__m256i_to_private(b);
1951
+
1952
+ #if defined(SIMDE_ARCH_X86_SSE2)
1953
+ r_.m128i[0] = simde_mm_sub_epi8(a_.m128i[0], b_.m128i[0]);
1954
+ r_.m128i[1] = simde_mm_sub_epi8(a_.m128i[1], b_.m128i[1]);
1955
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1956
+ r_.i8 = a_.i8 - b_.i8;
1957
+ #else
1958
+ SIMDE__VECTORIZE
1959
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1960
+ r_.i8[i] = a_.i8[i] - b_.i8[i];
1961
+ }
1962
+ #endif
1963
+
1964
+ return simde__m256i_from_private(r_);
1965
+ #endif
1966
+ }
1967
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1968
+ # define _mm256_sub_epi8(a, b) simde_mm256_sub_epi8(a, b)
1969
+ #endif
1970
+
1971
+ SIMDE__FUNCTION_ATTRIBUTES
1972
+ simde__m256i
1973
+ simde_mm256_sub_epi16 (simde__m256i a, simde__m256i b) {
1974
+ #if defined(SIMDE_AVX2_NATIVE)
1975
+ return _mm256_sub_epi16(a, b);
1976
+ #else
1977
+ simde__m256i_private
1978
+ r_,
1979
+ a_ = simde__m256i_to_private(a),
1980
+ b_ = simde__m256i_to_private(b);
1981
+
1982
+ #if defined(SIMDE_ARCH_X86_SSE2)
1983
+ r_.m128i[0] = simde_mm_sub_epi16(a_.m128i[0], b_.m128i[0]);
1984
+ r_.m128i[1] = simde_mm_sub_epi16(a_.m128i[1], b_.m128i[1]);
1985
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1986
+ r_.i16 = a_.i16 - b_.i16;
1987
+ #else
1988
+ SIMDE__VECTORIZE
1989
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1990
+ r_.i16[i] = a_.i16[i] - b_.i16[i];
1991
+ }
1992
+ #endif
1993
+
1994
+ return simde__m256i_from_private(r_);
1995
+ #endif
1996
+ }
1997
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1998
+ # define _mm256_sub_epi16(a, b) simde_mm256_sub_epi16(a, b)
1999
+ #endif
2000
+
2001
+ SIMDE__FUNCTION_ATTRIBUTES
2002
+ simde__m256i
2003
+ simde_mm256_sub_epi32 (simde__m256i a, simde__m256i b) {
2004
+ #if defined(SIMDE_AVX2_NATIVE)
2005
+ return _mm256_sub_epi32(a, b);
2006
+ #else
2007
+ simde__m256i_private
2008
+ r_,
2009
+ a_ = simde__m256i_to_private(a),
2010
+ b_ = simde__m256i_to_private(b);
2011
+
2012
+ #if defined(SIMDE_ARCH_X86_SSE2)
2013
+ r_.m128i[0] = simde_mm_sub_epi32(a_.m128i[0], b_.m128i[0]);
2014
+ r_.m128i[1] = simde_mm_sub_epi32(a_.m128i[1], b_.m128i[1]);
2015
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2016
+ r_.i32 = a_.i32 - b_.i32;
2017
+ #else
2018
+ SIMDE__VECTORIZE
2019
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
2020
+ r_.i32[i] = a_.i32[i] - b_.i32[i];
2021
+ }
2022
+ #endif
2023
+
2024
+ return simde__m256i_from_private(r_);
2025
+ #endif
2026
+ }
2027
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2028
+ # define _mm256_sub_epi32(a, b) simde_mm256_sub_epi32(a, b)
2029
+ #endif
2030
+
2031
+ SIMDE__FUNCTION_ATTRIBUTES
2032
+ simde__m256i
2033
+ simde_mm256_sub_epi64 (simde__m256i a, simde__m256i b) {
2034
+ #if defined(SIMDE_AVX2_NATIVE)
2035
+ return _mm256_sub_epi64(a, b);
2036
+ #else
2037
+ simde__m256i_private
2038
+ r_,
2039
+ a_ = simde__m256i_to_private(a),
2040
+ b_ = simde__m256i_to_private(b);
2041
+
2042
+ #if defined(SIMDE_ARCH_X86_SSE2)
2043
+ r_.m128i[0] = simde_mm_sub_epi64(a_.m128i[0], b_.m128i[0]);
2044
+ r_.m128i[1] = simde_mm_sub_epi64(a_.m128i[1], b_.m128i[1]);
2045
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2046
+ r_.i64 = a_.i64 - b_.i64;
2047
+ #else
2048
+ SIMDE__VECTORIZE
2049
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
2050
+ r_.i64[i] = a_.i64[i] - b_.i64[i];
2051
+ }
2052
+ #endif
2053
+
2054
+ return simde__m256i_from_private(r_);
2055
+ #endif
2056
+ }
2057
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2058
+ # define _mm256_sub_epi64(a, b) simde_mm256_sub_epi64(a, b)
2059
+ #endif
2060
+
2061
+ SIMDE__FUNCTION_ATTRIBUTES
2062
+ simde__m256i
2063
+ simde_mm256_srli_epi64 (simde__m256i a, const int imm8) {
2064
+ simde__m256i_private
2065
+ r_,
2066
+ a_ = simde__m256i_to_private(a);
2067
+
2068
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
2069
+ r_.u64 = a_.u64 >> HEDLEY_STATIC_CAST(int32_t, imm8);
2070
+ #else
2071
+ SIMDE__VECTORIZE
2072
+ for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
2073
+ r_.u64[i] = a_.u64[i] >> imm8;
2074
+ }
2075
+ #endif
2076
+
2077
+ return simde__m256i_from_private(r_);
2078
+ }
2079
+ #if defined(SIMDE_AVX2_NATIVE)
2080
+ # define simde_mm256_srli_epi64(a, imm8) _mm256_srli_epi64(a, imm8)
2081
+ #elif defined(SIMDE_ARCH_X86_SSE2)
2082
+ # define simde_mm256_srli_epi64(a, imm8) \
2083
+ simde_mm256_set_m128i( \
2084
+ simde_mm_srli_epi64(simde_mm256_extracti128_si256(a, 1), (imm8)), \
2085
+ simde_mm_srli_epi64(simde_mm256_extracti128_si256(a, 0), (imm8)))
2086
+ #endif
2087
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2088
+ # define _mm256_srli_epi64(a, imm8) simde_mm256_srli_epi64(a, imm8)
2089
+ #endif
2090
+
2091
+ SIMDE__FUNCTION_ATTRIBUTES
2092
+ simde__m256i
2093
+ simde_mm256_srli_si256 (simde__m256i a, const int imm8) {
2094
+ simde__m256i_private
2095
+ r_,
2096
+ a_ = simde__m256i_to_private(a);
2097
+
2098
+ for (size_t h = 0 ; h < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; h++) {
2099
+ SIMDE__VECTORIZE
2100
+ for (size_t i = 0 ; i < (sizeof(r_.m128i_private[h].i8) / sizeof(r_.m128i_private[h].i8[0])) ; i++) {
2101
+ const int e = imm8 + HEDLEY_STATIC_CAST(int, i);
2102
+ r_.m128i_private[h].i8[i] = (e < 16) ? a_.m128i_private[h].i8[e] : 0;
2103
+ }
2104
+ }
2105
+
2106
+ return simde__m256i_from_private(r_);
2107
+ }
2108
+ #if defined(SIMDE_AVX2_NATIVE)
2109
+ # define simde_mm256_srli_si256(a, imm8) _mm256_srli_si256(a, imm8)
2110
+ #elif defined(SIMDE_ARCH_X86_SSE2) && !defined(__PGI)
2111
+ # define simde_mm256_srli_si256(a, imm8) \
2112
+ simde_mm256_set_m128i( \
2113
+ simde_mm_srli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \
2114
+ simde_mm_srli_si128(simde_mm256_extracti128_si256(a, 0), (imm8)))
2115
+ #elif defined(SIMDE_SSE2_NEON)
2116
+ # define simde_mm256_srli_si256(a, imm8) \
2117
+ simde_mm256_set_m128i( \
2118
+ simde_mm_bsrli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \
2119
+ simde_mm_bsrli_si128(simde_mm256_extracti128_si256(a, 0), (imm8)))
2120
+ #endif
2121
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2122
+ # define _mm256_srli_si256(a, imm8) simde_mm256_srli_si256(a, imm8)
2123
+ #endif
2124
+
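+ /* Usage sketch (illustrative): as with the native intrinsic, the byte shift
+    operates on each 128-bit half separately, so bytes never move across the
+    lane boundary. Given simde__m256i v:
+
+      // shift each half right by 4 bytes; the top 4 bytes of each half become 0
+      simde__m256i r = simde_mm256_srli_si256(v, 4);
+ */
+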
2125
+ SIMDE__FUNCTION_ATTRIBUTES
2126
+ simde__m256i
2127
+ simde_mm256_unpacklo_epi8 (simde__m256i a, simde__m256i b) {
2128
+ #if defined(SIMDE_AVX2_NATIVE)
2129
+ return _mm256_unpacklo_epi8(a, b);
2130
+ #else
2131
+ simde__m256i_private
2132
+ r_,
2133
+ a_ = simde__m256i_to_private(a),
2134
+ b_ = simde__m256i_to_private(b);
2135
+
2136
+ #if defined(SIMDE__SHUFFLE_VECTOR)
2137
+ r_.i8 = SIMDE__SHUFFLE_VECTOR(8, 32, a_.i8, b_.i8,
2138
+ 0, 32, 1, 33, 2, 34, 3, 35,
2139
+ 4, 36, 5, 37, 6, 38, 7, 39,
2140
+ 16, 48, 17, 49, 18, 50, 19, 51,
2141
+ 20, 52, 21, 53, 22, 54, 23, 55);
2142
+ #else
2143
+ r_.m128i[0] = simde_mm_unpacklo_epi8(a_.m128i[0], b_.m128i[0]);
2144
+ r_.m128i[1] = simde_mm_unpacklo_epi8(a_.m128i[1], b_.m128i[1]);
2145
+ #endif
2146
+
2147
+ return simde__m256i_from_private(r_);
2148
+ #endif
2149
+ }
2150
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2151
+ # define _mm256_unpacklo_epi8(a, b) simde_mm256_unpacklo_epi8(a, b)
2152
+ #endif
2153
+
2154
+ SIMDE__FUNCTION_ATTRIBUTES
2155
+ simde__m256i
2156
+ simde_mm256_unpacklo_epi16 (simde__m256i a, simde__m256i b) {
2157
+ #if defined(SIMDE_AVX2_NATIVE)
2158
+ return _mm256_unpacklo_epi16(a, b);
2159
+ #else
2160
+ simde__m256i_private
2161
+ r_,
2162
+ a_ = simde__m256i_to_private(a),
2163
+ b_ = simde__m256i_to_private(b);
2164
+
2165
+ #if defined(SIMDE__SHUFFLE_VECTOR)
2166
+ r_.i16 = SIMDE__SHUFFLE_VECTOR(16, 32, a_.i16, b_.i16,
2167
+ 0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27);
2168
+ #else
2169
+ r_.m128i[0] = simde_mm_unpacklo_epi16(a_.m128i[0], b_.m128i[0]);
2170
+ r_.m128i[1] = simde_mm_unpacklo_epi16(a_.m128i[1], b_.m128i[1]);
2171
+ #endif
2172
+
2173
+ return simde__m256i_from_private(r_);
2174
+ #endif
2175
+ }
2176
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2177
+ # define _mm256_unpacklo_epi16(a, b) simde_mm256_unpacklo_epi16(a, b)
2178
+ #endif
2179
+
2180
+ SIMDE__FUNCTION_ATTRIBUTES
2181
+ simde__m256i
2182
+ simde_mm256_unpacklo_epi32 (simde__m256i a, simde__m256i b) {
2183
+ #if defined(SIMDE_AVX2_NATIVE)
2184
+ return _mm256_unpacklo_epi32(a, b);
2185
+ #else
2186
+ simde__m256i_private
2187
+ r_,
2188
+ a_ = simde__m256i_to_private(a),
2189
+ b_ = simde__m256i_to_private(b);
2190
+
2191
+ #if defined(SIMDE__SHUFFLE_VECTOR)
2192
+ r_.i32 = SIMDE__SHUFFLE_VECTOR(32, 32, a_.i32, b_.i32,
2193
+ 0, 8, 1, 9, 4, 12, 5, 13);
2194
+ #else
2195
+ r_.m128i[0] = simde_mm_unpacklo_epi32(a_.m128i[0], b_.m128i[0]);
2196
+ r_.m128i[1] = simde_mm_unpacklo_epi32(a_.m128i[1], b_.m128i[1]);
2197
+ #endif
2198
+
2199
+ return simde__m256i_from_private(r_);
2200
+ #endif
2201
+ }
2202
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2203
+ # define _mm256_unpacklo_epi32(a, b) simde_mm256_unpacklo_epi32(a, b)
2204
+ #endif
2205
+
2206
+ SIMDE__FUNCTION_ATTRIBUTES
2207
+ simde__m256i
2208
+ simde_mm256_unpacklo_epi64 (simde__m256i a, simde__m256i b) {
2209
+ #if defined(SIMDE_AVX2_NATIVE)
2210
+ return _mm256_unpacklo_epi64(a, b);
2211
+ #else
2212
+ simde__m256i_private
2213
+ r_,
2214
+ a_ = simde__m256i_to_private(a),
2215
+ b_ = simde__m256i_to_private(b);
2216
+
2217
+ #if defined(SIMDE__SHUFFLE_VECTOR)
2218
+ r_.i64 = SIMDE__SHUFFLE_VECTOR(64, 32, a_.i64, b_.i64, 0, 4, 2, 6);
2219
+ #else
2220
+ r_.m128i[0] = simde_mm_unpacklo_epi64(a_.m128i[0], b_.m128i[0]);
2221
+ r_.m128i[1] = simde_mm_unpacklo_epi64(a_.m128i[1], b_.m128i[1]);
2222
+ #endif
2223
+
2224
+ return simde__m256i_from_private(r_);
2225
+ #endif
2226
+ }
2227
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2228
+ # define _mm256_unpacklo_epi64(a, b) simde_mm256_unpacklo_epi64(a, b)
2229
+ #endif
2230
+
2231
+ SIMDE__FUNCTION_ATTRIBUTES
2232
+ simde__m256i
2233
+ simde_mm256_unpackhi_epi8 (simde__m256i a, simde__m256i b) {
2234
+ #if defined(SIMDE_AVX2_NATIVE)
2235
+ return _mm256_unpackhi_epi8(a, b);
2236
+ #else
2237
+ simde__m256i_private
2238
+ r_,
2239
+ a_ = simde__m256i_to_private(a),
2240
+ b_ = simde__m256i_to_private(b);
2241
+
2242
+ #if defined(SIMDE__SHUFFLE_VECTOR)
2243
+ r_.i8 = SIMDE__SHUFFLE_VECTOR(8, 32, a_.i8, b_.i8,
2244
+ 8, 40, 9, 41, 10, 42, 11, 43,
2245
+ 12, 44, 13, 45, 14, 46, 15, 47,
2246
+ 24, 56, 25, 57, 26, 58, 27, 59,
2247
+ 28, 60, 29, 61, 30, 62, 31, 63);
2248
+ #else
2249
+ r_.m128i[0] = simde_mm_unpackhi_epi8(a_.m128i[0], b_.m128i[0]);
2250
+ r_.m128i[1] = simde_mm_unpackhi_epi8(a_.m128i[1], b_.m128i[1]);
2251
+ #endif
2252
+
2253
+ return simde__m256i_from_private(r_);
2254
+ #endif
2255
+ }
2256
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2257
+ # define _mm256_unpackhi_epi8(a, b) simde_mm256_unpackhi_epi8(a, b)
2258
+ #endif
2259
+
2260
+ SIMDE__FUNCTION_ATTRIBUTES
2261
+ simde__m256i
2262
+ simde_mm256_unpackhi_epi16 (simde__m256i a, simde__m256i b) {
2263
+ #if defined(SIMDE_AVX2_NATIVE)
2264
+ return _mm256_unpackhi_epi16(a, b);
2265
+ #else
2266
+ simde__m256i_private
2267
+ r_,
2268
+ a_ = simde__m256i_to_private(a),
2269
+ b_ = simde__m256i_to_private(b);
2270
+
2271
+ #if defined(SIMDE__SHUFFLE_VECTOR)
2272
+ r_.i16 = SIMDE__SHUFFLE_VECTOR(16, 32, a_.i16, b_.i16,
2273
+ 4, 20, 5, 21, 6, 22, 7, 23,
2274
+ 12, 28, 13, 29, 14, 30, 15, 31);
2275
+ #else
2276
+ r_.m128i[0] = simde_mm_unpackhi_epi16(a_.m128i[0], b_.m128i[0]);
2277
+ r_.m128i[1] = simde_mm_unpackhi_epi16(a_.m128i[1], b_.m128i[1]);
2278
+ #endif
2279
+
2280
+ return simde__m256i_from_private(r_);
2281
+ #endif
2282
+ }
2283
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2284
+ # define _mm256_unpackhi_epi16(a, b) simde_mm256_unpackhi_epi16(a, b)
2285
+ #endif
2286
+
2287
+ SIMDE__FUNCTION_ATTRIBUTES
2288
+ simde__m256i
2289
+ simde_mm256_unpackhi_epi32 (simde__m256i a, simde__m256i b) {
2290
+ #if defined(SIMDE_AVX2_NATIVE)
2291
+ return _mm256_unpackhi_epi32(a, b);
2292
+ #else
2293
+ simde__m256i_private
2294
+ r_,
2295
+ a_ = simde__m256i_to_private(a),
2296
+ b_ = simde__m256i_to_private(b);
2297
+
2298
+ #if defined(SIMDE__SHUFFLE_VECTOR)
2299
+ r_.i32 = SIMDE__SHUFFLE_VECTOR(32, 32, a_.i32, b_.i32,
2300
+ 2, 10, 3, 11, 6, 14, 7, 15);
2301
+ #else
2302
+ r_.m128i[0] = simde_mm_unpackhi_epi32(a_.m128i[0], b_.m128i[0]);
2303
+ r_.m128i[1] = simde_mm_unpackhi_epi32(a_.m128i[1], b_.m128i[1]);
2304
+ #endif
2305
+
2306
+ return simde__m256i_from_private(r_);
2307
+ #endif
2308
+ }
2309
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2310
+ # define _mm256_unpackhi_epi32(a, b) simde_mm256_unpackhi_epi32(a, b)
2311
+ #endif
2312
+
2313
+ SIMDE__FUNCTION_ATTRIBUTES
2314
+ simde__m256i
2315
+ simde_mm256_unpackhi_epi64 (simde__m256i a, simde__m256i b) {
2316
+ #if defined(SIMDE_AVX2_NATIVE)
2317
+ return _mm256_unpackhi_epi64(a, b);
2318
+ #else
2319
+ simde__m256i_private
2320
+ r_,
2321
+ a_ = simde__m256i_to_private(a),
2322
+ b_ = simde__m256i_to_private(b);
2323
+
2324
+ #if defined(SIMDE__SHUFFLE_VECTOR)
2325
+ r_.i64 = SIMDE__SHUFFLE_VECTOR(64, 32, a_.i64, b_.i64, 1, 5, 3, 7);
2326
+ #else
2327
+ r_.m128i[0] = simde_mm_unpackhi_epi64(a_.m128i[0], b_.m128i[0]);
2328
+ r_.m128i[1] = simde_mm_unpackhi_epi64(a_.m128i[1], b_.m128i[1]);
2329
+ #endif
2330
+
2331
+ return simde__m256i_from_private(r_);
2332
+ #endif
2333
+ }
2334
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2335
+ # define _mm256_unpackhi_epi64(a, b) simde_mm256_unpackhi_epi64(a, b)
2336
+ #endif
2337
+
2338
+ SIMDE__FUNCTION_ATTRIBUTES
2339
+ simde__m256i
2340
+ simde_mm256_xor_si256 (simde__m256i a, simde__m256i b) {
2341
+ #if defined(SIMDE_AVX2_NATIVE)
2342
+ return _mm256_xor_si256(a, b);
2343
+ #else
2344
+ simde__m256i_private
2345
+ r_,
2346
+ a_ = simde__m256i_to_private(a),
2347
+ b_ = simde__m256i_to_private(b);
2348
+
2349
+ #if defined(SIMDE_ARCH_X86_SSE2)
2350
+ r_.m128i[0] = simde_mm_xor_si128(a_.m128i[0], b_.m128i[0]);
2351
+ r_.m128i[1] = simde_mm_xor_si128(a_.m128i[1], b_.m128i[1]);
2352
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2353
+ r_.i32f = a_.i32f ^ b_.i32f;
2354
+ #else
2355
+ SIMDE__VECTORIZE
2356
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
2357
+ r_.i64[i] = a_.i64[i] ^ b_.i64[i];
2358
+ }
2359
+ #endif
2360
+
2361
+ return simde__m256i_from_private(r_);
2362
+ #endif
2363
+ }
2364
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2365
+ # define _mm256_xor_si256(a, b) simde_mm256_xor_si256(a, b)
2366
+ #endif
2367
+
2368
+ SIMDE__FUNCTION_ATTRIBUTES
2369
+ simde__m256i
2370
+ simde_mm256_srli_epi32 (simde__m256i a, const int imm8) {
2371
+ simde__m256i_private
2372
+ r_,
2373
+ a_ = simde__m256i_to_private(a);
2374
+
2375
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
2376
+ r_.u32 = a_.u32 >> HEDLEY_STATIC_CAST(int16_t, imm8);
2377
+ #else
2378
+ SIMDE__VECTORIZE
2379
+ for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
2380
+ r_.u32[i] = a_.u32[i] >> imm8;
2381
+ }
2382
+ #endif
2383
+
2384
+ return simde__m256i_from_private(r_);
2385
+ }
2386
+ #if defined(SIMDE_AVX2_NATIVE)
2387
+ # define simde_mm256_srli_epi32(a, imm8) _mm256_srli_epi32(a, imm8)
2388
+ #elif defined(SIMDE_ARCH_X86_SSE2)
2389
+ # define simde_mm256_srli_epi32(a, imm8) \
2390
+ simde_mm256_set_m128i( \
2391
+ simde_mm_srli_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
2392
+ simde_mm_srli_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
2393
+ #endif
2394
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2395
+ # define _mm256_srli_epi32(a, imm8) simde_mm256_srli_epi32(a, imm8)
2396
+ #endif
2397
+
2398
+ SIMDE__END_DECLS
2399
+
2400
+ HEDLEY_DIAGNOSTIC_POP
2401
+
2402
+ #endif /* !defined(SIMDE__AVX2_H) */