minimap2 0.2.25.0 → 0.2.25.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. checksums.yaml +4 -4
  2. data/README.md +2 -3
  3. data/ext/minimap2/Makefile +6 -2
  4. data/ext/minimap2/NEWS.md +38 -0
  5. data/ext/minimap2/README.md +9 -3
  6. data/ext/minimap2/align.c +5 -3
  7. data/ext/minimap2/cookbook.md +2 -2
  8. data/ext/minimap2/format.c +7 -4
  9. data/ext/minimap2/kalloc.c +20 -1
  10. data/ext/minimap2/kalloc.h +13 -2
  11. data/ext/minimap2/ksw2.h +1 -0
  12. data/ext/minimap2/ksw2_extd2_sse.c +1 -1
  13. data/ext/minimap2/ksw2_exts2_sse.c +79 -40
  14. data/ext/minimap2/ksw2_extz2_sse.c +1 -1
  15. data/ext/minimap2/lchain.c +15 -16
  16. data/ext/minimap2/lib/simde/CONTRIBUTING.md +114 -0
  17. data/ext/minimap2/lib/simde/COPYING +20 -0
  18. data/ext/minimap2/lib/simde/README.md +333 -0
  19. data/ext/minimap2/lib/simde/amalgamate.py +58 -0
  20. data/ext/minimap2/lib/simde/meson.build +33 -0
  21. data/ext/minimap2/lib/simde/netlify.toml +20 -0
  22. data/ext/minimap2/lib/simde/simde/arm/neon/float32x2.h +140 -0
  23. data/ext/minimap2/lib/simde/simde/arm/neon/float32x4.h +137 -0
  24. data/ext/minimap2/lib/simde/simde/arm/neon/float64x1.h +142 -0
  25. data/ext/minimap2/lib/simde/simde/arm/neon/float64x2.h +145 -0
  26. data/ext/minimap2/lib/simde/simde/arm/neon/int16x4.h +140 -0
  27. data/ext/minimap2/lib/simde/simde/arm/neon/int16x8.h +145 -0
  28. data/ext/minimap2/lib/simde/simde/arm/neon/int32x2.h +140 -0
  29. data/ext/minimap2/lib/simde/simde/arm/neon/int32x4.h +143 -0
  30. data/ext/minimap2/lib/simde/simde/arm/neon/int64x1.h +137 -0
  31. data/ext/minimap2/lib/simde/simde/arm/neon/int64x2.h +141 -0
  32. data/ext/minimap2/lib/simde/simde/arm/neon/int8x16.h +147 -0
  33. data/ext/minimap2/lib/simde/simde/arm/neon/int8x8.h +141 -0
  34. data/ext/minimap2/lib/simde/simde/arm/neon/uint16x4.h +134 -0
  35. data/ext/minimap2/lib/simde/simde/arm/neon/uint16x8.h +138 -0
  36. data/ext/minimap2/lib/simde/simde/arm/neon/uint32x2.h +134 -0
  37. data/ext/minimap2/lib/simde/simde/arm/neon/uint32x4.h +137 -0
  38. data/ext/minimap2/lib/simde/simde/arm/neon/uint64x1.h +131 -0
  39. data/ext/minimap2/lib/simde/simde/arm/neon/uint64x2.h +135 -0
  40. data/ext/minimap2/lib/simde/simde/arm/neon/uint8x16.h +141 -0
  41. data/ext/minimap2/lib/simde/simde/arm/neon/uint8x8.h +135 -0
  42. data/ext/minimap2/lib/simde/simde/arm/neon.h +97 -0
  43. data/ext/minimap2/lib/simde/simde/check.h +267 -0
  44. data/ext/minimap2/lib/simde/simde/debug-trap.h +83 -0
  45. data/ext/minimap2/lib/simde/simde/hedley.h +1899 -0
  46. data/ext/minimap2/lib/simde/simde/simde-arch.h +445 -0
  47. data/ext/minimap2/lib/simde/simde/simde-common.h +697 -0
  48. data/ext/minimap2/lib/simde/simde/x86/avx.h +5385 -0
  49. data/ext/minimap2/lib/simde/simde/x86/avx2.h +2402 -0
  50. data/ext/minimap2/lib/simde/simde/x86/avx512bw.h +391 -0
  51. data/ext/minimap2/lib/simde/simde/x86/avx512f.h +3389 -0
  52. data/ext/minimap2/lib/simde/simde/x86/avx512vl.h +112 -0
  53. data/ext/minimap2/lib/simde/simde/x86/fma.h +659 -0
  54. data/ext/minimap2/lib/simde/simde/x86/mmx.h +2210 -0
  55. data/ext/minimap2/lib/simde/simde/x86/sse.h +3696 -0
  56. data/ext/minimap2/lib/simde/simde/x86/sse2.h +5991 -0
  57. data/ext/minimap2/lib/simde/simde/x86/sse3.h +343 -0
  58. data/ext/minimap2/lib/simde/simde/x86/sse4.1.h +1783 -0
  59. data/ext/minimap2/lib/simde/simde/x86/sse4.2.h +105 -0
  60. data/ext/minimap2/lib/simde/simde/x86/ssse3.h +1053 -0
  61. data/ext/minimap2/lib/simde/simde/x86/svml.h +543 -0
  62. data/ext/minimap2/lib/simde/test/CMakeLists.txt +166 -0
  63. data/ext/minimap2/lib/simde/test/arm/meson.build +4 -0
  64. data/ext/minimap2/lib/simde/test/arm/neon/meson.build +23 -0
  65. data/ext/minimap2/lib/simde/test/arm/neon/skel.c +871 -0
  66. data/ext/minimap2/lib/simde/test/arm/neon/test-neon-internal.h +134 -0
  67. data/ext/minimap2/lib/simde/test/arm/neon/test-neon.c +39 -0
  68. data/ext/minimap2/lib/simde/test/arm/neon/test-neon.h +10 -0
  69. data/ext/minimap2/lib/simde/test/arm/neon/vadd.c +1260 -0
  70. data/ext/minimap2/lib/simde/test/arm/neon/vdup_n.c +873 -0
  71. data/ext/minimap2/lib/simde/test/arm/neon/vmul.c +1084 -0
  72. data/ext/minimap2/lib/simde/test/arm/neon/vsub.c +1260 -0
  73. data/ext/minimap2/lib/simde/test/arm/test-arm-internal.h +18 -0
  74. data/ext/minimap2/lib/simde/test/arm/test-arm.c +20 -0
  75. data/ext/minimap2/lib/simde/test/arm/test-arm.h +8 -0
  76. data/ext/minimap2/lib/simde/test/cmake/AddCompilerFlags.cmake +171 -0
  77. data/ext/minimap2/lib/simde/test/cmake/ExtraWarningFlags.cmake +68 -0
  78. data/ext/minimap2/lib/simde/test/meson.build +64 -0
  79. data/ext/minimap2/lib/simde/test/munit/COPYING +21 -0
  80. data/ext/minimap2/lib/simde/test/munit/Makefile +55 -0
  81. data/ext/minimap2/lib/simde/test/munit/README.md +54 -0
  82. data/ext/minimap2/lib/simde/test/munit/example.c +351 -0
  83. data/ext/minimap2/lib/simde/test/munit/meson.build +37 -0
  84. data/ext/minimap2/lib/simde/test/munit/munit.c +2055 -0
  85. data/ext/minimap2/lib/simde/test/munit/munit.h +535 -0
  86. data/ext/minimap2/lib/simde/test/run-tests.c +20 -0
  87. data/ext/minimap2/lib/simde/test/run-tests.h +260 -0
  88. data/ext/minimap2/lib/simde/test/x86/avx.c +13752 -0
  89. data/ext/minimap2/lib/simde/test/x86/avx2.c +9977 -0
  90. data/ext/minimap2/lib/simde/test/x86/avx512bw.c +2664 -0
  91. data/ext/minimap2/lib/simde/test/x86/avx512f.c +10416 -0
  92. data/ext/minimap2/lib/simde/test/x86/avx512vl.c +210 -0
  93. data/ext/minimap2/lib/simde/test/x86/fma.c +2557 -0
  94. data/ext/minimap2/lib/simde/test/x86/meson.build +33 -0
  95. data/ext/minimap2/lib/simde/test/x86/mmx.c +2878 -0
  96. data/ext/minimap2/lib/simde/test/x86/skel.c +2984 -0
  97. data/ext/minimap2/lib/simde/test/x86/sse.c +5121 -0
  98. data/ext/minimap2/lib/simde/test/x86/sse2.c +9860 -0
  99. data/ext/minimap2/lib/simde/test/x86/sse3.c +486 -0
  100. data/ext/minimap2/lib/simde/test/x86/sse4.1.c +3446 -0
  101. data/ext/minimap2/lib/simde/test/x86/sse4.2.c +101 -0
  102. data/ext/minimap2/lib/simde/test/x86/ssse3.c +2084 -0
  103. data/ext/minimap2/lib/simde/test/x86/svml.c +1545 -0
  104. data/ext/minimap2/lib/simde/test/x86/test-avx.h +16 -0
  105. data/ext/minimap2/lib/simde/test/x86/test-avx512.h +25 -0
  106. data/ext/minimap2/lib/simde/test/x86/test-mmx.h +13 -0
  107. data/ext/minimap2/lib/simde/test/x86/test-sse.h +13 -0
  108. data/ext/minimap2/lib/simde/test/x86/test-sse2.h +13 -0
  109. data/ext/minimap2/lib/simde/test/x86/test-x86-internal.h +196 -0
  110. data/ext/minimap2/lib/simde/test/x86/test-x86.c +48 -0
  111. data/ext/minimap2/lib/simde/test/x86/test-x86.h +8 -0
  112. data/ext/minimap2/main.c +13 -6
  113. data/ext/minimap2/map.c +0 -5
  114. data/ext/minimap2/minimap.h +40 -31
  115. data/ext/minimap2/minimap2.1 +19 -5
  116. data/ext/minimap2/misc/paftools.js +545 -24
  117. data/ext/minimap2/options.c +1 -1
  118. data/ext/minimap2/pyproject.toml +2 -0
  119. data/ext/minimap2/python/mappy.pyx +3 -1
  120. data/ext/minimap2/seed.c +1 -1
  121. data/ext/minimap2/setup.py +32 -22
  122. data/lib/minimap2/version.rb +1 -1
  123. metadata +100 -3
@@ -0,0 +1,2402 @@
1
+ /* Permission is hereby granted, free of charge, to any person
2
+ * obtaining a copy of this software and associated documentation
3
+ * files (the "Software"), to deal in the Software without
4
+ * restriction, including without limitation the rights to use, copy,
5
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
6
+ * of the Software, and to permit persons to whom the Software is
7
+ * furnished to do so, subject to the following conditions:
8
+ *
9
+ * The above copyright notice and this permission notice shall be
10
+ * included in all copies or substantial portions of the Software.
11
+ *
12
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
13
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
14
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
15
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
16
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
17
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ * SOFTWARE.
20
+ *
21
+ * Copyright:
22
+ * 2018 Evan Nemerson <evan@nemerson.com>
23
+ * 2019 Michael R. Crusoe <michael.crusoe@gmail.com>
24
+ */
25
+
26
+ #include "sse4.1.h"
27
+ #include "sse4.2.h"
28
+ #if !defined(SIMDE__AVX2_H)
29
+ # if !defined(SIMDE__AVX2_H)
30
+ # define SIMDE__AVX2_H
31
+ # endif
32
+ # include "avx.h"
33
+
34
+ HEDLEY_DIAGNOSTIC_PUSH
35
+ SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
36
+
37
+ # if defined(SIMDE_AVX2_NATIVE)
38
+ # undef SIMDE_AVX2_NATIVE
39
+ # endif
40
+ # if defined(SIMDE_ARCH_X86_AVX2) && !defined(SIMDE_AVX2_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
41
+ # define SIMDE_AVX2_NATIVE
42
+ # elif defined(SIMDE_ARCH_ARM_NEON) && !defined(SIMDE_AVX2_NO_NEON) && !defined(SIMDE_NO_NEON)
43
+ # define SIMDE_AVX2_NEON
44
+ # endif
45
+
46
/* Each AVX2 backend depends on the matching AVX backend.  If native AVX2 was
 * selected without native AVX, either fail (when SIMDE_AVX2_FORCE_NATIVE is
 * set) or warn and drop SIMDE_AVX2_NATIVE.  If the NEON AVX2 backend was
 * selected without the NEON AVX backend, warn and drop SIMDE_AVX2_NEON. */
#  if defined(SIMDE_AVX2_NATIVE) && !defined(SIMDE_AVX_NATIVE)
#    if defined(SIMDE_AVX2_FORCE_NATIVE)
#      error Native AVX2 support requires native AVX support
#    else
HEDLEY_WARNING("Native AVX2 support requires native AVX support, disabling")
#      undef SIMDE_AVX2_NATIVE
#    endif
#  elif defined(SIMDE_AVX2_NEON) && !defined(SIMDE_AVX_NEON)
HEDLEY_WARNING("AVX2 NEON support requires AVX NEON support, disabling")
/* Undefine the AVX2 macro: SIMDE_AVX_NEON is already undefined in this branch. */
#    undef SIMDE_AVX2_NEON
#  endif
57
+
58
+ # if defined(SIMDE_AVX2_NATIVE)
59
+ # include <immintrin.h>
60
+ # endif
61
+
62
+ # if !defined(SIMDE_AVX2_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
63
+ # define SIMDE_AVX2_ENABLE_NATIVE_ALIASES
64
+ # endif
65
+
66
+ # include <stdint.h>
67
+
68
+ SIMDE__BEGIN_DECLS
69
+
70
+ SIMDE__FUNCTION_ATTRIBUTES
71
+ simde__m256i
72
+ simde_mm256_abs_epi8 (simde__m256i a) {
73
+ #if defined(SIMDE_AVX2_NATIVE)
74
+ return _mm256_abs_epi8(a);
75
+ #else
76
+ simde__m256i_private
77
+ r_,
78
+ a_ = simde__m256i_to_private(a);
79
+
80
+ SIMDE__VECTORIZE
81
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
82
+ r_.i8[i] = (a_.i8[i] < INT32_C(0)) ? -a_.i8[i] : a_.i8[i];
83
+ }
84
+
85
+ return simde__m256i_from_private(r_);
86
+ #endif
87
+ }
88
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
89
+ # define _mm256_abs_epi8(a) simde_mm256_abs_epi8(a)
90
+ #endif
91
+
92
+ SIMDE__FUNCTION_ATTRIBUTES
93
+ simde__m256i
94
+ simde_mm256_abs_epi16 (simde__m256i a) {
95
+ #if defined(SIMDE_AVX2_NATIVE)
96
+ return _mm256_abs_epi16(a);
97
+ #else
98
+ simde__m256i_private
99
+ r_,
100
+ a_ = simde__m256i_to_private(a);
101
+
102
+ SIMDE__VECTORIZE
103
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
104
+ r_.i16[i] = (a_.i16[i] < INT32_C(0)) ? -a_.i16[i] : a_.i16[i];
105
+ }
106
+
107
+ return simde__m256i_from_private(r_);
108
+ #endif
109
+ }
110
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
111
+ # define _mm256_abs_epi16(a) simde_mm256_abs_epi16(a)
112
+ #endif
113
+
114
+ SIMDE__FUNCTION_ATTRIBUTES
115
+ simde__m256i
116
+ simde_mm256_abs_epi32(simde__m256i a) {
117
+ #if defined(SIMDE_AVX2_NATIVE)
118
+ return _mm256_abs_epi32(a);
119
+ #else
120
+ simde__m256i_private
121
+ r_,
122
+ a_ = simde__m256i_to_private(a);
123
+
124
+ SIMDE__VECTORIZE
125
+ for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
126
+ r_.i32[i] = (a_.i32[i] < INT32_C(0)) ? -a_.i32[i] : a_.i32[i];
127
+ }
128
+
129
+ return simde__m256i_from_private(r_);
130
+ #endif
131
+ }
132
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
133
+ # define _mm256_abs_epi32(a) simde_mm256_abs_epi32(a)
134
+ #endif
135
+
136
+ SIMDE__FUNCTION_ATTRIBUTES
137
+ simde__m256i
138
+ simde_mm256_add_epi8 (simde__m256i a, simde__m256i b) {
139
+ #if defined(SIMDE_AVX2_NATIVE)
140
+ return _mm256_add_epi8(a, b);
141
+ #else
142
+ simde__m256i_private
143
+ r_,
144
+ a_ = simde__m256i_to_private(a),
145
+ b_ = simde__m256i_to_private(b);
146
+
147
+ #if defined(SIMDE_ARCH_X86_SSE2)
148
+ r_.m128i[0] = simde_mm_add_epi8(a_.m128i[0], b_.m128i[0]);
149
+ r_.m128i[1] = simde_mm_add_epi8(a_.m128i[1], b_.m128i[1]);
150
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
151
+ r_.i8 = a_.i8 + b_.i8;
152
+ #else
153
+ SIMDE__VECTORIZE
154
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
155
+ r_.i8[i] = a_.i8[i] + b_.i8[i];
156
+ }
157
+ #endif
158
+
159
+ return simde__m256i_from_private(r_);
160
+ #endif
161
+ }
162
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
163
+ # define _mm256_add_epi8(a, b) simde_mm256_add_epi8(a, b)
164
+ #endif
165
+
166
+ SIMDE__FUNCTION_ATTRIBUTES
167
+ simde__m256i
168
+ simde_mm256_add_epi16 (simde__m256i a, simde__m256i b) {
169
+ #if defined(SIMDE_AVX2_NATIVE)
170
+ return _mm256_add_epi16(a, b);
171
+ #else
172
+ simde__m256i_private
173
+ r_,
174
+ a_ = simde__m256i_to_private(a),
175
+ b_ = simde__m256i_to_private(b);
176
+
177
+ #if defined(SIMDE_ARCH_X86_SSE2)
178
+ r_.m128i[0] = simde_mm_add_epi16(a_.m128i[0], b_.m128i[0]);
179
+ r_.m128i[1] = simde_mm_add_epi16(a_.m128i[1], b_.m128i[1]);
180
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
181
+ r_.i16 = a_.i16 + b_.i16;
182
+ #else
183
+ SIMDE__VECTORIZE
184
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
185
+ r_.i16[i] = a_.i16[i] + b_.i16[i];
186
+ }
187
+ #endif
188
+
189
+ return simde__m256i_from_private(r_);
190
+ #endif
191
+ }
192
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
193
+ # define _mm256_add_epi16(a, b) simde_mm256_add_epi16(a, b)
194
+ #endif
195
+
196
+ SIMDE__FUNCTION_ATTRIBUTES
197
+ simde__m256i
198
+ simde_mm256_add_epi32 (simde__m256i a, simde__m256i b) {
199
+ #if defined(SIMDE_AVX2_NATIVE)
200
+ return _mm256_add_epi32(a, b);
201
+ #else
202
+ simde__m256i_private
203
+ r_,
204
+ a_ = simde__m256i_to_private(a),
205
+ b_ = simde__m256i_to_private(b);
206
+
207
+ #if defined(SIMDE_ARCH_X86_SSE2)
208
+ r_.m128i[0] = simde_mm_add_epi32(a_.m128i[0], b_.m128i[0]);
209
+ r_.m128i[1] = simde_mm_add_epi32(a_.m128i[1], b_.m128i[1]);
210
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
211
+ r_.i32 = a_.i32 + b_.i32;
212
+ #else
213
+ SIMDE__VECTORIZE
214
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
215
+ r_.i32[i] = a_.i32[i] + b_.i32[i];
216
+ }
217
+ #endif
218
+
219
+ return simde__m256i_from_private(r_);
220
+ #endif
221
+ }
222
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
223
+ # define _mm256_add_epi32(a, b) simde_mm256_add_epi32(a, b)
224
+ #endif
225
+
226
+ SIMDE__FUNCTION_ATTRIBUTES
227
+ simde__m256i
228
+ simde_mm256_add_epi64 (simde__m256i a, simde__m256i b) {
229
+ #if defined(SIMDE_AVX2_NATIVE)
230
+ return _mm256_add_epi64(a, b);
231
+ #else
232
+ simde__m256i_private
233
+ r_,
234
+ a_ = simde__m256i_to_private(a),
235
+ b_ = simde__m256i_to_private(b);
236
+
237
+ #if defined(SIMDE_ARCH_X86_SSE2)
238
+ r_.m128i[0] = simde_mm_add_epi64(a_.m128i[0], b_.m128i[0]);
239
+ r_.m128i[1] = simde_mm_add_epi64(a_.m128i[1], b_.m128i[1]);
240
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
241
+ r_.i64 = a_.i64 + b_.i64;
242
+ #else
243
+ SIMDE__VECTORIZE
244
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
245
+ r_.i64[i] = a_.i64[i] + b_.i64[i];
246
+ }
247
+ #endif
248
+
249
+ return simde__m256i_from_private(r_);
250
+ #endif
251
+ }
252
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
253
+ # define _mm256_add_epi64(a, b) simde_mm256_add_epi64(a, b)
254
+ #endif
255
+
256
+ SIMDE__FUNCTION_ATTRIBUTES
257
+ simde__m256i
258
+ simde_mm256_alignr_epi8 (simde__m256i a, simde__m256i b, int count) {
259
+ simde__m256i_private
260
+ r_,
261
+ a_ = simde__m256i_to_private(a),
262
+ b_ = simde__m256i_to_private(b);
263
+
264
+ if (HEDLEY_UNLIKELY(count > 31))
265
+ return simde_mm256_setzero_si256();
266
+
267
+ for (size_t h = 0 ; h < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; h++) {
268
+ SIMDE__VECTORIZE
269
+ for (size_t i = 0 ; i < (sizeof(r_.m128i_private[h].i8) / sizeof(r_.m128i_private[h].i8[0])) ; i++) {
270
+ const int srcpos = count + HEDLEY_STATIC_CAST(int, i);
271
+ if (srcpos > 31) {
272
+ r_.m128i_private[h].i8[i] = 0;
273
+ } else if (srcpos > 15) {
274
+ r_.m128i_private[h].i8[i] = a_.m128i_private[h].i8[(srcpos) & 15];
275
+ } else {
276
+ r_.m128i_private[h].i8[i] = b_.m128i_private[h].i8[srcpos];
277
+ }
278
+ }
279
+ }
280
+
281
+ return simde__m256i_from_private(r_);
282
+ }
283
+ #if defined(SIMDE_AVX2_NATIVE)
284
+ # define simde_mm256_alignr_epi8(a, b, count) _mm256_alignr_epi8(a, b, count)
285
+ #elif defined(SIMDE_ARCH_X86_SSSE3)
286
+ # define simde_mm256_alignr_epi8(a, b, count) \
287
+ simde_mm256_set_m128i( \
288
+ simde_mm_alignr_epi8(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (count)), \
289
+ simde_mm_alignr_epi8(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (count)))
290
+ #endif
291
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
292
+ # define _mm256_alignr_epi8(a, b, count) simde_mm256_alignr_epi8(a, b, (count))
293
+ #endif
294
+
295
+ SIMDE__FUNCTION_ATTRIBUTES
296
+ simde__m256i
297
+ simde_mm256_and_si256 (simde__m256i a, simde__m256i b) {
298
+ #if defined(SIMDE_AVX2_NATIVE)
299
+ return _mm256_and_si256(a, b);
300
+ #else
301
+ simde__m256i_private
302
+ r_,
303
+ a_ = simde__m256i_to_private(a),
304
+ b_ = simde__m256i_to_private(b);
305
+
306
+ #if defined(SIMDE_ARCH_X86_SSE2)
307
+ r_.m128i[0] = simde_mm_and_si128(a_.m128i[0], b_.m128i[0]);
308
+ r_.m128i[1] = simde_mm_and_si128(a_.m128i[1], b_.m128i[1]);
309
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
310
+ r_.i32f = a_.i32f & b_.i32f;
311
+ #else
312
+ SIMDE__VECTORIZE
313
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
314
+ r_.i64[i] = a_.i64[i] & b_.i64[i];
315
+ }
316
+ #endif
317
+
318
+ return simde__m256i_from_private(r_);
319
+ #endif
320
+ }
321
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
322
+ # define _mm256_and_si256(a, b) simde_mm256_and_si256(a, b)
323
+ #endif
324
+
325
+ SIMDE__FUNCTION_ATTRIBUTES
326
+ simde__m256i
327
+ simde_mm256_andnot_si256 (simde__m256i a, simde__m256i b) {
328
+ #if defined(SIMDE_AVX2_NATIVE)
329
+ return _mm256_andnot_si256(a, b);
330
+ #else
331
+ simde__m256i_private
332
+ r_,
333
+ a_ = simde__m256i_to_private(a),
334
+ b_ = simde__m256i_to_private(b);
335
+
336
+ #if defined(SIMDE_ARCH_X86_SSE2) || defined(SIMDE_SSE2_NEON)
337
+ r_.m128i_private[0] = simde__m128i_to_private(simde_mm_andnot_si128(simde__m128i_from_private(a_.m128i_private[0]), simde__m128i_from_private(b_.m128i_private[0])));
338
+ r_.m128i_private[1] = simde__m128i_to_private(simde_mm_andnot_si128(simde__m128i_from_private(a_.m128i_private[1]), simde__m128i_from_private(b_.m128i_private[1])));
339
+ #else
340
+ SIMDE__VECTORIZE
341
+ for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
342
+ r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i];
343
+ }
344
+ #endif
345
+
346
+ return simde__m256i_from_private(r_);
347
+ #endif
348
+ }
349
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
350
+ # define _mm256_andnot_si256(a, b) simde_mm256_andnot_si256(a, b)
351
+ #endif
352
+
353
+ SIMDE__FUNCTION_ATTRIBUTES
354
+ simde__m256i
355
+ simde_mm256_adds_epi8 (simde__m256i a, simde__m256i b) {
356
+ #if defined(SIMDE_AVX2_NATIVE)
357
+ return _mm256_adds_epi8(a, b);
358
+ #else
359
+ simde__m256i_private
360
+ r_,
361
+ a_ = simde__m256i_to_private(a),
362
+ b_ = simde__m256i_to_private(b);
363
+
364
+ #if defined(SIMDE_ARCH_X86_SSE2) && !defined(HEDLEY_INTEL_VERSION)
365
+ SIMDE__VECTORIZE
366
+ for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
367
+ r_.m128i[i] = simde_mm_adds_epi8(a_.m128i[i], b_.m128i[i]);
368
+ }
369
+ #else
370
+ SIMDE__VECTORIZE
371
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
372
+ const int32_t tmp =
373
+ HEDLEY_STATIC_CAST(int16_t, a_.i8[i]) +
374
+ HEDLEY_STATIC_CAST(int16_t, b_.i8[i]);
375
+ r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, ((tmp < INT8_MAX) ? ((tmp > INT8_MIN) ? tmp : INT8_MIN) : INT8_MAX));
376
+ }
377
+ #endif
378
+
379
+ return simde__m256i_from_private(r_);
380
+ #endif
381
+ }
382
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
383
+ # define _mm256_adds_epi8(a, b) simde_mm256_adds_epi8(a, b)
384
+ #endif
385
+
386
+ SIMDE__FUNCTION_ATTRIBUTES
387
+ simde__m256i
388
+ simde_mm256_adds_epi16(simde__m256i a, simde__m256i b) {
389
+ #if defined(SIMDE_AVX2_NATIVE)
390
+ return _mm256_adds_epi16(a, b);
391
+ #else
392
+ simde__m256i_private
393
+ r_,
394
+ a_ = simde__m256i_to_private(a),
395
+ b_ = simde__m256i_to_private(b);
396
+
397
+ #if defined(SIMDE_ARCH_X86_SSE2) && !defined(HEDLEY_INTEL_VERSION)
398
+ SIMDE__VECTORIZE
399
+ for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
400
+ r_.m128i[i] = simde_mm_adds_epi16(a_.m128i[i], b_.m128i[i]);
401
+ }
402
+ #else
403
+ SIMDE__VECTORIZE
404
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
405
+ const int32_t tmp =
406
+ HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) +
407
+ HEDLEY_STATIC_CAST(int32_t, b_.i16[i]);
408
+ r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, ((tmp < INT16_MAX) ? ((tmp > INT16_MIN) ? tmp : INT16_MIN) : INT16_MAX));
409
+ }
410
+ #endif
411
+
412
+ return simde__m256i_from_private(r_);
413
+ #endif
414
+ }
415
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
416
+ # define _mm256_adds_epi16(a, b) simde_mm256_adds_epi16(a, b)
417
+ #endif
418
+
419
+ SIMDE__FUNCTION_ATTRIBUTES
420
+ simde__m256i
421
+ simde_mm256_adds_epu8 (simde__m256i a, simde__m256i b) {
422
+ #if defined(SIMDE_AVX2_NATIVE)
423
+ return _mm256_adds_epu8(a, b);
424
+ #else
425
+ simde__m256i_private
426
+ r_,
427
+ a_ = simde__m256i_to_private(a),
428
+ b_ = simde__m256i_to_private(b);
429
+
430
+ #if defined(SIMDE_ARCH_X86_SSE2)
431
+ r_.m128i[0] = simde_mm_adds_epu8(a_.m128i[0], b_.m128i[0]);
432
+ r_.m128i[1] = simde_mm_adds_epu8(a_.m128i[1], b_.m128i[1]);
433
+ #else
434
+ SIMDE__VECTORIZE
435
+ for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
436
+ r_.u8[i] = ((UINT8_MAX - a_.u8[i]) > b_.u8[i]) ? (a_.u8[i] + b_.u8[i]) : UINT8_MAX;
437
+ }
438
+ #endif
439
+
440
+ return simde__m256i_from_private(r_);
441
+ #endif
442
+ }
443
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
444
+ # define _mm256_adds_epu8(a, b) simde_mm256_adds_epu8(a, b)
445
+ #endif
446
+
447
+ SIMDE__FUNCTION_ATTRIBUTES
448
+ simde__m256i
449
+ simde_mm256_adds_epu16(simde__m256i a, simde__m256i b) {
450
+ #if defined(SIMDE_AVX2_NATIVE)
451
+ return _mm256_adds_epu16(a, b);
452
+ #else
453
+ simde__m256i_private
454
+ r_,
455
+ a_ = simde__m256i_to_private(a),
456
+ b_ = simde__m256i_to_private(b);
457
+
458
+ #if defined(SIMDE_ARCH_X86_SSE2)
459
+ r_.m128i[0] = simde_mm_adds_epu16(a_.m128i[0], b_.m128i[0]);
460
+ r_.m128i[1] = simde_mm_adds_epu16(a_.m128i[1], b_.m128i[1]);
461
+ #else
462
+ SIMDE__VECTORIZE
463
+ for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
464
+ r_.u16[i] = ((UINT16_MAX - a_.u16[i]) > b_.u16[i]) ? (a_.u16[i] + b_.u16[i]) : UINT16_MAX;
465
+ }
466
+ #endif
467
+
468
+ return simde__m256i_from_private(r_);
469
+ #endif
470
+ }
471
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
472
+ # define _mm256_adds_epu16(a, b) simde_mm256_adds_epu16(a, b)
473
+ #endif
474
+
475
+ SIMDE__FUNCTION_ATTRIBUTES
476
+ simde__m256i
477
+ simde_mm256_avg_epu8 (simde__m256i a, simde__m256i b) {
478
+ #if defined(SIMDE_AVX2_NATIVE)
479
+ return _mm256_avg_epu8(a, b);
480
+ #else
481
+ simde__m256i_private
482
+ r_,
483
+ a_ = simde__m256i_to_private(a),
484
+ b_ = simde__m256i_to_private(b);
485
+
486
+ SIMDE__VECTORIZE
487
+ for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
488
+ r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
489
+ }
490
+
491
+ return simde__m256i_from_private(r_);
492
+ #endif
493
+ }
494
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
495
+ # define _mm256_avg_epu8(a, b) simde_mm256_avg_epu8(a, b)
496
+ #endif
497
+
498
+ SIMDE__FUNCTION_ATTRIBUTES
499
+ simde__m256i
500
+ simde_mm256_avg_epu16 (simde__m256i a, simde__m256i b) {
501
+ #if defined(SIMDE_AVX2_NATIVE)
502
+ return _mm256_avg_epu16(a, b);
503
+ #else
504
+ simde__m256i_private
505
+ r_,
506
+ a_ = simde__m256i_to_private(a),
507
+ b_ = simde__m256i_to_private(b);
508
+
509
+ SIMDE__VECTORIZE
510
+ for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
511
+ r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
512
+ }
513
+
514
+ return simde__m256i_from_private(r_);
515
+ #endif
516
+ }
517
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
518
+ # define _mm256_avg_epu16(a, b) simde_mm256_avg_epu16(a, b)
519
+ #endif
520
+
521
+ SIMDE__FUNCTION_ATTRIBUTES
522
+ simde__m128i
523
+ simde_mm_blend_epi32(simde__m128i a, simde__m128i b, const int imm8)
524
+ HEDLEY_REQUIRE_MSG((imm8 & 0xff) == imm8, "imm8 must be in range [0, 255]") {
525
+ simde__m128i_private
526
+ r_,
527
+ a_ = simde__m128i_to_private(a),
528
+ b_ = simde__m128i_to_private(b);
529
+
530
+ SIMDE__VECTORIZE
531
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
532
+ r_.i32[i] = ((imm8 >> i) & 1) ? b_.i32[i] : a_.i32[i];
533
+ }
534
+
535
+ return simde__m128i_from_private(r_);
536
+ }
537
+ #if defined(SIMDE_AVX2_NATIVE)
538
+ # define simde_mm_blend_epi32(a, b, imm8) _mm_blend_epi32(a, b, imm8);
539
+ #endif
540
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
541
+ # define _mm_blend_epi32(a, b, imm8) simde_mm_blend_epi32(a, b, imm8)
542
+ #endif
543
+
544
+ SIMDE__FUNCTION_ATTRIBUTES
545
+ simde__m256i
546
+ simde_mm256_blend_epi16(simde__m256i a, simde__m256i b, const int imm8)
547
+ HEDLEY_REQUIRE_MSG((imm8 & 0xff) == imm8, "imm8 must be in range [0, 255]") {
548
+ simde__m256i_private
549
+ r_,
550
+ a_ = simde__m256i_to_private(a),
551
+ b_ = simde__m256i_to_private(b);
552
+
553
+ SIMDE__VECTORIZE
554
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
555
+ r_.i16[i] = ((imm8 >> i%8) & 1) ? b_.i16[i] : a_.i16[i];
556
+ }
557
+
558
+ return simde__m256i_from_private(r_);
559
+ }
560
+ #if defined(SIMDE_AVX2_NATIVE)
561
+ # define simde_mm256_blend_epi16(a, b, imm8) _mm256_blend_epi16(a, b, imm8);
562
+ #endif
563
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
564
+ # define _mm256_blend_epi16(a, b, imm8) simde_mm256_blend_epi16(a, b, imm8)
565
+ #endif
566
+
567
+
568
+ SIMDE__FUNCTION_ATTRIBUTES
569
+ simde__m256i
570
+ simde_mm256_blend_epi32(simde__m256i a, simde__m256i b, const int imm8)
571
+ HEDLEY_REQUIRE_MSG((imm8 & 0xff) == imm8, "imm8 must be in range [0, 255]") {
572
+ simde__m256i_private
573
+ r_,
574
+ a_ = simde__m256i_to_private(a),
575
+ b_ = simde__m256i_to_private(b);
576
+
577
+ SIMDE__VECTORIZE
578
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
579
+ r_.i32[i] = ((imm8 >> i) & 1) ? b_.i32[i] : a_.i32[i];
580
+ }
581
+
582
+ return simde__m256i_from_private(r_);
583
+ }
584
+ #if defined(SIMDE_AVX2_NATIVE)
585
+ # define simde_mm256_blend_epi32(a, b, imm8) _mm256_blend_epi32(a, b, imm8);
586
+ #endif
587
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
588
+ # define _mm256_blend_epi32(a, b, imm8) simde_mm256_blend_epi32(a, b, imm8)
589
+ #endif
590
+
591
+
592
+ SIMDE__FUNCTION_ATTRIBUTES
593
+ simde__m256i
594
+ simde_mm256_blendv_epi8(simde__m256i a, simde__m256i b, simde__m256i mask) {
595
+ #if defined(SIMDE_AVX2_NATIVE)
596
+ return _mm256_blendv_epi8(a, b, mask);
597
+ #else
598
+ simde__m256i_private
599
+ r_,
600
+ a_ = simde__m256i_to_private(a),
601
+ b_ = simde__m256i_to_private(b),
602
+ mask_ = simde__m256i_to_private(mask);
603
+
604
+ #if defined(SIMDE_ARCH_X86_SSE4_1)
605
+ r_.m128i_private[0] = simde__m128i_to_private(simde_mm_blendv_epi8(simde__m128i_from_private(a_.m128i_private[0]), simde__m128i_from_private(b_.m128i_private[0]), simde__m128i_from_private(mask_.m128i_private[0])));
606
+ r_.m128i_private[1] = simde__m128i_to_private(simde_mm_blendv_epi8(simde__m128i_from_private(a_.m128i_private[1]), simde__m128i_from_private(b_.m128i_private[1]), simde__m128i_from_private(mask_.m128i_private[1])));
607
+ #else
608
+ SIMDE__VECTORIZE
609
+ for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
610
+ if (mask_.u8[i] & 0x80) {
611
+ r_.u8[i] = b_.u8[i];
612
+ } else {
613
+ r_.u8[i] = a_.u8[i];
614
+ }
615
+ }
616
+ #endif
617
+
618
+ return simde__m256i_from_private(r_);
619
+ #endif
620
+ }
621
+ #if defined(SIMDE_AVX2_NATIVE)
622
+ # define simde_mm256_blendv_epi8(a, b, imm8) _mm256_blendv_epi8(a, b, imm8);
623
+ #endif
624
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
625
+ # define _mm256_blendv_epi8(a, b, mask) simde_mm256_blendv_epi8(a, b, mask)
626
+ #endif
627
+
628
+ SIMDE__FUNCTION_ATTRIBUTES
629
+ simde__m128i
630
+ simde_mm_broadcastb_epi8 (simde__m128i a) {
631
+ #if defined(SIMDE_AVX2_NATIVE)
632
+ return _mm_broadcastb_epi8(a);
633
+ #else
634
+ simde__m128i_private r_;
635
+ simde__m128i_private a_= simde__m128i_to_private(a);
636
+
637
+ SIMDE__VECTORIZE
638
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
639
+ r_.i8[i] = a_.i8[0];
640
+ }
641
+
642
+ return simde__m128i_from_private(r_);
643
+ #endif
644
+ }
645
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
646
+ # define _mm_broadcastb_epi8(a) simde_mm_broadcastb_epi8(a)
647
+ #endif
648
+
649
+ SIMDE__FUNCTION_ATTRIBUTES
650
+ simde__m256i
651
+ simde_mm256_broadcastb_epi8 (simde__m128i a) {
652
+ #if defined(SIMDE_AVX2_NATIVE)
653
+ return _mm256_broadcastb_epi8(a);
654
+ #else
655
+ simde__m256i_private r_;
656
+ simde__m128i_private a_= simde__m128i_to_private(a);
657
+
658
+ SIMDE__VECTORIZE
659
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
660
+ r_.i8[i] = a_.i8[0];
661
+ }
662
+
663
+ return simde__m256i_from_private(r_);
664
+ #endif
665
+ }
666
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
667
+ # define _mm256_broadcastb_epi8(a) simde_mm256_broadcastb_epi8(a)
668
+ #endif
669
+
670
+ SIMDE__FUNCTION_ATTRIBUTES
671
+ simde__m256i
672
+ simde_mm256_broadcastsi128_si256 (simde__m128i a) {
673
+ #if defined(SIMDE_AVX2_NATIVE)
674
+ return _mm256_broadcastsi128_si256(a);
675
+ #else
676
+ simde__m256i_private r_;
677
+ simde__m128i_private a_ = simde__m128i_to_private(a);
678
+
679
+ #if defined(SIMDE_ARCH_X86_SSE2)
680
+ r_.m128i_private[0] = a_;
681
+ r_.m128i_private[1] = a_;
682
+ #else
683
+ r_.i64[0] = a_.i64[0];
684
+ r_.i64[1] = a_.i64[1];
685
+ r_.i64[2] = a_.i64[0];
686
+ r_.i64[3] = a_.i64[1];
687
+ #endif
688
+
689
+ return simde__m256i_from_private(r_);
690
+ #endif
691
+ }
692
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
693
+ # define _mm256_broadcastsi128_si256(a) simde_mm256_broadcastsi128_si256(a)
694
+ #endif
695
+
696
+ SIMDE__FUNCTION_ATTRIBUTES
697
+ simde__m256i
698
+ simde_mm256_cmpeq_epi8 (simde__m256i a, simde__m256i b) {
699
+ #if defined(SIMDE_AVX2_NATIVE)
700
+ return _mm256_cmpeq_epi8(a, b);
701
+ #else
702
+ simde__m256i_private
703
+ r_,
704
+ a_ = simde__m256i_to_private(a),
705
+ b_ = simde__m256i_to_private(b);
706
+
707
+ #if defined(SIMDE_ARCH_X86_SSE2)
708
+ r_.m128i[0] = simde_mm_cmpeq_epi8(a_.m128i[0], b_.m128i[0]);
709
+ r_.m128i[1] = simde_mm_cmpeq_epi8(a_.m128i[1], b_.m128i[1]);
710
+ #else
711
+ SIMDE__VECTORIZE
712
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
713
+ r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
714
+ }
715
+ #endif
716
+
717
+ return simde__m256i_from_private(r_);
718
+ #endif
719
+ }
720
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
721
+ # define _mm256_cmpeq_epi8(a, b) simde_mm256_cmpeq_epi8(a, b)
722
+ #endif
723
+
724
+ SIMDE__FUNCTION_ATTRIBUTES
725
+ simde__m256i
726
+ simde_mm256_cmpeq_epi16 (simde__m256i a, simde__m256i b) {
727
+ #if defined(SIMDE_AVX2_NATIVE)
728
+ return _mm256_cmpeq_epi16(a, b);
729
+ #else
730
+ simde__m256i_private
731
+ r_,
732
+ a_ = simde__m256i_to_private(a),
733
+ b_ = simde__m256i_to_private(b);
734
+
735
+ SIMDE__VECTORIZE
736
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
737
+ r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
738
+ }
739
+
740
+ return simde__m256i_from_private(r_);
741
+ #endif
742
+ }
743
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
744
+ # define _mm256_cmpeq_epi16(a, b) simde_mm256_cmpeq_epi16(a, b)
745
+ #endif
746
+
747
+ SIMDE__FUNCTION_ATTRIBUTES
748
+ simde__m256i
749
+ simde_mm256_cmpeq_epi32 (simde__m256i a, simde__m256i b) {
750
+ #if defined(SIMDE_AVX2_NATIVE)
751
+ return _mm256_cmpeq_epi32(a, b);
752
+ #else
753
+ simde__m256i_private
754
+ r_,
755
+ a_ = simde__m256i_to_private(a),
756
+ b_ = simde__m256i_to_private(b);
757
+
758
+ #if defined(SIMDE_ARCH_X86_SSE2) || defined(SIMDE_SSE2_NEON)
759
+ r_.m128i[0] = simde_mm_cmpeq_epi32(a_.m128i[0], b_.m128i[0]);
760
+ r_.m128i[1] = simde_mm_cmpeq_epi32(a_.m128i[1], b_.m128i[1]);
761
+ #else
762
+ SIMDE__VECTORIZE
763
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
764
+ r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
765
+ }
766
+ #endif
767
+
768
+ return simde__m256i_from_private(r_);
769
+ #endif
770
+ }
771
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
772
+ # define _mm256_cmpeq_epi32(a, b) simde_mm256_cmpeq_epi32(a, b)
773
+ #endif
774
+
775
+ SIMDE__FUNCTION_ATTRIBUTES
776
+ simde__m256i
777
+ simde_mm256_cmpeq_epi64 (simde__m256i a, simde__m256i b) {
778
+ #if defined(SIMDE_AVX2_NATIVE)
779
+ return _mm256_cmpeq_epi64(a, b);
780
+ #else
781
+ simde__m256i_private
782
+ r_,
783
+ a_ = simde__m256i_to_private(a),
784
+ b_ = simde__m256i_to_private(b);
785
+
786
+ #if defined(SIMDE_ARCH_X86_SSE2) || defined(SIMDE_SSE2_NEON)
787
+ r_.m128i[0] = simde_mm_cmpeq_epi64(a_.m128i[0], b_.m128i[0]);
788
+ r_.m128i[1] = simde_mm_cmpeq_epi64(a_.m128i[1], b_.m128i[1]);
789
+ #else
790
+ SIMDE__VECTORIZE
791
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
792
+ r_.i64[i] = (a_.i64[i] == b_.i64[i]) ? ~INT64_C(0) : INT64_C(0);
793
+ }
794
+ #endif
795
+
796
+ return simde__m256i_from_private(r_);
797
+ #endif
798
+ }
799
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
800
+ # define _mm256_cmpeq_epi64(a, b) simde_mm256_cmpeq_epi64(a, b)
801
+ #endif
802
+
803
+ SIMDE__FUNCTION_ATTRIBUTES
804
+ simde__m256i
805
+ simde_mm256_cmpgt_epi8 (simde__m256i a, simde__m256i b) {
806
+ #if defined(SIMDE_AVX2_NATIVE)
807
+ return _mm256_cmpgt_epi8(a, b);
808
+ #else
809
+ simde__m256i_private
810
+ r_,
811
+ a_ = simde__m256i_to_private(a),
812
+ b_ = simde__m256i_to_private(b);
813
+
814
+ #if defined(SIMDE_ARCH_X86_SSE2)
815
+ r_.m128i[0] = simde_mm_cmpgt_epi8(a_.m128i[0], b_.m128i[0]);
816
+ r_.m128i[1] = simde_mm_cmpgt_epi8(a_.m128i[1], b_.m128i[1]);
817
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
818
+ r_.i8 = a_.i8 > b_.i8;
819
+ #else
820
+ SIMDE__VECTORIZE
821
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
822
+ r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
823
+ }
824
+ #endif
825
+
826
+ return simde__m256i_from_private(r_);
827
+ #endif
828
+ }
829
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
830
+ # define _mm256_cmpgt_epi8(a, b) simde_mm256_cmpgt_epi8(a, b)
831
+ #endif
832
+
833
+ SIMDE__FUNCTION_ATTRIBUTES
834
+ simde__m256i
835
+ simde_mm256_cmpgt_epi16 (simde__m256i a, simde__m256i b) {
836
+ #if defined(SIMDE_AVX2_NATIVE)
837
+ return _mm256_cmpgt_epi16(a, b);
838
+ #else
839
+ simde__m256i_private
840
+ r_,
841
+ a_ = simde__m256i_to_private(a),
842
+ b_ = simde__m256i_to_private(b);
843
+
844
+ #if defined(SIMDE_ARCH_X86_SSE2)
845
+ r_.m128i[0] = simde_mm_cmpgt_epi16(a_.m128i[0], b_.m128i[0]);
846
+ r_.m128i[1] = simde_mm_cmpgt_epi16(a_.m128i[1], b_.m128i[1]);
847
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
848
+ r_.i16 = a_.i16 > b_.i16;
849
+ #else
850
+ SIMDE__VECTORIZE
851
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
852
+ r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
853
+ }
854
+ #endif
855
+
856
+ return simde__m256i_from_private(r_);
857
+ #endif
858
+ }
859
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
860
+ # define _mm256_cmpgt_epi16(a, b) simde_mm256_cmpgt_epi16(a, b)
861
+ #endif
862
+
863
+ SIMDE__FUNCTION_ATTRIBUTES
864
+ simde__m256i
865
+ simde_mm256_cmpgt_epi32 (simde__m256i a, simde__m256i b) {
866
+ #if defined(SIMDE_AVX2_NATIVE)
867
+ return _mm256_cmpgt_epi32(a, b);
868
+ #else
869
+ simde__m256i_private
870
+ r_,
871
+ a_ = simde__m256i_to_private(a),
872
+ b_ = simde__m256i_to_private(b);
873
+
874
+ #if defined(SIMDE_ARCH_X86_SSE2)
875
+ r_.m128i[0] = simde_mm_cmpgt_epi32(a_.m128i[0], b_.m128i[0]);
876
+ r_.m128i[1] = simde_mm_cmpgt_epi32(a_.m128i[1], b_.m128i[1]);
877
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
878
+ r_.i32 = a_.i32 > b_.i32;
879
+ #else
880
+ SIMDE__VECTORIZE
881
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
882
+ r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
883
+ }
884
+ #endif
885
+
886
+ return simde__m256i_from_private(r_);
887
+ #endif
888
+ }
889
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
890
+ # define _mm256_cmpgt_epi32(a, b) simde_mm256_cmpgt_epi32(a, b)
891
+ #endif
892
+
893
+ SIMDE__FUNCTION_ATTRIBUTES
894
+ simde__m256i
895
+ simde_mm256_cmpgt_epi64 (simde__m256i a, simde__m256i b) {
896
+ #if defined(SIMDE_AVX2_NATIVE)
897
+ return _mm256_cmpgt_epi64(a, b);
898
+ #else
899
+ simde__m256i_private
900
+ r_,
901
+ a_ = simde__m256i_to_private(a),
902
+ b_ = simde__m256i_to_private(b);
903
+
904
+ #if defined(SIMDE_ARCH_X86_SSE2)
905
+ r_.m128i[0] = simde_mm_cmpgt_epi64(a_.m128i[0], b_.m128i[0]);
906
+ r_.m128i[1] = simde_mm_cmpgt_epi64(a_.m128i[1], b_.m128i[1]);
907
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
908
+ r_.i64 = a_.i64 > b_.i64;
909
+ #else
910
+ SIMDE__VECTORIZE
911
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
912
+ r_.i64[i] = (a_.i64[i] > b_.i64[i]) ? ~INT64_C(0) : INT64_C(0);
913
+ }
914
+ #endif
915
+
916
+ return simde__m256i_from_private(r_);
917
+ #endif
918
+ }
919
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
920
+ # define _mm256_cmpgt_epi64(a, b) simde_mm256_cmpgt_epi64(a, b)
921
+ #endif
922
+
923
+ SIMDE__FUNCTION_ATTRIBUTES
924
+ simde__m256i
925
+ simde_mm256_cvtepi8_epi16 (simde__m128i a) {
926
+ #if defined(SIMDE_AVX2_NATIVE)
927
+ return _mm256_cvtepi8_epi16(a);
928
+ #else
929
+ simde__m256i_private r_;
930
+ simde__m128i_private a_ = simde__m128i_to_private(a);
931
+
932
+ #if defined(SIMDE__CONVERT_VECTOR)
933
+ SIMDE__CONVERT_VECTOR(r_.i16, a_.i8);
934
+ #else
935
+ SIMDE__VECTORIZE
936
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
937
+ r_.i16[i] = a_.i8[i];
938
+ }
939
+ #endif
940
+
941
+ return simde__m256i_from_private(r_);
942
+ #endif
943
+ }
944
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
945
+ # define _mm256_cvtepi8_epi16(a) simde_mm256_cvtepi8_epi16(a)
946
+ #endif
947
+
948
+ SIMDE__FUNCTION_ATTRIBUTES
949
+ simde__m256i
950
+ simde_mm256_cvtepi8_epi32 (simde__m128i a) {
951
+ #if defined(SIMDE_AVX2_NATIVE)
952
+ return _mm256_cvtepi8_epi32(a);
953
+ #else
954
+ simde__m256i_private r_;
955
+ simde__m128i_private a_ = simde__m128i_to_private(a);
956
+
957
+ #if defined(SIMDE__CONVERT_VECTOR)
958
+ SIMDE__CONVERT_VECTOR(r_.i32, a_.m64_private[0].i8);
959
+ #else
960
+ SIMDE__VECTORIZE
961
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
962
+ r_.i32[i] = a_.i8[i];
963
+ }
964
+ #endif
965
+
966
+ return simde__m256i_from_private(r_);
967
+ #endif
968
+ }
969
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
970
+ # define _mm256_cvtepi8_epi32(a) simde_mm256_cvtepi8_epi32(a)
971
+ #endif
972
+
973
+ SIMDE__FUNCTION_ATTRIBUTES
974
+ simde__m256i
975
+ simde_mm256_cvtepi8_epi64 (simde__m128i a) {
976
+ #if defined(SIMDE_AVX2_NATIVE)
977
+ return _mm256_cvtepi8_epi64(a);
978
+ #else
979
+ simde__m256i_private r_;
980
+ simde__m128i_private a_ = simde__m128i_to_private(a);
981
+
982
+ SIMDE__VECTORIZE
983
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
984
+ r_.i64[i] = a_.i8[i];
985
+ }
986
+
987
+ return simde__m256i_from_private(r_);
988
+ #endif
989
+ }
990
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
991
+ # define _mm256_cvtepi8_epi64(a) simde_mm256_cvtepi8_epi64(a)
992
+ #endif
993
+
994
+ SIMDE__FUNCTION_ATTRIBUTES
995
+ simde__m256i
996
+ simde_mm256_cvtepi16_epi32 (simde__m128i a) {
997
+ #if defined(SIMDE_AVX2_NATIVE)
998
+ return _mm256_cvtepi16_epi32(a);
999
+ #else
1000
+ simde__m256i_private r_;
1001
+ simde__m128i_private a_ = simde__m128i_to_private(a);
1002
+
1003
+ #if defined(SIMDE__CONVERT_VECTOR)
1004
+ SIMDE__CONVERT_VECTOR(r_.i32, a_.i16);
1005
+ #else
1006
+ SIMDE__VECTORIZE
1007
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1008
+ r_.i32[i] = a_.i16[i];
1009
+ }
1010
+ #endif
1011
+
1012
+ return simde__m256i_from_private(r_);
1013
+ #endif
1014
+ }
1015
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1016
+ # define _mm256_cvtepi16_epi32(a) simde_mm256_cvtepi16_epi32(a)
1017
+ #endif
1018
+
1019
+ SIMDE__FUNCTION_ATTRIBUTES
1020
+ simde__m256i
1021
+ simde_mm256_cvtepi16_epi64 (simde__m128i a) {
1022
+ #if defined(SIMDE_AVX2_NATIVE)
1023
+ return _mm256_cvtepi16_epi64(a);
1024
+ #else
1025
+ simde__m256i_private r_;
1026
+ simde__m128i_private a_ = simde__m128i_to_private(a);
1027
+
1028
+ #if defined(SIMDE__CONVERT_VECTOR)
1029
+ SIMDE__CONVERT_VECTOR(r_.i64, a_.m64_private[0].i16);
1030
+ #else
1031
+ SIMDE__VECTORIZE
1032
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
1033
+ r_.i64[i] = a_.i16[i];
1034
+ }
1035
+ #endif
1036
+
1037
+ return simde__m256i_from_private(r_);
1038
+ #endif
1039
+ }
1040
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1041
+ # define _mm256_cvtepi16_epi64(a) simde_mm256_cvtepi16_epi64(a)
1042
+ #endif
1043
+
1044
+ SIMDE__FUNCTION_ATTRIBUTES
1045
+ simde__m256i
1046
+ simde_mm256_cvtepi32_epi64 (simde__m128i a) {
1047
+ #if defined(SIMDE_AVX2_NATIVE)
1048
+ return _mm256_cvtepi32_epi64(a);
1049
+ #else
1050
+ simde__m256i_private r_;
1051
+ simde__m128i_private a_ = simde__m128i_to_private(a);
1052
+
1053
+ #if defined(SIMDE__CONVERT_VECTOR)
1054
+ SIMDE__CONVERT_VECTOR(r_.i64, a_.i32);
1055
+ #else
1056
+ SIMDE__VECTORIZE
1057
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
1058
+ r_.i64[i] = a_.i32[i];
1059
+ }
1060
+ #endif
1061
+
1062
+ return simde__m256i_from_private(r_);
1063
+ #endif
1064
+ }
1065
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1066
+ # define _mm256_cvtepi32_epi64(a) simde_mm256_cvtepi32_epi64(a)
1067
+ #endif
1068
+
1069
+ SIMDE__FUNCTION_ATTRIBUTES
1070
+ simde__m256i
1071
+ simde_mm256_cvtepu8_epi16 (simde__m128i a) {
1072
+ #if defined(SIMDE_AVX2_NATIVE)
1073
+ return _mm256_cvtepu8_epi16(a);
1074
+ #else
1075
+ simde__m256i_private r_;
1076
+ simde__m128i_private a_ = simde__m128i_to_private(a);
1077
+
1078
+ #if defined(SIMDE__CONVERT_VECTOR)
1079
+ SIMDE__CONVERT_VECTOR(r_.i16, a_.u8);
1080
+ #else
1081
+ SIMDE__VECTORIZE
1082
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1083
+ r_.i16[i] = a_.u8[i];
1084
+ }
1085
+ #endif
1086
+
1087
+ return simde__m256i_from_private(r_);
1088
+ #endif
1089
+ }
1090
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1091
+ # define _mm256_cvtepu8_epi16(a) simde_mm256_cvtepu8_epi16(a)
1092
+ #endif
1093
+
1094
+ SIMDE__FUNCTION_ATTRIBUTES
1095
+ simde__m256i
1096
+ simde_mm256_cvtepu8_epi32 (simde__m128i a) {
1097
+ #if defined(SIMDE_AVX2_NATIVE)
1098
+ return _mm256_cvtepu8_epi32(a);
1099
+ #else
1100
+ simde__m256i_private r_;
1101
+ simde__m128i_private a_ = simde__m128i_to_private(a);
1102
+
1103
+ #if defined(SIMDE__CONVERT_VECTOR)
1104
+ SIMDE__CONVERT_VECTOR(r_.i32, a_.m64_private[0].u8);
1105
+ #else
1106
+ SIMDE__VECTORIZE
1107
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1108
+ r_.i32[i] = a_.u8[i];
1109
+ }
1110
+ #endif
1111
+
1112
+ return simde__m256i_from_private(r_);
1113
+ #endif
1114
+ }
1115
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1116
+ # define _mm256_cvtepu8_epi32(a) simde_mm256_cvtepu8_epi32(a)
1117
+ #endif
1118
+
1119
+ SIMDE__FUNCTION_ATTRIBUTES
1120
+ simde__m256i
1121
+ simde_mm256_cvtepu8_epi64 (simde__m128i a) {
1122
+ #if defined(SIMDE_AVX2_NATIVE)
1123
+ return _mm256_cvtepu8_epi64(a);
1124
+ #else
1125
+ simde__m256i_private r_;
1126
+ simde__m128i_private a_ = simde__m128i_to_private(a);
1127
+
1128
+ SIMDE__VECTORIZE
1129
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
1130
+ r_.i64[i] = a_.u8[i];
1131
+ }
1132
+
1133
+ return simde__m256i_from_private(r_);
1134
+ #endif
1135
+ }
1136
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1137
+ # define _mm256_cvtepu8_epi64(a) simde_mm256_cvtepu8_epi64(a)
1138
+ #endif
1139
+
1140
+ SIMDE__FUNCTION_ATTRIBUTES
1141
+ simde__m256i
1142
+ simde_mm256_cvtepu16_epi32 (simde__m128i a) {
1143
+ #if defined(SIMDE_AVX2_NATIVE)
1144
+ return _mm256_cvtepu16_epi32(a);
1145
+ #else
1146
+ simde__m256i_private r_;
1147
+ simde__m128i_private a_ = simde__m128i_to_private(a);
1148
+
1149
+ #if defined(SIMDE__CONVERT_VECTOR)
1150
+ SIMDE__CONVERT_VECTOR(r_.i32, a_.u16);
1151
+ #else
1152
+ SIMDE__VECTORIZE
1153
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1154
+ r_.i32[i] = a_.u16[i];
1155
+ }
1156
+ #endif
1157
+
1158
+ return simde__m256i_from_private(r_);
1159
+ #endif
1160
+ }
1161
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1162
+ # define _mm256_cvtepu16_epi32(a) simde_mm256_cvtepu16_epi32(a)
1163
+ #endif
1164
+
1165
+ SIMDE__FUNCTION_ATTRIBUTES
1166
+ simde__m256i
1167
+ simde_mm256_cvtepu16_epi64 (simde__m128i a) {
1168
+ #if defined(SIMDE_AVX2_NATIVE)
1169
+ return _mm256_cvtepu16_epi64(a);
1170
+ #else
1171
+ simde__m256i_private r_;
1172
+ simde__m128i_private a_ = simde__m128i_to_private(a);
1173
+
1174
+ #if defined(SIMDE__CONVERT_VECTOR)
1175
+ SIMDE__CONVERT_VECTOR(r_.i64, a_.m64_private[0].u16);
1176
+ #else
1177
+ SIMDE__VECTORIZE
1178
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
1179
+ r_.i64[i] = a_.u16[i];
1180
+ }
1181
+ #endif
1182
+
1183
+ return simde__m256i_from_private(r_);
1184
+ #endif
1185
+ }
1186
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1187
+ # define _mm256_cvtepu16_epi64(a) simde_mm256_cvtepu16_epi64(a)
1188
+ #endif
1189
+
1190
+ SIMDE__FUNCTION_ATTRIBUTES
1191
+ simde__m256i
1192
+ simde_mm256_cvtepu32_epi64 (simde__m128i a) {
1193
+ #if defined(SIMDE_AVX2_NATIVE)
1194
+ return _mm256_cvtepu32_epi64(a);
1195
+ #else
1196
+ simde__m256i_private r_;
1197
+ simde__m128i_private a_ = simde__m128i_to_private(a);
1198
+
1199
+ #if defined(SIMDE__CONVERT_VECTOR)
1200
+ SIMDE__CONVERT_VECTOR(r_.i64, a_.u32);
1201
+ #else
1202
+ SIMDE__VECTORIZE
1203
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
1204
+ r_.i64[i] = a_.u32[i];
1205
+ }
1206
+ #endif
1207
+
1208
+ return simde__m256i_from_private(r_);
1209
+ #endif
1210
+ }
1211
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1212
+ # define _mm256_cvtepu32_epi64(a) simde_mm256_cvtepu32_epi64(a)
1213
+ #endif
1214
+
1215
+ SIMDE__FUNCTION_ATTRIBUTES
1216
+ int
1217
+ simde_mm256_extract_epi8 (simde__m256i a, const int index)
1218
+ HEDLEY_REQUIRE_MSG((index & 31) == index, "index must be in range [0, 31]"){
1219
+ simde__m256i_private a_ = simde__m256i_to_private(a);
1220
+ return a_.i8[index];
1221
+ }
1222
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1223
+ # define _mm256_extract_epi8(a, index) simde_mm256_extract_epi8(a, index)
1224
+ #endif
1225
+
1226
+ SIMDE__FUNCTION_ATTRIBUTES
1227
+ int
1228
+ simde_mm256_extract_epi16 (simde__m256i a, const int index)
1229
+ HEDLEY_REQUIRE_MSG((index & 0xf) == index, "index must be in range [0, 15]") {
1230
+ simde__m256i_private a_ = simde__m256i_to_private(a);
1231
+ return a_.i16[index];
1232
+ }
1233
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1234
+ # define _mm256_extract_epi16(a, index) simde_mm256_extract_epi16(a, index)
1235
+ #endif
1236
+
1237
+ SIMDE__FUNCTION_ATTRIBUTES
1238
+ simde__m128i
1239
+ simde_mm256_extracti128_si256 (simde__m256i a, const int imm8)
1240
+ HEDLEY_REQUIRE_MSG((imm8 & 1) == imm8, "imm8 must be 0 or 1") {
1241
+ simde__m256i_private a_ = simde__m256i_to_private(a);
1242
+ return a_.m128i[imm8];
1243
+ }
1244
+ #if defined(SIMDE_AVX2_NATIVE)
1245
+ # define simde_mm256_extracti128_si256(a, imm8) _mm256_extracti128_si256(a, imm8)
1246
+ #endif
1247
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1248
+ # define _mm256_extracti128_si256(a, imm8) simde_mm256_extracti128_si256(a, imm8)
1249
+ #endif
1250
+
1251
+ SIMDE__FUNCTION_ATTRIBUTES
1252
+ simde__m256i
1253
+ simde_mm256_madd_epi16 (simde__m256i a, simde__m256i b) {
1254
+ #if defined(SIMDE_AVX2_NATIVE)
1255
+ return _mm256_madd_epi16(a, b);
1256
+ #else
1257
+ simde__m256i_private
1258
+ r_,
1259
+ a_ = simde__m256i_to_private(a),
1260
+ b_ = simde__m256i_to_private(b);
1261
+
1262
+ r_.m128i[0] = simde_mm_madd_epi16(a_.m128i[0], b_.m128i[0]);
1263
+ r_.m128i[1] = simde_mm_madd_epi16(a_.m128i[1], b_.m128i[1]);
1264
+
1265
+ return simde__m256i_from_private(r_);
1266
+ #endif
1267
+ }
1268
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1269
+ # define _mm256_add_epi16(a, b) simde_mm256_add_epi16(a, b)
1270
+ #endif
1271
+
1272
+ SIMDE__FUNCTION_ATTRIBUTES
1273
+ simde__m256i
1274
+ simde_mm256_max_epi8 (simde__m256i a, simde__m256i b) {
1275
+ #if defined(SIMDE_AVX2_NATIVE) && !defined(__PGI)
1276
+ return _mm256_max_epi8(a, b);
1277
+ #else
1278
+ simde__m256i_private
1279
+ r_,
1280
+ a_ = simde__m256i_to_private(a),
1281
+ b_ = simde__m256i_to_private(b);
1282
+
1283
+ #if defined(SIMDE_ARCH_X86_SSE4_1)
1284
+ r_.m128i[0] = simde_mm_max_epi8(a_.m128i[0], b_.m128i[0]);
1285
+ r_.m128i[1] = simde_mm_max_epi8(a_.m128i[1], b_.m128i[1]);
1286
+ #else
1287
+ SIMDE__VECTORIZE
1288
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1289
+ r_.i8[i] = a_.i8[i] > b_.i8[i] ? a_.i8[i] : b_.i8[i];
1290
+ }
1291
+ #endif
1292
+
1293
+ return simde__m256i_from_private(r_);
1294
+ #endif
1295
+ }
1296
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1297
+ # define _mm256_max_epi8(a, b) simde_mm256_max_epi8(a, b)
1298
+ #endif
1299
+
1300
+ SIMDE__FUNCTION_ATTRIBUTES
1301
+ simde__m256i
1302
+ simde_mm256_max_epu8 (simde__m256i a, simde__m256i b) {
1303
+ #if defined(SIMDE_AVX2_NATIVE)
1304
+ return _mm256_max_epu8(a, b);
1305
+ #else
1306
+ simde__m256i_private
1307
+ r_,
1308
+ a_ = simde__m256i_to_private(a),
1309
+ b_ = simde__m256i_to_private(b);
1310
+
1311
+ #if defined(SIMDE_ARCH_X86_SSE2) || defined(SIMDE_SSE2_NEON)
1312
+ r_.m128i[0] = simde_mm_max_epu8(a_.m128i[0], b_.m128i[0]);
1313
+ r_.m128i[1] = simde_mm_max_epu8(a_.m128i[1], b_.m128i[1]);
1314
+ #else
1315
+ SIMDE__VECTORIZE
1316
+ for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
1317
+ r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
1318
+ }
1319
+ #endif
1320
+
1321
+ return simde__m256i_from_private(r_);
1322
+ #endif
1323
+ }
1324
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1325
+ # define _mm256_max_epu8(a, b) simde_mm256_max_epu8(a, b)
1326
+ #endif
1327
+
1328
+ SIMDE__FUNCTION_ATTRIBUTES
1329
+ simde__m256i
1330
+ simde_mm256_max_epu16 (simde__m256i a, simde__m256i b) {
1331
+ #if defined(SIMDE_AVX2_NATIVE)
1332
+ return _mm256_max_epu16(a, b);
1333
+ #else
1334
+ simde__m256i_private
1335
+ r_,
1336
+ a_ = simde__m256i_to_private(a),
1337
+ b_ = simde__m256i_to_private(b);
1338
+
1339
+ #if defined(SIMDE_ARCH_X86_SSE2) || defined(SIMDE_SSE2_NEON)
1340
+ r_.m128i[0] = simde_mm_max_epu16(a_.m128i[0], b_.m128i[0]);
1341
+ r_.m128i[1] = simde_mm_max_epu16(a_.m128i[1], b_.m128i[1]);
1342
+ #else
1343
+ SIMDE__VECTORIZE
1344
+ for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1345
+ r_.u16[i] = (a_.u16[i] > b_.u16[i]) ? a_.u16[i] : b_.u16[i];
1346
+ }
1347
+ #endif
1348
+
1349
+ return simde__m256i_from_private(r_);
1350
+ #endif
1351
+ }
1352
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1353
+ # define _mm256_max_epu16(a, b) simde_mm256_max_epu16(a, b)
1354
+ #endif
1355
+
1356
+ SIMDE__FUNCTION_ATTRIBUTES
1357
+ simde__m256i
1358
+ simde_mm256_max_epu32 (simde__m256i a, simde__m256i b) {
1359
+ #if defined(SIMDE_AVX2_NATIVE)
1360
+ return _mm256_max_epu32(a, b);
1361
+ #else
1362
+ simde__m256i_private
1363
+ r_,
1364
+ a_ = simde__m256i_to_private(a),
1365
+ b_ = simde__m256i_to_private(b);
1366
+
1367
+ #if defined(SIMDE_ARCH_X86_SSE2) || defined(SIMDE_SSE2_NEON)
1368
+ r_.m128i[0] = simde_mm_max_epu32(a_.m128i[0], b_.m128i[0]);
1369
+ r_.m128i[1] = simde_mm_max_epu32(a_.m128i[1], b_.m128i[1]);
1370
+ #else
1371
+ SIMDE__VECTORIZE
1372
+ for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
1373
+ r_.u32[i] = (a_.u32[i] > b_.u32[i]) ? a_.u32[i] : b_.u32[i];
1374
+ }
1375
+ #endif
1376
+
1377
+ return simde__m256i_from_private(r_);
1378
+ #endif
1379
+ }
1380
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1381
+ # define _mm256_max_epu32(a, b) simde_mm256_max_epu32(a, b)
1382
+ #endif
1383
+
1384
+ SIMDE__FUNCTION_ATTRIBUTES
1385
+ simde__m256i
1386
+ simde_mm256_max_epi16 (simde__m256i a, simde__m256i b) {
1387
+ #if defined(SIMDE_AVX2_NATIVE)
1388
+ return _mm256_max_epi16(a, b);
1389
+ #else
1390
+ simde__m256i_private
1391
+ r_,
1392
+ a_ = simde__m256i_to_private(a),
1393
+ b_ = simde__m256i_to_private(b);
1394
+
1395
+ #if defined(SIMDE_ARCH_X86_SSE2)
1396
+ r_.m128i[0] = simde_mm_max_epi16(a_.m128i[0], b_.m128i[0]);
1397
+ r_.m128i[1] = simde_mm_max_epi16(a_.m128i[1], b_.m128i[1]);
1398
+ #else
1399
+ SIMDE__VECTORIZE
1400
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1401
+ r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
1402
+ }
1403
+ #endif
1404
+
1405
+ return simde__m256i_from_private(r_);
1406
+ #endif
1407
+ }
1408
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1409
+ # define _mm256_max_epi16(a, b) simde_mm256_max_epi16(a, b)
1410
+ #endif
1411
+
1412
+ SIMDE__FUNCTION_ATTRIBUTES
1413
+ simde__m256i
1414
+ simde_mm256_max_epi32 (simde__m256i a, simde__m256i b) {
1415
+ #if defined(SIMDE_AVX2_NATIVE)
1416
+ return _mm256_max_epi32(a, b);
1417
+ #else
1418
+ simde__m256i_private
1419
+ r_,
1420
+ a_ = simde__m256i_to_private(a),
1421
+ b_ = simde__m256i_to_private(b);
1422
+
1423
+ #if defined(SIMDE_ARCH_X86_SSE4_1)
1424
+ r_.m128i[0] = simde_mm_max_epi32(a_.m128i[0], b_.m128i[0]);
1425
+ r_.m128i[1] = simde_mm_max_epi32(a_.m128i[1], b_.m128i[1]);
1426
+ #else
1427
+ SIMDE__VECTORIZE
1428
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1429
+ r_.i32[i] = a_.i32[i] > b_.i32[i] ? a_.i32[i] : b_.i32[i];
1430
+ }
1431
+ #endif
1432
+
1433
+ return simde__m256i_from_private(r_);
1434
+ #endif
1435
+ }
1436
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1437
+ # define _mm256_max_epi32(a, b) simde_mm256_max_epi32(a, b)
1438
+ #endif
1439
+
1440
+ SIMDE__FUNCTION_ATTRIBUTES
1441
+ simde__m256i
1442
+ simde_mm256_min_epi8 (simde__m256i a, simde__m256i b) {
1443
+ #if defined(SIMDE_AVX2_NATIVE) && !defined(__PGI)
1444
+ return _mm256_min_epi8(a, b);
1445
+ #else
1446
+ simde__m256i_private
1447
+ r_,
1448
+ a_ = simde__m256i_to_private(a),
1449
+ b_ = simde__m256i_to_private(b);
1450
+
1451
+ #if defined(SIMDE_ARCH_X86_SSE4_1)
1452
+ r_.m128i[0] = simde_mm_min_epi8(a_.m128i[0], b_.m128i[0]);
1453
+ r_.m128i[1] = simde_mm_min_epi8(a_.m128i[1], b_.m128i[1]);
1454
+ #else
1455
+ SIMDE__VECTORIZE
1456
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1457
+ r_.i8[i] = a_.i8[i] < b_.i8[i] ? a_.i8[i] : b_.i8[i];
1458
+ }
1459
+ #endif
1460
+
1461
+ return simde__m256i_from_private(r_);
1462
+ #endif
1463
+ }
1464
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1465
+ # define _mm256_min_epi8(a, b) simde_mm256_min_epi8(a, b)
1466
+ #endif
1467
+
1468
+ SIMDE__FUNCTION_ATTRIBUTES
1469
+ simde__m256i
1470
+ simde_mm256_min_epi16 (simde__m256i a, simde__m256i b) {
1471
+ #if defined(SIMDE_AVX2_NATIVE)
1472
+ return _mm256_min_epi16(a, b);
1473
+ #else
1474
+ simde__m256i_private
1475
+ r_,
1476
+ a_ = simde__m256i_to_private(a),
1477
+ b_ = simde__m256i_to_private(b);
1478
+
1479
+ #if defined(SIMDE_ARCH_X86_SSE2)
1480
+ r_.m128i[0] = simde_mm_min_epi16(a_.m128i[0], b_.m128i[0]);
1481
+ r_.m128i[1] = simde_mm_min_epi16(a_.m128i[1], b_.m128i[1]);
1482
+ #else
1483
+ SIMDE__VECTORIZE
1484
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1485
+ r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
1486
+ }
1487
+ #endif
1488
+
1489
+ return simde__m256i_from_private(r_);
1490
+ #endif
1491
+ }
1492
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1493
+ # define _mm256_min_epi16(a, b) simde_mm256_min_epi16(a, b)
1494
+ #endif
1495
+
1496
+ SIMDE__FUNCTION_ATTRIBUTES
1497
+ simde__m256i
1498
+ simde_mm256_min_epi32 (simde__m256i a, simde__m256i b) {
1499
+ #if defined(SIMDE_AVX2_NATIVE)
1500
+ return _mm256_min_epi32(a, b);
1501
+ #else
1502
+ simde__m256i_private
1503
+ r_,
1504
+ a_ = simde__m256i_to_private(a),
1505
+ b_ = simde__m256i_to_private(b);
1506
+
1507
+ #if defined(SIMDE_ARCH_X86_SSE4_1)
1508
+ r_.m128i[0] = simde_mm_min_epi32(a_.m128i[0], b_.m128i[0]);
1509
+ r_.m128i[1] = simde_mm_min_epi32(a_.m128i[1], b_.m128i[1]);
1510
+ #else
1511
+ SIMDE__VECTORIZE
1512
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1513
+ r_.i32[i] = a_.i32[i] < b_.i32[i] ? a_.i32[i] : b_.i32[i];
1514
+ }
1515
+ #endif
1516
+
1517
+ return simde__m256i_from_private(r_);
1518
+ #endif
1519
+ }
1520
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1521
+ # define _mm256_min_epi32(a, b) simde_mm256_min_epi32(a, b)
1522
+ #endif
1523
+
1524
+ SIMDE__FUNCTION_ATTRIBUTES
1525
+ simde__m256i
1526
+ simde_mm256_min_epu8 (simde__m256i a, simde__m256i b) {
1527
+ #if defined(SIMDE_AVX2_NATIVE)
1528
+ return _mm256_min_epu8(a, b);
1529
+ #else
1530
+ simde__m256i_private
1531
+ r_,
1532
+ a_ = simde__m256i_to_private(a),
1533
+ b_ = simde__m256i_to_private(b);
1534
+
1535
+ #if defined(SIMDE_ARCH_X86_SSE2)
1536
+ r_.m128i[0] = simde_mm_min_epu8(a_.m128i[0], b_.m128i[0]);
1537
+ r_.m128i[1] = simde_mm_min_epu8(a_.m128i[1], b_.m128i[1]);
1538
+ #else
1539
+ SIMDE__VECTORIZE
1540
+ for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
1541
+ r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
1542
+ }
1543
+ #endif
1544
+
1545
+ return simde__m256i_from_private(r_);
1546
+ #endif
1547
+ }
1548
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1549
+ # define _mm256_min_epu8(a, b) simde_mm256_min_epu8(a, b)
1550
+ #endif
1551
+
1552
+ SIMDE__FUNCTION_ATTRIBUTES
1553
+ simde__m256i
1554
+ simde_mm256_min_epu16 (simde__m256i a, simde__m256i b) {
1555
+ #if defined(SIMDE_AVX2_NATIVE)
1556
+ return _mm256_min_epu16(a, b);
1557
+ #else
1558
+ simde__m256i_private
1559
+ r_,
1560
+ a_ = simde__m256i_to_private(a),
1561
+ b_ = simde__m256i_to_private(b);
1562
+
1563
+ #if defined(SIMDE_ARCH_X86_SSE2)
1564
+ r_.m128i[0] = simde_mm_min_epu16(a_.m128i[0], b_.m128i[0]);
1565
+ r_.m128i[1] = simde_mm_min_epu16(a_.m128i[1], b_.m128i[1]);
1566
+ #else
1567
+ SIMDE__VECTORIZE
1568
+ for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1569
+ r_.u16[i] = (a_.u16[i] < b_.u16[i]) ? a_.u16[i] : b_.u16[i];
1570
+ }
1571
+ #endif
1572
+
1573
+ return simde__m256i_from_private(r_);
1574
+ #endif
1575
+ }
1576
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1577
+ # define _mm256_min_epu16(a, b) simde_mm256_min_epu16(a, b)
1578
+ #endif
1579
+
1580
+ SIMDE__FUNCTION_ATTRIBUTES
1581
+ simde__m256i
1582
+ simde_mm256_min_epu32 (simde__m256i a, simde__m256i b) {
1583
+ #if defined(SIMDE_AVX2_NATIVE)
1584
+ return _mm256_min_epu32(a, b);
1585
+ #else
1586
+ simde__m256i_private
1587
+ r_,
1588
+ a_ = simde__m256i_to_private(a),
1589
+ b_ = simde__m256i_to_private(b);
1590
+
1591
+ #if defined(SIMDE_ARCH_X86_SSE2)
1592
+ r_.m128i[0] = simde_mm_min_epu32(a_.m128i[0], b_.m128i[0]);
1593
+ r_.m128i[1] = simde_mm_min_epu32(a_.m128i[1], b_.m128i[1]);
1594
+ #else
1595
+ SIMDE__VECTORIZE
1596
+ for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
1597
+ r_.u32[i] = (a_.u32[i] < b_.u32[i]) ? a_.u32[i] : b_.u32[i];
1598
+ }
1599
+ #endif
1600
+
1601
+ return simde__m256i_from_private(r_);
1602
+ #endif
1603
+ }
1604
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1605
+ # define _mm256_min_epu32(a, b) simde_mm256_min_epu32(a, b)
1606
+ #endif
1607
+
1608
+ SIMDE__FUNCTION_ATTRIBUTES
1609
+ int32_t
1610
+ simde_mm256_movemask_epi8 (simde__m256i a) {
1611
+ #if defined(SIMDE_AVX2_NATIVE)
1612
+ return _mm256_movemask_epi8(a);
1613
+ #else
1614
+ simde__m256i_private a_ = simde__m256i_to_private(a);
1615
+ int32_t r;
1616
+
1617
+ #if defined(SIMDE_ARCH_X86_SSE2)
1618
+ r = simde_mm_movemask_epi8(a_.m128i[1]);
1619
+ r = (r << 16) | simde_mm_movemask_epi8(a_.m128i[0]);
1620
+ #else
1621
+ r = 0;
1622
+ SIMDE__VECTORIZE_REDUCTION(|:r)
1623
+ for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) {
1624
+ r |= (a_.u8[31 - i] >> 7) << (31 - i);
1625
+ }
1626
+ #endif
1627
+
1628
+ return r;
1629
+ #endif
1630
+ }
1631
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1632
+ # define _mm256_movemask_epi8(a) simde_mm256_movemask_epi8(a)
1633
+ #endif
1634
+
1635
+ SIMDE__FUNCTION_ATTRIBUTES
1636
+ simde__m256i
1637
+ simde_mm256_or_si256 (simde__m256i a, simde__m256i b) {
1638
+ #if defined(SIMDE_AVX2_NATIVE)
1639
+ return _mm256_or_si256(a, b);
1640
+ #else
1641
+ simde__m256i_private
1642
+ r_,
1643
+ a_ = simde__m256i_to_private(a),
1644
+ b_ = simde__m256i_to_private(b);
1645
+
1646
+ #if defined(SIMDE_ARCH_X86_SSE2)
1647
+ r_.m128i[0] = simde_mm_or_si128(a_.m128i[0], b_.m128i[0]);
1648
+ r_.m128i[1] = simde_mm_or_si128(a_.m128i[1], b_.m128i[1]);
1649
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1650
+ r_.i32f = a_.i32f | b_.i32f;
1651
+ #else
1652
+ SIMDE__VECTORIZE
1653
+ for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
1654
+ r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
1655
+ }
1656
+ #endif
1657
+
1658
+ return simde__m256i_from_private(r_);
1659
+ #endif
1660
+ }
1661
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1662
+ # define _mm256_or_si256(a, b) simde_mm256_or_si256(a, b)
1663
+ #endif
1664
+
1665
+ SIMDE__FUNCTION_ATTRIBUTES
1666
+ simde__m256i
1667
+ simde_mm256_packs_epi32 (simde__m256i a, simde__m256i b) {
1668
+ #if defined(SIMDE_AVX2_NATIVE)
1669
+ return _mm256_packs_epi32(a, b);
1670
+ #else
1671
+ simde__m256i_private
1672
+ r_,
1673
+ v_[] = {
1674
+ simde__m256i_to_private(a),
1675
+ simde__m256i_to_private(b)
1676
+ };
1677
+ #if defined(SIMDE_ARCH_X86_SSE2) || defined(SIMDE_SSE2_NEON)
1678
+ r_.m128i_private[0] = simde__m128i_to_private(simde_mm_packs_epi32(simde__m128i_from_private(v_[0].m128i_private[0]), simde__m128i_from_private(v_[1].m128i_private[0])));
1679
+ r_.m128i_private[1] = simde__m128i_to_private(simde_mm_packs_epi32(simde__m128i_from_private(v_[0].m128i_private[1]), simde__m128i_from_private(v_[1].m128i_private[1])));
1680
+ #else
1681
+ SIMDE__VECTORIZE
1682
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1683
+ const int32_t v = v_[(i >> 2) & 1].i32[(i & 11) - ((i & 8) >> 1)];
1684
+ r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, (v > INT16_MAX) ? INT16_MAX : ((v < INT16_MIN) ? INT16_MIN : v));
1685
+ }
1686
+ #endif
1687
+
1688
+ return simde__m256i_from_private(r_);
1689
+ #endif
1690
+ }
1691
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1692
+ # define _mm256_packs_epi32(a, b) simde_mm256_packs_epi32(a, b)
1693
+ #endif
1694
+
1695
+ SIMDE__FUNCTION_ATTRIBUTES
1696
+ simde__m256i
1697
+ simde_mm256_permute2x128_si256 (simde__m256i a, simde__m256i b, const int imm8)
1698
+ HEDLEY_REQUIRE_MSG((imm8 & 0xff) == imm8, "imm8 must be in range [0, 255]") {
1699
+ simde__m256i_private
1700
+ r_,
1701
+ a_ = simde__m256i_to_private(a),
1702
+ b_ = simde__m256i_to_private(b);
1703
+
1704
+ r_.m128i_private[0] = (imm8 & 0x08) ? simde__m128i_to_private(simde_mm_setzero_si128()) : ((imm8 & 0x02) ? b_.m128i_private[(imm8 ) & 1] : a_.m128i_private[(imm8 ) & 1]);
1705
+ r_.m128i_private[1] = (imm8 & 0x80) ? simde__m128i_to_private(simde_mm_setzero_si128()) : ((imm8 & 0x20) ? b_.m128i_private[(imm8 >> 4) & 1] : a_.m128i_private[(imm8 >> 4) & 1]);
1706
+
1707
+ return simde__m256i_from_private(r_);
1708
+ }
1709
+ #if defined(SIMDE_AVX2_NATIVE)
1710
+ # define simde_mm256_permute2x128_si256(a, b, imm8) _mm256_permute2x128_si256(a, b, imm8)
1711
+ #endif
1712
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1713
+ # define _mm256_permute2x128_si256(a, b, imm8) simde_mm256_permute2x128_si256(a, b, imm8)
1714
+ #endif
1715
+
1716
+ SIMDE__FUNCTION_ATTRIBUTES
1717
+ simde__m256i
1718
+ simde_mm256_permute4x64_epi64 (simde__m256i a, const int imm8)
1719
+ HEDLEY_REQUIRE_MSG((imm8 & 0xff) == imm8, "imm8 must be in range [0, 255]") {
1720
+ simde__m256i_private
1721
+ r_,
1722
+ a_ = simde__m256i_to_private(a);
1723
+
1724
+ r_.i64[0] = (imm8 & 0x02) ? a_.i64[((imm8 ) & 1)+2] : a_.i64[(imm8 ) & 1];
1725
+ r_.i64[1] = (imm8 & 0x08) ? a_.i64[((imm8 >> 2 ) & 1)+2] : a_.i64[(imm8 >> 2 ) & 1];
1726
+ r_.i64[2] = (imm8 & 0x20) ? a_.i64[((imm8 >> 4 ) & 1)+2] : a_.i64[(imm8 >> 4 ) & 1];
1727
+ r_.i64[3] = (imm8 & 0x80) ? a_.i64[((imm8 >> 6 ) & 1)+2] : a_.i64[(imm8 >> 6 ) & 1];
1728
+
1729
+ return simde__m256i_from_private(r_);
1730
+ }
1731
+ #if defined(SIMDE_AVX2_NATIVE)
1732
+ # define simde_mm256_permute4x64_epi64(a, imm8) _mm256_permute4x64_epi64(a, imm8)
1733
+ #endif
1734
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1735
+ # define _mm256_permute4x64_epi64(a, imm8) simde_mm256_permute4x64_epi64(a, imm8)
1736
+ #endif
1737
+
1738
+ SIMDE__FUNCTION_ATTRIBUTES
1739
+ simde__m256i
1740
+ simde_mm256_shuffle_epi8 (simde__m256i a, simde__m256i b) {
1741
+ #if defined(SIMDE_AVX2_NATIVE)
1742
+ return _mm256_shuffle_epi8(a, b);
1743
+ #else
1744
+ simde__m256i_private
1745
+ r_,
1746
+ a_ = simde__m256i_to_private(a),
1747
+ b_ = simde__m256i_to_private(b);
1748
+
1749
+ #if defined(SIMDE_ARCH_X86_SSSE3)
1750
+ r_.m128i[0] = simde_mm_shuffle_epi8(a_.m128i[0], b_.m128i[0]);
1751
+ r_.m128i[1] = simde_mm_shuffle_epi8(a_.m128i[1], b_.m128i[1]);
1752
+ #else
1753
+ SIMDE__VECTORIZE
1754
+ for (size_t i = 0 ; i < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 2) ; i++) {
1755
+ r_.u8[ i ] = (b_.u8[ i ] & 0x80) ? 0 : a_.u8[(b_.u8[ i ] & 0x0f) ];
1756
+ r_.u8[i + 16] = (b_.u8[i + 16] & 0x80) ? 0 : a_.u8[(b_.u8[i + 16] & 0x0f) + 16];
1757
+ }
1758
+ #endif
1759
+
1760
+ return simde__m256i_from_private(r_);
1761
+ #endif
1762
+ }
1763
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1764
+ # define _mm256_shuffle_epi8(a, b) simde_mm256_shuffle_epi8(a, b)
1765
+ #endif
1766
+
1767
+ SIMDE__FUNCTION_ATTRIBUTES
1768
+ simde__m256i
1769
+ simde_mm256_shuffle_epi32 (simde__m256i a, const int imm8) {
1770
+ simde__m256i_private
1771
+ r_,
1772
+ a_ = simde__m256i_to_private(a);
1773
+
1774
+ for (size_t i = 0 ; i < ((sizeof(r_.i32) / sizeof(r_.i32[0])) / 2) ; i++) {
1775
+ r_.i32[i] = a_.i32[(imm8 >> (i * 2)) & 3];
1776
+ }
1777
+ for (size_t i = 0 ; i < ((sizeof(r_.i32) / sizeof(r_.i32[0])) / 2) ; i++) {
1778
+ r_.i32[i + 4] = a_.i32[((imm8 >> (i * 2)) & 3) + 4];
1779
+ }
1780
+
1781
+ return simde__m256i_from_private(r_);
1782
+ }
1783
+ #if defined(SIMDE_AVX2_NATIVE)
1784
+ # define simde_mm256_shuffle_epi32(a, imm8) _mm256_shuffle_epi32(a, imm8)
1785
+ #elif defined(SIMDE_ARCH_X86_SSE2) && !defined(__PGI)
1786
+ # define simde_mm256_shuffle_epi32(a, imm8) \
1787
+ simde_mm256_set_m128i( \
1788
+ simde_mm_shuffle_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
1789
+ simde_mm_shuffle_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
1790
+ #elif defined(SIMDE__SHUFFLE_VECTOR)
1791
+ # define simde_mm256_shuffle_epi32(a, imm8) (__extension__ ({ \
1792
+ const simde__m256i_private simde__tmp_a_ = simde__m256i_to_private(a); \
1793
+ simde__m256i_from_private((simde__m256i_private) { .i32 = \
1794
+ SIMDE__SHUFFLE_VECTOR(32, 32, \
1795
+ (simde__tmp_a_).i32, \
1796
+ (simde__tmp_a_).i32, \
1797
+ ((imm8) ) & 3, \
1798
+ ((imm8) >> 2) & 3, \
1799
+ ((imm8) >> 4) & 3, \
1800
+ ((imm8) >> 6) & 3, \
1801
+ (((imm8) ) & 3) + 4, \
1802
+ (((imm8) >> 2) & 3) + 4, \
1803
+ (((imm8) >> 4) & 3) + 4, \
1804
+ (((imm8) >> 6) & 3) + 4) }); }))
1805
+ #endif
1806
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1807
+ # define _mm256_shuffle_epi32(a, imm8) simde_mm256_shuffle_epi32(a, imm8)
1808
+ #endif
1809
+
1810
/* simde_mm256_shufflelo_epi16(a, imm8): within each 128-bit half, reorder
 * the four low 16-bit lanes according to the 2-bit selectors in imm8 and
 * leave the four high lanes in place.  Implemented purely as a macro:
 *   - native AVX2 when available;
 *   - a single vector shuffle when SIMDE__SHUFFLE_VECTOR exists and no SSE2
 *     is available;
 *   - otherwise the 128-bit shufflelo applied to each half. */
#if defined(SIMDE_AVX2_NATIVE)
#  define simde_mm256_shufflelo_epi16(a, imm8) _mm256_shufflelo_epi16(a, imm8)
#elif defined(SIMDE__SHUFFLE_VECTOR) && !defined(SIMDE_ARCH_X86_SSE2)
#  define simde_mm256_shufflelo_epi16(a, imm8) (__extension__ ({ \
     const simde__m256i_private simde__shuf_src_ = simde__m256i_to_private(a); \
     simde__m256i_from_private((simde__m256i_private) { .i16 = \
       SIMDE__SHUFFLE_VECTOR(16, 32, \
         (simde__shuf_src_).i16, \
         (simde__shuf_src_).i16, \
         (((imm8)     ) & 3), \
         (((imm8) >> 2) & 3), \
         (((imm8) >> 4) & 3), \
         (((imm8) >> 6) & 3), \
         4, 5, 6, 7, \
         ((((imm8)     ) & 3) + 8), \
         ((((imm8) >> 2) & 3) + 8), \
         ((((imm8) >> 4) & 3) + 8), \
         ((((imm8) >> 6) & 3) + 8), \
         12, 13, 14, 15) }); }))
#else
#  define simde_mm256_shufflelo_epi16(a, imm8) \
     simde_mm256_set_m128i( \
       simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
       simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
#endif
#if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
#  define _mm256_shufflelo_epi16(a, imm8) simde_mm256_shufflelo_epi16(a, imm8)
#endif
+
1844
+ SIMDE__FUNCTION_ATTRIBUTES
1845
+ simde__m256i
1846
+ simde_mm256_slli_epi16 (simde__m256i a, const int imm8)
1847
+ HEDLEY_REQUIRE_MSG((imm8 & 15) == imm8, "imm8 must be in range [0, 15]") {
1848
+ /* Note: There is no consistency in how compilers handle values outside of
1849
+ the expected range, hence the discrepancy between what we allow and what
1850
+ Intel specifies. Some compilers will return 0, others seem to just mask
1851
+ off everything outside of the range. */
1852
+ simde__m256i_private
1853
+ r_,
1854
+ a_ = simde__m256i_to_private(a);
1855
+
1856
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1857
+ r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, imm8);
1858
+ #else
1859
+ SIMDE__VECTORIZE
1860
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1861
+ r_.i16[i] = a_.i16[i] << (imm8 & 0xff);
1862
+ }
1863
+ #endif
1864
+
1865
+ return simde__m256i_from_private(r_);
1866
+ }
1867
+ #if defined(SIMDE_AVX2_NATIVE)
1868
+ # define simde_mm256_slli_epi16(a, imm8) _mm256_slli_epi16(a, imm8)
1869
+ #elif defined(SIMDE_ARCH_X86_SSE2)
1870
+ # define simde_mm256_slli_epi16(a, imm8) \
1871
+ simde_mm256_set_m128i( \
1872
+ simde_mm_slli_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
1873
+ simde_mm_slli_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
1874
+ #endif
1875
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1876
+ # define _mm256_slli_epi16(a, imm8) simde_mm256_slli_epi16(a, imm8)
1877
+ #endif
1878
+
1879
+ SIMDE__FUNCTION_ATTRIBUTES
1880
+ simde__m256i
1881
+ simde_mm256_slli_epi32 (simde__m256i a, const int imm8)
1882
+ HEDLEY_REQUIRE_MSG((imm8 & 31) == imm8, "imm8 must be in range [0, 31]") {
1883
+ simde__m256i_private
1884
+ r_,
1885
+ a_ = simde__m256i_to_private(a);
1886
+
1887
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1888
+ r_.i32 = a_.i32 << HEDLEY_STATIC_CAST(int32_t, imm8);
1889
+ #else
1890
+ SIMDE__VECTORIZE
1891
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1892
+ r_.i32[i] = a_.i32[i] << (imm8 & 0xff);
1893
+ }
1894
+ #endif
1895
+
1896
+ return simde__m256i_from_private(r_);
1897
+ }
1898
+ #if defined(SIMDE_AVX2_NATIVE)
1899
+ # define simde_mm256_slli_epi32(a, imm8) _mm256_slli_epi32(a, imm8)
1900
+ #elif defined(SIMDE_ARCH_X86_SSE2)
1901
+ # define simde_mm256_slli_epi32(a, imm8) \
1902
+ simde_mm256_set_m128i( \
1903
+ simde_mm_slli_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
1904
+ simde_mm_slli_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
1905
+ #endif
1906
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1907
+ # define _mm256_slli_epi32(a, imm8) simde_mm256_slli_epi32(a, imm8)
1908
+ #endif
1909
+
1910
+ SIMDE__FUNCTION_ATTRIBUTES
1911
+ simde__m256i
1912
+ simde_mm256_slli_epi64 (simde__m256i a, const int imm8)
1913
+ HEDLEY_REQUIRE_MSG((imm8 & 15) == imm8, "imm8 must be in range [0, 63]") {
1914
+ simde__m256i_private
1915
+ r_,
1916
+ a_ = simde__m256i_to_private(a);
1917
+
1918
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1919
+ r_.i64 = a_.i64 << HEDLEY_STATIC_CAST(int64_t, imm8);
1920
+ #else
1921
+ SIMDE__VECTORIZE
1922
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
1923
+ r_.i64[i] = a_.i64[i] << (imm8 & 0xff);
1924
+ }
1925
+ #endif
1926
+
1927
+ return simde__m256i_from_private(r_);
1928
+ }
1929
+ #if defined(SIMDE_AVX2_NATIVE)
1930
+ # define simde_mm256_slli_epi64(a, imm8) _mm256_slli_epi64(a, imm8)
1931
+ #elif defined(SIMDE_ARCH_X86_SSE2)
1932
+ # define simde_mm256_slli_epi64(a, imm8) \
1933
+ simde_mm256_set_m128i( \
1934
+ simde_mm_slli_epi64(simde_mm256_extracti128_si256(a, 1), (imm8)), \
1935
+ simde_mm_slli_epi64(simde_mm256_extracti128_si256(a, 0), (imm8)))
1936
+ #endif
1937
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1938
+ # define _mm256_slli_epi64(a, imm8) simde_mm256_slli_epi64(a, imm8)
1939
+ #endif
1940
+
1941
+ SIMDE__FUNCTION_ATTRIBUTES
1942
+ simde__m256i
1943
+ simde_mm256_sub_epi8 (simde__m256i a, simde__m256i b) {
1944
+ #if defined(SIMDE_AVX2_NATIVE)
1945
+ return _mm256_sub_epi8(a, b);
1946
+ #else
1947
+ simde__m256i_private
1948
+ r_,
1949
+ a_ = simde__m256i_to_private(a),
1950
+ b_ = simde__m256i_to_private(b);
1951
+
1952
+ #if defined(SIMDE_ARCH_X86_SSE2)
1953
+ r_.m128i[0] = simde_mm_sub_epi8(a_.m128i[0], b_.m128i[0]);
1954
+ r_.m128i[1] = simde_mm_sub_epi8(a_.m128i[1], b_.m128i[1]);
1955
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1956
+ r_.i8 = a_.i8 - b_.i8;
1957
+ #else
1958
+ SIMDE__VECTORIZE
1959
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1960
+ r_.i8[i] = a_.i8[i] - b_.i8[i];
1961
+ }
1962
+ #endif
1963
+
1964
+ return simde__m256i_from_private(r_);
1965
+ #endif
1966
+ }
1967
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1968
+ # define _mm256_sub_epi8(a, b) simde_mm256_sub_epi8(a, b)
1969
+ #endif
1970
+
1971
+ SIMDE__FUNCTION_ATTRIBUTES
1972
+ simde__m256i
1973
+ simde_mm256_sub_epi16 (simde__m256i a, simde__m256i b) {
1974
+ #if defined(SIMDE_AVX2_NATIVE)
1975
+ return _mm256_sub_epi16(a, b);
1976
+ #else
1977
+ simde__m256i_private
1978
+ r_,
1979
+ a_ = simde__m256i_to_private(a),
1980
+ b_ = simde__m256i_to_private(b);
1981
+
1982
+ #if defined(SIMDE_ARCH_X86_SSE2)
1983
+ r_.m128i[0] = simde_mm_sub_epi16(a_.m128i[0], b_.m128i[0]);
1984
+ r_.m128i[1] = simde_mm_sub_epi16(a_.m128i[1], b_.m128i[1]);
1985
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1986
+ r_.i16 = a_.i16 - b_.i16;
1987
+ #else
1988
+ SIMDE__VECTORIZE
1989
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1990
+ r_.i16[i] = a_.i16[i] - b_.i16[i];
1991
+ }
1992
+ #endif
1993
+
1994
+ return simde__m256i_from_private(r_);
1995
+ #endif
1996
+ }
1997
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
1998
+ # define _mm256_sub_epi16(a, b) simde_mm256_sub_epi16(a, b)
1999
+ #endif
2000
+
2001
+ SIMDE__FUNCTION_ATTRIBUTES
2002
+ simde__m256i
2003
+ simde_mm256_sub_epi32 (simde__m256i a, simde__m256i b) {
2004
+ #if defined(SIMDE_AVX2_NATIVE)
2005
+ return _mm256_sub_epi32(a, b);
2006
+ #else
2007
+ simde__m256i_private
2008
+ r_,
2009
+ a_ = simde__m256i_to_private(a),
2010
+ b_ = simde__m256i_to_private(b);
2011
+
2012
+ #if defined(SIMDE_ARCH_X86_SSE2)
2013
+ r_.m128i[0] = simde_mm_sub_epi32(a_.m128i[0], b_.m128i[0]);
2014
+ r_.m128i[1] = simde_mm_sub_epi32(a_.m128i[1], b_.m128i[1]);
2015
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2016
+ r_.i32 = a_.i32 - b_.i32;
2017
+ #else
2018
+ SIMDE__VECTORIZE
2019
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
2020
+ r_.i32[i] = a_.i32[i] - b_.i32[i];
2021
+ }
2022
+ #endif
2023
+
2024
+ return simde__m256i_from_private(r_);
2025
+ #endif
2026
+ }
2027
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2028
+ # define _mm256_sub_epi32(a, b) simde_mm256_sub_epi32(a, b)
2029
+ #endif
2030
+
2031
+ SIMDE__FUNCTION_ATTRIBUTES
2032
+ simde__m256i
2033
+ simde_mm256_sub_epi64 (simde__m256i a, simde__m256i b) {
2034
+ #if defined(SIMDE_AVX2_NATIVE)
2035
+ return _mm256_sub_epi64(a, b);
2036
+ #else
2037
+ simde__m256i_private
2038
+ r_,
2039
+ a_ = simde__m256i_to_private(a),
2040
+ b_ = simde__m256i_to_private(b);
2041
+
2042
+ #if defined(SIMDE_ARCH_X86_SSE2)
2043
+ r_.m128i[0] = simde_mm_sub_epi64(a_.m128i[0], b_.m128i[0]);
2044
+ r_.m128i[1] = simde_mm_sub_epi64(a_.m128i[1], b_.m128i[1]);
2045
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2046
+ r_.i64 = a_.i64 - b_.i64;
2047
+ #else
2048
+ SIMDE__VECTORIZE
2049
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
2050
+ r_.i64[i] = a_.i64[i] - b_.i64[i];
2051
+ }
2052
+ #endif
2053
+
2054
+ return simde__m256i_from_private(r_);
2055
+ #endif
2056
+ }
2057
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2058
+ # define _mm256_sub_epi64(a, b) simde_mm256_sub_epi64(a, b)
2059
+ #endif
2060
+
2061
+ SIMDE__FUNCTION_ATTRIBUTES
2062
+ simde__m256i
2063
+ simde_mm256_srli_epi64 (simde__m256i a, const int imm8) {
2064
+ simde__m256i_private
2065
+ r_,
2066
+ a_ = simde__m256i_to_private(a);
2067
+
2068
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
2069
+ r_.u64 = a_.u64 >> HEDLEY_STATIC_CAST(int32_t, imm8);
2070
+ #else
2071
+ SIMDE__VECTORIZE
2072
+ for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
2073
+ r_.u64[i] = a_.u64[i] >> imm8;
2074
+ }
2075
+ #endif
2076
+
2077
+ return simde__m256i_from_private(r_);
2078
+ }
2079
+ #if defined(SIMDE_AVX2_NATIVE)
2080
+ # define simde_mm256_srli_epi64(a, imm8) _mm256_srli_epi64(a, imm8)
2081
+ #elif defined(SIMDE_ARCH_X86_SSE2)
2082
+ # define simde_mm256_srli_epi64(a, imm8) \
2083
+ simde_mm256_set_m128i( \
2084
+ simde_mm_srli_epi64(simde_mm256_extracti128_si256(a, 1), (imm8)), \
2085
+ simde_mm_srli_epi64(simde_mm256_extracti128_si256(a, 0), (imm8)))
2086
+ #endif
2087
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2088
+ # define _mm256_srli_epi64(a, imm8) simde_mm256_srli_epi64(a, imm8)
2089
+ #endif
2090
+
2091
+ SIMDE__FUNCTION_ATTRIBUTES
2092
+ simde__m256i
2093
+ simde_mm256_srli_si256 (simde__m256i a, const int imm8) {
2094
+ simde__m256i_private
2095
+ r_,
2096
+ a_ = simde__m256i_to_private(a);
2097
+
2098
+ for (size_t h = 0 ; h < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; h++) {
2099
+ SIMDE__VECTORIZE
2100
+ for (size_t i = 0 ; i < (sizeof(r_.m128i_private[h].i8) / sizeof(r_.m128i_private[h].i8[0])) ; i++) {
2101
+ const int e = imm8 + HEDLEY_STATIC_CAST(int, i);
2102
+ r_.m128i_private[h].i8[i] = (e < 16) ? a_.m128i_private[h].i8[e] : 0;
2103
+ }
2104
+ }
2105
+
2106
+ return simde__m256i_from_private(r_);
2107
+ }
2108
+ #if defined(SIMDE_AVX2_NATIVE)
2109
+ # define simde_mm256_srli_si256(a, imm8) _mm256_srli_si256(a, imm8)
2110
+ #elif defined(SIMDE_ARCH_X86_SSE2) && !defined(__PGI)
2111
+ # define simde_mm256_srli_si256(a, imm8) \
2112
+ simde_mm256_set_m128i( \
2113
+ simde_mm_srli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \
2114
+ simde_mm_srli_si128(simde_mm256_extracti128_si256(a, 0), (imm8)))
2115
+ #elif defined(SIMDE_SSE2_NEON)
2116
+ # define simde_mm256_srli_si256(a, imm8) \
2117
+ simde_mm256_set_m128i( \
2118
+ simde_mm_bsrli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \
2119
+ simde_mm_bsrli_si128(simde_mm256_extracti128_si256(a, 0), (imm8)))
2120
+ #endif
2121
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2122
+ # define _mm256_srli_si256(a, imm8) simde_mm_srli_si256(a, imm8)
2123
+ #endif
2124
+
2125
+ SIMDE__FUNCTION_ATTRIBUTES
2126
+ simde__m256i
2127
+ simde_mm256_unpacklo_epi8 (simde__m256i a, simde__m256i b) {
2128
+ #if defined(SIMDE_AVX2_NATIVE)
2129
+ return _mm256_unpacklo_epi8(a, b);
2130
+ #else
2131
+ simde__m256i_private
2132
+ r_,
2133
+ a_ = simde__m256i_to_private(a),
2134
+ b_ = simde__m256i_to_private(b);
2135
+
2136
+ #if defined(SIMDE__SHUFFLE_VECTOR)
2137
+ r_.i8 = SIMDE__SHUFFLE_VECTOR(8, 32, a_.i8, b_.i8,
2138
+ 0, 32, 1, 33, 2, 34, 3, 35,
2139
+ 4, 36, 5, 37, 6, 38, 7, 39,
2140
+ 16, 48, 17, 49, 18, 50, 19, 51,
2141
+ 20, 52, 21, 53, 22, 54, 23, 55);
2142
+ #else
2143
+ r_.m128i[0] = simde_mm_unpacklo_epi8(a_.m128i[0], b_.m128i[0]);
2144
+ r_.m128i[1] = simde_mm_unpacklo_epi8(a_.m128i[1], b_.m128i[1]);
2145
+ #endif
2146
+
2147
+ return simde__m256i_from_private(r_);
2148
+ #endif
2149
+ }
2150
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2151
+ # define _mm256_unpacklo_epi8(a, b) simde_mm256_unpacklo_epi8(a, b)
2152
+ #endif
2153
+
2154
+ SIMDE__FUNCTION_ATTRIBUTES
2155
+ simde__m256i
2156
+ simde_mm256_unpacklo_epi16 (simde__m256i a, simde__m256i b) {
2157
+ #if defined(SIMDE_AVX2_NATIVE)
2158
+ return _mm256_unpacklo_epi16(a, b);
2159
+ #else
2160
+ simde__m256i_private
2161
+ r_,
2162
+ a_ = simde__m256i_to_private(a),
2163
+ b_ = simde__m256i_to_private(b);
2164
+
2165
+ #if defined(SIMDE__SHUFFLE_VECTOR)
2166
+ r_.i16 =SIMDE__SHUFFLE_VECTOR(16, 32, a_.i16, b_.i16,
2167
+ 0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27);
2168
+ #else
2169
+ r_.m128i[0] = simde_mm_unpacklo_epi16(a_.m128i[0], b_.m128i[0]);
2170
+ r_.m128i[1] = simde_mm_unpacklo_epi16(a_.m128i[1], b_.m128i[1]);
2171
+ #endif
2172
+
2173
+ return simde__m256i_from_private(r_);
2174
+ #endif
2175
+ }
2176
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2177
+ # define _mm256_unpacklo_epi16(a, b) simde_mm256_unpacklo_epi16(a, b)
2178
+ #endif
2179
+
2180
+ SIMDE__FUNCTION_ATTRIBUTES
2181
+ simde__m256i
2182
+ simde_mm256_unpacklo_epi32 (simde__m256i a, simde__m256i b) {
2183
+ #if defined(SIMDE_AVX2_NATIVE)
2184
+ return _mm256_unpacklo_epi32(a, b);
2185
+ #else
2186
+ simde__m256i_private
2187
+ r_,
2188
+ a_ = simde__m256i_to_private(a),
2189
+ b_ = simde__m256i_to_private(b);
2190
+
2191
+ #if defined(SIMDE__SHUFFLE_VECTOR)
2192
+ r_.i32 = SIMDE__SHUFFLE_VECTOR(32, 32, a_.i32, b_.i32,
2193
+ 0, 8, 1, 9, 4, 12, 5, 13);
2194
+ #else
2195
+ r_.m128i[0] = simde_mm_unpacklo_epi32(a_.m128i[0], b_.m128i[0]);
2196
+ r_.m128i[1] = simde_mm_unpacklo_epi32(a_.m128i[1], b_.m128i[1]);
2197
+ #endif
2198
+
2199
+ return simde__m256i_from_private(r_);
2200
+ #endif
2201
+ }
2202
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2203
+ # define _mm256_unpacklo_epi32(a, b) simde_mm256_unpacklo_epi32(a, b)
2204
+ #endif
2205
+
2206
+ SIMDE__FUNCTION_ATTRIBUTES
2207
+ simde__m256i
2208
+ simde_mm256_unpacklo_epi64 (simde__m256i a, simde__m256i b) {
2209
+ #if defined(SIMDE_AVX2_NATIVE)
2210
+ return _mm256_unpacklo_epi64(a, b);
2211
+ #else
2212
+ simde__m256i_private
2213
+ r_,
2214
+ a_ = simde__m256i_to_private(a),
2215
+ b_ = simde__m256i_to_private(b);
2216
+
2217
+ #if defined(SIMDE__SHUFFLE_VECTOR)
2218
+ r_.i64 = SIMDE__SHUFFLE_VECTOR(64, 32, a_.i64, b_.i64, 0, 4, 2, 6);
2219
+ #else
2220
+ r_.m128i[0] = simde_mm_unpacklo_epi64(a_.m128i[0], b_.m128i[0]);
2221
+ r_.m128i[1] = simde_mm_unpacklo_epi64(a_.m128i[1], b_.m128i[1]);
2222
+ #endif
2223
+
2224
+ return simde__m256i_from_private(r_);
2225
+ #endif
2226
+ }
2227
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2228
+ # define _mm256_unpacklo_epi64(a, b) simde_mm256_unpacklo_epi64(a, b)
2229
+ #endif
2230
+
2231
+ SIMDE__FUNCTION_ATTRIBUTES
2232
+ simde__m256i
2233
+ simde_mm256_unpackhi_epi8 (simde__m256i a, simde__m256i b) {
2234
+ #if defined(SIMDE_AVX2_NATIVE)
2235
+ return _mm256_unpackhi_epi8(a, b);
2236
+ #else
2237
+ simde__m256i_private
2238
+ r_,
2239
+ a_ = simde__m256i_to_private(a),
2240
+ b_ = simde__m256i_to_private(b);
2241
+
2242
+ #if defined(SIMDE__SHUFFLE_VECTOR)
2243
+ r_.i8 = SIMDE__SHUFFLE_VECTOR(8, 32, a_.i8, b_.i8,
2244
+ 8, 40, 9, 41, 10, 42, 11, 43,
2245
+ 12, 44, 13, 45, 14, 46, 15, 47,
2246
+ 24, 56, 25, 57, 26, 58, 27, 59,
2247
+ 28, 60, 29, 61, 30, 62, 31, 63);
2248
+ #else
2249
+ r_.m128i[0] = simde_mm_unpackhi_epi8(a_.m128i[0], b_.m128i[0]);
2250
+ r_.m128i[1] = simde_mm_unpackhi_epi8(a_.m128i[1], b_.m128i[1]);
2251
+ #endif
2252
+
2253
+ return simde__m256i_from_private(r_);
2254
+ #endif
2255
+ }
2256
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2257
+ # define _mm256_unpackhi_epi8(a, b) simde_mm256_unpackhi_epi8(a, b)
2258
+ #endif
2259
+
2260
+ SIMDE__FUNCTION_ATTRIBUTES
2261
+ simde__m256i
2262
+ simde_mm256_unpackhi_epi16 (simde__m256i a, simde__m256i b) {
2263
+ #if defined(SIMDE_AVX2_NATIVE)
2264
+ return _mm256_unpackhi_epi16(a, b);
2265
+ #else
2266
+ simde__m256i_private
2267
+ r_,
2268
+ a_ = simde__m256i_to_private(a),
2269
+ b_ = simde__m256i_to_private(b);
2270
+
2271
+ #if defined(SIMDE__SHUFFLE_VECTOR)
2272
+ r_.i16 = SIMDE__SHUFFLE_VECTOR(16, 32, a_.i16, b_.i16,
2273
+ 4, 20, 5, 21, 6, 22, 7, 23,
2274
+ 12, 28, 13, 29, 14, 30, 15, 31);
2275
+ #else
2276
+ r_.m128i[0] = simde_mm_unpackhi_epi16(a_.m128i[0], b_.m128i[0]);
2277
+ r_.m128i[1] = simde_mm_unpackhi_epi16(a_.m128i[1], b_.m128i[1]);
2278
+ #endif
2279
+
2280
+ return simde__m256i_from_private(r_);
2281
+ #endif
2282
+ }
2283
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2284
+ # define _mm256_unpackhi_epi16(a, b) simde_mm256_unpackhi_epi16(a, b)
2285
+ #endif
2286
+
2287
+ SIMDE__FUNCTION_ATTRIBUTES
2288
+ simde__m256i
2289
+ simde_mm256_unpackhi_epi32 (simde__m256i a, simde__m256i b) {
2290
+ #if defined(SIMDE_AVX2_NATIVE)
2291
+ return _mm256_unpackhi_epi32(a, b);
2292
+ #else
2293
+ simde__m256i_private
2294
+ r_,
2295
+ a_ = simde__m256i_to_private(a),
2296
+ b_ = simde__m256i_to_private(b);
2297
+
2298
+ #if defined(SIMDE__SHUFFLE_VECTOR)
2299
+ r_.i32 = SIMDE__SHUFFLE_VECTOR(32, 32, a_.i32, b_.i32,
2300
+ 2, 10, 3, 11, 6, 14, 7, 15);
2301
+ #else
2302
+ r_.m128i[0] = simde_mm_unpackhi_epi32(a_.m128i[0], b_.m128i[0]);
2303
+ r_.m128i[1] = simde_mm_unpackhi_epi32(a_.m128i[1], b_.m128i[1]);
2304
+ #endif
2305
+
2306
+ return simde__m256i_from_private(r_);
2307
+ #endif
2308
+ }
2309
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2310
+ # define _mm256_unpackhi_epi32(a, b) simde_mm256_unpackhi_epi32(a, b)
2311
+ #endif
2312
+
2313
+ SIMDE__FUNCTION_ATTRIBUTES
2314
+ simde__m256i
2315
+ simde_mm256_unpackhi_epi64 (simde__m256i a, simde__m256i b) {
2316
+ #if defined(SIMDE_AVX2_NATIVE)
2317
+ return _mm256_unpackhi_epi64(a, b);
2318
+ #else
2319
+ simde__m256i_private
2320
+ r_,
2321
+ a_ = simde__m256i_to_private(a),
2322
+ b_ = simde__m256i_to_private(b);
2323
+
2324
+ #if defined(SIMDE__SHUFFLE_VECTOR)
2325
+ r_.i64 = SIMDE__SHUFFLE_VECTOR(64, 32, a_.i64, b_.i64, 1, 5, 3, 7);
2326
+ #else
2327
+ r_.m128i[0] = simde_mm_unpackhi_epi64(a_.m128i[0], b_.m128i[0]);
2328
+ r_.m128i[1] = simde_mm_unpackhi_epi64(a_.m128i[1], b_.m128i[1]);
2329
+ #endif
2330
+
2331
+ return simde__m256i_from_private(r_);
2332
+ #endif
2333
+ }
2334
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2335
+ # define _mm256_unpackhi_epi64(a, b) simde_mm256_unpackhi_epi64(a, b)
2336
+ #endif
2337
+
2338
+ SIMDE__FUNCTION_ATTRIBUTES
2339
+ simde__m256i
2340
+ simde_mm256_xor_si256 (simde__m256i a, simde__m256i b) {
2341
+ #if defined(SIMDE_AVX2_NATIVE)
2342
+ return _mm256_xor_si256(a, b);
2343
+ #else
2344
+ simde__m256i_private
2345
+ r_,
2346
+ a_ = simde__m256i_to_private(a),
2347
+ b_ = simde__m256i_to_private(b);
2348
+
2349
+ #if defined(SIMDE_ARCH_X86_SSE2)
2350
+ r_.m128i[0] = simde_mm_xor_si128(a_.m128i[0], b_.m128i[0]);
2351
+ r_.m128i[1] = simde_mm_xor_si128(a_.m128i[1], b_.m128i[1]);
2352
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2353
+ r_.i32f = a_.i32f ^ b_.i32f;
2354
+ #else
2355
+ SIMDE__VECTORIZE
2356
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
2357
+ r_.i64[i] = a_.i64[i] ^ b_.i64[i];
2358
+ }
2359
+ #endif
2360
+
2361
+ return simde__m256i_from_private(r_);
2362
+ #endif
2363
+ }
2364
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2365
+ # define _mm256_xor_si256(a, b) simde_mm256_xor_si256(a, b)
2366
+ #endif
2367
+
2368
+ SIMDE__FUNCTION_ATTRIBUTES
2369
+ simde__m256i
2370
+ simde_mm256_srli_epi32 (simde__m256i a, const int imm8) {
2371
+ simde__m256i_private
2372
+ r_,
2373
+ a_ = simde__m256i_to_private(a);
2374
+
2375
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
2376
+ r_.u32 = a_.u32 >> HEDLEY_STATIC_CAST(int16_t, imm8);
2377
+ #else
2378
+ SIMDE__VECTORIZE
2379
+ for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
2380
+ r_.u32[i] = a_.u32[i] >> imm8;
2381
+ }
2382
+ #endif
2383
+
2384
+ return simde__m256i_from_private(r_);
2385
+ }
2386
+ #if defined(SIMDE_AVX2_NATIVE)
2387
+ # define simde_mm256_srli_epi32(a, imm8) _mm256_srli_epi32(a, imm8)
2388
+ #elif defined(SIMDE_ARCH_X86_SSE2)
2389
+ # define simde_mm256_srli_epi32(a, imm8) \
2390
+ simde_mm256_set_m128i( \
2391
+ simde_mm_srli_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
2392
+ simde_mm_srli_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
2393
+ #endif
2394
+ #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
2395
+ # define _mm256_srli_epi32(a, imm8) simde_mm256_srli_epi32(a, imm8)
2396
+ #endif
2397
+
2398
+ SIMDE__END_DECLS
2399
+
2400
+ HEDLEY_DIAGNOSTIC_POP
2401
+
2402
+ #endif /* !defined(SIMDE__AVX2_H) */