minimap2 0.2.25.0 → 0.2.25.1

Files changed (123)
  1. checksums.yaml +4 -4
  2. data/README.md +2 -3
  3. data/ext/minimap2/Makefile +6 -2
  4. data/ext/minimap2/NEWS.md +38 -0
  5. data/ext/minimap2/README.md +9 -3
  6. data/ext/minimap2/align.c +5 -3
  7. data/ext/minimap2/cookbook.md +2 -2
  8. data/ext/minimap2/format.c +7 -4
  9. data/ext/minimap2/kalloc.c +20 -1
  10. data/ext/minimap2/kalloc.h +13 -2
  11. data/ext/minimap2/ksw2.h +1 -0
  12. data/ext/minimap2/ksw2_extd2_sse.c +1 -1
  13. data/ext/minimap2/ksw2_exts2_sse.c +79 -40
  14. data/ext/minimap2/ksw2_extz2_sse.c +1 -1
  15. data/ext/minimap2/lchain.c +15 -16
  16. data/ext/minimap2/lib/simde/CONTRIBUTING.md +114 -0
  17. data/ext/minimap2/lib/simde/COPYING +20 -0
  18. data/ext/minimap2/lib/simde/README.md +333 -0
  19. data/ext/minimap2/lib/simde/amalgamate.py +58 -0
  20. data/ext/minimap2/lib/simde/meson.build +33 -0
  21. data/ext/minimap2/lib/simde/netlify.toml +20 -0
  22. data/ext/minimap2/lib/simde/simde/arm/neon/float32x2.h +140 -0
  23. data/ext/minimap2/lib/simde/simde/arm/neon/float32x4.h +137 -0
  24. data/ext/minimap2/lib/simde/simde/arm/neon/float64x1.h +142 -0
  25. data/ext/minimap2/lib/simde/simde/arm/neon/float64x2.h +145 -0
  26. data/ext/minimap2/lib/simde/simde/arm/neon/int16x4.h +140 -0
  27. data/ext/minimap2/lib/simde/simde/arm/neon/int16x8.h +145 -0
  28. data/ext/minimap2/lib/simde/simde/arm/neon/int32x2.h +140 -0
  29. data/ext/minimap2/lib/simde/simde/arm/neon/int32x4.h +143 -0
  30. data/ext/minimap2/lib/simde/simde/arm/neon/int64x1.h +137 -0
  31. data/ext/minimap2/lib/simde/simde/arm/neon/int64x2.h +141 -0
  32. data/ext/minimap2/lib/simde/simde/arm/neon/int8x16.h +147 -0
  33. data/ext/minimap2/lib/simde/simde/arm/neon/int8x8.h +141 -0
  34. data/ext/minimap2/lib/simde/simde/arm/neon/uint16x4.h +134 -0
  35. data/ext/minimap2/lib/simde/simde/arm/neon/uint16x8.h +138 -0
  36. data/ext/minimap2/lib/simde/simde/arm/neon/uint32x2.h +134 -0
  37. data/ext/minimap2/lib/simde/simde/arm/neon/uint32x4.h +137 -0
  38. data/ext/minimap2/lib/simde/simde/arm/neon/uint64x1.h +131 -0
  39. data/ext/minimap2/lib/simde/simde/arm/neon/uint64x2.h +135 -0
  40. data/ext/minimap2/lib/simde/simde/arm/neon/uint8x16.h +141 -0
  41. data/ext/minimap2/lib/simde/simde/arm/neon/uint8x8.h +135 -0
  42. data/ext/minimap2/lib/simde/simde/arm/neon.h +97 -0
  43. data/ext/minimap2/lib/simde/simde/check.h +267 -0
  44. data/ext/minimap2/lib/simde/simde/debug-trap.h +83 -0
  45. data/ext/minimap2/lib/simde/simde/hedley.h +1899 -0
  46. data/ext/minimap2/lib/simde/simde/simde-arch.h +445 -0
  47. data/ext/minimap2/lib/simde/simde/simde-common.h +697 -0
  48. data/ext/minimap2/lib/simde/simde/x86/avx.h +5385 -0
  49. data/ext/minimap2/lib/simde/simde/x86/avx2.h +2402 -0
  50. data/ext/minimap2/lib/simde/simde/x86/avx512bw.h +391 -0
  51. data/ext/minimap2/lib/simde/simde/x86/avx512f.h +3389 -0
  52. data/ext/minimap2/lib/simde/simde/x86/avx512vl.h +112 -0
  53. data/ext/minimap2/lib/simde/simde/x86/fma.h +659 -0
  54. data/ext/minimap2/lib/simde/simde/x86/mmx.h +2210 -0
  55. data/ext/minimap2/lib/simde/simde/x86/sse.h +3696 -0
  56. data/ext/minimap2/lib/simde/simde/x86/sse2.h +5991 -0
  57. data/ext/minimap2/lib/simde/simde/x86/sse3.h +343 -0
  58. data/ext/minimap2/lib/simde/simde/x86/sse4.1.h +1783 -0
  59. data/ext/minimap2/lib/simde/simde/x86/sse4.2.h +105 -0
  60. data/ext/minimap2/lib/simde/simde/x86/ssse3.h +1053 -0
  61. data/ext/minimap2/lib/simde/simde/x86/svml.h +543 -0
  62. data/ext/minimap2/lib/simde/test/CMakeLists.txt +166 -0
  63. data/ext/minimap2/lib/simde/test/arm/meson.build +4 -0
  64. data/ext/minimap2/lib/simde/test/arm/neon/meson.build +23 -0
  65. data/ext/minimap2/lib/simde/test/arm/neon/skel.c +871 -0
  66. data/ext/minimap2/lib/simde/test/arm/neon/test-neon-internal.h +134 -0
  67. data/ext/minimap2/lib/simde/test/arm/neon/test-neon.c +39 -0
  68. data/ext/minimap2/lib/simde/test/arm/neon/test-neon.h +10 -0
  69. data/ext/minimap2/lib/simde/test/arm/neon/vadd.c +1260 -0
  70. data/ext/minimap2/lib/simde/test/arm/neon/vdup_n.c +873 -0
  71. data/ext/minimap2/lib/simde/test/arm/neon/vmul.c +1084 -0
  72. data/ext/minimap2/lib/simde/test/arm/neon/vsub.c +1260 -0
  73. data/ext/minimap2/lib/simde/test/arm/test-arm-internal.h +18 -0
  74. data/ext/minimap2/lib/simde/test/arm/test-arm.c +20 -0
  75. data/ext/minimap2/lib/simde/test/arm/test-arm.h +8 -0
  76. data/ext/minimap2/lib/simde/test/cmake/AddCompilerFlags.cmake +171 -0
  77. data/ext/minimap2/lib/simde/test/cmake/ExtraWarningFlags.cmake +68 -0
  78. data/ext/minimap2/lib/simde/test/meson.build +64 -0
  79. data/ext/minimap2/lib/simde/test/munit/COPYING +21 -0
  80. data/ext/minimap2/lib/simde/test/munit/Makefile +55 -0
  81. data/ext/minimap2/lib/simde/test/munit/README.md +54 -0
  82. data/ext/minimap2/lib/simde/test/munit/example.c +351 -0
  83. data/ext/minimap2/lib/simde/test/munit/meson.build +37 -0
  84. data/ext/minimap2/lib/simde/test/munit/munit.c +2055 -0
  85. data/ext/minimap2/lib/simde/test/munit/munit.h +535 -0
  86. data/ext/minimap2/lib/simde/test/run-tests.c +20 -0
  87. data/ext/minimap2/lib/simde/test/run-tests.h +260 -0
  88. data/ext/minimap2/lib/simde/test/x86/avx.c +13752 -0
  89. data/ext/minimap2/lib/simde/test/x86/avx2.c +9977 -0
  90. data/ext/minimap2/lib/simde/test/x86/avx512bw.c +2664 -0
  91. data/ext/minimap2/lib/simde/test/x86/avx512f.c +10416 -0
  92. data/ext/minimap2/lib/simde/test/x86/avx512vl.c +210 -0
  93. data/ext/minimap2/lib/simde/test/x86/fma.c +2557 -0
  94. data/ext/minimap2/lib/simde/test/x86/meson.build +33 -0
  95. data/ext/minimap2/lib/simde/test/x86/mmx.c +2878 -0
  96. data/ext/minimap2/lib/simde/test/x86/skel.c +2984 -0
  97. data/ext/minimap2/lib/simde/test/x86/sse.c +5121 -0
  98. data/ext/minimap2/lib/simde/test/x86/sse2.c +9860 -0
  99. data/ext/minimap2/lib/simde/test/x86/sse3.c +486 -0
  100. data/ext/minimap2/lib/simde/test/x86/sse4.1.c +3446 -0
  101. data/ext/minimap2/lib/simde/test/x86/sse4.2.c +101 -0
  102. data/ext/minimap2/lib/simde/test/x86/ssse3.c +2084 -0
  103. data/ext/minimap2/lib/simde/test/x86/svml.c +1545 -0
  104. data/ext/minimap2/lib/simde/test/x86/test-avx.h +16 -0
  105. data/ext/minimap2/lib/simde/test/x86/test-avx512.h +25 -0
  106. data/ext/minimap2/lib/simde/test/x86/test-mmx.h +13 -0
  107. data/ext/minimap2/lib/simde/test/x86/test-sse.h +13 -0
  108. data/ext/minimap2/lib/simde/test/x86/test-sse2.h +13 -0
  109. data/ext/minimap2/lib/simde/test/x86/test-x86-internal.h +196 -0
  110. data/ext/minimap2/lib/simde/test/x86/test-x86.c +48 -0
  111. data/ext/minimap2/lib/simde/test/x86/test-x86.h +8 -0
  112. data/ext/minimap2/main.c +13 -6
  113. data/ext/minimap2/map.c +0 -5
  114. data/ext/minimap2/minimap.h +40 -31
  115. data/ext/minimap2/minimap2.1 +19 -5
  116. data/ext/minimap2/misc/paftools.js +545 -24
  117. data/ext/minimap2/options.c +1 -1
  118. data/ext/minimap2/pyproject.toml +2 -0
  119. data/ext/minimap2/python/mappy.pyx +3 -1
  120. data/ext/minimap2/seed.c +1 -1
  121. data/ext/minimap2/setup.py +32 -22
  122. data/lib/minimap2/version.rb +1 -1
  123. metadata +100 -3
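
Most of the new files in this release come from vendoring the SIMDe ("SIMD Everywhere") library under lib/simde/, which provides portable implementations of the x86 intrinsics used by minimap2's ksw2_*_sse.c kernels so they can also build on ARM/NEON, POWER/AltiVec, or plain C targets. As a minimal stand-alone sketch (the include path follows the vendored layout listed above; the demo program, file name, and compiler invocation are illustrative assumptions and not part of this changeset), SSE4.1 intrinsics can be used through SIMDe like this:

    /* demo.c -- hypothetical example, not part of minimap2; build with e.g. `cc -O2 demo.c` */
    #define SIMDE_ENABLE_NATIVE_ALIASES      /* also expose the plain _mm_* names */
    #include "lib/simde/simde/x86/sse4.1.h"  /* pulls in the SSE/SSE2/SSSE3 layers it needs */
    #include <stdio.h>

    int main(void) {
      simde__m128i a = simde_mm_set_epi64x(42, -1);
      simde__m128i b = simde_mm_set_epi64x(42,  7);
      /* On x86 with SSE4.1 this maps to the native instruction; elsewhere SIMDe
       * falls back to NEON/AltiVec or a scalar loop with the same semantics. */
      simde__m128i eq = simde_mm_cmpeq_epi64(a, b);
      printf("%lld %lld\n",
             (long long) simde_mm_extract_epi64(eq, 1),
             (long long) simde_mm_extract_epi64(eq, 0));
      return 0;
    }

The diff below (data/ext/minimap2/lib/simde/simde/x86/sse4.1.h, new file) shows the same pattern throughout: each simde_mm_* function dispatches to the native intrinsic, a NEON/AltiVec translation, or a portable scalar loop.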
@@ -0,0 +1,1783 @@
+ /* Copyright (c) 2017-2020 Evan Nemerson <evan@nemerson.com>
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+ #if !defined(SIMDE__SSE4_1_H)
+ # if !defined(SIMDE__SSE4_1_H)
+ # define SIMDE__SSE4_1_H
+ # endif
+ # include "ssse3.h"
+
+ HEDLEY_DIAGNOSTIC_PUSH
+ SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+
+ # if defined(SIMDE_SSE4_1_NATIVE)
+ # undef SIMDE_SSE4_1_NATIVE
+ # endif
+ # if defined(SIMDE_ARCH_X86_SSE4_1) && !defined(SIMDE_SSE4_1_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
+ # define SIMDE_SSE4_1_NATIVE
+ # elif defined(__ARM_NEON) && !defined(SIMDE_SSE4_1_NO_NEON) && !defined(SIMDE_NO_NEON)
+ # define SIMDE_SSE4_1_NEON
+ # elif defined(SIMDE_ARCH_POWER_ALTIVEC)
+ # define SIMDE_SSE4_1_POWER_ALTIVEC
+ # endif
+
+ # if defined(SIMDE_SSE4_1_NATIVE) && !defined(SIMDE_SSE3_NATIVE)
+ # if defined(SIMDE_SSE4_1_FORCE_NATIVE)
+ # error Native SSE4.1 support requires native SSE3 support
+ # else
+ HEDLEY_WARNING("Native SSE4.1 support requires native SSE3 support, disabling")
+ # undef SIMDE_SSE4_1_NATIVE
+ # endif
+ # elif defined(SIMDE_SSE4_1_NEON) && !defined(SIMDE_SSE3_NEON)
+ HEDLEY_WARNING("SSE4.1 NEON support requires SSE3 NEON support, disabling")
+ # undef SIMDE_SSE4_1_NEON
+ # endif
+
+ # if defined(SIMDE_SSE4_1_NATIVE)
+ # include <smmintrin.h>
+ # else
+ # if defined(SIMDE_SSE4_1_NEON)
+ # include <arm_neon.h>
+ # endif
+ # endif
+
+ SIMDE__BEGIN_DECLS
+
+ #if !defined(SIMDE_SSE4_1_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
+ # define SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES
+ #endif
+
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ # define SIMDE_MM_FROUND_TO_NEAREST_INT _MM_FROUND_TO_NEAREST_INT
+ # define SIMDE_MM_FROUND_TO_NEG_INF _MM_FROUND_TO_NEG_INF
+ # define SIMDE_MM_FROUND_TO_POS_INF _MM_FROUND_TO_POS_INF
+ # define SIMDE_MM_FROUND_TO_ZERO _MM_FROUND_TO_ZERO
+ # define SIMDE_MM_FROUND_CUR_DIRECTION _MM_FROUND_CUR_DIRECTION
+
+ # define SIMDE_MM_FROUND_RAISE_EXC _MM_FROUND_RAISE_EXC
+ # define SIMDE_MM_FROUND_NO_EXC _MM_FROUND_NO_EXC
+ #else
+ # define SIMDE_MM_FROUND_TO_NEAREST_INT 0x00
+ # define SIMDE_MM_FROUND_TO_NEG_INF 0x01
+ # define SIMDE_MM_FROUND_TO_POS_INF 0x02
+ # define SIMDE_MM_FROUND_TO_ZERO 0x03
+ # define SIMDE_MM_FROUND_CUR_DIRECTION 0x04
+
+ # define SIMDE_MM_FROUND_RAISE_EXC 0x00
+ # define SIMDE_MM_FROUND_NO_EXC 0x08
+ #endif
+
+ #define SIMDE_MM_FROUND_NINT \
+ (SIMDE_MM_FROUND_TO_NEAREST_INT | SIMDE_MM_FROUND_RAISE_EXC)
+ #define SIMDE_MM_FROUND_FLOOR \
+ (SIMDE_MM_FROUND_TO_NEG_INF | SIMDE_MM_FROUND_RAISE_EXC)
+ #define SIMDE_MM_FROUND_CEIL \
+ (SIMDE_MM_FROUND_TO_POS_INF | SIMDE_MM_FROUND_RAISE_EXC)
+ #define SIMDE_MM_FROUND_TRUNC \
+ (SIMDE_MM_FROUND_TO_ZERO | SIMDE_MM_FROUND_RAISE_EXC)
+ #define SIMDE_MM_FROUND_RINT \
+ (SIMDE_MM_FROUND_CUR_DIRECTION | SIMDE_MM_FROUND_RAISE_EXC)
+ #define SIMDE_MM_FROUND_NEARBYINT \
+ (SIMDE_MM_FROUND_CUR_DIRECTION | SIMDE_MM_FROUND_NO_EXC)
+
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _MM_FROUND_TO_NEAREST_INT SIMDE_MM_FROUND_TO_NEAREST_INT
+ # define _MM_FROUND_TO_NEG_INF SIMDE_MM_FROUND_TO_NEG_INF
+ # define _MM_FROUND_TO_POS_INF SIMDE_MM_FROUND_TO_POS_INF
+ # define _MM_FROUND_TO_ZERO SIMDE_MM_FROUND_TO_ZERO
+ # define _MM_FROUND_CUR_DIRECTION SIMDE_MM_FROUND_CUR_DIRECTION
+ # define _MM_FROUND_RAISE_EXC SIMDE_MM_FROUND_RAISE_EXC
+ # define _MM_FROUND_NINT SIMDE_MM_FROUND_NINT
+ # define _MM_FROUND_FLOOR SIMDE_MM_FROUND_FLOOR
+ # define _MM_FROUND_CEIL SIMDE_MM_FROUND_CEIL
+ # define _MM_FROUND_TRUNC SIMDE_MM_FROUND_TRUNC
+ # define _MM_FROUND_RINT SIMDE_MM_FROUND_RINT
+ # define _MM_FROUND_NEARBYINT SIMDE_MM_FROUND_NEARBYINT
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_blend_epi16 (simde__m128i a, simde__m128i b, const int imm8)
+ HEDLEY_REQUIRE_MSG((imm8 & 0xff) == imm8, "imm8 must be in range [0, 255]") {
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
+ r_.u16[i] = ((imm8 >> i) & 1) ? b_.u16[i] : a_.u16[i];
+ }
+
+ return simde__m128i_from_private(r_);
+ }
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ # define simde_mm_blend_epi16(a, b, imm8) _mm_blend_epi16(a, b, imm8)
+ #endif
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_blend_epi16(a, b, imm8) simde_mm_blend_epi16(a, b, imm8)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128d
+ simde_mm_blend_pd (simde__m128d a, simde__m128d b, const int imm8)
+ HEDLEY_REQUIRE_MSG((imm8 & 3) == imm8, "imm8 must be in range [0, 3]") {
+ simde__m128d_private
+ r_,
+ a_ = simde__m128d_to_private(a),
+ b_ = simde__m128d_to_private(b);
+
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
+ r_.f64[i] = ((imm8 >> i) & 1) ? b_.f64[i] : a_.f64[i];
+ }
+ return simde__m128d_from_private(r_);
+ }
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ # define simde_mm_blend_pd(a, b, imm8) _mm_blend_pd(a, b, imm8)
+ #endif
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_blend_pd(a, b, imm8) simde_mm_blend_pd(a, b, imm8)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128
+ simde_mm_blend_ps (simde__m128 a, simde__m128 b, const int imm8)
+ HEDLEY_REQUIRE_MSG((imm8 & 0xf) == imm8, "imm8 must be in range [0, 15]") {
+ simde__m128_private
+ r_,
+ a_ = simde__m128_to_private(a),
+ b_ = simde__m128_to_private(b);
+
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
+ r_.f32[i] = ((imm8 >> i) & 1) ? b_.f32[i] : a_.f32[i];
+ }
+ return simde__m128_from_private(r_);
+ }
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ # define simde_mm_blend_ps(a, b, imm8) _mm_blend_ps(a, b, imm8)
+ #endif
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_blend_ps(a, b, imm8) simde_mm_blend_ps(a, b, imm8)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_blendv_epi8 (simde__m128i a, simde__m128i b, simde__m128i mask) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_blendv_epi8(a, b, mask);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b),
+ mask_ = simde__m128i_to_private(mask);
+
+ #if defined(SIMDE_SSE4_1_NEON)
+ mask_ = simde__m128i_to_private(simde_mm_cmplt_epi8(mask, simde_mm_setzero_si128()));
+ r_.neon_i8 = vbslq_s8(mask_.neon_u8, b_.neon_i8, a_.neon_i8);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ /* https://software.intel.com/en-us/forums/intel-c-compiler/topic/850087 */
+ #if defined(HEDLEY_INTEL_VERSION_CHECK)
+ __typeof__(mask_.i8) z = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ mask_.i8 = HEDLEY_STATIC_CAST(__typeof__(mask_.i8), mask_.i8 < z);
+ #else
+ mask_.i8 >>= (CHAR_BIT * sizeof(mask_.i8[0])) - 1;
+ #endif
+
+ r_.i8 = (mask_.i8 & b_.i8) | (~mask_.i8 & a_.i8);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
+ int8_t m = mask_.i8[i] >> 7;
+ r_.i8[i] = (m & b_.i8[i]) | (~m & a_.i8[i]);
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_blendv_epi8(a, b, mask) simde_mm_blendv_epi8(a, b, mask)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_x_mm_blendv_epi16 (simde__m128i a, simde__m128i b, simde__m128i mask) {
+ #if defined(SIMDE_SSE2_NATIVE)
+ mask = simde_mm_srai_epi16(mask, 15);
+ return simde_mm_or_si128(simde_mm_and_si128(mask, b), simde_mm_andnot_si128(mask, a));
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b),
+ mask_ = simde__m128i_to_private(mask);
+
+ #if defined(SIMDE_SSE4_1_NEON)
+ mask_ = simde__m128i_to_private(simde_mm_cmplt_epi16(mask, simde_mm_setzero_si128()));
+ r_.neon_i16 = vbslq_s16(mask_.neon_u16, b_.neon_i16, a_.neon_i16);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ #if defined(HEDLEY_INTEL_VERSION_CHECK)
+ __typeof__(mask_.i16) z = { 0, 0, 0, 0, 0, 0, 0, 0 };
+ mask_.i16 = mask_.i16 < z;
+ #else
+ mask_.i16 >>= (CHAR_BIT * sizeof(mask_.i16[0])) - 1;
+ #endif
+
+ r_.i16 = (mask_.i16 & b_.i16) | (~mask_.i16 & a_.i16);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
+ int16_t m = mask_.i16[i] >> 15;
+ r_.i16[i] = (m & b_.i16[i]) | (~m & a_.i16[i]);
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_x_mm_blendv_epi32 (simde__m128i a, simde__m128i b, simde__m128i mask) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask)));
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b),
+ mask_ = simde__m128i_to_private(mask);
+
+ #if defined(SIMDE_SSE4_1_NEON)
+ mask_ = simde__m128i_to_private(simde_mm_cmplt_epi32(mask, simde_mm_setzero_si128()));
+ r_.neon_i32 = vbslq_s32(mask_.neon_u32, b_.neon_i32, a_.neon_i32);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ #if defined(HEDLEY_INTEL_VERSION_CHECK)
+ __typeof__(mask_.i32) z = { 0, 0, 0, 0 };
+ mask_.i32 = mask_.i32 < z;
+ #else
+ mask_.i32 >>= (CHAR_BIT * sizeof(mask_.i32[0])) - 1;
+ #endif
+
+ r_.i32 = (mask_.i32 & b_.i32) | (~mask_.i32 & a_.i32);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
+ int32_t m = mask_.i32[i] >> 31;
+ r_.i32[i] = (m & b_.i32[i]) | (~m & a_.i32[i]);
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_x_mm_blendv_epi64 (simde__m128i a, simde__m128i b, simde__m128i mask) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_castpd_si128(_mm_blendv_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b), _mm_castsi128_pd(mask)));
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b),
+ mask_ = simde__m128i_to_private(mask);
+
+ #if defined(SIMDE_SSE4_1_NEON) && defined(SIMDE_ARCH_AARCH64)
+ mask_.i64 = vreinterpretq_s64_u64(vcltq_s64(mask_.i64, vdupq_n_s64(UINT64_C(0))));
+ r_.neon_i64 = vbslq_s64(mask_.neon_u64, b_.neon_i64, a_.neon_i64);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ #if defined(HEDLEY_INTEL_VERSION_CHECK)
+ __typeof__(mask_.i64) z = { 0, 0 };
+ mask_.i64 = HEDLEY_STATIC_CAST(__typeof__(mask_.i64), mask_.i64 < z);
+ #else
+ mask_.i64 >>= (CHAR_BIT * sizeof(mask_.i64[0])) - 1;
+ #endif
+
+ r_.i64 = (mask_.i64 & b_.i64) | (~mask_.i64 & a_.i64);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
+ int64_t m = mask_.i64[i] >> 63;
+ r_.i64[i] = (m & b_.i64[i]) | (~m & a_.i64[i]);
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128d
+ simde_mm_blendv_pd (simde__m128d a, simde__m128d b, simde__m128d mask) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_blendv_pd(a, b, mask);
+ #else
+ return simde_mm_castsi128_pd(simde_x_mm_blendv_epi64(simde_mm_castpd_si128(a), simde_mm_castpd_si128(b), simde_mm_castpd_si128(mask)));
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_blendv_pd(a, b, mask) simde_mm_blendv_pd(a, b, mask)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128
+ simde_mm_blendv_ps (simde__m128 a, simde__m128 b, simde__m128 mask) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_blendv_ps(a, b, mask);
+ #else
+ return simde_mm_castsi128_ps(simde_x_mm_blendv_epi32(simde_mm_castps_si128(a), simde_mm_castps_si128(b), simde_mm_castps_si128(mask)));
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_blendv_ps(a, b, mask) simde_mm_blendv_ps(a, b, mask)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128d
+ simde_mm_ceil_pd (simde__m128d a) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_ceil_pd(a);
+ #else
+ simde__m128d_private
+ r_,
+ a_ = simde__m128d_to_private(a);
+
+ #if defined(SIMDE_SSE4_1_NEON) && defined(SIMDE_ARCH_AARCH64)
+ r_.neon_f64 = vrndpq_f64(a_.neon_f64);
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
+ r_.altivec_f64 = vec_ceil(a_.altivec_f64);
+ #elif defined(SIMDE_HAVE_MATH_H)
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
+ r_.f64[i] = ceil(a_.f64[i]);
+ }
+ #else
+ HEDLEY_UNREACHABLE();
+ #endif
+
+ return simde__m128d_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_ceil_pd(a) simde_mm_ceil_pd(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128
+ simde_mm_ceil_ps (simde__m128 a) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_ceil_ps(a);
+ #else
+ simde__m128_private
+ r_,
+ a_ = simde__m128_to_private(a);
+
+ #if defined(SIMDE_SSE4_1_NEON) && (SIMDE_ARCH_ARM >= 80) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0))
+ r_.neon_f32 = vrndpq_f32(a_.neon_f32);
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
+ r_.altivec_f32 = vec_ceil(a_.altivec_f32);
+ #elif defined(SIMDE_HAVE_MATH_H)
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
+ r_.f32[i] = ceilf(a_.f32[i]);
+ }
+ #else
+ HEDLEY_UNREACHABLE();
+ #endif
+
+ return simde__m128_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_ceil_ps(a) simde_mm_ceil_ps(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128d
+ simde_mm_ceil_sd (simde__m128d a, simde__m128d b) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_ceil_sd(a, b);
+ #else
+ simde__m128d_private
+ r_,
+ a_ = simde__m128d_to_private(a),
+ b_ = simde__m128d_to_private(b);
+
+ #if defined(SIMDE_HAVE_MATH_H)
+ r_ = simde__m128d_to_private(simde_mm_set_pd(a_.f64[1], ceil(b_.f64[0])));
+ #else
+ HEDLEY_UNREACHABLE();
+ #endif
+
+ return simde__m128d_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_ceil_sd(a, b) simde_mm_ceil_sd(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128
+ simde_mm_ceil_ss (simde__m128 a, simde__m128 b) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_ceil_ss(a, b);
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
+ return simde_mm_move_ss(a, simde_mm_ceil_ps(b));
+ #else
+ simde__m128_private
+ r_,
+ a_ = simde__m128_to_private(a),
+ b_ = simde__m128_to_private(b);
+
+ #if defined(SIMDE_HAVE_MATH_H)
+ r_ = simde__m128_to_private(simde_mm_set_ps(a_.f32[3], a_.f32[2], a_.f32[1], ceilf(b_.f32[0])));
+ #else
+ HEDLEY_UNREACHABLE();
+ #endif
+
+ return simde__m128_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_ceil_ss(a, b) simde_mm_ceil_ss(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_cmpeq_epi64 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_cmpeq_epi64(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), a_.i64 == b_.i64);
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
+ r_.altivec_i64 = (vector signed long long) vec_cmpeq(a_.altivec_i64, b_.altivec_i64);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
+ r_.u64[i] = (a_.u64[i] == b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0);
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_cmpeq_epi64(a, b) simde_mm_cmpeq_epi64(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_cvtepi8_epi16 (simde__m128i a) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_cvtepi8_epi16(a);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a);
+
+ #if defined(SIMDE__CONVERT_VECTOR)
+ SIMDE__CONVERT_VECTOR(r_.i16, a_.m64_private[0].i8);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
+ r_.i16[i] = a_.i8[i];
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_cvtepi8_epi16(a) simde_mm_cvtepi8_epi16(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_cvtepi8_epi32 (simde__m128i a) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_cvtepi8_epi32(a);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a);
+
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
+ r_.i32[i] = a_.i8[i];
+ }
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_cvtepi8_epi32(a) simde_mm_cvtepi8_epi32(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_cvtepi8_epi64 (simde__m128i a) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_cvtepi8_epi64(a);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a);
+
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
+ r_.i64[i] = a_.i8[i];
+ }
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_cvtepi8_epi64(a) simde_mm_cvtepi8_epi64(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_cvtepu8_epi16 (simde__m128i a) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_cvtepu8_epi16(a);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a);
+
+ #if defined(SIMDE__CONVERT_VECTOR) && !defined(SIMDE_BUG_CLANG_45541)
+ SIMDE__CONVERT_VECTOR(r_.i16, a_.m64_private[0].u8);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
+ r_.i16[i] = a_.u8[i];
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_cvtepu8_epi16(a) simde_mm_cvtepu8_epi16(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_cvtepu8_epi32 (simde__m128i a) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_cvtepu8_epi32(a);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a);
+
+ #if defined(SIMDE_SSE4_1_NEON)
+ uint8x16_t u8x16 = a_.neon_u8;
+ uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));
+ r_.neon_u32 = vmovl_u16(vget_low_u16(u16x8));
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
+ r_.i32[i] = a_.u8[i];
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_cvtepu8_epi32(a) simde_mm_cvtepu8_epi32(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_cvtepu8_epi64 (simde__m128i a) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_cvtepu8_epi64(a);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a);
+
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
+ r_.i64[i] = a_.u8[i];
+ }
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_cvtepu8_epi64(a) simde_mm_cvtepu8_epi64(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_cvtepi16_epi32 (simde__m128i a) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_cvtepi16_epi32(a);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a);
+
+ #if defined(SIMDE_SSE4_1_NEON)
+ r_.neon_i32 = vmovl_s16(vget_low_s16(a_.neon_i16));
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
+ r_.i32[i] = a_.i16[i];
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_cvtepi16_epi32(a) simde_mm_cvtepi16_epi32(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_cvtepu16_epi32 (simde__m128i a) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_cvtepu16_epi32(a);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a);
+
+ #if defined(SIMDE__CONVERT_VECTOR) && !defined(SIMDE_BUG_CLANG_45541)
+ SIMDE__CONVERT_VECTOR(r_.i32, a_.m64_private[0].u16);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
+ r_.i32[i] = a_.u16[i];
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_cvtepu16_epi32(a) simde_mm_cvtepu16_epi32(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_cvtepu16_epi64 (simde__m128i a) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_cvtepu16_epi64(a);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a);
+
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
+ r_.i64[i] = a_.u16[i];
+ }
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_cvtepu16_epi64(a) simde_mm_cvtepu16_epi64(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_cvtepi16_epi64 (simde__m128i a) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_cvtepi16_epi64(a);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a);
+
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
+ r_.i64[i] = a_.i16[i];
+ }
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_cvtepi16_epi64(a) simde_mm_cvtepi16_epi64(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_cvtepi32_epi64 (simde__m128i a) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_cvtepi32_epi64(a);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a);
+
+ #if defined(SIMDE__CONVERT_VECTOR)
+ SIMDE__CONVERT_VECTOR(r_.i64, a_.m64_private[0].i32);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
+ r_.i64[i] = a_.i32[i];
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_cvtepi32_epi64(a) simde_mm_cvtepi32_epi64(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_cvtepu32_epi64 (simde__m128i a) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_cvtepu32_epi64(a);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a);
+
+ #if defined(SIMDE__CONVERT_VECTOR)
+ SIMDE__CONVERT_VECTOR(r_.i64, a_.m64_private[0].u32);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
+ r_.i64[i] = a_.u32[i];
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_cvtepu32_epi64(a) simde_mm_cvtepu32_epi64(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128d
+ simde_mm_dp_pd (simde__m128d a, simde__m128d b, const int imm8)
+ HEDLEY_REQUIRE_MSG((imm8 & 0xff) == imm8, "imm8 must be in range [0, 255]") {
+ simde__m128d_private
+ r_,
+ a_ = simde__m128d_to_private(a),
+ b_ = simde__m128d_to_private(b);
+
+ simde_float64 sum = SIMDE_FLOAT64_C(0.0);
+
+ SIMDE__VECTORIZE_REDUCTION(+:sum)
+ for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
+ sum += ((imm8 >> (i + 4)) & 1) ? (a_.f64[i] * b_.f64[i]) : 0.0;
+ }
+
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
+ r_.f64[i] = ((imm8 >> i) & 1) ? sum : 0.0;
+ }
+
+ return simde__m128d_from_private(r_);
+ }
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ # define simde_mm_dp_pd(a, b, imm8) _mm_dp_pd(a, b, imm8)
+ #endif
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_dp_pd(a, b, imm8) simde_mm_dp_pd(a, b, imm8)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128
+ simde_mm_dp_ps (simde__m128 a, simde__m128 b, const int imm8)
+ HEDLEY_REQUIRE_MSG((imm8 & 0xff) == imm8, "imm8 must be in range [0, 255]") {
+ simde__m128_private
+ r_,
+ a_ = simde__m128_to_private(a),
+ b_ = simde__m128_to_private(b);
+
+ simde_float32 sum = SIMDE_FLOAT32_C(0.0);
+
+ SIMDE__VECTORIZE_REDUCTION(+:sum)
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
+ sum += ((imm8 >> (i + 4)) & 1) ? (a_.f32[i] * b_.f32[i]) : SIMDE_FLOAT32_C(0.0);
+ }
+
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
+ r_.f32[i] = ((imm8 >> i) & 1) ? sum : SIMDE_FLOAT32_C(0.0);
+ }
+
+ return simde__m128_from_private(r_);
+ }
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ # define simde_mm_dp_ps(a, b, imm8) _mm_dp_ps(a, b, imm8)
+ #endif
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_dp_ps(a, b, imm8) simde_mm_dp_ps(a, b, imm8)
+ #endif
+
+ #if defined(simde_mm_extract_epi8)
+ # undef simde_mm_extract_epi8
+ #endif
+ SIMDE__FUNCTION_ATTRIBUTES
+ int8_t
+ simde_mm_extract_epi8 (simde__m128i a, const int imm8)
+ HEDLEY_REQUIRE_MSG((imm8 & 0xf) == imm8, "imm8 must be in range [0, 15]") {
+ simde__m128i_private
+ a_ = simde__m128i_to_private(a);
+
+ return a_.i8[imm8&15];
+ }
+ #if defined(SIMDE_SSE4_1_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8)
+ # define simde_mm_extract_epi8(a, imm8) HEDLEY_STATIC_CAST(int8_t, _mm_extract_epi8(a, imm8))
+ #elif defined(SIMDE_SSE4_1_NEON)
+ # define simde_mm_extract_epi8(a, imm8) vgetq_lane_s8(simde__m128i_to_private(a).neon_i8, imm8)
+ #endif
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_extract_epi8(a, imm8) HEDLEY_STATIC_CAST(int, simde_mm_extract_epi8(a, imm8))
+ #endif
+
+ #if defined(simde_mm_extract_epi32)
+ # undef simde_mm_extract_epi32
+ #endif
+ SIMDE__FUNCTION_ATTRIBUTES
+ int32_t
+ simde_mm_extract_epi32 (simde__m128i a, const int imm8)
+ HEDLEY_REQUIRE_MSG((imm8 & 3) == imm8, "imm8 must be in range [0, 3]") {
+ simde__m128i_private
+ a_ = simde__m128i_to_private(a);
+
+ return a_.i32[imm8&3];
+ }
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ # define simde_mm_extract_epi32(a, imm8) _mm_extract_epi32(a, imm8)
+ #elif defined(SIMDE_SSE4_1_NEON)
+ # define simde_mm_extract_epi32(a, imm8) vgetq_lane_s32(simde__m128i_to_private(a).neon_i32, imm8)
+ #endif
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_extract_epi32(a, imm8) simde_mm_extract_epi32(a, imm8)
+ #endif
+
+ #if defined(simde_mm_extract_epi64)
+ # undef simde_mm_extract_epi64
+ #endif
+ SIMDE__FUNCTION_ATTRIBUTES
+ int64_t
+ simde_mm_extract_epi64 (simde__m128i a, const int imm8)
+ HEDLEY_REQUIRE_MSG((imm8 & 1) == imm8, "imm8 must be 0 or 1") {
+ simde__m128i_private
+ a_ = simde__m128i_to_private(a);
+
+ return a_.i64[imm8&1];
+ }
+ #if defined(SIMDE_SSE4_1_NATIVE) && defined(SIMDE_ARCH_AMD64)
+ # define simde_mm_extract_epi64(a, imm8) _mm_extract_epi64(a, imm8)
+ #elif defined(SIMDE_SSE4_1_NEON)
+ # define simde_mm_extract_epi64(a, imm8) vgetq_lane_s64(simde__m128i_to_private(a).neon_i64, imm8)
+ #endif
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_extract_epi64(a, imm8) simde_mm_extract_epi64(a, imm8)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128d
+ simde_mm_floor_pd (simde__m128d a) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_floor_pd(a);
+ #else
+ simde__m128d_private
+ r_,
+ a_ = simde__m128d_to_private(a);
+
+ #if defined(SIMDE_HAVE_MATH_H)
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
+ r_.f64[i] = floor(a_.f64[i]);
+ }
+ #else
+ HEDLEY_UNREACHABLE();
+ #endif
+
+ return simde__m128d_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_floor_pd(a) simde_mm_floor_pd(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128
+ simde_mm_floor_ps (simde__m128 a) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_floor_ps(a);
+ #else
+ simde__m128_private
+ r_,
+ a_ = simde__m128_to_private(a);
+
+ #if defined(SIMDE_HAVE_MATH_H)
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
+ r_.f32[i] = floorf(a_.f32[i]);
+ }
+ #else
+ HEDLEY_UNREACHABLE();
+ #endif
+
+ return simde__m128_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_floor_ps(a) simde_mm_floor_ps(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128d
+ simde_mm_floor_sd (simde__m128d a, simde__m128d b) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_floor_sd(a, b);
+ #else
+ simde__m128d_private
+ r_,
+ a_ = simde__m128d_to_private(a),
+ b_ = simde__m128d_to_private(b);
+
+ #if defined(SIMDE_HAVE_MATH_H)
+ r_.f64[0] = floor(b_.f64[0]);
+ r_.f64[1] = a_.f64[1];
+ #else
+ HEDLEY_UNREACHABLE();
+ #endif
+
+ return simde__m128d_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_floor_sd(a, b) simde_mm_floor_sd(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128
+ simde_mm_floor_ss (simde__m128 a, simde__m128 b) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_floor_ss(a, b);
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
+ return simde_mm_move_ss(a, simde_mm_floor_ps(b));
+ #else
+ simde__m128_private
+ r_,
+ a_ = simde__m128_to_private(a),
+ b_ = simde__m128_to_private(b);
+
+ #if defined(SIMDE_HAVE_MATH_H)
+ r_.f32[0] = floorf(b_.f32[0]);
+ for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
+ r_.f32[i] = a_.f32[i];
+ }
+ #else
+ HEDLEY_UNREACHABLE();
+ #endif
+
+ return simde__m128_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_floor_ss(a, b) simde_mm_floor_ss(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_insert_epi8 (simde__m128i a, int i, const int imm8)
+ HEDLEY_REQUIRE_MSG((imm8 & 0xf) == imm8, "imm8 must be in range [0, 15]") {
+ simde__m128i_private
+ r_ = simde__m128i_to_private(a);
+
+ r_.i8[imm8] = HEDLEY_STATIC_CAST(int8_t, i);
+
+ return simde__m128i_from_private(r_);
+ }
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ # define simde_mm_insert_epi8(a, i, imm8) _mm_insert_epi8(a, i, imm8)
+ #endif
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_insert_epi8(a, i, imm8) simde_mm_insert_epi8(a, i, imm8)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_insert_epi32 (simde__m128i a, int i, const int imm8)
+ HEDLEY_REQUIRE_MSG((imm8 & 3) == imm8, "imm8 must be in range [0, 3]") {
+ simde__m128i_private
+ r_ = simde__m128i_to_private(a);
+
+ r_.i32[imm8] = HEDLEY_STATIC_CAST(int32_t, i);
+
+ return simde__m128i_from_private(r_);
+ }
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ # define simde_mm_insert_epi32(a, i, imm8) _mm_insert_epi32(a, i, imm8)
+ #endif
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_insert_epi32(a, i, imm8) simde_mm_insert_epi32(a, i, imm8)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_insert_epi64 (simde__m128i a, int64_t i, const int imm8)
+ HEDLEY_REQUIRE_MSG((imm8 & 1) == imm8, "imm8 must be 0 or 1") {
+ #if defined(SIMDE_BUG_GCC_94482)
+ simde__m128i_private
+ a_ = simde__m128i_to_private(a);
+
+ switch(imm8) {
+ case 0:
+ return simde_mm_set_epi64x(a_.i64[1], i);
+ break;
+ case 1:
+ return simde_mm_set_epi64x(i, a_.i64[0]);
+ break;
+ default:
+ HEDLEY_UNREACHABLE();
+ break;
+ }
+ #else
+ simde__m128i_private
+ r_ = simde__m128i_to_private(a);
+
+ r_.i64[imm8] = i;
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_NATIVE) && defined(SIMDE_ARCH_AMD64)
+ # define simde_mm_insert_epi64(a, i, imm8) _mm_insert_epi64(a, i, imm8)
+ #endif
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_insert_epi64(a, i, imm8) simde_mm_insert_epi64(a, i, imm8)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128
+ simde_mm_insert_ps (simde__m128 a, simde__m128 b, const int imm8)
+ HEDLEY_REQUIRE_MSG((imm8 & 0xff) == imm8, "imm8 must be in range [0, 255]") {
+ simde__m128_private
+ r_,
+ a_ = simde__m128_to_private(a),
+ b_ = simde__m128_to_private(b);
+
+ a_.f32[0] = b_.f32[(imm8 >> 6) & 3];
+ a_.f32[(imm8 >> 4) & 3] = a_.f32[0];
+
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
+ r_.f32[i] = (imm8 >> i) ? SIMDE_FLOAT32_C(0.0) : a_.f32[i];
+ }
+
+ return simde__m128_from_private(r_);
+ }
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ # define simde_mm_insert_ps(a, b, imm8) _mm_insert_ps(a, b, imm8)
+ #endif
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_insert_ps(a, b, imm8) simde_mm_insert_ps(a, b, imm8)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_max_epi8 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSE4_1_NATIVE) && !defined(__PGI)
+ return _mm_max_epi8(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ #if defined(SIMDE_SSE4_1_NEON)
+ r_.neon_i8 = vmaxq_s8(a_.neon_i8, b_.neon_i8);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
+ r_.i8[i] = a_.i8[i] > b_.i8[i] ? a_.i8[i] : b_.i8[i];
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_max_epi8(a, b) simde_mm_max_epi8(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_max_epi32 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSE4_1_NATIVE) && !defined(__PGI)
+ return _mm_max_epi32(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ #if defined(SIMDE_SSE4_1_NEON)
+ r_.neon_i32 = vmaxq_s32(a_.neon_i32, b_.neon_i32);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
+ r_.i32[i] = a_.i32[i] > b_.i32[i] ? a_.i32[i] : b_.i32[i];
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_max_epi32(a, b) simde_mm_max_epi32(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_max_epu16 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_max_epu16(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ #if defined(SIMDE_SSE4_1_NEON)
+ r_.neon_u16 = vmaxq_u16(a_.neon_u16, b_.neon_u16);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
+ r_.u16[i] = a_.u16[i] > b_.u16[i] ? a_.u16[i] : b_.u16[i];
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_max_epu16(a, b) simde_mm_max_epu16(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_max_epu32 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_max_epu32(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ #if defined(SIMDE_SSE4_1_NEON)
+ r_.neon_u32 = vmaxq_u32(a_.neon_u32, b_.neon_u32);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
+ r_.u32[i] = a_.u32[i] > b_.u32[i] ? a_.u32[i] : b_.u32[i];
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_max_epu32(a, b) simde_mm_max_epu32(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_min_epi8 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSE4_1_NATIVE) && !defined(__PGI)
+ return _mm_min_epi8(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ #if defined(SIMDE_SSE4_1_NEON)
+ r_.neon_i8 = vminq_s8(a_.neon_i8, b_.neon_i8);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
+ r_.i8[i] = a_.i8[i] < b_.i8[i] ? a_.i8[i] : b_.i8[i];
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_min_epi8(a, b) simde_mm_min_epi8(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_min_epi32 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSE4_1_NATIVE) && !defined(__PGI)
+ return _mm_min_epi32(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ #if defined(SIMDE_SSE4_1_NEON)
+ r_.neon_i32 = vminq_s32(a_.neon_i32, b_.neon_i32);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
+ r_.i32[i] = a_.i32[i] < b_.i32[i] ? a_.i32[i] : b_.i32[i];
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_min_epi32(a, b) simde_mm_min_epi32(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_min_epu16 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_min_epu16(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ #if defined(SIMDE_SSE4_1_NEON)
+ r_.neon_u16 = vminq_u16(a_.neon_u16, b_.neon_u16);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
+ r_.u16[i] = a_.u16[i] < b_.u16[i] ? a_.u16[i] : b_.u16[i];
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_min_epu16(a, b) simde_mm_min_epu16(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_min_epu32 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_min_epu32(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ #if defined(SIMDE_SSE4_1_NEON)
+ r_.neon_u32 = vminq_u32(a_.neon_u32, b_.neon_u32);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
+ r_.u32[i] = a_.u32[i] < b_.u32[i] ? a_.u32[i] : b_.u32[i];
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_min_epu32(a, b) simde_mm_min_epu32(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_minpos_epu16 (simde__m128i a) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_minpos_epu16(a);
+ #else
+ simde__m128i_private
+ r_ = simde__m128i_to_private(simde_mm_setzero_si128()),
+ a_ = simde__m128i_to_private(a);
+
+ r_.u16[0] = UINT16_MAX;
+ for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
+ if (a_.u16[i] < r_.u16[0]) {
+ r_.u16[0] = a_.u16[i];
+ r_.u16[1] = HEDLEY_STATIC_CAST(uint16_t, i);
+ }
+ }
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_minpos_epu16(a) simde_mm_minpos_epu16(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_mpsadbw_epu8 (simde__m128i a, simde__m128i b, const int imm8)
+ HEDLEY_REQUIRE_MSG((imm8 & 7) == imm8, "imm8 must be in range [0, 7]") {
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ const int a_offset = imm8 & 4;
+ const int b_offset = (imm8 & 3) << 2;
+
+ #if defined(SIMDE_HAVE_MATH_H)
+ for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, (sizeof(r_.u16) / sizeof(r_.u16[0]))) ; i++) {
+ r_.u16[i] =
+ HEDLEY_STATIC_CAST(uint16_t, abs(a_.u8[a_offset + i + 0] - b_.u8[b_offset + 0])) +
+ HEDLEY_STATIC_CAST(uint16_t, abs(a_.u8[a_offset + i + 1] - b_.u8[b_offset + 1])) +
+ HEDLEY_STATIC_CAST(uint16_t, abs(a_.u8[a_offset + i + 2] - b_.u8[b_offset + 2])) +
+ HEDLEY_STATIC_CAST(uint16_t, abs(a_.u8[a_offset + i + 3] - b_.u8[b_offset + 3]));
+ }
+ #else
+ HEDLEY_UNREACHABLE();
+ #endif
+
+ return simde__m128i_from_private(r_);
+ }
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ # define simde_mm_mpsadbw_epu8(a, b, imm8) _mm_mpsadbw_epu8(a, b, imm8)
+ #endif
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_mpsadbw_epu8(a, b, imm8) simde_mm_mpsadbw_epu8(a, b, imm8)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_mul_epi32 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_mul_epi32(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
+ r_.i64[i] =
+ HEDLEY_STATIC_CAST(int64_t, a_.i32[i * 2]) *
+ HEDLEY_STATIC_CAST(int64_t, b_.i32[i * 2]);
+ }
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_mul_epi32(a, b) simde_mm_mul_epi32(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_mullo_epi32 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ return _mm_mullo_epi32(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ #if defined(SIMDE_SSE4_1_NEON)
+ r_.neon_i32 = vmulq_s32(a_.neon_i32, b_.neon_i32);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
+ r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (HEDLEY_STATIC_CAST(uint64_t, (HEDLEY_STATIC_CAST(int64_t, a_.i32[i]) * HEDLEY_STATIC_CAST(int64_t, b_.i32[i]))) & 0xffffffff));
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
1441
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
1442
+ # define _mm_mullo_epi32(a, b) simde_mm_mullo_epi32(a, b)
1443
+ #endif
1444
+
1445
+ SIMDE__FUNCTION_ATTRIBUTES
1446
+ simde__m128i
1447
+ simde_mm_packus_epi32 (simde__m128i a, simde__m128i b) {
1448
+ #if defined(SIMDE_SSE4_1_NATIVE)
1449
+ return _mm_packus_epi32(a, b);
1450
+ #else
1451
+ simde__m128i_private
1452
+ r_,
1453
+ a_ = simde__m128i_to_private(a),
1454
+ b_ = simde__m128i_to_private(b);
1455
+
1456
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1457
+ r_.u16[i + 0] = (a_.i32[i] < 0) ? UINT16_C(0) : ((a_.i32[i] > UINT16_MAX) ? (UINT16_MAX) : HEDLEY_STATIC_CAST(uint16_t, a_.i32[i]));
1458
+ r_.u16[i + 4] = (b_.i32[i] < 0) ? UINT16_C(0) : ((b_.i32[i] > UINT16_MAX) ? (UINT16_MAX) : HEDLEY_STATIC_CAST(uint16_t, b_.i32[i]));
1459
+ }
1460
+ return simde__m128i_from_private(r_);
1461
+ #endif
1462
+ }
1463
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
1464
+ # define _mm_packus_epi32(a, b) simde_mm_packus_epi32(a, b)
1465
+ #endif
1466
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128d
+ simde_mm_round_pd (simde__m128d a, int rounding) {
+   simde__m128d_private
+     r_,
+     a_ = simde__m128d_to_private(a);
+
+ #if defined(SIMDE_HAVE_MATH_H)
+   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
+     switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
+       case SIMDE_MM_FROUND_TO_NEAREST_INT:
+         r_.f64[i] = nearbyint(a_.f64[i]);
+         break;
+       case SIMDE_MM_FROUND_TO_NEG_INF:
+         r_.f64[i] = floor(a_.f64[i]);
+         break;
+       case SIMDE_MM_FROUND_TO_POS_INF:
+         r_.f64[i] = ceil(a_.f64[i]);
+         break;
+       case SIMDE_MM_FROUND_TO_ZERO:
+         r_.f64[i] = trunc(a_.f64[i]);
+         break;
+       case SIMDE_MM_FROUND_CUR_DIRECTION:
+         r_.f64[i] = nearbyint(a_.f64[i]);
+         break;
+       default:
+         HEDLEY_UNREACHABLE();
+         break;
+     }
+   }
+ #else
+   HEDLEY_UNREACHABLE();
+ #endif
+
+   return simde__m128d_from_private(r_);
+ }
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ # define simde_mm_round_pd(a, rounding) _mm_round_pd(a, rounding)
+ #endif
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_round_pd(a, rounding) simde_mm_round_pd(a, rounding)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128
+ simde_mm_round_ps (simde__m128 a, int rounding) {
+   simde__m128_private
+     r_,
+     a_ = simde__m128_to_private(a);
+
+ #if defined(SIMDE_HAVE_MATH_H)
+   for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
+     switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
+       case SIMDE_MM_FROUND_TO_NEAREST_INT:
+         r_.f32[i] = nearbyintf(a_.f32[i]);
+         break;
+       case SIMDE_MM_FROUND_TO_NEG_INF:
+         r_.f32[i] = floorf(a_.f32[i]);
+         break;
+       case SIMDE_MM_FROUND_TO_POS_INF:
+         r_.f32[i] = ceilf(a_.f32[i]);
+         break;
+       case SIMDE_MM_FROUND_TO_ZERO:
+         r_.f32[i] = truncf(a_.f32[i]);
+         break;
+       case SIMDE_MM_FROUND_CUR_DIRECTION:
+         r_.f32[i] = nearbyintf(a_.f32[i]);
+         break;
+       default:
+         HEDLEY_UNREACHABLE();
+         break;
+     }
+   }
+ #else
+   HEDLEY_UNREACHABLE();
+ #endif
+
+   return simde__m128_from_private(r_);
+ }
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ # define simde_mm_round_ps(a, rounding) _mm_round_ps(a, rounding)
+ #endif
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_round_ps(a, rounding) simde_mm_round_ps(a, rounding)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128d
+ simde_mm_round_sd (simde__m128d a, simde__m128d b, int rounding) {
+   simde__m128d_private
+     r_ = simde__m128d_to_private(a),
+     b_ = simde__m128d_to_private(b);
+
+ #if defined(SIMDE_HAVE_MATH_H)
+   switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
+     case SIMDE_MM_FROUND_TO_NEAREST_INT:
+       r_.f64[0] = nearbyint(b_.f64[0]);
+       break;
+     case SIMDE_MM_FROUND_TO_NEG_INF:
+       r_.f64[0] = floor(b_.f64[0]);
+       break;
+     case SIMDE_MM_FROUND_TO_POS_INF:
+       r_.f64[0] = ceil(b_.f64[0]);
+       break;
+     case SIMDE_MM_FROUND_TO_ZERO:
+       r_.f64[0] = trunc(b_.f64[0]);
+       break;
+     case SIMDE_MM_FROUND_CUR_DIRECTION:
+       r_.f64[0] = nearbyint(b_.f64[0]);
+       break;
+     default:
+       HEDLEY_UNREACHABLE();
+       break;
+   }
+ #else
+   HEDLEY_UNREACHABLE();
+ #endif
+
+   return simde__m128d_from_private(r_);
+ }
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ # define simde_mm_round_sd(a, b, rounding) _mm_round_sd(a, b, rounding)
+ #endif
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_round_sd(a, b, rounding) simde_mm_round_sd(a, b, rounding)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128
+ simde_mm_round_ss (simde__m128 a, simde__m128 b, int rounding) {
+   simde__m128_private
+     r_ = simde__m128_to_private(a),
+     b_ = simde__m128_to_private(b);
+
+ #if defined(SIMDE_HAVE_MATH_H)
+   switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
+     case SIMDE_MM_FROUND_TO_NEAREST_INT:
+       r_.f32[0] = nearbyintf(b_.f32[0]);
+       break;
+     case SIMDE_MM_FROUND_TO_NEG_INF:
+       r_.f32[0] = floorf(b_.f32[0]);
+       break;
+     case SIMDE_MM_FROUND_TO_POS_INF:
+       r_.f32[0] = ceilf(b_.f32[0]);
+       break;
+     case SIMDE_MM_FROUND_TO_ZERO:
+       r_.f32[0] = truncf(b_.f32[0]);
+       break;
+     case SIMDE_MM_FROUND_CUR_DIRECTION:
+       r_.f32[0] = nearbyintf(b_.f32[0]);
+       break;
+     default:
+       HEDLEY_UNREACHABLE();
+       break;
+   }
+ #else
+   HEDLEY_UNREACHABLE();
+ #endif
+
+   return simde__m128_from_private(r_);
+ }
+ #if defined(SIMDE_SSE4_1_NATIVE)
+ # define simde_mm_round_ss(a, b, rounding) _mm_round_ss(a, b, rounding)
+ #endif
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_round_ss(a, b, rounding) simde_mm_round_ss(a, b, rounding)
+ #endif
+
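The four rounding functions above all dispatch on the low bits of `rounding`. A small sketch of how the SIMDE_MM_FROUND_* selectors map onto the C99 math calls used in the fallback; the include path is assumed as before, and simde_mm_loadu_pd/simde_mm_storeu_pd come from the bundled SSE2 header.

#include "simde/x86/sse4.1.h"   /* assumed include path */

void round_demo(void) {
  double in[2] = { 2.5, -1.5 };
  double out[2];
  simde__m128d x = simde_mm_loadu_pd(in);

  /* Fallback uses nearbyint(): ties-to-even in the default FP environment. */
  simde_mm_storeu_pd(out, simde_mm_round_pd(x, SIMDE_MM_FROUND_TO_NEAREST_INT));
  /* out == { 2.0, -2.0 } */

  /* Fallback uses floor(). */
  simde_mm_storeu_pd(out, simde_mm_round_pd(x, SIMDE_MM_FROUND_TO_NEG_INF));
  /* out == { 2.0, -2.0 } */

  /* Fallback uses trunc(). */
  simde_mm_storeu_pd(out, simde_mm_round_pd(x, SIMDE_MM_FROUND_TO_ZERO));
  /* out == { 2.0, -1.0 } */
}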
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_stream_load_si128 (const simde__m128i* mem_addr) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+   return _mm_stream_load_si128(HEDLEY_CONST_CAST(simde__m128i*, mem_addr));
+ #else
+   return *mem_addr;
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_stream_load_si128(mem_addr) simde_mm_stream_load_si128(mem_addr)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ int
+ simde_mm_test_all_ones (simde__m128i a) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+   return _mm_test_all_ones(a);
+ #else
+   simde__m128i_private a_ = simde__m128i_to_private(a);
+
+   for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
+     if (a_.u64[i] != ~UINT64_C(0))
+       return 0;
+   }
+
+   return 1;
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_test_all_ones(a) simde_mm_test_all_ones(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ int
+ simde_mm_test_all_zeros (simde__m128i a, simde__m128i mask) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+   return _mm_test_all_zeros(a, mask);
+ #else
+   simde__m128i_private
+     a_ = simde__m128i_to_private(a),
+     mask_ = simde__m128i_to_private(mask);
+
+   for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
+     if ((a_.u64[i] & mask_.u64[i]) != 0)
+       return 0;
+   }
+
+   return 1;
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_test_all_zeros(a, mask) simde_mm_test_all_zeros(a, mask)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ int
+ simde_mm_test_mix_ones_zeros (simde__m128i a, simde__m128i mask) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+   return _mm_test_mix_ones_zeros(a, mask);
+ #else
+   simde__m128i_private
+     a_ = simde__m128i_to_private(a),
+     mask_ = simde__m128i_to_private(mask);
+
+   /* ZF and CF are defined over the whole 128-bit value, so accumulate
+    * (a & mask) and (~a & mask) across chunks before testing, rather than
+    * requiring both to be non-zero within the same 64-bit chunk. */
+   uint64_t zf = 0, cf = 0;
+   for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
+     zf |=  a_.u64[i] & mask_.u64[i];
+     cf |= ~a_.u64[i] & mask_.u64[i];
+   }
+
+   return (zf != 0) && (cf != 0);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_test_mix_ones_zeros(a, mask) simde_mm_test_mix_ones_zeros(a, mask)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ int
+ simde_mm_testc_si128 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+   return _mm_testc_si128(a, b);
+ #else
+   simde__m128i_private
+     a_ = simde__m128i_to_private(a),
+     b_ = simde__m128i_to_private(b);
+
+   int_fast32_t r = 0;
+
+   SIMDE__VECTORIZE_REDUCTION(|:r)
+   for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
+     r |= ~a_.i32f[i] & b_.i32f[i];
+   }
+
+   return HEDLEY_STATIC_CAST(int, !r);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_testc_si128(a, b) simde_mm_testc_si128(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ int
+ simde_mm_testnzc_si128 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+   return _mm_testnzc_si128(a, b);
+ #else
+   simde__m128i_private
+     a_ = simde__m128i_to_private(a),
+     b_ = simde__m128i_to_private(b);
+
+   /* Returns 1 when (a & b) and (~a & b) are both non-zero over the full
+    * 128 bits, so accumulate across chunks instead of testing each 64-bit
+    * chunk in isolation. */
+   uint64_t zf = 0, cf = 0;
+   for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
+     zf |=  a_.u64[i] & b_.u64[i];
+     cf |= ~a_.u64[i] & b_.u64[i];
+   }
+
+   return (zf != 0) && (cf != 0);
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_testnzc_si128(a, b) simde_mm_testnzc_si128(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ int
+ simde_mm_testz_si128 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSE4_1_NATIVE)
+   return _mm_testz_si128(a, b);
+ #else
+   simde__m128i_private
+     a_ = simde__m128i_to_private(a),
+     b_ = simde__m128i_to_private(b);
+
+   /* ZF: report 1 only when (a & b) is zero across the whole vector. */
+   for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
+     if ((a_.u64[i] & b_.u64[i]) != 0)
+       return 0;
+   }
+
+   return 1;
+ #endif
+ }
+ #if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES)
+ # define _mm_testz_si128(a, b) simde_mm_testz_si128(a, b)
+ #endif
+
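Finally, a brief sketch of what the three ptest-style predicates report. Illustrative only; simde_mm_set1_epi32, simde_mm_set_epi64x, and simde_mm_setzero_si128 come from the bundled SSE2 header, and the include path is the same assumption as above.

#include "simde/x86/sse4.1.h"   /* assumed include path */

void ptest_demo(void) {
  simde__m128i ones = simde_mm_set1_epi32(-1);        /* all 128 bits set */
  simde__m128i some = simde_mm_set_epi64x(0, 0x0F);   /* only the four low bits set */

  /* testz: 1 iff (a & b) has no bits set anywhere. */
  int z = simde_mm_testz_si128(some, simde_mm_setzero_si128());   /* 1 */

  /* testc: 1 iff b has no bits set outside a, i.e. (~a & b) == 0. */
  int c = simde_mm_testc_si128(ones, some);                        /* 1 */

  /* testnzc: 1 iff (a & b) and (~a & b) are both non-zero. */
  int nzc = simde_mm_testnzc_si128(some, ones);                    /* 1 */

  (void) z; (void) c; (void) nzc;
}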
+ SIMDE__END_DECLS
+
+ HEDLEY_DIAGNOSTIC_POP
+
+ #endif /* !defined(SIMDE__SSE4_1_H) */