minimap2 0.2.25.0 → 0.2.25.1

Files changed (123)
  1. checksums.yaml +4 -4
  2. data/README.md +2 -3
  3. data/ext/minimap2/Makefile +6 -2
  4. data/ext/minimap2/NEWS.md +38 -0
  5. data/ext/minimap2/README.md +9 -3
  6. data/ext/minimap2/align.c +5 -3
  7. data/ext/minimap2/cookbook.md +2 -2
  8. data/ext/minimap2/format.c +7 -4
  9. data/ext/minimap2/kalloc.c +20 -1
  10. data/ext/minimap2/kalloc.h +13 -2
  11. data/ext/minimap2/ksw2.h +1 -0
  12. data/ext/minimap2/ksw2_extd2_sse.c +1 -1
  13. data/ext/minimap2/ksw2_exts2_sse.c +79 -40
  14. data/ext/minimap2/ksw2_extz2_sse.c +1 -1
  15. data/ext/minimap2/lchain.c +15 -16
  16. data/ext/minimap2/lib/simde/CONTRIBUTING.md +114 -0
  17. data/ext/minimap2/lib/simde/COPYING +20 -0
  18. data/ext/minimap2/lib/simde/README.md +333 -0
  19. data/ext/minimap2/lib/simde/amalgamate.py +58 -0
  20. data/ext/minimap2/lib/simde/meson.build +33 -0
  21. data/ext/minimap2/lib/simde/netlify.toml +20 -0
  22. data/ext/minimap2/lib/simde/simde/arm/neon/float32x2.h +140 -0
  23. data/ext/minimap2/lib/simde/simde/arm/neon/float32x4.h +137 -0
  24. data/ext/minimap2/lib/simde/simde/arm/neon/float64x1.h +142 -0
  25. data/ext/minimap2/lib/simde/simde/arm/neon/float64x2.h +145 -0
  26. data/ext/minimap2/lib/simde/simde/arm/neon/int16x4.h +140 -0
  27. data/ext/minimap2/lib/simde/simde/arm/neon/int16x8.h +145 -0
  28. data/ext/minimap2/lib/simde/simde/arm/neon/int32x2.h +140 -0
  29. data/ext/minimap2/lib/simde/simde/arm/neon/int32x4.h +143 -0
  30. data/ext/minimap2/lib/simde/simde/arm/neon/int64x1.h +137 -0
  31. data/ext/minimap2/lib/simde/simde/arm/neon/int64x2.h +141 -0
  32. data/ext/minimap2/lib/simde/simde/arm/neon/int8x16.h +147 -0
  33. data/ext/minimap2/lib/simde/simde/arm/neon/int8x8.h +141 -0
  34. data/ext/minimap2/lib/simde/simde/arm/neon/uint16x4.h +134 -0
  35. data/ext/minimap2/lib/simde/simde/arm/neon/uint16x8.h +138 -0
  36. data/ext/minimap2/lib/simde/simde/arm/neon/uint32x2.h +134 -0
  37. data/ext/minimap2/lib/simde/simde/arm/neon/uint32x4.h +137 -0
  38. data/ext/minimap2/lib/simde/simde/arm/neon/uint64x1.h +131 -0
  39. data/ext/minimap2/lib/simde/simde/arm/neon/uint64x2.h +135 -0
  40. data/ext/minimap2/lib/simde/simde/arm/neon/uint8x16.h +141 -0
  41. data/ext/minimap2/lib/simde/simde/arm/neon/uint8x8.h +135 -0
  42. data/ext/minimap2/lib/simde/simde/arm/neon.h +97 -0
  43. data/ext/minimap2/lib/simde/simde/check.h +267 -0
  44. data/ext/minimap2/lib/simde/simde/debug-trap.h +83 -0
  45. data/ext/minimap2/lib/simde/simde/hedley.h +1899 -0
  46. data/ext/minimap2/lib/simde/simde/simde-arch.h +445 -0
  47. data/ext/minimap2/lib/simde/simde/simde-common.h +697 -0
  48. data/ext/minimap2/lib/simde/simde/x86/avx.h +5385 -0
  49. data/ext/minimap2/lib/simde/simde/x86/avx2.h +2402 -0
  50. data/ext/minimap2/lib/simde/simde/x86/avx512bw.h +391 -0
  51. data/ext/minimap2/lib/simde/simde/x86/avx512f.h +3389 -0
  52. data/ext/minimap2/lib/simde/simde/x86/avx512vl.h +112 -0
  53. data/ext/minimap2/lib/simde/simde/x86/fma.h +659 -0
  54. data/ext/minimap2/lib/simde/simde/x86/mmx.h +2210 -0
  55. data/ext/minimap2/lib/simde/simde/x86/sse.h +3696 -0
  56. data/ext/minimap2/lib/simde/simde/x86/sse2.h +5991 -0
  57. data/ext/minimap2/lib/simde/simde/x86/sse3.h +343 -0
  58. data/ext/minimap2/lib/simde/simde/x86/sse4.1.h +1783 -0
  59. data/ext/minimap2/lib/simde/simde/x86/sse4.2.h +105 -0
  60. data/ext/minimap2/lib/simde/simde/x86/ssse3.h +1053 -0
  61. data/ext/minimap2/lib/simde/simde/x86/svml.h +543 -0
  62. data/ext/minimap2/lib/simde/test/CMakeLists.txt +166 -0
  63. data/ext/minimap2/lib/simde/test/arm/meson.build +4 -0
  64. data/ext/minimap2/lib/simde/test/arm/neon/meson.build +23 -0
  65. data/ext/minimap2/lib/simde/test/arm/neon/skel.c +871 -0
  66. data/ext/minimap2/lib/simde/test/arm/neon/test-neon-internal.h +134 -0
  67. data/ext/minimap2/lib/simde/test/arm/neon/test-neon.c +39 -0
  68. data/ext/minimap2/lib/simde/test/arm/neon/test-neon.h +10 -0
  69. data/ext/minimap2/lib/simde/test/arm/neon/vadd.c +1260 -0
  70. data/ext/minimap2/lib/simde/test/arm/neon/vdup_n.c +873 -0
  71. data/ext/minimap2/lib/simde/test/arm/neon/vmul.c +1084 -0
  72. data/ext/minimap2/lib/simde/test/arm/neon/vsub.c +1260 -0
  73. data/ext/minimap2/lib/simde/test/arm/test-arm-internal.h +18 -0
  74. data/ext/minimap2/lib/simde/test/arm/test-arm.c +20 -0
  75. data/ext/minimap2/lib/simde/test/arm/test-arm.h +8 -0
  76. data/ext/minimap2/lib/simde/test/cmake/AddCompilerFlags.cmake +171 -0
  77. data/ext/minimap2/lib/simde/test/cmake/ExtraWarningFlags.cmake +68 -0
  78. data/ext/minimap2/lib/simde/test/meson.build +64 -0
  79. data/ext/minimap2/lib/simde/test/munit/COPYING +21 -0
  80. data/ext/minimap2/lib/simde/test/munit/Makefile +55 -0
  81. data/ext/minimap2/lib/simde/test/munit/README.md +54 -0
  82. data/ext/minimap2/lib/simde/test/munit/example.c +351 -0
  83. data/ext/minimap2/lib/simde/test/munit/meson.build +37 -0
  84. data/ext/minimap2/lib/simde/test/munit/munit.c +2055 -0
  85. data/ext/minimap2/lib/simde/test/munit/munit.h +535 -0
  86. data/ext/minimap2/lib/simde/test/run-tests.c +20 -0
  87. data/ext/minimap2/lib/simde/test/run-tests.h +260 -0
  88. data/ext/minimap2/lib/simde/test/x86/avx.c +13752 -0
  89. data/ext/minimap2/lib/simde/test/x86/avx2.c +9977 -0
  90. data/ext/minimap2/lib/simde/test/x86/avx512bw.c +2664 -0
  91. data/ext/minimap2/lib/simde/test/x86/avx512f.c +10416 -0
  92. data/ext/minimap2/lib/simde/test/x86/avx512vl.c +210 -0
  93. data/ext/minimap2/lib/simde/test/x86/fma.c +2557 -0
  94. data/ext/minimap2/lib/simde/test/x86/meson.build +33 -0
  95. data/ext/minimap2/lib/simde/test/x86/mmx.c +2878 -0
  96. data/ext/minimap2/lib/simde/test/x86/skel.c +2984 -0
  97. data/ext/minimap2/lib/simde/test/x86/sse.c +5121 -0
  98. data/ext/minimap2/lib/simde/test/x86/sse2.c +9860 -0
  99. data/ext/minimap2/lib/simde/test/x86/sse3.c +486 -0
  100. data/ext/minimap2/lib/simde/test/x86/sse4.1.c +3446 -0
  101. data/ext/minimap2/lib/simde/test/x86/sse4.2.c +101 -0
  102. data/ext/minimap2/lib/simde/test/x86/ssse3.c +2084 -0
  103. data/ext/minimap2/lib/simde/test/x86/svml.c +1545 -0
  104. data/ext/minimap2/lib/simde/test/x86/test-avx.h +16 -0
  105. data/ext/minimap2/lib/simde/test/x86/test-avx512.h +25 -0
  106. data/ext/minimap2/lib/simde/test/x86/test-mmx.h +13 -0
  107. data/ext/minimap2/lib/simde/test/x86/test-sse.h +13 -0
  108. data/ext/minimap2/lib/simde/test/x86/test-sse2.h +13 -0
  109. data/ext/minimap2/lib/simde/test/x86/test-x86-internal.h +196 -0
  110. data/ext/minimap2/lib/simde/test/x86/test-x86.c +48 -0
  111. data/ext/minimap2/lib/simde/test/x86/test-x86.h +8 -0
  112. data/ext/minimap2/main.c +13 -6
  113. data/ext/minimap2/map.c +0 -5
  114. data/ext/minimap2/minimap.h +40 -31
  115. data/ext/minimap2/minimap2.1 +19 -5
  116. data/ext/minimap2/misc/paftools.js +545 -24
  117. data/ext/minimap2/options.c +1 -1
  118. data/ext/minimap2/pyproject.toml +2 -0
  119. data/ext/minimap2/python/mappy.pyx +3 -1
  120. data/ext/minimap2/seed.c +1 -1
  121. data/ext/minimap2/setup.py +32 -22
  122. data/lib/minimap2/version.rb +1 -1
  123. metadata +100 -3
data/ext/minimap2/lib/simde/simde/x86/sse.h
@@ -0,0 +1,3696 @@
+ /* Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2017-2020 Evan Nemerson <evan@nemerson.com>
+ * 2015-2017 John W. Ratcliff <jratcliffscarab@gmail.com>
+ * 2015 Brandon Rowlett <browlett@nvidia.com>
+ * 2015 Ken Fast <kfast@gdeb.com>
+ */
+
+ #if !defined(SIMDE__SSE_H)
+ # if !defined(SIMDE__SSE_H)
+ # define SIMDE__SSE_H
+ # endif
+ # include "mmx.h"
+
+ HEDLEY_DIAGNOSTIC_PUSH
+ SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+
+ # if defined(SIMDE_SSE_NATIVE)
+ # undef SIMDE_SSE_NATIVE
+ # endif
+ # if defined(SIMDE_ARCH_X86_SSE) && !defined(SIMDE_SSE_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
+ # define SIMDE_SSE_NATIVE
+ # elif defined(SIMDE_ARCH_ARM_NEON) && !defined(SIMDE_SSE_NO_NEON) && !defined(SIMDE_NO_NEON)
+ # define SIMDE_SSE_NEON
+ # elif defined(SIMDE_ARCH_WASM_SIMD128)
+ # define SIMDE_SSE_WASM_SIMD128
+ # elif defined(SIMDE_ARCH_POWER_ALTIVEC)
+ # define SIMDE_SSE_POWER_ALTIVEC
+ # endif
+
+ # if defined(SIMDE_SSE_NATIVE)
+ # include <xmmintrin.h>
+ # else
+ # if defined(SIMDE_SSE_NEON)
+ # include <arm_neon.h>
+ # endif
+ # if defined(SIMDE_SSE_WASM_SIMD128)
+ # if !defined(__wasm_unimplemented_simd128__)
+ # define __wasm_unimplemented_simd128__
+ # endif
+ # include <wasm_simd128.h>
+ # endif
+ # if defined(SIMDE_SSE_POWER_ALTIVEC)
+ # include <altivec.h>
+ # endif
+
+ # if !defined(HEDLEY_INTEL_VERSION) && !defined(HEDLEY_EMSCRIPTEN_VERSION) && defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__)
+ # include <stdatomic.h>
+ # elif defined(_WIN32)
+ # include <windows.h>
+ # endif
+ # endif
+
+ SIMDE__BEGIN_DECLS
+
+ typedef union {
+ #if defined(SIMDE_VECTOR_SUBSCRIPT)
+ SIMDE_ALIGN(16) int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
+ SIMDE_ALIGN(16) int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
+ SIMDE_ALIGN(16) int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
+ SIMDE_ALIGN(16) int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
+ SIMDE_ALIGN(16) uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
+ SIMDE_ALIGN(16) uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
+ SIMDE_ALIGN(16) uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
+ SIMDE_ALIGN(16) uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
+ #if defined(SIMDE__HAVE_INT128)
+ SIMDE_ALIGN(16) simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
+ SIMDE_ALIGN(16) simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
+ #endif
+ SIMDE_ALIGN(16) simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
+ SIMDE_ALIGN(16) int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
+ SIMDE_ALIGN(16) uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
+ #else
+ SIMDE_ALIGN(16) int8_t i8[16];
+ SIMDE_ALIGN(16) int16_t i16[8];
+ SIMDE_ALIGN(16) int32_t i32[4];
+ SIMDE_ALIGN(16) int64_t i64[2];
+ SIMDE_ALIGN(16) uint8_t u8[16];
+ SIMDE_ALIGN(16) uint16_t u16[8];
+ SIMDE_ALIGN(16) uint32_t u32[4];
+ SIMDE_ALIGN(16) uint64_t u64[2];
+ #if defined(SIMDE__HAVE_INT128)
+ SIMDE_ALIGN(16) simde_int128 i128[1];
+ SIMDE_ALIGN(16) simde_uint128 u128[1];
+ #endif
+ SIMDE_ALIGN(16) simde_float32 f32[4];
+ SIMDE_ALIGN(16) int_fast32_t i32f[16 / sizeof(int_fast32_t)];
+ SIMDE_ALIGN(16) uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
+ #endif
+
+ SIMDE_ALIGN(16) simde__m64_private m64_private[2];
+ SIMDE_ALIGN(16) simde__m64 m64[2];
+
+ #if defined(SIMDE_SSE_NATIVE)
+ SIMDE_ALIGN(16) __m128 n;
+ #elif defined(SIMDE_SSE_NEON)
+ SIMDE_ALIGN(16) int8x16_t neon_i8;
+ SIMDE_ALIGN(16) int16x8_t neon_i16;
+ SIMDE_ALIGN(16) int32x4_t neon_i32;
+ SIMDE_ALIGN(16) int64x2_t neon_i64;
+ SIMDE_ALIGN(16) uint8x16_t neon_u8;
+ SIMDE_ALIGN(16) uint16x8_t neon_u16;
+ SIMDE_ALIGN(16) uint32x4_t neon_u32;
+ SIMDE_ALIGN(16) uint64x2_t neon_u64;
+ SIMDE_ALIGN(16) float32x4_t neon_f32;
+ #if defined(SIMDE_ARCH_AARCH64)
+ SIMDE_ALIGN(16) float64x2_t neon_f64;
+ #endif
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
+ SIMDE_ALIGN(16) v128_t wasm_v128;
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
+ SIMDE_ALIGN(16) vector unsigned char altivec_u8;
+ SIMDE_ALIGN(16) vector unsigned short altivec_u16;
+ SIMDE_ALIGN(16) vector unsigned int altivec_u32;
+ SIMDE_ALIGN(16) vector unsigned long long altivec_u64;
+ SIMDE_ALIGN(16) vector signed char altivec_i8;
+ SIMDE_ALIGN(16) vector signed short altivec_i16;
+ SIMDE_ALIGN(16) vector signed int altivec_i32;
+ SIMDE_ALIGN(16) vector signed long long altivec_i64;
+ SIMDE_ALIGN(16) vector float altivec_f32;
+ SIMDE_ALIGN(16) vector double altivec_f64;
+ #endif
+ } simde__m128_private;
+
+ #if defined(SIMDE_SSE_NATIVE)
+ typedef __m128 simde__m128;
+ #elif defined(SIMDE_SSE_NEON)
+ typedef float32x4_t simde__m128;
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
+ typedef v128_t simde__m128;
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
+ typedef vector float simde__m128;
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT)
+ typedef simde_float32 simde__m128 SIMDE_ALIGN(16) SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
+ #else
+ typedef simde__m128_private simde__m128;
+ #endif
+
+ #if !defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
+ #define SIMDE_SSE_ENABLE_NATIVE_ALIASES
+ typedef simde__m128 __m128;
+ #endif
+
+ HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128), "simde__m128 size incorrect");
+ HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128_private), "simde__m128_private size incorrect");
+ #if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
+ HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128) == 16, "simde__m128 is not 16-byte aligned");
+ HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128_private) == 16, "simde__m128_private is not 16-byte aligned");
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128
+ simde__m128_from_private(simde__m128_private v) {
+ simde__m128 r;
+ simde_memcpy(&r, &v, sizeof(r));
+ return r;
+ }
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128_private
+ simde__m128_to_private(simde__m128 v) {
+ simde__m128_private r;
+ simde_memcpy(&r, &v, sizeof(r));
+ return r;
+ }
+
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
+ HEDLEY_DIAGNOSTIC_POP
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128
+ simde_mm_set_ps (simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) {
+ #if defined(SIMDE_SSE_NATIVE)
+ return _mm_set_ps(e3, e2, e1, e0);
+ #else
+ simde__m128_private r_;
+
+ #if defined(SIMDE_SSE_NEON)
+ SIMDE_ALIGN(16) simde_float32 data[4] = { e0, e1, e2, e3 };
+ r_.neon_f32 = vld1q_f32(data);
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
+ r_.wasm_v128 = wasm_f32x4_make(e0, e1, e2, e3);
+ #else
+ r_.f32[0] = e0;
+ r_.f32[1] = e1;
+ r_.f32[2] = e2;
+ r_.f32[3] = e3;
+ #endif
+
+ return simde__m128_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
+ # define _mm_set_ps(e3, e2, e1, e0) simde_mm_set_ps(e3, e2, e1, e0)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128
+ simde_mm_set_ps1 (simde_float32 a) {
+ #if defined(SIMDE_SSE_NATIVE)
+ return _mm_set_ps1(a);
+ #elif defined(SIMDE_SSE_NEON)
+ return vdupq_n_f32(a);
+ #else
+ return simde_mm_set_ps(a, a, a, a);
+ #endif
+ }
+ #define simde_mm_set1_ps(a) simde_mm_set_ps1(a)
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
+ # define _mm_set_ps1(a) simde_mm_set_ps1(a)
+ # define _mm_set1_ps(a) simde_mm_set1_ps(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128
+ simde_mm_move_ss (simde__m128 a, simde__m128 b) {
+ #if defined(SIMDE_SSE_NATIVE)
+ return _mm_move_ss(a, b);
+ #else
+ simde__m128_private
+ r_,
+ a_ = simde__m128_to_private(a),
+ b_ = simde__m128_to_private(b);
+
+ #if defined(SIMDE_SSE_NEON)
+ r_.neon_f32 = vsetq_lane_f32(vgetq_lane_f32(b_.neon_f32, 0), a_.neon_f32, 0);
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
+ vector unsigned char m = {
+ 16, 17, 18, 19,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, 14, 15
+ };
+ r_.altivec_f32 = vec_perm(a_.altivec_f32, b_.altivec_f32, m);
+ #elif defined(SIMDE__SHUFFLE_VECTOR)
+ r_.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a_.f32, b_.f32, 4, 1, 2, 3);
+ #else
+ r_.f32[0] = b_.f32[0];
+ r_.f32[1] = a_.f32[1];
+ r_.f32[2] = a_.f32[2];
+ r_.f32[3] = a_.f32[3];
+ #endif
+
+ return simde__m128_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
+ # define _mm_move_ss(a, b) simde_mm_move_ss((a), (b))
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128
+ simde_mm_add_ps (simde__m128 a, simde__m128 b) {
+ #if defined(SIMDE_SSE_NATIVE)
+ return _mm_add_ps(a, b);
+ #else
+ simde__m128_private
+ r_,
+ a_ = simde__m128_to_private(a),
+ b_ = simde__m128_to_private(b);
+
+ #if defined(SIMDE_SSE_NEON)
+ r_.neon_f32 = vaddq_f32(a_.neon_f32, b_.neon_f32);
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
+ r_.wasm_v128 = wasm_f32x4_add(a_.wasm_v128, b_.wasm_v128);
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
+ r_.altivec_f32 = vec_add(a_.altivec_f32, b_.altivec_f32);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ r_.f32 = a_.f32 + b_.f32;
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
+ r_.f32[i] = a_.f32[i] + b_.f32[i];
+ }
+ #endif
+
+ return simde__m128_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
+ # define _mm_add_ps(a, b) simde_mm_add_ps((a), (b))
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128
+ simde_mm_add_ss (simde__m128 a, simde__m128 b) {
+ #if defined(SIMDE_SSE_NATIVE)
+ return _mm_add_ss(a, b);
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
+ return simde_mm_move_ss(a, simde_mm_add_ps(a, b));
+ #else
+ simde__m128_private
+ r_,
+ a_ = simde__m128_to_private(a),
+ b_ = simde__m128_to_private(b);
+
+ r_.f32[0] = a_.f32[0] + b_.f32[0];
+ r_.f32[1] = a_.f32[1];
+ r_.f32[2] = a_.f32[2];
+ r_.f32[3] = a_.f32[3];
+
+ return simde__m128_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
+ # define _mm_add_ss(a, b) simde_mm_add_ss((a), (b))
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128
+ simde_mm_and_ps (simde__m128 a, simde__m128 b) {
+ #if defined(SIMDE_SSE_NATIVE)
+ return _mm_and_ps(a, b);
+ #else
+ simde__m128_private
+ r_,
+ a_ = simde__m128_to_private(a),
+ b_ = simde__m128_to_private(b);
+
+ #if defined(SIMDE_SSE_NEON)
+ r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32);
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
+ r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ r_.i32 = a_.i32 & b_.i32;
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
+ r_.altivec_f32 = vec_and(a_.altivec_f32, b_.altivec_f32);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
+ r_.i32[i] = a_.i32[i] & b_.i32[i];
+ }
+ #endif
+
+ return simde__m128_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
+ # define _mm_and_ps(a, b) simde_mm_and_ps((a), (b))
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128
+ simde_mm_andnot_ps (simde__m128 a, simde__m128 b) {
+ #if defined(SIMDE_SSE_NATIVE)
+ return _mm_andnot_ps(a, b);
+ #else
+ simde__m128_private
+ r_,
+ a_ = simde__m128_to_private(a),
+ b_ = simde__m128_to_private(b);
+
+ #if defined(SIMDE_SSE_NEON)
+ r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
+ r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128);
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
+ r_.altivec_f32 = vec_andc(b_.altivec_f32, a_.altivec_f32);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ r_.i32 = ~a_.i32 & b_.i32;
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
+ r_.i32[i] = ~(a_.i32[i]) & b_.i32[i];
+ }
+ #endif
+
+ return simde__m128_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
+ # define _mm_andnot_ps(a, b) simde_mm_andnot_ps((a), (b))
+ #endif
394
+
395
+ SIMDE__FUNCTION_ATTRIBUTES
396
+ simde__m64
397
+ simde_mm_avg_pu16 (simde__m64 a, simde__m64 b) {
398
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
399
+ return _mm_avg_pu16(a, b);
400
+ #else
401
+ simde__m64_private
402
+ r_,
403
+ a_ = simde__m64_to_private(a),
404
+ b_ = simde__m64_to_private(b);
405
+
406
+ #if defined(SIMDE_SSE_NEON)
407
+ r_.neon_u16 = vrhadd_u16(b_.neon_u16, a_.neon_u16);
408
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE__CONVERT_VECTOR)
409
+ uint32_t wa SIMDE_VECTOR(16);
410
+ uint32_t wb SIMDE_VECTOR(16);
411
+ uint32_t wr SIMDE_VECTOR(16);
412
+ SIMDE__CONVERT_VECTOR(wa, a_.u16);
413
+ SIMDE__CONVERT_VECTOR(wb, b_.u16);
414
+ wr = (wa + wb + 1) >> 1;
415
+ SIMDE__CONVERT_VECTOR(r_.u16, wr);
416
+ #else
417
+ SIMDE__VECTORIZE
418
+ for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
419
+ r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
420
+ }
421
+ #endif
422
+
423
+ return simde__m64_from_private(r_);
424
+ #endif
425
+ }
426
+ #define simde_m_pavgw(a, b) simde_mm_avg_pu16(a, b)
427
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
428
+ # define _mm_avg_pu16(a, b) simde_mm_avg_pu16(a, b)
429
+ # define _m_pavgw(a, b) simde_mm_avg_pu16(a, b)
430
+ #endif
431
+
432
+ SIMDE__FUNCTION_ATTRIBUTES
433
+ simde__m64
434
+ simde_mm_avg_pu8 (simde__m64 a, simde__m64 b) {
435
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
436
+ return _mm_avg_pu8(a, b);
437
+ #else
438
+ simde__m64_private
439
+ r_,
440
+ a_ = simde__m64_to_private(a),
441
+ b_ = simde__m64_to_private(b);
442
+
443
+ #if defined(SIMDE_SSE_NEON)
444
+ r_.neon_u8 = vrhadd_u8(b_.neon_u8, a_.neon_u8);
445
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE__CONVERT_VECTOR)
446
+ uint16_t wa SIMDE_VECTOR(16);
447
+ uint16_t wb SIMDE_VECTOR(16);
448
+ uint16_t wr SIMDE_VECTOR(16);
449
+ SIMDE__CONVERT_VECTOR(wa, a_.u8);
450
+ SIMDE__CONVERT_VECTOR(wb, b_.u8);
451
+ wr = (wa + wb + 1) >> 1;
452
+ SIMDE__CONVERT_VECTOR(r_.u8, wr);
453
+ #else
454
+ SIMDE__VECTORIZE
455
+ for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
456
+ r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
457
+ }
458
+ #endif
459
+
460
+ return simde__m64_from_private(r_);
461
+ #endif
462
+ }
463
+ #define simde_m_pavgb(a, b) simde_mm_avg_pu8(a, b)
464
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
465
+ # define _mm_avg_pu8(a, b) simde_mm_avg_pu8(a, b)
466
+ # define _m_pavgb(a, b) simde_mm_avg_pu8(a, b)
467
+ #endif
468
+
469
+ SIMDE__FUNCTION_ATTRIBUTES
470
+ simde__m128
471
+ simde_mm_cmpeq_ps (simde__m128 a, simde__m128 b) {
472
+ #if defined(SIMDE_SSE_NATIVE)
473
+ return _mm_cmpeq_ps(a, b);
474
+ #else
475
+ simde__m128_private
476
+ r_,
477
+ a_ = simde__m128_to_private(a),
478
+ b_ = simde__m128_to_private(b);
479
+
480
+ #if defined(SIMDE_SSE_NEON)
481
+ r_.neon_u32 = vceqq_f32(a_.neon_f32, b_.neon_f32);
482
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
483
+ r_.wasm_v128 = wasm_f32x4_eq(a_.wasm_v128, b_.wasm_v128);
484
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
485
+ r_.altivec_f32 = (vector float) vec_cmpeq(a_.altivec_f32, b_.altivec_f32);
486
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
487
+ r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), a_.f32 == b_.f32);
488
+ #else
489
+ SIMDE__VECTORIZE
490
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
491
+ r_.u32[i] = (a_.f32[i] == b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
492
+ }
493
+ #endif
494
+
495
+ return simde__m128_from_private(r_);
496
+ #endif
497
+ }
498
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
499
+ # define _mm_cmpeq_ps(a, b) simde_mm_cmpeq_ps((a), (b))
500
+ #endif
501
+
502
+ SIMDE__FUNCTION_ATTRIBUTES
503
+ simde__m128
504
+ simde_mm_cmpeq_ss (simde__m128 a, simde__m128 b) {
505
+ #if defined(SIMDE_SSE_NATIVE)
506
+ return _mm_cmpeq_ss(a, b);
507
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
508
+ return simde_mm_move_ss(a, simde_mm_cmpeq_ps(a, b));
509
+ #else
510
+ simde__m128_private
511
+ r_,
512
+ a_ = simde__m128_to_private(a),
513
+ b_ = simde__m128_to_private(b);
514
+
515
+ r_.u32[0] = (a_.f32[0] == b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
516
+ SIMDE__VECTORIZE
517
+ for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
518
+ r_.u32[i] = a_.u32[i];
519
+ }
520
+
521
+ return simde__m128_from_private(r_);
522
+ #endif
523
+ }
524
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
525
+ # define _mm_cmpeq_ss(a, b) simde_mm_cmpeq_ss((a), (b))
526
+ #endif
527
+
528
+ SIMDE__FUNCTION_ATTRIBUTES
529
+ simde__m128
530
+ simde_mm_cmpge_ps (simde__m128 a, simde__m128 b) {
531
+ #if defined(SIMDE_SSE_NATIVE)
532
+ return _mm_cmpge_ps(a, b);
533
+ #else
534
+ simde__m128_private
535
+ r_,
536
+ a_ = simde__m128_to_private(a),
537
+ b_ = simde__m128_to_private(b);
538
+
539
+ #if defined(SIMDE_SSE_NEON)
540
+ r_.neon_u32 = vcgeq_f32(a_.neon_f32, b_.neon_f32);
541
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
542
+ r_.wasm_v128 = wasm_f32x4_ge(a_.wasm_v128, b_.wasm_v128);
543
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
544
+ r_.altivec_f32 = (vector float) vec_cmpge(a_.altivec_f32, b_.altivec_f32);
545
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
546
+ r_.i32 = (__typeof__(r_.i32)) (a_.f32 >= b_.f32);
547
+ #else
548
+ SIMDE__VECTORIZE
549
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
550
+ r_.u32[i] = (a_.f32[i] >= b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
551
+ }
552
+ #endif
553
+
554
+ return simde__m128_from_private(r_);
555
+ #endif
556
+ }
557
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
558
+ # define _mm_cmpge_ps(a, b) simde_mm_cmpge_ps((a), (b))
559
+ #endif
560
+
561
+ SIMDE__FUNCTION_ATTRIBUTES
562
+ simde__m128
563
+ simde_mm_cmpge_ss (simde__m128 a, simde__m128 b) {
564
+ #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
565
+ return _mm_cmpge_ss(a, b);
566
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
567
+ return simde_mm_move_ss(a, simde_mm_cmpge_ps(a, b));
568
+ #else
569
+ simde__m128_private
570
+ r_,
571
+ a_ = simde__m128_to_private(a),
572
+ b_ = simde__m128_to_private(b);
573
+
574
+ r_.u32[0] = (a_.f32[0] >= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
575
+ SIMDE__VECTORIZE
576
+ for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
577
+ r_.u32[i] = a_.u32[i];
578
+ }
579
+
580
+ return simde__m128_from_private(r_);
581
+ #endif
582
+ }
583
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
584
+ # define _mm_cmpge_ss(a, b) simde_mm_cmpge_ss((a), (b))
585
+ #endif
586
+
587
+ SIMDE__FUNCTION_ATTRIBUTES
588
+ simde__m128
589
+ simde_mm_cmpgt_ps (simde__m128 a, simde__m128 b) {
590
+ #if defined(SIMDE_SSE_NATIVE)
591
+ return _mm_cmpgt_ps(a, b);
592
+ #else
593
+ simde__m128_private
594
+ r_,
595
+ a_ = simde__m128_to_private(a),
596
+ b_ = simde__m128_to_private(b);
597
+
598
+ #if defined(SIMDE_SSE_NEON)
599
+ r_.neon_u32 = vcgtq_f32(a_.neon_f32, b_.neon_f32);
600
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
601
+ r_.wasm_v128 = wasm_f32x4_gt(a_.wasm_v128, b_.wasm_v128);
602
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
603
+ r_.altivec_f32 = (vector float) vec_cmpgt(a_.altivec_f32, b_.altivec_f32);
604
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
605
+ r_.i32 = (__typeof__(r_.i32)) (a_.f32 > b_.f32);
606
+ #else
607
+ SIMDE__VECTORIZE
608
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
609
+ r_.u32[i] = (a_.f32[i] > b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
610
+ }
611
+ #endif
612
+
613
+ return simde__m128_from_private(r_);
614
+ #endif
615
+ }
616
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
617
+ # define _mm_cmpgt_ps(a, b) simde_mm_cmpgt_ps((a), (b))
618
+ #endif
619
+
620
+ SIMDE__FUNCTION_ATTRIBUTES
621
+ simde__m128
622
+ simde_mm_cmpgt_ss (simde__m128 a, simde__m128 b) {
623
+ #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
624
+ return _mm_cmpgt_ss(a, b);
625
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
626
+ return simde_mm_move_ss(a, simde_mm_cmpgt_ps(a, b));
627
+ #else
628
+ simde__m128_private
629
+ r_,
630
+ a_ = simde__m128_to_private(a),
631
+ b_ = simde__m128_to_private(b);
632
+
633
+ r_.u32[0] = (a_.f32[0] > b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
634
+ SIMDE__VECTORIZE
635
+ for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
636
+ r_.u32[i] = a_.u32[i];
637
+ }
638
+
639
+ return simde__m128_from_private(r_);
640
+ #endif
641
+ }
642
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
643
+ # define _mm_cmpgt_ss(a, b) simde_mm_cmpgt_ss((a), (b))
644
+ #endif
645
+
646
+ SIMDE__FUNCTION_ATTRIBUTES
647
+ simde__m128
648
+ simde_mm_cmple_ps (simde__m128 a, simde__m128 b) {
649
+ #if defined(SIMDE_SSE_NATIVE)
650
+ return _mm_cmple_ps(a, b);
651
+ #else
652
+ simde__m128_private
653
+ r_,
654
+ a_ = simde__m128_to_private(a),
655
+ b_ = simde__m128_to_private(b);
656
+
657
+ #if defined(SIMDE_SSE_NEON)
658
+ r_.neon_u32 = vcleq_f32(a_.neon_f32, b_.neon_f32);
659
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
660
+ r_.wasm_v128 = wasm_f32x4_le(a_.wasm_v128, b_.wasm_v128);
661
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
662
+ r_.altivec_f32 = (vector float) vec_cmple(a_.altivec_f32, b_.altivec_f32);
663
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
664
+ r_.i32 = (__typeof__(r_.i32)) (a_.f32 <= b_.f32);
665
+ #else
666
+ SIMDE__VECTORIZE
667
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
668
+ r_.u32[i] = (a_.f32[i] <= b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
669
+ }
670
+ #endif
671
+
672
+ return simde__m128_from_private(r_);
673
+ #endif
674
+ }
675
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
676
+ # define _mm_cmple_ps(a, b) simde_mm_cmple_ps((a), (b))
677
+ #endif
678
+
679
+ SIMDE__FUNCTION_ATTRIBUTES
680
+ simde__m128
681
+ simde_mm_cmple_ss (simde__m128 a, simde__m128 b) {
682
+ #if defined(SIMDE_SSE_NATIVE)
683
+ return _mm_cmple_ss(a, b);
684
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
685
+ return simde_mm_move_ss(a, simde_mm_cmple_ps(a, b));
686
+ #else
687
+ simde__m128_private
688
+ r_,
689
+ a_ = simde__m128_to_private(a),
690
+ b_ = simde__m128_to_private(b);
691
+
692
+ r_.u32[0] = (a_.f32[0] <= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
693
+ SIMDE__VECTORIZE
694
+ for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
695
+ r_.u32[i] = a_.u32[i];
696
+ }
697
+
698
+ return simde__m128_from_private(r_);
699
+ #endif
700
+ }
701
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
702
+ # define _mm_cmple_ss(a, b) simde_mm_cmple_ss((a), (b))
703
+ #endif
704
+
705
+ SIMDE__FUNCTION_ATTRIBUTES
706
+ simde__m128
707
+ simde_mm_cmplt_ps (simde__m128 a, simde__m128 b) {
708
+ #if defined(SIMDE_SSE_NATIVE)
709
+ return _mm_cmplt_ps(a, b);
710
+ #else
711
+ simde__m128_private
712
+ r_,
713
+ a_ = simde__m128_to_private(a),
714
+ b_ = simde__m128_to_private(b);
715
+
716
+ #if defined(SIMDE_SSE_NEON)
717
+ r_.neon_u32 = vcltq_f32(a_.neon_f32, b_.neon_f32);
718
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
719
+ r_.wasm_v128 = wasm_f32x4_lt(a_.wasm_v128, b_.wasm_v128);
720
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
721
+ r_.altivec_f32 = (vector float) vec_cmplt(a_.altivec_f32, b_.altivec_f32);
722
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
723
+ r_.i32 = (__typeof__(r_.i32)) (a_.f32 < b_.f32);
724
+ #else
725
+ SIMDE__VECTORIZE
726
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
727
+ r_.u32[i] = (a_.f32[i] < b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
728
+ }
729
+ #endif
730
+
731
+ return simde__m128_from_private(r_);
732
+ #endif
733
+ }
734
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
735
+ # define _mm_cmplt_ps(a, b) simde_mm_cmplt_ps((a), (b))
736
+ #endif
737
+
738
+ SIMDE__FUNCTION_ATTRIBUTES
739
+ simde__m128
740
+ simde_mm_cmplt_ss (simde__m128 a, simde__m128 b) {
741
+ #if defined(SIMDE_SSE_NATIVE)
742
+ return _mm_cmplt_ss(a, b);
743
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
744
+ return simde_mm_move_ss(a, simde_mm_cmplt_ps(a, b));
745
+ #else
746
+ simde__m128_private
747
+ r_,
748
+ a_ = simde__m128_to_private(a),
749
+ b_ = simde__m128_to_private(b);
750
+
751
+ r_.u32[0] = (a_.f32[0] < b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
752
+ SIMDE__VECTORIZE
753
+ for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
754
+ r_.u32[i] = a_.u32[i];
755
+ }
756
+
757
+ return simde__m128_from_private(r_);
758
+ #endif
759
+ }
760
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
761
+ # define _mm_cmplt_ss(a, b) simde_mm_cmplt_ss((a), (b))
762
+ #endif
763
+
764
+ SIMDE__FUNCTION_ATTRIBUTES
765
+ simde__m128
766
+ simde_mm_cmpneq_ps (simde__m128 a, simde__m128 b) {
767
+ #if defined(SIMDE_SSE_NATIVE)
768
+ return _mm_cmpneq_ps(a, b);
769
+ #else
770
+ simde__m128_private
771
+ r_,
772
+ a_ = simde__m128_to_private(a),
773
+ b_ = simde__m128_to_private(b);
774
+
775
+ #if defined(SIMDE_SSE_NEON)
776
+ r_.neon_u32 = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32));
777
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
778
+ r_.wasm_v128 = wasm_f32x4_ne(a_.wasm_v128, b_.wasm_v128);
779
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC) && (SIMDE_ARCH_POWER >= 900) && !defined(HEDLEY_IBM_VERSION)
780
+ /* vec_cmpne(vector float, vector float) is missing from XL C/C++ v16.1.1,
781
+ though the documentation (table 89 on page 432 of the IBM XL C/C++ for
782
+ Linux Compiler Reference, Version 16.1.1) shows that it should be
783
+ present. Both GCC and clang support it. */
784
+ r_.altivec_f32 = (vector float) vec_cmpne(a_.altivec_f32, b_.altivec_f32);
785
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
786
+ r_.i32 = (__typeof__(r_.i32)) (a_.f32 != b_.f32);
787
+ #else
788
+ SIMDE__VECTORIZE
789
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
790
+ r_.u32[i] = (a_.f32[i] != b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
791
+ }
792
+ #endif
793
+
794
+ return simde__m128_from_private(r_);
795
+ #endif
796
+ }
797
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
798
+ # define _mm_cmpneq_ps(a, b) simde_mm_cmpneq_ps((a), (b))
799
+ #endif
800
+
801
+ SIMDE__FUNCTION_ATTRIBUTES
802
+ simde__m128
803
+ simde_mm_cmpneq_ss (simde__m128 a, simde__m128 b) {
804
+ #if defined(SIMDE_SSE_NATIVE)
805
+ return _mm_cmpneq_ss(a, b);
806
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
807
+ return simde_mm_move_ss(a, simde_mm_cmpneq_ps(a, b));
808
+ #else
809
+ simde__m128_private
810
+ r_,
811
+ a_ = simde__m128_to_private(a),
812
+ b_ = simde__m128_to_private(b);
813
+
814
+ r_.u32[0] = (a_.f32[0] != b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
815
+ SIMDE__VECTORIZE
816
+ for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
817
+ r_.u32[i] = a_.u32[i];
818
+ }
819
+
820
+ return simde__m128_from_private(r_);
821
+ #endif
822
+ }
823
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
824
+ # define _mm_cmpneq_ss(a, b) simde_mm_cmpneq_ss((a), (b))
825
+ #endif
826
+
827
+ SIMDE__FUNCTION_ATTRIBUTES
828
+ simde__m128
829
+ simde_mm_cmpnge_ps (simde__m128 a, simde__m128 b) {
830
+ return simde_mm_cmplt_ps(a, b);
831
+ }
832
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
833
+ # define _mm_cmpnge_ps(a, b) simde_mm_cmpnge_ps((a), (b))
834
+ #endif
835
+
836
+ SIMDE__FUNCTION_ATTRIBUTES
837
+ simde__m128
838
+ simde_mm_cmpnge_ss (simde__m128 a, simde__m128 b) {
839
+ return simde_mm_cmplt_ss(a, b);
840
+ }
841
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
842
+ # define _mm_cmpnge_ss(a, b) simde_mm_cmpnge_ss((a), (b))
843
+ #endif
844
+
845
+ SIMDE__FUNCTION_ATTRIBUTES
846
+ simde__m128
847
+ simde_mm_cmpngt_ps (simde__m128 a, simde__m128 b) {
848
+ return simde_mm_cmple_ps(a, b);
849
+ }
850
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
851
+ # define _mm_cmpngt_ps(a, b) simde_mm_cmpngt_ps((a), (b))
852
+ #endif
853
+
854
+ SIMDE__FUNCTION_ATTRIBUTES
855
+ simde__m128
856
+ simde_mm_cmpngt_ss (simde__m128 a, simde__m128 b) {
857
+ return simde_mm_cmple_ss(a, b);
858
+ }
859
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
860
+ # define _mm_cmpngt_ss(a, b) simde_mm_cmpngt_ss((a), (b))
861
+ #endif
862
+
863
+ SIMDE__FUNCTION_ATTRIBUTES
864
+ simde__m128
865
+ simde_mm_cmpnle_ps (simde__m128 a, simde__m128 b) {
866
+ return simde_mm_cmpgt_ps(a, b);
867
+ }
868
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
869
+ # define _mm_cmpnle_ps(a, b) simde_mm_cmpnle_ps((a), (b))
870
+ #endif
871
+
872
+ SIMDE__FUNCTION_ATTRIBUTES
873
+ simde__m128
874
+ simde_mm_cmpnle_ss (simde__m128 a, simde__m128 b) {
875
+ return simde_mm_cmpgt_ss(a, b);
876
+ }
877
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
878
+ # define _mm_cmpnle_ss(a, b) simde_mm_cmpnle_ss((a), (b))
879
+ #endif
880
+
881
+ SIMDE__FUNCTION_ATTRIBUTES
882
+ simde__m128
883
+ simde_mm_cmpnlt_ps (simde__m128 a, simde__m128 b) {
884
+ return simde_mm_cmpge_ps(a, b);
885
+ }
886
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
887
+ # define _mm_cmpnlt_ps(a, b) simde_mm_cmpnlt_ps((a), (b))
888
+ #endif
889
+
890
+ SIMDE__FUNCTION_ATTRIBUTES
891
+ simde__m128
892
+ simde_mm_cmpnlt_ss (simde__m128 a, simde__m128 b) {
893
+ return simde_mm_cmpge_ss(a, b);
894
+ }
895
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
896
+ # define _mm_cmpnlt_ss(a, b) simde_mm_cmpnlt_ss((a), (b))
897
+ #endif
898
+
899
+ SIMDE__FUNCTION_ATTRIBUTES
900
+ simde__m128
901
+ simde_mm_cmpord_ps (simde__m128 a, simde__m128 b) {
902
+ #if defined(SIMDE_SSE_NATIVE)
903
+ return _mm_cmpord_ps(a, b);
904
+ #else
905
+ simde__m128_private
906
+ r_,
907
+ a_ = simde__m128_to_private(a),
908
+ b_ = simde__m128_to_private(b);
909
+
910
+ #if defined(SIMDE_SSE_NEON)
911
+ /* Note: NEON does not have ordered compare builtin
912
+ Need to compare a eq a and b eq b to check for NaN
913
+ Do AND of results to get final */
914
+ uint32x4_t ceqaa = vceqq_f32(a_.neon_f32, a_.neon_f32);
915
+ uint32x4_t ceqbb = vceqq_f32(b_.neon_f32, b_.neon_f32);
916
+ r_.neon_u32 = vandq_u32(ceqaa, ceqbb);
917
+ #elif defined(simde_isnanf)
918
+ SIMDE__VECTORIZE
919
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
920
+ r_.u32[i] = (simde_isnanf(a_.f32[i]) || simde_isnanf(b_.f32[i])) ? UINT32_C(0) : ~UINT32_C(0);
921
+ }
922
+ #else
923
+ HEDLEY_UNREACHABLE();
924
+ #endif
925
+
926
+ return simde__m128_from_private(r_);
927
+ #endif
928
+ }
929
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
930
+ # define _mm_cmpord_ps(a, b) simde_mm_cmpord_ps((a), (b))
931
+ #endif
932
+
933
+ SIMDE__FUNCTION_ATTRIBUTES
934
+ simde__m128
935
+ simde_mm_cmpunord_ps (simde__m128 a, simde__m128 b) {
936
+ #if defined(SIMDE_SSE_NATIVE)
937
+ return _mm_cmpunord_ps(a, b);
938
+ #else
939
+ simde__m128_private
940
+ r_,
941
+ a_ = simde__m128_to_private(a),
942
+ b_ = simde__m128_to_private(b);
943
+
944
+ #if defined(simde_isnanf)
945
+ SIMDE__VECTORIZE
946
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
947
+ r_.u32[i] = (simde_isnanf(a_.f32[i]) || simde_isnanf(b_.f32[i])) ? ~UINT32_C(0) : UINT32_C(0);
948
+ }
949
+ #else
950
+ HEDLEY_UNREACHABLE();
951
+ #endif
952
+
953
+ return simde__m128_from_private(r_);
954
+ #endif
955
+ }
956
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
957
+ # define _mm_cmpunord_ps(a, b) simde_mm_cmpunord_ps((a), (b))
958
+ #endif
959
+
960
+ SIMDE__FUNCTION_ATTRIBUTES
961
+ simde__m128
962
+ simde_mm_cmpunord_ss (simde__m128 a, simde__m128 b) {
963
+ #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
964
+ return _mm_cmpunord_ss(a, b);
965
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
966
+ return simde_mm_move_ss(a, simde_mm_cmpunord_ps(a, b));
967
+ #else
968
+ simde__m128_private
969
+ r_,
970
+ a_ = simde__m128_to_private(a),
971
+ b_ = simde__m128_to_private(b);
972
+
973
+ #if defined(simde_isnanf)
974
+ r_.u32[0] = (simde_isnanf(a_.f32[0]) || simde_isnanf(b_.f32[0])) ? ~UINT32_C(0) : UINT32_C(0);
975
+ SIMDE__VECTORIZE
976
+ for (size_t i = 1 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
977
+ r_.u32[i] = a_.u32[i];
978
+ }
979
+ #else
980
+ HEDLEY_UNREACHABLE();
981
+ #endif
982
+
983
+ return simde__m128_from_private(r_);
984
+ #endif
985
+ }
986
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
987
+ # define _mm_cmpunord_ss(a, b) simde_mm_cmpunord_ss((a), (b))
988
+ #endif
989
+
990
+ SIMDE__FUNCTION_ATTRIBUTES
991
+ int
992
+ simde_mm_comieq_ss (simde__m128 a, simde__m128 b) {
993
+ #if defined(SIMDE_SSE_NATIVE)
994
+ return _mm_comieq_ss(a, b);
995
+ #else
996
+ simde__m128_private
997
+ a_ = simde__m128_to_private(a),
998
+ b_ = simde__m128_to_private(b);
999
+
1000
+ #if defined(SIMDE_SSE_NEON)
1001
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1002
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1003
+ uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
1004
+ uint32x4_t a_eq_b = vceqq_f32(a_.neon_f32, b_.neon_f32);
1005
+ return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0);
1006
+ #else
1007
+ return a_.f32[0] == b_.f32[0];
1008
+ #endif
1009
+ #endif
1010
+ }
1011
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1012
+ # define _mm_comieq_ss(a, b) simde_mm_comieq_ss((a), (b))
1013
+ #endif
1014
+
1015
+ SIMDE__FUNCTION_ATTRIBUTES
1016
+ int
1017
+ simde_mm_comige_ss (simde__m128 a, simde__m128 b) {
1018
+ #if defined(SIMDE_SSE_NATIVE)
1019
+ return _mm_comige_ss(a, b);
1020
+ #else
1021
+ simde__m128_private
1022
+ a_ = simde__m128_to_private(a),
1023
+ b_ = simde__m128_to_private(b);
1024
+
1025
+ #if defined(SIMDE_SSE_NEON)
1026
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1027
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1028
+ uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1029
+ uint32x4_t a_ge_b = vcgeq_f32(a_.neon_f32, b_.neon_f32);
1030
+ return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0);
1031
+ #else
1032
+ return a_.f32[0] >= b_.f32[0];
1033
+ #endif
1034
+ #endif
1035
+ }
1036
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1037
+ # define _mm_comige_ss(a, b) simde_mm_comige_ss((a), (b))
1038
+ #endif
1039
+
1040
+ SIMDE__FUNCTION_ATTRIBUTES
1041
+ int
1042
+ simde_mm_comigt_ss (simde__m128 a, simde__m128 b) {
1043
+ #if defined(SIMDE_SSE_NATIVE)
1044
+ return _mm_comigt_ss(a, b);
1045
+ #else
1046
+ simde__m128_private
1047
+ a_ = simde__m128_to_private(a),
1048
+ b_ = simde__m128_to_private(b);
1049
+
1050
+ #if defined(SIMDE_SSE_NEON)
1051
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1052
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1053
+ uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1054
+ uint32x4_t a_gt_b = vcgtq_f32(a_.neon_f32, b_.neon_f32);
1055
+ return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0);
1056
+ #else
1057
+ return a_.f32[0] > b_.f32[0];
1058
+ #endif
1059
+ #endif
1060
+ }
1061
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1062
+ # define _mm_comigt_ss(a, b) simde_mm_comigt_ss((a), (b))
1063
+ #endif
1064
+
1065
+ SIMDE__FUNCTION_ATTRIBUTES
1066
+ int
1067
+ simde_mm_comile_ss (simde__m128 a, simde__m128 b) {
1068
+ #if defined(SIMDE_SSE_NATIVE)
1069
+ return _mm_comile_ss(a, b);
1070
+ #else
1071
+ simde__m128_private
1072
+ a_ = simde__m128_to_private(a),
1073
+ b_ = simde__m128_to_private(b);
1074
+
1075
+ #if defined(SIMDE_SSE_NEON)
1076
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1077
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1078
+ uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
1079
+ uint32x4_t a_le_b = vcleq_f32(a_.neon_f32, b_.neon_f32);
1080
+ return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0);
1081
+ #else
1082
+ return a_.f32[0] <= b_.f32[0];
1083
+ #endif
1084
+ #endif
1085
+ }
1086
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1087
+ # define _mm_comile_ss(a, b) simde_mm_comile_ss((a), (b))
1088
+ #endif
1089
+
1090
+ SIMDE__FUNCTION_ATTRIBUTES
1091
+ int
1092
+ simde_mm_comilt_ss (simde__m128 a, simde__m128 b) {
1093
+ #if defined(SIMDE_SSE_NATIVE)
1094
+ return _mm_comilt_ss(a, b);
1095
+ #else
1096
+ simde__m128_private
1097
+ a_ = simde__m128_to_private(a),
1098
+ b_ = simde__m128_to_private(b);
1099
+
1100
+ #if defined(SIMDE_SSE_NEON)
1101
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1102
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1103
+ uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
1104
+ uint32x4_t a_lt_b = vcltq_f32(a_.neon_f32, b_.neon_f32);
1105
+ return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0);
1106
+ #else
1107
+ return a_.f32[0] < b_.f32[0];
1108
+ #endif
1109
+ #endif
1110
+ }
1111
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1112
+ # define _mm_comilt_ss(a, b) simde_mm_comilt_ss((a), (b))
1113
+ #endif
1114
+
1115
+ SIMDE__FUNCTION_ATTRIBUTES
1116
+ int
1117
+ simde_mm_comineq_ss (simde__m128 a, simde__m128 b) {
1118
+ #if defined(SIMDE_SSE_NATIVE)
1119
+ return _mm_comineq_ss(a, b);
1120
+ #else
1121
+ simde__m128_private
1122
+ a_ = simde__m128_to_private(a),
1123
+ b_ = simde__m128_to_private(b);
1124
+
1125
+ #if defined(SIMDE_SSE_NEON)
1126
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1127
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1128
+ uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1129
+ uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32));
1130
+ return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0);
1131
+ #else
1132
+ return a_.f32[0] != b_.f32[0];
1133
+ #endif
1134
+ #endif
1135
+ }
1136
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1137
+ # define _mm_comineq_ss(a, b) simde_mm_comineq_ss((a), (b))
1138
+ #endif
1139
+
1140
+ SIMDE__FUNCTION_ATTRIBUTES
1141
+ simde__m128
1142
+ simde_mm_cvt_pi2ps (simde__m128 a, simde__m64 b) {
1143
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1144
+ return _mm_cvt_pi2ps(a, b);
1145
+ #else
1146
+ simde__m128_private
1147
+ r_,
1148
+ a_ = simde__m128_to_private(a);
1149
+ simde__m64_private b_ = simde__m64_to_private(b);
1150
+
1151
+ #if defined(SIMDE_SSE_NEON)
1152
+ r_.neon_f32 = vcombine_f32(vcvt_f32_s32(b_.neon_i32), vget_high_f32(a_.neon_f32));
1153
+ #elif defined(SIMDE__CONVERT_VECTOR)
1154
+ SIMDE__CONVERT_VECTOR(r_.m64_private[0].f32, b_.i32);
1155
+ r_.m64_private[1] = a_.m64_private[1];
1156
+
1157
+ #else
1158
+ r_.f32[0] = (simde_float32) b_.i32[0];
1159
+ r_.f32[1] = (simde_float32) b_.i32[1];
1160
+ r_.i32[2] = a_.i32[2];
1161
+ r_.i32[3] = a_.i32[3];
1162
+ #endif
1163
+
1164
+ return simde__m128_from_private(r_);
1165
+ #endif
1166
+ }
1167
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1168
+ # define _mm_cvt_pi2ps(a, b) simde_mm_cvt_pi2ps((a), b)
1169
+ #endif
1170
+
1171
+ SIMDE__FUNCTION_ATTRIBUTES
1172
+ simde__m64
1173
+ simde_mm_cvt_ps2pi (simde__m128 a) {
1174
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1175
+ return _mm_cvt_ps2pi(a);
1176
+ #else
1177
+ simde__m64_private r_;
1178
+ simde__m128_private a_ = simde__m128_to_private(a);
1179
+
1180
+ #if defined(SIMDE_SSE_NEON)
1181
+ r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32));
1182
+ #elif defined(SIMDE__CONVERT_VECTOR) && !defined(__clang__)
1183
+ SIMDE__CONVERT_VECTOR(r_.i32, a_.m64_private[0].f32);
1184
+ #else
1185
+ SIMDE__VECTORIZE
1186
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1187
+ r_.i32[i] = (int32_t) a_.f32[i];
1188
+ }
1189
+ #endif
1190
+
1191
+ return simde__m64_from_private(r_);
1192
+ #endif
1193
+ }
1194
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1195
+ # define _mm_cvt_ps2pi(a) simde_mm_cvt_ps2pi((a))
1196
+ #endif
1197
+
1198
+ SIMDE__FUNCTION_ATTRIBUTES
1199
+ simde__m128
1200
+ simde_mm_cvt_si2ss (simde__m128 a, int32_t b) {
1201
+ #if defined(SIMDE_SSE_NATIVE)
1202
+ return _mm_cvt_si2ss(a, b);
1203
+ #else
1204
+ simde__m128_private
1205
+ r_,
1206
+ a_ = simde__m128_to_private(a);
1207
+
1208
+ #if defined(SIMDE_SSE_NEON)
1209
+ r_.neon_f32 = vsetq_lane_f32((float) b, a_.neon_f32, 0);
1210
+ #else
1211
+ r_.f32[0] = (simde_float32) b;
1212
+ r_.i32[1] = a_.i32[1];
1213
+ r_.i32[2] = a_.i32[2];
1214
+ r_.i32[3] = a_.i32[3];
1215
+ #endif
1216
+
1217
+ return simde__m128_from_private(r_);
1218
+ #endif
1219
+ }
1220
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1221
+ # define _mm_cvt_si2ss(a, b) simde_mm_cvt_si2ss((a), b)
1222
+ #endif
1223
+
1224
+ SIMDE__FUNCTION_ATTRIBUTES
1225
+ int32_t
1226
+ simde_mm_cvt_ss2si (simde__m128 a) {
1227
+ #if defined(SIMDE_SSE_NATIVE)
1228
+ return _mm_cvt_ss2si(a);
1229
+ #else
1230
+ simde__m128_private a_ = simde__m128_to_private(a);
1231
+
1232
+ #if defined(SIMDE_SSE_NEON)
1233
+ return SIMDE_CONVERT_FTOI(int32_t, nearbyintf(vgetq_lane_f32(a_.neon_f32, 0)));
1234
+ #elif defined(SIMDE_HAVE_MATH_H)
1235
+ return SIMDE_CONVERT_FTOI(int32_t, nearbyintf(a_.f32[0]));
1236
+ #else
1237
+ HEDLEY_UNREACHABLE();
1238
+ #endif
1239
+ #endif
1240
+ }
1241
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1242
+ # define _mm_cvt_ss2si(a) simde_mm_cvt_ss2si((a))
1243
+ #endif
1244
+
1245
+ SIMDE__FUNCTION_ATTRIBUTES
1246
+ simde__m128
1247
+ simde_mm_cvtpi16_ps (simde__m64 a) {
1248
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1249
+ return _mm_cvtpi16_ps(a);
1250
+ #else
1251
+ simde__m128_private r_;
1252
+ simde__m64_private a_ = simde__m64_to_private(a);
1253
+
1254
+ #if defined(SIMDE_SSE_NEON) && 0 /* TODO */
1255
+ r_.neon_f32 = vmovl_s16(vget_low_s16(vuzp1q_s16(a_.neon_i16, vmovq_n_s16(0))));
1256
+ #elif defined(SIMDE__CONVERT_VECTOR)
1257
+ SIMDE__CONVERT_VECTOR(r_.f32, a_.i16);
1258
+ #else
1259
+ SIMDE__VECTORIZE
1260
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1261
+ simde_float32 v = a_.i16[i];
1262
+ r_.f32[i] = v;
1263
+ }
1264
+ #endif
1265
+
1266
+ return simde__m128_from_private(r_);
1267
+ #endif
1268
+ }
1269
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1270
+ # define _mm_cvtpi16_ps(a) simde_mm_cvtpi16_ps(a)
1271
+ #endif
1272
+
1273
+ SIMDE__FUNCTION_ATTRIBUTES
1274
+ simde__m128
1275
+ simde_mm_cvtpi32_ps (simde__m128 a, simde__m64 b) {
1276
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1277
+ return _mm_cvtpi32_ps(a, b);
1278
+ #else
1279
+ simde__m128_private
1280
+ r_,
1281
+ a_ = simde__m128_to_private(a);
1282
+ simde__m64_private b_ = simde__m64_to_private(b);
1283
+
1284
+ #if defined(SIMDE_SSE_NEON)
1285
+ r_.neon_f32 = vcombine_f32(vcvt_f32_s32(b_.neon_i32), vget_high_f32(a_.neon_f32));
1286
+ #elif defined(SIMDE__CONVERT_VECTOR)
1287
+ SIMDE__CONVERT_VECTOR(r_.m64_private[0].f32, b_.i32);
1288
+ r_.m64_private[1] = a_.m64_private[1];
1289
+ #else
1290
+ r_.f32[0] = (simde_float32) b_.i32[0];
1291
+ r_.f32[1] = (simde_float32) b_.i32[1];
1292
+ r_.i32[2] = a_.i32[2];
1293
+ r_.i32[3] = a_.i32[3];
1294
+ #endif
1295
+
1296
+ return simde__m128_from_private(r_);
1297
+ #endif
1298
+ }
1299
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1300
+ # define _mm_cvtpi32_ps(a, b) simde_mm_cvtpi32_ps((a), b)
1301
+ #endif
1302
+
1303
+ SIMDE__FUNCTION_ATTRIBUTES
1304
+ simde__m128
1305
+ simde_mm_cvtpi32x2_ps (simde__m64 a, simde__m64 b) {
1306
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1307
+ return _mm_cvtpi32x2_ps(a, b);
1308
+ #else
1309
+ simde__m128_private r_;
1310
+ simde__m64_private
1311
+ a_ = simde__m64_to_private(a),
1312
+ b_ = simde__m64_to_private(b);
1313
+
1314
+ #if defined(SIMDE_SSE_NEON)
1315
+ r_.neon_f32 = vcvtq_f32_s32(vcombine_s32(a_.neon_i32, b_.neon_i32));
1316
+ #elif defined(SIMDE__CONVERT_VECTOR)
1317
+ SIMDE__CONVERT_VECTOR(r_.m64_private[0].f32, a_.i32);
1318
+ SIMDE__CONVERT_VECTOR(r_.m64_private[1].f32, b_.i32);
1319
+ #else
1320
+ r_.f32[0] = (simde_float32) a_.i32[0];
1321
+ r_.f32[1] = (simde_float32) a_.i32[1];
1322
+ r_.f32[2] = (simde_float32) b_.i32[0];
1323
+ r_.f32[3] = (simde_float32) b_.i32[1];
1324
+ #endif
1325
+
1326
+ return simde__m128_from_private(r_);
1327
+ #endif
1328
+ }
1329
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1330
+ # define _mm_cvtpi32x2_ps(a, b) simde_mm_cvtpi32x2_ps(a, b)
1331
+ #endif
1332
+
1333
+ SIMDE__FUNCTION_ATTRIBUTES
1334
+ simde__m128
1335
+ simde_mm_cvtpi8_ps (simde__m64 a) {
1336
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1337
+ return _mm_cvtpi8_ps(a);
1338
+ #else
1339
+ simde__m128_private r_;
1340
+ simde__m64_private a_ = simde__m64_to_private(a);
1341
+
1342
+ #if defined(SIMDE_SSE_NEON)
1343
+ r_.neon_f32 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(a_.neon_i8))));
1344
+ #else
1345
+ r_.f32[0] = (simde_float32) a_.i8[0];
1346
+ r_.f32[1] = (simde_float32) a_.i8[1];
1347
+ r_.f32[2] = (simde_float32) a_.i8[2];
1348
+ r_.f32[3] = (simde_float32) a_.i8[3];
1349
+ #endif
1350
+
1351
+ return simde__m128_from_private(r_);
1352
+ #endif
1353
+ }
1354
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1355
+ # define _mm_cvtpi8_ps(a) simde_mm_cvtpi8_ps(a)
1356
+ #endif
1357
+
1358
+ SIMDE__FUNCTION_ATTRIBUTES
1359
+ simde__m64
1360
+ simde_mm_cvtps_pi16 (simde__m128 a) {
1361
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1362
+ return _mm_cvtps_pi16(a);
1363
+ #else
1364
+ simde__m64_private r_;
1365
+ simde__m128_private a_ = simde__m128_to_private(a);
1366
+
1367
+ #if defined(SIMDE__CONVERT_VECTOR)
1368
+ SIMDE__CONVERT_VECTOR(r_.i16, a_.f32);
1369
+ #elif defined(SIMDE_SSE_NEON)
1370
+ r_.neon_i16 = vmovn_s32(vcvtq_s32_f32(a_.neon_f32));
1371
+ #else
1372
+ SIMDE__VECTORIZE
1373
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1374
+ r_.i16[i] = SIMDE_CONVERT_FTOI(int16_t, a_.f32[i]);
1375
+ }
1376
+ #endif
1377
+
1378
+ return simde__m64_from_private(r_);
1379
+ #endif
1380
+ }
1381
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1382
+ # define _mm_cvtps_pi16(a) simde_mm_cvtps_pi16((a))
1383
+ #endif
1384
+
1385
+ SIMDE__FUNCTION_ATTRIBUTES
1386
+ simde__m64
1387
+ simde_mm_cvtps_pi32 (simde__m128 a) {
1388
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1389
+ return _mm_cvtps_pi32(a);
1390
+ #else
1391
+ simde__m64_private r_;
1392
+ simde__m128_private a_ = simde__m128_to_private(a);
1393
+
1394
+ #if defined(SIMDE_SSE_NEON)
1395
+ r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32));
1396
+ #elif defined(SIMDE__CONVERT_VECTOR)
1397
+ SIMDE__CONVERT_VECTOR(r_.i32, a_.m64_private[0].f32);
1398
+ #else
1399
+ SIMDE__VECTORIZE
1400
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1401
+ r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f32[i]);
1402
+ }
1403
+ #endif
1404
+
1405
+ return simde__m64_from_private(r_);
1406
+ #endif
1407
+ }
1408
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1409
+ # define _mm_cvtps_pi32(a) simde_mm_cvtps_pi32((a))
1410
+ #endif
1411
+
1412
+ SIMDE__FUNCTION_ATTRIBUTES
1413
+ simde__m64
1414
+ simde_mm_cvtps_pi8 (simde__m128 a) {
1415
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1416
+ return _mm_cvtps_pi8(a);
1417
+ #else
1418
+ simde__m64_private r_;
1419
+ simde__m128_private a_ = simde__m128_to_private(a);
1420
+
1421
+ #if defined(SIMDE_SSE_NEON)
1422
+ int16x4_t b = vmovn_s32(vcvtq_s32_f32(a_.neon_f32));
1423
+ int16x8_t c = vcombine_s16(b, vmov_n_s16(0));
1424
+ r_.neon_i8 = vmovn_s16(c);
1425
+ #else
1426
+ SIMDE__VECTORIZE
1427
+ for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) {
1428
+ r_.i8[i] = SIMDE_CONVERT_FTOI(int8_t, a_.f32[i]);
1429
+ }
1430
+ /* Note: the upper half is undefined */
1431
+ #endif
1432
+
1433
+ return simde__m64_from_private(r_);
1434
+ #endif
1435
+ }
1436
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1437
+ # define _mm_cvtps_pi8(a) simde_mm_cvtps_pi8((a))
1438
+ #endif
1439
+
1440
+ SIMDE__FUNCTION_ATTRIBUTES
1441
+ simde__m128
1442
+ simde_mm_cvtpu16_ps (simde__m64 a) {
1443
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1444
+ return _mm_cvtpu16_ps(a);
1445
+ #else
1446
+ simde__m128_private r_;
1447
+ simde__m64_private a_ = simde__m64_to_private(a);
1448
+
1449
+ #if defined(SIMDE_SSE_NEON)
1450
+ r_.neon_f32 = vcvtq_f32_u32(vmovl_u16(a_.neon_u16));
1451
+ #elif defined(SIMDE__CONVERT_VECTOR)
1452
+ SIMDE__CONVERT_VECTOR(r_.f32, a_.u16);
1453
+ #else
1454
+ SIMDE__VECTORIZE
1455
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1456
+ r_.f32[i] = (simde_float32) a_.u16[i];
1457
+ }
1458
+ #endif
1459
+
1460
+ return simde__m128_from_private(r_);
1461
+ #endif
1462
+ }
1463
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1464
+ # define _mm_cvtpu16_ps(a) simde_mm_cvtpu16_ps(a)
1465
+ #endif
1466
+
1467
+ SIMDE__FUNCTION_ATTRIBUTES
1468
+ simde__m128
1469
+ simde_mm_cvtpu8_ps (simde__m64 a) {
1470
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1471
+ return _mm_cvtpu8_ps(a);
1472
+ #else
1473
+ simde__m128_private r_;
1474
+ simde__m64_private a_ = simde__m64_to_private(a);
1475
+
1476
+ #if defined(SIMDE_SSE_NEON)
1477
+ r_.neon_f32 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(a_.neon_u8))));
1478
+ #else
1479
+ SIMDE__VECTORIZE
1480
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1481
+ r_.f32[i] = (simde_float32) a_.u8[i];
1482
+ }
1483
+ #endif
1484
+
1485
+ return simde__m128_from_private(r_);
1486
+ #endif
1487
+ }
1488
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1489
+ # define _mm_cvtpu8_ps(a) simde_mm_cvtpu8_ps(a)
1490
+ #endif
1491
+
1492
+ SIMDE__FUNCTION_ATTRIBUTES
1493
+ simde__m128
1494
+ simde_mm_cvtsi32_ss (simde__m128 a, int32_t b) {
1495
+ #if defined(SIMDE_SSE_NATIVE)
1496
+ return _mm_cvtsi32_ss(a, b);
1497
+ #else
1498
+ simde__m128_private r_;
1499
+ simde__m128_private a_ = simde__m128_to_private(a);
1500
+
1501
+ #if defined(SIMDE_SSE_NEON)
1502
+ r_.neon_f32 = vsetq_lane_f32((simde_float32) b, a_.neon_f32, 0);
1503
+ #else
1504
+ r_.f32[0] = (simde_float32) b;
1505
+ SIMDE__VECTORIZE
1506
+ for (size_t i = 1 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1507
+ r_.i32[i] = a_.i32[i];
1508
+ }
1509
+ #endif
1510
+
1511
+ return simde__m128_from_private(r_);
1512
+ #endif
1513
+ }
1514
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1515
+ # define _mm_cvtsi32_ss(a, b) simde_mm_cvtsi32_ss((a), b)
1516
+ #endif
1517
+
1518
+ SIMDE__FUNCTION_ATTRIBUTES
1519
+ simde__m128
1520
+ simde_mm_cvtsi64_ss (simde__m128 a, int64_t b) {
1521
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
1522
+ #if !defined(__PGI)
1523
+ return _mm_cvtsi64_ss(a, b);
1524
+ #else
1525
+ return _mm_cvtsi64x_ss(a, b);
1526
+ #endif
1527
+ #else
1528
+ simde__m128_private r_;
1529
+ simde__m128_private a_ = simde__m128_to_private(a);
1530
+
1531
+ #if defined(SIMDE_SSE_NEON)
1532
+ r_.neon_f32 = vsetq_lane_f32((simde_float32) b, a_.neon_f32, 0);
1533
+ #else
1534
+ r_ = a_;
1535
+ r_.f32[0] = (simde_float32) b;
1536
+ #endif
1537
+
1538
+ return simde__m128_from_private(r_);
1539
+ #endif
1540
+ }
1541
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1542
+ # define _mm_cvtsi64_ss(a, b) simde_mm_cvtsi64_ss((a), b)
1543
+ #endif
1544
+
1545
+ SIMDE__FUNCTION_ATTRIBUTES
1546
+ simde_float32
1547
+ simde_mm_cvtss_f32 (simde__m128 a) {
1548
+ #if defined(SIMDE_SSE_NATIVE)
1549
+ return _mm_cvtss_f32(a);
1550
+ #else
1551
+ simde__m128_private a_ = simde__m128_to_private(a);
1552
+ #if defined(SIMDE_SSE_NEON)
1553
+ return vgetq_lane_f32(a_.neon_f32, 0);
1554
+ #else
1555
+ return a_.f32[0];
1556
+ #endif
1557
+ #endif
1558
+ }
1559
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1560
+ # define _mm_cvtss_f32(a) simde_mm_cvtss_f32((a))
1561
+ #endif
1562
+
1563
+ SIMDE__FUNCTION_ATTRIBUTES
1564
+ int32_t
1565
+ simde_mm_cvtss_si32 (simde__m128 a) {
1566
+ return simde_mm_cvt_ss2si(a);
1567
+ }
1568
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1569
+ # define _mm_cvtss_si32(a) simde_mm_cvtss_si32((a))
1570
+ #endif
1571
+
1572
+ SIMDE__FUNCTION_ATTRIBUTES
1573
+ int64_t
1574
+ simde_mm_cvtss_si64 (simde__m128 a) {
1575
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
1576
+ #if !defined(__PGI)
1577
+ return _mm_cvtss_si64(a);
1578
+ #else
1579
+ return _mm_cvtss_si64x(a);
1580
+ #endif
1581
+ #else
1582
+ simde__m128_private a_ = simde__m128_to_private(a);
1583
+ #if defined(SIMDE_SSE_NEON)
1584
+ return SIMDE_CONVERT_FTOI(int64_t, vgetq_lane_f32(a_.neon_f32, 0));
1585
+ #else
1586
+ return SIMDE_CONVERT_FTOI(int64_t, a_.f32[0]);
1587
+ #endif
1588
+ #endif
1589
+ }
1590
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1591
+ # define _mm_cvtss_si64(a) simde_mm_cvtss_si64((a))
1592
+ #endif
1593
+
1594
+ SIMDE__FUNCTION_ATTRIBUTES
1595
+ simde__m64
1596
+ simde_mm_cvtt_ps2pi (simde__m128 a) {
1597
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1598
+ return _mm_cvtt_ps2pi(a);
1599
+ #else
1600
+ simde__m64_private r_;
1601
+ simde__m128_private a_ = simde__m128_to_private(a);
1602
+
1603
+ #if defined(SIMDE_SSE_NEON)
1604
+ r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32));
1605
+ #elif defined(SIMDE__CONVERT_VECTOR)
1606
+ SIMDE__CONVERT_VECTOR(r_.i32, a_.m64_private[0].f32);
1607
+ #else
1608
+ SIMDE__VECTORIZE
1609
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1610
+ r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f32[i]);
1611
+ }
1612
+ #endif
1613
+
1614
+ return simde__m64_from_private(r_);
1615
+ #endif
1616
+ }
1617
+ #define simde_mm_cvttps_pi32(a) simde_mm_cvtt_ps2pi(a)
1618
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1619
+ # define _mm_cvtt_ps2pi(a) simde_mm_cvtt_ps2pi((a))
1620
+ # define _mm_cvttps_pi32(a) simde_mm_cvttps_pi32((a))
1621
+ #endif
1622
+
1623
+ SIMDE__FUNCTION_ATTRIBUTES
1624
+ int32_t
1625
+ simde_mm_cvtt_ss2si (simde__m128 a) {
1626
+ #if defined(SIMDE_SSE_NATIVE)
1627
+ return _mm_cvtt_ss2si(a);
1628
+ #else
1629
+ simde__m128_private a_ = simde__m128_to_private(a);
1630
+
1631
+ #if defined(SIMDE_SSE_NEON)
1632
+ return SIMDE_CONVERT_FTOI(int32_t, vgetq_lane_f32(a_.neon_f32, 0));
1633
+ #else
1634
+ return SIMDE_CONVERT_FTOI(int32_t, a_.f32[0]);
1635
+ #endif
1636
+ #endif
1637
+ }
1638
+ #define simde_mm_cvttss_si32(a) simde_mm_cvtt_ss2si(a)
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1639
+ # define _mm_cvtt_ss2si(a) simde_mm_cvtt_ss2si((a))
1640
+ # define _mm_cvttss_si32(a) simde_mm_cvttss_si32((a))
1641
+ #endif
1642
+
1643
+ SIMDE__FUNCTION_ATTRIBUTES
1644
+ int64_t
1645
+ simde_mm_cvttss_si64 (simde__m128 a) {
1646
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(_MSC_VER)
1647
+ #if defined(__PGI)
1648
+ return _mm_cvttss_si64x(a);
1649
+ #else
1650
+ return _mm_cvttss_si64(a);
1651
+ #endif
1652
+ #else
1653
+ simde__m128_private a_ = simde__m128_to_private(a);
1654
+
1655
+ #if defined(SIMDE_SSE_NEON)
1656
+ return SIMDE_CONVERT_FTOI(int64_t, vgetq_lane_f32(a_.neon_f32, 0));
1657
+ #else
1658
+ return SIMDE_CONVERT_FTOI(int64_t, a_.f32[0]);
1659
+ #endif
1660
+ #endif
1661
+ }
1662
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1663
+ # define _mm_cvttss_si64(a) simde_mm_cvttss_si64((a))
1664
+ #endif
1665
+
1666
+ SIMDE__FUNCTION_ATTRIBUTES
1667
+ simde__m128
1668
+ simde_mm_cmpord_ss (simde__m128 a, simde__m128 b) {
1669
+ #if defined(SIMDE_SSE_NATIVE)
1670
+ return _mm_cmpord_ss(a, b);
1671
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
1672
+ return simde_mm_move_ss(a, simde_mm_cmpord_ps(a, b));
1673
+ #else
1674
+ simde__m128_private
1675
+ r_,
1676
+ a_ = simde__m128_to_private(a);
1677
+
1678
+ #if defined(simde_isnanf)
1679
+ r_.u32[0] = (simde_isnanf(simde_mm_cvtss_f32(a)) || simde_isnanf(simde_mm_cvtss_f32(b))) ? UINT32_C(0) : ~UINT32_C(0);
1680
+ SIMDE__VECTORIZE
1681
+ for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1682
+ r_.u32[i] = a_.u32[i];
1683
+ }
1684
+ #else
1685
+ HEDLEY_UNREACHABLE();
1686
+ #endif
1687
+
1688
+ return simde__m128_from_private(r_);
1689
+ #endif
1690
+ }
1691
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1692
+ # define _mm_cmpord_ss(a, b) simde_mm_cmpord_ss((a), (b))
1693
+ #endif
1694
+
1695
+ SIMDE__FUNCTION_ATTRIBUTES
1696
+ simde__m128
1697
+ simde_mm_div_ps (simde__m128 a, simde__m128 b) {
1698
+ #if defined(SIMDE_SSE_NATIVE)
1699
+ return _mm_div_ps(a, b);
1700
+ #else
1701
+ simde__m128_private
1702
+ r_,
1703
+ a_ = simde__m128_to_private(a),
1704
+ b_ = simde__m128_to_private(b);
1705
+
1706
+ #if defined(SIMDE_SSE_NEON) && defined(SIMDE_ARCH_AARCH64)
1707
+ r_.neon_f32 = vdivq_f32(a_.neon_f32, b_.neon_f32);
1708
+ #elif defined(SIMDE_SSE_NEON)
1709
+ float32x4_t recip0 = vrecpeq_f32(b_.neon_f32);
1710
+ float32x4_t recip1 = vmulq_f32(recip0, vrecpsq_f32(recip0, b_.neon_f32));
1711
+ r_.neon_f32 = vmulq_f32(a_.neon_f32, recip1);
1712
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
1713
+ r_.wasm_v128 = wasm_f32x4_div(a_.wasm_v128, b_.wasm_v128);
1714
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1715
+ r_.f32 = a_.f32 / b_.f32;
1716
+ #else
1717
+ SIMDE__VECTORIZE
1718
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1719
+ r_.f32[i] = a_.f32[i] / b_.f32[i];
1720
+ }
1721
+ #endif
1722
+
1723
+ return simde__m128_from_private(r_);
1724
+ #endif
1725
+ }
1726
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1727
+ # define _mm_div_ps(a, b) simde_mm_div_ps((a), (b))
1728
+ #endif
1729
+
1730
+ SIMDE__FUNCTION_ATTRIBUTES
1731
+ simde__m128
1732
+ simde_mm_div_ss (simde__m128 a, simde__m128 b) {
1733
+ #if defined(SIMDE_SSE_NATIVE)
1734
+ return _mm_div_ss(a, b);
1735
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
1736
+ return simde_mm_move_ss(a, simde_mm_div_ps(a, b));
1737
+ #else
1738
+ simde__m128_private
1739
+ r_,
1740
+ a_ = simde__m128_to_private(a),
1741
+ b_ = simde__m128_to_private(b);
1742
+
1743
+ r_.f32[0] = a_.f32[0] / b_.f32[0];
1744
+ SIMDE__VECTORIZE
1745
+ for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1746
+ r_.f32[i] = a_.f32[i];
1747
+ }
1748
+
1749
+ return simde__m128_from_private(r_);
1750
+ #endif
1751
+ }
1752
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1753
+ # define _mm_div_ss(a, b) simde_mm_div_ss((a), (b))
1754
+ #endif
1755
+
1756
+ SIMDE__FUNCTION_ATTRIBUTES
1757
+ int16_t
1758
+ simde_mm_extract_pi16 (simde__m64 a, const int imm8)
1759
+ HEDLEY_REQUIRE_MSG((imm8 & 3) == imm8, "imm8 must be in range [0, 3]") {
1760
+ simde__m64_private a_ = simde__m64_to_private(a);
1761
+ return a_.i16[imm8];
1762
+ }
1763
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX) && !defined(HEDLEY_PGI_VERSION)
1764
+ # if HEDLEY_HAS_WARNING("-Wvector-conversion")
1765
+ /* https://bugs.llvm.org/show_bug.cgi?id=44589 */
1766
+ # define simde_mm_extract_pi16(a, imm8) ( \
1767
+ HEDLEY_DIAGNOSTIC_PUSH \
1768
+ _Pragma("clang diagnostic ignored \"-Wvector-conversion\"") \
1769
+ HEDLEY_STATIC_CAST(int16_t, _mm_extract_pi16((a), (imm8))) \
1770
+ HEDLEY_DIAGNOSTIC_POP \
1771
+ )
1772
+ # else
1773
+ # define simde_mm_extract_pi16(a, imm8) ((int16_t) (_mm_extract_pi16(a, imm8)))
1774
+ # endif
1775
+ #elif defined(SIMDE_SSE_NEON)
1776
+ # define simde_mm_extract_pi16(a, imm8) ((int16_t) (vget_lane_s16(simde__m64_to_private(a).neon_i16, imm8)))
1777
+ #endif
1778
+ #define simde_m_pextrw(a, imm8) simde_mm_extract_pi16(a, imm8)
1779
+
1780
+ enum {
1781
+ #if defined(SIMDE_SSE_NATIVE)
1782
+ SIMDE_MM_ROUND_NEAREST = _MM_ROUND_NEAREST,
1783
+ SIMDE_MM_ROUND_DOWN = _MM_ROUND_DOWN,
1784
+ SIMDE_MM_ROUND_UP = _MM_ROUND_UP,
1785
+ SIMDE_MM_ROUND_TOWARD_ZERO = _MM_ROUND_TOWARD_ZERO
1786
+ #else
1787
+ SIMDE_MM_ROUND_NEAREST
1788
+ #if defined(FE_TONEAREST)
1789
+ = FE_TONEAREST
1790
+ #endif
1791
+ ,
1792
+
1793
+ SIMDE_MM_ROUND_DOWN
1794
+ #if defined(FE_DOWNWARD)
1795
+ = FE_DOWNWARD
1796
+ #endif
1797
+ ,
1798
+
1799
+ SIMDE_MM_ROUND_UP
1800
+ #if defined(FE_UPWARD)
1801
+ = FE_UPWARD
1802
+ #endif
1803
+ ,
1804
+
1805
+ SIMDE_MM_ROUND_TOWARD_ZERO
1806
+ #if defined(FE_TOWARDZERO)
1807
+ = FE_TOWARDZERO
1808
+ #endif
1809
+ #endif
1810
+ };
1811
+
1812
+ SIMDE__FUNCTION_ATTRIBUTES
1813
+ unsigned int
1814
+ SIMDE_MM_GET_ROUNDING_MODE(void) {
1815
+ #if defined(SIMDE_SSE_NATIVE)
1816
+ return _MM_GET_ROUNDING_MODE();
1817
+ #elif defined(SIMDE_HAVE_MATH_H)
1818
+ return (unsigned int) fegetround();
1819
+ #else
1820
+ HEDLEY_UNREACHABLE();
1821
+ #endif
1822
+ }
1823
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1824
+ # define _mm_extract_pi16(a, imm8) simde_mm_extract_pi16((a), imm8)
1825
+ #endif
1826
+
1827
+ SIMDE__FUNCTION_ATTRIBUTES
1828
+ void
1829
+ SIMDE_MM_SET_ROUNDING_MODE(unsigned int a) {
1830
+ #if defined(SIMDE_SSE_NATIVE)
1831
+ _MM_SET_ROUNDING_MODE(a);
1832
+ #else
1833
+ fesetround((int) a);
1834
+ #endif
1835
+ }
1836
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1837
+ # define _MM_SET_ROUNDING_MODE(a) SIMDE_MM_SET_ROUNDING_MODE(a)
1838
+ #endif
1839
+
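A minimal usage sketch of the rounding-mode wrappers above, on the portable path where they reduce to fegetround()/fesetround(); the surrounding snippet is illustrative only (it is not part of the header and assumes this sse.h is already included):

    /* Save, change, and restore the rounding mode through the wrappers above. */
    unsigned int saved = SIMDE_MM_GET_ROUNDING_MODE();      /* fegetround() on the portable path */
    SIMDE_MM_SET_ROUNDING_MODE(SIMDE_MM_ROUND_TOWARD_ZERO); /* fesetround(FE_TOWARDZERO) */
    /* ... code that depends on the current rounding mode runs here ... */
    SIMDE_MM_SET_ROUNDING_MODE(saved);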
1840
+ SIMDE__FUNCTION_ATTRIBUTES
1841
+ simde__m64
1842
+ simde_mm_insert_pi16 (simde__m64 a, int16_t i, const int imm8)
1843
+ HEDLEY_REQUIRE_MSG((imm8 & 3) == imm8, "imm8 must be in range [0, 3]") {
1844
+ simde__m64_private
1845
+ r_,
1846
+ a_ = simde__m64_to_private(a);
1847
+
1848
+ r_.i64[0] = a_.i64[0];
1849
+ r_.i16[imm8] = i;
1850
+
1851
+ return simde__m64_from_private(r_);
1852
+ }
1853
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX) && !defined(__PGI)
1854
+ # if HEDLEY_HAS_WARNING("-Wvector-conversion")
1855
+ /* https://bugs.llvm.org/show_bug.cgi?id=44589 */
1856
+ # define simde_mm_insert_pi16(a, i, imm8) ( \
1857
+ HEDLEY_DIAGNOSTIC_PUSH \
1858
+ _Pragma("clang diagnostic ignored \"-Wvector-conversion\"") \
1859
+ (_mm_insert_pi16((a), (i), (imm8))) \
1860
+ HEDLEY_DIAGNOSTIC_POP \
1861
+ )
1862
+ # else
1863
+ # define simde_mm_insert_pi16(a, i, imm8) _mm_insert_pi16(a, i, imm8)
1864
+ # endif
1865
+ #elif defined(SIMDE_SSE_NEON)
1866
+ # define simde_mm_insert_pi16(a, i, imm8) simde__m64_from_private((simde__m64_private) { .neon_i16 = vset_lane_s16(i, simde__m64_to_private(a).neon_i16, (imm8)) })
1867
+ #endif
1868
+ #define simde_m_pinsrw(a, i, imm8) (simde_mm_insert_pi16(a, i, imm8))
1869
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1870
+ # define _mm_insert_pi16(a, i, imm8) simde_mm_insert_pi16(a, i, imm8)
1871
+ # define _m_pinsrw(a, i, imm8) simde_mm_insert_pi16(a, i, imm8)
1872
+ #endif
1873
+
1874
+ SIMDE__FUNCTION_ATTRIBUTES
1875
+ simde__m128
1876
+ simde_mm_load_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) {
1877
+ simde_assert_aligned(16, mem_addr);
1878
+
1879
+ #if defined(SIMDE_SSE_NATIVE)
1880
+ return _mm_load_ps(mem_addr);
1881
+ #else
1882
+ simde__m128_private r_;
1883
+
1884
+ #if defined(SIMDE_SSE_NEON)
1885
+ r_.neon_f32 = vld1q_f32(mem_addr);
1886
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
1887
+ r_.altivec_f32 = vec_ld(0, mem_addr);
1888
+ #else
1889
+ r_ = *SIMDE_CAST_ALIGN(16, simde__m128_private const*, mem_addr);
1890
+ #endif
1891
+
1892
+ return simde__m128_from_private(r_);
1893
+ #endif
1894
+ }
1895
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1896
+ # define _mm_load_ps(mem_addr) simde_mm_load_ps(mem_addr)
1897
+ #endif
1898
+
1899
+ SIMDE__FUNCTION_ATTRIBUTES
1900
+ simde__m128
1901
+ simde_mm_load_ps1 (simde_float32 const* mem_addr) {
1902
+ #if defined(SIMDE_SSE_NATIVE)
1903
+ return _mm_load_ps1(mem_addr);
1904
+ #else
1905
+ simde__m128_private r_;
1906
+
1907
+ #if defined(SIMDE_SSE_NEON)
1908
+ r_.neon_f32 = vld1q_dup_f32(mem_addr);
1909
+ #else
1910
+ r_ = simde__m128_to_private(simde_mm_set1_ps(*mem_addr));
1911
+ #endif
1912
+
1913
+ return simde__m128_from_private(r_);
1914
+ #endif
1915
+ }
1916
+ #define simde_mm_load1_ps(mem_addr) simde_mm_load_ps1(mem_addr)
1917
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1918
+ # define _mm_load_ps1(mem_addr) simde_mm_load_ps1(mem_addr)
1919
+ # define _mm_load1_ps(mem_addr) simde_mm_load_ps1(mem_addr)
1920
+ #endif
1921
+
1922
+ SIMDE__FUNCTION_ATTRIBUTES
1923
+ simde__m128
1924
+ simde_mm_load_ss (simde_float32 const* mem_addr) {
1925
+ #if defined(SIMDE_SSE_NATIVE)
1926
+ return _mm_load_ss(mem_addr);
1927
+ #else
1928
+ simde__m128_private r_;
1929
+
1930
+ #if defined(SIMDE_SSE_NEON)
1931
+ r_.neon_f32 = vsetq_lane_f32(*mem_addr, vdupq_n_f32(0), 0);
1932
+ #else
1933
+ r_.f32[0] = *mem_addr;
1934
+ r_.i32[1] = 0;
1935
+ r_.i32[2] = 0;
1936
+ r_.i32[3] = 0;
1937
+ #endif
1938
+
1939
+ return simde__m128_from_private(r_);
1940
+ #endif
1941
+ }
1942
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1943
+ # define _mm_load_ss(mem_addr) simde_mm_load_ss(mem_addr)
1944
+ #endif
1945
+
1946
+ SIMDE__FUNCTION_ATTRIBUTES
1947
+ simde__m128
1948
+ simde_mm_loadh_pi (simde__m128 a, simde__m64 const* mem_addr) {
1949
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1950
+ return _mm_loadh_pi(a, HEDLEY_REINTERPRET_CAST(__m64 const*, mem_addr));
1951
+ #else
1952
+ simde__m128_private
1953
+ r_,
1954
+ a_ = simde__m128_to_private(a);
1955
+
1956
+ #if defined(SIMDE_SSE_NEON)
1957
+ r_.neon_f32 = vcombine_f32(vget_low_f32(a_.neon_f32), vld1_f32(HEDLEY_REINTERPRET_CAST(const float32_t*, mem_addr)));
1958
+ #else
1959
+ simde__m64_private b_ = *HEDLEY_REINTERPRET_CAST(simde__m64_private const*, mem_addr);
1960
+ r_.f32[0] = a_.f32[0];
1961
+ r_.f32[1] = a_.f32[1];
1962
+ r_.f32[2] = b_.f32[0];
1963
+ r_.f32[3] = b_.f32[1];
1964
+ #endif
1965
+
1966
+ return simde__m128_from_private(r_);
1967
+ #endif
1968
+ }
1969
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1970
+ # define _mm_loadh_pi(a, mem_addr) simde_mm_loadh_pi((a), (simde__m64 const*) (mem_addr))
1971
+ #endif
1972
+
1973
+ /* The SSE documentation says that there are no alignment requirements
1974
+ for mem_addr. Unfortunately they used the __m64 type for the argument
1975
+ which is supposed to be 8-byte aligned, so some compilers (like clang
1976
+ with -Wcast-align) will generate a warning if you try to cast, say,
1977
+ a simde_float32* to a simde__m64* for this function.
1978
+
1979
+ I think the choice of argument type is unfortunate, but I do think we
1980
+ need to stick to it here. If there is demand I can always add something
1981
+ like simde_x_mm_loadl_f32(simde__m128, simde_float32 mem_addr[2]) */
1982
+ SIMDE__FUNCTION_ATTRIBUTES
1983
+ simde__m128
1984
+ simde_mm_loadl_pi (simde__m128 a, simde__m64 const* mem_addr) {
1985
+ #if defined(SIMDE_SSE_NATIVE)
1986
+ return _mm_loadl_pi(a, HEDLEY_REINTERPRET_CAST(__m64 const*, mem_addr));
1987
+ #else
1988
+ simde__m128_private
1989
+ r_,
1990
+ a_ = simde__m128_to_private(a);
1991
+
1992
+ #if defined(SIMDE_SSE_NEON)
1993
+ r_.neon_f32 = vcombine_f32(vld1_f32(HEDLEY_REINTERPRET_CAST(const float32_t*, mem_addr)), vget_high_f32(a_.neon_f32));
1994
+ #else
1995
+ simde__m64_private b_;
1996
+ simde_memcpy(&b_, mem_addr, sizeof(b_));
1997
+ r_.i32[0] = b_.i32[0];
1998
+ r_.i32[1] = b_.i32[1];
1999
+ r_.i32[2] = a_.i32[2];
2000
+ r_.i32[3] = a_.i32[3];
2001
+ #endif
2002
+
2003
+ return simde__m128_from_private(r_);
2004
+ #endif
2005
+ }
2006
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2007
+ # define _mm_loadl_pi(a, mem_addr) simde_mm_loadl_pi((a), (simde__m64 const*) (mem_addr))
2008
+ #endif
2009
+
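A rough sketch of the simde_x_mm_loadl_f32() helper floated in the comment above; the name and signature come from that comment and the helper is hypothetical, nothing below is provided by the header:

    SIMDE__FUNCTION_ATTRIBUTES
    simde__m128
    simde_x_mm_loadl_f32 (simde__m128 a, simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
      /* Copy two possibly-unaligned floats into the low half of a, leaving the
         high half untouched; memcpy avoids the simde__m64* cast and the
         -Wcast-align warning discussed in the comment above. */
      simde__m128_private a_ = simde__m128_to_private(a);
      simde_memcpy(&a_.f32[0], mem_addr, 2 * sizeof(simde_float32));
      return simde__m128_from_private(a_);
    }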
2010
+ SIMDE__FUNCTION_ATTRIBUTES
2011
+ simde__m128
2012
+ simde_mm_loadr_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) {
2013
+ simde_assert_aligned(16, mem_addr);
2014
+
2015
+ #if defined(SIMDE_SSE_NATIVE)
2016
+ return _mm_loadr_ps(mem_addr);
2017
+ #else
2018
+ simde__m128_private
2019
+ r_,
2020
+ v_ = simde__m128_to_private(simde_mm_load_ps(mem_addr));
2021
+
2022
+ #if defined(SIMDE_SSE_NEON)
2023
+ r_.neon_f32 = vrev64q_f32(v_.neon_f32);
2024
+ r_.neon_f32 = vextq_f32(r_.neon_f32, r_.neon_f32, 2);
2025
+ #elif defined(SIMDE__SHUFFLE_VECTOR)
2026
+ r_.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, v_.f32, v_.f32, 3, 2, 1, 0);
2027
+ #else
2028
+ r_.f32[0] = v_.f32[3];
2029
+ r_.f32[1] = v_.f32[2];
2030
+ r_.f32[2] = v_.f32[1];
2031
+ r_.f32[3] = v_.f32[0];
2032
+ #endif
2033
+
2034
+ return simde__m128_from_private(r_);
2035
+ #endif
2036
+ }
2037
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2038
+ # define _mm_loadr_ps(mem_addr) simde_mm_loadr_ps(mem_addr)
2039
+ #endif
2040
+
2041
+ SIMDE__FUNCTION_ATTRIBUTES
2042
+ simde__m128
2043
+ simde_mm_loadu_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) {
2044
+ #if defined(SIMDE_SSE_NATIVE)
2045
+ return _mm_loadu_ps(mem_addr);
2046
+ #else
2047
+ simde__m128_private r_;
2048
+
2049
+ #if defined(SIMDE_SSE_NEON)
2050
+ r_.neon_f32 = vld1q_f32(HEDLEY_REINTERPRET_CAST(const float32_t*, mem_addr));
2051
+ #else
2052
+ r_.f32[0] = mem_addr[0];
2053
+ r_.f32[1] = mem_addr[1];
2054
+ r_.f32[2] = mem_addr[2];
2055
+ r_.f32[3] = mem_addr[3];
2056
+ #endif
2057
+
2058
+ return simde__m128_from_private(r_);
2059
+ #endif
2060
+ }
2061
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2062
+ # define _mm_loadu_ps(mem_addr) simde_mm_loadu_ps(mem_addr)
2063
+ #endif
2064
+
2065
+ SIMDE__FUNCTION_ATTRIBUTES
2066
+ void
2067
+ simde_mm_maskmove_si64 (simde__m64 a, simde__m64 mask, int8_t* mem_addr) {
2068
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
2069
+ _mm_maskmove_si64(a, mask, HEDLEY_REINTERPRET_CAST(char*, mem_addr));
2070
+ #else
2071
+ simde__m64_private
2072
+ a_ = simde__m64_to_private(a),
2073
+ mask_ = simde__m64_to_private(mask);
2074
+
2075
+ SIMDE__VECTORIZE
2076
+ for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++)
2077
+ if (mask_.i8[i] < 0)
2078
+ mem_addr[i] = a_.i8[i];
2079
+ #endif
2080
+ }
2081
+ #define simde_m_maskmovq(a, mask, mem_addr) simde_mm_maskmove_si64(a, mask, mem_addr)
2082
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2083
+ # define _mm_maskmove_si64(a, mask, mem_addr) simde_mm_maskmove_si64(a, (mask), mem_addr)
2084
+ #endif
2085
+
2086
+ SIMDE__FUNCTION_ATTRIBUTES
2087
+ simde__m64
2088
+ simde_mm_max_pi16 (simde__m64 a, simde__m64 b) {
2089
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
2090
+ return _mm_max_pi16(a, b);
2091
+ #else
2092
+ simde__m64_private
2093
+ r_,
2094
+ a_ = simde__m64_to_private(a),
2095
+ b_ = simde__m64_to_private(b);
2096
+
2097
+ #if defined(SIMDE_SSE_NEON)
2098
+ r_.neon_i16 = vmax_s16(a_.neon_i16, b_.neon_i16);
2099
+ #else
2100
+ SIMDE__VECTORIZE
2101
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
2102
+ r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
2103
+ }
2104
+ #endif
2105
+
2106
+ return simde__m64_from_private(r_);
2107
+ #endif
2108
+ }
2109
+ #define simde_m_pmaxsw(a, b) simde_mm_max_pi16(a, b)
2110
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2111
+ # define _mm_max_pi16(a, b) simde_mm_max_pi16(a, b)
2112
+ # define _m_pmaxsw(a, b) simde_mm_max_pi16(a, b)
2113
+ #endif
2114
+
2115
+ SIMDE__FUNCTION_ATTRIBUTES
2116
+ simde__m128
2117
+ simde_mm_max_ps (simde__m128 a, simde__m128 b) {
2118
+ #if defined(SIMDE_SSE_NATIVE)
2119
+ return _mm_max_ps(a, b);
2120
+ #else
2121
+ simde__m128_private
2122
+ r_,
2123
+ a_ = simde__m128_to_private(a),
2124
+ b_ = simde__m128_to_private(b);
2125
+
2126
+ #if defined(SIMDE_SSE_NEON)
2127
+ r_.neon_f32 = vmaxq_f32(a_.neon_f32, b_.neon_f32);
2128
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
2129
+ r_.altivec_f32 = vec_max(a_.altivec_f32, b_.altivec_f32);
2130
+ #else
2131
+ SIMDE__VECTORIZE
2132
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
2133
+ r_.f32[i] = (a_.f32[i] > b_.f32[i]) ? a_.f32[i] : b_.f32[i];
2134
+ }
2135
+ #endif
2136
+
2137
+ return simde__m128_from_private(r_);
2138
+ #endif
2139
+ }
2140
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2141
+ # define _mm_max_ps(a, b) simde_mm_max_ps((a), (b))
2142
+ #endif
2143
+
2144
+ SIMDE__FUNCTION_ATTRIBUTES
2145
+ simde__m64
2146
+ simde_mm_max_pu8 (simde__m64 a, simde__m64 b) {
2147
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
2148
+ return _mm_max_pu8(a, b);
2149
+ #else
2150
+ simde__m64_private
2151
+ r_,
2152
+ a_ = simde__m64_to_private(a),
2153
+ b_ = simde__m64_to_private(b);
2154
+
2155
+ #if defined(SIMDE_SSE_NEON)
2156
+ r_.neon_u8 = vmax_u8(a_.neon_u8, b_.neon_u8);
2157
+ #else
2158
+ SIMDE__VECTORIZE
2159
+ for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
2160
+ r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
2161
+ }
2162
+ #endif
2163
+
2164
+ return simde__m64_from_private(r_);
2165
+ #endif
2166
+ }
2167
+ #define simde_m_pmaxub(a, b) simde_mm_max_pu8(a, b)
2168
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2169
+ # define _mm_max_pu8(a, b) simde_mm_max_pu8(a, b)
2170
+ # define _m_pmaxub(a, b) simde_mm_max_pu8(a, b)
2171
+ #endif
2172
+
2173
+ SIMDE__FUNCTION_ATTRIBUTES
2174
+ simde__m128
2175
+ simde_mm_max_ss (simde__m128 a, simde__m128 b) {
2176
+ #if defined(SIMDE_SSE_NATIVE)
2177
+ return _mm_max_ss(a, b);
2178
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
2179
+ return simde_mm_move_ss(a, simde_mm_max_ps(a, b));
2180
+ #else
2181
+ simde__m128_private
2182
+ r_,
2183
+ a_ = simde__m128_to_private(a),
2184
+ b_ = simde__m128_to_private(b);
2185
+
2186
+ r_.f32[0] = (a_.f32[0] > b_.f32[0]) ? a_.f32[0] : b_.f32[0];
2187
+ r_.f32[1] = a_.f32[1];
2188
+ r_.f32[2] = a_.f32[2];
2189
+ r_.f32[3] = a_.f32[3];
2190
+
2191
+ return simde__m128_from_private(r_);
2192
+ #endif
2193
+ }
2194
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2195
+ # define _mm_max_ss(a, b) simde_mm_max_ss((a), (b))
2196
+ #endif
2197
+
2198
+ SIMDE__FUNCTION_ATTRIBUTES
2199
+ simde__m64
2200
+ simde_mm_min_pi16 (simde__m64 a, simde__m64 b) {
2201
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
2202
+ return _mm_min_pi16(a, b);
2203
+ #else
2204
+ simde__m64_private
2205
+ r_,
2206
+ a_ = simde__m64_to_private(a),
2207
+ b_ = simde__m64_to_private(b);
2208
+
2209
+ #if defined(SIMDE_SSE_NEON)
2210
+ r_.neon_i16 = vmin_s16(a_.neon_i16, b_.neon_i16);
2211
+ #else
2212
+ SIMDE__VECTORIZE
2213
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
2214
+ r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
2215
+ }
2216
+ #endif
2217
+
2218
+ return simde__m64_from_private(r_);
2219
+ #endif
2220
+ }
2221
+ #define simde_m_pminsw(a, b) simde_mm_min_pi16(a, b)
2222
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2223
+ # define _mm_min_pi16(a, b) simde_mm_min_pi16(a, b)
2224
+ # define _m_pminsw(a, b) simde_mm_min_pi16(a, b)
2225
+ #endif
2226
+
2227
+ SIMDE__FUNCTION_ATTRIBUTES
2228
+ simde__m128
2229
+ simde_mm_min_ps (simde__m128 a, simde__m128 b) {
2230
+ #if defined(SIMDE_SSE_NATIVE)
2231
+ return _mm_min_ps(a, b);
2232
+ #else
2233
+ simde__m128_private
2234
+ r_,
2235
+ a_ = simde__m128_to_private(a),
2236
+ b_ = simde__m128_to_private(b);
2237
+
2238
+ #if defined(SIMDE_SSE_NEON)
2239
+ r_.neon_f32 = vminq_f32(a_.neon_f32, b_.neon_f32);
2240
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
2241
+ r_.altivec_f32 = vec_min(a_.altivec_f32, b_.altivec_f32);
2242
+ #else
2243
+ SIMDE__VECTORIZE
2244
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
2245
+ r_.f32[i] = (a_.f32[i] < b_.f32[i]) ? a_.f32[i] : b_.f32[i];
2246
+ }
2247
+ #endif
2248
+
2249
+ return simde__m128_from_private(r_);
2250
+ #endif
2251
+ }
2252
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2253
+ # define _mm_min_ps(a, b) simde_mm_min_ps((a), (b))
2254
+ #endif
2255
+
2256
+ SIMDE__FUNCTION_ATTRIBUTES
2257
+ simde__m64
2258
+ simde_mm_min_pu8 (simde__m64 a, simde__m64 b) {
2259
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
2260
+ return _mm_min_pu8(a, b);
2261
+ #else
2262
+ simde__m64_private
2263
+ r_,
2264
+ a_ = simde__m64_to_private(a),
2265
+ b_ = simde__m64_to_private(b);
2266
+
2267
+ #if defined(SIMDE_SSE_NEON)
2268
+ r_.neon_u8 = vmin_u8(a_.neon_u8, b_.neon_u8);
2269
+ #else
2270
+ SIMDE__VECTORIZE
2271
+ for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
2272
+ r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
2273
+ }
2274
+ #endif
2275
+
2276
+ return simde__m64_from_private(r_);
2277
+ #endif
2278
+ }
2279
+ #define simde_m_pminub(a, b) simde_mm_min_pu8(a, b)
2280
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2281
+ # define _mm_min_pu8(a, b) simde_mm_min_pu8(a, b)
2282
+ # define _m_pminub(a, b) simde_mm_min_pu8(a, b)
2283
+ #endif
2284
+
2285
+ SIMDE__FUNCTION_ATTRIBUTES
2286
+ simde__m128
2287
+ simde_mm_min_ss (simde__m128 a, simde__m128 b) {
2288
+ #if defined(SIMDE_SSE_NATIVE)
2289
+ return _mm_min_ss(a, b);
2290
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
2291
+ return simde_mm_move_ss(a, simde_mm_min_ps(a, b));
2292
+ #else
2293
+ simde__m128_private
2294
+ r_,
2295
+ a_ = simde__m128_to_private(a),
2296
+ b_ = simde__m128_to_private(b);
2297
+
2298
+ r_.f32[0] = (a_.f32[0] < b_.f32[0]) ? a_.f32[0] : b_.f32[0];
2299
+ r_.f32[1] = a_.f32[1];
2300
+ r_.f32[2] = a_.f32[2];
2301
+ r_.f32[3] = a_.f32[3];
2302
+
2303
+ return simde__m128_from_private(r_);
2304
+ #endif
2305
+ }
2306
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2307
+ # define _mm_min_ss(a, b) simde_mm_min_ss((a), (b))
2308
+ #endif
2309
+
2310
+ SIMDE__FUNCTION_ATTRIBUTES
2311
+ simde__m128
2312
+ simde_mm_movehl_ps (simde__m128 a, simde__m128 b) {
2313
+ #if defined(SIMDE_SSE_NATIVE)
2314
+ return _mm_movehl_ps(a, b);
2315
+ #else
2316
+ simde__m128_private
2317
+ r_,
2318
+ a_ = simde__m128_to_private(a),
2319
+ b_ = simde__m128_to_private(b);
2320
+
2321
+ #if defined(SIMDE__SHUFFLE_VECTOR)
2322
+ r_.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a_.f32, b_.f32, 6, 7, 2, 3);
2323
+ #else
2324
+ r_.f32[0] = b_.f32[2];
2325
+ r_.f32[1] = b_.f32[3];
2326
+ r_.f32[2] = a_.f32[2];
2327
+ r_.f32[3] = a_.f32[3];
2328
+ #endif
2329
+
2330
+ return simde__m128_from_private(r_);
2331
+ #endif
2332
+ }
2333
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2334
+ # define _mm_movehl_ps(a, b) simde_mm_movehl_ps((a), (b))
2335
+ #endif
2336
+
2337
+ SIMDE__FUNCTION_ATTRIBUTES
2338
+ simde__m128
2339
+ simde_mm_movelh_ps (simde__m128 a, simde__m128 b) {
2340
+ #if defined(SIMDE_SSE_NATIVE)
2341
+ return _mm_movelh_ps(a, b);
2342
+ #else
2343
+ simde__m128_private
2344
+ r_,
2345
+ a_ = simde__m128_to_private(a),
2346
+ b_ = simde__m128_to_private(b);
2347
+
2348
+ #if defined(SIMDE__SHUFFLE_VECTOR)
2349
+ r_.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a_.f32, b_.f32, 0, 1, 4, 5);
2350
+ #else
2351
+ r_.f32[0] = a_.f32[0];
2352
+ r_.f32[1] = a_.f32[1];
2353
+ r_.f32[2] = b_.f32[0];
2354
+ r_.f32[3] = b_.f32[1];
2355
+ #endif
2356
+
2357
+ return simde__m128_from_private(r_);
2358
+ #endif
2359
+ }
2360
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2361
+ # define _mm_movelh_ps(a, b) simde_mm_movelh_ps((a), (b))
2362
+ #endif
2363
+
2364
+ SIMDE__FUNCTION_ATTRIBUTES
2365
+ int
2366
+ simde_mm_movemask_pi8 (simde__m64 a) {
2367
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
2368
+ return _mm_movemask_pi8(a);
2369
+ #else
2370
+ simde__m64_private a_ = simde__m64_to_private(a);
2371
+ int r = 0;
2372
+ const size_t nmemb = sizeof(a_.i8) / sizeof(a_.i8[0]);
2373
+
2374
+ SIMDE__VECTORIZE_REDUCTION(|:r)
2375
+ for (size_t i = 0 ; i < nmemb ; i++) {
2376
+ r |= (a_.u8[nmemb - 1 - i] >> 7) << (nmemb - 1 - i);
2377
+ }
2378
+
2379
+ return r;
2380
+ #endif
2381
+ }
2382
+ #define simde_m_pmovmskb(a) simde_mm_movemask_pi8(a)
2383
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2384
+ # define _mm_movemask_pi8(a) simde_mm_movemask_pi8(a)
2385
+ #endif
2386
+
2387
+ SIMDE__FUNCTION_ATTRIBUTES
2388
+ int
2389
+ simde_mm_movemask_ps (simde__m128 a) {
2390
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
2391
+ return _mm_movemask_ps(a);
2392
+ #else
2393
+ int r = 0;
2394
+ simde__m128_private a_ = simde__m128_to_private(a);
2395
+
2396
+ #if defined(SIMDE_SSE_NEON)
2397
+ /* TODO: check to see if NEON version is faster than the portable version */
2398
+ static const uint32x4_t movemask = { 1, 2, 4, 8 };
2399
+ static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
2400
+ uint32x4_t t0 = a_.neon_u32;
2401
+ uint32x4_t t1 = vtstq_u32(t0, highbit);
2402
+ uint32x4_t t2 = vandq_u32(t1, movemask);
2403
+ uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
2404
+ r = vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
2405
+ #else
2406
+ SIMDE__VECTORIZE_REDUCTION(|:r)
2407
+ for (size_t i = 0 ; i < sizeof(a_.u32) / sizeof(a_.u32[0]) ; i++) {
2408
+ r |= (a_.u32[i] >> ((sizeof(a_.u32[i]) * CHAR_BIT) - 1)) << i;
2409
+ }
2410
+ #endif
2411
+
2412
+ return r;
2413
+ #endif
2414
+ }
2415
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2416
+ # define _mm_movemask_ps(a) simde_mm_movemask_ps((a))
2417
+ #endif
2418
+
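The sign-bit mask built above lends itself to per-lane tests; a small illustrative sketch (the function name is made up and it assumes SIMDe's simde_mm_cmplt_ps wrapper, defined elsewhere in this header):

    /* Non-zero if any lane of a is less than the matching lane of b: each set
       bit i of the movemask means lane i of the comparison was all ones. */
    static int any_lane_lt (simde__m128 a, simde__m128 b) {
      return simde_mm_movemask_ps(simde_mm_cmplt_ps(a, b)) != 0;
    }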
2419
+ SIMDE__FUNCTION_ATTRIBUTES
2420
+ simde__m128
2421
+ simde_mm_mul_ps (simde__m128 a, simde__m128 b) {
2422
+ #if defined(SIMDE_SSE_NATIVE)
2423
+ return _mm_mul_ps(a, b);
2424
+ #else
2425
+ simde__m128_private
2426
+ r_,
2427
+ a_ = simde__m128_to_private(a),
2428
+ b_ = simde__m128_to_private(b);
2429
+
2430
+ #if defined(SIMDE_SSE_NEON)
2431
+ r_.neon_f32 = vmulq_f32(a_.neon_f32, b_.neon_f32);
2432
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
2433
+ r_.wasm_v128 = wasm_f32x4_mul(a_.wasm_v128, b_.wasm_v128);
2434
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2435
+ r_.f32 = a_.f32 * b_.f32;
2436
+ #else
2437
+ SIMDE__VECTORIZE
2438
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
2439
+ r_.f32[i] = a_.f32[i] * b_.f32[i];
2440
+ }
2441
+ #endif
2442
+
2443
+ return simde__m128_from_private(r_);
2444
+ #endif
2445
+ }
2446
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2447
+ # define _mm_mul_ps(a, b) simde_mm_mul_ps((a), (b))
2448
+ #endif
2449
+
2450
+ SIMDE__FUNCTION_ATTRIBUTES
2451
+ simde__m128
2452
+ simde_mm_mul_ss (simde__m128 a, simde__m128 b) {
2453
+ #if defined(SIMDE_SSE_NATIVE)
2454
+ return _mm_mul_ss(a, b);
2455
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
2456
+ return simde_mm_move_ss(a, simde_mm_mul_ps(a, b));
2457
+ #else
2458
+ simde__m128_private
2459
+ r_,
2460
+ a_ = simde__m128_to_private(a),
2461
+ b_ = simde__m128_to_private(b);
2462
+
2463
+ r_.f32[0] = a_.f32[0] * b_.f32[0];
2464
+ r_.f32[1] = a_.f32[1];
2465
+ r_.f32[2] = a_.f32[2];
2466
+ r_.f32[3] = a_.f32[3];
2467
+
2468
+ return simde__m128_from_private(r_);
2469
+ #endif
2470
+ }
2471
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2472
+ # define _mm_mul_ss(a, b) simde_mm_mul_ss((a), (b))
2473
+ #endif
2474
+
2475
+ SIMDE__FUNCTION_ATTRIBUTES
2476
+ simde__m64
2477
+ simde_mm_mulhi_pu16 (simde__m64 a, simde__m64 b) {
2478
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
2479
+ return _mm_mulhi_pu16(a, b);
2480
+ #else
2481
+ simde__m64_private
2482
+ r_,
2483
+ a_ = simde__m64_to_private(a),
2484
+ b_ = simde__m64_to_private(b);
2485
+
2486
+ SIMDE__VECTORIZE
2487
+ for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
2488
+ r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, ((HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i])) >> UINT32_C(16)));
2489
+ }
2490
+
2491
+ return simde__m64_from_private(r_);
2492
+ #endif
2493
+ }
2494
+ #define simde_m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b)
2495
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2496
+ # define _mm_mulhi_pu16(a, b) simde_mm_mulhi_pu16(a, b)
2497
+ #endif
2498
+
2499
+ SIMDE__FUNCTION_ATTRIBUTES
2500
+ simde__m128
2501
+ simde_mm_or_ps (simde__m128 a, simde__m128 b) {
2502
+ #if defined(SIMDE_SSE_NATIVE)
2503
+ return _mm_or_ps(a, b);
2504
+ #else
2505
+ simde__m128_private
2506
+ r_,
2507
+ a_ = simde__m128_to_private(a),
2508
+ b_ = simde__m128_to_private(b);
2509
+
2510
+ #if defined(SIMDE_SSE_NEON)
2511
+ r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32);
2512
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2513
+ r_.i32f = a_.i32f | b_.i32f;
2514
+ #else
2515
+ SIMDE__VECTORIZE
2516
+ for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
2517
+ r_.u32[i] = a_.u32[i] | b_.u32[i];
2518
+ }
2519
+ #endif
2520
+
2521
+ return simde__m128_from_private(r_);
2522
+ #endif
2523
+ }
2524
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2525
+ # define _mm_or_ps(a, b) simde_mm_or_ps((a), (b))
2526
+ #endif
2527
+
2528
+ SIMDE__FUNCTION_ATTRIBUTES
2529
+ void
2530
+ simde_mm_prefetch (char const* p, int i) {
2531
+ (void) p;
2532
+ (void) i;
2533
+ }
2534
+ #if defined(SIMDE_SSE_NATIVE)
2535
+ # define simde_mm_prefetch(p, i) _mm_prefetch(p, i)
2536
+ #endif
2537
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2538
+ # define _mm_prefetch(p, i) simde_mm_prefetch(p, i)
2539
+ #endif
2540
+
2541
+ SIMDE__FUNCTION_ATTRIBUTES
2542
+ simde__m128
2543
+ simde_mm_rcp_ps (simde__m128 a) {
2544
+ #if defined(SIMDE_SSE_NATIVE)
2545
+ return _mm_rcp_ps(a);
2546
+ #else
2547
+ simde__m128_private
2548
+ r_,
2549
+ a_ = simde__m128_to_private(a);
2550
+
2551
+ #if defined(SIMDE_SSE_NEON)
2552
+ float32x4_t recip = vrecpeq_f32(a_.neon_f32);
2553
+
2554
+ # if !defined(SIMDE_MM_RCP_PS_ITERS)
2555
+ # define SIMDE_MM_RCP_PS_ITERS SIMDE_ACCURACY_ITERS
2556
+ # endif
2557
+
2558
+ for (int i = 0; i < SIMDE_MM_RCP_PS_ITERS ; ++i) {
2559
+ recip = vmulq_f32(recip, vrecpsq_f32(recip, a_.neon_f32));
2560
+ }
2561
+
2562
+ r_.neon_f32 = recip;
2563
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
2564
+ r_.f32 = 1.0f / a_.f32;
2565
+ #else
2566
+ SIMDE__VECTORIZE
2567
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
2568
+ r_.f32[i] = 1.0f / a_.f32[i];
2569
+ }
2570
+ #endif
2571
+
2572
+ return simde__m128_from_private(r_);
2573
+ #endif
2574
+ }
2575
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2576
+ # define _mm_rcp_ps(a) simde_mm_rcp_ps((a))
2577
+ #endif
2578
+
2579
+ SIMDE__FUNCTION_ATTRIBUTES
2580
+ simde__m128
2581
+ simde_mm_rcp_ss (simde__m128 a) {
2582
+ #if defined(SIMDE_SSE_NATIVE)
2583
+ return _mm_rcp_ss(a);
2584
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
2585
+ return simde_mm_move_ss(a, simde_mm_rcp_ps(a));
2586
+ #else
2587
+ simde__m128_private
2588
+ r_,
2589
+ a_ = simde__m128_to_private(a);
2590
+
2591
+ r_.f32[0] = 1.0f / a_.f32[0];
2592
+ r_.f32[1] = a_.f32[1];
2593
+ r_.f32[2] = a_.f32[2];
2594
+ r_.f32[3] = a_.f32[3];
2595
+
2596
+ return simde__m128_from_private(r_);
2597
+ #endif
2598
+ }
2599
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2600
+ # define _mm_rcp_ss(a) simde_mm_rcp_ss((a))
2601
+ #endif
2602
+
2603
+ SIMDE__FUNCTION_ATTRIBUTES
2604
+ simde__m128
2605
+ simde_mm_rsqrt_ps (simde__m128 a) {
2606
+ #if defined(SIMDE_SSE_NATIVE)
2607
+ return _mm_rsqrt_ps(a);
2608
+ #else
2609
+ simde__m128_private
2610
+ r_,
2611
+ a_ = simde__m128_to_private(a);
2612
+
2613
+ #if defined(SIMDE_SSE_NEON)
2614
+ r_.neon_f32 = vrsqrteq_f32(a_.neon_f32);
2615
+ #elif defined(__STDC_IEC_559__)
2616
+ /* http://h14s.p5r.org/2012/09/0x5f3759df.html?mwh=1 */
2617
+ SIMDE__VECTORIZE
2618
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
2619
+ r_.i32[i] = INT32_C(0x5f3759df) - (a_.i32[i] >> 1);
2620
+
2621
+ #if SIMDE_ACCURACY_ITERS > 2
2622
+ const float half = SIMDE_FLOAT32_C(0.5) * a_.f32[i];
2623
+ for (int ai = 2 ; ai < SIMDE_ACCURACY_ITERS ; ai++)
2624
+ r_.f32[i] *= SIMDE_FLOAT32_C(1.5) - (half * r_.f32[i] * r_.f32[i]);
2625
+ #endif
2626
+ }
2627
+ #elif defined(SIMDE_HAVE_MATH_H)
2628
+ SIMDE__VECTORIZE
2629
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
2630
+ r_.f32[i] = 1.0f / sqrtf(a_.f32[i]);
2631
+ }
2632
+ #else
2633
+ HEDLEY_UNREACHABLE();
2634
+ #endif
2635
+
2636
+ return simde__m128_from_private(r_);
2637
+ #endif
2638
+ }
2639
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2640
+ # define _mm_rsqrt_ps(a) simde_mm_rsqrt_ps((a))
2641
+ #endif
2642
+
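The __STDC_IEC_559__ branch above is the vectorized form of the 0x5f3759df reciprocal square root trick referenced by the link in the code; a scalar sketch for reference, with a made-up function name:

    #include <stdint.h>
    #include <string.h>
    /* Reinterpret the float's bits, form a first guess at 1/sqrt(x) from the
       magic constant, then refine with a Newton-Raphson step, as the loop above
       does per lane. */
    static float rsqrt_approx (float x) {
      int32_t i;
      float y;
      memcpy(&i, &x, sizeof(i));
      i = INT32_C(0x5f3759df) - (i >> 1);
      memcpy(&y, &i, sizeof(y));
      y *= 1.5f - (0.5f * x * y * y);
      return y;
    }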
2643
+ SIMDE__FUNCTION_ATTRIBUTES
2644
+ simde__m128
2645
+ simde_mm_rsqrt_ss (simde__m128 a) {
2646
+ #if defined(SIMDE_SSE_NATIVE)
2647
+ return _mm_rsqrt_ss(a);
2648
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
2649
+ return simde_mm_move_ss(a, simde_mm_rsqrt_ps(a));
2650
+ #else
2651
+ simde__m128_private
2652
+ r_,
2653
+ a_ = simde__m128_to_private(a);
2654
+
2655
+ #if defined(__STDC_IEC_559__)
2656
+ {
2657
+ r_.i32[0] = INT32_C(0x5f3759df) - (a_.i32[0] >> 1);
2658
+
2659
+ #if SIMDE_ACCURACY_ITERS > 2
2660
+ float half = SIMDE_FLOAT32_C(0.5) * a_.f32[0];
2661
+ for (int ai = 2 ; ai < SIMDE_ACCURACY_ITERS ; ai++)
2662
+ r_.f32[0] *= SIMDE_FLOAT32_C(1.5) - (half * r_.f32[0] * r_.f32[0]);
2663
+ #endif
2664
+ }
2665
+ r_.f32[0] = 1.0f / sqrtf(a_.f32[0]);
2666
+ r_.f32[1] = a_.f32[1];
2667
+ r_.f32[2] = a_.f32[2];
2668
+ r_.f32[3] = a_.f32[3];
2669
+ #elif defined(SIMDE_HAVE_MATH_H)
2670
+ r_.f32[0] = 1.0f / sqrtf(a_.f32[0]);
2671
+ r_.f32[1] = a_.f32[1];
2672
+ r_.f32[2] = a_.f32[2];
2673
+ r_.f32[3] = a_.f32[3];
2674
+ #else
2675
+ HEDLEY_UNREACHABLE();
2676
+ #endif
2677
+
2678
+ return simde__m128_from_private(r_);
2679
+ #endif
2680
+ }
2681
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2682
+ # define _mm_rsqrt_ss(a) simde_mm_rsqrt_ss((a))
2683
+ #endif
2684
+
2685
+ SIMDE__FUNCTION_ATTRIBUTES
2686
+ simde__m64
2687
+ simde_mm_sad_pu8 (simde__m64 a, simde__m64 b) {
2688
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
2689
+ return _mm_sad_pu8(a, b);
2690
+ #else
2691
+ simde__m64_private
2692
+ r_,
2693
+ a_ = simde__m64_to_private(a),
2694
+ b_ = simde__m64_to_private(b);
2695
+ uint16_t sum = 0;
2696
+
2697
+ #if defined(SIMDE_HAVE_STDLIB_H)
2698
+ SIMDE__VECTORIZE_REDUCTION(+:sum)
2699
+ for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
2700
+ sum += (uint8_t) abs(a_.u8[i] - b_.u8[i]);
2701
+ }
2702
+
2703
+ r_.i16[0] = (int16_t) sum;
2704
+ r_.i16[1] = 0;
2705
+ r_.i16[2] = 0;
2706
+ r_.i16[3] = 0;
2707
+ #else
2708
+ HEDLEY_UNREACHABLE();
2709
+ #endif
2710
+
2711
+ return simde__m64_from_private(r_);
2712
+ #endif
2713
+ }
2714
+ #define simde_m_psadbw(a, b) simde_mm_sad_pu8(a, b)
2715
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2716
+ # define _mm_sad_pu8(a, b) simde_mm_sad_pu8(a, b)
2717
+ # define _m_psadbw(a, b) simde_mm_sad_pu8(a, b)
2718
+ #endif
2719
+
2720
+ SIMDE__FUNCTION_ATTRIBUTES
2721
+ simde__m128
2722
+ simde_mm_set_ss (simde_float32 a) {
2723
+ #if defined(SIMDE_SSE_NATIVE)
2724
+ return _mm_set_ss(a);
2725
+ #elif defined(SIMDE_SSE_NEON)
2726
+ return vsetq_lane_f32(a, vdupq_n_f32(SIMDE_FLOAT32_C(0.0)), 0);
2727
+ #else
2728
+ return simde_mm_set_ps(SIMDE_FLOAT32_C(0.0), SIMDE_FLOAT32_C(0.0), SIMDE_FLOAT32_C(0.0), a);
2729
+ #endif
2730
+ }
2731
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2732
+ # define _mm_set_ss(a) simde_mm_set_ss(a)
2733
+ #endif
2734
+
2735
+ SIMDE__FUNCTION_ATTRIBUTES
2736
+ simde__m128
2737
+ simde_mm_setr_ps (simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) {
2738
+ #if defined(SIMDE_SSE_NATIVE)
2739
+ return _mm_setr_ps(e3, e2, e1, e0);
2740
+ #elif defined(SIMDE_SSE_NEON)
2741
+ SIMDE_ALIGN(16) simde_float32 data[4] = { e3, e2, e1, e0 };
2742
+ return vld1q_f32(data);
2743
+ #else
2744
+ return simde_mm_set_ps(e0, e1, e2, e3);
2745
+ #endif
2746
+ }
2747
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2748
+ # define _mm_setr_ps(e3, e2, e1, e0) simde_mm_setr_ps(e3, e2, e1, e0)
2749
+ #endif
2750
+
2751
+ SIMDE__FUNCTION_ATTRIBUTES
2752
+ simde__m128
2753
+ simde_mm_setzero_ps (void) {
2754
+ #if defined(SIMDE_SSE_NATIVE)
2755
+ return _mm_setzero_ps();
2756
+ #elif defined(SIMDE_SSE_NEON)
2757
+ return vdupq_n_f32(SIMDE_FLOAT32_C(0.0));
2758
+ #else
2759
+ simde__m128 r;
2760
+ simde_memset(&r, 0, sizeof(r));
2761
+ return r;
2762
+ #endif
2763
+ }
2764
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2765
+ # define _mm_setzero_ps() simde_mm_setzero_ps()
2766
+ #endif
2767
+
2768
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
2769
+ HEDLEY_DIAGNOSTIC_PUSH
2770
+ SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
2771
+ #endif
2772
+
2773
+ SIMDE__FUNCTION_ATTRIBUTES
2774
+ simde__m128
2775
+ simde_mm_undefined_ps (void) {
2776
+ simde__m128_private r_;
2777
+
2778
+ #if defined(SIMDE__HAVE_UNDEFINED128)
2779
+ r_.n = _mm_undefined_ps();
2780
+ #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
2781
+ r_ = simde__m128_to_private(simde_mm_setzero_ps());
2782
+ #endif
2783
+
2784
+ return simde__m128_from_private(r_);
2785
+ }
2786
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2787
+ # define _mm_undefined_ps() simde_mm_undefined_ps()
2788
+ #endif
2789
+
2790
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
2791
+ HEDLEY_DIAGNOSTIC_POP
2792
+ #endif
2793
+
2794
+ SIMDE__FUNCTION_ATTRIBUTES
2795
+ simde__m128
2796
+ simde_mm_setone_ps (void) {
2797
+ simde__m128 t = simde_mm_setzero_ps();
2798
+ return simde_mm_cmpeq_ps(t, t);
2799
+ }
2800
+
2801
+ SIMDE__FUNCTION_ATTRIBUTES
2802
+ void
2803
+ simde_mm_sfence (void) {
2804
+ /* TODO: Use Hedley. */
2805
+ #if defined(SIMDE_SSE_NATIVE)
2806
+ _mm_sfence();
2807
+ #elif defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
2808
+ __atomic_thread_fence(__ATOMIC_SEQ_CST);
2809
+ #elif !defined(__INTEL_COMPILER) && defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__)
2810
+ # if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ < 9)
2811
+ __atomic_thread_fence(__ATOMIC_SEQ_CST);
2812
+ # else
2813
+ atomic_thread_fence(memory_order_seq_cst);
2814
+ # endif
2815
+ #elif defined(_MSC_VER)
2816
+ MemoryBarrier();
2817
+ #elif HEDLEY_HAS_EXTENSION(c_atomic)
2818
+ __c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
2819
+ #elif defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
2820
+ __sync_synchronize();
2821
+ #elif defined(_OPENMP)
2822
+ # pragma omp critical(simde_mm_sfence_)
2823
+ { }
2824
+ #endif
2825
+ }
2826
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2827
+ # define _mm_sfence() simde_mm_sfence()
2828
+ #endif
2829
+
2830
+ #define SIMDE_MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
2831
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2832
+ # define _MM_SHUFFLE(z, y, x, w) SIMDE_MM_SHUFFLE(z, y, x, w)
2833
+ #endif
2834
+
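SIMDE_MM_SHUFFLE packs four 2-bit lane selectors into one immediate, w in the low bits, which the shuffle implementations below decode with (imm8 >> (2*lane)) & 3; a small worked example (illustrative only, assuming the header is included):

    /* SIMDE_MM_SHUFFLE(3, 2, 1, 0) == (3<<6)|(2<<4)|(1<<2)|0 == 0xE4, the identity
       order.  Passing the same vector twice with SIMDE_MM_SHUFFLE(1, 0, 3, 2)
       swaps the halves: result = { v[2], v[3], v[0], v[1] }. */
    static simde__m128 swap_halves (simde__m128 v) {
      return simde_mm_shuffle_ps(v, v, SIMDE_MM_SHUFFLE(1, 0, 3, 2));
    }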
2835
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX) && !defined(__PGI)
2836
+ # define simde_mm_shuffle_pi16(a, imm8) _mm_shuffle_pi16(a, imm8)
2837
+ #elif defined(SIMDE__SHUFFLE_VECTOR)
2838
+ # define simde_mm_shuffle_pi16(a, imm8) (__extension__ ({ \
2839
+ const simde__m64_private simde__tmp_a_ = simde__m64_to_private(a); \
2840
+ simde__m64_from_private((simde__m64_private) { .i16 = \
2841
+ SIMDE__SHUFFLE_VECTOR(16, 8, \
2842
+ (simde__tmp_a_).i16, \
2843
+ (simde__tmp_a_).i16, \
2844
+ (((imm8) ) & 3), \
2845
+ (((imm8) >> 2) & 3), \
2846
+ (((imm8) >> 4) & 3), \
2847
+ (((imm8) >> 6) & 3)) }); }))
2848
+ #else
2849
+ SIMDE__FUNCTION_ATTRIBUTES
2850
+ simde__m64
2851
+ simde_mm_shuffle_pi16 (simde__m64 a, const int imm8)
2852
+ HEDLEY_REQUIRE_MSG((imm8 & 0xff) == imm8, "imm8 must be in range [0, 255]") {
2853
+ simde__m64_private r_;
2854
+ simde__m64_private a_ = simde__m64_to_private(a);
2855
+
2856
+ for (size_t i = 0 ; i < sizeof(r_.i16) / sizeof(r_.i16[0]) ; i++) {
2857
+ r_.i16[i] = a_.i16[(imm8 >> (i * 2)) & 3];
2858
+ }
2859
+
2860
+ HEDLEY_DIAGNOSTIC_PUSH
2861
+ #if HEDLEY_HAS_WARNING("-Wconditional-uninitialized")
2862
+ # pragma clang diagnostic ignored "-Wconditional-uninitialized"
2863
+ #endif
2864
+ return simde__m64_from_private(r_);
2865
+ HEDLEY_DIAGNOSTIC_POP
2866
+ }
2867
+ #endif
2868
+ #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
2869
+ # define simde_m_pshufw(a, imm8) _m_pshufw(a, imm8)
2870
+ #else
2871
+ # define simde_m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8)
2872
+ #endif
2873
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2874
+ # define _mm_shuffle_pi16(a, imm8) simde_mm_shuffle_pi16(a, imm8)
2875
+ # define _m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8)
2876
+ #endif
2877
+
2878
+ #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
2879
+ # define simde_mm_shuffle_ps(a, b, imm8) _mm_shuffle_ps(a, b, imm8)
2880
+ #elif defined(SIMDE__SHUFFLE_VECTOR)
2881
+ # define simde_mm_shuffle_ps(a, b, imm8) (__extension__ ({ \
2882
+ simde__m128_from_private((simde__m128_private) { .f32 = \
2883
+ SIMDE__SHUFFLE_VECTOR(32, 16, \
2884
+ simde__m128_to_private(a).f32, \
2885
+ simde__m128_to_private(b).f32, \
2886
+ (((imm8) ) & 3), \
2887
+ (((imm8) >> 2) & 3), \
2888
+ (((imm8) >> 4) & 3) + 4, \
2889
+ (((imm8) >> 6) & 3) + 4) }); }))
2890
+ #else
2891
+ SIMDE__FUNCTION_ATTRIBUTES
2892
+ simde__m128
2893
+ simde_mm_shuffle_ps (simde__m128 a, simde__m128 b, const int imm8)
2894
+ HEDLEY_REQUIRE_MSG((imm8 & 0xff) == imm8, "imm8 must be in range [0, 255]") {
2895
+ simde__m128_private
2896
+ r_,
2897
+ a_ = simde__m128_to_private(a),
2898
+ b_ = simde__m128_to_private(b);
2899
+
2900
+ r_.f32[0] = a_.f32[(imm8 >> 0) & 3];
2901
+ r_.f32[1] = a_.f32[(imm8 >> 2) & 3];
2902
+ r_.f32[2] = b_.f32[(imm8 >> 4) & 3];
2903
+ r_.f32[3] = b_.f32[(imm8 >> 6) & 3];
2904
+
2905
+ return simde__m128_from_private(r_);
2906
+ }
2907
+ #endif
2908
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2909
+ # define _mm_shuffle_ps(a, b, imm8) simde_mm_shuffle_ps((a), (b), imm8)
2910
+ #endif
2911
+
2912
+ SIMDE__FUNCTION_ATTRIBUTES
2913
+ simde__m128
2914
+ simde_mm_sqrt_ps (simde__m128 a) {
2915
+ #if defined(SIMDE_SSE_NATIVE)
2916
+ return _mm_sqrt_ps(a);
2917
+ #else
2918
+ simde__m128_private
2919
+ r_,
2920
+ a_ = simde__m128_to_private(a);
2921
+
2922
+ #if defined(SIMDE_SSE_NEON)
2923
+ float32x4_t recipsq = vrsqrteq_f32(a_.neon_f32);
2924
+ float32x4_t sq = vrecpeq_f32(recipsq);
2925
+ /* ??? use step versions of both sqrt and recip for better accuracy? */
2926
+ r_.neon_f32 = sq;
2927
+ #elif defined(SIMDE_HAVE_MATH_H)
2928
+ SIMDE__VECTORIZE
2929
+ for (size_t i = 0 ; i < sizeof(r_.f32) / sizeof(r_.f32[0]) ; i++) {
2930
+ r_.f32[i] = sqrtf(a_.f32[i]);
2931
+ }
2932
+ #else
2933
+ HEDLEY_UNREACHABLE();
2934
+ #endif
2935
+
2936
+ return simde__m128_from_private(r_);
2937
+ #endif
2938
+ }
2939
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2940
+ # define _mm_sqrt_ps(a) simde_mm_sqrt_ps((a))
2941
+ #endif
2942
+
2943
+ SIMDE__FUNCTION_ATTRIBUTES
2944
+ simde__m128
2945
+ simde_mm_sqrt_ss (simde__m128 a) {
2946
+ #if defined(SIMDE_SSE_NATIVE)
2947
+ return _mm_sqrt_ss(a);
2948
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
2949
+ return simde_mm_move_ss(a, simde_mm_sqrt_ps(a));
2950
+ #else
2951
+ simde__m128_private
2952
+ r_,
2953
+ a_ = simde__m128_to_private(a);
2954
+
2955
+ #if defined(SIMDE_HAVE_MATH_H)
2956
+ r_.f32[0] = sqrtf(a_.f32[0]);
2957
+ r_.f32[1] = a_.f32[1];
2958
+ r_.f32[2] = a_.f32[2];
2959
+ r_.f32[3] = a_.f32[3];
2960
+ #else
2961
+ HEDLEY_UNREACHABLE();
2962
+ #endif
2963
+
2964
+ return simde__m128_from_private(r_);
2965
+ #endif
2966
+ }
2967
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2968
+ # define _mm_sqrt_ss(a) simde_mm_sqrt_ss((a))
2969
+ #endif
2970
+
2971
+ SIMDE__FUNCTION_ATTRIBUTES
2972
+ void
2973
+ simde_mm_store_ps (simde_float32 mem_addr[4], simde__m128 a) {
2974
+ simde_assert_aligned(16, mem_addr);
2975
+
2976
+ #if defined(SIMDE_SSE_NATIVE)
2977
+ _mm_store_ps(mem_addr, a);
2978
+ #else
2979
+ simde__m128_private a_ = simde__m128_to_private(a);
2980
+
2981
+ #if defined(SIMDE_SSE_NEON)
2982
+ vst1q_f32(mem_addr, a_.neon_f32);
2983
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
2984
+ wasm_v128_store(mem_addr, a_.wasm_v128);
2985
+ #else
2986
+ SIMDE__VECTORIZE_ALIGNED(mem_addr:16)
2987
+ for (size_t i = 0 ; i < sizeof(a_.f32) / sizeof(a_.f32[0]) ; i++) {
2988
+ mem_addr[i] = a_.f32[i];
2989
+ }
2990
+ #endif
2991
+ #endif
2992
+ }
2993
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2994
+ # define _mm_store_ps(mem_addr, a) simde_mm_store_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), (a))
2995
+ #endif
2996
+
2997
+ SIMDE__FUNCTION_ATTRIBUTES
2998
+ void
2999
+ simde_mm_store_ps1 (simde_float32 mem_addr[4], simde__m128 a) {
3000
+ simde_assert_aligned(16, mem_addr);
3001
+
3002
+ #if defined(SIMDE_SSE_NATIVE)
3003
+ _mm_store_ps1(mem_addr, a);
3004
+ #else
3005
+ simde__m128_private a_ = simde__m128_to_private(a);
3006
+
3007
+ SIMDE__VECTORIZE_ALIGNED(mem_addr:16)
3008
+ for (size_t i = 0 ; i < sizeof(a_.f32) / sizeof(a_.f32[0]) ; i++) {
3009
+ mem_addr[i] = a_.f32[0];
3010
+ }
3011
+ #endif
3012
+ }
3013
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3014
+ # define _mm_store_ps1(mem_addr, a) simde_mm_store_ps1(HEDLEY_REINTERPRET_CAST(float*, mem_addr), (a))
3015
+ #endif
3016
+
3017
+ SIMDE__FUNCTION_ATTRIBUTES
3018
+ void
3019
+ simde_mm_store_ss (simde_float32* mem_addr, simde__m128 a) {
3020
+ #if defined(SIMDE_SSE_NATIVE)
3021
+ _mm_store_ss(mem_addr, a);
3022
+ #else
3023
+ simde__m128_private a_ = simde__m128_to_private(a);
3024
+
3025
+ #if defined(SIMDE_SSE_NEON)
3026
+ vst1q_lane_f32(mem_addr, a_.neon_f32, 0);
3027
+ #else
3028
+ *mem_addr = a_.f32[0];
3029
+ #endif
3030
+ #endif
3031
+ }
3032
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3033
+ # define _mm_store_ss(mem_addr, a) simde_mm_store_ss(HEDLEY_REINTERPRET_CAST(float*, mem_addr), (a))
3034
+ #endif
3035
+
3036
+ SIMDE__FUNCTION_ATTRIBUTES
3037
+ void
3038
+ simde_mm_store1_ps (simde_float32 mem_addr[4], simde__m128 a) {
3039
+ simde_assert_aligned(16, mem_addr);
3040
+
3041
+ #if defined(SIMDE_SSE_NATIVE)
3042
+ _mm_store1_ps(mem_addr, a);
3043
+ #else
3044
+ simde_mm_store_ps1(mem_addr, a);
3045
+ #endif
3046
+ }
3047
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3048
+ # define _mm_store1_ps(mem_addr, a) simde_mm_store1_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), (a))
3049
+ #endif
3050
+
3051
+ SIMDE__FUNCTION_ATTRIBUTES
3052
+ void
3053
+ simde_mm_storeh_pi (simde__m64* mem_addr, simde__m128 a) {
3054
+ #if defined(SIMDE_SSE_NATIVE)
3055
+ _mm_storeh_pi(HEDLEY_REINTERPRET_CAST(__m64*, mem_addr), a);
3056
+ #else
3057
+ simde__m64_private* dest_ = HEDLEY_REINTERPRET_CAST(simde__m64_private*, mem_addr);
3058
+ simde__m128_private a_ = simde__m128_to_private(a);
3059
+
3060
+ dest_->f32[0] = a_.f32[2];
3061
+ dest_->f32[1] = a_.f32[3];
3062
+ #endif
3063
+ }
3064
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3065
+ # define _mm_storeh_pi(mem_addr, a) simde_mm_storeh_pi(mem_addr, (a))
3066
+ #endif
3067
+
3068
+ SIMDE__FUNCTION_ATTRIBUTES
3069
+ void
3070
+ simde_mm_storel_pi (simde__m64* mem_addr, simde__m128 a) {
3071
+ #if defined(SIMDE_SSE_NATIVE)
3072
+ _mm_storel_pi(HEDLEY_REINTERPRET_CAST(__m64*, mem_addr), a);
3073
+ #else
3074
+ simde__m64_private* dest_ = HEDLEY_REINTERPRET_CAST(simde__m64_private*, mem_addr);
3075
+ simde__m128_private a_ = simde__m128_to_private(a);
3076
+
3077
+ dest_->f32[0] = a_.f32[0];
3078
+ dest_->f32[1] = a_.f32[1];
3079
+ #endif
3080
+ }
3081
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3082
+ # define _mm_storel_pi(mem_addr, a) simde_mm_storel_pi(mem_addr, (a))
3083
+ #endif
3084
+
3085
+ SIMDE__FUNCTION_ATTRIBUTES
3086
+ void
3087
+ simde_mm_storer_ps (simde_float32 mem_addr[4], simde__m128 a) {
3088
+ simde_assert_aligned(16, mem_addr);
3089
+
3090
+ #if defined(SIMDE_SSE_NATIVE)
3091
+ _mm_storer_ps(mem_addr, a);
3092
+ #else
3093
+ simde__m128_private a_ = simde__m128_to_private(a);
3094
+
3095
+ #if defined(SIMDE__SHUFFLE_VECTOR)
3096
+ a_.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a_.f32, a_.f32, 3, 2, 1, 0);
3097
+ simde_mm_store_ps(mem_addr, simde__m128_from_private(a_));
3098
+ #else
3099
+ SIMDE__VECTORIZE_ALIGNED(mem_addr:16)
3100
+ for (size_t i = 0 ; i < sizeof(a_.f32) / sizeof(a_.f32[0]) ; i++) {
3101
+ mem_addr[i] = a_.f32[((sizeof(a_.f32) / sizeof(a_.f32[0])) - 1) - i];
3102
+ }
3103
+ #endif
3104
+ #endif
3105
+ }
3106
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3107
+ # define _mm_storer_ps(mem_addr, a) simde_mm_storer_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), (a))
3108
+ #endif
3109
+
3110
+ SIMDE__FUNCTION_ATTRIBUTES
3111
+ void
3112
+ simde_mm_storeu_ps (simde_float32 mem_addr[4], simde__m128 a) {
3113
+ #if defined(SIMDE_SSE_NATIVE)
3114
+ _mm_storeu_ps(mem_addr, a);
3115
+ #else
3116
+ simde__m128_private a_ = simde__m128_to_private(a);
3117
+
3118
+ #if defined(SIMDE_SSE_NEON)
3119
+ vst1q_f32(mem_addr, a_.neon_f32);
3120
+ #else
3121
+ simde_memcpy(mem_addr, &a_, sizeof(a_));
3122
+ #endif
3123
+ #endif
3124
+ }
3125
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3126
+ # define _mm_storeu_ps(mem_addr, a) simde_mm_storeu_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), (a))
3127
+ #endif
3128
+
3129
+ SIMDE__FUNCTION_ATTRIBUTES
3130
+ simde__m128
3131
+ simde_mm_sub_ps (simde__m128 a, simde__m128 b) {
3132
+ #if defined(SIMDE_SSE_NATIVE)
3133
+ return _mm_sub_ps(a, b);
3134
+ #else
3135
+ simde__m128_private
3136
+ r_,
3137
+ a_ = simde__m128_to_private(a),
3138
+ b_ = simde__m128_to_private(b);
3139
+
3140
+ #if defined(SIMDE_SSE_NEON)
3141
+ r_.neon_f32 = vsubq_f32(a_.neon_f32, b_.neon_f32);
3142
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
3143
+ r_.wasm_v128 = wasm_f32x4_sub(a_.wasm_v128, b_.wasm_v128);
3144
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3145
+ r_.f32 = a_.f32 - b_.f32;
3146
+ #else
3147
+ SIMDE__VECTORIZE
3148
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
3149
+ r_.f32[i] = a_.f32[i] - b_.f32[i];
3150
+ }
3151
+ #endif
3152
+
3153
+ return simde__m128_from_private(r_);
3154
+ #endif
3155
+ }
3156
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3157
+ # define _mm_sub_ps(a, b) simde_mm_sub_ps((a), (b))
3158
+ #endif
3159
+
3160
+ SIMDE__FUNCTION_ATTRIBUTES
3161
+ simde__m128
3162
+ simde_mm_sub_ss (simde__m128 a, simde__m128 b) {
3163
+ #if defined(SIMDE_SSE_NATIVE)
3164
+ return _mm_sub_ss(a, b);
3165
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
3166
+ return simde_mm_move_ss(a, simde_mm_sub_ps(a, b));
3167
+ #else
3168
+ simde__m128_private
3169
+ r_,
3170
+ a_ = simde__m128_to_private(a),
3171
+ b_ = simde__m128_to_private(b);
3172
+
3173
+ r_.f32[0] = a_.f32[0] - b_.f32[0];
3174
+ r_.f32[1] = a_.f32[1];
3175
+ r_.f32[2] = a_.f32[2];
3176
+ r_.f32[3] = a_.f32[3];
3177
+
3178
+ return simde__m128_from_private(r_);
3179
+ #endif
3180
+ }
3181
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3182
+ # define _mm_sub_ss(a, b) simde_mm_sub_ss((a), (b))
3183
+ #endif
3184
+
3185
+ SIMDE__FUNCTION_ATTRIBUTES
3186
+ int
3187
+ simde_mm_ucomieq_ss (simde__m128 a, simde__m128 b) {
3188
+ #if defined(SIMDE_SSE_NATIVE)
3189
+ return _mm_ucomieq_ss(a, b);
3190
+ #else
3191
+ simde__m128_private
3192
+ a_ = simde__m128_to_private(a),
3193
+ b_ = simde__m128_to_private(b);
3194
+ int r;
3195
+
3196
+ #if defined(SIMDE_HAVE_FENV_H)
3197
+ fenv_t envp;
3198
+ int x = feholdexcept(&envp);
3199
+ r = a_.f32[0] == b_.f32[0];
3200
+ if (HEDLEY_LIKELY(x == 0))
3201
+ fesetenv(&envp);
3202
+ #else
3203
+ HEDLEY_UNREACHABLE();
3204
+ #endif
3205
+
3206
+ return r;
3207
+ #endif
3208
+ }
3209
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3210
+ # define _mm_ucomieq_ss(a, b) simde_mm_ucomieq_ss((a), (b))
3211
+ #endif
3212
+
3213
+ SIMDE__FUNCTION_ATTRIBUTES
3214
+ int
3215
+ simde_mm_ucomige_ss (simde__m128 a, simde__m128 b) {
3216
+ #if defined(SIMDE_SSE_NATIVE)
3217
+ return _mm_ucomige_ss(a, b);
3218
+ #else
3219
+ simde__m128_private
3220
+ a_ = simde__m128_to_private(a),
3221
+ b_ = simde__m128_to_private(b);
3222
+ int r;
3223
+
3224
+ #if defined(SIMDE_HAVE_FENV_H)
3225
+ fenv_t envp;
3226
+ int x = feholdexcept(&envp);
3227
+ r = a_.f32[0] >= b_.f32[0];
3228
+ if (HEDLEY_LIKELY(x == 0))
3229
+ fesetenv(&envp);
3230
+ #else
3231
+ HEDLEY_UNREACHABLE();
3232
+ #endif
3233
+
3234
+ return r;
3235
+ #endif
3236
+ }
3237
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3238
+ # define _mm_ucomige_ss(a, b) simde_mm_ucomige_ss((a), (b))
3239
+ #endif
3240
+
3241
+ SIMDE__FUNCTION_ATTRIBUTES
3242
+ int
3243
+ simde_mm_ucomigt_ss (simde__m128 a, simde__m128 b) {
3244
+ #if defined(SIMDE_SSE_NATIVE)
3245
+ return _mm_ucomigt_ss(a, b);
3246
+ #else
3247
+ simde__m128_private
3248
+ a_ = simde__m128_to_private(a),
3249
+ b_ = simde__m128_to_private(b);
3250
+ int r;
3251
+
3252
+ #if defined(SIMDE_HAVE_FENV_H)
3253
+ fenv_t envp;
3254
+ int x = feholdexcept(&envp);
3255
+ r = a_.f32[0] > b_.f32[0];
3256
+ if (HEDLEY_LIKELY(x == 0))
3257
+ fesetenv(&envp);
3258
+ #else
3259
+ HEDLEY_UNREACHABLE();
3260
+ #endif
3261
+
3262
+ return r;
3263
+ #endif
3264
+ }
3265
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3266
+ # define _mm_ucomigt_ss(a, b) simde_mm_ucomigt_ss((a), (b))
3267
+ #endif
3268
+
3269
+ SIMDE__FUNCTION_ATTRIBUTES
3270
+ int
3271
+ simde_mm_ucomile_ss (simde__m128 a, simde__m128 b) {
3272
+ #if defined(SIMDE_SSE_NATIVE)
3273
+ return _mm_ucomile_ss(a, b);
3274
+ #else
3275
+ simde__m128_private
3276
+ a_ = simde__m128_to_private(a),
3277
+ b_ = simde__m128_to_private(b);
3278
+ int r;
3279
+
3280
+ #if defined(SIMDE_HAVE_FENV_H)
3281
+ fenv_t envp;
3282
+ int x = feholdexcept(&envp);
3283
+ r = a_.f32[0] <= b_.f32[0];
3284
+ if (HEDLEY_LIKELY(x == 0))
3285
+ fesetenv(&envp);
3286
+ #else
3287
+ HEDLEY_UNREACHABLE();
3288
+ #endif
3289
+
3290
+ return r;
3291
+ #endif
3292
+ }
3293
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3294
+ # define _mm_ucomile_ss(a, b) simde_mm_ucomile_ss((a), (b))
3295
+ #endif
3296
+
3297
+ SIMDE__FUNCTION_ATTRIBUTES
3298
+ int
3299
+ simde_mm_ucomilt_ss (simde__m128 a, simde__m128 b) {
3300
+ #if defined(SIMDE_SSE_NATIVE)
3301
+ return _mm_ucomilt_ss(a, b);
3302
+ #else
3303
+ simde__m128_private
3304
+ a_ = simde__m128_to_private(a),
3305
+ b_ = simde__m128_to_private(b);
3306
+ int r;
3307
+
3308
+ #if defined(SIMDE_HAVE_FENV_H)
3309
+ fenv_t envp;
3310
+ int x = feholdexcept(&envp);
3311
+ r = a_.f32[0] < b_.f32[0];
3312
+ if (HEDLEY_LIKELY(x == 0))
3313
+ fesetenv(&envp);
3314
+ #else
3315
+ HEDLEY_UNREACHABLE();
3316
+ #endif
3317
+
3318
+ return r;
3319
+ #endif
3320
+ }
3321
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3322
+ # define _mm_ucomilt_ss(a, b) simde_mm_ucomilt_ss((a), (b))
3323
+ #endif
3324
+
3325
+ SIMDE__FUNCTION_ATTRIBUTES
3326
+ int
3327
+ simde_mm_ucomineq_ss (simde__m128 a, simde__m128 b) {
3328
+ #if defined(SIMDE_SSE_NATIVE)
3329
+ return _mm_ucomineq_ss(a, b);
3330
+ #else
3331
+ simde__m128_private
3332
+ a_ = simde__m128_to_private(a),
3333
+ b_ = simde__m128_to_private(b);
3334
+ int r;
3335
+
3336
+ #if defined(SIMDE_HAVE_FENV_H)
3337
+ fenv_t envp;
3338
+ int x = feholdexcept(&envp);
3339
+ r = a_.f32[0] != b_.f32[0];
3340
+ if (HEDLEY_LIKELY(x == 0))
3341
+ fesetenv(&envp);
3342
+ #else
3343
+ HEDLEY_UNREACHABLE();
3344
+ #endif
3345
+
3346
+ return r;
3347
+ #endif
3348
+ }
3349
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3350
+ # define _mm_ucomineq_ss(a, b) simde_mm_ucomineq_ss((a), (b))
3351
+ #endif
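/* Editorial example (not part of the vendored header or this diff): the
 * simde_mm_ucomi*_ss helpers above compare only lane 0 and, on the fallback
 * path, hold and restore the FP environment so the comparison stays quiet.
 * A small sketch assuming the header is included; scalar_in_range is a
 * hypothetical name. */
static int
scalar_in_range(simde__m128 x, simde__m128 lo, simde__m128 hi) {
  return simde_mm_ucomige_ss(x, lo) && simde_mm_ucomile_ss(x, hi);
}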
3352
+
3353
+ #if defined(SIMDE_SSE_NATIVE)
3354
+ # if defined(__has_builtin)
3355
+ # if __has_builtin(__builtin_ia32_undef128)
3356
+ # define SIMDE__HAVE_UNDEFINED128
3357
+ # endif
3358
+ # elif !defined(__PGI) && !defined(SIMDE_BUG_GCC_REV_208793) && !defined(_MSC_VER)
3359
+ # define SIMDE__HAVE_UNDEFINED128
3360
+ # endif
3361
+ #endif
3362
+
3363
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
3364
+ HEDLEY_DIAGNOSTIC_PUSH
3365
+ SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
3366
+ #endif
3367
+
3368
+ SIMDE__FUNCTION_ATTRIBUTES
3369
+ simde__m128
3370
+ simde_mm_unpackhi_ps (simde__m128 a, simde__m128 b) {
3371
+ #if defined(SIMDE_SSE_NATIVE)
3372
+ return _mm_unpackhi_ps(a, b);
3373
+ #else
3374
+ simde__m128_private
3375
+ r_,
3376
+ a_ = simde__m128_to_private(a),
3377
+ b_ = simde__m128_to_private(b);
3378
+
3379
+ #if defined(SIMDE_SSE_NEON)
3380
+ float32x2_t a1 = vget_high_f32(a_.neon_f32);
3381
+ float32x2_t b1 = vget_high_f32(b_.neon_f32);
3382
+ float32x2x2_t result = vzip_f32(a1, b1);
3383
+ r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
3384
+ #elif defined(SIMDE__SHUFFLE_VECTOR)
3385
+ r_.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a_.f32, b_.f32, 2, 6, 3, 7);
3386
+ #else
3387
+ r_.f32[0] = a_.f32[2];
3388
+ r_.f32[1] = b_.f32[2];
3389
+ r_.f32[2] = a_.f32[3];
3390
+ r_.f32[3] = b_.f32[3];
3391
+ #endif
3392
+
3393
+ return simde__m128_from_private(r_);
3394
+ #endif
3395
+ }
3396
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3397
+ # define _mm_unpackhi_ps(a, b) simde_mm_unpackhi_ps((a), (b))
3398
+ #endif
3399
+
3400
+ SIMDE__FUNCTION_ATTRIBUTES
3401
+ simde__m128
3402
+ simde_mm_unpacklo_ps (simde__m128 a, simde__m128 b) {
3403
+ #if defined(SIMDE_SSE_NATIVE)
3404
+ return _mm_unpacklo_ps(a, b);
3405
+ #else
3406
+ simde__m128_private
3407
+ r_,
3408
+ a_ = simde__m128_to_private(a),
3409
+ b_ = simde__m128_to_private(b);
3410
+
3411
+ #if defined(SIMDE__SHUFFLE_VECTOR)
3412
+ r_.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a_.f32, b_.f32, 0, 4, 1, 5);
3413
+ #elif defined(SIMDE_SSE_NEON)
3414
+ float32x2_t a1 = vget_low_f32(a_.neon_f32);
3415
+ float32x2_t b1 = vget_low_f32(b_.neon_f32);
3416
+ float32x2x2_t result = vzip_f32(a1, b1);
3417
+ r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
3418
+ #else
3419
+ r_.f32[0] = a_.f32[0];
3420
+ r_.f32[1] = b_.f32[0];
3421
+ r_.f32[2] = a_.f32[1];
3422
+ r_.f32[3] = b_.f32[1];
3423
+ #endif
3424
+
3425
+ return simde__m128_from_private(r_);
3426
+ #endif
3427
+ }
3428
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3429
+ # define _mm_unpacklo_ps(a, b) simde_mm_unpacklo_ps((a), (b))
3430
+ #endif
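/* Editorial example (not part of the vendored header or this diff): how the
 * two unpack helpers above interleave lanes, assuming the header is included. */
static void
unpack_example(void) {
  simde__m128 a = simde_mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); /* lanes 0..3 = 0,1,2,3 */
  simde__m128 b = simde_mm_set_ps(7.0f, 6.0f, 5.0f, 4.0f); /* lanes 0..3 = 4,5,6,7 */
  simde__m128 lo = simde_mm_unpacklo_ps(a, b); /* { a0, b0, a1, b1 } = { 0, 4, 1, 5 } */
  simde__m128 hi = simde_mm_unpackhi_ps(a, b); /* { a2, b2, a3, b3 } = { 2, 6, 3, 7 } */
  (void) lo;
  (void) hi;
}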
3431
+
3432
+ SIMDE__FUNCTION_ATTRIBUTES
3433
+ simde__m128
3434
+ simde_mm_xor_ps (simde__m128 a, simde__m128 b) {
3435
+ #if defined(SIMDE_SSE_NATIVE)
3436
+ return _mm_xor_ps(a, b);
3437
+ #else
3438
+ simde__m128_private
3439
+ r_,
3440
+ a_ = simde__m128_to_private(a),
3441
+ b_ = simde__m128_to_private(b);
3442
+
3443
+ #if defined(SIMDE_SSE_NEON)
3444
+ r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32);
3445
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3446
+ r_.i32f = a_.i32f ^ b_.i32f;
3447
+ #else
3448
+ SIMDE__VECTORIZE
3449
+ for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
3450
+ r_.u32[i] = a_.u32[i] ^ b_.u32[i];
3451
+ }
3452
+ #endif
3453
+
3454
+ return simde__m128_from_private(r_);
3455
+ #endif
3456
+ }
3457
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3458
+ # define _mm_xor_ps(a, b) simde_mm_xor_ps((a), (b))
3459
+ #endif
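/* Editorial example (not part of the vendored header or this diff): XOR-ing
 * with a vector whose sign bits are set negates every lane, a common use of
 * _mm_xor_ps.  A sketch assuming the header is included; negate_ps is a
 * hypothetical name. */
static simde__m128
negate_ps(simde__m128 v) {
  const simde__m128 sign_mask = simde_mm_set1_ps(-0.0f); /* 0x80000000 in each lane */
  return simde_mm_xor_ps(v, sign_mask);
}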
3460
+
3461
+ SIMDE__FUNCTION_ATTRIBUTES
3462
+ void
3463
+ simde_mm_stream_pi (simde__m64* mem_addr, simde__m64 a) {
3464
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
3465
+ _mm_stream_pi(HEDLEY_REINTERPRET_CAST(__m64*, mem_addr), a);
3466
+ #else
3467
+ simde__m64_private*
3468
+ dest = HEDLEY_REINTERPRET_CAST(simde__m64_private*, mem_addr),
3469
+ a_ = simde__m64_to_private(a);
3470
+
3471
+ #if defined(SIMDE_SSE_NEON)
3472
+ dest->i64[0] = vget_lane_s64(a_.neon_i64, 0);
3473
+ #else
3474
+ dest->i64[0] = a_.i64[0];
3475
+ #endif
3476
+ #endif
3477
+ }
3478
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3479
+ # define _mm_stream_pi(mem_addr, a) simde_mm_stream_pi(mem_addr, (a))
3480
+ #endif
3481
+
3482
+ SIMDE__FUNCTION_ATTRIBUTES
3483
+ void
3484
+ simde_mm_stream_ps (simde_float32 mem_addr[4], simde__m128 a) {
3485
+ simde_assert_aligned(16, mem_addr);
3486
+
3487
+ #if defined(SIMDE_SSE_NATIVE)
3488
+ _mm_stream_ps(mem_addr, a);
3489
+ #else
3490
+ simde__m128_private a_ = simde__m128_to_private(a);
3491
+
3492
+ #if defined(SIMDE_SSE_NEON)
3493
+ vst1q_f32(mem_addr, a_.neon_f32);
3494
+ #else
3495
+ SIMDE__ASSUME_ALIGNED(mem_addr, 16);
3496
+ simde_memcpy(mem_addr, &a_, sizeof(a_));
3497
+ #endif
3498
+ #endif
3499
+ }
3500
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3501
+ # define _mm_stream_ps(mem_addr, a) simde_mm_stream_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), (a))
3502
+ #endif
3503
+
3504
+ SIMDE__FUNCTION_ATTRIBUTES
3505
+ uint32_t
3506
+ simde_mm_getcsr (void) {
3507
+ #if defined(SIMDE_SSE_NATIVE)
3508
+ return _mm_getcsr();
3509
+ #else
3510
+ uint32_t r = 0;
3511
+
3512
+ #if defined(SIMDE_HAVE_FENV_H)
3513
+ int rounding_mode = fegetround();
3514
+
3515
+ switch(rounding_mode) {
3516
+ #if defined(FE_TONEAREST)
3517
+ case FE_TONEAREST:
3518
+ break;
3519
+ #endif
3520
+ #if defined(FE_UPWARD)
3521
+ case FE_UPWARD:
3522
+ r |= 2 << 13;
3523
+ break;
3524
+ #endif
3525
+ #if defined(FE_DOWNWARD)
3526
+ case FE_DOWNWARD:
3527
+ r |= 1 << 13;
3528
+ break;
3529
+ #endif
3530
+ #if defined(FE_TOWARDZERO)
3531
+ case FE_TOWARDZERO:
3532
+ r |= 3 << 13;
3533
+ break;
3534
+ #endif
3535
+ }
3536
+ #else
3537
+ HEDLEY_UNREACHABLE();
3538
+ #endif
3539
+
3540
+ return r;
3541
+ #endif
3542
+ }
3543
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3544
+ # define _mm_getcsr() simde_mm_getcsr()
3545
+ #endif
3546
+
3547
+ SIMDE__FUNCTION_ATTRIBUTES
3548
+ void
3549
+ simde_mm_setcsr (uint32_t a) {
3550
+ #if defined(SIMDE_SSE_NATIVE)
3551
+ _mm_setcsr(a);
3552
+ #else
3553
+ switch((a >> 13) & 3) {
3554
+ #if defined(FE_TONEAREST)
3555
+ case 0:
3556
+ fesetround(FE_TONEAREST);
3557
+ break;
3558
+ #endif
3559
+ #if defined(FE_DOWNWARD)
3560
+ case 1:
3561
+ fesetround(FE_DOWNWARD);
3562
+ break;
3563
+ #endif
3564
+ #if defined(FE_UPWARD)
3565
+ case 2:
3566
+ fesetround(FE_UPWARD);
3567
+ break;
3568
+ #endif
3569
+ #if defined(FE_TOWARDZERO)
3570
+ case 3:
3571
+ fesetround(FE_TOWARDZERO);
3572
+ break;
3573
+ #endif
3574
+ }
3575
+ #endif
3576
+ }
3577
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3578
+ # define _mm_setcsr(a) simde_mm_setcsr(a)
3579
+ #endif
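/* Editorial example (not part of the vendored header or this diff): the
 * fallback above only models bits 13-14 of the control register, which select
 * the rounding mode.  A sketch assuming the header is included. */
static void
round_toward_zero_example(void) {
  uint32_t csr = simde_mm_getcsr();
  simde_mm_setcsr((csr & ~(3u << 13)) | (3u << 13)); /* 3 << 13: round toward zero */
}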
3580
+
3581
+ #define SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
3582
+ do { \
3583
+ simde__m128 tmp3, tmp2, tmp1, tmp0; \
3584
+ tmp0 = simde_mm_unpacklo_ps((row0), (row1)); \
3585
+ tmp2 = simde_mm_unpacklo_ps((row2), (row3)); \
3586
+ tmp1 = simde_mm_unpackhi_ps((row0), (row1)); \
3587
+ tmp3 = simde_mm_unpackhi_ps((row2), (row3)); \
3588
+ row0 = simde_mm_movelh_ps(tmp0, tmp2); \
3589
+ row1 = simde_mm_movehl_ps(tmp2, tmp0); \
3590
+ row2 = simde_mm_movelh_ps(tmp1, tmp3); \
3591
+ row3 = simde_mm_movehl_ps(tmp3, tmp1); \
3592
+ } while (0)
3593
+
3594
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3595
+ # define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3)
3596
+ #endif
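/* Editorial example (not part of the vendored header or this diff):
 * SIMDE_MM_TRANSPOSE4_PS rewrites its four row arguments in place.  A sketch
 * assuming the header is included; transpose4x4 is a hypothetical helper that
 * transposes a row-major 4x4 float matrix. */
static void
transpose4x4(simde_float32 m[16]) {
  simde__m128 r0 = simde_mm_loadu_ps(&m[ 0]);
  simde__m128 r1 = simde_mm_loadu_ps(&m[ 4]);
  simde__m128 r2 = simde_mm_loadu_ps(&m[ 8]);
  simde__m128 r3 = simde_mm_loadu_ps(&m[12]);
  SIMDE_MM_TRANSPOSE4_PS(r0, r1, r2, r3);
  simde_mm_storeu_ps(&m[ 0], r0);
  simde_mm_storeu_ps(&m[ 4], r1);
  simde_mm_storeu_ps(&m[ 8], r2);
  simde_mm_storeu_ps(&m[12], r3);
}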
3597
+
3598
+ #if defined(_MM_EXCEPT_INVALID)
3599
+ # define SIMDE_MM_EXCEPT_INVALID _MM_EXCEPT_INVALID
3600
+ #else
3601
+ # define SIMDE_MM_EXCEPT_INVALID (0x0001)
3602
+ #endif
3603
+ #if defined(_MM_EXCEPT_DENORM)
3604
+ # define SIMDE_MM_EXCEPT_DENORM _MM_EXCEPT_DENORM
3605
+ #else
3606
+ # define SIMDE_MM_EXCEPT_DENORM (0x0002)
3607
+ #endif
3608
+ #if defined(_MM_EXCEPT_DIV_ZERO)
3609
+ # define SIMDE_MM_EXCEPT_DIV_ZERO _MM_EXCEPT_DIV_ZERO
3610
+ #else
3611
+ # define SIMDE_MM_EXCEPT_DIV_ZERO (0x0004)
3612
+ #endif
3613
+ #if defined(_MM_EXCEPT_OVERFLOW)
3614
+ # define SIMDE_MM_EXCEPT_OVERFLOW _MM_EXCEPT_OVERFLOW
3615
+ #else
3616
+ # define SIMDE_MM_EXCEPT_OVERFLOW (0x0008)
3617
+ #endif
3618
+ #if defined(_MM_EXCEPT_UNDERFLOW)
3619
+ # define SIMDE_MM_EXCEPT_UNDERFLOW _MM_EXCEPT_UNDERFLOW
3620
+ #else
3621
+ # define SIMDE_MM_EXCEPT_UNDERFLOW (0x0010)
3622
+ #endif
3623
+ #if defined(_MM_EXCEPT_INEXACT)
3624
+ # define SIMDE_MM_EXCEPT_INEXACT _MM_EXCEPT_INEXACT
3625
+ #else
3626
+ # define SIMDE_MM_EXCEPT_INEXACT (0x0020)
3627
+ #endif
3628
+ #if defined(_MM_EXCEPT_MASK)
3629
+ # define SIMDE_MM_EXCEPT_MASK _MM_EXCEPT_MASK
3630
+ #else
3631
+ # define SIMDE_MM_EXCEPT_MASK \
3632
+ (SIMDE_MM_EXCEPT_INVALID | SIMDE_MM_EXCEPT_DENORM | \
3633
+ SIMDE_MM_EXCEPT_DIV_ZERO | SIMDE_MM_EXCEPT_OVERFLOW | \
3634
+ SIMDE_MM_EXCEPT_UNDERFLOW | SIMDE_MM_EXCEPT_INEXACT)
3635
+ #endif
3636
+
3637
+ #if defined(_MM_MASK_INVALID)
3638
+ # define SIMDE_MM_MASK_INVALID _MM_MASK_INVALID
3639
+ #else
3640
+ # define SIMDE_MM_MASK_INVALID (0x0080)
3641
+ #endif
3642
+ #if defined(_MM_MASK_DENORM)
3643
+ # define SIMDE_MM_MASK_DENORM _MM_MASK_DENORM
3644
+ #else
3645
+ # define SIMDE_MM_MASK_DENORM (0x0100)
3646
+ #endif
3647
+ #if defined(_MM_MASK_DIV_ZERO)
3648
+ # define SIMDE_MM_MASK_DIV_ZERO _MM_MASK_DIV_ZERO
3649
+ #else
3650
+ # define SIMDE_MM_MASK_DIV_ZERO (0x0200)
3651
+ #endif
3652
+ #if defined(_MM_MASK_OVERFLOW)
3653
+ # define SIMDE_MM_MASK_OVERFLOW _MM_MASK_OVERFLOW
3654
+ #else
3655
+ # define SIMDE_MM_MASK_OVERFLOW (0x0400)
3656
+ #endif
3657
+ #if defined(_MM_MASK_UNDERFLOW)
3658
+ # define SIMDE_MM_MASK_UNDERFLOW _MM_MASK_UNDERFLOW
3659
+ #else
3660
+ # define SIMDE_MM_MASK_UNDERFLOW (0x0800)
3661
+ #endif
3662
+ #if defined(_MM_MASK_INEXACT)
3663
+ # define SIMDE_MM_MASK_INEXACT _MM_MASK_INEXACT
3664
+ #else
3665
+ # define SIMDE_MM_MASK_INEXACT (0x1000)
3666
+ #endif
3667
+ #if defined(_MM_MASK_MASK)
3668
+ # define SIMDE_MM_MASK_MASK _MM_MASK_MASK
3669
+ #else
3670
+ # define SIMDE_MM_MASK_MASK \
3671
+ (SIMDE_MM_MASK_INVALID | SIMDE_MM_MASK_DENORM | \
3672
+ SIMDE_MM_MASK_DIV_ZERO | SIMDE_MM_MASK_OVERFLOW | \
3673
+ SIMDE_MM_MASK_UNDERFLOW | SIMDE_MM_MASK_INEXACT)
3674
+ #endif
3675
+
3676
+ #if defined(_MM_FLUSH_ZERO_MASK)
3677
+ # define SIMDE_MM_FLUSH_ZERO_MASK _MM_FLUSH_ZERO_MASK
3678
+ #else
3679
+ # define SIMDE_MM_FLUSH_ZERO_MASK (0x8000)
3680
+ #endif
3681
+ #if defined(_MM_FLUSH_ZERO_ON)
3682
+ # define SIMDE_MM_FLUSH_ZERO_ON _MM_FLUSH_ZERO_ON
3683
+ #else
3684
+ # define SIMDE_MM_FLUSH_ZERO_ON (0x8000)
3685
+ #endif
3686
+ #if defined(_MM_FLUSH_ZERO_OFF)
3687
+ # define SIMDE_MM_FLUSH_ZERO_OFF _MM_FLUSH_ZERO_OFF
3688
+ #else
3689
+ # define SIMDE_MM_FLUSH_ZERO_OFF (0x0000)
3690
+ #endif
3691
+
3692
+ SIMDE__END_DECLS
3693
+
3694
+ HEDLEY_DIAGNOSTIC_POP
3695
+
3696
+ #endif /* !defined(SIMDE__SSE_H) */