minimap2 0.2.25.0 → 0.2.25.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. checksums.yaml +4 -4
  2. data/README.md +2 -3
  3. data/ext/minimap2/Makefile +6 -2
  4. data/ext/minimap2/NEWS.md +38 -0
  5. data/ext/minimap2/README.md +9 -3
  6. data/ext/minimap2/align.c +5 -3
  7. data/ext/minimap2/cookbook.md +2 -2
  8. data/ext/minimap2/format.c +7 -4
  9. data/ext/minimap2/kalloc.c +20 -1
  10. data/ext/minimap2/kalloc.h +13 -2
  11. data/ext/minimap2/ksw2.h +1 -0
  12. data/ext/minimap2/ksw2_extd2_sse.c +1 -1
  13. data/ext/minimap2/ksw2_exts2_sse.c +79 -40
  14. data/ext/minimap2/ksw2_extz2_sse.c +1 -1
  15. data/ext/minimap2/lchain.c +15 -16
  16. data/ext/minimap2/lib/simde/CONTRIBUTING.md +114 -0
  17. data/ext/minimap2/lib/simde/COPYING +20 -0
  18. data/ext/minimap2/lib/simde/README.md +333 -0
  19. data/ext/minimap2/lib/simde/amalgamate.py +58 -0
  20. data/ext/minimap2/lib/simde/meson.build +33 -0
  21. data/ext/minimap2/lib/simde/netlify.toml +20 -0
  22. data/ext/minimap2/lib/simde/simde/arm/neon/float32x2.h +140 -0
  23. data/ext/minimap2/lib/simde/simde/arm/neon/float32x4.h +137 -0
  24. data/ext/minimap2/lib/simde/simde/arm/neon/float64x1.h +142 -0
  25. data/ext/minimap2/lib/simde/simde/arm/neon/float64x2.h +145 -0
  26. data/ext/minimap2/lib/simde/simde/arm/neon/int16x4.h +140 -0
  27. data/ext/minimap2/lib/simde/simde/arm/neon/int16x8.h +145 -0
  28. data/ext/minimap2/lib/simde/simde/arm/neon/int32x2.h +140 -0
  29. data/ext/minimap2/lib/simde/simde/arm/neon/int32x4.h +143 -0
  30. data/ext/minimap2/lib/simde/simde/arm/neon/int64x1.h +137 -0
  31. data/ext/minimap2/lib/simde/simde/arm/neon/int64x2.h +141 -0
  32. data/ext/minimap2/lib/simde/simde/arm/neon/int8x16.h +147 -0
  33. data/ext/minimap2/lib/simde/simde/arm/neon/int8x8.h +141 -0
  34. data/ext/minimap2/lib/simde/simde/arm/neon/uint16x4.h +134 -0
  35. data/ext/minimap2/lib/simde/simde/arm/neon/uint16x8.h +138 -0
  36. data/ext/minimap2/lib/simde/simde/arm/neon/uint32x2.h +134 -0
  37. data/ext/minimap2/lib/simde/simde/arm/neon/uint32x4.h +137 -0
  38. data/ext/minimap2/lib/simde/simde/arm/neon/uint64x1.h +131 -0
  39. data/ext/minimap2/lib/simde/simde/arm/neon/uint64x2.h +135 -0
  40. data/ext/minimap2/lib/simde/simde/arm/neon/uint8x16.h +141 -0
  41. data/ext/minimap2/lib/simde/simde/arm/neon/uint8x8.h +135 -0
  42. data/ext/minimap2/lib/simde/simde/arm/neon.h +97 -0
  43. data/ext/minimap2/lib/simde/simde/check.h +267 -0
  44. data/ext/minimap2/lib/simde/simde/debug-trap.h +83 -0
  45. data/ext/minimap2/lib/simde/simde/hedley.h +1899 -0
  46. data/ext/minimap2/lib/simde/simde/simde-arch.h +445 -0
  47. data/ext/minimap2/lib/simde/simde/simde-common.h +697 -0
  48. data/ext/minimap2/lib/simde/simde/x86/avx.h +5385 -0
  49. data/ext/minimap2/lib/simde/simde/x86/avx2.h +2402 -0
  50. data/ext/minimap2/lib/simde/simde/x86/avx512bw.h +391 -0
  51. data/ext/minimap2/lib/simde/simde/x86/avx512f.h +3389 -0
  52. data/ext/minimap2/lib/simde/simde/x86/avx512vl.h +112 -0
  53. data/ext/minimap2/lib/simde/simde/x86/fma.h +659 -0
  54. data/ext/minimap2/lib/simde/simde/x86/mmx.h +2210 -0
  55. data/ext/minimap2/lib/simde/simde/x86/sse.h +3696 -0
  56. data/ext/minimap2/lib/simde/simde/x86/sse2.h +5991 -0
  57. data/ext/minimap2/lib/simde/simde/x86/sse3.h +343 -0
  58. data/ext/minimap2/lib/simde/simde/x86/sse4.1.h +1783 -0
  59. data/ext/minimap2/lib/simde/simde/x86/sse4.2.h +105 -0
  60. data/ext/minimap2/lib/simde/simde/x86/ssse3.h +1053 -0
  61. data/ext/minimap2/lib/simde/simde/x86/svml.h +543 -0
  62. data/ext/minimap2/lib/simde/test/CMakeLists.txt +166 -0
  63. data/ext/minimap2/lib/simde/test/arm/meson.build +4 -0
  64. data/ext/minimap2/lib/simde/test/arm/neon/meson.build +23 -0
  65. data/ext/minimap2/lib/simde/test/arm/neon/skel.c +871 -0
  66. data/ext/minimap2/lib/simde/test/arm/neon/test-neon-internal.h +134 -0
  67. data/ext/minimap2/lib/simde/test/arm/neon/test-neon.c +39 -0
  68. data/ext/minimap2/lib/simde/test/arm/neon/test-neon.h +10 -0
  69. data/ext/minimap2/lib/simde/test/arm/neon/vadd.c +1260 -0
  70. data/ext/minimap2/lib/simde/test/arm/neon/vdup_n.c +873 -0
  71. data/ext/minimap2/lib/simde/test/arm/neon/vmul.c +1084 -0
  72. data/ext/minimap2/lib/simde/test/arm/neon/vsub.c +1260 -0
  73. data/ext/minimap2/lib/simde/test/arm/test-arm-internal.h +18 -0
  74. data/ext/minimap2/lib/simde/test/arm/test-arm.c +20 -0
  75. data/ext/minimap2/lib/simde/test/arm/test-arm.h +8 -0
  76. data/ext/minimap2/lib/simde/test/cmake/AddCompilerFlags.cmake +171 -0
  77. data/ext/minimap2/lib/simde/test/cmake/ExtraWarningFlags.cmake +68 -0
  78. data/ext/minimap2/lib/simde/test/meson.build +64 -0
  79. data/ext/minimap2/lib/simde/test/munit/COPYING +21 -0
  80. data/ext/minimap2/lib/simde/test/munit/Makefile +55 -0
  81. data/ext/minimap2/lib/simde/test/munit/README.md +54 -0
  82. data/ext/minimap2/lib/simde/test/munit/example.c +351 -0
  83. data/ext/minimap2/lib/simde/test/munit/meson.build +37 -0
  84. data/ext/minimap2/lib/simde/test/munit/munit.c +2055 -0
  85. data/ext/minimap2/lib/simde/test/munit/munit.h +535 -0
  86. data/ext/minimap2/lib/simde/test/run-tests.c +20 -0
  87. data/ext/minimap2/lib/simde/test/run-tests.h +260 -0
  88. data/ext/minimap2/lib/simde/test/x86/avx.c +13752 -0
  89. data/ext/minimap2/lib/simde/test/x86/avx2.c +9977 -0
  90. data/ext/minimap2/lib/simde/test/x86/avx512bw.c +2664 -0
  91. data/ext/minimap2/lib/simde/test/x86/avx512f.c +10416 -0
  92. data/ext/minimap2/lib/simde/test/x86/avx512vl.c +210 -0
  93. data/ext/minimap2/lib/simde/test/x86/fma.c +2557 -0
  94. data/ext/minimap2/lib/simde/test/x86/meson.build +33 -0
  95. data/ext/minimap2/lib/simde/test/x86/mmx.c +2878 -0
  96. data/ext/minimap2/lib/simde/test/x86/skel.c +2984 -0
  97. data/ext/minimap2/lib/simde/test/x86/sse.c +5121 -0
  98. data/ext/minimap2/lib/simde/test/x86/sse2.c +9860 -0
  99. data/ext/minimap2/lib/simde/test/x86/sse3.c +486 -0
  100. data/ext/minimap2/lib/simde/test/x86/sse4.1.c +3446 -0
  101. data/ext/minimap2/lib/simde/test/x86/sse4.2.c +101 -0
  102. data/ext/minimap2/lib/simde/test/x86/ssse3.c +2084 -0
  103. data/ext/minimap2/lib/simde/test/x86/svml.c +1545 -0
  104. data/ext/minimap2/lib/simde/test/x86/test-avx.h +16 -0
  105. data/ext/minimap2/lib/simde/test/x86/test-avx512.h +25 -0
  106. data/ext/minimap2/lib/simde/test/x86/test-mmx.h +13 -0
  107. data/ext/minimap2/lib/simde/test/x86/test-sse.h +13 -0
  108. data/ext/minimap2/lib/simde/test/x86/test-sse2.h +13 -0
  109. data/ext/minimap2/lib/simde/test/x86/test-x86-internal.h +196 -0
  110. data/ext/minimap2/lib/simde/test/x86/test-x86.c +48 -0
  111. data/ext/minimap2/lib/simde/test/x86/test-x86.h +8 -0
  112. data/ext/minimap2/main.c +13 -6
  113. data/ext/minimap2/map.c +0 -5
  114. data/ext/minimap2/minimap.h +40 -31
  115. data/ext/minimap2/minimap2.1 +19 -5
  116. data/ext/minimap2/misc/paftools.js +545 -24
  117. data/ext/minimap2/options.c +1 -1
  118. data/ext/minimap2/pyproject.toml +2 -0
  119. data/ext/minimap2/python/mappy.pyx +3 -1
  120. data/ext/minimap2/seed.c +1 -1
  121. data/ext/minimap2/setup.py +32 -22
  122. data/lib/minimap2/version.rb +1 -1
  123. metadata +100 -3
@@ -0,0 +1,3696 @@
1
+ /* Permission is hereby granted, free of charge, to any person
2
+ * obtaining a copy of this software and associated documentation
3
+ * files (the "Software"), to deal in the Software without
4
+ * restriction, including without limitation the rights to use, copy,
5
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
6
+ * of the Software, and to permit persons to whom the Software is
7
+ * furnished to do so, subject to the following conditions:
8
+ *
9
+ * The above copyright notice and this permission notice shall be
10
+ * included in all copies or substantial portions of the Software.
11
+ *
12
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
13
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
14
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
15
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
16
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
17
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ * SOFTWARE.
20
+ *
21
+ * Copyright:
22
+ * 2017-2020 Evan Nemerson <evan@nemerson.com>
23
+ * 2015-2017 John W. Ratcliff <jratcliffscarab@gmail.com>
24
+ * 2015 Brandon Rowlett <browlett@nvidia.com>
25
+ * 2015 Ken Fast <kfast@gdeb.com>
26
+ */
27
+
28
+ #if !defined(SIMDE__SSE_H)
29
+ # if !defined(SIMDE__SSE_H)
30
+ # define SIMDE__SSE_H
31
+ # endif
32
+ # include "mmx.h"
33
+
34
+ HEDLEY_DIAGNOSTIC_PUSH
35
+ SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
36
+
37
+ # if defined(SIMDE_SSE_NATIVE) /* Backend selection: clear any earlier SIMDE_SSE_NATIVE, then define at most one SIMDE_SSE_* macro. */
38
+ # undef SIMDE_SSE_NATIVE
39
+ # endif
40
+ # if defined(SIMDE_ARCH_X86_SSE) && !defined(SIMDE_SSE_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) /* x86 SSE detected and neither opt-out macro is set */
41
+ # define SIMDE_SSE_NATIVE
42
+ # elif defined(SIMDE_ARCH_ARM_NEON) && !defined(SIMDE_SSE_NO_NEON) && !defined(SIMDE_NO_NEON) /* NEON detected and neither opt-out macro is set */
43
+ # define SIMDE_SSE_NEON
44
+ # elif defined(SIMDE_ARCH_WASM_SIMD128) /* no opt-out macro is checked for this backend */
45
+ # define SIMDE_SSE_WASM_SIMD128
46
+ # elif defined(SIMDE_ARCH_POWER_ALTIVEC) /* no opt-out macro is checked for this backend */
47
+ # define SIMDE_SSE_POWER_ALTIVEC
48
+ # endif /* when no branch matches, none of the SIMDE_SSE_* backend macros is defined here */
49
+
50
+ # if defined(SIMDE_SSE_NATIVE) /* Include the intrinsics header that matches the backend macro chosen above. */
51
+ # include <xmmintrin.h>
52
+ # else /* non-native build: pull in NEON / WASM / AltiVec headers as applicable */
53
+ # if defined(SIMDE_SSE_NEON)
54
+ # include <arm_neon.h>
55
+ # endif
56
+ # if defined(SIMDE_SSE_WASM_SIMD128)
57
+ # if !defined(__wasm_unimplemented_simd128__) /* NOTE(review): presumably needed by wasm_simd128.h to expose some intrinsics - confirm */
58
+ # define __wasm_unimplemented_simd128__
59
+ # endif
60
+ # include <wasm_simd128.h>
61
+ # endif
62
+ # if defined(SIMDE_SSE_POWER_ALTIVEC)
63
+ # include <altivec.h>
64
+ # endif
65
+
66
+ # if !defined(HEDLEY_INTEL_VERSION) && !defined(HEDLEY_EMSCRIPTEN_VERSION) && defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) /* C11 or later with atomics, excluding Intel and Emscripten compilers */
67
+ # include <stdatomic.h>
68
+ # elif defined(_WIN32) /* NOTE(review): the code that uses stdatomic.h / windows.h is not visible in this chunk - confirm it is still needed */
69
+ # include <windows.h>
70
+ # endif
71
+ # endif
72
+
73
+ SIMDE__BEGIN_DECLS
74
+
75
+ typedef union { /* simde__m128_private: union giving lane-wise views (by element type) and backend-native views of one 16-byte vector. */
76
+ #if defined(SIMDE_VECTOR_SUBSCRIPT) /* compiler vector extensions available: each view is a 16-byte vector type */
77
+ SIMDE_ALIGN(16) int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
78
+ SIMDE_ALIGN(16) int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
79
+ SIMDE_ALIGN(16) int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
80
+ SIMDE_ALIGN(16) int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
81
+ SIMDE_ALIGN(16) uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
82
+ SIMDE_ALIGN(16) uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
83
+ SIMDE_ALIGN(16) uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
84
+ SIMDE_ALIGN(16) uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
85
+ #if defined(SIMDE__HAVE_INT128) /* 128-bit integer views exist only when the platform provides them */
86
+ SIMDE_ALIGN(16) simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
87
+ SIMDE_ALIGN(16) simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
88
+ #endif
89
+ SIMDE_ALIGN(16) simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
90
+ SIMDE_ALIGN(16) int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
91
+ SIMDE_ALIGN(16) uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
92
+ #else /* no vector extensions: the same views as plain arrays totalling 16 bytes each */
93
+ SIMDE_ALIGN(16) int8_t i8[16];
94
+ SIMDE_ALIGN(16) int16_t i16[8];
95
+ SIMDE_ALIGN(16) int32_t i32[4];
96
+ SIMDE_ALIGN(16) int64_t i64[2];
97
+ SIMDE_ALIGN(16) uint8_t u8[16];
98
+ SIMDE_ALIGN(16) uint16_t u16[8];
99
+ SIMDE_ALIGN(16) uint32_t u32[4];
100
+ SIMDE_ALIGN(16) uint64_t u64[2];
101
+ #if defined(SIMDE__HAVE_INT128)
102
+ SIMDE_ALIGN(16) simde_int128 i128[1];
103
+ SIMDE_ALIGN(16) simde_uint128 u128[1];
104
+ #endif
105
+ SIMDE_ALIGN(16) simde_float32 f32[4];
106
+ SIMDE_ALIGN(16) int_fast32_t i32f[16 / sizeof(int_fast32_t)]; /* lane count depends on the platform's int_fast32_t width */
107
+ SIMDE_ALIGN(16) uint_fast32_t u32f[16 / sizeof(uint_fast32_t)]; /* lane count depends on the platform's uint_fast32_t width */
108
+ #endif
109
+
110
+ SIMDE_ALIGN(16) simde__m64_private m64_private[2]; /* the vector viewed as two 64-bit halves (private form) */
111
+ SIMDE_ALIGN(16) simde__m64 m64[2]; /* the vector viewed as two 64-bit halves (public form) */
112
+
113
+ #if defined(SIMDE_SSE_NATIVE) /* backend-native members: only the selected backend's types are declared */
114
+ SIMDE_ALIGN(16) __m128 n;
115
+ #elif defined(SIMDE_SSE_NEON)
116
+ SIMDE_ALIGN(16) int8x16_t neon_i8;
117
+ SIMDE_ALIGN(16) int16x8_t neon_i16;
118
+ SIMDE_ALIGN(16) int32x4_t neon_i32;
119
+ SIMDE_ALIGN(16) int64x2_t neon_i64;
120
+ SIMDE_ALIGN(16) uint8x16_t neon_u8;
121
+ SIMDE_ALIGN(16) uint16x8_t neon_u16;
122
+ SIMDE_ALIGN(16) uint32x4_t neon_u32;
123
+ SIMDE_ALIGN(16) uint64x2_t neon_u64;
124
+ SIMDE_ALIGN(16) float32x4_t neon_f32;
125
+ #if defined(SIMDE_ARCH_AARCH64) /* the float64x2_t view is declared only on AArch64 */
126
+ SIMDE_ALIGN(16) float64x2_t neon_f64;
127
+ #endif
128
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
129
+ SIMDE_ALIGN(16) v128_t wasm_v128;
130
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
131
+ SIMDE_ALIGN(16) vector unsigned char altivec_u8;
132
+ SIMDE_ALIGN(16) vector unsigned short altivec_u16;
133
+ SIMDE_ALIGN(16) vector unsigned int altivec_u32;
134
+ SIMDE_ALIGN(16) vector unsigned long long altivec_u64;
135
+ SIMDE_ALIGN(16) vector signed char altivec_i8;
136
+ SIMDE_ALIGN(16) vector signed short altivec_i16;
137
+ SIMDE_ALIGN(16) vector signed int altivec_i32;
138
+ SIMDE_ALIGN(16) vector signed long long altivec_i64;
139
+ SIMDE_ALIGN(16) vector float altivec_f32;
140
+ SIMDE_ALIGN(16) vector double altivec_f64;
141
+ #endif
142
+ } simde__m128_private;
143
+
144
+ #if defined(SIMDE_SSE_NATIVE) /* simde__m128: the public vector type; the backend's own float vector when one is selected. */
145
+ typedef __m128 simde__m128;
146
+ #elif defined(SIMDE_SSE_NEON)
147
+ typedef float32x4_t simde__m128;
148
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
149
+ typedef v128_t simde__m128;
150
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
151
+ typedef vector float simde__m128;
152
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT) /* no backend, but compiler vector extensions exist */
153
+ typedef simde_float32 simde__m128 SIMDE_ALIGN(16) SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
154
+ #else /* last resort: the public type is the private union itself */
155
+ typedef simde__m128_private simde__m128;
156
+ #endif
157
+
158
+ #if !defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) /* On request, and only when the real SSE header is not in use, expose the native name __m128. */
159
+ #define SIMDE_SSE_ENABLE_NATIVE_ALIASES
160
+ typedef simde__m128 __m128;
161
+ #endif
162
+
163
+ HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128), "simde__m128 size incorrect"); /* both vector types must be exactly 16 bytes */
164
+ HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128_private), "simde__m128_private size incorrect");
165
+ #if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF) /* alignment is checked only when explicitly enabled and measurable */
166
+ HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128) == 16, "simde__m128 is not 16-byte aligned");
167
+ HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128_private) == 16, "simde__m128_private is not 16-byte aligned");
168
+ #endif
169
+
170
+ SIMDE__FUNCTION_ATTRIBUTES /* simde__m128_from_private: convert the private union form to the public vector type. */
171
+ simde__m128
172
+ simde__m128_from_private(simde__m128_private v) {
173
+ simde__m128 r;
174
+ simde_memcpy(&r, &v, sizeof(r)); /* byte-wise copy of sizeof(simde__m128) bytes instead of a cast between the two types */
175
+ return r;
176
+ }
177
+
178
+ SIMDE__FUNCTION_ATTRIBUTES /* simde__m128_to_private: convert the public vector type to the private union form. */
179
+ simde__m128_private
180
+ simde__m128_to_private(simde__m128 v) {
181
+ simde__m128_private r;
182
+ simde_memcpy(&r, &v, sizeof(r)); /* byte-wise copy of sizeof(simde__m128_private) bytes instead of a cast between the two types */
183
+ return r;
184
+ }
185
+
186
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) /* NOTE(review): no HEDLEY_DIAGNOSTIC_PUSH guarded by this macro is visible earlier in this chunk; the only push in view is the unconditional one near the top of the header - confirm the intended pairing. */
187
+ HEDLEY_DIAGNOSTIC_POP
188
+ #endif
189
+
190
+ SIMDE__FUNCTION_ATTRIBUTES /* simde_mm_set_ps: build a vector from four floats; e0 ends up in lane 0 and e3 in lane 3. */
191
+ simde__m128
192
+ simde_mm_set_ps (simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) {
193
+ #if defined(SIMDE_SSE_NATIVE)
194
+ return _mm_set_ps(e3, e2, e1, e0);
195
+ #else
196
+ simde__m128_private r_;
197
+
198
+ #if defined(SIMDE_SSE_NEON)
199
+ SIMDE_ALIGN(16) simde_float32 data[4] = { e0, e1, e2, e3 }; /* parameters are declared high-to-low; stage them low-to-high before the load */
200
+ r_.neon_f32 = vld1q_f32(data);
201
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
202
+ r_.wasm_v128 = wasm_f32x4_make(e0, e1, e2, e3);
203
+ #else /* also taken for the AltiVec backend, which has no dedicated branch in this function */
204
+ r_.f32[0] = e0;
205
+ r_.f32[1] = e1;
206
+ r_.f32[2] = e2;
207
+ r_.f32[3] = e3;
208
+ #endif
209
+
210
+ return simde__m128_from_private(r_);
211
+ #endif
212
+ }
213
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) /* expose the native intrinsic name as a macro over the simde_ function */
214
+ # define _mm_set_ps(e3, e2, e1, e0) simde_mm_set_ps(e3, e2, e1, e0)
215
+ #endif
216
+
217
+ SIMDE__FUNCTION_ATTRIBUTES /* simde_mm_set_ps1: return a vector with all four float lanes set to a. */
217
+ simde__m128
218
+ simde_mm_set_ps1 (simde_float32 a) {
219
+ #if defined(SIMDE_SSE_NATIVE)
220
+ return _mm_set_ps1(a);
221
+ #elif defined(SIMDE_SSE_NEON)
222
+ return vdupq_n_f32(a);
223
+ #else /* every other configuration goes through simde_mm_set_ps with the same value four times */
224
+ return simde_mm_set_ps(a, a, a, a);
225
+ #endif
226
+ }
227
+ #define simde_mm_set1_ps(a) simde_mm_set_ps1(a)
228
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) /* expose both native spellings as macros over the simde_ function */
229
+ # define _mm_set_ps1(a) simde_mm_set_ps1(a)
230
+ # define _mm_set1_ps(a) simde_mm_set1_ps(a)
231
+ #endif
233
+
234
+ SIMDE__FUNCTION_ATTRIBUTES /* simde_mm_move_ss: return a vector whose lane 0 comes from b and whose lanes 1..3 come from a. */
235
+ simde__m128
236
+ simde_mm_move_ss (simde__m128 a, simde__m128 b) {
237
+ #if defined(SIMDE_SSE_NATIVE)
238
+ return _mm_move_ss(a, b);
239
+ #else
240
+ simde__m128_private
241
+ r_,
242
+ a_ = simde__m128_to_private(a),
243
+ b_ = simde__m128_to_private(b);
244
+
245
+ #if defined(SIMDE_SSE_NEON)
246
+ r_.neon_f32 = vsetq_lane_f32(vgetq_lane_f32(b_.neon_f32, 0), a_.neon_f32, 0); /* read b's lane 0 and write it into lane 0 of a */
247
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
248
+ vector unsigned char m = { /* NOTE(review): byte indices 16..19 presumably address b's first four bytes, matching the scalar branch - confirm against vec_perm */
249
+ 16, 17, 18, 19,
250
+ 4, 5, 6, 7,
251
+ 8, 9, 10, 11,
252
+ 12, 13, 14, 15
253
+ };
254
+ r_.altivec_f32 = vec_perm(a_.altivec_f32, b_.altivec_f32, m);
255
+ #elif defined(SIMDE__SHUFFLE_VECTOR)
256
+ r_.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a_.f32, b_.f32, 4, 1, 2, 3); /* NOTE(review): index 4 presumably selects b's lane 0 - confirm against the macro definition */
257
+ #else /* portable fallback: explicit per-lane copy */
258
+ r_.f32[0] = b_.f32[0];
259
+ r_.f32[1] = a_.f32[1];
260
+ r_.f32[2] = a_.f32[2];
261
+ r_.f32[3] = a_.f32[3];
262
+ #endif
263
+
264
+ return simde__m128_from_private(r_);
265
+ #endif
266
+ }
267
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) /* expose the native intrinsic name as a macro over the simde_ function */
268
+ # define _mm_move_ss(a, b) simde_mm_move_ss((a), (b))
269
+ #endif
270
+
271
+ SIMDE__FUNCTION_ATTRIBUTES /* simde_mm_add_ps: lane-wise float addition, r[i] = a[i] + b[i] for all four lanes. */
271
+ simde__m128
272
+ simde_mm_add_ps (simde__m128 a, simde__m128 b) {
273
+ #if defined(SIMDE_SSE_NATIVE)
274
+ return _mm_add_ps(a, b);
275
+ #else
276
+ simde__m128_private
277
+ r_,
278
+ a_ = simde__m128_to_private(a),
279
+ b_ = simde__m128_to_private(b);
280
+
281
+ #if defined(SIMDE_SSE_NEON)
282
+ r_.neon_f32 = vaddq_f32(a_.neon_f32, b_.neon_f32);
283
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
284
+ r_.wasm_v128 = wasm_f32x4_add(a_.wasm_v128, b_.wasm_v128);
285
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
286
+ r_.altivec_f32 = vec_add(a_.altivec_f32, b_.altivec_f32);
287
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) /* compiler vector arithmetic on the whole f32 view */
288
+ r_.f32 = a_.f32 + b_.f32;
289
+ #else /* portable fallback: one scalar add per lane */
290
+ SIMDE__VECTORIZE
291
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
292
+ r_.f32[i] = a_.f32[i] + b_.f32[i];
293
+ }
294
+ #endif
295
+
296
+ return simde__m128_from_private(r_);
297
+ #endif
298
+ }
299
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) /* expose the native intrinsic name as a macro over the simde_ function */
300
+ # define _mm_add_ps(a, b) simde_mm_add_ps((a), (b))
301
+ #endif
303
+
304
+ SIMDE__FUNCTION_ATTRIBUTES /* simde_mm_add_ss: lane 0 is a[0] + b[0]; lanes 1..3 are copied from a. */
305
+ simde__m128
306
+ simde_mm_add_ss (simde__m128 a, simde__m128 b) {
307
+ #if defined(SIMDE_SSE_NATIVE)
308
+ return _mm_add_ss(a, b);
309
+ #elif defined(SIMDE_ASSUME_VECTORIZATION) /* do the full-width add, then keep only its lane 0 on top of a */
310
+ return simde_mm_move_ss(a, simde_mm_add_ps(a, b));
311
+ #else
312
+ simde__m128_private
313
+ r_,
314
+ a_ = simde__m128_to_private(a),
315
+ b_ = simde__m128_to_private(b);
316
+
317
+ r_.f32[0] = a_.f32[0] + b_.f32[0];
318
+ r_.f32[1] = a_.f32[1]; /* upper lanes pass through from a unchanged */
319
+ r_.f32[2] = a_.f32[2];
320
+ r_.f32[3] = a_.f32[3];
321
+
322
+ return simde__m128_from_private(r_);
323
+ #endif
324
+ }
325
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) /* expose the native intrinsic name as a macro over the simde_ function */
326
+ # define _mm_add_ss(a, b) simde_mm_add_ss((a), (b))
327
+ #endif
328
+
329
+ SIMDE__FUNCTION_ATTRIBUTES /* simde_mm_and_ps: bitwise AND of the two vectors, computed on their integer views. */
330
+ simde__m128
331
+ simde_mm_and_ps (simde__m128 a, simde__m128 b) {
332
+ #if defined(SIMDE_SSE_NATIVE)
333
+ return _mm_and_ps(a, b);
334
+ #else
335
+ simde__m128_private
336
+ r_,
337
+ a_ = simde__m128_to_private(a),
338
+ b_ = simde__m128_to_private(b);
339
+
340
+ #if defined(SIMDE_SSE_NEON)
341
+ r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32);
342
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
343
+ r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128);
344
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) /* NOTE(review): checked before the AltiVec branch here, whereas simde_mm_andnot_ps checks AltiVec first - confirm the order is intentional */
345
+ r_.i32 = a_.i32 & b_.i32;
346
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
347
+ r_.altivec_f32 = vec_and(a_.altivec_f32, b_.altivec_f32);
348
+ #else /* portable fallback: AND each 32-bit lane */
349
+ SIMDE__VECTORIZE
350
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
351
+ r_.i32[i] = a_.i32[i] & b_.i32[i];
352
+ }
353
+ #endif
354
+
355
+ return simde__m128_from_private(r_);
356
+ #endif
357
+ }
358
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) /* expose the native intrinsic name as a macro over the simde_ function */
359
+ # define _mm_and_ps(a, b) simde_mm_and_ps((a), (b))
360
+ #endif
361
+
362
+ SIMDE__FUNCTION_ATTRIBUTES /* simde_mm_andnot_ps: bitwise (~a) & b, computed on the integer views; note that a is the operand that gets inverted. */
363
+ simde__m128
364
+ simde_mm_andnot_ps (simde__m128 a, simde__m128 b) {
365
+ #if defined(SIMDE_SSE_NATIVE)
366
+ return _mm_andnot_ps(a, b);
367
+ #else
368
+ simde__m128_private
369
+ r_,
370
+ a_ = simde__m128_to_private(a),
371
+ b_ = simde__m128_to_private(b);
372
+
373
+ #if defined(SIMDE_SSE_NEON)
374
+ r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32); /* operands are passed as (b, a), the reverse of this function's parameter order */
375
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
376
+ r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128); /* operands are passed as (b, a) */
377
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
378
+ r_.altivec_f32 = vec_andc(b_.altivec_f32, a_.altivec_f32); /* operands are passed as (b, a) */
379
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
380
+ r_.i32 = ~a_.i32 & b_.i32;
381
+ #else /* portable fallback: per 32-bit lane */
382
+ SIMDE__VECTORIZE
383
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
384
+ r_.i32[i] = ~(a_.i32[i]) & b_.i32[i];
385
+ }
386
+ #endif
387
+
388
+ return simde__m128_from_private(r_);
389
+ #endif
390
+ }
391
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) /* expose the native intrinsic name as a macro over the simde_ function */
392
+ # define _mm_andnot_ps(a, b) simde_mm_andnot_ps((a), (b))
393
+ #endif
394
+
395
+ SIMDE__FUNCTION_ATTRIBUTES /* simde_mm_avg_pu16: per-lane rounded average of unsigned 16-bit lanes, (a + b + 1) >> 1, on a 64-bit vector. */
396
+ simde__m64
397
+ simde_mm_avg_pu16 (simde__m64 a, simde__m64 b) {
398
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX) /* the native intrinsic is used only when MMX is also available */
399
+ return _mm_avg_pu16(a, b);
400
+ #else
401
+ simde__m64_private
402
+ r_,
403
+ a_ = simde__m64_to_private(a),
404
+ b_ = simde__m64_to_private(b);
405
+
406
+ #if defined(SIMDE_SSE_NEON)
407
+ r_.neon_u16 = vrhadd_u16(b_.neon_u16, a_.neon_u16);
408
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE__CONVERT_VECTOR)
409
+ uint32_t wa SIMDE_VECTOR(16); /* 32-bit lanes hold a + b + 1 without 16-bit wraparound */
410
+ uint32_t wb SIMDE_VECTOR(16);
411
+ uint32_t wr SIMDE_VECTOR(16);
412
+ SIMDE__CONVERT_VECTOR(wa, a_.u16);
413
+ SIMDE__CONVERT_VECTOR(wb, b_.u16);
414
+ wr = (wa + wb + 1) >> 1;
415
+ SIMDE__CONVERT_VECTOR(r_.u16, wr); /* convert the result back to 16-bit lanes */
416
+ #else /* portable fallback: one lane at a time */
417
+ SIMDE__VECTORIZE
418
+ for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
419
+ r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1; /* where int is wider than 16 bits, the operands promote and the sum does not wrap before the shift */
420
+ }
421
+ #endif
422
+
423
+ return simde__m64_from_private(r_);
424
+ #endif
425
+ }
426
+ #define simde_m_pavgw(a, b) simde_mm_avg_pu16(a, b)
427
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) /* expose both native spellings as macros over the simde_ function */
428
+ # define _mm_avg_pu16(a, b) simde_mm_avg_pu16(a, b)
429
+ # define _m_pavgw(a, b) simde_mm_avg_pu16(a, b)
430
+ #endif
431
+
432
+ SIMDE__FUNCTION_ATTRIBUTES /* simde_mm_avg_pu8: per-lane rounded average of unsigned 8-bit lanes, (a + b + 1) >> 1, on a 64-bit vector. */
433
+ simde__m64
434
+ simde_mm_avg_pu8 (simde__m64 a, simde__m64 b) {
435
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX) /* the native intrinsic is used only when MMX is also available */
436
+ return _mm_avg_pu8(a, b);
437
+ #else
438
+ simde__m64_private
439
+ r_,
440
+ a_ = simde__m64_to_private(a),
441
+ b_ = simde__m64_to_private(b);
442
+
443
+ #if defined(SIMDE_SSE_NEON)
444
+ r_.neon_u8 = vrhadd_u8(b_.neon_u8, a_.neon_u8);
445
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE__CONVERT_VECTOR)
446
+ uint16_t wa SIMDE_VECTOR(16); /* 16-bit lanes hold a + b + 1 without 8-bit wraparound */
447
+ uint16_t wb SIMDE_VECTOR(16);
448
+ uint16_t wr SIMDE_VECTOR(16);
449
+ SIMDE__CONVERT_VECTOR(wa, a_.u8);
450
+ SIMDE__CONVERT_VECTOR(wb, b_.u8);
451
+ wr = (wa + wb + 1) >> 1;
452
+ SIMDE__CONVERT_VECTOR(r_.u8, wr); /* convert the result back to 8-bit lanes */
453
+ #else /* portable fallback: one lane at a time */
454
+ SIMDE__VECTORIZE
455
+ for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
456
+ r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1; /* uint8_t operands promote to int, so the sum does not wrap before the shift */
457
+ }
458
+ #endif
459
+
460
+ return simde__m64_from_private(r_);
461
+ #endif
462
+ }
463
+ #define simde_m_pavgb(a, b) simde_mm_avg_pu8(a, b)
464
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) /* expose both native spellings as macros over the simde_ function */
465
+ # define _mm_avg_pu8(a, b) simde_mm_avg_pu8(a, b)
466
+ # define _m_pavgb(a, b) simde_mm_avg_pu8(a, b)
467
+ #endif
468
+
469
+ SIMDE__FUNCTION_ATTRIBUTES /* simde_mm_cmpeq_ps: lane-wise a == b on floats; in the scalar fallback each result lane is all ones when true and zero when false. */
470
+ simde__m128
471
+ simde_mm_cmpeq_ps (simde__m128 a, simde__m128 b) {
472
+ #if defined(SIMDE_SSE_NATIVE)
473
+ return _mm_cmpeq_ps(a, b);
474
+ #else
475
+ simde__m128_private
476
+ r_,
477
+ a_ = simde__m128_to_private(a),
478
+ b_ = simde__m128_to_private(b);
479
+
480
+ #if defined(SIMDE_SSE_NEON)
481
+ r_.neon_u32 = vceqq_f32(a_.neon_f32, b_.neon_f32); /* the comparison result is stored through the unsigned 32-bit view */
482
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
483
+ r_.wasm_v128 = wasm_f32x4_eq(a_.wasm_v128, b_.wasm_v128);
484
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
485
+ r_.altivec_f32 = (vector float) vec_cmpeq(a_.altivec_f32, b_.altivec_f32);
486
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
487
+ r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), a_.f32 == b_.f32); /* NOTE(review): the sibling comparisons (cmpge/cmpgt/cmple/cmplt) use a plain C cast here - confirm whether the difference matters */
488
+ #else /* portable fallback: build the mask lane by lane */
489
+ SIMDE__VECTORIZE
490
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
491
+ r_.u32[i] = (a_.f32[i] == b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
492
+ }
493
+ #endif
494
+
495
+ return simde__m128_from_private(r_);
496
+ #endif
497
+ }
498
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) /* expose the native intrinsic name as a macro over the simde_ function */
499
+ # define _mm_cmpeq_ps(a, b) simde_mm_cmpeq_ps((a), (b))
500
+ #endif
501
+
502
+ SIMDE__FUNCTION_ATTRIBUTES /* simde_mm_cmpeq_ss: lane 0 is the a[0] == b[0] mask (all ones or zero); lanes 1..3 are copied from a. */
503
+ simde__m128
504
+ simde_mm_cmpeq_ss (simde__m128 a, simde__m128 b) {
505
+ #if defined(SIMDE_SSE_NATIVE)
506
+ return _mm_cmpeq_ss(a, b);
507
+ #elif defined(SIMDE_ASSUME_VECTORIZATION) /* compare all lanes, then keep only lane 0 of the mask on top of a */
508
+ return simde_mm_move_ss(a, simde_mm_cmpeq_ps(a, b));
509
+ #else
510
+ simde__m128_private
511
+ r_,
512
+ a_ = simde__m128_to_private(a),
513
+ b_ = simde__m128_to_private(b);
514
+
515
+ r_.u32[0] = (a_.f32[0] == b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
516
+ SIMDE__VECTORIZE
517
+ for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { /* starts at lane 1: the upper lanes keep a's bits */
518
+ r_.u32[i] = a_.u32[i];
519
+ }
520
+
521
+ return simde__m128_from_private(r_);
522
+ #endif
523
+ }
524
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) /* expose the native intrinsic name as a macro over the simde_ function */
525
+ # define _mm_cmpeq_ss(a, b) simde_mm_cmpeq_ss((a), (b))
526
+ #endif
527
+
528
+ SIMDE__FUNCTION_ATTRIBUTES /* simde_mm_cmpge_ps: lane-wise a >= b on floats; in the scalar fallback each result lane is all ones when true and zero when false. */
529
+ simde__m128
530
+ simde_mm_cmpge_ps (simde__m128 a, simde__m128 b) {
531
+ #if defined(SIMDE_SSE_NATIVE)
532
+ return _mm_cmpge_ps(a, b);
533
+ #else
534
+ simde__m128_private
535
+ r_,
536
+ a_ = simde__m128_to_private(a),
537
+ b_ = simde__m128_to_private(b);
538
+
539
+ #if defined(SIMDE_SSE_NEON)
540
+ r_.neon_u32 = vcgeq_f32(a_.neon_f32, b_.neon_f32); /* the comparison result is stored through the unsigned 32-bit view */
541
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
542
+ r_.wasm_v128 = wasm_f32x4_ge(a_.wasm_v128, b_.wasm_v128);
543
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
544
+ r_.altivec_f32 = (vector float) vec_cmpge(a_.altivec_f32, b_.altivec_f32);
545
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) /* compiler vector comparison, cast to the i32 view's type */
546
+ r_.i32 = (__typeof__(r_.i32)) (a_.f32 >= b_.f32);
547
+ #else /* portable fallback: build the mask lane by lane */
548
+ SIMDE__VECTORIZE
549
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
550
+ r_.u32[i] = (a_.f32[i] >= b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
551
+ }
552
+ #endif
553
+
554
+ return simde__m128_from_private(r_);
555
+ #endif
556
+ }
557
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) /* expose the native intrinsic name as a macro over the simde_ function */
558
+ # define _mm_cmpge_ps(a, b) simde_mm_cmpge_ps((a), (b))
559
+ #endif
560
+
561
+ SIMDE__FUNCTION_ATTRIBUTES /* simde_mm_cmpge_ss: lane 0 is the a[0] >= b[0] mask (all ones or zero); lanes 1..3 are copied from a. */
562
+ simde__m128
563
+ simde_mm_cmpge_ss (simde__m128 a, simde__m128 b) {
564
+ #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) /* NOTE(review): the native path is skipped under __PGI here and in simde_mm_cmpgt_ss, but not in the cmple/cmplt/cmpeq _ss variants; the reason is not visible in this chunk */
565
+ return _mm_cmpge_ss(a, b);
566
+ #elif defined(SIMDE_ASSUME_VECTORIZATION) /* compare all lanes, then keep only lane 0 of the mask on top of a */
567
+ return simde_mm_move_ss(a, simde_mm_cmpge_ps(a, b));
568
+ #else
569
+ simde__m128_private
570
+ r_,
571
+ a_ = simde__m128_to_private(a),
572
+ b_ = simde__m128_to_private(b);
573
+
574
+ r_.u32[0] = (a_.f32[0] >= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
575
+ SIMDE__VECTORIZE
576
+ for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { /* starts at lane 1: the upper lanes keep a's bits */
577
+ r_.u32[i] = a_.u32[i];
578
+ }
579
+
580
+ return simde__m128_from_private(r_);
581
+ #endif
582
+ }
583
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) /* expose the native intrinsic name as a macro over the simde_ function */
584
+ # define _mm_cmpge_ss(a, b) simde_mm_cmpge_ss((a), (b))
585
+ #endif
586
+
587
+ SIMDE__FUNCTION_ATTRIBUTES /* simde_mm_cmpgt_ps: lane-wise a > b on floats; in the scalar fallback each result lane is all ones when true and zero when false. */
588
+ simde__m128
589
+ simde_mm_cmpgt_ps (simde__m128 a, simde__m128 b) {
590
+ #if defined(SIMDE_SSE_NATIVE)
591
+ return _mm_cmpgt_ps(a, b);
592
+ #else
593
+ simde__m128_private
594
+ r_,
595
+ a_ = simde__m128_to_private(a),
596
+ b_ = simde__m128_to_private(b);
597
+
598
+ #if defined(SIMDE_SSE_NEON)
599
+ r_.neon_u32 = vcgtq_f32(a_.neon_f32, b_.neon_f32); /* the comparison result is stored through the unsigned 32-bit view */
600
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
601
+ r_.wasm_v128 = wasm_f32x4_gt(a_.wasm_v128, b_.wasm_v128);
602
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
603
+ r_.altivec_f32 = (vector float) vec_cmpgt(a_.altivec_f32, b_.altivec_f32);
604
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) /* compiler vector comparison, cast to the i32 view's type */
605
+ r_.i32 = (__typeof__(r_.i32)) (a_.f32 > b_.f32);
606
+ #else /* portable fallback: build the mask lane by lane */
607
+ SIMDE__VECTORIZE
608
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
609
+ r_.u32[i] = (a_.f32[i] > b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
610
+ }
611
+ #endif
612
+
613
+ return simde__m128_from_private(r_);
614
+ #endif
615
+ }
616
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) /* expose the native intrinsic name as a macro over the simde_ function */
617
+ # define _mm_cmpgt_ps(a, b) simde_mm_cmpgt_ps((a), (b))
618
+ #endif
619
+
620
+ SIMDE__FUNCTION_ATTRIBUTES /* simde_mm_cmpgt_ss: lane 0 is the a[0] > b[0] mask (all ones or zero); lanes 1..3 are copied from a. */
621
+ simde__m128
622
+ simde_mm_cmpgt_ss (simde__m128 a, simde__m128 b) {
623
+ #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) /* NOTE(review): the native path is skipped under __PGI here and in simde_mm_cmpge_ss, but not in the cmple/cmplt/cmpeq _ss variants; the reason is not visible in this chunk */
624
+ return _mm_cmpgt_ss(a, b);
625
+ #elif defined(SIMDE_ASSUME_VECTORIZATION) /* compare all lanes, then keep only lane 0 of the mask on top of a */
626
+ return simde_mm_move_ss(a, simde_mm_cmpgt_ps(a, b));
627
+ #else
628
+ simde__m128_private
629
+ r_,
630
+ a_ = simde__m128_to_private(a),
631
+ b_ = simde__m128_to_private(b);
632
+
633
+ r_.u32[0] = (a_.f32[0] > b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
634
+ SIMDE__VECTORIZE
635
+ for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { /* starts at lane 1: the upper lanes keep a's bits */
636
+ r_.u32[i] = a_.u32[i];
637
+ }
638
+
639
+ return simde__m128_from_private(r_);
640
+ #endif
641
+ }
642
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) /* expose the native intrinsic name as a macro over the simde_ function */
643
+ # define _mm_cmpgt_ss(a, b) simde_mm_cmpgt_ss((a), (b))
644
+ #endif
645
+
646
+ SIMDE__FUNCTION_ATTRIBUTES /* simde_mm_cmple_ps: lane-wise a <= b on floats; in the scalar fallback each result lane is all ones when true and zero when false. */
647
+ simde__m128
648
+ simde_mm_cmple_ps (simde__m128 a, simde__m128 b) {
649
+ #if defined(SIMDE_SSE_NATIVE)
650
+ return _mm_cmple_ps(a, b);
651
+ #else
652
+ simde__m128_private
653
+ r_,
654
+ a_ = simde__m128_to_private(a),
655
+ b_ = simde__m128_to_private(b);
656
+
657
+ #if defined(SIMDE_SSE_NEON)
658
+ r_.neon_u32 = vcleq_f32(a_.neon_f32, b_.neon_f32); /* the comparison result is stored through the unsigned 32-bit view */
659
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
660
+ r_.wasm_v128 = wasm_f32x4_le(a_.wasm_v128, b_.wasm_v128);
661
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
662
+ r_.altivec_f32 = (vector float) vec_cmple(a_.altivec_f32, b_.altivec_f32);
663
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) /* compiler vector comparison, cast to the i32 view's type */
664
+ r_.i32 = (__typeof__(r_.i32)) (a_.f32 <= b_.f32);
665
+ #else /* portable fallback: build the mask lane by lane */
666
+ SIMDE__VECTORIZE
667
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
668
+ r_.u32[i] = (a_.f32[i] <= b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
669
+ }
670
+ #endif
671
+
672
+ return simde__m128_from_private(r_);
673
+ #endif
674
+ }
675
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) /* expose the native intrinsic name as a macro over the simde_ function */
676
+ # define _mm_cmple_ps(a, b) simde_mm_cmple_ps((a), (b))
677
+ #endif
678
+
679
+ SIMDE__FUNCTION_ATTRIBUTES /* simde_mm_cmple_ss: lane 0 is the a[0] <= b[0] mask (all ones or zero); lanes 1..3 are copied from a. */
680
+ simde__m128
681
+ simde_mm_cmple_ss (simde__m128 a, simde__m128 b) {
682
+ #if defined(SIMDE_SSE_NATIVE)
683
+ return _mm_cmple_ss(a, b);
684
+ #elif defined(SIMDE_ASSUME_VECTORIZATION) /* compare all lanes, then keep only lane 0 of the mask on top of a */
685
+ return simde_mm_move_ss(a, simde_mm_cmple_ps(a, b));
686
+ #else
687
+ simde__m128_private
688
+ r_,
689
+ a_ = simde__m128_to_private(a),
690
+ b_ = simde__m128_to_private(b);
691
+
692
+ r_.u32[0] = (a_.f32[0] <= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
693
+ SIMDE__VECTORIZE
694
+ for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { /* starts at lane 1: the upper lanes keep a's bits */
695
+ r_.u32[i] = a_.u32[i];
696
+ }
697
+
698
+ return simde__m128_from_private(r_);
699
+ #endif
700
+ }
701
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) /* expose the native intrinsic name as a macro over the simde_ function */
702
+ # define _mm_cmple_ss(a, b) simde_mm_cmple_ss((a), (b))
703
+ #endif
704
+
705
+ SIMDE__FUNCTION_ATTRIBUTES
706
+ simde__m128
707
+ simde_mm_cmplt_ps (simde__m128 a, simde__m128 b) {
708
+ #if defined(SIMDE_SSE_NATIVE)
709
+ return _mm_cmplt_ps(a, b);
710
+ #else
711
+ simde__m128_private
712
+ r_,
713
+ a_ = simde__m128_to_private(a),
714
+ b_ = simde__m128_to_private(b);
715
+
716
+ #if defined(SIMDE_SSE_NEON)
717
+ r_.neon_u32 = vcltq_f32(a_.neon_f32, b_.neon_f32);
718
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
719
+ r_.wasm_v128 = wasm_f32x4_lt(a_.wasm_v128, b_.wasm_v128);
720
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
721
+ r_.altivec_f32 = (vector float) vec_cmplt(a_.altivec_f32, b_.altivec_f32);
722
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
723
+ r_.i32 = (__typeof__(r_.i32)) (a_.f32 < b_.f32);
724
+ #else
725
+ SIMDE__VECTORIZE
726
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
727
+ r_.u32[i] = (a_.f32[i] < b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
728
+ }
729
+ #endif
730
+
731
+ return simde__m128_from_private(r_);
732
+ #endif
733
+ }
734
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
735
+ # define _mm_cmplt_ps(a, b) simde_mm_cmplt_ps((a), (b))
736
+ #endif
737
+
738
+ SIMDE__FUNCTION_ATTRIBUTES
739
+ simde__m128
740
+ simde_mm_cmplt_ss (simde__m128 a, simde__m128 b) {
741
+ #if defined(SIMDE_SSE_NATIVE)
742
+ return _mm_cmplt_ss(a, b);
743
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
744
+ return simde_mm_move_ss(a, simde_mm_cmplt_ps(a, b));
745
+ #else
746
+ simde__m128_private
747
+ r_,
748
+ a_ = simde__m128_to_private(a),
749
+ b_ = simde__m128_to_private(b);
750
+
751
+ r_.u32[0] = (a_.f32[0] < b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
752
+ SIMDE__VECTORIZE
753
+ for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
754
+ r_.u32[i] = a_.u32[i];
755
+ }
756
+
757
+ return simde__m128_from_private(r_);
758
+ #endif
759
+ }
760
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
761
+ # define _mm_cmplt_ss(a, b) simde_mm_cmplt_ss((a), (b))
762
+ #endif
763
+
764
+ SIMDE__FUNCTION_ATTRIBUTES
765
+ simde__m128
766
+ simde_mm_cmpneq_ps (simde__m128 a, simde__m128 b) {
767
+ #if defined(SIMDE_SSE_NATIVE)
768
+ return _mm_cmpneq_ps(a, b);
769
+ #else
770
+ simde__m128_private
771
+ r_,
772
+ a_ = simde__m128_to_private(a),
773
+ b_ = simde__m128_to_private(b);
774
+
775
+ #if defined(SIMDE_SSE_NEON)
776
+ r_.neon_u32 = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32));
777
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
778
+ r_.wasm_v128 = wasm_f32x4_ne(a_.wasm_v128, b_.wasm_v128);
779
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC) && (SIMDE_ARCH_POWER >= 900) && !defined(HEDLEY_IBM_VERSION)
780
+ /* vec_cmpne(vector float, vector float) is missing from XL C/C++ v16.1.1,
781
+ though the documentation (table 89 on page 432 of the IBM XL C/C++ for
782
+ Linux Compiler Reference, Version 16.1.1) shows that it should be
783
+ present. Both GCC and clang support it. */
784
+ r_.altivec_f32 = (vector float) vec_cmpne(a_.altivec_f32, b_.altivec_f32);
785
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
786
+ r_.i32 = (__typeof__(r_.i32)) (a_.f32 != b_.f32);
787
+ #else
788
+ SIMDE__VECTORIZE
789
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
790
+ r_.u32[i] = (a_.f32[i] != b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0);
791
+ }
792
+ #endif
793
+
794
+ return simde__m128_from_private(r_);
795
+ #endif
796
+ }
797
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
798
+ # define _mm_cmpneq_ps(a, b) simde_mm_cmpneq_ps((a), (b))
799
+ #endif
800
+
801
+ SIMDE__FUNCTION_ATTRIBUTES
802
+ simde__m128
803
+ simde_mm_cmpneq_ss (simde__m128 a, simde__m128 b) {
804
+ #if defined(SIMDE_SSE_NATIVE)
805
+ return _mm_cmpneq_ss(a, b);
806
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
807
+ return simde_mm_move_ss(a, simde_mm_cmpneq_ps(a, b));
808
+ #else
809
+ simde__m128_private
810
+ r_,
811
+ a_ = simde__m128_to_private(a),
812
+ b_ = simde__m128_to_private(b);
813
+
814
+ r_.u32[0] = (a_.f32[0] != b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
815
+ SIMDE__VECTORIZE
816
+ for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
817
+ r_.u32[i] = a_.u32[i];
818
+ }
819
+
820
+ return simde__m128_from_private(r_);
821
+ #endif
822
+ }
823
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
824
+ # define _mm_cmpneq_ss(a, b) simde_mm_cmpneq_ss((a), (b))
825
+ #endif
826
+
827
+ SIMDE__FUNCTION_ATTRIBUTES
828
+ simde__m128
829
+ simde_mm_cmpnge_ps (simde__m128 a, simde__m128 b) {
830
+ return simde_mm_cmplt_ps(a, b);
831
+ }
832
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
833
+ # define _mm_cmpnge_ps(a, b) simde_mm_cmpnge_ps((a), (b))
834
+ #endif
835
+
836
+ SIMDE__FUNCTION_ATTRIBUTES
837
+ simde__m128
838
+ simde_mm_cmpnge_ss (simde__m128 a, simde__m128 b) {
839
+ return simde_mm_cmplt_ss(a, b);
840
+ }
841
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
842
+ # define _mm_cmpnge_ss(a, b) simde_mm_cmpnge_ss((a), (b))
843
+ #endif
844
+
845
+ SIMDE__FUNCTION_ATTRIBUTES
846
+ simde__m128
847
+ simde_mm_cmpngt_ps (simde__m128 a, simde__m128 b) {
848
+ return simde_mm_cmple_ps(a, b);
849
+ }
850
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
851
+ # define _mm_cmpngt_ps(a, b) simde_mm_cmpngt_ps((a), (b))
852
+ #endif
853
+
854
+ SIMDE__FUNCTION_ATTRIBUTES
855
+ simde__m128
856
+ simde_mm_cmpngt_ss (simde__m128 a, simde__m128 b) {
857
+ return simde_mm_cmple_ss(a, b);
858
+ }
859
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
860
+ # define _mm_cmpngt_ss(a, b) simde_mm_cmpngt_ss((a), (b))
861
+ #endif
862
+
863
+ SIMDE__FUNCTION_ATTRIBUTES
864
+ simde__m128
865
+ simde_mm_cmpnle_ps (simde__m128 a, simde__m128 b) {
866
+ return simde_mm_cmpgt_ps(a, b);
867
+ }
868
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
869
+ # define _mm_cmpnle_ps(a, b) simde_mm_cmpnle_ps((a), (b))
870
+ #endif
871
+
872
+ SIMDE__FUNCTION_ATTRIBUTES
873
+ simde__m128
874
+ simde_mm_cmpnle_ss (simde__m128 a, simde__m128 b) {
875
+ return simde_mm_cmpgt_ss(a, b);
876
+ }
877
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
878
+ # define _mm_cmpnle_ss(a, b) simde_mm_cmpnle_ss((a), (b))
879
+ #endif
880
+
881
+ SIMDE__FUNCTION_ATTRIBUTES
882
+ simde__m128
883
+ simde_mm_cmpnlt_ps (simde__m128 a, simde__m128 b) {
884
+ return simde_mm_cmpge_ps(a, b);
885
+ }
886
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
887
+ # define _mm_cmpnlt_ps(a, b) simde_mm_cmpnlt_ps((a), (b))
888
+ #endif
889
+
890
+ SIMDE__FUNCTION_ATTRIBUTES
891
+ simde__m128
892
+ simde_mm_cmpnlt_ss (simde__m128 a, simde__m128 b) {
893
+ return simde_mm_cmpge_ss(a, b);
894
+ }
895
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
896
+ # define _mm_cmpnlt_ss(a, b) simde_mm_cmpnlt_ss((a), (b))
897
+ #endif
898
+
899
+ SIMDE__FUNCTION_ATTRIBUTES
900
+ simde__m128
901
+ simde_mm_cmpord_ps (simde__m128 a, simde__m128 b) {
902
+ #if defined(SIMDE_SSE_NATIVE)
903
+ return _mm_cmpord_ps(a, b);
904
+ #else
905
+ simde__m128_private
906
+ r_,
907
+ a_ = simde__m128_to_private(a),
908
+ b_ = simde__m128_to_private(b);
909
+
910
+ #if defined(SIMDE_SSE_NEON)
911
+ /* Note: NEON does not have ordered compare builtin
912
+ Need to compare a eq a and b eq b to check for NaN
913
+ Do AND of results to get final */
914
+ uint32x4_t ceqaa = vceqq_f32(a_.neon_f32, a_.neon_f32);
915
+ uint32x4_t ceqbb = vceqq_f32(b_.neon_f32, b_.neon_f32);
916
+ r_.neon_u32 = vandq_u32(ceqaa, ceqbb);
917
+ #elif defined(simde_isnanf)
918
+ SIMDE__VECTORIZE
919
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
920
+ r_.u32[i] = (simde_isnanf(a_.f32[i]) || simde_isnanf(b_.f32[i])) ? UINT32_C(0) : ~UINT32_C(0);
921
+ }
922
+ #else
923
+ HEDLEY_UNREACHABLE();
924
+ #endif
925
+
926
+ return simde__m128_from_private(r_);
927
+ #endif
928
+ }
929
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
930
+ # define _mm_cmpord_ps(a, b) simde_mm_cmpord_ps((a), (b))
931
+ #endif
932
+
933
+ SIMDE__FUNCTION_ATTRIBUTES
934
+ simde__m128
935
+ simde_mm_cmpunord_ps (simde__m128 a, simde__m128 b) {
936
+ #if defined(SIMDE_SSE_NATIVE)
937
+ return _mm_cmpunord_ps(a, b);
938
+ #else
939
+ simde__m128_private
940
+ r_,
941
+ a_ = simde__m128_to_private(a),
942
+ b_ = simde__m128_to_private(b);
943
+
944
+ #if defined(simde_isnanf)
945
+ SIMDE__VECTORIZE
946
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
947
+ r_.u32[i] = (simde_isnanf(a_.f32[i]) || simde_isnanf(b_.f32[i])) ? ~UINT32_C(0) : UINT32_C(0);
948
+ }
949
+ #else
950
+ HEDLEY_UNREACHABLE();
951
+ #endif
952
+
953
+ return simde__m128_from_private(r_);
954
+ #endif
955
+ }
956
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
957
+ # define _mm_cmpunord_ps(a, b) simde_mm_cmpunord_ps((a), (b))
958
+ #endif
959
+
960
+ SIMDE__FUNCTION_ATTRIBUTES
961
+ simde__m128
962
+ simde_mm_cmpunord_ss (simde__m128 a, simde__m128 b) {
963
+ #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
964
+ return _mm_cmpunord_ss(a, b);
965
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
966
+ return simde_mm_move_ss(a, simde_mm_cmpunord_ps(a, b));
967
+ #else
968
+ simde__m128_private
969
+ r_,
970
+ a_ = simde__m128_to_private(a),
971
+ b_ = simde__m128_to_private(b);
972
+
973
+ #if defined(simde_isnanf)
974
+ r_.u32[0] = (simde_isnanf(a_.f32[0]) || simde_isnanf(b_.f32[0])) ? ~UINT32_C(0) : UINT32_C(0);
975
+ SIMDE__VECTORIZE
976
+ for (size_t i = 1 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
977
+ r_.u32[i] = a_.u32[i];
978
+ }
979
+ #else
980
+ HEDLEY_UNREACHABLE();
981
+ #endif
982
+
983
+ return simde__m128_from_private(r_);
984
+ #endif
985
+ }
986
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
987
+ # define _mm_cmpunord_ss(a, b) simde_mm_cmpunord_ss((a), (b))
988
+ #endif
989
+
990
+ SIMDE__FUNCTION_ATTRIBUTES
991
+ int
992
+ simde_mm_comieq_ss (simde__m128 a, simde__m128 b) {
993
+ #if defined(SIMDE_SSE_NATIVE)
994
+ return _mm_comieq_ss(a, b);
995
+ #else
996
+ simde__m128_private
997
+ a_ = simde__m128_to_private(a),
998
+ b_ = simde__m128_to_private(b);
999
+
1000
+ #if defined(SIMDE_SSE_NEON)
1001
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1002
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1003
+ uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
1004
+ uint32x4_t a_eq_b = vceqq_f32(a_.neon_f32, b_.neon_f32);
1005
+ return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0);
1006
+ #else
1007
+ return a_.f32[0] == b_.f32[0];
1008
+ #endif
1009
+ #endif
1010
+ }
1011
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1012
+ # define _mm_comieq_ss(a, b) simde_mm_comieq_ss((a), (b))
1013
+ #endif
1014
+
1015
+ SIMDE__FUNCTION_ATTRIBUTES
1016
+ int
1017
+ simde_mm_comige_ss (simde__m128 a, simde__m128 b) {
1018
+ #if defined(SIMDE_SSE_NATIVE)
1019
+ return _mm_comige_ss(a, b);
1020
+ #else
1021
+ simde__m128_private
1022
+ a_ = simde__m128_to_private(a),
1023
+ b_ = simde__m128_to_private(b);
1024
+
1025
+ #if defined(SIMDE_SSE_NEON)
1026
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1027
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1028
+ uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1029
+ uint32x4_t a_ge_b = vcgeq_f32(a_.neon_f32, b_.neon_f32);
1030
+ return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0);
1031
+ #else
1032
+ return a_.f32[0] >= b_.f32[0];
1033
+ #endif
1034
+ #endif
1035
+ }
1036
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1037
+ # define _mm_comige_ss(a, b) simde_mm_comige_ss((a), (b))
1038
+ #endif
1039
+
1040
+ SIMDE__FUNCTION_ATTRIBUTES
1041
+ int
1042
+ simde_mm_comigt_ss (simde__m128 a, simde__m128 b) {
1043
+ #if defined(SIMDE_SSE_NATIVE)
1044
+ return _mm_comigt_ss(a, b);
1045
+ #else
1046
+ simde__m128_private
1047
+ a_ = simde__m128_to_private(a),
1048
+ b_ = simde__m128_to_private(b);
1049
+
1050
+ #if defined(SIMDE_SSE_NEON)
1051
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1052
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1053
+ uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1054
+ uint32x4_t a_gt_b = vcgtq_f32(a_.neon_f32, b_.neon_f32);
1055
+ return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0);
1056
+ #else
1057
+ return a_.f32[0] > b_.f32[0];
1058
+ #endif
1059
+ #endif
1060
+ }
1061
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1062
+ # define _mm_comigt_ss(a, b) simde_mm_comigt_ss((a), (b))
1063
+ #endif
1064
+
1065
+ SIMDE__FUNCTION_ATTRIBUTES
1066
+ int
1067
+ simde_mm_comile_ss (simde__m128 a, simde__m128 b) {
1068
+ #if defined(SIMDE_SSE_NATIVE)
1069
+ return _mm_comile_ss(a, b);
1070
+ #else
1071
+ simde__m128_private
1072
+ a_ = simde__m128_to_private(a),
1073
+ b_ = simde__m128_to_private(b);
1074
+
1075
+ #if defined(SIMDE_SSE_NEON)
1076
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1077
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1078
+ uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
1079
+ uint32x4_t a_le_b = vcleq_f32(a_.neon_f32, b_.neon_f32);
1080
+ return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0);
1081
+ #else
1082
+ return a_.f32[0] <= b_.f32[0];
1083
+ #endif
1084
+ #endif
1085
+ }
1086
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1087
+ # define _mm_comile_ss(a, b) simde_mm_comile_ss((a), (b))
1088
+ #endif
1089
+
1090
+ SIMDE__FUNCTION_ATTRIBUTES
1091
+ int
1092
+ simde_mm_comilt_ss (simde__m128 a, simde__m128 b) {
1093
+ #if defined(SIMDE_SSE_NATIVE)
1094
+ return _mm_comilt_ss(a, b);
1095
+ #else
1096
+ simde__m128_private
1097
+ a_ = simde__m128_to_private(a),
1098
+ b_ = simde__m128_to_private(b);
1099
+
1100
+ #if defined(SIMDE_SSE_NEON)
1101
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1102
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1103
+ uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
1104
+ uint32x4_t a_lt_b = vcltq_f32(a_.neon_f32, b_.neon_f32);
1105
+ return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0);
1106
+ #else
1107
+ return a_.f32[0] < b_.f32[0];
1108
+ #endif
1109
+ #endif
1110
+ }
1111
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1112
+ # define _mm_comilt_ss(a, b) simde_mm_comilt_ss((a), (b))
1113
+ #endif
1114
+
1115
+ SIMDE__FUNCTION_ATTRIBUTES
1116
+ int
1117
+ simde_mm_comineq_ss (simde__m128 a, simde__m128 b) {
1118
+ #if defined(SIMDE_SSE_NATIVE)
1119
+ return _mm_comineq_ss(a, b);
1120
+ #else
1121
+ simde__m128_private
1122
+ a_ = simde__m128_to_private(a),
1123
+ b_ = simde__m128_to_private(b);
1124
+
1125
+ #if defined(SIMDE_SSE_NEON)
1126
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1127
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1128
+ uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1129
+ uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32));
1130
+ return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0);
1131
+ #else
1132
+ return a_.f32[0] != b_.f32[0];
1133
+ #endif
1134
+ #endif
1135
+ }
1136
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1137
+ # define _mm_comineq_ss(a, b) simde_mm_comineq_ss((a), (b))
1138
+ #endif
1139
+
1140
+ SIMDE__FUNCTION_ATTRIBUTES
1141
+ simde__m128
1142
+ simde_mm_cvt_pi2ps (simde__m128 a, simde__m64 b) {
1143
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1144
+ return _mm_cvt_pi2ps(a, b);
1145
+ #else
1146
+ simde__m128_private
1147
+ r_,
1148
+ a_ = simde__m128_to_private(a);
1149
+ simde__m64_private b_ = simde__m64_to_private(b);
1150
+
1151
+ #if defined(SIMDE_SSE_NEON)
1152
+ r_.neon_f32 = vcombine_f32(vcvt_f32_s32(b_.neon_i32), vget_high_f32(a_.neon_f32));
1153
+ #elif defined(SIMDE__CONVERT_VECTOR)
1154
+ SIMDE__CONVERT_VECTOR(r_.m64_private[0].f32, b_.i32);
1155
+ r_.m64_private[1] = a_.m64_private[1];
1156
+
1157
+ #else
1158
+ r_.f32[0] = (simde_float32) b_.i32[0];
1159
+ r_.f32[1] = (simde_float32) b_.i32[1];
1160
+ r_.i32[2] = a_.i32[2];
1161
+ r_.i32[3] = a_.i32[3];
1162
+ #endif
1163
+
1164
+ return simde__m128_from_private(r_);
1165
+ #endif
1166
+ }
1167
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1168
+ # define _mm_cvt_pi2ps(a, b) simde_mm_cvt_pi2ps((a), b)
1169
+ #endif
1170
+
1171
+ SIMDE__FUNCTION_ATTRIBUTES
1172
+ simde__m64
1173
+ simde_mm_cvt_ps2pi (simde__m128 a) {
1174
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1175
+ return _mm_cvt_ps2pi(a);
1176
+ #else
1177
+ simde__m64_private r_;
1178
+ simde__m128_private a_ = simde__m128_to_private(a);
1179
+
1180
+ #if defined(SIMDE_SSE_NEON)
1181
+ r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32));
1182
+ #elif defined(SIMDE__CONVERT_VECTOR) && !defined(__clang__)
1183
+ SIMDE__CONVERT_VECTOR(r_.i32, a_.m64_private[0].f32);
1184
+ #else
1185
+ SIMDE__VECTORIZE
1186
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1187
+ r_.i32[i] = (int32_t) a_.f32[i];
1188
+ }
1189
+ #endif
1190
+
1191
+ return simde__m64_from_private(r_);
1192
+ #endif
1193
+ }
1194
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1195
+ # define _mm_cvt_ps2pi(a) simde_mm_cvt_ps2pi((a))
1196
+ #endif
1197
+
1198
+ SIMDE__FUNCTION_ATTRIBUTES
1199
+ simde__m128
1200
+ simde_mm_cvt_si2ss (simde__m128 a, int32_t b) {
1201
+ #if defined(SIMDE_SSE_NATIVE)
1202
+ return _mm_cvt_si2ss(a, b);
1203
+ #else
1204
+ simde__m128_private
1205
+ r_,
1206
+ a_ = simde__m128_to_private(a);
1207
+
1208
+ #if defined(SIMDE_SSE_NEON)
1209
+ r_.neon_f32 = vsetq_lane_f32((float) b, a_.neon_f32, 0);
1210
+ #else
1211
+ r_.f32[0] = (simde_float32) b;
1212
+ r_.i32[1] = a_.i32[1];
1213
+ r_.i32[2] = a_.i32[2];
1214
+ r_.i32[3] = a_.i32[3];
1215
+ #endif
1216
+
1217
+ return simde__m128_from_private(r_);
1218
+ #endif
1219
+ }
1220
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1221
+ # define _mm_cvt_si2ss(a, b) simde_mm_cvt_si2ss((a), b)
1222
+ #endif
1223
+
1224
+ SIMDE__FUNCTION_ATTRIBUTES
1225
+ int32_t
1226
+ simde_mm_cvt_ss2si (simde__m128 a) {
1227
+ #if defined(SIMDE_SSE_NATIVE)
1228
+ return _mm_cvt_ss2si(a);
1229
+ #else
1230
+ simde__m128_private a_ = simde__m128_to_private(a);
1231
+
1232
+ #if defined(SIMDE_SSE_NEON)
1233
+ return SIMDE_CONVERT_FTOI(int32_t, nearbyintf(vgetq_lane_f32(a_.neon_f32, 0)));
1234
+ #elif defined(SIMDE_HAVE_MATH_H)
1235
+ return SIMDE_CONVERT_FTOI(int32_t, nearbyintf(a_.f32[0]));
1236
+ #else
1237
+ HEDLEY_UNREACHABLE();
1238
+ #endif
1239
+ #endif
1240
+ }
1241
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1242
+ # define _mm_cvt_ss2si(a) simde_mm_cvt_ss2si((a))
1243
+ #endif
1244
+
1245
+ SIMDE__FUNCTION_ATTRIBUTES
1246
+ simde__m128
1247
+ simde_mm_cvtpi16_ps (simde__m64 a) {
1248
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1249
+ return _mm_cvtpi16_ps(a);
1250
+ #else
1251
+ simde__m128_private r_;
1252
+ simde__m64_private a_ = simde__m64_to_private(a);
1253
+
1254
+ #if defined(SIMDE_SSE_NEON) && 0 /* TODO */
1255
+ r_.neon_f32 = vmovl_s16(vget_low_s16(vuzp1q_s16(a_.neon_i16, vmovq_n_s16(0))));
1256
+ #elif defined(SIMDE__CONVERT_VECTOR)
1257
+ SIMDE__CONVERT_VECTOR(r_.f32, a_.i16);
1258
+ #else
1259
+ SIMDE__VECTORIZE
1260
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1261
+ simde_float32 v = a_.i16[i];
1262
+ r_.f32[i] = v;
1263
+ }
1264
+ #endif
1265
+
1266
+ return simde__m128_from_private(r_);
1267
+ #endif
1268
+ }
1269
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1270
+ # define _mm_cvtpi16_ps(a) simde_mm_cvtpi16_ps(a)
1271
+ #endif
1272
+
1273
+ SIMDE__FUNCTION_ATTRIBUTES
1274
+ simde__m128
1275
+ simde_mm_cvtpi32_ps (simde__m128 a, simde__m64 b) {
1276
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1277
+ return _mm_cvtpi32_ps(a, b);
1278
+ #else
1279
+ simde__m128_private
1280
+ r_,
1281
+ a_ = simde__m128_to_private(a);
1282
+ simde__m64_private b_ = simde__m64_to_private(b);
1283
+
1284
+ #if defined(SIMDE_SSE_NEON)
1285
+ r_.neon_f32 = vcombine_f32(vcvt_f32_s32(b_.neon_i32), vget_high_f32(a_.neon_f32));
1286
+ #elif defined(SIMDE__CONVERT_VECTOR)
1287
+ SIMDE__CONVERT_VECTOR(r_.m64_private[0].f32, b_.i32);
1288
+ r_.m64_private[1] = a_.m64_private[1];
1289
+ #else
1290
+ r_.f32[0] = (simde_float32) b_.i32[0];
1291
+ r_.f32[1] = (simde_float32) b_.i32[1];
1292
+ r_.i32[2] = a_.i32[2];
1293
+ r_.i32[3] = a_.i32[3];
1294
+ #endif
1295
+
1296
+ return simde__m128_from_private(r_);
1297
+ #endif
1298
+ }
1299
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1300
+ # define _mm_cvtpi32_ps(a, b) simde_mm_cvtpi32_ps((a), b)
1301
+ #endif
1302
+
1303
+ SIMDE__FUNCTION_ATTRIBUTES
1304
+ simde__m128
1305
+ simde_mm_cvtpi32x2_ps (simde__m64 a, simde__m64 b) {
1306
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1307
+ return _mm_cvtpi32x2_ps(a, b);
1308
+ #else
1309
+ simde__m128_private r_;
1310
+ simde__m64_private
1311
+ a_ = simde__m64_to_private(a),
1312
+ b_ = simde__m64_to_private(b);
1313
+
1314
+ #if defined(SIMDE_SSE_NEON)
1315
+ r_.neon_f32 = vcvtq_f32_s32(vcombine_s32(a_.neon_i32, b_.neon_i32));
1316
+ #elif defined(SIMDE__CONVERT_VECTOR)
1317
+ SIMDE__CONVERT_VECTOR(r_.m64_private[0].f32, a_.i32);
1318
+ SIMDE__CONVERT_VECTOR(r_.m64_private[1].f32, b_.i32);
1319
+ #else
1320
+ r_.f32[0] = (simde_float32) a_.i32[0];
1321
+ r_.f32[1] = (simde_float32) a_.i32[1];
1322
+ r_.f32[2] = (simde_float32) b_.i32[0];
1323
+ r_.f32[3] = (simde_float32) b_.i32[1];
1324
+ #endif
1325
+
1326
+ return simde__m128_from_private(r_);
1327
+ #endif
1328
+ }
1329
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1330
+ # define _mm_cvtpi32x2_ps(a, b) simde_mm_cvtpi32x2_ps(a, b)
1331
+ #endif
1332
+
1333
+ SIMDE__FUNCTION_ATTRIBUTES
1334
+ simde__m128
1335
+ simde_mm_cvtpi8_ps (simde__m64 a) {
1336
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1337
+ return _mm_cvtpi8_ps(a);
1338
+ #else
1339
+ simde__m128_private r_;
1340
+ simde__m64_private a_ = simde__m64_to_private(a);
1341
+
1342
+ #if defined(SIMDE_SSE_NEON)
1343
+ r_.neon_f32 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(a_.neon_i8))));
1344
+ #else
1345
+ r_.f32[0] = (simde_float32) a_.i8[0];
1346
+ r_.f32[1] = (simde_float32) a_.i8[1];
1347
+ r_.f32[2] = (simde_float32) a_.i8[2];
1348
+ r_.f32[3] = (simde_float32) a_.i8[3];
1349
+ #endif
1350
+
1351
+ return simde__m128_from_private(r_);
1352
+ #endif
1353
+ }
1354
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1355
+ # define _mm_cvtpi8_ps(a) simde_mm_cvtpi8_ps(a)
1356
+ #endif
1357
+
1358
+ SIMDE__FUNCTION_ATTRIBUTES
1359
+ simde__m64
1360
+ simde_mm_cvtps_pi16 (simde__m128 a) {
1361
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1362
+ return _mm_cvtps_pi16(a);
1363
+ #else
1364
+ simde__m64_private r_;
1365
+ simde__m128_private a_ = simde__m128_to_private(a);
1366
+
1367
+ #if defined(SIMDE__CONVERT_VECTOR)
1368
+ SIMDE__CONVERT_VECTOR(r_.i16, a_.f32);
1369
+ #elif defined(SIMDE_SSE_NEON)
1370
+ r_.neon_i16 = vmovn_s32(vcvtq_s32_f32(a_.neon_f32));
1371
+ #else
1372
+ SIMDE__VECTORIZE
1373
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1374
+ r_.i16[i] = SIMDE_CONVERT_FTOI(int16_t, a_.f32[i]);
1375
+ }
1376
+ #endif
1377
+
1378
+ return simde__m64_from_private(r_);
1379
+ #endif
1380
+ }
1381
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1382
+ # define _mm_cvtps_pi16(a) simde_mm_cvtps_pi16((a))
1383
+ #endif
1384
+
1385
+ SIMDE__FUNCTION_ATTRIBUTES
1386
+ simde__m64
1387
+ simde_mm_cvtps_pi32 (simde__m128 a) {
1388
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1389
+ return _mm_cvtps_pi32(a);
1390
+ #else
1391
+ simde__m64_private r_;
1392
+ simde__m128_private a_ = simde__m128_to_private(a);
1393
+
1394
+ #if defined(SIMDE_SSE_NEON)
1395
+ r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32));
1396
+ #elif defined(SIMDE__CONVERT_VECTOR)
1397
+ SIMDE__CONVERT_VECTOR(r_.i32, a_.m64_private[0].f32);
1398
+ #else
1399
+ SIMDE__VECTORIZE
1400
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1401
+ r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f32[i]);
1402
+ }
1403
+ #endif
1404
+
1405
+ return simde__m64_from_private(r_);
1406
+ #endif
1407
+ }
1408
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1409
+ # define _mm_cvtps_pi32(a) simde_mm_cvtps_pi32((a))
1410
+ #endif
1411
+
1412
+ SIMDE__FUNCTION_ATTRIBUTES
1413
+ simde__m64
1414
+ simde_mm_cvtps_pi8 (simde__m128 a) {
1415
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1416
+ return _mm_cvtps_pi8(a);
1417
+ #else
1418
+ simde__m64_private r_;
1419
+ simde__m128_private a_ = simde__m128_to_private(a);
1420
+
1421
+ #if defined(SIMDE_SSE_NEON)
1422
+ int16x4_t b = vmovn_s32(vcvtq_s32_f32(a_.neon_f32));
1423
+ int16x8_t c = vcombine_s16(b, vmov_n_s16(0));
1424
+ r_.neon_i8 = vmovn_s16(c);
1425
+ #else
1426
+ SIMDE__VECTORIZE
1427
+ for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) {
1428
+ r_.i8[i] = SIMDE_CONVERT_FTOI(int8_t, a_.f32[i]);
1429
+ }
1430
+ /* Note: the upper half is undefined */
1431
+ #endif
1432
+
1433
+ return simde__m64_from_private(r_);
1434
+ #endif
1435
+ }
1436
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1437
+ # define _mm_cvtps_pi8(a) simde_mm_cvtps_pi8((a))
1438
+ #endif
1439
+
1440
+ SIMDE__FUNCTION_ATTRIBUTES
1441
+ simde__m128
1442
+ simde_mm_cvtpu16_ps (simde__m64 a) {
1443
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1444
+ return _mm_cvtpu16_ps(a);
1445
+ #else
1446
+ simde__m128_private r_;
1447
+ simde__m64_private a_ = simde__m64_to_private(a);
1448
+
1449
+ #if defined(SIMDE_SSE_NEON)
1450
+ r_.neon_f32 = vcvtq_f32_u32(vmovl_u16(a_.neon_u16));
1451
+ #elif defined(SIMDE__CONVERT_VECTOR)
1452
+ SIMDE__CONVERT_VECTOR(r_.f32, a_.u16);
1453
+ #else
1454
+ SIMDE__VECTORIZE
1455
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1456
+ r_.f32[i] = (simde_float32) a_.u16[i];
1457
+ }
1458
+ #endif
1459
+
1460
+ return simde__m128_from_private(r_);
1461
+ #endif
1462
+ }
1463
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1464
+ # define _mm_cvtpu16_ps(a) simde_mm_cvtpu16_ps(a)
1465
+ #endif
1466
+
1467
+ SIMDE__FUNCTION_ATTRIBUTES
1468
+ simde__m128
1469
+ simde_mm_cvtpu8_ps (simde__m64 a) {
1470
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1471
+ return _mm_cvtpu8_ps(a);
1472
+ #else
1473
+ simde__m128_private r_;
1474
+ simde__m64_private a_ = simde__m64_to_private(a);
1475
+
1476
+ #if defined(SIMDE_SSE_NEON)
1477
+ r_.neon_f32 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(a_.neon_u8))));
1478
+ #else
1479
+ SIMDE__VECTORIZE
1480
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1481
+ r_.f32[i] = (simde_float32) a_.u8[i];
1482
+ }
1483
+ #endif
1484
+
1485
+ return simde__m128_from_private(r_);
1486
+ #endif
1487
+ }
1488
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1489
+ # define _mm_cvtpu8_ps(a) simde_mm_cvtpu8_ps(a)
1490
+ #endif
1491
+
1492
+ SIMDE__FUNCTION_ATTRIBUTES
1493
+ simde__m128
1494
+ simde_mm_cvtsi32_ss (simde__m128 a, int32_t b) {
1495
+ #if defined(SIMDE_SSE_NATIVE)
1496
+ return _mm_cvtsi32_ss(a, b);
1497
+ #else
1498
+ simde__m128_private r_;
1499
+ simde__m128_private a_ = simde__m128_to_private(a);
1500
+
1501
+ #if defined(SIMDE_SSE_NEON)
1502
+ r_.neon_f32 = vsetq_lane_f32((simde_float32) b, a_.neon_f32, 0);
1503
+ #else
1504
+ r_.f32[0] = (simde_float32) b;
1505
+ SIMDE__VECTORIZE
1506
+ for (size_t i = 1 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1507
+ r_.i32[i] = a_.i32[i];
1508
+ }
1509
+ #endif
1510
+
1511
+ return simde__m128_from_private(r_);
1512
+ #endif
1513
+ }
1514
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1515
+ # define _mm_cvtsi32_ss(a, b) simde_mm_cvtsi32_ss((a), b)
1516
+ #endif
1517
+
1518
+ SIMDE__FUNCTION_ATTRIBUTES
1519
+ simde__m128
1520
+ simde_mm_cvtsi64_ss (simde__m128 a, int64_t b) {
1521
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
1522
+ #if !defined(__PGI)
1523
+ return _mm_cvtsi64_ss(a, b);
1524
+ #else
1525
+ return _mm_cvtsi64x_ss(a, b);
1526
+ #endif
1527
+ #else
1528
+ simde__m128_private r_;
1529
+ simde__m128_private a_ = simde__m128_to_private(a);
1530
+
1531
+ #if defined(SIMDE_SSE_NEON)
1532
+ r_.neon_f32 = vsetq_lane_f32((simde_float32) b, a_.neon_f32, 0);
1533
+ #else
1534
+ r_ = a_;
1535
+ r_.f32[0] = (simde_float32) b;
1536
+ #endif
1537
+
1538
+ return simde__m128_from_private(r_);
1539
+ #endif
1540
+ }
1541
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1542
+ # define _mm_cvtsi64_ss(a, b) simde_mm_cvtsi64_ss((a), b)
1543
+ #endif
1544
+
1545
+ SIMDE__FUNCTION_ATTRIBUTES
1546
+ simde_float32
1547
+ simde_mm_cvtss_f32 (simde__m128 a) {
1548
+ #if defined(SIMDE_SSE_NATIVE)
1549
+ return _mm_cvtss_f32(a);
1550
+ #else
1551
+ simde__m128_private a_ = simde__m128_to_private(a);
1552
+ #if defined(SIMDE_SSE_NEON)
1553
+ return vgetq_lane_f32(a_.neon_f32, 0);
1554
+ #else
1555
+ return a_.f32[0];
1556
+ #endif
1557
+ #endif
1558
+ }
1559
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1560
+ # define _mm_cvtss_f32(a) simde_mm_cvtss_f32((a))
1561
+ #endif
1562
+
1563
+ SIMDE__FUNCTION_ATTRIBUTES
1564
+ int32_t
1565
+ simde_mm_cvtss_si32 (simde__m128 a) {
1566
+ return simde_mm_cvt_ss2si(a);
1567
+ }
1568
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1569
+ # define _mm_cvtss_si32(a) simde_mm_cvtss_si32((a))
1570
+ #endif
1571
+
1572
+ SIMDE__FUNCTION_ATTRIBUTES
1573
+ int64_t
1574
+ simde_mm_cvtss_si64 (simde__m128 a) {
1575
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
1576
+ #if !defined(__PGI)
1577
+ return _mm_cvtss_si64(a);
1578
+ #else
1579
+ return _mm_cvtss_si64x(a);
1580
+ #endif
1581
+ #else
1582
+ simde__m128_private a_ = simde__m128_to_private(a);
1583
+ #if defined(SIMDE_SSE_NEON)
1584
+ return SIMDE_CONVERT_FTOI(int64_t, vgetq_lane_f32(a_.neon_f32, 0));
1585
+ #else
1586
+ return SIMDE_CONVERT_FTOI(int64_t, a_.f32[0]);
1587
+ #endif
1588
+ #endif
1589
+ }
1590
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1591
+ # define _mm_cvtss_si64(a) simde_mm_cvtss_si64((a))
1592
+ #endif
1593
+
1594
+ SIMDE__FUNCTION_ATTRIBUTES
1595
+ simde__m64
1596
+ simde_mm_cvtt_ps2pi (simde__m128 a) {
1597
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1598
+ return _mm_cvtt_ps2pi(a);
1599
+ #else
1600
+ simde__m64_private r_;
1601
+ simde__m128_private a_ = simde__m128_to_private(a);
1602
+
1603
+ #if defined(SIMDE_SSE_NEON)
1604
+ r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32));
1605
+ #elif defined(SIMDE__CONVERT_VECTOR)
1606
+ SIMDE__CONVERT_VECTOR(r_.i32, a_.m64_private[0].f32);
1607
+ #else
1608
+ SIMDE__VECTORIZE
1609
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1610
+ r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f32[i]);
1611
+ }
1612
+ #endif
1613
+
1614
+ return simde__m64_from_private(r_);
1615
+ #endif
1616
+ }
1617
+ #define simde_mm_cvttps_pi32(a) simde_mm_cvtt_ps2pi(a)
1618
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1619
+ # define _mm_cvtt_ps2pi(a) simde_mm_cvtt_ps2pi((a))
1620
+ # define _mm_cvttps_pi32(a) simde_mm_cvttps_pi32((a))
1621
+ #endif
1622
+
1623
+ SIMDE__FUNCTION_ATTRIBUTES
1624
+ int32_t
1625
+ simde_mm_cvtt_ss2si (simde__m128 a) {
1626
+ #if defined(SIMDE_SSE_NATIVE)
1627
+ return _mm_cvtt_ss2si(a);
1628
+ #else
1629
+ simde__m128_private a_ = simde__m128_to_private(a);
1630
+
1631
+ #if defined(SIMDE_SSE_NEON)
1632
+ return SIMDE_CONVERT_FTOI(int32_t, vgetq_lane_f32(a_.neon_f32, 0));
1633
+ #else
1634
+ return SIMDE_CONVERT_FTOI(int32_t, a_.f32[0]);
1635
+ #endif
1636
+ #endif
1637
+ }
1638
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1639
+ # define _mm_cvtt_ss2si(a) simde_mm_cvtt_ss2si((a))
1640
+ # define _mm_cvttss_si32(a) simde_mm_cvttss_si32((a))
1641
+ #endif
1642
+
1643
+ SIMDE__FUNCTION_ATTRIBUTES
1644
+ int64_t
1645
+ simde_mm_cvttss_si64 (simde__m128 a) {
1646
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(_MSC_VER)
1647
+ #if defined(__PGI)
1648
+ return _mm_cvttss_si64x(a);
1649
+ #else
1650
+ return _mm_cvttss_si64(a);
1651
+ #endif
1652
+ #else
1653
+ simde__m128_private a_ = simde__m128_to_private(a);
1654
+
1655
+ #if defined(SIMDE_SSE_NEON)
1656
+ return SIMDE_CONVERT_FTOI(int64_t, vgetq_lane_f32(a_.neon_f32, 0));
1657
+ #else
1658
+ return SIMDE_CONVERT_FTOI(int64_t, a_.f32[0]);
1659
+ #endif
1660
+ #endif
1661
+ }
1662
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1663
+ # define _mm_cvttss_si64(a) simde_mm_cvttss_si64((a))
1664
+ #endif
1665
+
1666
+ SIMDE__FUNCTION_ATTRIBUTES
1667
+ simde__m128
1668
+ simde_mm_cmpord_ss (simde__m128 a, simde__m128 b) {
1669
+ #if defined(SIMDE_SSE_NATIVE)
1670
+ return _mm_cmpord_ss(a, b);
1671
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
1672
+ return simde_mm_move_ss(a, simde_mm_cmpord_ps(a, b));
1673
+ #else
1674
+ simde__m128_private
1675
+ r_,
1676
+ a_ = simde__m128_to_private(a);
1677
+
1678
+ #if defined(simde_isnanf)
1679
+ r_.u32[0] = (simde_isnanf(simde_mm_cvtss_f32(a)) || simde_isnanf(simde_mm_cvtss_f32(b))) ? UINT32_C(0) : ~UINT32_C(0);
1680
+ SIMDE__VECTORIZE
1681
+ for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1682
+ r_.u32[i] = a_.u32[i];
1683
+ }
1684
+ #else
1685
+ HEDLEY_UNREACHABLE();
1686
+ #endif
1687
+
1688
+ return simde__m128_from_private(r_);
1689
+ #endif
1690
+ }
1691
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1692
+ # define _mm_cmpord_ss(a, b) simde_mm_cmpord_ss((a), (b))
1693
+ #endif
1694
+
1695
+ SIMDE__FUNCTION_ATTRIBUTES
1696
+ simde__m128
1697
+ simde_mm_div_ps (simde__m128 a, simde__m128 b) {
1698
+ #if defined(SIMDE_SSE_NATIVE)
1699
+ return _mm_div_ps(a, b);
1700
+ #else
1701
+ simde__m128_private
1702
+ r_,
1703
+ a_ = simde__m128_to_private(a),
1704
+ b_ = simde__m128_to_private(b);
1705
+
1706
+ #if defined(SIMDE_SSE_NEON) && defined(SIMDE_ARCH_AARCH64)
1707
+ r_.neon_f32 = vdivq_f32(a_.neon_f32, b_.neon_f32);
1708
+ #elif defined(SIMDE_SSE_NEON)
1709
+ float32x4_t recip0 = vrecpeq_f32(b_.neon_f32);
1710
+ float32x4_t recip1 = vmulq_f32(recip0, vrecpsq_f32(recip0, b_.neon_f32));
1711
+ r_.neon_f32 = vmulq_f32(a_.neon_f32, recip1);
1712
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
1713
+ r_.wasm_v128 = wasm_f32x4_div(a_.wasm_v128, b_.wasm_v128);
1714
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1715
+ r_.f32 = a_.f32 / b_.f32;
1716
+ #else
1717
+ SIMDE__VECTORIZE
1718
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1719
+ r_.f32[i] = a_.f32[i] / b_.f32[i];
1720
+ }
1721
+ #endif
1722
+
1723
+ return simde__m128_from_private(r_);
1724
+ #endif
1725
+ }
1726
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1727
+ # define _mm_div_ps(a, b) simde_mm_div_ps((a), (b))
1728
+ #endif
1729
+
1730
+ SIMDE__FUNCTION_ATTRIBUTES
1731
+ simde__m128
1732
+ simde_mm_div_ss (simde__m128 a, simde__m128 b) {
1733
+ #if defined(SIMDE_SSE_NATIVE)
1734
+ return _mm_div_ss(a, b);
1735
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
1736
+ return simde_mm_move_ss(a, simde_mm_div_ps(a, b));
1737
+ #else
1738
+ simde__m128_private
1739
+ r_,
1740
+ a_ = simde__m128_to_private(a),
1741
+ b_ = simde__m128_to_private(b);
1742
+
1743
+ r_.f32[0] = a_.f32[0] / b_.f32[0];
1744
+ SIMDE__VECTORIZE
1745
+ for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1746
+ r_.f32[i] = a_.f32[i];
1747
+ }
1748
+
1749
+ return simde__m128_from_private(r_);
1750
+ #endif
1751
+ }
1752
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1753
+ # define _mm_div_ss(a, b) simde_mm_div_ss((a), (b))
1754
+ #endif
1755
+
1756
+ SIMDE__FUNCTION_ATTRIBUTES
1757
+ int16_t
1758
+ simde_mm_extract_pi16 (simde__m64 a, const int imm8)
1759
+ HEDLEY_REQUIRE_MSG((imm8 & 3) == imm8, "imm8 must be in range [0, 3]") {
1760
+ simde__m64_private a_ = simde__m64_to_private(a);
1761
+ return a_.i16[imm8];
1762
+ }
1763
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX) && !defined(HEDLEY_PGI_VERSION)
1764
+ # if HEDLEY_HAS_WARNING("-Wvector-conversion")
1765
+ /* https://bugs.llvm.org/show_bug.cgi?id=44589 */
1766
+ # define simde_mm_extract_pi16(a, imm8) ( \
1767
+ HEDLEY_DIAGNOSTIC_PUSH \
1768
+ _Pragma("clang diagnostic ignored \"-Wvector-conversion\"") \
1769
+ HEDLEY_STATIC_CAST(int16_t, _mm_extract_pi16((a), (imm8))) \
1770
+ HEDLEY_DIAGNOSTIC_POP \
1771
+ )
1772
+ # else
1773
+ # define simde_mm_extract_pi16(a, imm8) ((int16_t) (_mm_extract_pi16(a, imm8)))
1774
+ # endif
1775
+ #elif defined(SIMDE_SSE_NEON)
1776
+ # define simde_mm_extract_pi16(a, imm8) ((int16_t) (vget_lane_s16(simde__m64_to_private(a).neon_i16, imm8)))
1777
+ #endif
1778
+ #define simde_m_pextrw(a, imm8) simde_mm_extract_pi16(a, imm8)
1779
+
1780
/* Rounding-mode constants.
 *
 * With native SSE they mirror the _MM_ROUND_* values.  Otherwise each
 * constant takes the matching <fenv.h> FE_* value when that macro is
 * defined, and falls back to the implicit enumerator value when not. */
enum {
  #if defined(SIMDE_SSE_NATIVE)
    SIMDE_MM_ROUND_NEAREST     = _MM_ROUND_NEAREST,
    SIMDE_MM_ROUND_DOWN        = _MM_ROUND_DOWN,
    SIMDE_MM_ROUND_UP          = _MM_ROUND_UP,
    SIMDE_MM_ROUND_TOWARD_ZERO = _MM_ROUND_TOWARD_ZERO
  #else
    #if defined(FE_TONEAREST)
      SIMDE_MM_ROUND_NEAREST = FE_TONEAREST,
    #else
      SIMDE_MM_ROUND_NEAREST,
    #endif

    #if defined(FE_DOWNWARD)
      SIMDE_MM_ROUND_DOWN = FE_DOWNWARD,
    #else
      SIMDE_MM_ROUND_DOWN,
    #endif

    #if defined(FE_UPWARD)
      SIMDE_MM_ROUND_UP = FE_UPWARD,
    #else
      SIMDE_MM_ROUND_UP,
    #endif

    #if defined(FE_TOWARDZERO)
      SIMDE_MM_ROUND_TOWARD_ZERO = FE_TOWARDZERO
    #else
      SIMDE_MM_ROUND_TOWARD_ZERO
    #endif
  #endif
};
1811
+
1812
+ SIMDE__FUNCTION_ATTRIBUTES
1813
+ unsigned int
1814
+ SIMDE_MM_GET_ROUNDING_MODE(void) {
1815
+ #if defined(SIMDE_SSE_NATIVE)
1816
+ return _MM_GET_ROUNDING_MODE();
1817
+ #elif defined(SIMDE_HAVE_MATH_H)
1818
+ return (unsigned int) fegetround();
1819
+ #else
1820
+ HEDLEY_UNREACHABLE();
1821
+ #endif
1822
+ }
1823
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1824
+ # define _mm_extract_pi16(a, imm8) simde_mm_extract_pi16((a), imm8)
1825
+ #endif
1826
+
1827
+ SIMDE__FUNCTION_ATTRIBUTES
1828
+ void
1829
+ SIMDE_MM_SET_ROUNDING_MODE(unsigned int a) {
1830
+ #if defined(SIMDE_SSE_NATIVE)
1831
+ _MM_SET_ROUNDING_MODE(a);
1832
+ #else
1833
+ fesetround((int) a);
1834
+ #endif
1835
+ }
1836
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1837
+ # define _MM_SET_ROUNDING_MODE(a) SIMDE_MM_SET_ROUNDING_MODE(a)
1838
+ #endif
1839
+
1840
+ SIMDE__FUNCTION_ATTRIBUTES
1841
+ simde__m64
1842
+ simde_mm_insert_pi16 (simde__m64 a, int16_t i, const int imm8)
1843
+ HEDLEY_REQUIRE_MSG((imm8 & 3) == imm8, "imm8 must be in range [0, 3]") {
1844
+ simde__m64_private
1845
+ r_,
1846
+ a_ = simde__m64_to_private(a);
1847
+
1848
+ r_.i64[0] = a_.i64[0];
1849
+ r_.i16[imm8] = i;
1850
+
1851
+ return simde__m64_from_private(r_);
1852
+ }
1853
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX) && !defined(__PGI)
1854
+ # if HEDLEY_HAS_WARNING("-Wvector-conversion")
1855
+ /* https://bugs.llvm.org/show_bug.cgi?id=44589 */
1856
+ # define ssimde_mm_insert_pi16(a, i, imm8) ( \
1857
+ HEDLEY_DIAGNOSTIC_PUSH \
1858
+ _Pragma("clang diagnostic ignored \"-Wvector-conversion\"") \
1859
+ (_mm_insert_pi16((a), (i), (imm8))) \
1860
+ HEDLEY_DIAGNOSTIC_POP \
1861
+ )
1862
+ # else
1863
+ # define simde_mm_insert_pi16(a, i, imm8) _mm_insert_pi16(a, i, imm8)
1864
+ # endif
1865
+ #elif defined(SIMDE_SSE_NEON)
1866
+ # define simde_mm_insert_pi16(a, i, imm8) simde__m64_from_private((simde__m64_private) { .neon_i16 = vset_lane_s16(i, simde__m64_to_private(a).neon_i16, (imm8)) })
1867
+ #endif
1868
+ #define simde_m_pinsrw(a, i, imm8) (simde_mm_insert_pi16(a, i, imm8))
1869
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1870
+ # define _mm_insert_pi16(a, i, imm8) simde_mm_insert_pi16(a, i, imm8)
1871
+ # define _m_pinsrw(a, i, imm8) simde_mm_insert_pi16(a, i, imm8)
1872
+ #endif
1873
+
1874
+ SIMDE__FUNCTION_ATTRIBUTES
1875
+ simde__m128
1876
+ simde_mm_load_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) {
1877
+ simde_assert_aligned(16, mem_addr);
1878
+
1879
+ #if defined(SIMDE_SSE_NATIVE)
1880
+ return _mm_load_ps(mem_addr);
1881
+ #else
1882
+ simde__m128_private r_;
1883
+
1884
+ #if defined(SIMDE_SSE_NEON)
1885
+ r_.neon_f32 = vld1q_f32(mem_addr);
1886
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
1887
+ r_.altivec_f32 = vec_ld(0, mem_addr);
1888
+ #else
1889
+ r_ = *SIMDE_CAST_ALIGN(16, simde__m128_private const*, mem_addr);
1890
+ #endif
1891
+
1892
+ return simde__m128_from_private(r_);
1893
+ #endif
1894
+ }
1895
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1896
+ # define _mm_load_ps(mem_addr) simde_mm_load_ps(mem_addr)
1897
+ #endif
1898
+
1899
+ SIMDE__FUNCTION_ATTRIBUTES
1900
+ simde__m128
1901
+ simde_mm_load_ps1 (simde_float32 const* mem_addr) {
1902
+ #if defined(SIMDE_SSE_NATIVE)
1903
+ return _mm_load_ps1(mem_addr);
1904
+ #else
1905
+ simde__m128_private r_;
1906
+
1907
+ #if defined(SIMDE_SSE_NEON)
1908
+ r_.neon_f32 = vld1q_dup_f32(mem_addr);
1909
+ #else
1910
+ r_ = simde__m128_to_private(simde_mm_set1_ps(*mem_addr));
1911
+ #endif
1912
+
1913
+ return simde__m128_from_private(r_);
1914
+ #endif
1915
+ }
1916
+ #define simde_mm_load1_ps(mem_addr) simde_mm_load_ps1(mem_addr)
1917
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1918
+ # define _mm_load_ps1(mem_addr) simde_mm_load_ps1(mem_addr)
1919
+ # define _mm_load1_ps(mem_addr) simde_mm_load_ps1(mem_addr)
1920
+ #endif
1921
+
1922
+ SIMDE__FUNCTION_ATTRIBUTES
1923
+ simde__m128
1924
+ simde_mm_load_ss (simde_float32 const* mem_addr) {
1925
+ #if defined(SIMDE_SSE_NATIVE)
1926
+ return _mm_load_ss(mem_addr);
1927
+ #else
1928
+ simde__m128_private r_;
1929
+
1930
+ #if defined(SIMDE_SSE_NEON)
1931
+ r_.neon_f32 = vsetq_lane_f32(*mem_addr, vdupq_n_f32(0), 0);
1932
+ #else
1933
+ r_.f32[0] = *mem_addr;
1934
+ r_.i32[1] = 0;
1935
+ r_.i32[2] = 0;
1936
+ r_.i32[3] = 0;
1937
+ #endif
1938
+
1939
+ return simde__m128_from_private(r_);
1940
+ #endif
1941
+ }
1942
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1943
+ # define _mm_load_ss(mem_addr) simde_mm_load_ss(mem_addr)
1944
+ #endif
1945
+
1946
+ SIMDE__FUNCTION_ATTRIBUTES
1947
+ simde__m128
1948
+ simde_mm_loadh_pi (simde__m128 a, simde__m64 const* mem_addr) {
1949
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
1950
+ return _mm_loadh_pi(a, HEDLEY_REINTERPRET_CAST(__m64 const*, mem_addr));
1951
+ #else
1952
+ simde__m128_private
1953
+ r_,
1954
+ a_ = simde__m128_to_private(a);
1955
+
1956
+ #if defined(SIMDE_SSE_NEON)
1957
+ r_.neon_f32 = vcombine_f32(vget_low_f32(a_.neon_f32), vld1_f32(HEDLEY_REINTERPRET_CAST(const float32_t*, mem_addr)));
1958
+ #else
1959
+ simde__m64_private b_ = *HEDLEY_REINTERPRET_CAST(simde__m64_private const*, mem_addr);
1960
+ r_.f32[0] = a_.f32[0];
1961
+ r_.f32[1] = a_.f32[1];
1962
+ r_.f32[2] = b_.f32[0];
1963
+ r_.f32[3] = b_.f32[1];
1964
+ #endif
1965
+
1966
+ return simde__m128_from_private(r_);
1967
+ #endif
1968
+ }
1969
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
1970
+ # define _mm_loadh_pi(a, mem_addr) simde_mm_loadh_pi((a), (simde__m64 const*) (mem_addr))
1971
+ #endif
1972
+
1973
+ /* The SSE documentation says that there are no alignment requirements
1974
+ for mem_addr. Unfortunately they used the __m64 type for the argument
1975
+ which is supposed to be 8-byte aligned, so some compilers (like clang
1976
+ with -Wcast-align) will generate a warning if you try to cast, say,
1977
+ a simde_float32* to a simde__m64* for this function.
1978
+
1979
+ I think the choice of argument type is unfortunate, but I do think we
1980
+ need to stick to it here. If there is demand I can always add something
1981
+ like simde_x_mm_loadl_f32(simde__m128, simde_float32 mem_addr[2]) */
1982
+ SIMDE__FUNCTION_ATTRIBUTES
1983
+ simde__m128
1984
+ simde_mm_loadl_pi (simde__m128 a, simde__m64 const* mem_addr) {
1985
+ #if defined(SIMDE_SSE_NATIVE)
1986
+ return _mm_loadl_pi(a, HEDLEY_REINTERPRET_CAST(__m64 const*, mem_addr));
1987
+ #else
1988
+ simde__m128_private
1989
+ r_,
1990
+ a_ = simde__m128_to_private(a);
1991
+
1992
+ #if defined(SIMDE_SSE_NEON)
1993
+ r_.neon_f32 = vcombine_f32(vld1_f32(HEDLEY_REINTERPRET_CAST(const float32_t*, mem_addr)), vget_high_f32(a_.neon_f32));
1994
+ #else
1995
+ simde__m64_private b_;
1996
+ simde_memcpy(&b_, mem_addr, sizeof(b_));
1997
+ r_.i32[0] = b_.i32[0];
1998
+ r_.i32[1] = b_.i32[1];
1999
+ r_.i32[2] = a_.i32[2];
2000
+ r_.i32[3] = a_.i32[3];
2001
+ #endif
2002
+
2003
+ return simde__m128_from_private(r_);
2004
+ #endif
2005
+ }
2006
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2007
+ # define _mm_loadl_pi(a, mem_addr) simde_mm_loadl_pi((a), (simde__m64 const*) (mem_addr))
2008
+ #endif
2009
+
2010
+ SIMDE__FUNCTION_ATTRIBUTES
2011
+ simde__m128
2012
+ simde_mm_loadr_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) {
2013
+ simde_assert_aligned(16, mem_addr);
2014
+
2015
+ #if defined(SIMDE_SSE_NATIVE)
2016
+ return _mm_loadr_ps(mem_addr);
2017
+ #else
2018
+ simde__m128_private
2019
+ r_,
2020
+ v_ = simde__m128_to_private(simde_mm_load_ps(mem_addr));
2021
+
2022
+ #if defined(SIMDE_SSE_NEON)
2023
+ r_.neon_f32 = vrev64q_f32(v_.neon_f32);
2024
+ r_.neon_f32 = vextq_f32(r_.neon_f32, r_.neon_f32, 2);
2025
+ #elif defined(SIMDE__SHUFFLE_VECTOR)
2026
+ r_.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, v_.f32, v_.f32, 3, 2, 1, 0);
2027
+ #else
2028
+ r_.f32[0] = v_.f32[3];
2029
+ r_.f32[1] = v_.f32[2];
2030
+ r_.f32[2] = v_.f32[1];
2031
+ r_.f32[3] = v_.f32[0];
2032
+ #endif
2033
+
2034
+ return simde__m128_from_private(r_);
2035
+ #endif
2036
+ }
2037
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2038
+ # define _mm_loadr_ps(mem_addr) simde_mm_loadr_ps(mem_addr)
2039
+ #endif
2040
+
2041
+ SIMDE__FUNCTION_ATTRIBUTES
2042
+ simde__m128
2043
+ simde_mm_loadu_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) {
2044
+ #if defined(SIMDE_SSE_NATIVE)
2045
+ return _mm_loadu_ps(mem_addr);
2046
+ #else
2047
+ simde__m128_private r_;
2048
+
2049
+ #if defined(SIMDE_SSE_NEON)
2050
+ r_.neon_f32 = vld1q_f32(HEDLEY_REINTERPRET_CAST(const float32_t*, mem_addr));
2051
+ #else
2052
+ r_.f32[0] = mem_addr[0];
2053
+ r_.f32[1] = mem_addr[1];
2054
+ r_.f32[2] = mem_addr[2];
2055
+ r_.f32[3] = mem_addr[3];
2056
+ #endif
2057
+
2058
+ return simde__m128_from_private(r_);
2059
+ #endif
2060
+ }
2061
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2062
+ # define _mm_loadu_ps(mem_addr) simde_mm_loadu_ps(mem_addr)
2063
+ #endif
2064
+
2065
+ SIMDE__FUNCTION_ATTRIBUTES
2066
+ void
2067
+ simde_mm_maskmove_si64 (simde__m64 a, simde__m64 mask, int8_t* mem_addr) {
2068
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
2069
+ _mm_maskmove_si64(a, mask, HEDLEY_REINTERPRET_CAST(char*, mem_addr));
2070
+ #else
2071
+ simde__m64_private
2072
+ a_ = simde__m64_to_private(a),
2073
+ mask_ = simde__m64_to_private(mask);
2074
+
2075
+ SIMDE__VECTORIZE
2076
+ for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++)
2077
+ if (mask_.i8[i] < 0)
2078
+ mem_addr[i] = a_.i8[i];
2079
+ #endif
2080
+ }
2081
+ #define simde_m_maskmovq(a, mask, mem_addr) simde_mm_maskmove_si64(a, mask, mem_addr)
2082
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2083
+ # define _mm_maskmove_si64(a, mask, mem_addr) simde_mm_maskmove_si64(a, (mask), mem_addr)
2084
+ #endif
2085
+
2086
+ SIMDE__FUNCTION_ATTRIBUTES
2087
+ simde__m64
2088
+ simde_mm_max_pi16 (simde__m64 a, simde__m64 b) {
2089
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
2090
+ return _mm_max_pi16(a, b);
2091
+ #else
2092
+ simde__m64_private
2093
+ r_,
2094
+ a_ = simde__m64_to_private(a),
2095
+ b_ = simde__m64_to_private(b);
2096
+
2097
+ #if defined(SIMDE_SSE_NEON)
2098
+ r_.neon_i16 = vmax_s16(a_.neon_i16, b_.neon_i16);
2099
+ #else
2100
+ SIMDE__VECTORIZE
2101
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
2102
+ r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
2103
+ }
2104
+ #endif
2105
+
2106
+ return simde__m64_from_private(r_);
2107
+ #endif
2108
+ }
2109
+ #define simde_m_pmaxsw(a, b) simde_mm_max_pi16(a, b)
2110
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2111
+ # define _mm_max_pi16(a, b) simde_mm_max_pi16(a, b)
2112
+ # define _m_pmaxsw(a, b) simde_mm_max_pi16(a, b)
2113
+ #endif
2114
+
2115
+ SIMDE__FUNCTION_ATTRIBUTES
2116
+ simde__m128
2117
+ simde_mm_max_ps (simde__m128 a, simde__m128 b) {
2118
+ #if defined(SIMDE_SSE_NATIVE)
2119
+ return _mm_max_ps(a, b);
2120
+ #else
2121
+ simde__m128_private
2122
+ r_,
2123
+ a_ = simde__m128_to_private(a),
2124
+ b_ = simde__m128_to_private(b);
2125
+
2126
+ #if defined(SIMDE_SSE_NEON)
2127
+ r_.neon_f32 = vmaxq_f32(a_.neon_f32, b_.neon_f32);
2128
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
2129
+ r_.altivec_f32 = vec_max(a_.altivec_f32, b_.altivec_f32);
2130
+ #else
2131
+ SIMDE__VECTORIZE
2132
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
2133
+ r_.f32[i] = (a_.f32[i] > b_.f32[i]) ? a_.f32[i] : b_.f32[i];
2134
+ }
2135
+ #endif
2136
+
2137
+ return simde__m128_from_private(r_);
2138
+ #endif
2139
+ }
2140
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2141
+ # define _mm_max_ps(a, b) simde_mm_max_ps((a), (b))
2142
+ #endif
2143
+
2144
+ SIMDE__FUNCTION_ATTRIBUTES
2145
+ simde__m64
2146
+ simde_mm_max_pu8 (simde__m64 a, simde__m64 b) {
2147
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
2148
+ return _mm_max_pu8(a, b);
2149
+ #else
2150
+ simde__m64_private
2151
+ r_,
2152
+ a_ = simde__m64_to_private(a),
2153
+ b_ = simde__m64_to_private(b);
2154
+
2155
+ #if defined(SIMDE_SSE_NEON)
2156
+ r_.neon_u8 = vmax_u8(a_.neon_u8, b_.neon_u8);
2157
+ #else
2158
+ SIMDE__VECTORIZE
2159
+ for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
2160
+ r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
2161
+ }
2162
+ #endif
2163
+
2164
+ return simde__m64_from_private(r_);
2165
+ #endif
2166
+ }
2167
+ #define simde_m_pmaxub(a, b) simde_mm_max_pu8(a, b)
2168
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2169
+ # define _mm_max_pu8(a, b) simde_mm_max_pu8(a, b)
2170
+ # define _m_pmaxub(a, b) simde_mm_max_pu8(a, b)
2171
+ #endif
2172
+
2173
+ SIMDE__FUNCTION_ATTRIBUTES
2174
+ simde__m128
2175
+ simde_mm_max_ss (simde__m128 a, simde__m128 b) {
2176
+ #if defined(SIMDE_SSE_NATIVE)
2177
+ return _mm_max_ss(a, b);
2178
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
2179
+ return simde_mm_move_ss(a, simde_mm_max_ps(a, b));
2180
+ #else
2181
+ simde__m128_private
2182
+ r_,
2183
+ a_ = simde__m128_to_private(a),
2184
+ b_ = simde__m128_to_private(b);
2185
+
2186
+ r_.f32[0] = (a_.f32[0] > b_.f32[0]) ? a_.f32[0] : b_.f32[0];
2187
+ r_.f32[1] = a_.f32[1];
2188
+ r_.f32[2] = a_.f32[2];
2189
+ r_.f32[3] = a_.f32[3];
2190
+
2191
+ return simde__m128_from_private(r_);
2192
+ #endif
2193
+ }
2194
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2195
+ # define _mm_max_ss(a, b) simde_mm_max_ss((a), (b))
2196
+ #endif
2197
+
2198
+ SIMDE__FUNCTION_ATTRIBUTES
2199
+ simde__m64
2200
+ simde_mm_min_pi16 (simde__m64 a, simde__m64 b) {
2201
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
2202
+ return _mm_min_pi16(a, b);
2203
+ #else
2204
+ simde__m64_private
2205
+ r_,
2206
+ a_ = simde__m64_to_private(a),
2207
+ b_ = simde__m64_to_private(b);
2208
+
2209
+ #if defined(SIMDE_SSE_NEON)
2210
+ r_.neon_i16 = vmin_s16(a_.neon_i16, b_.neon_i16);
2211
+ #else
2212
+ SIMDE__VECTORIZE
2213
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
2214
+ r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
2215
+ }
2216
+ #endif
2217
+
2218
+ return simde__m64_from_private(r_);
2219
+ #endif
2220
+ }
2221
+ #define simde_m_pminsw(a, b) simde_mm_min_pi16(a, b)
2222
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2223
+ # define _mm_min_pi16(a, b) simde_mm_min_pi16(a, b)
2224
+ # define _m_pminsw(a, b) simde_mm_min_pi16(a, b)
2225
+ #endif
2226
+
2227
+ SIMDE__FUNCTION_ATTRIBUTES
2228
+ simde__m128
2229
+ simde_mm_min_ps (simde__m128 a, simde__m128 b) {
2230
+ #if defined(SIMDE_SSE_NATIVE)
2231
+ return _mm_min_ps(a, b);
2232
+ #else
2233
+ simde__m128_private
2234
+ r_,
2235
+ a_ = simde__m128_to_private(a),
2236
+ b_ = simde__m128_to_private(b);
2237
+
2238
+ #if defined(SIMDE_SSE_NEON)
2239
+ r_.neon_f32 = vminq_f32(a_.neon_f32, b_.neon_f32);
2240
+ #elif defined(SIMDE_SSE_POWER_ALTIVEC)
2241
+ r_.altivec_f32 = vec_min(a_.altivec_f32, b_.altivec_f32);
2242
+ #else
2243
+ SIMDE__VECTORIZE
2244
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
2245
+ r_.f32[i] = (a_.f32[i] < b_.f32[i]) ? a_.f32[i] : b_.f32[i];
2246
+ }
2247
+ #endif
2248
+
2249
+ return simde__m128_from_private(r_);
2250
+ #endif
2251
+ }
2252
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2253
+ # define _mm_min_ps(a, b) simde_mm_min_ps((a), (b))
2254
+ #endif
2255
+
2256
+ SIMDE__FUNCTION_ATTRIBUTES
2257
+ simde__m64
2258
+ simde_mm_min_pu8 (simde__m64 a, simde__m64 b) {
2259
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
2260
+ return _mm_min_pu8(a, b);
2261
+ #else
2262
+ simde__m64_private
2263
+ r_,
2264
+ a_ = simde__m64_to_private(a),
2265
+ b_ = simde__m64_to_private(b);
2266
+
2267
+ #if defined(SIMDE_SSE_NEON)
2268
+ r_.neon_u8 = vmin_u8(a_.neon_u8, b_.neon_u8);
2269
+ #else
2270
+ SIMDE__VECTORIZE
2271
+ for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
2272
+ r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
2273
+ }
2274
+ #endif
2275
+
2276
+ return simde__m64_from_private(r_);
2277
+ #endif
2278
+ }
2279
+ #define simde_m_pminub(a, b) simde_mm_min_pu8(a, b)
2280
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2281
+ # define _mm_min_pu8(a, b) simde_mm_min_pu8(a, b)
2282
+ # define _m_pminub(a, b) simde_mm_min_pu8(a, b)
2283
+ #endif
2284
+
2285
+ SIMDE__FUNCTION_ATTRIBUTES
2286
+ simde__m128
2287
+ simde_mm_min_ss (simde__m128 a, simde__m128 b) {
2288
+ #if defined(SIMDE_SSE_NATIVE)
2289
+ return _mm_min_ss(a, b);
2290
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
2291
+ return simde_mm_move_ss(a, simde_mm_min_ps(a, b));
2292
+ #else
2293
+ simde__m128_private
2294
+ r_,
2295
+ a_ = simde__m128_to_private(a),
2296
+ b_ = simde__m128_to_private(b);
2297
+
2298
+ r_.f32[0] = (a_.f32[0] < b_.f32[0]) ? a_.f32[0] : b_.f32[0];
2299
+ r_.f32[1] = a_.f32[1];
2300
+ r_.f32[2] = a_.f32[2];
2301
+ r_.f32[3] = a_.f32[3];
2302
+
2303
+ return simde__m128_from_private(r_);
2304
+ #endif
2305
+ }
2306
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2307
+ # define _mm_min_ss(a, b) simde_mm_min_ss((a), (b))
2308
+ #endif
2309
+
2310
+ SIMDE__FUNCTION_ATTRIBUTES
2311
+ simde__m128
2312
+ simde_mm_movehl_ps (simde__m128 a, simde__m128 b) {
2313
+ #if defined(SIMDE_SSE_NATIVE)
2314
+ return _mm_movehl_ps(a, b);
2315
+ #else
2316
+ simde__m128_private
2317
+ r_,
2318
+ a_ = simde__m128_to_private(a),
2319
+ b_ = simde__m128_to_private(b);
2320
+
2321
+ #if defined(SIMDE__SHUFFLE_VECTOR)
2322
+ r_.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a_.f32, b_.f32, 6, 7, 2, 3);
2323
+ #else
2324
+ r_.f32[0] = b_.f32[2];
2325
+ r_.f32[1] = b_.f32[3];
2326
+ r_.f32[2] = a_.f32[2];
2327
+ r_.f32[3] = a_.f32[3];
2328
+ #endif
2329
+
2330
+ return simde__m128_from_private(r_);
2331
+ #endif
2332
+ }
2333
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2334
+ # define _mm_movehl_ps(a, b) simde_mm_movehl_ps((a), (b))
2335
+ #endif
2336
+
2337
+ SIMDE__FUNCTION_ATTRIBUTES
2338
+ simde__m128
2339
+ simde_mm_movelh_ps (simde__m128 a, simde__m128 b) {
2340
+ #if defined(SIMDE_SSE_NATIVE)
2341
+ return _mm_movelh_ps(a, b);
2342
+ #else
2343
+ simde__m128_private
2344
+ r_,
2345
+ a_ = simde__m128_to_private(a),
2346
+ b_ = simde__m128_to_private(b);
2347
+
2348
+ #if defined(SIMDE__SHUFFLE_VECTOR)
2349
+ r_.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a_.f32, b_.f32, 0, 1, 4, 5);
2350
+ #else
2351
+ r_.f32[0] = a_.f32[0];
2352
+ r_.f32[1] = a_.f32[1];
2353
+ r_.f32[2] = b_.f32[0];
2354
+ r_.f32[3] = b_.f32[1];
2355
+ #endif
2356
+
2357
+ return simde__m128_from_private(r_);
2358
+ #endif
2359
+ }
2360
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2361
+ # define _mm_movelh_ps(a, b) simde_mm_movelh_ps((a), (b))
2362
+ #endif
2363
+
2364
+ SIMDE__FUNCTION_ATTRIBUTES
2365
+ int
2366
+ simde_mm_movemask_pi8 (simde__m64 a) {
2367
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
2368
+ return _mm_movemask_pi8(a);
2369
+ #else
2370
+ simde__m64_private a_ = simde__m64_to_private(a);
2371
+ int r = 0;
2372
+ const size_t nmemb = sizeof(a_.i8) / sizeof(a_.i8[0]);
2373
+
2374
+ SIMDE__VECTORIZE_REDUCTION(|:r)
2375
+ for (size_t i = 0 ; i < nmemb ; i++) {
2376
+ r |= (a_.u8[nmemb - 1 - i] >> 7) << (nmemb - 1 - i);
2377
+ }
2378
+
2379
+ return r;
2380
+ #endif
2381
+ }
2382
+ #define simde_m_pmovmskb(a, b) simde_mm_movemask_pi8(a, b)
2383
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2384
+ # define _mm_movemask_pi8(a) simde_mm_movemask_pi8(a)
2385
+ #endif
2386
+
2387
+ SIMDE__FUNCTION_ATTRIBUTES
2388
+ int
2389
+ simde_mm_movemask_ps (simde__m128 a) {
2390
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
2391
+ return _mm_movemask_ps(a);
2392
+ #else
2393
+ int r = 0;
2394
+ simde__m128_private a_ = simde__m128_to_private(a);
2395
+
2396
+ #if defined(SIMDE_SSE_NEON)
2397
+ /* TODO: check to see if NEON version is faster than the portable version */
2398
+ static const uint32x4_t movemask = { 1, 2, 4, 8 };
2399
+ static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
2400
+ uint32x4_t t0 = a_.neon_u32;
2401
+ uint32x4_t t1 = vtstq_u32(t0, highbit);
2402
+ uint32x4_t t2 = vandq_u32(t1, movemask);
2403
+ uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
2404
+ r = vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
2405
+ #else
2406
+ SIMDE__VECTORIZE_REDUCTION(|:r)
2407
+ for (size_t i = 0 ; i < sizeof(a_.u32) / sizeof(a_.u32[0]) ; i++) {
2408
+ r |= (a_.u32[i] >> ((sizeof(a_.u32[i]) * CHAR_BIT) - 1)) << i;
2409
+ }
2410
+ #endif
2411
+
2412
+ return r;
2413
+ #endif
2414
+ }
2415
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2416
+ # define _mm_movemask_ps(a) simde_mm_movemask_ps((a))
2417
+ #endif
2418
+
2419
+ SIMDE__FUNCTION_ATTRIBUTES
2420
+ simde__m128
2421
+ simde_mm_mul_ps (simde__m128 a, simde__m128 b) {
2422
+ #if defined(SIMDE_SSE_NATIVE)
2423
+ return _mm_mul_ps(a, b);
2424
+ #else
2425
+ simde__m128_private
2426
+ r_,
2427
+ a_ = simde__m128_to_private(a),
2428
+ b_ = simde__m128_to_private(b);
2429
+
2430
+ #if defined(SIMDE_SSE_NEON)
2431
+ r_.neon_f32 = vmulq_f32(a_.neon_f32, b_.neon_f32);
2432
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
2433
+ r_.wasm_v128 = wasm_f32x4_mul(a_.wasm_v128, b_.wasm_v128);
2434
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2435
+ r_.f32 = a_.f32 * b_.f32;
2436
+ #else
2437
+ SIMDE__VECTORIZE
2438
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
2439
+ r_.f32[i] = a_.f32[i] * b_.f32[i];
2440
+ }
2441
+ #endif
2442
+
2443
+ return simde__m128_from_private(r_);
2444
+ #endif
2445
+ }
2446
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2447
+ # define _mm_mul_ps(a, b) simde_mm_mul_ps((a), (b))
2448
+ #endif
2449
+
2450
+ SIMDE__FUNCTION_ATTRIBUTES
2451
+ simde__m128
2452
+ simde_mm_mul_ss (simde__m128 a, simde__m128 b) {
2453
+ #if defined(SIMDE_SSE_NATIVE)
2454
+ return _mm_mul_ss(a, b);
2455
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
2456
+ return simde_mm_move_ss(a, simde_mm_mul_ps(a, b));
2457
+ #else
2458
+ simde__m128_private
2459
+ r_,
2460
+ a_ = simde__m128_to_private(a),
2461
+ b_ = simde__m128_to_private(b);
2462
+
2463
+ r_.f32[0] = a_.f32[0] * b_.f32[0];
2464
+ r_.f32[1] = a_.f32[1];
2465
+ r_.f32[2] = a_.f32[2];
2466
+ r_.f32[3] = a_.f32[3];
2467
+
2468
+ return simde__m128_from_private(r_);
2469
+ #endif
2470
+ }
2471
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2472
+ # define _mm_mul_ss(a, b) simde_mm_mul_ss((a), (b))
2473
+ #endif
2474
+
2475
+ SIMDE__FUNCTION_ATTRIBUTES
2476
+ simde__m64
2477
+ simde_mm_mulhi_pu16 (simde__m64 a, simde__m64 b) {
2478
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
2479
+ return _mm_mulhi_pu16(a, b);
2480
+ #else
2481
+ simde__m64_private
2482
+ r_,
2483
+ a_ = simde__m64_to_private(a),
2484
+ b_ = simde__m64_to_private(b);
2485
+
2486
+ SIMDE__VECTORIZE
2487
+ for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
2488
+ r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, ((HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i])) >> UINT32_C(16)));
2489
+ }
2490
+
2491
+ return simde__m64_from_private(r_);
2492
+ #endif
2493
+ }
2494
+ #define simde_m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b)
2495
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2496
+ # define _mm_mulhi_pu16(a, b) simde_mm_mulhi_pu16(a, b)
2497
+ #endif
2498
+
2499
+ SIMDE__FUNCTION_ATTRIBUTES
2500
+ simde__m128
2501
+ simde_mm_or_ps (simde__m128 a, simde__m128 b) {
2502
+ #if defined(SIMDE_SSE_NATIVE)
2503
+ return _mm_or_ps(a, b);
2504
+ #else
2505
+ simde__m128_private
2506
+ r_,
2507
+ a_ = simde__m128_to_private(a),
2508
+ b_ = simde__m128_to_private(b);
2509
+
2510
+ #if defined(SIMDE_SSE_NEON)
2511
+ r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32);
2512
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2513
+ r_.i32f = a_.i32f | b_.i32f;
2514
+ #else
2515
+ SIMDE__VECTORIZE
2516
+ for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
2517
+ r_.u32[i] = a_.u32[i] | b_.u32[i];
2518
+ }
2519
+ #endif
2520
+
2521
+ return simde__m128_from_private(r_);
2522
+ #endif
2523
+ }
2524
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2525
+ # define _mm_or_ps(a, b) simde_mm_or_ps((a), (b))
2526
+ #endif
2527
+
2528
+ SIMDE__FUNCTION_ATTRIBUTES
2529
+ void
2530
+ simde_mm_prefetch (char const* p, int i) {
2531
+ (void) p;
2532
+ (void) i;
2533
+ }
2534
+ #if defined(SIMDE_SSE_NATIVE)
2535
+ # define simde_mm_prefetch(p, i) _mm_prefetch(p, i)
2536
+ #endif
2537
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2538
+ # define _mm_prefetch(p, i) simde_mm_prefetch(p, i)
2539
+ #endif
2540
+
2541
+ SIMDE__FUNCTION_ATTRIBUTES
2542
+ simde__m128
2543
+ simde_mm_rcp_ps (simde__m128 a) {
2544
+ #if defined(SIMDE_SSE_NATIVE)
2545
+ return _mm_rcp_ps(a);
2546
+ #else
2547
+ simde__m128_private
2548
+ r_,
2549
+ a_ = simde__m128_to_private(a);
2550
+
2551
+ #if defined(SIMDE_SSE_NEON)
2552
+ float32x4_t recip = vrecpeq_f32(a_.neon_f32);
2553
+
2554
+ # if !defined(SIMDE_MM_RCP_PS_ITERS)
2555
+ # define SIMDE_MM_RCP_PS_ITERS SIMDE_ACCURACY_ITERS
2556
+ # endif
2557
+
2558
+ for (int i = 0; i < SIMDE_MM_RCP_PS_ITERS ; ++i) {
2559
+ recip = vmulq_f32(recip, vrecpsq_f32(recip, a_.neon_f32));
2560
+ }
2561
+
2562
+ r_.neon_f32 = recip;
2563
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
2564
+ r_.f32 = 1.0f / a_.f32;
2565
+ #else
2566
+ SIMDE__VECTORIZE
2567
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
2568
+ r_.f32[i] = 1.0f / a_.f32[i];
2569
+ }
2570
+ #endif
2571
+
2572
+ return simde__m128_from_private(r_);
2573
+ #endif
2574
+ }
2575
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2576
+ # define _mm_rcp_ps(a) simde_mm_rcp_ps((a))
2577
+ #endif
2578
+
2579
+ SIMDE__FUNCTION_ATTRIBUTES
2580
+ simde__m128
2581
+ simde_mm_rcp_ss (simde__m128 a) {
2582
+ #if defined(SIMDE_SSE_NATIVE)
2583
+ return _mm_rcp_ss(a);
2584
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
2585
+ return simde_mm_move_ss(a, simde_mm_rcp_ps(a));
2586
+ #else
2587
+ simde__m128_private
2588
+ r_,
2589
+ a_ = simde__m128_to_private(a);
2590
+
2591
+ r_.f32[0] = 1.0f / a_.f32[0];
2592
+ r_.f32[1] = a_.f32[1];
2593
+ r_.f32[2] = a_.f32[2];
2594
+ r_.f32[3] = a_.f32[3];
2595
+
2596
+ return simde__m128_from_private(r_);
2597
+ #endif
2598
+ }
2599
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2600
+ # define _mm_rcp_ss(a) simde_mm_rcp_ss((a))
2601
+ #endif
2602
+
2603
+ SIMDE__FUNCTION_ATTRIBUTES
2604
+ simde__m128
2605
+ simde_mm_rsqrt_ps (simde__m128 a) {
2606
+ #if defined(SIMDE_SSE_NATIVE)
2607
+ return _mm_rsqrt_ps(a);
2608
+ #else
2609
+ simde__m128_private
2610
+ r_,
2611
+ a_ = simde__m128_to_private(a);
2612
+
2613
+ #if defined(SIMDE_SSE_NEON)
2614
+ r_.neon_f32 = vrsqrteq_f32(a_.neon_f32);
2615
+ #elif defined(__STDC_IEC_559__)
2616
+ /* http://h14s.p5r.org/2012/09/0x5f3759df.html?mwh=1 */
2617
+ SIMDE__VECTORIZE
2618
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
2619
+ r_.i32[i] = INT32_C(0x5f3759df) - (a_.i32[i] >> 1);
2620
+
2621
+ #if SIMDE_ACCURACY_ITERS > 2
2622
+ const float half = SIMDE_FLOAT32_C(0.5) * a_.f32[i];
2623
+ for (int ai = 2 ; ai < SIMDE_ACCURACY_ITERS ; ai++)
2624
+ r_.f32[i] *= SIMDE_FLOAT32_C(1.5) - (half * r_.f32[i] * r_.f32[i]);
2625
+ #endif
2626
+ }
2627
+ #elif defined(SIMDE_HAVE_MATH_H)
2628
+ SIMDE__VECTORIZE
2629
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
2630
+ r_.f32[i] = 1.0f / sqrtf(a_.f32[i]);
2631
+ }
2632
+ #else
2633
+ HEDLEY_UNREACHABLE();
2634
+ #endif
2635
+
2636
+ return simde__m128_from_private(r_);
2637
+ #endif
2638
+ }
2639
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2640
+ # define _mm_rsqrt_ps(a) simde_mm_rsqrt_ps((a))
2641
+ #endif
2642
+
2643
+ SIMDE__FUNCTION_ATTRIBUTES
2644
+ simde__m128
2645
+ simde_mm_rsqrt_ss (simde__m128 a) {
2646
+ #if defined(SIMDE_SSE_NATIVE)
2647
+ return _mm_rsqrt_ss(a);
2648
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
2649
+ return simde_mm_move_ss(a, simde_mm_rsqrt_ps(a));
2650
+ #else
2651
+ simde__m128_private
2652
+ r_,
2653
+ a_ = simde__m128_to_private(a);
2654
+
2655
+ #if defined(__STDC_IEC_559__)
2656
+ {
2657
+ r_.i32[0] = INT32_C(0x5f3759df) - (a_.i32[0] >> 1);
2658
+
2659
+ #if SIMDE_ACCURACY_ITERS > 2
2660
+ float half = SIMDE_FLOAT32_C(0.5) * a_.f32[0];
2661
+ for (int ai = 2 ; ai < SIMDE_ACCURACY_ITERS ; ai++)
2662
+ r_.f32[0] *= SIMDE_FLOAT32_C(1.5) - (half * r_.f32[0] * r_.f32[0]);
2663
+ #endif
2664
+ }
2665
+ r_.f32[0] = 1.0f / sqrtf(a_.f32[0]);
2666
+ r_.f32[1] = a_.f32[1];
2667
+ r_.f32[2] = a_.f32[2];
2668
+ r_.f32[3] = a_.f32[3];
2669
+ #elif defined(SIMDE_HAVE_MATH_H)
2670
+ r_.f32[0] = 1.0f / sqrtf(a_.f32[0]);
2671
+ r_.f32[1] = a_.f32[1];
2672
+ r_.f32[2] = a_.f32[2];
2673
+ r_.f32[3] = a_.f32[3];
2674
+ #else
2675
+ HEDLEY_UNREACHABLE();
2676
+ #endif
2677
+
2678
+ return simde__m128_from_private(r_);
2679
+ #endif
2680
+ }
2681
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2682
+ # define _mm_rsqrt_ss(a) simde_mm_rsqrt_ss((a))
2683
+ #endif
2684
+
2685
+ SIMDE__FUNCTION_ATTRIBUTES
2686
+ simde__m64
2687
+ simde_mm_sad_pu8 (simde__m64 a, simde__m64 b) {
2688
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
2689
+ return _mm_sad_pu8(a, b);
2690
+ #else
2691
+ simde__m64_private
2692
+ r_,
2693
+ a_ = simde__m64_to_private(a),
2694
+ b_ = simde__m64_to_private(b);
2695
+ uint16_t sum = 0;
2696
+
2697
+ #if defined(SIMDE_HAVE_STDLIB_H)
2698
+ SIMDE__VECTORIZE_REDUCTION(+:sum)
2699
+ for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
2700
+ sum += (uint8_t) abs(a_.u8[i] - b_.u8[i]);
2701
+ }
2702
+
2703
+ r_.i16[0] = (int16_t) sum;
2704
+ r_.i16[1] = 0;
2705
+ r_.i16[2] = 0;
2706
+ r_.i16[3] = 0;
2707
+ #else
2708
+ HEDLEY_UNREACHABLE();
2709
+ #endif
2710
+
2711
+ return simde__m64_from_private(r_);
2712
+ #endif
2713
+ }
2714
+ #define simde_m_psadbw(a, b) simde_mm_sad_pu8(a, b)
2715
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2716
+ # define _mm_sad_pu8(a, b) simde_mm_sad_pu8(a, b)
2717
+ # define _m_psadbw(a, b) simde_mm_sad_pu8(a, b)
2718
+ #endif
2719
+
2720
+ SIMDE__FUNCTION_ATTRIBUTES
2721
+ simde__m128
2722
+ simde_mm_set_ss (simde_float32 a) {
2723
+ #if defined(SIMDE_SSE_NATIVE)
2724
+ return _mm_set_ss(a);
2725
+ #elif defined(SIMDE_SSE_NEON)
2726
+ return vsetq_lane_f32(a, vdupq_n_f32(SIMDE_FLOAT32_C(0.0)), 0);
2727
+ #else
2728
+ return simde_mm_set_ps(SIMDE_FLOAT32_C(0.0), SIMDE_FLOAT32_C(0.0), SIMDE_FLOAT32_C(0.0), a);
2729
+ #endif
2730
+ }
2731
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2732
+ # define _mm_set_ss(a) simde_mm_set_ss(a)
2733
+ #endif
2734
+
2735
+ SIMDE__FUNCTION_ATTRIBUTES
2736
+ simde__m128
2737
+ simde_mm_setr_ps (simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) {
2738
+ #if defined(SIMDE_SSE_NATIVE)
2739
+ return _mm_setr_ps(e3, e2, e1, e0);
2740
+ #elif defined(SIMDE_SSE_NEON)
2741
+ SIMDE_ALIGN(16) simde_float32 data[4] = { e3, e2, e1, e0 };
2742
+ return vld1q_f32(data);
2743
+ #else
2744
+ return simde_mm_set_ps(e0, e1, e2, e3);
2745
+ #endif
2746
+ }
2747
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2748
+ # define _mm_setr_ps(e3, e2, e1, e0) simde_mm_setr_ps(e3, e2, e1, e0)
2749
+ #endif
2750
+
2751
+ SIMDE__FUNCTION_ATTRIBUTES
2752
+ simde__m128
2753
+ simde_mm_setzero_ps (void) {
2754
+ #if defined(SIMDE_SSE_NATIVE)
2755
+ return _mm_setzero_ps();
2756
+ #elif defined(SIMDE_SSE_NEON)
2757
+ return vdupq_n_f32(SIMDE_FLOAT32_C(0.0));
2758
+ #else
2759
+ simde__m128 r;
2760
+ simde_memset(&r, 0, sizeof(r));
2761
+ return r;
2762
+ #endif
2763
+ }
2764
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2765
+ # define _mm_setzero_ps() simde_mm_setzero_ps()
2766
+ #endif
2767
+
2768
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
2769
+ HEDLEY_DIAGNOSTIC_PUSH
2770
+ SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
2771
+ #endif
2772
+
2773
+ SIMDE__FUNCTION_ATTRIBUTES
2774
+ simde__m128
2775
+ simde_mm_undefined_ps (void) {
2776
+ simde__m128_private r_;
2777
+
2778
+ #if defined(SIMDE__HAVE_UNDEFINED128)
2779
+ r_.n = _mm_undefined_ps();
2780
+ #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
2781
+ r_ = simde__m128_to_private(simde_mm_setzero_ps());
2782
+ #endif
2783
+
2784
+ return simde__m128_from_private(r_);
2785
+ }
2786
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2787
+ # define _mm_undefined_ps() simde_mm_undefined_ps()
2788
+ #endif
2789
+
2790
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
2791
+ HEDLEY_DIAGNOSTIC_POP
2792
+ #endif
2793
+
2794
+ SIMDE__FUNCTION_ATTRIBUTES
2795
+ simde__m128
2796
+ simde_mm_setone_ps (void) {
2797
+ simde__m128 t = simde_mm_setzero_ps();
2798
+ return simde_mm_cmpeq_ps(t, t);
2799
+ }
2800
+
2801
+ SIMDE__FUNCTION_ATTRIBUTES
2802
+ void
2803
+ simde_mm_sfence (void) {
2804
+ /* TODO: Use Hedley. */
2805
+ #if defined(SIMDE_SSE_NATIVE)
2806
+ _mm_sfence();
2807
+ #elif defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
2808
+ __atomic_thread_fence(__ATOMIC_SEQ_CST);
2809
+ #elif !defined(__INTEL_COMPILER) && defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__)
2810
+ # if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ < 9)
2811
+ __atomic_thread_fence(__ATOMIC_SEQ_CST);
2812
+ # else
2813
+ atomic_thread_fence(memory_order_seq_cst);
2814
+ # endif
2815
+ #elif defined(_MSC_VER)
2816
+ MemoryBarrier();
2817
+ #elif HEDLEY_HAS_EXTENSION(c_atomic)
2818
+ __c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
2819
+ #elif defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
2820
+ __sync_synchronize();
2821
+ #elif defined(_OPENMP)
2822
+ # pragma omp critical(simde_mm_sfence_)
2823
+ { }
2824
+ #endif
2825
+ }
2826
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2827
+ # define _mm_sfence() simde_mm_sfence()
2828
+ #endif
2829
+
2830
/* Pack four 2-bit selectors into one 8-bit shuffle immediate:
 * bits 7:6 = z, 5:4 = y, 3:2 = x, 1:0 = w. */
#define SIMDE_MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
#ifdef SIMDE_SSE_ENABLE_NATIVE_ALIASES
#  define _MM_SHUFFLE(z, y, x, w) SIMDE_MM_SHUFFLE(z, y, x, w)
#endif
2834
+
2835
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX) && !defined(__PGI)
2836
+ # define simde_mm_shuffle_pi16(a, imm8) _mm_shuffle_pi16(a, imm8)
2837
+ #elif defined(SIMDE__SHUFFLE_VECTOR)
2838
+ # define simde_mm_shuffle_pi16(a, imm8) (__extension__ ({ \
2839
+ const simde__m64_private simde__tmp_a_ = simde__m64_to_private(a); \
2840
+ simde__m64_from_private((simde__m64_private) { .i16 = \
2841
+ SIMDE__SHUFFLE_VECTOR(16, 8, \
2842
+ (simde__tmp_a_).i16, \
2843
+ (simde__tmp_a_).i16, \
2844
+ (((imm8) ) & 3), \
2845
+ (((imm8) >> 2) & 3), \
2846
+ (((imm8) >> 4) & 3), \
2847
+ (((imm8) >> 6) & 3)) }); }))
2848
+ #else
2849
+ SIMDE__FUNCTION_ATTRIBUTES
2850
+ simde__m64
2851
+ simde_mm_shuffle_pi16 (simde__m64 a, const int imm8)
2852
+ HEDLEY_REQUIRE_MSG((imm8 & 0xff) == imm8, "imm8 must be in range [0, 255]") {
2853
+ simde__m64_private r_;
2854
+ simde__m64_private a_ = simde__m64_to_private(a);
2855
+
2856
+ for (size_t i = 0 ; i < sizeof(r_.i16) / sizeof(r_.i16[0]) ; i++) {
2857
+ r_.i16[i] = a_.i16[(imm8 >> (i * 2)) & 3];
2858
+ }
2859
+
2860
+ HEDLEY_DIAGNOSTIC_PUSH
2861
+ #if HEDLEY_HAS_WARNING("-Wconditional-uninitialized")
2862
+ # pragma clang diagnostic ignored "-Wconditional-uninitialized"
2863
+ #endif
2864
+ return simde__m64_from_private(r_);
2865
+ HEDLEY_DIAGNOSTIC_POP
2866
+ }
2867
+ #endif
2868
+ #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
2869
+ # define simde_m_pshufw(a, imm8) _m_pshufw(a, imm8)
2870
+ #else
2871
+ # define simde_m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8)
2872
+ #endif
2873
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2874
+ # define _mm_shuffle_pi16(a, imm8) simde_mm_shuffle_pi16(a, imm8)
2875
+ # define _m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8)
2876
+ #endif
2877
+
2878
+ #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
2879
+ # define simde_mm_shuffle_ps(a, b, imm8) _mm_shuffle_ps(a, b, imm8)
2880
+ #elif defined(SIMDE__SHUFFLE_VECTOR)
2881
+ # define simde_mm_shuffle_ps(a, b, imm8) (__extension__ ({ \
2882
+ simde__m128_from_private((simde__m128_private) { .f32 = \
2883
+ SIMDE__SHUFFLE_VECTOR(32, 16, \
2884
+ simde__m128_to_private(a).f32, \
2885
+ simde__m128_to_private(b).f32, \
2886
+ (((imm8) ) & 3), \
2887
+ (((imm8) >> 2) & 3), \
2888
+ (((imm8) >> 4) & 3) + 4, \
2889
+ (((imm8) >> 6) & 3) + 4) }); }))
2890
+ #else
2891
+ SIMDE__FUNCTION_ATTRIBUTES
2892
+ simde__m128
2893
+ simde_mm_shuffle_ps (simde__m128 a, simde__m128 b, const int imm8)
2894
+ HEDLEY_REQUIRE_MSG((imm8 & 0xff) == imm8, "imm8 must be in range [0, 255]") {
2895
+ simde__m128_private
2896
+ r_,
2897
+ a_ = simde__m128_to_private(a),
2898
+ b_ = simde__m128_to_private(b);
2899
+
2900
+ r_.f32[0] = a_.f32[(imm8 >> 0) & 3];
2901
+ r_.f32[1] = a_.f32[(imm8 >> 2) & 3];
2902
+ r_.f32[2] = b_.f32[(imm8 >> 4) & 3];
2903
+ r_.f32[3] = b_.f32[(imm8 >> 6) & 3];
2904
+
2905
+ return simde__m128_from_private(r_);
2906
+ }
2907
+ #endif
2908
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2909
+ # define _mm_shuffle_ps(a, b, imm8) simde_mm_shuffle_ps((a), (b), imm8)
2910
+ #endif
2911
+
2912
+ SIMDE__FUNCTION_ATTRIBUTES
2913
+ simde__m128
2914
+ simde_mm_sqrt_ps (simde__m128 a) {
2915
+ #if defined(SIMDE_SSE_NATIVE)
2916
+ return _mm_sqrt_ps(a);
2917
+ #else
2918
+ simde__m128_private
2919
+ r_,
2920
+ a_ = simde__m128_to_private(a);
2921
+
2922
+ #if defined(SIMDE_SSE_NEON)
2923
+ float32x4_t recipsq = vrsqrteq_f32(a_.neon_f32);
2924
+ float32x4_t sq = vrecpeq_f32(recipsq);
2925
+ /* ??? use step versions of both sqrt and recip for better accuracy? */
2926
+ r_.neon_f32 = sq;
2927
+ #elif defined(SIMDE_HAVE_MATH_H)
2928
+ SIMDE__VECTORIZE
2929
+ for (size_t i = 0 ; i < sizeof(r_.f32) / sizeof(r_.f32[0]) ; i++) {
2930
+ r_.f32[i] = sqrtf(a_.f32[i]);
2931
+ }
2932
+ #else
2933
+ HEDLEY_UNREACHABLE();
2934
+ #endif
2935
+
2936
+ return simde__m128_from_private(r_);
2937
+ #endif
2938
+ }
2939
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2940
+ # define _mm_sqrt_ps(a) simde_mm_sqrt_ps((a))
2941
+ #endif
2942
+
2943
+ SIMDE__FUNCTION_ATTRIBUTES
2944
+ simde__m128
2945
+ simde_mm_sqrt_ss (simde__m128 a) {
2946
+ #if defined(SIMDE_SSE_NATIVE)
2947
+ return _mm_sqrt_ss(a);
2948
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
2949
+ return simde_mm_move_ss(a, simde_mm_sqrt_ps(a));
2950
+ #else
2951
+ simde__m128_private
2952
+ r_,
2953
+ a_ = simde__m128_to_private(a);
2954
+
2955
+ #if defined(SIMDE_HAVE_MATH_H)
2956
+ r_.f32[0] = sqrtf(a_.f32[0]);
2957
+ r_.f32[1] = a_.f32[1];
2958
+ r_.f32[2] = a_.f32[2];
2959
+ r_.f32[3] = a_.f32[3];
2960
+ #else
2961
+ HEDLEY_UNREACHABLE();
2962
+ #endif
2963
+
2964
+ return simde__m128_from_private(r_);
2965
+ #endif
2966
+ }
2967
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2968
+ # define _mm_sqrt_ss(a) simde_mm_sqrt_ss((a))
2969
+ #endif
2970
+
2971
+ SIMDE__FUNCTION_ATTRIBUTES
2972
+ void
2973
+ simde_mm_store_ps (simde_float32 mem_addr[4], simde__m128 a) {
2974
+ simde_assert_aligned(16, mem_addr);
2975
+
2976
+ #if defined(SIMDE_SSE_NATIVE)
2977
+ _mm_store_ps(mem_addr, a);
2978
+ #else
2979
+ simde__m128_private a_ = simde__m128_to_private(a);
2980
+
2981
+ #if defined(SIMDE_SSE_NEON)
2982
+ vst1q_f32(mem_addr, a_.neon_f32);
2983
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
2984
+ wasm_v128_store(mem_addr, a_.wasm_v128);
2985
+ #else
2986
+ SIMDE__VECTORIZE_ALIGNED(mem_addr:16)
2987
+ for (size_t i = 0 ; i < sizeof(a_.f32) / sizeof(a_.f32[0]) ; i++) {
2988
+ mem_addr[i] = a_.f32[i];
2989
+ }
2990
+ #endif
2991
+ #endif
2992
+ }
2993
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
2994
+ # define _mm_store_ps(mem_addr, a) simde_mm_store_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), (a))
2995
+ #endif
2996
+
2997
+ SIMDE__FUNCTION_ATTRIBUTES
2998
+ void
2999
+ simde_mm_store_ps1 (simde_float32 mem_addr[4], simde__m128 a) {
3000
+ simde_assert_aligned(16, mem_addr);
3001
+
3002
+ #if defined(SIMDE_SSE_NATIVE)
3003
+ _mm_store_ps1(mem_addr, a);
3004
+ #else
3005
+ simde__m128_private a_ = simde__m128_to_private(a);
3006
+
3007
+ SIMDE__VECTORIZE_ALIGNED(mem_addr:16)
3008
+ for (size_t i = 0 ; i < sizeof(a_.f32) / sizeof(a_.f32[0]) ; i++) {
3009
+ mem_addr[i] = a_.f32[0];
3010
+ }
3011
+ #endif
3012
+ }
3013
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3014
+ # define _mm_store_ps1(mem_addr, a) simde_mm_store_ps1(HEDLEY_REINTERPRET_CAST(float*, mem_addr), (a))
3015
+ #endif
3016
+
3017
+ SIMDE__FUNCTION_ATTRIBUTES
3018
+ void
3019
+ simde_mm_store_ss (simde_float32* mem_addr, simde__m128 a) {
3020
+ #if defined(SIMDE_SSE_NATIVE)
3021
+ _mm_store_ss(mem_addr, a);
3022
+ #else
3023
+ simde__m128_private a_ = simde__m128_to_private(a);
3024
+
3025
+ #if defined(SIMDE_SSE_NEON)
3026
+ vst1q_lane_f32(mem_addr, a_.neon_f32, 0);
3027
+ #else
3028
+ *mem_addr = a_.f32[0];
3029
+ #endif
3030
+ #endif
3031
+ }
3032
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3033
+ # define _mm_store_ss(mem_addr, a) simde_mm_store_ss(HEDLEY_REINTERPRET_CAST(float*, mem_addr), (a))
3034
+ #endif
3035
+
3036
+ SIMDE__FUNCTION_ATTRIBUTES
3037
+ void
3038
+ simde_mm_store1_ps (simde_float32 mem_addr[4], simde__m128 a) {
3039
+ simde_assert_aligned(16, mem_addr);
3040
+
3041
+ #if defined(SIMDE_SSE_NATIVE)
3042
+ _mm_store1_ps(mem_addr, a);
3043
+ #else
3044
+ simde_mm_store_ps1(mem_addr, a);
3045
+ #endif
3046
+ }
3047
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3048
+ # define _mm_store1_ps(mem_addr, a) simde_mm_store1_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), (a))
3049
+ #endif
3050
+
3051
+ SIMDE__FUNCTION_ATTRIBUTES
3052
+ void
3053
+ simde_mm_storeh_pi (simde__m64* mem_addr, simde__m128 a) {
3054
+ #if defined(SIMDE_SSE_NATIVE)
3055
+ _mm_storeh_pi(HEDLEY_REINTERPRET_CAST(__m64*, mem_addr), a);
3056
+ #else
3057
+ simde__m64_private* dest_ = HEDLEY_REINTERPRET_CAST(simde__m64_private*, mem_addr);
3058
+ simde__m128_private a_ = simde__m128_to_private(a);
3059
+
3060
+ dest_->f32[0] = a_.f32[2];
3061
+ dest_->f32[1] = a_.f32[3];
3062
+ #endif
3063
+ }
3064
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3065
+ # define _mm_storeh_pi(mem_addr, a) simde_mm_storeh_pi(mem_addr, (a))
3066
+ #endif
3067
+
3068
+ SIMDE__FUNCTION_ATTRIBUTES
3069
+ void
3070
+ simde_mm_storel_pi (simde__m64* mem_addr, simde__m128 a) {
3071
+ #if defined(SIMDE_SSE_NATIVE)
3072
+ _mm_storel_pi(HEDLEY_REINTERPRET_CAST(__m64*, mem_addr), a);
3073
+ #else
3074
+ simde__m64_private* dest_ = HEDLEY_REINTERPRET_CAST(simde__m64_private*, mem_addr);
3075
+ simde__m128_private a_ = simde__m128_to_private(a);
3076
+
3077
+ dest_->f32[0] = a_.f32[0];
3078
+ dest_->f32[1] = a_.f32[1];
3079
+ #endif
3080
+ }
3081
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3082
+ # define _mm_storel_pi(mem_addr, a) simde_mm_storel_pi(mem_addr, (a))
3083
+ #endif
3084
+
3085
+ SIMDE__FUNCTION_ATTRIBUTES
3086
+ void
3087
+ simde_mm_storer_ps (simde_float32 mem_addr[4], simde__m128 a) {
3088
+ simde_assert_aligned(16, mem_addr);
3089
+
3090
+ #if defined(SIMDE_SSE_NATIVE)
3091
+ _mm_storer_ps(mem_addr, a);
3092
+ #else
3093
+ simde__m128_private a_ = simde__m128_to_private(a);
3094
+
3095
+ #if defined(SIMDE__SHUFFLE_VECTOR)
3096
+ a_.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a_.f32, a_.f32, 3, 2, 1, 0);
3097
+ simde_mm_store_ps(mem_addr, simde__m128_from_private(a_));
3098
+ #else
3099
+ SIMDE__VECTORIZE_ALIGNED(mem_addr:16)
3100
+ for (size_t i = 0 ; i < sizeof(a_.f32) / sizeof(a_.f32[0]) ; i++) {
3101
+ mem_addr[i] = a_.f32[((sizeof(a_.f32) / sizeof(a_.f32[0])) - 1) - i];
3102
+ }
3103
+ #endif
3104
+ #endif
3105
+ }
3106
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3107
+ # define _mm_storer_ps(mem_addr, a) simde_mm_storer_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), (a))
3108
+ #endif
3109
+
3110
+ SIMDE__FUNCTION_ATTRIBUTES
3111
+ void
3112
+ simde_mm_storeu_ps (simde_float32 mem_addr[4], simde__m128 a) {
3113
+ #if defined(SIMDE_SSE_NATIVE)
3114
+ _mm_storeu_ps(mem_addr, a);
3115
+ #else
3116
+ simde__m128_private a_ = simde__m128_to_private(a);
3117
+
3118
+ #if defined(SIMDE_SSE_NEON)
3119
+ vst1q_f32(mem_addr, a_.neon_f32);
3120
+ #else
3121
+ simde_memcpy(mem_addr, &a_, sizeof(a_));
3122
+ #endif
3123
+ #endif
3124
+ }
3125
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3126
+ # define _mm_storeu_ps(mem_addr, a) simde_mm_storeu_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), (a))
3127
+ #endif
3128
+
3129
+ SIMDE__FUNCTION_ATTRIBUTES
3130
+ simde__m128
3131
+ simde_mm_sub_ps (simde__m128 a, simde__m128 b) {
3132
+ #if defined(SIMDE_SSE_NATIVE)
3133
+ return _mm_sub_ps(a, b);
3134
+ #else
3135
+ simde__m128_private
3136
+ r_,
3137
+ a_ = simde__m128_to_private(a),
3138
+ b_ = simde__m128_to_private(b);
3139
+
3140
+ #if defined(SIMDE_SSE_NEON)
3141
+ r_.neon_f32 = vsubq_f32(a_.neon_f32, b_.neon_f32);
3142
+ #elif defined(SIMDE_SSE_WASM_SIMD128)
3143
+ r_.wasm_v128 = wasm_f32x4_sub(a_.wasm_v128, b_.wasm_v128);
3144
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3145
+ r_.f32 = a_.f32 - b_.f32;
3146
+ #else
3147
+ SIMDE__VECTORIZE
3148
+ for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
3149
+ r_.f32[i] = a_.f32[i] - b_.f32[i];
3150
+ }
3151
+ #endif
3152
+
3153
+ return simde__m128_from_private(r_);
3154
+ #endif
3155
+ }
3156
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3157
+ # define _mm_sub_ps(a, b) simde_mm_sub_ps((a), (b))
3158
+ #endif
3159
+
3160
+ SIMDE__FUNCTION_ATTRIBUTES
3161
+ simde__m128
3162
+ simde_mm_sub_ss (simde__m128 a, simde__m128 b) {
3163
+ #if defined(SIMDE_SSE_NATIVE)
3164
+ return _mm_sub_ss(a, b);
3165
+ #elif defined(SIMDE_ASSUME_VECTORIZATION)
3166
+ return simde_mm_move_ss(a, simde_mm_sub_ps(a, b));
3167
+ #else
3168
+ simde__m128_private
3169
+ r_,
3170
+ a_ = simde__m128_to_private(a),
3171
+ b_ = simde__m128_to_private(b);
3172
+
3173
+ r_.f32[0] = a_.f32[0] - b_.f32[0];
3174
+ r_.f32[1] = a_.f32[1];
3175
+ r_.f32[2] = a_.f32[2];
3176
+ r_.f32[3] = a_.f32[3];
3177
+
3178
+ return simde__m128_from_private(r_);
3179
+ #endif
3180
+ }
3181
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3182
+ # define _mm_sub_ss(a, b) simde_mm_sub_ss((a), (b))
3183
+ #endif
3184
+
3185
+ SIMDE__FUNCTION_ATTRIBUTES
3186
+ int
3187
+ simde_mm_ucomieq_ss (simde__m128 a, simde__m128 b) {
3188
+ #if defined(SIMDE_SSE_NATIVE)
3189
+ return _mm_ucomieq_ss(a, b);
3190
+ #else
3191
+ simde__m128_private
3192
+ a_ = simde__m128_to_private(a),
3193
+ b_ = simde__m128_to_private(b);
3194
+ int r;
3195
+
3196
+ #if defined(SIMDE_HAVE_FENV_H)
3197
+ fenv_t envp;
3198
+ int x = feholdexcept(&envp);
3199
+ r = a_.f32[0] == b_.f32[0];
3200
+ if (HEDLEY_LIKELY(x == 0))
3201
+ fesetenv(&envp);
3202
+ #else
3203
+ HEDLEY_UNREACHABLE();
3204
+ #endif
3205
+
3206
+ return r;
3207
+ #endif
3208
+ }
3209
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3210
+ # define _mm_ucomieq_ss(a, b) simde_mm_ucomieq_ss((a), (b))
3211
+ #endif
3212
+
3213
+ SIMDE__FUNCTION_ATTRIBUTES
3214
+ int
3215
+ simde_mm_ucomige_ss (simde__m128 a, simde__m128 b) {
3216
+ #if defined(SIMDE_SSE_NATIVE)
3217
+ return _mm_ucomige_ss(a, b);
3218
+ #else
3219
+ simde__m128_private
3220
+ a_ = simde__m128_to_private(a),
3221
+ b_ = simde__m128_to_private(b);
3222
+ int r;
3223
+
3224
+ #if defined(SIMDE_HAVE_FENV_H)
3225
+ fenv_t envp;
3226
+ int x = feholdexcept(&envp);
3227
+ r = a_.f32[0] >= b_.f32[0];
3228
+ if (HEDLEY_LIKELY(x == 0))
3229
+ fesetenv(&envp);
3230
+ #else
3231
+ HEDLEY_UNREACHABLE();
3232
+ #endif
3233
+
3234
+ return r;
3235
+ #endif
3236
+ }
3237
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3238
+ # define _mm_ucomige_ss(a, b) simde_mm_ucomige_ss((a), (b))
3239
+ #endif
3240
+
3241
+ SIMDE__FUNCTION_ATTRIBUTES
3242
+ int
3243
+ simde_mm_ucomigt_ss (simde__m128 a, simde__m128 b) {
3244
+ #if defined(SIMDE_SSE_NATIVE)
3245
+ return _mm_ucomigt_ss(a, b);
3246
+ #else
3247
+ simde__m128_private
3248
+ a_ = simde__m128_to_private(a),
3249
+ b_ = simde__m128_to_private(b);
3250
+ int r;
3251
+
3252
+ #if defined(SIMDE_HAVE_FENV_H)
3253
+ fenv_t envp;
3254
+ int x = feholdexcept(&envp);
3255
+ r = a_.f32[0] > b_.f32[0];
3256
+ if (HEDLEY_LIKELY(x == 0))
3257
+ fesetenv(&envp);
3258
+ #else
3259
+ HEDLEY_UNREACHABLE();
3260
+ #endif
3261
+
3262
+ return r;
3263
+ #endif
3264
+ }
3265
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3266
+ # define _mm_ucomigt_ss(a, b) simde_mm_ucomigt_ss((a), (b))
3267
+ #endif
3268
+
3269
+ SIMDE__FUNCTION_ATTRIBUTES
3270
+ int
3271
+ simde_mm_ucomile_ss (simde__m128 a, simde__m128 b) {
3272
+ #if defined(SIMDE_SSE_NATIVE)
3273
+ return _mm_ucomile_ss(a, b);
3274
+ #else
3275
+ simde__m128_private
3276
+ a_ = simde__m128_to_private(a),
3277
+ b_ = simde__m128_to_private(b);
3278
+ int r;
3279
+
3280
+ #if defined(SIMDE_HAVE_FENV_H)
3281
+ fenv_t envp;
3282
+ int x = feholdexcept(&envp);
3283
+ r = a_.f32[0] <= b_.f32[0];
3284
+ if (HEDLEY_LIKELY(x == 0))
3285
+ fesetenv(&envp);
3286
+ #else
3287
+ HEDLEY_UNREACHABLE();
3288
+ #endif
3289
+
3290
+ return r;
3291
+ #endif
3292
+ }
3293
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3294
+ # define _mm_ucomile_ss(a, b) simde_mm_ucomile_ss((a), (b))
3295
+ #endif
3296
+
3297
+ SIMDE__FUNCTION_ATTRIBUTES
3298
+ int
3299
+ simde_mm_ucomilt_ss (simde__m128 a, simde__m128 b) {
3300
+ #if defined(SIMDE_SSE_NATIVE)
3301
+ return _mm_ucomilt_ss(a, b);
3302
+ #else
3303
+ simde__m128_private
3304
+ a_ = simde__m128_to_private(a),
3305
+ b_ = simde__m128_to_private(b);
3306
+ int r;
3307
+
3308
+ #if defined(SIMDE_HAVE_FENV_H)
3309
+ fenv_t envp;
3310
+ int x = feholdexcept(&envp);
3311
+ r = a_.f32[0] < b_.f32[0];
3312
+ if (HEDLEY_LIKELY(x == 0))
3313
+ fesetenv(&envp);
3314
+ #else
3315
+ HEDLEY_UNREACHABLE();
3316
+ #endif
3317
+
3318
+ return r;
3319
+ #endif
3320
+ }
3321
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3322
+ # define _mm_ucomilt_ss(a, b) simde_mm_ucomilt_ss((a), (b))
3323
+ #endif
3324
+
3325
+ SIMDE__FUNCTION_ATTRIBUTES
3326
+ int
3327
+ simde_mm_ucomineq_ss (simde__m128 a, simde__m128 b) {
3328
+ #if defined(SIMDE_SSE_NATIVE)
3329
+ return _mm_ucomineq_ss(a, b);
3330
+ #else
3331
+ simde__m128_private
3332
+ a_ = simde__m128_to_private(a),
3333
+ b_ = simde__m128_to_private(b);
3334
+ int r;
3335
+
3336
+ #if defined(SIMDE_HAVE_FENV_H)
3337
+ fenv_t envp;
3338
+ int x = feholdexcept(&envp);
3339
+ r = a_.f32[0] != b_.f32[0];
3340
+ if (HEDLEY_LIKELY(x == 0))
3341
+ fesetenv(&envp);
3342
+ #else
3343
+ HEDLEY_UNREACHABLE();
3344
+ #endif
3345
+
3346
+ return r;
3347
+ #endif
3348
+ }
3349
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3350
+ # define _mm_ucomineq_ss(a, b) simde_mm_ucomineq_ss((a), (b))
3351
+ #endif
3352
+
3353
/* Detect whether the native _mm_undefined_ps() intrinsic can be used.
 * With __has_builtin available, probe for the underlying builtin; otherwise
 * assume it exists unless the compiler is on the exclusion list. */
#ifdef SIMDE_SSE_NATIVE
#  ifdef __has_builtin
#    if __has_builtin(__builtin_ia32_undef128)
#      define SIMDE__HAVE_UNDEFINED128
#    endif
#  elif !defined(__PGI) && !defined(SIMDE_BUG_GCC_REV_208793) && !defined(_MSC_VER)
#    define SIMDE__HAVE_UNDEFINED128
#  endif
#endif

/* NOTE(review): no matching HEDLEY_DIAGNOSTIC_POP is visible near this push;
 * confirm it is balanced later in the header. */
#ifdef SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
#endif
3367
+
3368
+ SIMDE__FUNCTION_ATTRIBUTES
3369
+ simde__m128
3370
+ simde_mm_unpackhi_ps (simde__m128 a, simde__m128 b) {
3371
+ #if defined(SIMDE_SSE_NATIVE)
3372
+ return _mm_unpackhi_ps(a, b);
3373
+ #else
3374
+ simde__m128_private
3375
+ r_,
3376
+ a_ = simde__m128_to_private(a),
3377
+ b_ = simde__m128_to_private(b);
3378
+
3379
+ #if defined(SIMDE_SSE_NEON)
3380
+ float32x2_t a1 = vget_high_f32(a_.neon_f32);
3381
+ float32x2_t b1 = vget_high_f32(b_.neon_f32);
3382
+ float32x2x2_t result = vzip_f32(a1, b1);
3383
+ r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
3384
+ #elif defined(SIMDE__SHUFFLE_VECTOR)
3385
+ r_.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a_.f32, b_.f32, 2, 6, 3, 7);
3386
+ #else
3387
+ r_.f32[0] = a_.f32[2];
3388
+ r_.f32[1] = b_.f32[2];
3389
+ r_.f32[2] = a_.f32[3];
3390
+ r_.f32[3] = b_.f32[3];
3391
+ #endif
3392
+
3393
+ return simde__m128_from_private(r_);
3394
+ #endif
3395
+ }
3396
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3397
+ # define _mm_unpackhi_ps(a, b) simde_mm_unpackhi_ps((a), (b))
3398
+ #endif
3399
+
3400
+ SIMDE__FUNCTION_ATTRIBUTES
3401
+ simde__m128
3402
+ simde_mm_unpacklo_ps (simde__m128 a, simde__m128 b) {
3403
+ #if defined(SIMDE_SSE_NATIVE)
3404
+ return _mm_unpacklo_ps(a, b);
3405
+ #else
3406
+ simde__m128_private
3407
+ r_,
3408
+ a_ = simde__m128_to_private(a),
3409
+ b_ = simde__m128_to_private(b);
3410
+
3411
+ #if defined(SIMDE__SHUFFLE_VECTOR)
3412
+ r_.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a_.f32, b_.f32, 0, 4, 1, 5);
3413
+ #elif defined(SIMDE_SSE_NEON)
3414
+ float32x2_t a1 = vget_low_f32(a_.neon_f32);
3415
+ float32x2_t b1 = vget_low_f32(b_.neon_f32);
3416
+ float32x2x2_t result = vzip_f32(a1, b1);
3417
+ r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
3418
+ #else
3419
+ r_.f32[0] = a_.f32[0];
3420
+ r_.f32[1] = b_.f32[0];
3421
+ r_.f32[2] = a_.f32[1];
3422
+ r_.f32[3] = b_.f32[1];
3423
+ #endif
3424
+
3425
+ return simde__m128_from_private(r_);
3426
+ #endif
3427
+ }
3428
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3429
+ # define _mm_unpacklo_ps(a, b) simde_mm_unpacklo_ps((a), (b))
3430
+ #endif
3431
+
3432
+ SIMDE__FUNCTION_ATTRIBUTES
3433
+ simde__m128
3434
+ simde_mm_xor_ps (simde__m128 a, simde__m128 b) {
3435
+ #if defined(SIMDE_SSE_NATIVE)
3436
+ return _mm_xor_ps(a, b);
3437
+ #else
3438
+ simde__m128_private
3439
+ r_,
3440
+ a_ = simde__m128_to_private(a),
3441
+ b_ = simde__m128_to_private(b);
3442
+
3443
+ #if defined(SIMDE_SSE_NEON)
3444
+ r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32);
3445
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3446
+ r_.i32f = a_.i32f ^ b_.i32f;
3447
+ #else
3448
+ SIMDE__VECTORIZE
3449
+ for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
3450
+ r_.u32[i] = a_.u32[i] ^ b_.u32[i];
3451
+ }
3452
+ #endif
3453
+
3454
+ return simde__m128_from_private(r_);
3455
+ #endif
3456
+ }
3457
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3458
+ # define _mm_xor_ps(a, b) simde_mm_xor_ps((a), (b))
3459
+ #endif
3460
+
3461
+ SIMDE__FUNCTION_ATTRIBUTES
3462
+ void
3463
+ simde_mm_stream_pi (simde__m64* mem_addr, simde__m64 a) {
3464
+ #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_X86_MMX)
3465
+ _mm_stream_pi(HEDLEY_REINTERPRET_CAST(__m64*, mem_addr), a);
3466
+ #else
3467
+ simde__m64_private*
3468
+ dest = HEDLEY_REINTERPRET_CAST(simde__m64_private*, mem_addr),
3469
+ a_ = simde__m64_to_private(a);
3470
+
3471
+ #if defined(SIMDE_SSE_NEON)
3472
+ dest->i64[0] = vget_lane_s64(a_.neon_i64, 0);
3473
+ #else
3474
+ dest->i64[0] = a_.i64[0];
3475
+ #endif
3476
+ #endif
3477
+ }
3478
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3479
+ # define _mm_stream_pi(mem_addr, a) simde_mm_stream_pi(mem_addr, (a))
3480
+ #endif
3481
+
3482
+ SIMDE__FUNCTION_ATTRIBUTES
3483
+ void
3484
+ simde_mm_stream_ps (simde_float32 mem_addr[4], simde__m128 a) {
3485
+ simde_assert_aligned(16, mem_addr);
3486
+
3487
+ #if defined(SIMDE_SSE_NATIVE)
3488
+ _mm_stream_ps(mem_addr, a);
3489
+ #else
3490
+ simde__m128_private a_ = simde__m128_to_private(a);
3491
+
3492
+ #if defined(SIMDE_SSE_NEON)
3493
+ vst1q_f32(mem_addr, a_.neon_f32);
3494
+ #else
3495
+ SIMDE__ASSUME_ALIGNED(mem_addr, 16);
3496
+ simde_memcpy(mem_addr, &a_, sizeof(a_));
3497
+ #endif
3498
+ #endif
3499
+ }
3500
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3501
+ # define _mm_stream_ps(mem_addr, a) simde_mm_stream_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), (a))
3502
+ #endif
3503
+
3504
+ SIMDE__FUNCTION_ATTRIBUTES
3505
+ uint32_t
3506
+ simde_mm_getcsr (void) {
3507
+ #if defined(SIMDE_SSE_NATIVE)
3508
+ return _mm_getcsr();
3509
+ #else
3510
+ uint32_t r = 0;
3511
+
3512
+ #if defined(SIMDE_HAVE_FENV_H)
3513
+ int rounding_mode = fegetround();
3514
+
3515
+ switch(rounding_mode) {
3516
+ #if defined(FE_TONEAREST)
3517
+ case FE_TONEAREST:
3518
+ break;
3519
+ #endif
3520
+ #if defined(FE_UPWARD)
3521
+ case FE_UPWARD:
3522
+ r |= 2 << 13;
3523
+ break;
3524
+ #endif
3525
+ #if defined(FE_DOWNWARD)
3526
+ case FE_DOWNWARD:
3527
+ r |= 1 << 13;
3528
+ break;
3529
+ #endif
3530
+ #if defined(FE_TOWARDZERO)
3531
+ case FE_TOWARDZERO:
3532
+ r = 3 << 13;
3533
+ break;
3534
+ #endif
3535
+ }
3536
+ #else
3537
+ HEDLEY_UNREACHABLE();
3538
+ #endif
3539
+
3540
+ return r;
3541
+ #endif
3542
+ }
3543
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3544
+ # define _mm_getcsr() simde_mm_getcsr()
3545
+ #endif
3546
+
3547
+ SIMDE__FUNCTION_ATTRIBUTES
3548
+ void
3549
+ simde_mm_setcsr (uint32_t a) {
3550
+ #if defined(SIMDE_SSE_NATIVE)
3551
+ _mm_setcsr(a);
3552
+ #else
3553
+ switch((a >> 13) & 3) {
3554
+ #if defined(FE_TONEAREST)
3555
+ case 0:
3556
+ fesetround(FE_TONEAREST);
3557
+ #endif
3558
+ #if defined(FE_DOWNWARD)
3559
+ break;
3560
+ case 1:
3561
+ fesetround(FE_DOWNWARD);
3562
+ #endif
3563
+ #if defined(FE_UPWARD)
3564
+ break;
3565
+ case 2:
3566
+ fesetround(FE_UPWARD);
3567
+ #endif
3568
+ #if defined(FE_TOWARDZERO)
3569
+ break;
3570
+ case 3:
3571
+ fesetround(FE_TOWARDZERO);
3572
+ break;
3573
+ #endif
3574
+ }
3575
+ #endif
3576
+ }
3577
+ #if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES)
3578
+ # define _mm_setcsr(a) simde_mm_setcsr(a)
3579
+ #endif
3580
+
3581
/* In-place transpose of the 4x4 float matrix held in four row vectors.
 * The row arguments are both read and assigned, so they must be lvalues.
 * Step 1 interleaves pairs of rows; step 2 recombines the 64-bit halves. */
#define SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
  do { \
    simde__m128 tmp3, tmp2, tmp1, tmp0; \
    tmp0 = simde_mm_unpacklo_ps((row0), (row1)); \
    tmp2 = simde_mm_unpacklo_ps((row2), (row3)); \
    tmp1 = simde_mm_unpackhi_ps((row0), (row1)); \
    tmp3 = simde_mm_unpackhi_ps((row2), (row3)); \
    row0 = simde_mm_movelh_ps(tmp0, tmp2); \
    row1 = simde_mm_movehl_ps(tmp2, tmp0); \
    row2 = simde_mm_movelh_ps(tmp1, tmp3); \
    row3 = simde_mm_movehl_ps(tmp3, tmp1); \
  } while (0)

#ifdef SIMDE_SSE_ENABLE_NATIVE_ALIASES
#  define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3)
#endif
3597
+
3598
/* Exception-flag bits.  Each constant reuses the platform's _MM_EXCEPT_*
 * definition when one exists and otherwise falls back to the literal value
 * given here. */
#ifdef _MM_EXCEPT_INVALID
#  define SIMDE_MM_EXCEPT_INVALID _MM_EXCEPT_INVALID
#else
#  define SIMDE_MM_EXCEPT_INVALID (0x0001)
#endif
#ifdef _MM_EXCEPT_DENORM
#  define SIMDE_MM_EXCEPT_DENORM _MM_EXCEPT_DENORM
#else
#  define SIMDE_MM_EXCEPT_DENORM (0x0002)
#endif
#ifdef _MM_EXCEPT_DIV_ZERO
#  define SIMDE_MM_EXCEPT_DIV_ZERO _MM_EXCEPT_DIV_ZERO
#else
#  define SIMDE_MM_EXCEPT_DIV_ZERO (0x0004)
#endif
#ifdef _MM_EXCEPT_OVERFLOW
#  define SIMDE_MM_EXCEPT_OVERFLOW _MM_EXCEPT_OVERFLOW
#else
#  define SIMDE_MM_EXCEPT_OVERFLOW (0x0008)
#endif
#ifdef _MM_EXCEPT_UNDERFLOW
#  define SIMDE_MM_EXCEPT_UNDERFLOW _MM_EXCEPT_UNDERFLOW
#else
#  define SIMDE_MM_EXCEPT_UNDERFLOW (0x0010)
#endif
#ifdef _MM_EXCEPT_INEXACT
#  define SIMDE_MM_EXCEPT_INEXACT _MM_EXCEPT_INEXACT
#else
#  define SIMDE_MM_EXCEPT_INEXACT (0x0020)
#endif
/* Union of all exception-flag bits above. */
#ifdef _MM_EXCEPT_MASK
#  define SIMDE_MM_EXCEPT_MASK _MM_EXCEPT_MASK
#else
#  define SIMDE_MM_EXCEPT_MASK \
     (SIMDE_MM_EXCEPT_INVALID | SIMDE_MM_EXCEPT_DENORM | \
      SIMDE_MM_EXCEPT_DIV_ZERO | SIMDE_MM_EXCEPT_OVERFLOW | \
      SIMDE_MM_EXCEPT_UNDERFLOW | SIMDE_MM_EXCEPT_INEXACT)
#endif
3636
+
3637
+ #if defined(_MM_MASK_INVALID) /* SIMDE_MM_MASK_*: reuse the _MM_MASK_* macro when one is already defined, else fall back to a literal */
3638
+ # define SIMDE_MM_MASK_INVALID _MM_MASK_INVALID
3639
+ #else
3640
+ # define SIMDE_MM_MASK_INVALID (0x0080) /* each fallback equals the matching SIMDE_MM_EXCEPT_* fallback literal shifted left by 7 */
3641
+ #endif
3642
+ #if defined(_MM_MASK_DENORM)
3643
+ # define SIMDE_MM_MASK_DENORM _MM_MASK_DENORM
3644
+ #else
3645
+ # define SIMDE_MM_MASK_DENORM (0x0100)
3646
+ #endif
3647
+ #if defined(_MM_MASK_DIV_ZERO)
3648
+ # define SIMDE_MM_MASK_DIV_ZERO _MM_MASK_DIV_ZERO
3649
+ #else
3650
+ # define SIMDE_MM_MASK_DIV_ZERO (0x0200)
3651
+ #endif
3652
+ #if defined(_MM_MASK_OVERFLOW)
3653
+ # define SIMDE_MM_MASK_OVERFLOW _MM_MASK_OVERFLOW
3654
+ #else
3655
+ # define SIMDE_MM_MASK_OVERFLOW (0x0400)
3656
+ #endif
3657
+ #if defined(_MM_MASK_UNDERFLOW)
3658
+ # define SIMDE_MM_MASK_UNDERFLOW _MM_MASK_UNDERFLOW
3659
+ #else
3660
+ # define SIMDE_MM_MASK_UNDERFLOW (0x0800)
3661
+ #endif
3662
+ #if defined(_MM_MASK_INEXACT)
3663
+ # define SIMDE_MM_MASK_INEXACT _MM_MASK_INEXACT
3664
+ #else
3665
+ # define SIMDE_MM_MASK_INEXACT (0x1000)
3666
+ #endif
3667
+ #if defined(_MM_MASK_MASK)
3668
+ # define SIMDE_MM_MASK_MASK _MM_MASK_MASK
3669
+ #else
3670
+ # define SIMDE_MM_MASK_MASK /* union of the six SIMDE_MM_MASK_* bits */ \
3671
+ (SIMDE_MM_MASK_INVALID | SIMDE_MM_MASK_DENORM | \
3672
+ SIMDE_MM_MASK_DIV_ZERO | SIMDE_MM_MASK_OVERFLOW | \
3673
+ SIMDE_MM_MASK_UNDERFLOW | SIMDE_MM_MASK_INEXACT) /* 0x1f80 when all six use the fallback literals */
3674
+ #endif
3675
+
3676
+ #if defined(_MM_FLUSH_ZERO_MASK) /* SIMDE_MM_FLUSH_ZERO_*: reuse the _MM_FLUSH_ZERO_* macro when one is already defined, else fall back to a literal */
3677
+ # define SIMDE_MM_FLUSH_ZERO_MASK _MM_FLUSH_ZERO_MASK
3678
+ #else
3679
+ # define SIMDE_MM_FLUSH_ZERO_MASK (0x8000)
3680
+ #endif
3681
+ #if defined(_MM_FLUSH_ZERO_ON)
3682
+ # define SIMDE_MM_FLUSH_ZERO_ON _MM_FLUSH_ZERO_ON
3683
+ #else
3684
+ # define SIMDE_MM_FLUSH_ZERO_ON (0x8000) /* same literal as the SIMDE_MM_FLUSH_ZERO_MASK fallback */
3685
+ #endif
3686
+ #if defined(_MM_FLUSH_ZERO_OFF)
3687
+ # define SIMDE_MM_FLUSH_ZERO_OFF _MM_FLUSH_ZERO_OFF
3688
+ #else
3689
+ # define SIMDE_MM_FLUSH_ZERO_OFF (0x0000) /* no bit set */
3690
+ #endif
3691
+
3692
+ SIMDE__END_DECLS /* NOTE(review): presumably pairs with a SIMDE__BEGIN_DECLS earlier in this header -- confirm */
3693
+
3694
+ HEDLEY_DIAGNOSTIC_POP /* NOTE(review): presumably undoes a HEDLEY_DIAGNOSTIC_PUSH issued earlier in this header -- confirm */
3695
+
3696
+ #endif /* !defined(SIMDE__SSE_H) */