noobs 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185)
  1. package/COPYING +339 -0
  2. package/README.md +46 -0
  3. package/bin/64bit/obs.lib +0 -0
  4. package/binding.gyp +23 -0
  5. package/dist/bin/Qt6Core.dll +0 -0
  6. package/dist/bin/Qt6Gui.dll +0 -0
  7. package/dist/bin/Qt6Network.dll +0 -0
  8. package/dist/bin/Qt6Svg.dll +0 -0
  9. package/dist/bin/Qt6Widgets.dll +0 -0
  10. package/dist/bin/Qt6Xml.dll +0 -0
  11. package/dist/bin/avcodec-61.dll +0 -0
  12. package/dist/bin/avdevice-61.dll +0 -0
  13. package/dist/bin/avfilter-10.dll +0 -0
  14. package/dist/bin/avformat-61.dll +0 -0
  15. package/dist/bin/avutil-59.dll +0 -0
  16. package/dist/bin/datachannel.dll +0 -0
  17. package/dist/bin/libcurl.dll +0 -0
  18. package/dist/bin/libobs-d3d11.dll +0 -0
  19. package/dist/bin/libobs-opengl.dll +0 -0
  20. package/dist/bin/libobs-winrt.dll +0 -0
  21. package/dist/bin/librist.dll +0 -0
  22. package/dist/bin/libx264-164.dll +0 -0
  23. package/dist/bin/lua51.dll +0 -0
  24. package/dist/bin/obs-amf-test.exe +0 -0
  25. package/dist/bin/obs-ffmpeg-mux.exe +0 -0
  26. package/dist/bin/obs-frontend-api.dll +0 -0
  27. package/dist/bin/obs-scripting.dll +0 -0
  28. package/dist/bin/obs.dll +0 -0
  29. package/dist/bin/srt.dll +0 -0
  30. package/dist/bin/swresample-5.dll +0 -0
  31. package/dist/bin/swscale-8.dll +0 -0
  32. package/dist/bin/w32-pthreads.dll +0 -0
  33. package/dist/bin/zlib.dll +0 -0
  34. package/dist/effects/area.effect +250 -0
  35. package/dist/effects/bicubic_scale.effect +236 -0
  36. package/dist/effects/bilinear_lowres_scale.effect +123 -0
  37. package/dist/effects/color.effect +172 -0
  38. package/dist/effects/default.effect +254 -0
  39. package/dist/effects/default_rect.effect +84 -0
  40. package/dist/effects/deinterlace_base.effect +325 -0
  41. package/dist/effects/deinterlace_blend.effect +21 -0
  42. package/dist/effects/deinterlace_blend_2x.effect +21 -0
  43. package/dist/effects/deinterlace_discard.effect +21 -0
  44. package/dist/effects/deinterlace_discard_2x.effect +21 -0
  45. package/dist/effects/deinterlace_linear.effect +21 -0
  46. package/dist/effects/deinterlace_linear_2x.effect +21 -0
  47. package/dist/effects/deinterlace_yadif.effect +21 -0
  48. package/dist/effects/deinterlace_yadif_2x.effect +21 -0
  49. package/dist/effects/format_conversion.effect +1823 -0
  50. package/dist/effects/lanczos_scale.effect +292 -0
  51. package/dist/effects/opaque.effect +159 -0
  52. package/dist/effects/premultiplied_alpha.effect +38 -0
  53. package/dist/effects/repeat.effect +36 -0
  54. package/dist/effects/solid.effect +80 -0
  55. package/dist/noobs.node +0 -0
  56. package/dist/plugins/obs-ffmpeg.dll +0 -0
  57. package/dist/plugins/obs-x264.dll +0 -0
  58. package/dist/plugins/win-capture.dll +0 -0
  59. package/include/audio-monitoring/osx/mac-helpers.h +13 -0
  60. package/include/audio-monitoring/pulse/pulseaudio-wrapper.h +212 -0
  61. package/include/audio-monitoring/win32/wasapi-output.h +22 -0
  62. package/include/callback/calldata.h +195 -0
  63. package/include/callback/decl.h +61 -0
  64. package/include/callback/proc.h +52 -0
  65. package/include/callback/signal.h +73 -0
  66. package/include/graphics/axisang.h +65 -0
  67. package/include/graphics/bounds.h +108 -0
  68. package/include/graphics/device-exports.h +177 -0
  69. package/include/graphics/effect-parser.h +290 -0
  70. package/include/graphics/effect.h +190 -0
  71. package/include/graphics/graphics-internal.h +335 -0
  72. package/include/graphics/graphics.h +1024 -0
  73. package/include/graphics/half.h +100 -0
  74. package/include/graphics/image-file.h +124 -0
  75. package/include/graphics/input.h +34 -0
  76. package/include/graphics/libnsgif/libnsgif.h +142 -0
  77. package/include/graphics/math-defs.h +45 -0
  78. package/include/graphics/math-extra.h +61 -0
  79. package/include/graphics/matrix3.h +98 -0
  80. package/include/graphics/matrix4.h +102 -0
  81. package/include/graphics/plane.h +85 -0
  82. package/include/graphics/quat.h +170 -0
  83. package/include/graphics/shader-parser.h +273 -0
  84. package/include/graphics/srgb.h +177 -0
  85. package/include/graphics/vec2.h +148 -0
  86. package/include/graphics/vec3.h +224 -0
  87. package/include/graphics/vec4.h +241 -0
  88. package/include/media-io/audio-io.h +228 -0
  89. package/include/media-io/audio-math.h +43 -0
  90. package/include/media-io/audio-resampler.h +44 -0
  91. package/include/media-io/format-conversion.h +50 -0
  92. package/include/media-io/frame-rate.h +29 -0
  93. package/include/media-io/media-io-defs.h +20 -0
  94. package/include/media-io/media-remux.h +37 -0
  95. package/include/media-io/video-frame.h +64 -0
  96. package/include/media-io/video-io.h +338 -0
  97. package/include/media-io/video-scaler.h +43 -0
  98. package/include/obs-audio-controls.h +250 -0
  99. package/include/obs-av1.h +47 -0
  100. package/include/obs-avc.h +55 -0
  101. package/include/obs-config.h +52 -0
  102. package/include/obs-data.h +311 -0
  103. package/include/obs-defs.h +52 -0
  104. package/include/obs-encoder.h +361 -0
  105. package/include/obs-ffmpeg-compat.h +13 -0
  106. package/include/obs-hevc.h +81 -0
  107. package/include/obs-hotkey.h +271 -0
  108. package/include/obs-hotkeys.h +653 -0
  109. package/include/obs-interaction.h +56 -0
  110. package/include/obs-internal.h +1459 -0
  111. package/include/obs-missing-files.h +53 -0
  112. package/include/obs-module.h +181 -0
  113. package/include/obs-nal.h +37 -0
  114. package/include/obs-nix-platform.h +53 -0
  115. package/include/obs-nix-wayland.h +24 -0
  116. package/include/obs-nix-x11.h +22 -0
  117. package/include/obs-nix.h +42 -0
  118. package/include/obs-output.h +96 -0
  119. package/include/obs-properties.h +364 -0
  120. package/include/obs-scene.h +127 -0
  121. package/include/obs-service.h +115 -0
  122. package/include/obs-source.h +568 -0
  123. package/include/obs.h +2608 -0
  124. package/include/obsconfig.h +13 -0
  125. package/include/obsversion.h +5 -0
  126. package/include/util/apple/cfstring-utils.h +16 -0
  127. package/include/util/array-serializer.h +37 -0
  128. package/include/util/base.h +97 -0
  129. package/include/util/bitstream.h +28 -0
  130. package/include/util/bmem.h +94 -0
  131. package/include/util/buffered-file-serializer.h +32 -0
  132. package/include/util/c99defs.h +75 -0
  133. package/include/util/cf-lexer.h +199 -0
  134. package/include/util/cf-parser.h +281 -0
  135. package/include/util/circlebuf.h +319 -0
  136. package/include/util/config-file.h +103 -0
  137. package/include/util/crc32.h +29 -0
  138. package/include/util/curl/curl-helper.h +35 -0
  139. package/include/util/darray.h +606 -0
  140. package/include/util/deque.h +319 -0
  141. package/include/util/dstr.h +320 -0
  142. package/include/util/file-serializer.h +34 -0
  143. package/include/util/lexer.h +273 -0
  144. package/include/util/pipe.h +52 -0
  145. package/include/util/platform.h +223 -0
  146. package/include/util/profiler.h +97 -0
  147. package/include/util/serializer.h +158 -0
  148. package/include/util/simde/check.h +285 -0
  149. package/include/util/simde/debug-trap.h +117 -0
  150. package/include/util/simde/hedley.h +2123 -0
  151. package/include/util/simde/simde-align.h +481 -0
  152. package/include/util/simde/simde-arch.h +537 -0
  153. package/include/util/simde/simde-common.h +918 -0
  154. package/include/util/simde/simde-constify.h +925 -0
  155. package/include/util/simde/simde-detect-clang.h +114 -0
  156. package/include/util/simde/simde-diagnostic.h +447 -0
  157. package/include/util/simde/simde-features.h +550 -0
  158. package/include/util/simde/simde-math.h +1858 -0
  159. package/include/util/simde/x86/mmx.h +2456 -0
  160. package/include/util/simde/x86/sse.h +4479 -0
  161. package/include/util/simde/x86/sse2.h +7549 -0
  162. package/include/util/source-profiler.h +66 -0
  163. package/include/util/sse-intrin.h +32 -0
  164. package/include/util/task.h +22 -0
  165. package/include/util/text-lookup.h +45 -0
  166. package/include/util/threading-posix.h +77 -0
  167. package/include/util/threading-windows.h +142 -0
  168. package/include/util/threading.h +103 -0
  169. package/include/util/utf8.h +35 -0
  170. package/include/util/uthash.h +34 -0
  171. package/include/util/util_uint128.h +108 -0
  172. package/include/util/util_uint64.h +34 -0
  173. package/include/util/windows/device-enum.h +14 -0
  174. package/include/util/windows/obfuscate.h +16 -0
  175. package/include/util/windows/win-registry.h +37 -0
  176. package/include/util/windows/win-version.h +57 -0
  177. package/include/util/windows/window-helpers.h +47 -0
  178. package/index.d.ts +38 -0
  179. package/index.js +8 -0
  180. package/package.json +31 -0
  181. package/src/main.cpp +321 -0
  182. package/src/obs_interface.cpp +605 -0
  183. package/src/obs_interface.h +74 -0
  184. package/src/utils.cpp +80 -0
  185. package/src/utils.h +3 -0
@@ -0,0 +1,4479 @@
1
+ /* SPDX-License-Identifier: MIT
2
+ *
3
+ * Permission is hereby granted, free of charge, to any person
4
+ * obtaining a copy of this software and associated documentation
5
+ * files (the "Software"), to deal in the Software without
6
+ * restriction, including without limitation the rights to use, copy,
7
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
8
+ * of the Software, and to permit persons to whom the Software is
9
+ * furnished to do so, subject to the following conditions:
10
+ *
11
+ * The above copyright notice and this permission notice shall be
12
+ * included in all copies or substantial portions of the Software.
13
+ *
14
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ * SOFTWARE.
22
+ *
23
+ * Copyright:
24
+ * 2017-2020 Evan Nemerson <evan@nemerson.com>
25
+ * 2015-2017 John W. Ratcliff <jratcliffscarab@gmail.com>
26
+ * 2015 Brandon Rowlett <browlett@nvidia.com>
27
+ * 2015 Ken Fast <kfast@gdeb.com>
28
+ */
29
+
30
+ #if !defined(SIMDE_X86_SSE_H)
31
+ #define SIMDE_X86_SSE_H
32
+
33
+ #include "mmx.h"
34
+
35
+ #if defined(_WIN32)
36
+ #include <windows.h>
37
+ #endif
38
+
39
+ HEDLEY_DIAGNOSTIC_PUSH
40
+ SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
41
+ SIMDE_BEGIN_DECLS_
42
+
43
+ typedef union {
44
+ #if defined(SIMDE_VECTOR_SUBSCRIPT)
45
+ SIMDE_ALIGN_TO_16 int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
46
+ SIMDE_ALIGN_TO_16 int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
47
+ SIMDE_ALIGN_TO_16 int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
48
+ SIMDE_ALIGN_TO_16 int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
49
+ SIMDE_ALIGN_TO_16 uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
50
+ SIMDE_ALIGN_TO_16 uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
51
+ SIMDE_ALIGN_TO_16 uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
52
+ SIMDE_ALIGN_TO_16 uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
53
+ #if defined(SIMDE_HAVE_INT128_)
54
+ SIMDE_ALIGN_TO_16 simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
55
+ SIMDE_ALIGN_TO_16 simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
56
+ #endif
57
+ SIMDE_ALIGN_TO_16 simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
58
+ SIMDE_ALIGN_TO_16 int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
59
+ SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
60
+ #else
61
+ SIMDE_ALIGN_TO_16 int8_t i8[16];
62
+ SIMDE_ALIGN_TO_16 int16_t i16[8];
63
+ SIMDE_ALIGN_TO_16 int32_t i32[4];
64
+ SIMDE_ALIGN_TO_16 int64_t i64[2];
65
+ SIMDE_ALIGN_TO_16 uint8_t u8[16];
66
+ SIMDE_ALIGN_TO_16 uint16_t u16[8];
67
+ SIMDE_ALIGN_TO_16 uint32_t u32[4];
68
+ SIMDE_ALIGN_TO_16 uint64_t u64[2];
69
+ #if defined(SIMDE_HAVE_INT128_)
70
+ SIMDE_ALIGN_TO_16 simde_int128 i128[1];
71
+ SIMDE_ALIGN_TO_16 simde_uint128 u128[1];
72
+ #endif
73
+ SIMDE_ALIGN_TO_16 simde_float32 f32[4];
74
+ SIMDE_ALIGN_TO_16 int_fast32_t i32f[16 / sizeof(int_fast32_t)];
75
+ SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
76
+ #endif
77
+
78
+ SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2];
79
+ SIMDE_ALIGN_TO_16 simde__m64 m64[2];
80
+
81
+ #if defined(SIMDE_X86_SSE_NATIVE)
82
+ SIMDE_ALIGN_TO_16 __m128 n;
83
+ #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
84
+ SIMDE_ALIGN_TO_16 int8x16_t neon_i8;
85
+ SIMDE_ALIGN_TO_16 int16x8_t neon_i16;
86
+ SIMDE_ALIGN_TO_16 int32x4_t neon_i32;
87
+ SIMDE_ALIGN_TO_16 int64x2_t neon_i64;
88
+ SIMDE_ALIGN_TO_16 uint8x16_t neon_u8;
89
+ SIMDE_ALIGN_TO_16 uint16x8_t neon_u16;
90
+ SIMDE_ALIGN_TO_16 uint32x4_t neon_u32;
91
+ SIMDE_ALIGN_TO_16 uint64x2_t neon_u64;
92
+ SIMDE_ALIGN_TO_16 float32x4_t neon_f32;
93
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
94
+ SIMDE_ALIGN_TO_16 float64x2_t neon_f64;
95
+ #endif
96
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
97
+ SIMDE_ALIGN_TO_16 v128_t wasm_v128;
98
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
99
+ SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8;
100
+ SIMDE_ALIGN_TO_16
101
+ SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16;
102
+ SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32;
103
+ SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8;
104
+ SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16;
105
+ SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32;
106
+ SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32;
107
+ #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
108
+ SIMDE_ALIGN_TO_16
109
+ SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
110
+ SIMDE_ALIGN_TO_16
111
+ SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64;
112
+ SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64;
113
+ #endif
114
+ #endif
115
+ } simde__m128_private;
116
+
117
+ #if defined(SIMDE_X86_SSE_NATIVE)
118
+ typedef __m128 simde__m128;
119
+ #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
120
+ typedef float32x4_t simde__m128;
121
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
122
+ typedef v128_t simde__m128;
123
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
124
+ typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128;
125
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT)
126
+ typedef simde_float32
127
+ simde__m128 SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
128
+ #else
129
+ typedef simde__m128_private simde__m128;
130
+ #endif
131
+
132
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
133
+ typedef simde__m128 __m128;
134
+ #endif
135
+
136
+ HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128), "simde__m128 size incorrect");
137
+ HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128_private),
138
+ "simde__m128_private size incorrect");
139
+ #if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
140
+ HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128) == 16,
141
+ "simde__m128 is not 16-byte aligned");
142
+ HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128_private) == 16,
143
+ "simde__m128_private is not 16-byte aligned");
144
+ #endif
145
+
146
+ SIMDE_FUNCTION_ATTRIBUTES
147
+ simde__m128 simde__m128_from_private(simde__m128_private v)
148
+ {
149
+ simde__m128 r;
150
+ simde_memcpy(&r, &v, sizeof(r));
151
+ return r;
152
+ }
153
+
154
+ SIMDE_FUNCTION_ATTRIBUTES
155
+ simde__m128_private simde__m128_to_private(simde__m128 v)
156
+ {
157
+ simde__m128_private r;
158
+ simde_memcpy(&r, &v, sizeof(r));
159
+ return r;
160
+ }
161
+
162
/* NEON targets: generate the conversion helpers between simde__m128 and
 * each NEON vector type via SIMDE_X86_GENERATE_CONVERSION_FUNCTION (the
 * macro itself is defined outside this part of the file). */
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
/* Signed integer lanes. */
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int8x16_t, neon, i8)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int16x8_t, neon, i16)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int32x4_t, neon, i32)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int64x2_t, neon, i64)
/* Unsigned integer lanes. */
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint8x16_t, neon, u8)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint16x8_t, neon, u16)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint32x4_t, neon, u32)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint64x2_t, neon, u64)
/* Floating-point lanes; the f64 view is only generated for A64. */
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, float32x4_t, neon, f32)
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, float64x2_t, neon, f64)
#endif
#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
176
+
177
/* AltiVec and WASM targets: conversion helpers between simde__m128 and the
 * native vector types, generated via SIMDE_X86_GENERATE_CONVERSION_FUNCTION
 * (the macro itself is defined outside this part of the file). */
#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128,
				       SIMDE_POWER_ALTIVEC_VECTOR(signed char),
				       altivec, i8)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128,
				       SIMDE_POWER_ALTIVEC_VECTOR(signed short),
				       altivec, i16)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128,
				       SIMDE_POWER_ALTIVEC_VECTOR(signed int),
				       altivec, i32)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(
	m128, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(
	m128, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128,
				       SIMDE_POWER_ALTIVEC_VECTOR(unsigned int),
				       altivec, u32)

#if defined(SIMDE_BUG_GCC_95782)
/* When SIMDE_BUG_GCC_95782 is set the f32 helpers are written by hand and
 * go through the private union instead of the generator macro. */
SIMDE_FUNCTION_ATTRIBUTES
SIMDE_POWER_ALTIVEC_VECTOR(float)
simde__m128_to_altivec_f32(simde__m128 value)
{
	simde__m128_private r_ = simde__m128_to_private(value);
	return r_.altivec_f32;
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m128 simde__m128_from_altivec_f32(SIMDE_POWER_ALTIVEC_VECTOR(float)
						 value)
{
	simde__m128_private r_;
	r_.altivec_f32 = value;
	return simde__m128_from_private(r_);
}
#else
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(float),
				       altivec, f32)
#endif

#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(
	m128, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(
	m128, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
#endif
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
/* No trailing semicolon: every other invocation of the generator is used
 * without one, so adding it here would leave a stray empty declaration at
 * file scope. */
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v128_t, wasm, v128)
#endif /* defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) */
226
+
227
/* Rounding-mode selectors exchanged with SIMDE_MM_GET_ROUNDING_MODE and
 * SIMDE_MM_SET_ROUNDING_MODE.  Native SSE builds reuse the compiler's
 * _MM_ROUND_* values; every other build uses the fixed constants below. */
enum {
#if defined(SIMDE_X86_SSE_NATIVE)
	SIMDE_MM_ROUND_NEAREST = _MM_ROUND_NEAREST,
	SIMDE_MM_ROUND_DOWN = _MM_ROUND_DOWN,
	SIMDE_MM_ROUND_UP = _MM_ROUND_UP,
	SIMDE_MM_ROUND_TOWARD_ZERO = _MM_ROUND_TOWARD_ZERO
#else
	SIMDE_MM_ROUND_NEAREST = 0x0000,
	SIMDE_MM_ROUND_DOWN = 0x2000,
	SIMDE_MM_ROUND_UP = 0x4000,
	SIMDE_MM_ROUND_TOWARD_ZERO = 0x6000
#endif
};
240
+
241
/* Rounding-control flags taken by the round functions.
 *
 * When the compiler's own _MM_FROUND_* macros are visible their values are
 * reused; otherwise the fixed values below are used.  The low bits select
 * the direction and SIMDE_MM_FROUND_NO_EXC is a separate flag bit that can
 * be OR-ed in. */
#if defined(_MM_FROUND_TO_NEAREST_INT)
#define SIMDE_MM_FROUND_TO_NEAREST_INT _MM_FROUND_TO_NEAREST_INT
#define SIMDE_MM_FROUND_TO_NEG_INF _MM_FROUND_TO_NEG_INF
#define SIMDE_MM_FROUND_TO_POS_INF _MM_FROUND_TO_POS_INF
#define SIMDE_MM_FROUND_TO_ZERO _MM_FROUND_TO_ZERO
#define SIMDE_MM_FROUND_CUR_DIRECTION _MM_FROUND_CUR_DIRECTION

#define SIMDE_MM_FROUND_RAISE_EXC _MM_FROUND_RAISE_EXC
#define SIMDE_MM_FROUND_NO_EXC _MM_FROUND_NO_EXC
#else
#define SIMDE_MM_FROUND_TO_NEAREST_INT 0x00
#define SIMDE_MM_FROUND_TO_NEG_INF 0x01
#define SIMDE_MM_FROUND_TO_POS_INF 0x02
#define SIMDE_MM_FROUND_TO_ZERO 0x03
#define SIMDE_MM_FROUND_CUR_DIRECTION 0x04

#define SIMDE_MM_FROUND_RAISE_EXC 0x00
#define SIMDE_MM_FROUND_NO_EXC 0x08
#endif

/* Named combinations of a direction with an exception flag. */
#define SIMDE_MM_FROUND_NINT \
	(SIMDE_MM_FROUND_TO_NEAREST_INT | SIMDE_MM_FROUND_RAISE_EXC)
#define SIMDE_MM_FROUND_FLOOR \
	(SIMDE_MM_FROUND_TO_NEG_INF | SIMDE_MM_FROUND_RAISE_EXC)
#define SIMDE_MM_FROUND_CEIL \
	(SIMDE_MM_FROUND_TO_POS_INF | SIMDE_MM_FROUND_RAISE_EXC)
#define SIMDE_MM_FROUND_TRUNC \
	(SIMDE_MM_FROUND_TO_ZERO | SIMDE_MM_FROUND_RAISE_EXC)
#define SIMDE_MM_FROUND_RINT \
	(SIMDE_MM_FROUND_CUR_DIRECTION | SIMDE_MM_FROUND_RAISE_EXC)
#define SIMDE_MM_FROUND_NEARBYINT \
	(SIMDE_MM_FROUND_CUR_DIRECTION | SIMDE_MM_FROUND_NO_EXC)

/* Publish the flags under their native names when aliases are requested
 * and the compiler has not already defined them.  _MM_FROUND_NO_EXC is
 * included so the alias set matches the SIMDE_MM_FROUND_* set above. */
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) && \
	!defined(_MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_TO_NEAREST_INT SIMDE_MM_FROUND_TO_NEAREST_INT
#define _MM_FROUND_TO_NEG_INF SIMDE_MM_FROUND_TO_NEG_INF
#define _MM_FROUND_TO_POS_INF SIMDE_MM_FROUND_TO_POS_INF
#define _MM_FROUND_TO_ZERO SIMDE_MM_FROUND_TO_ZERO
#define _MM_FROUND_CUR_DIRECTION SIMDE_MM_FROUND_CUR_DIRECTION
#define _MM_FROUND_RAISE_EXC SIMDE_MM_FROUND_RAISE_EXC
#define _MM_FROUND_NO_EXC SIMDE_MM_FROUND_NO_EXC
#define _MM_FROUND_NINT SIMDE_MM_FROUND_NINT
#define _MM_FROUND_FLOOR SIMDE_MM_FROUND_FLOOR
#define _MM_FROUND_CEIL SIMDE_MM_FROUND_CEIL
#define _MM_FROUND_TRUNC SIMDE_MM_FROUND_TRUNC
#define _MM_FROUND_RINT SIMDE_MM_FROUND_RINT
#define _MM_FROUND_NEARBYINT SIMDE_MM_FROUND_NEARBYINT
#endif
289
+
290
+ SIMDE_FUNCTION_ATTRIBUTES
291
+ unsigned int SIMDE_MM_GET_ROUNDING_MODE(void)
292
+ {
293
+ #if defined(SIMDE_X86_SSE_NATIVE)
294
+ return _MM_GET_ROUNDING_MODE();
295
+ #elif defined(SIMDE_HAVE_FENV_H)
296
+ unsigned int vfe_mode;
297
+
298
+ switch (fegetround()) {
299
+ #if defined(FE_TONEAREST)
300
+ case FE_TONEAREST:
301
+ vfe_mode = SIMDE_MM_ROUND_NEAREST;
302
+ break;
303
+ #endif
304
+
305
+ #if defined(FE_TOWARDZERO)
306
+ case FE_TOWARDZERO:
307
+ vfe_mode = SIMDE_MM_ROUND_DOWN;
308
+ break;
309
+ #endif
310
+
311
+ #if defined(FE_UPWARD)
312
+ case FE_UPWARD:
313
+ vfe_mode = SIMDE_MM_ROUND_UP;
314
+ break;
315
+ #endif
316
+
317
+ #if defined(FE_DOWNWARD)
318
+ case FE_DOWNWARD:
319
+ vfe_mode = SIMDE_MM_ROUND_TOWARD_ZERO;
320
+ break;
321
+ #endif
322
+
323
+ default:
324
+ vfe_mode = SIMDE_MM_ROUND_NEAREST;
325
+ break;
326
+ }
327
+
328
+ return vfe_mode;
329
+ #else
330
+ return SIMDE_MM_ROUND_NEAREST;
331
+ #endif
332
+ }
333
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
334
+ #define _MM_GET_ROUNDING_MODE() SIMDE_MM_GET_ROUNDING_MODE()
335
+ #endif
336
+
337
+ SIMDE_FUNCTION_ATTRIBUTES
338
+ void SIMDE_MM_SET_ROUNDING_MODE(unsigned int a)
339
+ {
340
+ #if defined(SIMDE_X86_SSE_NATIVE)
341
+ _MM_SET_ROUNDING_MODE(a);
342
+ #elif defined(SIMDE_HAVE_FENV_H)
343
+ int fe_mode = FE_TONEAREST;
344
+
345
+ switch (a) {
346
+ #if defined(FE_TONEAREST)
347
+ case SIMDE_MM_ROUND_NEAREST:
348
+ fe_mode = FE_TONEAREST;
349
+ break;
350
+ #endif
351
+
352
+ #if defined(FE_TOWARDZERO)
353
+ case SIMDE_MM_ROUND_TOWARD_ZERO:
354
+ fe_mode = FE_TOWARDZERO;
355
+ break;
356
+ #endif
357
+
358
+ #if defined(FE_DOWNWARD)
359
+ case SIMDE_MM_ROUND_DOWN:
360
+ fe_mode = FE_DOWNWARD;
361
+ break;
362
+ #endif
363
+
364
+ #if defined(FE_UPWARD)
365
+ case SIMDE_MM_ROUND_UP:
366
+ fe_mode = FE_UPWARD;
367
+ break;
368
+ #endif
369
+
370
+ default:
371
+ return;
372
+ }
373
+
374
+ fesetround(fe_mode);
375
+ #else
376
+ (void)a;
377
+ #endif
378
+ }
379
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
380
+ #define _MM_SET_ROUNDING_MODE(a) SIMDE_MM_SET_ROUNDING_MODE(a)
381
+ #endif
382
+
383
+ SIMDE_FUNCTION_ATTRIBUTES
384
+ uint32_t simde_mm_getcsr(void)
385
+ {
386
+ #if defined(SIMDE_X86_SSE_NATIVE)
387
+ return _mm_getcsr();
388
+ #else
389
+ return SIMDE_MM_GET_ROUNDING_MODE();
390
+ #endif
391
+ }
392
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
393
+ #define _mm_getcsr() simde_mm_getcsr()
394
+ #endif
395
+
396
+ SIMDE_FUNCTION_ATTRIBUTES
397
+ void simde_mm_setcsr(uint32_t a)
398
+ {
399
+ #if defined(SIMDE_X86_SSE_NATIVE)
400
+ _mm_setcsr(a);
401
+ #else
402
+ SIMDE_MM_SET_ROUNDING_MODE(HEDLEY_STATIC_CAST(unsigned int, a));
403
+ #endif
404
+ }
405
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
406
+ #define _mm_setcsr(a) simde_mm_setcsr(a)
407
+ #endif
408
+
409
+ SIMDE_FUNCTION_ATTRIBUTES
410
+ simde__m128 simde_x_mm_round_ps(simde__m128 a, int rounding, int lax_rounding)
411
+ SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15)
412
+ SIMDE_REQUIRE_CONSTANT_RANGE(lax_rounding, 0, 1)
413
+ {
414
+ simde__m128_private r_, a_ = simde__m128_to_private(a);
415
+
416
+ (void)lax_rounding;
417
+
418
+ /* For architectures which lack a current direction SIMD instruction.
419
+ *
420
+ * Note that NEON actually has a current rounding mode instruction,
421
+ * but in ARMv8+ the rounding mode is ignored and nearest is always
422
+ * used, so we treat ARMv7 as having a rounding mode but ARMv8 as
423
+ * not. */
424
+ #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ARM_NEON_A32V8)
425
+ if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION)
426
+ rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE())
427
+ << 13;
428
+ #endif
429
+
430
+ switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
431
+ case SIMDE_MM_FROUND_CUR_DIRECTION:
432
+ #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
433
+ r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
434
+ SIMDE_POWER_ALTIVEC_VECTOR(float),
435
+ vec_round(a_.altivec_f32));
436
+ #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
437
+ r_.neon_f32 = vrndiq_f32(a_.neon_f32);
438
+ #elif defined(simde_math_nearbyintf)
439
+ SIMDE_VECTORIZE
440
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0]));
441
+ i++) {
442
+ r_.f32[i] = simde_math_nearbyintf(a_.f32[i]);
443
+ }
444
+ #else
445
+ HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
446
+ #endif
447
+ break;
448
+
449
+ case SIMDE_MM_FROUND_TO_NEAREST_INT:
450
+ #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
451
+ r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
452
+ SIMDE_POWER_ALTIVEC_VECTOR(float),
453
+ vec_rint(a_.altivec_f32));
454
+ #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE)
455
+ r_.neon_f32 = vrndnq_f32(a_.neon_f32);
456
+ #elif defined(simde_math_roundevenf)
457
+ SIMDE_VECTORIZE
458
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0]));
459
+ i++) {
460
+ r_.f32[i] = simde_math_roundevenf(a_.f32[i]);
461
+ }
462
+ #else
463
+ HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
464
+ #endif
465
+ break;
466
+
467
+ case SIMDE_MM_FROUND_TO_NEG_INF:
468
+ #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
469
+ r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
470
+ SIMDE_POWER_ALTIVEC_VECTOR(float),
471
+ vec_floor(a_.altivec_f32));
472
+ #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE)
473
+ r_.neon_f32 = vrndmq_f32(a_.neon_f32);
474
+ #elif defined(simde_math_floorf)
475
+ SIMDE_VECTORIZE
476
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0]));
477
+ i++) {
478
+ r_.f32[i] = simde_math_floorf(a_.f32[i]);
479
+ }
480
+ #else
481
+ HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
482
+ #endif
483
+ break;
484
+
485
+ case SIMDE_MM_FROUND_TO_POS_INF:
486
+ #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
487
+ r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
488
+ SIMDE_POWER_ALTIVEC_VECTOR(float),
489
+ vec_ceil(a_.altivec_f32));
490
+ #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE)
491
+ r_.neon_f32 = vrndpq_f32(a_.neon_f32);
492
+ #elif defined(simde_math_ceilf)
493
+ SIMDE_VECTORIZE
494
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0]));
495
+ i++) {
496
+ r_.f32[i] = simde_math_ceilf(a_.f32[i]);
497
+ }
498
+ #else
499
+ HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
500
+ #endif
501
+ break;
502
+
503
+ case SIMDE_MM_FROUND_TO_ZERO:
504
+ #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
505
+ r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
506
+ SIMDE_POWER_ALTIVEC_VECTOR(float),
507
+ vec_trunc(a_.altivec_f32));
508
+ #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE)
509
+ r_.neon_f32 = vrndq_f32(a_.neon_f32);
510
+ #elif defined(simde_math_truncf)
511
+ SIMDE_VECTORIZE
512
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0]));
513
+ i++) {
514
+ r_.f32[i] = simde_math_truncf(a_.f32[i]);
515
+ }
516
+ #else
517
+ HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
518
+ #endif
519
+ break;
520
+
521
+ default:
522
+ HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
523
+ }
524
+
525
+ return simde__m128_from_private(r_);
526
+ }
527
+ #if defined(SIMDE_X86_SSE4_1_NATIVE)
528
+ #define simde_mm_round_ps(a, rounding) _mm_round_ps((a), (rounding))
529
+ #else
530
+ #define simde_mm_round_ps(a, rounding) simde_x_mm_round_ps((a), (rounding), 0)
531
+ #endif
532
+ #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
533
+ #define _mm_round_ps(a, rounding) simde_mm_round_ps((a), (rounding))
534
+ #endif
535
+
536
+ SIMDE_FUNCTION_ATTRIBUTES
537
+ simde__m128 simde_mm_set_ps(simde_float32 e3, simde_float32 e2,
538
+ simde_float32 e1, simde_float32 e0)
539
+ {
540
+ #if defined(SIMDE_X86_SSE_NATIVE)
541
+ return _mm_set_ps(e3, e2, e1, e0);
542
+ #else
543
+ simde__m128_private r_;
544
+
545
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
546
+ SIMDE_ALIGN_TO_16 simde_float32 data[4] = {e0, e1, e2, e3};
547
+ r_.neon_f32 = vld1q_f32(data);
548
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
549
+ r_.wasm_v128 = wasm_f32x4_make(e0, e1, e2, e3);
550
+ #else
551
+ r_.f32[0] = e0;
552
+ r_.f32[1] = e1;
553
+ r_.f32[2] = e2;
554
+ r_.f32[3] = e3;
555
+ #endif
556
+
557
+ return simde__m128_from_private(r_);
558
+ #endif
559
+ }
560
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
561
+ #define _mm_set_ps(e3, e2, e1, e0) simde_mm_set_ps(e3, e2, e1, e0)
562
+ #endif
563
+
564
+ SIMDE_FUNCTION_ATTRIBUTES
565
+ simde__m128 simde_mm_set_ps1(simde_float32 a)
566
+ {
567
+ #if defined(SIMDE_X86_SSE_NATIVE)
568
+ return _mm_set_ps1(a);
569
+ #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
570
+ return vdupq_n_f32(a);
571
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
572
+ (void)a;
573
+ return vec_splats(a);
574
+ #else
575
+ return simde_mm_set_ps(a, a, a, a);
576
+ #endif
577
+ }
578
+ #define simde_mm_set1_ps(a) simde_mm_set_ps1(a)
579
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
580
+ #define _mm_set_ps1(a) simde_mm_set_ps1(a)
581
+ #define _mm_set1_ps(a) simde_mm_set1_ps(a)
582
+ #endif
583
+
584
+ SIMDE_FUNCTION_ATTRIBUTES
585
+ simde__m128 simde_mm_move_ss(simde__m128 a, simde__m128 b)
586
+ {
587
+ #if defined(SIMDE_X86_SSE_NATIVE)
588
+ return _mm_move_ss(a, b);
589
+ #else
590
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
591
+ b_ = simde__m128_to_private(b);
592
+
593
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
594
+ r_.neon_f32 =
595
+ vsetq_lane_f32(vgetq_lane_f32(b_.neon_f32, 0), a_.neon_f32, 0);
596
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
597
+ SIMDE_POWER_ALTIVEC_VECTOR(unsigned char)
598
+ m = {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
599
+ r_.altivec_f32 = vec_perm(a_.altivec_f32, b_.altivec_f32, m);
600
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
601
+ r_.wasm_v128 = wasm_v8x16_shuffle(b_.wasm_v128, a_.wasm_v128, 0, 1, 2,
602
+ 3, 20, 21, 22, 23, 24, 25, 26, 27, 28,
603
+ 29, 30, 31);
604
+ #elif defined(SIMDE_SHUFFLE_VECTOR_)
605
+ r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 4, 1, 2, 3);
606
+ #else
607
+ r_.f32[0] = b_.f32[0];
608
+ r_.f32[1] = a_.f32[1];
609
+ r_.f32[2] = a_.f32[2];
610
+ r_.f32[3] = a_.f32[3];
611
+ #endif
612
+
613
+ return simde__m128_from_private(r_);
614
+ #endif
615
+ }
616
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
617
+ #define _mm_move_ss(a, b) simde_mm_move_ss((a), (b))
618
+ #endif
619
+
620
+ SIMDE_FUNCTION_ATTRIBUTES
621
+ simde__m128 simde_mm_add_ps(simde__m128 a, simde__m128 b)
622
+ {
623
+ #if defined(SIMDE_X86_SSE_NATIVE)
624
+ return _mm_add_ps(a, b);
625
+ #else
626
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
627
+ b_ = simde__m128_to_private(b);
628
+
629
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
630
+ r_.neon_f32 = vaddq_f32(a_.neon_f32, b_.neon_f32);
631
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
632
+ r_.wasm_v128 = wasm_f32x4_add(a_.wasm_v128, b_.wasm_v128);
633
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
634
+ r_.altivec_f32 = vec_add(a_.altivec_f32, b_.altivec_f32);
635
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
636
+ r_.f32 = a_.f32 + b_.f32;
637
+ #else
638
+ SIMDE_VECTORIZE
639
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
640
+ r_.f32[i] = a_.f32[i] + b_.f32[i];
641
+ }
642
+ #endif
643
+
644
+ return simde__m128_from_private(r_);
645
+ #endif
646
+ }
647
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
648
+ #define _mm_add_ps(a, b) simde_mm_add_ps((a), (b))
649
+ #endif
650
+
651
+ SIMDE_FUNCTION_ATTRIBUTES
652
+ simde__m128 simde_mm_add_ss(simde__m128 a, simde__m128 b)
653
+ {
654
+ #if defined(SIMDE_X86_SSE_NATIVE)
655
+ return _mm_add_ss(a, b);
656
+ #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
657
+ return simde_mm_move_ss(a, simde_mm_add_ps(a, b));
658
+ #else
659
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
660
+ b_ = simde__m128_to_private(b);
661
+
662
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
663
+ float32_t b0 = vgetq_lane_f32(b_.neon_f32, 0);
664
+ float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
665
+ // the upper values in the result must be the remnants of <a>.
666
+ r_.neon_f32 = vaddq_f32(a_.neon_f32, value);
667
+ #else
668
+ r_.f32[0] = a_.f32[0] + b_.f32[0];
669
+ r_.f32[1] = a_.f32[1];
670
+ r_.f32[2] = a_.f32[2];
671
+ r_.f32[3] = a_.f32[3];
672
+ #endif
673
+
674
+ return simde__m128_from_private(r_);
675
+ #endif
676
+ }
677
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
678
+ #define _mm_add_ss(a, b) simde_mm_add_ss((a), (b))
679
+ #endif
680
+
681
+ SIMDE_FUNCTION_ATTRIBUTES
682
+ simde__m128 simde_mm_and_ps(simde__m128 a, simde__m128 b)
683
+ {
684
+ #if defined(SIMDE_X86_SSE_NATIVE)
685
+ return _mm_and_ps(a, b);
686
+ #else
687
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
688
+ b_ = simde__m128_to_private(b);
689
+
690
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
691
+ r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32);
692
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
693
+ r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128);
694
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
695
+ r_.i32 = a_.i32 & b_.i32;
696
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
697
+ r_.altivec_f32 = vec_and(a_.altivec_f32, b_.altivec_f32);
698
+ #else
699
+ SIMDE_VECTORIZE
700
+ for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
701
+ r_.i32[i] = a_.i32[i] & b_.i32[i];
702
+ }
703
+ #endif
704
+
705
+ return simde__m128_from_private(r_);
706
+ #endif
707
+ }
708
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
709
+ #define _mm_and_ps(a, b) simde_mm_and_ps((a), (b))
710
+ #endif
711
+
712
+ SIMDE_FUNCTION_ATTRIBUTES
713
+ simde__m128 simde_mm_andnot_ps(simde__m128 a, simde__m128 b)
714
+ {
715
+ #if defined(SIMDE_X86_SSE_NATIVE)
716
+ return _mm_andnot_ps(a, b);
717
+ #else
718
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
719
+ b_ = simde__m128_to_private(b);
720
+
721
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
722
+ r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
723
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
724
+ r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128);
725
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
726
+ r_.altivec_f32 = vec_andc(b_.altivec_f32, a_.altivec_f32);
727
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
728
+ r_.i32 = ~a_.i32 & b_.i32;
729
+ #else
730
+ SIMDE_VECTORIZE
731
+ for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
732
+ r_.i32[i] = ~(a_.i32[i]) & b_.i32[i];
733
+ }
734
+ #endif
735
+
736
+ return simde__m128_from_private(r_);
737
+ #endif
738
+ }
739
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
740
+ #define _mm_andnot_ps(a, b) simde_mm_andnot_ps((a), (b))
741
+ #endif
742
+
743
+ SIMDE_FUNCTION_ATTRIBUTES
744
+ simde__m128 simde_mm_xor_ps(simde__m128 a, simde__m128 b)
745
+ {
746
+ #if defined(SIMDE_X86_SSE_NATIVE)
747
+ return _mm_xor_ps(a, b);
748
+ #else
749
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
750
+ b_ = simde__m128_to_private(b);
751
+
752
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
753
+ r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32);
754
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
755
+ r_.wasm_v128 = wasm_v128_xor(a_.wasm_v128, b_.wasm_v128);
756
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
757
+ r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32);
758
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
759
+ r_.i32f = a_.i32f ^ b_.i32f;
760
+ #else
761
+ SIMDE_VECTORIZE
762
+ for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
763
+ r_.u32[i] = a_.u32[i] ^ b_.u32[i];
764
+ }
765
+ #endif
766
+
767
+ return simde__m128_from_private(r_);
768
+ #endif
769
+ }
770
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
771
+ #define _mm_xor_ps(a, b) simde_mm_xor_ps((a), (b))
772
+ #endif
773
+
774
+ SIMDE_FUNCTION_ATTRIBUTES
775
+ simde__m128 simde_mm_or_ps(simde__m128 a, simde__m128 b)
776
+ {
777
+ #if defined(SIMDE_X86_SSE_NATIVE)
778
+ return _mm_or_ps(a, b);
779
+ #else
780
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
781
+ b_ = simde__m128_to_private(b);
782
+
783
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
784
+ r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32);
785
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
786
+ r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128);
787
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
788
+ r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32);
789
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
790
+ r_.i32f = a_.i32f | b_.i32f;
791
+ #else
792
+ SIMDE_VECTORIZE
793
+ for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
794
+ r_.u32[i] = a_.u32[i] | b_.u32[i];
795
+ }
796
+ #endif
797
+
798
+ return simde__m128_from_private(r_);
799
+ #endif
800
+ }
801
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
802
+ #define _mm_or_ps(a, b) simde_mm_or_ps((a), (b))
803
+ #endif
804
+
805
+ SIMDE_FUNCTION_ATTRIBUTES
806
+ simde__m128 simde_x_mm_not_ps(simde__m128 a)
807
+ {
808
+ #if defined(SIMDE_X86_AVX512VL_NATIVE)
809
+ __m128i ai = _mm_castps_si128(a);
810
+ return _mm_castsi128_ps(_mm_ternarylogic_epi32(ai, ai, ai, 0x55));
811
+ #elif defined(SIMDE_X86_SSE2_NATIVE)
812
+ /* Note: we use ints instead of floats because we don't want cmpeq
813
+ * to return false for (NaN, NaN) */
814
+ __m128i ai = _mm_castps_si128(a);
815
+ return _mm_castsi128_ps(_mm_andnot_si128(ai, _mm_cmpeq_epi32(ai, ai)));
816
+ #else
817
+ simde__m128_private r_, a_ = simde__m128_to_private(a);
818
+
819
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
820
+ r_.neon_i32 = vmvnq_s32(a_.neon_i32);
821
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
822
+ r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32);
823
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
824
+ r_.wasm_v128 = wasm_v128_not(a_.wasm_v128);
825
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
826
+ r_.i32 = ~a_.i32;
827
+ #else
828
+ SIMDE_VECTORIZE
829
+ for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
830
+ r_.i32[i] = ~(a_.i32[i]);
831
+ }
832
+ #endif
833
+
834
+ return simde__m128_from_private(r_);
835
+ #endif
836
+ }
837
+
838
+ SIMDE_FUNCTION_ATTRIBUTES
839
+ simde__m128 simde_x_mm_select_ps(simde__m128 a, simde__m128 b, simde__m128 mask)
840
+ {
841
+ /* This function is for when you want to blend two elements together
842
+ * according to a mask. It is similar to _mm_blendv_ps, except that
843
+ * it is undefined whether the blend is based on the highest bit in
844
+ * each lane (like blendv) or just bitwise operations. This allows
845
+ * us to implement the function efficiently everywhere.
846
+ *
847
+ * Basically, you promise that all the lanes in mask are either 0 or
848
+ * ~0. */
849
+ #if defined(SIMDE_X86_SSE4_1_NATIVE)
850
+ return _mm_blendv_ps(a, b, mask);
851
+ #else
852
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
853
+ b_ = simde__m128_to_private(b),
854
+ mask_ = simde__m128_to_private(mask);
855
+
856
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
857
+ r_.neon_i32 = vbslq_s32(mask_.neon_u32, b_.neon_i32, a_.neon_i32);
858
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
859
+ r_.wasm_v128 = wasm_v128_bitselect(b_.wasm_v128, a_.wasm_v128,
860
+ mask_.wasm_v128);
861
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
862
+ r_.altivec_i32 =
863
+ vec_sel(a_.altivec_i32, b_.altivec_i32, mask_.altivec_u32);
864
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
865
+ r_.i32 = a_.i32 ^ ((a_.i32 ^ b_.i32) & mask_.i32);
866
+ #else
867
+ SIMDE_VECTORIZE
868
+ for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
869
+ r_.i32[i] = a_.i32[i] ^
870
+ ((a_.i32[i] ^ b_.i32[i]) & mask_.i32[i]);
871
+ }
872
+ #endif
873
+
874
+ return simde__m128_from_private(r_);
875
+ #endif
876
+ }
877
+
878
+ SIMDE_FUNCTION_ATTRIBUTES
879
+ simde__m64 simde_mm_avg_pu16(simde__m64 a, simde__m64 b)
880
+ {
881
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
882
+ return _mm_avg_pu16(a, b);
883
+ #else
884
+ simde__m64_private r_, a_ = simde__m64_to_private(a),
885
+ b_ = simde__m64_to_private(b);
886
+
887
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
888
+ r_.neon_u16 = vrhadd_u16(b_.neon_u16, a_.neon_u16);
889
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && \
890
+ defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \
891
+ defined(SIMDE_CONVERT_VECTOR_)
892
+ uint32_t wa SIMDE_VECTOR(16);
893
+ uint32_t wb SIMDE_VECTOR(16);
894
+ uint32_t wr SIMDE_VECTOR(16);
895
+ SIMDE_CONVERT_VECTOR_(wa, a_.u16);
896
+ SIMDE_CONVERT_VECTOR_(wb, b_.u16);
897
+ wr = (wa + wb + 1) >> 1;
898
+ SIMDE_CONVERT_VECTOR_(r_.u16, wr);
899
+ #else
900
+ SIMDE_VECTORIZE
901
+ for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
902
+ r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
903
+ }
904
+ #endif
905
+
906
+ return simde__m64_from_private(r_);
907
+ #endif
908
+ }
909
+ #define simde_m_pavgw(a, b) simde_mm_avg_pu16(a, b)
910
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
911
+ #define _mm_avg_pu16(a, b) simde_mm_avg_pu16(a, b)
912
+ #define _m_pavgw(a, b) simde_mm_avg_pu16(a, b)
913
+ #endif
914
+
915
+ SIMDE_FUNCTION_ATTRIBUTES
916
+ simde__m64 simde_mm_avg_pu8(simde__m64 a, simde__m64 b)
917
+ {
918
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
919
+ return _mm_avg_pu8(a, b);
920
+ #else
921
+ simde__m64_private r_, a_ = simde__m64_to_private(a),
922
+ b_ = simde__m64_to_private(b);
923
+
924
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
925
+ r_.neon_u8 = vrhadd_u8(b_.neon_u8, a_.neon_u8);
926
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && \
927
+ defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \
928
+ defined(SIMDE_CONVERT_VECTOR_)
929
+ uint16_t wa SIMDE_VECTOR(16);
930
+ uint16_t wb SIMDE_VECTOR(16);
931
+ uint16_t wr SIMDE_VECTOR(16);
932
+ SIMDE_CONVERT_VECTOR_(wa, a_.u8);
933
+ SIMDE_CONVERT_VECTOR_(wb, b_.u8);
934
+ wr = (wa + wb + 1) >> 1;
935
+ SIMDE_CONVERT_VECTOR_(r_.u8, wr);
936
+ #else
937
+ SIMDE_VECTORIZE
938
+ for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
939
+ r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
940
+ }
941
+ #endif
942
+
943
+ return simde__m64_from_private(r_);
944
+ #endif
945
+ }
946
+ #define simde_m_pavgb(a, b) simde_mm_avg_pu8(a, b)
947
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
948
+ #define _mm_avg_pu8(a, b) simde_mm_avg_pu8(a, b)
949
+ #define _m_pavgb(a, b) simde_mm_avg_pu8(a, b)
950
+ #endif
951
+
952
+ SIMDE_FUNCTION_ATTRIBUTES
953
+ simde__m128 simde_x_mm_abs_ps(simde__m128 a)
954
+ {
955
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && \
956
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7, 1, 0))
957
+ return _mm512_castps512_ps128(_mm512_abs_ps(_mm512_castps128_ps512(a)));
958
+ #else
959
+ simde__m128_private r_, a_ = simde__m128_to_private(a);
960
+
961
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
962
+ r_.neon_f32 = vabsq_f32(a_.neon_f32);
963
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
964
+ r_.altivec_f32 = vec_abs(a_.altivec_f32);
965
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
966
+ r_.wasm_v128 = wasm_f32x4_abs(a_.wasm_v128);
967
+ #else
968
+ SIMDE_VECTORIZE
969
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
970
+ r_.f32[i] = simde_math_fabsf(a_.f32[i]);
971
+ }
972
+ #endif
973
+
974
+ return simde__m128_from_private(r_);
975
+ #endif
976
+ }
977
+
978
+ SIMDE_FUNCTION_ATTRIBUTES
979
+ simde__m128 simde_mm_cmpeq_ps(simde__m128 a, simde__m128 b)
980
+ {
981
+ #if defined(SIMDE_X86_SSE_NATIVE)
982
+ return _mm_cmpeq_ps(a, b);
983
+ #else
984
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
985
+ b_ = simde__m128_to_private(b);
986
+
987
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
988
+ r_.neon_u32 = vceqq_f32(a_.neon_f32, b_.neon_f32);
989
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
990
+ r_.wasm_v128 = wasm_f32x4_eq(a_.wasm_v128, b_.wasm_v128);
991
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
992
+ r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
993
+ SIMDE_POWER_ALTIVEC_VECTOR(float),
994
+ vec_cmpeq(a_.altivec_f32, b_.altivec_f32));
995
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
996
+ r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), a_.f32 == b_.f32);
997
+ #else
998
+ SIMDE_VECTORIZE
999
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1000
+ r_.u32[i] = (a_.f32[i] == b_.f32[i]) ? ~UINT32_C(0)
1001
+ : UINT32_C(0);
1002
+ }
1003
+ #endif
1004
+
1005
+ return simde__m128_from_private(r_);
1006
+ #endif
1007
+ }
1008
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1009
+ #define _mm_cmpeq_ps(a, b) simde_mm_cmpeq_ps((a), (b))
1010
+ #endif
1011
+
1012
+ SIMDE_FUNCTION_ATTRIBUTES
1013
+ simde__m128 simde_mm_cmpeq_ss(simde__m128 a, simde__m128 b)
1014
+ {
1015
+ #if defined(SIMDE_X86_SSE_NATIVE)
1016
+ return _mm_cmpeq_ss(a, b);
1017
+ #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1018
+ return simde_mm_move_ss(a, simde_mm_cmpeq_ps(a, b));
1019
+ #else
1020
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
1021
+ b_ = simde__m128_to_private(b);
1022
+
1023
+ r_.u32[0] = (a_.f32[0] == b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
1024
+ SIMDE_VECTORIZE
1025
+ for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1026
+ r_.u32[i] = a_.u32[i];
1027
+ }
1028
+
1029
+ return simde__m128_from_private(r_);
1030
+ #endif
1031
+ }
1032
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1033
+ #define _mm_cmpeq_ss(a, b) simde_mm_cmpeq_ss((a), (b))
1034
+ #endif
1035
+
1036
+ SIMDE_FUNCTION_ATTRIBUTES
1037
+ simde__m128 simde_mm_cmpge_ps(simde__m128 a, simde__m128 b)
1038
+ {
1039
+ #if defined(SIMDE_X86_SSE_NATIVE)
1040
+ return _mm_cmpge_ps(a, b);
1041
+ #else
1042
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
1043
+ b_ = simde__m128_to_private(b);
1044
+
1045
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1046
+ r_.neon_u32 = vcgeq_f32(a_.neon_f32, b_.neon_f32);
1047
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1048
+ r_.wasm_v128 = wasm_f32x4_ge(a_.wasm_v128, b_.wasm_v128);
1049
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1050
+ r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
1051
+ SIMDE_POWER_ALTIVEC_VECTOR(float),
1052
+ vec_cmpge(a_.altivec_f32, b_.altivec_f32));
1053
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1054
+ r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 >= b_.f32));
1055
+ #else
1056
+ SIMDE_VECTORIZE
1057
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1058
+ r_.u32[i] = (a_.f32[i] >= b_.f32[i]) ? ~UINT32_C(0)
1059
+ : UINT32_C(0);
1060
+ }
1061
+ #endif
1062
+
1063
+ return simde__m128_from_private(r_);
1064
+ #endif
1065
+ }
1066
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1067
+ #define _mm_cmpge_ps(a, b) simde_mm_cmpge_ps((a), (b))
1068
+ #endif
1069
+
1070
+ SIMDE_FUNCTION_ATTRIBUTES
1071
+ simde__m128 simde_mm_cmpge_ss(simde__m128 a, simde__m128 b)
1072
+ {
1073
+ #if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI)
1074
+ return _mm_cmpge_ss(a, b);
1075
+ #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1076
+ return simde_mm_move_ss(a, simde_mm_cmpge_ps(a, b));
1077
+ #else
1078
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
1079
+ b_ = simde__m128_to_private(b);
1080
+
1081
+ r_.u32[0] = (a_.f32[0] >= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
1082
+ SIMDE_VECTORIZE
1083
+ for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1084
+ r_.u32[i] = a_.u32[i];
1085
+ }
1086
+
1087
+ return simde__m128_from_private(r_);
1088
+ #endif
1089
+ }
1090
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1091
+ #define _mm_cmpge_ss(a, b) simde_mm_cmpge_ss((a), (b))
1092
+ #endif
1093
+
1094
+ SIMDE_FUNCTION_ATTRIBUTES
1095
+ simde__m128 simde_mm_cmpgt_ps(simde__m128 a, simde__m128 b)
1096
+ {
1097
+ #if defined(SIMDE_X86_SSE_NATIVE)
1098
+ return _mm_cmpgt_ps(a, b);
1099
+ #else
1100
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
1101
+ b_ = simde__m128_to_private(b);
1102
+
1103
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1104
+ r_.neon_u32 = vcgtq_f32(a_.neon_f32, b_.neon_f32);
1105
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1106
+ r_.wasm_v128 = wasm_f32x4_gt(a_.wasm_v128, b_.wasm_v128);
1107
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1108
+ r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
1109
+ SIMDE_POWER_ALTIVEC_VECTOR(float),
1110
+ vec_cmpgt(a_.altivec_f32, b_.altivec_f32));
1111
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1112
+ r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 > b_.f32));
1113
+ #else
1114
+ SIMDE_VECTORIZE
1115
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1116
+ r_.u32[i] = (a_.f32[i] > b_.f32[i]) ? ~UINT32_C(0)
1117
+ : UINT32_C(0);
1118
+ }
1119
+ #endif
1120
+
1121
+ return simde__m128_from_private(r_);
1122
+ #endif
1123
+ }
1124
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1125
+ #define _mm_cmpgt_ps(a, b) simde_mm_cmpgt_ps((a), (b))
1126
+ #endif
1127
+
1128
+ SIMDE_FUNCTION_ATTRIBUTES
1129
+ simde__m128 simde_mm_cmpgt_ss(simde__m128 a, simde__m128 b)
1130
+ {
1131
+ #if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI)
1132
+ return _mm_cmpgt_ss(a, b);
1133
+ #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1134
+ return simde_mm_move_ss(a, simde_mm_cmpgt_ps(a, b));
1135
+ #else
1136
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
1137
+ b_ = simde__m128_to_private(b);
1138
+
1139
+ r_.u32[0] = (a_.f32[0] > b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
1140
+ SIMDE_VECTORIZE
1141
+ for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1142
+ r_.u32[i] = a_.u32[i];
1143
+ }
1144
+
1145
+ return simde__m128_from_private(r_);
1146
+ #endif
1147
+ }
1148
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1149
+ #define _mm_cmpgt_ss(a, b) simde_mm_cmpgt_ss((a), (b))
1150
+ #endif
1151
+
1152
+ SIMDE_FUNCTION_ATTRIBUTES
1153
+ simde__m128 simde_mm_cmple_ps(simde__m128 a, simde__m128 b)
1154
+ {
1155
+ #if defined(SIMDE_X86_SSE_NATIVE)
1156
+ return _mm_cmple_ps(a, b);
1157
+ #else
1158
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
1159
+ b_ = simde__m128_to_private(b);
1160
+
1161
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1162
+ r_.neon_u32 = vcleq_f32(a_.neon_f32, b_.neon_f32);
1163
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1164
+ r_.wasm_v128 = wasm_f32x4_le(a_.wasm_v128, b_.wasm_v128);
1165
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1166
+ r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
1167
+ SIMDE_POWER_ALTIVEC_VECTOR(float),
1168
+ vec_cmple(a_.altivec_f32, b_.altivec_f32));
1169
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1170
+ r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 <= b_.f32));
1171
+ #else
1172
+ SIMDE_VECTORIZE
1173
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1174
+ r_.u32[i] = (a_.f32[i] <= b_.f32[i]) ? ~UINT32_C(0)
1175
+ : UINT32_C(0);
1176
+ }
1177
+ #endif
1178
+
1179
+ return simde__m128_from_private(r_);
1180
+ #endif
1181
+ }
1182
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1183
+ #define _mm_cmple_ps(a, b) simde_mm_cmple_ps((a), (b))
1184
+ #endif
1185
+
1186
+ SIMDE_FUNCTION_ATTRIBUTES
1187
+ simde__m128 simde_mm_cmple_ss(simde__m128 a, simde__m128 b)
1188
+ {
1189
+ #if defined(SIMDE_X86_SSE_NATIVE)
1190
+ return _mm_cmple_ss(a, b);
1191
+ #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1192
+ return simde_mm_move_ss(a, simde_mm_cmple_ps(a, b));
1193
+ #else
1194
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
1195
+ b_ = simde__m128_to_private(b);
1196
+
1197
+ r_.u32[0] = (a_.f32[0] <= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
1198
+ SIMDE_VECTORIZE
1199
+ for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1200
+ r_.u32[i] = a_.u32[i];
1201
+ }
1202
+
1203
+ return simde__m128_from_private(r_);
1204
+ #endif
1205
+ }
1206
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1207
+ #define _mm_cmple_ss(a, b) simde_mm_cmple_ss((a), (b))
1208
+ #endif
1209
+
1210
+ SIMDE_FUNCTION_ATTRIBUTES
1211
+ simde__m128 simde_mm_cmplt_ps(simde__m128 a, simde__m128 b)
1212
+ {
1213
+ #if defined(SIMDE_X86_SSE_NATIVE)
1214
+ return _mm_cmplt_ps(a, b);
1215
+ #else
1216
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
1217
+ b_ = simde__m128_to_private(b);
1218
+
1219
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1220
+ r_.neon_u32 = vcltq_f32(a_.neon_f32, b_.neon_f32);
1221
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1222
+ r_.wasm_v128 = wasm_f32x4_lt(a_.wasm_v128, b_.wasm_v128);
1223
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1224
+ r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
1225
+ SIMDE_POWER_ALTIVEC_VECTOR(float),
1226
+ vec_cmplt(a_.altivec_f32, b_.altivec_f32));
1227
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1228
+ r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 < b_.f32));
1229
+ #else
1230
+ SIMDE_VECTORIZE
1231
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1232
+ r_.u32[i] = (a_.f32[i] < b_.f32[i]) ? ~UINT32_C(0)
1233
+ : UINT32_C(0);
1234
+ }
1235
+ #endif
1236
+
1237
+ return simde__m128_from_private(r_);
1238
+ #endif
1239
+ }
1240
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1241
+ #define _mm_cmplt_ps(a, b) simde_mm_cmplt_ps((a), (b))
1242
+ #endif
1243
+
1244
+ SIMDE_FUNCTION_ATTRIBUTES
1245
+ simde__m128 simde_mm_cmplt_ss(simde__m128 a, simde__m128 b)
1246
+ {
1247
+ #if defined(SIMDE_X86_SSE_NATIVE)
1248
+ return _mm_cmplt_ss(a, b);
1249
+ #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1250
+ return simde_mm_move_ss(a, simde_mm_cmplt_ps(a, b));
1251
+ #else
1252
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
1253
+ b_ = simde__m128_to_private(b);
1254
+
1255
+ r_.u32[0] = (a_.f32[0] < b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
1256
+ SIMDE_VECTORIZE
1257
+ for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1258
+ r_.u32[i] = a_.u32[i];
1259
+ }
1260
+
1261
+ return simde__m128_from_private(r_);
1262
+ #endif
1263
+ }
1264
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1265
+ #define _mm_cmplt_ss(a, b) simde_mm_cmplt_ss((a), (b))
1266
+ #endif
1267
+
1268
+ SIMDE_FUNCTION_ATTRIBUTES
1269
+ simde__m128 simde_mm_cmpneq_ps(simde__m128 a, simde__m128 b)
1270
+ {
1271
+ #if defined(SIMDE_X86_SSE_NATIVE)
1272
+ return _mm_cmpneq_ps(a, b);
1273
+ #else
1274
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
1275
+ b_ = simde__m128_to_private(b);
1276
+
1277
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1278
+ r_.neon_u32 = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32));
1279
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1280
+ r_.wasm_v128 = wasm_f32x4_ne(a_.wasm_v128, b_.wasm_v128);
1281
+ #elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE) && SIMDE_ARCH_POWER_CHECK(900) && \
1282
+ !defined(HEDLEY_IBM_VERSION)
1283
+ /* vec_cmpne(SIMDE_POWER_ALTIVEC_VECTOR(float), SIMDE_POWER_ALTIVEC_VECTOR(float))
1284
+ is missing from XL C/C++ v16.1.1,
1285
+ though the documentation (table 89 on page 432 of the IBM XL C/C++ for
1286
+ Linux Compiler Reference, Version 16.1.1) shows that it should be
1287
+ present. Both GCC and clang support it. */
1288
+ r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
1289
+ SIMDE_POWER_ALTIVEC_VECTOR(float),
1290
+ vec_cmpne(a_.altivec_f32, b_.altivec_f32));
1291
+ #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1292
+ r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
1293
+ SIMDE_POWER_ALTIVEC_VECTOR(float),
1294
+ vec_cmpeq(a_.altivec_f32, b_.altivec_f32));
1295
+ r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
1296
+ SIMDE_POWER_ALTIVEC_VECTOR(float),
1297
+ vec_nor(r_.altivec_f32, r_.altivec_f32));
1298
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1299
+ r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 != b_.f32));
1300
+ #else
1301
+ SIMDE_VECTORIZE
1302
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1303
+ r_.u32[i] = (a_.f32[i] != b_.f32[i]) ? ~UINT32_C(0)
1304
+ : UINT32_C(0);
1305
+ }
1306
+ #endif
1307
+
1308
+ return simde__m128_from_private(r_);
1309
+ #endif
1310
+ }
1311
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1312
+ #define _mm_cmpneq_ps(a, b) simde_mm_cmpneq_ps((a), (b))
1313
+ #endif
1314
+
1315
+ SIMDE_FUNCTION_ATTRIBUTES
1316
+ simde__m128 simde_mm_cmpneq_ss(simde__m128 a, simde__m128 b)
1317
+ {
1318
+ #if defined(SIMDE_X86_SSE_NATIVE)
1319
+ return _mm_cmpneq_ss(a, b);
1320
+ #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1321
+ return simde_mm_move_ss(a, simde_mm_cmpneq_ps(a, b));
1322
+ #else
1323
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
1324
+ b_ = simde__m128_to_private(b);
1325
+
1326
+ r_.u32[0] = (a_.f32[0] != b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
1327
+ SIMDE_VECTORIZE
1328
+ for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1329
+ r_.u32[i] = a_.u32[i];
1330
+ }
1331
+
1332
+ return simde__m128_from_private(r_);
1333
+ #endif
1334
+ }
1335
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1336
+ #define _mm_cmpneq_ss(a, b) simde_mm_cmpneq_ss((a), (b))
1337
+ #endif
1338
+
1339
+ SIMDE_FUNCTION_ATTRIBUTES
1340
+ simde__m128 simde_mm_cmpnge_ps(simde__m128 a, simde__m128 b)
1341
+ {
1342
+ return simde_mm_cmplt_ps(a, b);
1343
+ }
1344
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1345
+ #define _mm_cmpnge_ps(a, b) simde_mm_cmpnge_ps((a), (b))
1346
+ #endif
1347
+
1348
+ SIMDE_FUNCTION_ATTRIBUTES
1349
+ simde__m128 simde_mm_cmpnge_ss(simde__m128 a, simde__m128 b)
1350
+ {
1351
+ return simde_mm_cmplt_ss(a, b);
1352
+ }
1353
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1354
+ #define _mm_cmpnge_ss(a, b) simde_mm_cmpnge_ss((a), (b))
1355
+ #endif
1356
+
1357
+ SIMDE_FUNCTION_ATTRIBUTES
1358
+ simde__m128 simde_mm_cmpngt_ps(simde__m128 a, simde__m128 b)
1359
+ {
1360
+ return simde_mm_cmple_ps(a, b);
1361
+ }
1362
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1363
+ #define _mm_cmpngt_ps(a, b) simde_mm_cmpngt_ps((a), (b))
1364
+ #endif
1365
+
1366
+ SIMDE_FUNCTION_ATTRIBUTES
1367
+ simde__m128 simde_mm_cmpngt_ss(simde__m128 a, simde__m128 b)
1368
+ {
1369
+ return simde_mm_cmple_ss(a, b);
1370
+ }
1371
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1372
+ #define _mm_cmpngt_ss(a, b) simde_mm_cmpngt_ss((a), (b))
1373
+ #endif
1374
+
1375
+ SIMDE_FUNCTION_ATTRIBUTES
1376
+ simde__m128 simde_mm_cmpnle_ps(simde__m128 a, simde__m128 b)
1377
+ {
1378
+ return simde_mm_cmpgt_ps(a, b);
1379
+ }
1380
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1381
+ #define _mm_cmpnle_ps(a, b) simde_mm_cmpnle_ps((a), (b))
1382
+ #endif
1383
+
1384
+ SIMDE_FUNCTION_ATTRIBUTES
1385
+ simde__m128 simde_mm_cmpnle_ss(simde__m128 a, simde__m128 b)
1386
+ {
1387
+ return simde_mm_cmpgt_ss(a, b);
1388
+ }
1389
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1390
+ #define _mm_cmpnle_ss(a, b) simde_mm_cmpnle_ss((a), (b))
1391
+ #endif
1392
+
1393
+ SIMDE_FUNCTION_ATTRIBUTES
1394
+ simde__m128 simde_mm_cmpnlt_ps(simde__m128 a, simde__m128 b)
1395
+ {
1396
+ return simde_mm_cmpge_ps(a, b);
1397
+ }
1398
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1399
+ #define _mm_cmpnlt_ps(a, b) simde_mm_cmpnlt_ps((a), (b))
1400
+ #endif
1401
+
1402
+ SIMDE_FUNCTION_ATTRIBUTES
1403
+ simde__m128 simde_mm_cmpnlt_ss(simde__m128 a, simde__m128 b)
1404
+ {
1405
+ return simde_mm_cmpge_ss(a, b);
1406
+ }
1407
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1408
+ #define _mm_cmpnlt_ss(a, b) simde_mm_cmpnlt_ss((a), (b))
1409
+ #endif
1410
+
1411
+ SIMDE_FUNCTION_ATTRIBUTES
1412
+ simde__m128 simde_mm_cmpord_ps(simde__m128 a, simde__m128 b)
1413
+ {
1414
+ #if defined(SIMDE_X86_SSE_NATIVE)
1415
+ return _mm_cmpord_ps(a, b);
1416
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1417
+ return wasm_v128_and(wasm_f32x4_eq(a, a), wasm_f32x4_eq(b, b));
1418
+ #else
1419
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
1420
+ b_ = simde__m128_to_private(b);
1421
+
1422
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1423
+ /* Note: NEON does not have ordered compare builtin
1424
+ Need to compare a eq a and b eq b to check for NaN
1425
+ Do AND of results to get final */
1426
+ uint32x4_t ceqaa = vceqq_f32(a_.neon_f32, a_.neon_f32);
1427
+ uint32x4_t ceqbb = vceqq_f32(b_.neon_f32, b_.neon_f32);
1428
+ r_.neon_u32 = vandq_u32(ceqaa, ceqbb);
1429
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1430
+ r_.wasm_v128 = wasm_v128_and(wasm_f32x4_eq(a_.wasm_v128, a_.wasm_v128),
1431
+ wasm_f32x4_eq(b_.wasm_v128, b_.wasm_v128));
1432
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1433
+ r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
1434
+ SIMDE_POWER_ALTIVEC_VECTOR(float),
1435
+ vec_and(vec_cmpeq(a_.altivec_f32, a_.altivec_f32),
1436
+ vec_cmpeq(b_.altivec_f32, b_.altivec_f32)));
1437
+ #elif defined(simde_math_isnanf)
1438
+ SIMDE_VECTORIZE
1439
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1440
+ r_.u32[i] = (simde_math_isnanf(a_.f32[i]) ||
1441
+ simde_math_isnanf(b_.f32[i]))
1442
+ ? UINT32_C(0)
1443
+ : ~UINT32_C(0);
1444
+ }
1445
+ #else
1446
+ HEDLEY_UNREACHABLE();
1447
+ #endif
1448
+
1449
+ return simde__m128_from_private(r_);
1450
+ #endif
1451
+ }
1452
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1453
+ #define _mm_cmpord_ps(a, b) simde_mm_cmpord_ps((a), (b))
1454
+ #endif
1455
+
1456
+ SIMDE_FUNCTION_ATTRIBUTES
1457
+ simde__m128 simde_mm_cmpunord_ps(simde__m128 a, simde__m128 b)
1458
+ {
1459
+ #if defined(SIMDE_X86_SSE_NATIVE)
1460
+ return _mm_cmpunord_ps(a, b);
1461
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1462
+ return wasm_v128_or(wasm_f32x4_ne(a, a), wasm_f32x4_ne(b, b));
1463
+ #else
1464
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
1465
+ b_ = simde__m128_to_private(b);
1466
+
1467
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1468
+ uint32x4_t ceqaa = vceqq_f32(a_.neon_f32, a_.neon_f32);
1469
+ uint32x4_t ceqbb = vceqq_f32(b_.neon_f32, b_.neon_f32);
1470
+ r_.neon_u32 = vmvnq_u32(vandq_u32(ceqaa, ceqbb));
1471
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1472
+ r_.wasm_v128 = wasm_v128_or(wasm_f32x4_ne(a_.wasm_v128, a_.wasm_v128),
1473
+ wasm_f32x4_ne(b_.wasm_v128, b_.wasm_v128));
1474
+ #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
1475
+ r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
1476
+ SIMDE_POWER_ALTIVEC_VECTOR(float),
1477
+ vec_nand(vec_cmpeq(a_.altivec_f32, a_.altivec_f32),
1478
+ vec_cmpeq(b_.altivec_f32, b_.altivec_f32)));
1479
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1480
+ r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
1481
+ SIMDE_POWER_ALTIVEC_VECTOR(float),
1482
+ vec_and(vec_cmpeq(a_.altivec_f32, a_.altivec_f32),
1483
+ vec_cmpeq(b_.altivec_f32, b_.altivec_f32)));
1484
+ r_.altivec_f32 = vec_nor(r_.altivec_f32, r_.altivec_f32);
1485
+ #elif defined(simde_math_isnanf)
1486
+ SIMDE_VECTORIZE
1487
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1488
+ r_.u32[i] = (simde_math_isnanf(a_.f32[i]) ||
1489
+ simde_math_isnanf(b_.f32[i]))
1490
+ ? ~UINT32_C(0)
1491
+ : UINT32_C(0);
1492
+ }
1493
+ #else
1494
+ HEDLEY_UNREACHABLE();
1495
+ #endif
1496
+
1497
+ return simde__m128_from_private(r_);
1498
+ #endif
1499
+ }
1500
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1501
+ #define _mm_cmpunord_ps(a, b) simde_mm_cmpunord_ps((a), (b))
1502
+ #endif
1503
+
1504
+ SIMDE_FUNCTION_ATTRIBUTES
1505
+ simde__m128 simde_mm_cmpunord_ss(simde__m128 a, simde__m128 b)
1506
+ {
1507
+ #if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI)
1508
+ return _mm_cmpunord_ss(a, b);
1509
+ #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1510
+ return simde_mm_move_ss(a, simde_mm_cmpunord_ps(a, b));
1511
+ #else
1512
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
1513
+ b_ = simde__m128_to_private(b);
1514
+
1515
+ #if defined(simde_math_isnanf)
1516
+ r_.u32[0] =
1517
+ (simde_math_isnanf(a_.f32[0]) || simde_math_isnanf(b_.f32[0]))
1518
+ ? ~UINT32_C(0)
1519
+ : UINT32_C(0);
1520
+ SIMDE_VECTORIZE
1521
+ for (size_t i = 1; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
1522
+ r_.u32[i] = a_.u32[i];
1523
+ }
1524
+ #else
1525
+ HEDLEY_UNREACHABLE();
1526
+ #endif
1527
+
1528
+ return simde__m128_from_private(r_);
1529
+ #endif
1530
+ }
1531
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1532
+ #define _mm_cmpunord_ss(a, b) simde_mm_cmpunord_ss((a), (b))
1533
+ #endif
1534
+
1535
+ SIMDE_FUNCTION_ATTRIBUTES
1536
+ int simde_mm_comieq_ss(simde__m128 a, simde__m128 b)
1537
+ {
1538
+ #if defined(SIMDE_X86_SSE_NATIVE)
1539
+ return _mm_comieq_ss(a, b);
1540
+ #else
1541
+ simde__m128_private a_ = simde__m128_to_private(a),
1542
+ b_ = simde__m128_to_private(b);
1543
+
1544
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1545
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1546
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1547
+ uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
1548
+ uint32x4_t a_eq_b = vceqq_f32(a_.neon_f32, b_.neon_f32);
1549
+ return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0);
1550
+ #else
1551
+ return a_.f32[0] == b_.f32[0];
1552
+ #endif
1553
+ #endif
1554
+ }
1555
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1556
+ #define _mm_comieq_ss(a, b) simde_mm_comieq_ss((a), (b))
1557
+ #endif
1558
+
1559
+ SIMDE_FUNCTION_ATTRIBUTES
1560
+ int simde_mm_comige_ss(simde__m128 a, simde__m128 b)
1561
+ {
1562
+ #if defined(SIMDE_X86_SSE_NATIVE)
1563
+ return _mm_comige_ss(a, b);
1564
+ #else
1565
+ simde__m128_private a_ = simde__m128_to_private(a),
1566
+ b_ = simde__m128_to_private(b);
1567
+
1568
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1569
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1570
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1571
+ uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1572
+ uint32x4_t a_ge_b = vcgeq_f32(a_.neon_f32, b_.neon_f32);
1573
+ return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0);
1574
+ #else
1575
+ return a_.f32[0] >= b_.f32[0];
1576
+ #endif
1577
+ #endif
1578
+ }
1579
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1580
+ #define _mm_comige_ss(a, b) simde_mm_comige_ss((a), (b))
1581
+ #endif
1582
+
1583
+ SIMDE_FUNCTION_ATTRIBUTES
1584
+ int simde_mm_comigt_ss(simde__m128 a, simde__m128 b)
1585
+ {
1586
+ #if defined(SIMDE_X86_SSE_NATIVE)
1587
+ return _mm_comigt_ss(a, b);
1588
+ #else
1589
+ simde__m128_private a_ = simde__m128_to_private(a),
1590
+ b_ = simde__m128_to_private(b);
1591
+
1592
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1593
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1594
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1595
+ uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1596
+ uint32x4_t a_gt_b = vcgtq_f32(a_.neon_f32, b_.neon_f32);
1597
+ return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0);
1598
+ #else
1599
+ return a_.f32[0] > b_.f32[0];
1600
+ #endif
1601
+ #endif
1602
+ }
1603
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1604
+ #define _mm_comigt_ss(a, b) simde_mm_comigt_ss((a), (b))
1605
+ #endif
1606
+
1607
+ SIMDE_FUNCTION_ATTRIBUTES
1608
+ int simde_mm_comile_ss(simde__m128 a, simde__m128 b)
1609
+ {
1610
+ #if defined(SIMDE_X86_SSE_NATIVE)
1611
+ return _mm_comile_ss(a, b);
1612
+ #else
1613
+ simde__m128_private a_ = simde__m128_to_private(a),
1614
+ b_ = simde__m128_to_private(b);
1615
+
1616
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1617
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1618
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1619
+ uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
1620
+ uint32x4_t a_le_b = vcleq_f32(a_.neon_f32, b_.neon_f32);
1621
+ return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0);
1622
+ #else
1623
+ return a_.f32[0] <= b_.f32[0];
1624
+ #endif
1625
+ #endif
1626
+ }
1627
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1628
+ #define _mm_comile_ss(a, b) simde_mm_comile_ss((a), (b))
1629
+ #endif
1630
+
1631
+ SIMDE_FUNCTION_ATTRIBUTES
1632
+ int simde_mm_comilt_ss(simde__m128 a, simde__m128 b)
1633
+ {
1634
+ #if defined(SIMDE_X86_SSE_NATIVE)
1635
+ return _mm_comilt_ss(a, b);
1636
+ #else
1637
+ simde__m128_private a_ = simde__m128_to_private(a),
1638
+ b_ = simde__m128_to_private(b);
1639
+
1640
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1641
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1642
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1643
+ uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
1644
+ uint32x4_t a_lt_b = vcltq_f32(a_.neon_f32, b_.neon_f32);
1645
+ return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0);
1646
+ #else
1647
+ return a_.f32[0] < b_.f32[0];
1648
+ #endif
1649
+ #endif
1650
+ }
1651
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1652
+ #define _mm_comilt_ss(a, b) simde_mm_comilt_ss((a), (b))
1653
+ #endif
1654
+
1655
+ SIMDE_FUNCTION_ATTRIBUTES
1656
+ int simde_mm_comineq_ss(simde__m128 a, simde__m128 b)
1657
+ {
1658
+ #if defined(SIMDE_X86_SSE_NATIVE)
1659
+ return _mm_comineq_ss(a, b);
1660
+ #else
1661
+ simde__m128_private a_ = simde__m128_to_private(a),
1662
+ b_ = simde__m128_to_private(b);
1663
+
1664
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1665
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1666
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1667
+ uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1668
+ uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32));
1669
+ return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0);
1670
+ #else
1671
+ return a_.f32[0] != b_.f32[0];
1672
+ #endif
1673
+ #endif
1674
+ }
1675
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1676
+ #define _mm_comineq_ss(a, b) simde_mm_comineq_ss((a), (b))
1677
+ #endif
1678
+
1679
+ SIMDE_FUNCTION_ATTRIBUTES
1680
+ simde__m128 simde_x_mm_copysign_ps(simde__m128 dest, simde__m128 src)
1681
+ {
1682
+ simde__m128_private r_, dest_ = simde__m128_to_private(dest),
1683
+ src_ = simde__m128_to_private(src);
1684
+
1685
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1686
+ const uint32x4_t sign_pos =
1687
+ vreinterpretq_u32_f32(vdupq_n_f32(-SIMDE_FLOAT32_C(0.0)));
1688
+ r_.neon_u32 = vbslq_u32(sign_pos, src_.neon_u32, dest_.neon_u32);
1689
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1690
+ const v128_t sign_pos = wasm_f32x4_splat(-0.0f);
1691
+ r_.wasm_v128 =
1692
+ wasm_v128_bitselect(src_.wasm_v128, dest_.wasm_v128, sign_pos);
1693
+ #elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE)
1694
+ #if !defined(HEDLEY_IBM_VERSION)
1695
+ r_.altivec_f32 = vec_cpsgn(dest_.altivec_f32, src_.altivec_f32);
1696
+ #else
1697
+ r_.altivec_f32 = vec_cpsgn(src_.altivec_f32, dest_.altivec_f32);
1698
+ #endif
1699
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1700
+ const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)
1701
+ sign_pos = HEDLEY_REINTERPRET_CAST(
1702
+ SIMDE_POWER_ALTIVEC_VECTOR(unsigned int),
1703
+ vec_splats(-0.0f));
1704
+ r_.altivec_f32 = vec_sel(dest_.altivec_f32, src_.altivec_f32, sign_pos);
1705
+ #elif defined(SIMDE_IEEE754_STORAGE)
1706
+ (void)src_;
1707
+ (void)dest_;
1708
+ simde__m128 sign_pos = simde_mm_set1_ps(-0.0f);
1709
+ r_ = simde__m128_to_private(simde_mm_xor_ps(
1710
+ dest, simde_mm_and_ps(simde_mm_xor_ps(dest, src), sign_pos)));
1711
+ #else
1712
+ SIMDE_VECTORIZE
1713
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1714
+ r_.f32[i] = simde_math_copysignf(dest_.f32[i], src_.f32[i]);
1715
+ }
1716
+ #endif
1717
+
1718
+ return simde__m128_from_private(r_);
1719
+ }
1720
+
1721
+ SIMDE_FUNCTION_ATTRIBUTES
1722
+ simde__m128 simde_x_mm_xorsign_ps(simde__m128 dest, simde__m128 src)
1723
+ {
1724
+ return simde_mm_xor_ps(simde_mm_and_ps(simde_mm_set1_ps(-0.0f), src),
1725
+ dest);
1726
+ }
1727
+
1728
+ SIMDE_FUNCTION_ATTRIBUTES
1729
+ simde__m128 simde_mm_cvt_pi2ps(simde__m128 a, simde__m64 b)
1730
+ {
1731
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1732
+ return _mm_cvt_pi2ps(a, b);
1733
+ #else
1734
+ simde__m128_private r_, a_ = simde__m128_to_private(a);
1735
+ simde__m64_private b_ = simde__m64_to_private(b);
1736
+
1737
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1738
+ r_.neon_f32 = vcombine_f32(vcvt_f32_s32(b_.neon_i32),
1739
+ vget_high_f32(a_.neon_f32));
1740
+ #elif defined(SIMDE_CONVERT_VECTOR_)
1741
+ SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, b_.i32);
1742
+ r_.m64_private[1] = a_.m64_private[1];
1743
+ #else
1744
+ r_.f32[0] = (simde_float32)b_.i32[0];
1745
+ r_.f32[1] = (simde_float32)b_.i32[1];
1746
+ r_.i32[2] = a_.i32[2];
1747
+ r_.i32[3] = a_.i32[3];
1748
+ #endif
1749
+
1750
+ return simde__m128_from_private(r_);
1751
+ #endif
1752
+ }
1753
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1754
+ #define _mm_cvt_pi2ps(a, b) simde_mm_cvt_pi2ps((a), (b))
1755
+ #endif
1756
+
1757
+ SIMDE_FUNCTION_ATTRIBUTES
1758
+ simde__m64 simde_mm_cvt_ps2pi(simde__m128 a)
1759
+ {
1760
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1761
+ return _mm_cvt_ps2pi(a);
1762
+ #else
1763
+ simde__m64_private r_;
1764
+ simde__m128_private a_;
1765
+
1766
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1767
+ a_ = simde__m128_to_private(
1768
+ simde_mm_round_ps(a, SIMDE_MM_FROUND_CUR_DIRECTION));
1769
+ r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32));
1770
+ #elif defined(SIMDE_CONVERT_VECTOR_) && SIMDE_NATURAL_VECTOR_SIZE_GE(128)
1771
+ a_ = simde__m128_to_private(
1772
+ simde_mm_round_ps(a, SIMDE_MM_FROUND_CUR_DIRECTION));
1773
+ SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].f32);
1774
+ #else
1775
+ a_ = simde__m128_to_private(a);
1776
+
1777
+ SIMDE_VECTORIZE
1778
+ for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
1779
+ r_.i32[i] = HEDLEY_STATIC_CAST(
1780
+ int32_t, simde_math_nearbyintf(a_.f32[i]));
1781
+ }
1782
+ #endif
1783
+
1784
+ return simde__m64_from_private(r_);
1785
+ #endif
1786
+ }
1787
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1788
+ #define _mm_cvt_ps2pi(a) simde_mm_cvt_ps2pi((a))
1789
+ #endif
1790
+
1791
+ SIMDE_FUNCTION_ATTRIBUTES
1792
+ simde__m128 simde_mm_cvt_si2ss(simde__m128 a, int32_t b)
1793
+ {
1794
+ #if defined(SIMDE_X86_SSE_NATIVE)
1795
+ return _mm_cvt_si2ss(a, b);
1796
+ #else
1797
+ simde__m128_private r_, a_ = simde__m128_to_private(a);
1798
+
1799
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1800
+ r_.neon_f32 =
1801
+ vsetq_lane_f32(HEDLEY_STATIC_CAST(float, b), a_.neon_f32, 0);
1802
+ #else
1803
+ r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b);
1804
+ r_.i32[1] = a_.i32[1];
1805
+ r_.i32[2] = a_.i32[2];
1806
+ r_.i32[3] = a_.i32[3];
1807
+ #endif
1808
+
1809
+ return simde__m128_from_private(r_);
1810
+ #endif
1811
+ }
1812
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1813
+ #define _mm_cvt_si2ss(a, b) simde_mm_cvt_si2ss((a), b)
1814
+ #endif
1815
+
1816
+ SIMDE_FUNCTION_ATTRIBUTES
1817
+ int32_t simde_mm_cvt_ss2si(simde__m128 a)
1818
+ {
1819
+ #if defined(SIMDE_X86_SSE_NATIVE)
1820
+ return _mm_cvt_ss2si(a);
1821
+ #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
1822
+ return vgetq_lane_s32(vcvtnq_s32_f32(simde__m128_to_neon_f32(a)), 0);
1823
+ #else
1824
+ simde__m128_private a_ = simde__m128_to_private(
1825
+ simde_mm_round_ps(a, SIMDE_MM_FROUND_CUR_DIRECTION));
1826
+ return SIMDE_CONVERT_FTOI(int32_t, a_.f32[0]);
1827
+ #endif
1828
+ }
1829
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1830
+ #define _mm_cvt_ss2si(a) simde_mm_cvt_ss2si((a))
1831
+ #endif
1832
+
1833
+ SIMDE_FUNCTION_ATTRIBUTES
1834
+ simde__m128 simde_mm_cvtpi16_ps(simde__m64 a)
1835
+ {
1836
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1837
+ return _mm_cvtpi16_ps(a);
1838
+ #else
1839
+ simde__m128_private r_;
1840
+ simde__m64_private a_ = simde__m64_to_private(a);
1841
+
1842
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1843
+ r_.neon_f32 = vcvtq_f32_s32(vmovl_s16(a_.neon_i16));
1844
+ #elif defined(SIMDE_CONVERT_VECTOR_)
1845
+ SIMDE_CONVERT_VECTOR_(r_.f32, a_.i16);
1846
+ #else
1847
+ SIMDE_VECTORIZE
1848
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1849
+ simde_float32 v = a_.i16[i];
1850
+ r_.f32[i] = v;
1851
+ }
1852
+ #endif
1853
+
1854
+ return simde__m128_from_private(r_);
1855
+ #endif
1856
+ }
1857
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1858
+ #define _mm_cvtpi16_ps(a) simde_mm_cvtpi16_ps(a)
1859
+ #endif
1860
+
1861
+ SIMDE_FUNCTION_ATTRIBUTES
1862
+ simde__m128 simde_mm_cvtpi32_ps(simde__m128 a, simde__m64 b)
1863
+ {
1864
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1865
+ return _mm_cvtpi32_ps(a, b);
1866
+ #else
1867
+ simde__m128_private r_, a_ = simde__m128_to_private(a);
1868
+ simde__m64_private b_ = simde__m64_to_private(b);
1869
+
1870
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1871
+ r_.neon_f32 = vcombine_f32(vcvt_f32_s32(b_.neon_i32),
1872
+ vget_high_f32(a_.neon_f32));
1873
+ #elif defined(SIMDE_CONVERT_VECTOR_)
1874
+ SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, b_.i32);
1875
+ r_.m64_private[1] = a_.m64_private[1];
1876
+ #else
1877
+ r_.f32[0] = (simde_float32)b_.i32[0];
1878
+ r_.f32[1] = (simde_float32)b_.i32[1];
1879
+ r_.i32[2] = a_.i32[2];
1880
+ r_.i32[3] = a_.i32[3];
1881
+ #endif
1882
+
1883
+ return simde__m128_from_private(r_);
1884
+ #endif
1885
+ }
1886
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1887
+ #define _mm_cvtpi32_ps(a, b) simde_mm_cvtpi32_ps((a), b)
1888
+ #endif
1889
+
1890
+ SIMDE_FUNCTION_ATTRIBUTES
1891
+ simde__m128 simde_mm_cvtpi32x2_ps(simde__m64 a, simde__m64 b)
1892
+ {
1893
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1894
+ return _mm_cvtpi32x2_ps(a, b);
1895
+ #else
1896
+ simde__m128_private r_;
1897
+ simde__m64_private a_ = simde__m64_to_private(a),
1898
+ b_ = simde__m64_to_private(b);
1899
+
1900
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1901
+ r_.neon_f32 = vcvtq_f32_s32(vcombine_s32(a_.neon_i32, b_.neon_i32));
1902
+ #elif defined(SIMDE_CONVERT_VECTOR_)
1903
+ SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, a_.i32);
1904
+ SIMDE_CONVERT_VECTOR_(r_.m64_private[1].f32, b_.i32);
1905
+ #else
1906
+ r_.f32[0] = (simde_float32)a_.i32[0];
1907
+ r_.f32[1] = (simde_float32)a_.i32[1];
1908
+ r_.f32[2] = (simde_float32)b_.i32[0];
1909
+ r_.f32[3] = (simde_float32)b_.i32[1];
1910
+ #endif
1911
+
1912
+ return simde__m128_from_private(r_);
1913
+ #endif
1914
+ }
1915
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1916
+ #define _mm_cvtpi32x2_ps(a, b) simde_mm_cvtpi32x2_ps(a, b)
1917
+ #endif
1918
+
1919
+ SIMDE_FUNCTION_ATTRIBUTES
1920
+ simde__m128 simde_mm_cvtpi8_ps(simde__m64 a)
1921
+ {
1922
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1923
+ return _mm_cvtpi8_ps(a);
1924
+ #else
1925
+ simde__m128_private r_;
1926
+ simde__m64_private a_ = simde__m64_to_private(a);
1927
+
1928
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1929
+ r_.neon_f32 =
1930
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(a_.neon_i8))));
1931
+ #else
1932
+ r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[0]);
1933
+ r_.f32[1] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[1]);
1934
+ r_.f32[2] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[2]);
1935
+ r_.f32[3] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[3]);
1936
+ #endif
1937
+
1938
+ return simde__m128_from_private(r_);
1939
+ #endif
1940
+ }
1941
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1942
+ #define _mm_cvtpi8_ps(a) simde_mm_cvtpi8_ps(a)
1943
+ #endif
1944
+
1945
+ SIMDE_FUNCTION_ATTRIBUTES
1946
+ simde__m64 simde_mm_cvtps_pi16(simde__m128 a)
1947
+ {
1948
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1949
+ return _mm_cvtps_pi16(a);
1950
+ #else
1951
+ simde__m64_private r_;
1952
+ simde__m128_private a_ = simde__m128_to_private(a);
1953
+
1954
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
1955
+ r_.neon_i16 = vmovn_s32(vcvtq_s32_f32(vrndiq_f32(a_.neon_f32)));
1956
+ #else
1957
+ SIMDE_VECTORIZE
1958
+ for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
1959
+ r_.i16[i] = SIMDE_CONVERT_FTOI(int16_t,
1960
+ simde_math_roundf(a_.f32[i]));
1961
+ }
1962
+ #endif
1963
+
1964
+ return simde__m64_from_private(r_);
1965
+ #endif
1966
+ }
1967
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1968
+ #define _mm_cvtps_pi16(a) simde_mm_cvtps_pi16((a))
1969
+ #endif
1970
+
1971
+ SIMDE_FUNCTION_ATTRIBUTES
1972
+ simde__m64 simde_mm_cvtps_pi32(simde__m128 a)
1973
+ {
1974
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1975
+ return _mm_cvtps_pi32(a);
1976
+ #else
1977
+ simde__m64_private r_;
1978
+ simde__m128_private a_ = simde__m128_to_private(a);
1979
+
1980
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \
1981
+ defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_BUG_GCC_95399)
1982
+ r_.neon_i32 = vcvt_s32_f32(vget_low_f32(vrndiq_f32(a_.neon_f32)));
1983
+ #else
1984
+ SIMDE_VECTORIZE
1985
+ for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
1986
+ simde_float32 v = simde_math_roundf(a_.f32[i]);
1987
+ #if !defined(SIMDE_FAST_CONVERSION_RANGE)
1988
+ r_.i32[i] =
1989
+ ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) &&
1990
+ (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX)))
1991
+ ? SIMDE_CONVERT_FTOI(int32_t, v)
1992
+ : INT32_MIN;
1993
+ #else
1994
+ r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
1995
+ #endif
1996
+ }
1997
+ #endif
1998
+
1999
+ return simde__m64_from_private(r_);
2000
+ #endif
2001
+ }
2002
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2003
+ #define _mm_cvtps_pi32(a) simde_mm_cvtps_pi32((a))
2004
+ #endif
2005
+
2006
+ SIMDE_FUNCTION_ATTRIBUTES
2007
+ simde__m64 simde_mm_cvtps_pi8(simde__m128 a)
2008
+ {
2009
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2010
+ return _mm_cvtps_pi8(a);
2011
+ #else
2012
+ simde__m64_private r_;
2013
+ simde__m128_private a_ = simde__m128_to_private(a);
2014
+
2015
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95471)
2016
+ /* Clamp the input to [INT8_MIN, INT8_MAX], round, convert to i32, narrow to
2017
+ * i16, combine with an all-zero vector of i16 (which will become the upper
2018
+ * half), narrow to i8. */
2019
+ float32x4_t max =
2020
+ vdupq_n_f32(HEDLEY_STATIC_CAST(simde_float32, INT8_MAX));
2021
+ float32x4_t min =
2022
+ vdupq_n_f32(HEDLEY_STATIC_CAST(simde_float32, INT8_MIN));
2023
+ float32x4_t values =
2024
+ vrndnq_f32(vmaxq_f32(vminq_f32(max, a_.neon_f32), min));
2025
+ r_.neon_i8 = vmovn_s16(
2026
+ vcombine_s16(vmovn_s32(vcvtq_s32_f32(values)), vdup_n_s16(0)));
2027
+ #else
2028
+ SIMDE_VECTORIZE
2029
+ for (size_t i = 0; i < (sizeof(a_.f32) / sizeof(a_.f32[0])); i++) {
2030
+ if (a_.f32[i] > HEDLEY_STATIC_CAST(simde_float32, INT8_MAX))
2031
+ r_.i8[i] = INT8_MAX;
2032
+ else if (a_.f32[i] <
2033
+ HEDLEY_STATIC_CAST(simde_float32, INT8_MIN))
2034
+ r_.i8[i] = INT8_MIN;
2035
+ else
2036
+ r_.i8[i] = SIMDE_CONVERT_FTOI(
2037
+ int8_t, simde_math_roundf(a_.f32[i]));
2038
+ }
2039
+ /* Note: the upper half is undefined */
2040
+ #endif
2041
+
2042
+ return simde__m64_from_private(r_);
2043
+ #endif
2044
+ }
2045
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2046
+ #define _mm_cvtps_pi8(a) simde_mm_cvtps_pi8((a))
2047
+ #endif
2048
+
2049
+ SIMDE_FUNCTION_ATTRIBUTES
2050
+ simde__m128 simde_mm_cvtpu16_ps(simde__m64 a)
2051
+ {
2052
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2053
+ return _mm_cvtpu16_ps(a);
2054
+ #else
2055
+ simde__m128_private r_;
2056
+ simde__m64_private a_ = simde__m64_to_private(a);
2057
+
2058
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2059
+ r_.neon_f32 = vcvtq_f32_u32(vmovl_u16(a_.neon_u16));
2060
+ #elif defined(SIMDE_CONVERT_VECTOR_)
2061
+ SIMDE_CONVERT_VECTOR_(r_.f32, a_.u16);
2062
+ #else
2063
+ SIMDE_VECTORIZE
2064
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2065
+ r_.f32[i] = (simde_float32)a_.u16[i];
2066
+ }
2067
+ #endif
2068
+
2069
+ return simde__m128_from_private(r_);
2070
+ #endif
2071
+ }
2072
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2073
+ #define _mm_cvtpu16_ps(a) simde_mm_cvtpu16_ps(a)
2074
+ #endif
2075
+
2076
+ SIMDE_FUNCTION_ATTRIBUTES
2077
+ simde__m128 simde_mm_cvtpu8_ps(simde__m64 a)
2078
+ {
2079
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2080
+ return _mm_cvtpu8_ps(a);
2081
+ #else
2082
+ simde__m128_private r_;
2083
+ simde__m64_private a_ = simde__m64_to_private(a);
2084
+
2085
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2086
+ r_.neon_f32 =
2087
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(a_.neon_u8))));
2088
+ #else
2089
+ SIMDE_VECTORIZE
2090
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2091
+ r_.f32[i] = HEDLEY_STATIC_CAST(simde_float32, a_.u8[i]);
2092
+ }
2093
+ #endif
2094
+
2095
+ return simde__m128_from_private(r_);
2096
+ #endif
2097
+ }
2098
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2099
+ #define _mm_cvtpu8_ps(a) simde_mm_cvtpu8_ps(a)
2100
+ #endif
2101
+
2102
+ SIMDE_FUNCTION_ATTRIBUTES
2103
+ simde__m128 simde_mm_cvtsi32_ss(simde__m128 a, int32_t b)
2104
+ {
2105
+ #if defined(SIMDE_X86_SSE_NATIVE)
2106
+ return _mm_cvtsi32_ss(a, b);
2107
+ #else
2108
+ simde__m128_private r_;
2109
+ simde__m128_private a_ = simde__m128_to_private(a);
2110
+
2111
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2112
+ r_.neon_f32 = vsetq_lane_f32(HEDLEY_STATIC_CAST(float32_t, b),
2113
+ a_.neon_f32, 0);
2114
+ #else
2115
+ r_ = a_;
2116
+ r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b);
2117
+ #endif
2118
+
2119
+ return simde__m128_from_private(r_);
2120
+ #endif
2121
+ }
2122
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2123
+ #define _mm_cvtsi32_ss(a, b) simde_mm_cvtsi32_ss((a), b)
2124
+ #endif
2125
+
2126
+ SIMDE_FUNCTION_ATTRIBUTES
2127
+ simde__m128 simde_mm_cvtsi64_ss(simde__m128 a, int64_t b)
2128
+ {
2129
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
2130
+ #if !defined(__PGI)
2131
+ return _mm_cvtsi64_ss(a, b);
2132
+ #else
2133
+ return _mm_cvtsi64x_ss(a, b);
2134
+ #endif
2135
+ #else
2136
+ simde__m128_private r_;
2137
+ simde__m128_private a_ = simde__m128_to_private(a);
2138
+
2139
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2140
+ r_.neon_f32 = vsetq_lane_f32(HEDLEY_STATIC_CAST(float32_t, b),
2141
+ a_.neon_f32, 0);
2142
+ #else
2143
+ r_ = a_;
2144
+ r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b);
2145
+ #endif
2146
+
2147
+ return simde__m128_from_private(r_);
2148
+ #endif
2149
+ }
2150
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2151
+ #define _mm_cvtsi64_ss(a, b) simde_mm_cvtsi64_ss((a), b)
2152
+ #endif
2153
+
2154
+ SIMDE_FUNCTION_ATTRIBUTES
2155
+ simde_float32 simde_mm_cvtss_f32(simde__m128 a)
2156
+ {
2157
+ #if defined(SIMDE_X86_SSE_NATIVE)
2158
+ return _mm_cvtss_f32(a);
2159
+ #else
2160
+ simde__m128_private a_ = simde__m128_to_private(a);
2161
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2162
+ return vgetq_lane_f32(a_.neon_f32, 0);
2163
+ #else
2164
+ return a_.f32[0];
2165
+ #endif
2166
+ #endif
2167
+ }
2168
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2169
+ #define _mm_cvtss_f32(a) simde_mm_cvtss_f32((a))
2170
+ #endif
2171
+
2172
+ SIMDE_FUNCTION_ATTRIBUTES
2173
+ int32_t simde_mm_cvtss_si32(simde__m128 a)
2174
+ {
2175
+ return simde_mm_cvt_ss2si(a);
2176
+ }
2177
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2178
+ #define _mm_cvtss_si32(a) simde_mm_cvtss_si32((a))
2179
+ #endif
2180
+
2181
+ SIMDE_FUNCTION_ATTRIBUTES
2182
+ int64_t simde_mm_cvtss_si64(simde__m128 a)
2183
+ {
2184
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
2185
+ #if !defined(__PGI)
2186
+ return _mm_cvtss_si64(a);
2187
+ #else
2188
+ return _mm_cvtss_si64x(a);
2189
+ #endif
2190
+ #else
2191
+ simde__m128_private a_ = simde__m128_to_private(a);
2192
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2193
+ return SIMDE_CONVERT_FTOI(
2194
+ int64_t, simde_math_roundf(vgetq_lane_f32(a_.neon_f32, 0)));
2195
+ #else
2196
+ return SIMDE_CONVERT_FTOI(int64_t, simde_math_roundf(a_.f32[0]));
2197
+ #endif
2198
+ #endif
2199
+ }
2200
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2201
+ #define _mm_cvtss_si64(a) simde_mm_cvtss_si64((a))
2202
+ #endif
2203
+
2204
+ SIMDE_FUNCTION_ATTRIBUTES
2205
+ simde__m64 simde_mm_cvtt_ps2pi(simde__m128 a)
2206
+ {
2207
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2208
+ return _mm_cvtt_ps2pi(a);
2209
+ #else
2210
+ simde__m64_private r_;
2211
+ simde__m128_private a_ = simde__m128_to_private(a);
2212
+
2213
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
2214
+ r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32));
2215
+ #else
2216
+ SIMDE_VECTORIZE
2217
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2218
+ simde_float32 v = a_.f32[i];
2219
+ #if !defined(SIMDE_FAST_CONVERSION_RANGE)
2220
+ r_.i32[i] =
2221
+ ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) &&
2222
+ (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX)))
2223
+ ? SIMDE_CONVERT_FTOI(int32_t, v)
2224
+ : INT32_MIN;
2225
+ #else
2226
+ r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
2227
+ #endif
2228
+ }
2229
+ #endif
2230
+
2231
+ return simde__m64_from_private(r_);
2232
+ #endif
2233
+ }
2234
+ #define simde_mm_cvttps_pi32(a) simde_mm_cvtt_ps2pi(a)
2235
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2236
+ #define _mm_cvtt_ps2pi(a) simde_mm_cvtt_ps2pi((a))
2237
+ #define _mm_cvttps_pi32(a) simde_mm_cvttps_pi32((a))
2238
+ #endif
2239
+
2240
+ SIMDE_FUNCTION_ATTRIBUTES
2241
+ int32_t simde_mm_cvtt_ss2si(simde__m128 a)
2242
+ {
2243
+ #if defined(SIMDE_X86_SSE_NATIVE)
2244
+ return _mm_cvtt_ss2si(a);
2245
+ #else
2246
+ simde__m128_private a_ = simde__m128_to_private(a);
2247
+
2248
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
2249
+ return SIMDE_CONVERT_FTOI(int32_t, vgetq_lane_f32(a_.neon_f32, 0));
2250
+ #else
2251
+ simde_float32 v = a_.f32[0];
2252
+ #if !defined(SIMDE_FAST_CONVERSION_RANGE)
2253
+ return ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) &&
2254
+ (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX)))
2255
+ ? SIMDE_CONVERT_FTOI(int32_t, v)
2256
+ : INT32_MIN;
2257
+ #else
2258
+ return SIMDE_CONVERT_FTOI(int32_t, v);
2259
+ #endif
2260
+ #endif
2261
+ #endif
2262
+ }
2263
+ #define simde_mm_cvttss_si32(a) simde_mm_cvtt_ss2si((a))
2264
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2265
+ #define _mm_cvtt_ss2si(a) simde_mm_cvtt_ss2si((a))
2266
+ #define _mm_cvttss_si32(a) simde_mm_cvtt_ss2si((a))
2267
+ #endif
2268
+
2269
+ SIMDE_FUNCTION_ATTRIBUTES
2270
+ int64_t simde_mm_cvttss_si64(simde__m128 a)
2271
+ {
2272
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) && \
2273
+ !defined(_MSC_VER)
2274
+ #if defined(__PGI)
2275
+ return _mm_cvttss_si64x(a);
2276
+ #else
2277
+ return _mm_cvttss_si64(a);
2278
+ #endif
2279
+ #else
2280
+ simde__m128_private a_ = simde__m128_to_private(a);
2281
+
2282
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2283
+ return SIMDE_CONVERT_FTOI(int64_t, vgetq_lane_f32(a_.neon_f32, 0));
2284
+ #else
2285
+ return SIMDE_CONVERT_FTOI(int64_t, a_.f32[0]);
2286
+ #endif
2287
+ #endif
2288
+ }
2289
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2290
+ #define _mm_cvttss_si64(a) simde_mm_cvttss_si64((a))
2291
+ #endif
2292
+
2293
+ SIMDE_FUNCTION_ATTRIBUTES
2294
+ simde__m128 simde_mm_cmpord_ss(simde__m128 a, simde__m128 b)
2295
+ {
2296
+ #if defined(SIMDE_X86_SSE_NATIVE)
2297
+ return _mm_cmpord_ss(a, b);
2298
+ #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2299
+ return simde_mm_move_ss(a, simde_mm_cmpord_ps(a, b));
2300
+ #else
2301
+ simde__m128_private r_, a_ = simde__m128_to_private(a);
2302
+
2303
+ #if defined(simde_math_isnanf)
2304
+ r_.u32[0] = (simde_math_isnanf(simde_mm_cvtss_f32(a)) ||
2305
+ simde_math_isnanf(simde_mm_cvtss_f32(b)))
2306
+ ? UINT32_C(0)
2307
+ : ~UINT32_C(0);
2308
+ SIMDE_VECTORIZE
2309
+ for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2310
+ r_.u32[i] = a_.u32[i];
2311
+ }
2312
+ #else
2313
+ HEDLEY_UNREACHABLE();
2314
+ #endif
2315
+
2316
+ return simde__m128_from_private(r_);
2317
+ #endif
2318
+ }
2319
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2320
+ #define _mm_cmpord_ss(a, b) simde_mm_cmpord_ss((a), (b))
2321
+ #endif
2322
+
2323
+ SIMDE_FUNCTION_ATTRIBUTES
2324
+ simde__m128 simde_mm_div_ps(simde__m128 a, simde__m128 b)
2325
+ {
2326
+ #if defined(SIMDE_X86_SSE_NATIVE)
2327
+ return _mm_div_ps(a, b);
2328
+ #else
2329
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
2330
+ b_ = simde__m128_to_private(b);
2331
+
2332
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2333
+ r_.neon_f32 = vdivq_f32(a_.neon_f32, b_.neon_f32);
2334
+ #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2335
+ float32x4_t recip0 = vrecpeq_f32(b_.neon_f32);
2336
+ float32x4_t recip1 =
2337
+ vmulq_f32(recip0, vrecpsq_f32(recip0, b_.neon_f32));
2338
+ r_.neon_f32 = vmulq_f32(a_.neon_f32, recip1);
2339
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2340
+ r_.wasm_v128 = wasm_f32x4_div(a_.wasm_v128, b_.wasm_v128);
2341
+ #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
2342
+ r_.altivec_f32 = vec_div(a_.altivec_f32, b_.altivec_f32);
2343
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2344
+ r_.f32 = a_.f32 / b_.f32;
2345
+ #else
2346
+ SIMDE_VECTORIZE
2347
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2348
+ r_.f32[i] = a_.f32[i] / b_.f32[i];
2349
+ }
2350
+ #endif
2351
+
2352
+ return simde__m128_from_private(r_);
2353
+ #endif
2354
+ }
2355
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2356
+ #define _mm_div_ps(a, b) simde_mm_div_ps((a), (b))
2357
+ #endif
2358
+
2359
+ SIMDE_FUNCTION_ATTRIBUTES
2360
+ simde__m128 simde_mm_div_ss(simde__m128 a, simde__m128 b)
2361
+ {
2362
+ #if defined(SIMDE_X86_SSE_NATIVE)
2363
+ return _mm_div_ss(a, b);
2364
+ #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2365
+ return simde_mm_move_ss(a, simde_mm_div_ps(a, b));
2366
+ #else
2367
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
2368
+ b_ = simde__m128_to_private(b);
2369
+
2370
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2371
+ float32_t value = vgetq_lane_f32(
2372
+ simde__m128_to_private(simde_mm_div_ps(a, b)).neon_f32, 0);
2373
+ r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0);
2374
+ #else
2375
+ r_.f32[0] = a_.f32[0] / b_.f32[0];
2376
+ SIMDE_VECTORIZE
2377
+ for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2378
+ r_.f32[i] = a_.f32[i];
2379
+ }
2380
+ #endif
2381
+
2382
+ return simde__m128_from_private(r_);
2383
+ #endif
2384
+ }
2385
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2386
+ #define _mm_div_ss(a, b) simde_mm_div_ss((a), (b))
2387
+ #endif
2388
+
2389
+ SIMDE_FUNCTION_ATTRIBUTES
2390
+ int16_t simde_mm_extract_pi16(simde__m64 a, const int imm8)
2391
+ SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)
2392
+ {
2393
+ simde__m64_private a_ = simde__m64_to_private(a);
2394
+ return a_.i16[imm8];
2395
+ }
2396
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && \
2397
+ !defined(HEDLEY_PGI_VERSION)
2398
+ #if defined(SIMDE_BUG_CLANG_44589)
2399
+ #define simde_mm_extract_pi16(a, imm8) \
2400
+ (HEDLEY_DIAGNOSTIC_PUSH _Pragma( \
2401
+ "clang diagnostic ignored \"-Wvector-conversion\"") \
2402
+ HEDLEY_STATIC_CAST(int16_t, _mm_extract_pi16((a), (imm8))) \
2403
+ HEDLEY_DIAGNOSTIC_POP)
2404
+ #else
2405
+ #define simde_mm_extract_pi16(a, imm8) \
2406
+ HEDLEY_STATIC_CAST(int16_t, _mm_extract_pi16(a, imm8))
2407
+ #endif
2408
+ #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2409
+ #define simde_mm_extract_pi16(a, imm8) \
2410
+ vget_lane_s16(simde__m64_to_private(a).neon_i16, imm8)
2411
+ #endif
2412
+ #define simde_m_pextrw(a, imm8) simde_mm_extract_pi16(a, imm8)
2413
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2414
+ #define _mm_extract_pi16(a, imm8) simde_mm_extract_pi16((a), (imm8))
2415
+ #define _m_pextrw(a, imm8) simde_mm_extract_pi16((a), (imm8))
2416
+ #endif
2417
+
2418
+ SIMDE_FUNCTION_ATTRIBUTES
2419
+ simde__m64 simde_mm_insert_pi16(simde__m64 a, int16_t i, const int imm8)
2420
+ SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)
2421
+ {
2422
+ simde__m64_private r_, a_ = simde__m64_to_private(a);
2423
+
2424
+ r_.i64[0] = a_.i64[0];
2425
+ r_.i16[imm8] = i;
2426
+
2427
+ return simde__m64_from_private(r_);
2428
+ }
2429
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && \
2430
+ !defined(__PGI)
2431
+ #if defined(SIMDE_BUG_CLANG_44589)
2432
+ #define ssimde_mm_insert_pi16(a, i, imm8) \
2433
+ (HEDLEY_DIAGNOSTIC_PUSH _Pragma( \
2434
+ "clang diagnostic ignored \"-Wvector-conversion\"")( \
2435
+ _mm_insert_pi16((a), (i), (imm8))) HEDLEY_DIAGNOSTIC_POP)
2436
+ #else
2437
+ #define simde_mm_insert_pi16(a, i, imm8) _mm_insert_pi16(a, i, imm8)
2438
+ #endif
2439
+ #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2440
+ #define simde_mm_insert_pi16(a, i, imm8) \
2441
+ simde__m64_from_neon_i16( \
2442
+ vset_lane_s16((i), simde__m64_to_neon_i16(a), (imm8)))
2443
+ #endif
2444
+ #define simde_m_pinsrw(a, i, imm8) (simde_mm_insert_pi16(a, i, imm8))
2445
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2446
+ #define _mm_insert_pi16(a, i, imm8) simde_mm_insert_pi16(a, i, imm8)
2447
+ #define _m_pinsrw(a, i, imm8) simde_mm_insert_pi16(a, i, imm8)
2448
+ #endif
2449
+
2450
+ SIMDE_FUNCTION_ATTRIBUTES
2451
+ simde__m128
2452
+ simde_mm_load_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
2453
+ {
2454
+ #if defined(SIMDE_X86_SSE_NATIVE)
2455
+ return _mm_load_ps(mem_addr);
2456
+ #else
2457
+ simde__m128_private r_;
2458
+
2459
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2460
+ r_.neon_f32 = vld1q_f32(mem_addr);
2461
+ #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
2462
+ r_.altivec_f32 = vec_vsx_ld(0, mem_addr);
2463
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2464
+ r_.altivec_f32 = vec_ld(0, mem_addr);
2465
+ #else
2466
+ simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128),
2467
+ sizeof(r_));
2468
+ #endif
2469
+
2470
+ return simde__m128_from_private(r_);
2471
+ #endif
2472
+ }
2473
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2474
+ #define _mm_load_ps(mem_addr) simde_mm_load_ps(mem_addr)
2475
+ #endif
2476
+
2477
+ SIMDE_FUNCTION_ATTRIBUTES
2478
+ simde__m128 simde_mm_load1_ps(simde_float32 const *mem_addr)
2479
+ {
2480
+ #if defined(SIMDE_X86_SSE_NATIVE)
2481
+ return _mm_load_ps1(mem_addr);
2482
+ #else
2483
+ simde__m128_private r_;
2484
+
2485
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2486
+ r_.neon_f32 = vld1q_dup_f32(mem_addr);
2487
+ #else
2488
+ r_ = simde__m128_to_private(simde_mm_set1_ps(*mem_addr));
2489
+ #endif
2490
+
2491
+ return simde__m128_from_private(r_);
2492
+ #endif
2493
+ }
2494
+ #define simde_mm_load_ps1(mem_addr) simde_mm_load1_ps(mem_addr)
2495
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2496
+ #define _mm_load_ps1(mem_addr) simde_mm_load1_ps(mem_addr)
2497
+ #define _mm_load1_ps(mem_addr) simde_mm_load1_ps(mem_addr)
2498
+ #endif
2499
+
2500
+ SIMDE_FUNCTION_ATTRIBUTES
2501
+ simde__m128 simde_mm_load_ss(simde_float32 const *mem_addr)
2502
+ {
2503
+ #if defined(SIMDE_X86_SSE_NATIVE)
2504
+ return _mm_load_ss(mem_addr);
2505
+ #else
2506
+ simde__m128_private r_;
2507
+
2508
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2509
+ r_.neon_f32 = vsetq_lane_f32(*mem_addr, vdupq_n_f32(0), 0);
2510
+ #else
2511
+ r_.f32[0] = *mem_addr;
2512
+ r_.i32[1] = 0;
2513
+ r_.i32[2] = 0;
2514
+ r_.i32[3] = 0;
2515
+ #endif
2516
+
2517
+ return simde__m128_from_private(r_);
2518
+ #endif
2519
+ }
2520
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2521
+ #define _mm_load_ss(mem_addr) simde_mm_load_ss(mem_addr)
2522
+ #endif
2523
+
2524
+ SIMDE_FUNCTION_ATTRIBUTES
2525
+ simde__m128 simde_mm_loadh_pi(simde__m128 a, simde__m64 const *mem_addr)
2526
+ {
2527
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2528
+ return _mm_loadh_pi(a,
2529
+ HEDLEY_REINTERPRET_CAST(__m64 const *, mem_addr));
2530
+ #else
2531
+ simde__m128_private r_, a_ = simde__m128_to_private(a);
2532
+
2533
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2534
+ r_.neon_f32 = vcombine_f32(
2535
+ vget_low_f32(a_.neon_f32),
2536
+ vld1_f32(HEDLEY_REINTERPRET_CAST(const float32_t *, mem_addr)));
2537
+ #else
2538
+ simde__m64_private b_ =
2539
+ *HEDLEY_REINTERPRET_CAST(simde__m64_private const *, mem_addr);
2540
+ r_.f32[0] = a_.f32[0];
2541
+ r_.f32[1] = a_.f32[1];
2542
+ r_.f32[2] = b_.f32[0];
2543
+ r_.f32[3] = b_.f32[1];
2544
+ #endif
2545
+
2546
+ return simde__m128_from_private(r_);
2547
+ #endif
2548
+ }
2549
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2550
+ #if HEDLEY_HAS_WARNING("-Wold-style-cast")
2551
+ #define _mm_loadh_pi(a, mem_addr) \
2552
+ simde_mm_loadh_pi((a), HEDLEY_REINTERPRET_CAST(simde__m64 const *, \
2553
+ (mem_addr)))
2554
+ #else
2555
+ #define _mm_loadh_pi(a, mem_addr) \
2556
+ simde_mm_loadh_pi((a), (simde__m64 const *)(mem_addr))
2557
+ #endif
2558
+ #endif
2559
+
2560
+ /* The SSE documentation says that there are no alignment requirements
2561
+ for mem_addr. Unfortunately they used the __m64 type for the argument
2562
+ which is supposed to be 8-byte aligned, so some compilers (like clang
2563
+ with -Wcast-align) will generate a warning if you try to cast, say,
2564
+ a simde_float32* to a simde__m64* for this function.
2565
+
2566
+ I think the choice of argument type is unfortunate, but I do think we
2567
+ need to stick to it here. If there is demand I can always add something
2568
+ like simde_x_mm_loadl_f32(simde__m128, simde_float32 mem_addr[2]) */
2569
+ SIMDE_FUNCTION_ATTRIBUTES
2570
+ simde__m128 simde_mm_loadl_pi(simde__m128 a, simde__m64 const *mem_addr)
2571
+ {
2572
+ #if defined(SIMDE_X86_SSE_NATIVE)
2573
+ return _mm_loadl_pi(a,
2574
+ HEDLEY_REINTERPRET_CAST(__m64 const *, mem_addr));
2575
+ #else
2576
+ simde__m128_private r_, a_ = simde__m128_to_private(a);
2577
+
2578
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2579
+ r_.neon_f32 = vcombine_f32(
2580
+ vld1_f32(HEDLEY_REINTERPRET_CAST(const float32_t *, mem_addr)),
2581
+ vget_high_f32(a_.neon_f32));
2582
+ #else
2583
+ simde__m64_private b_;
2584
+ simde_memcpy(&b_, mem_addr, sizeof(b_));
2585
+ r_.i32[0] = b_.i32[0];
2586
+ r_.i32[1] = b_.i32[1];
2587
+ r_.i32[2] = a_.i32[2];
2588
+ r_.i32[3] = a_.i32[3];
2589
+ #endif
2590
+
2591
+ return simde__m128_from_private(r_);
2592
+ #endif
2593
+ }
2594
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2595
+ #if HEDLEY_HAS_WARNING("-Wold-style-cast")
2596
+ #define _mm_loadl_pi(a, mem_addr) \
2597
+ simde_mm_loadl_pi((a), HEDLEY_REINTERPRET_CAST(simde__m64 const *, \
2598
+ (mem_addr)))
2599
+ #else
2600
+ #define _mm_loadl_pi(a, mem_addr) \
2601
+ simde_mm_loadl_pi((a), (simde__m64 const *)(mem_addr))
2602
+ #endif
2603
+ #endif
2604
+
2605
+ SIMDE_FUNCTION_ATTRIBUTES
2606
+ simde__m128
2607
+ simde_mm_loadr_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
2608
+ {
2609
+ #if defined(SIMDE_X86_SSE_NATIVE)
2610
+ return _mm_loadr_ps(mem_addr);
2611
+ #else
2612
+ simde__m128_private r_,
2613
+ v_ = simde__m128_to_private(simde_mm_load_ps(mem_addr));
2614
+
2615
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2616
+ r_.neon_f32 = vrev64q_f32(v_.neon_f32);
2617
+ r_.neon_f32 = vextq_f32(r_.neon_f32, r_.neon_f32, 2);
2618
+ #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && defined(__PPC64__)
2619
+ r_.altivec_f32 = vec_reve(v_.altivec_f32);
2620
+ #elif defined(SIMDE_SHUFFLE_VECTOR_)
2621
+ r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, v_.f32, v_.f32, 3, 2, 1, 0);
2622
+ #else
2623
+ r_.f32[0] = v_.f32[3];
2624
+ r_.f32[1] = v_.f32[2];
2625
+ r_.f32[2] = v_.f32[1];
2626
+ r_.f32[3] = v_.f32[0];
2627
+ #endif
2628
+
2629
+ return simde__m128_from_private(r_);
2630
+ #endif
2631
+ }
2632
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2633
+ #define _mm_loadr_ps(mem_addr) simde_mm_loadr_ps(mem_addr)
2634
+ #endif
2635
+
2636
+ SIMDE_FUNCTION_ATTRIBUTES
2637
+ simde__m128
2638
+ simde_mm_loadu_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
2639
+ {
2640
+ #if defined(SIMDE_X86_SSE_NATIVE)
2641
+ return _mm_loadu_ps(mem_addr);
2642
+ #else
2643
+ simde__m128_private r_;
2644
+
2645
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2646
+ r_.neon_f32 =
2647
+ vld1q_f32(HEDLEY_REINTERPRET_CAST(const float32_t *, mem_addr));
2648
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2649
+ r_.wasm_v128 = wasm_v128_load(mem_addr);
2650
+ #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && defined(__PPC64__)
2651
+ r_.altivec_f32 = vec_vsx_ld(0, mem_addr);
2652
+ #else
2653
+ simde_memcpy(&r_, mem_addr, sizeof(r_));
2654
+ #endif
2655
+
2656
+ return simde__m128_from_private(r_);
2657
+ #endif
2658
+ }
2659
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2660
+ #define _mm_loadu_ps(mem_addr) simde_mm_loadu_ps(mem_addr)
2661
+ #endif
2662
+
2663
+ SIMDE_FUNCTION_ATTRIBUTES
2664
+ void simde_mm_maskmove_si64(simde__m64 a, simde__m64 mask, int8_t *mem_addr)
2665
+ {
2666
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2667
+ _mm_maskmove_si64(a, mask, HEDLEY_REINTERPRET_CAST(char *, mem_addr));
2668
+ #else
2669
+ simde__m64_private a_ = simde__m64_to_private(a),
2670
+ mask_ = simde__m64_to_private(mask);
2671
+
2672
+ SIMDE_VECTORIZE
2673
+ for (size_t i = 0; i < (sizeof(a_.i8) / sizeof(a_.i8[0])); i++)
2674
+ if (mask_.i8[i] < 0)
2675
+ mem_addr[i] = a_.i8[i];
2676
+ #endif
2677
+ }
2678
+ #define simde_m_maskmovq(a, mask, mem_addr) \
2679
+ simde_mm_maskmove_si64(a, mask, mem_addr)
2680
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2681
+ #define _mm_maskmove_si64(a, mask, mem_addr) \
2682
+ simde_mm_maskmove_si64( \
2683
+ (a), (mask), \
2684
+ SIMDE_CHECKED_REINTERPRET_CAST(int8_t *, char *, (mem_addr)))
2685
+ #define _m_maskmovq(a, mask, mem_addr) \
2686
+ simde_mm_maskmove_si64( \
2687
+ (a), (mask), \
2688
+ SIMDE_CHECKED_REINTERPRET_CAST(int8_t *, char *, (mem_addr)))
2689
+ #endif
2690
+
2691
+ SIMDE_FUNCTION_ATTRIBUTES
2692
+ simde__m64 simde_mm_max_pi16(simde__m64 a, simde__m64 b)
2693
+ {
2694
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2695
+ return _mm_max_pi16(a, b);
2696
+ #else
2697
+ simde__m64_private r_, a_ = simde__m64_to_private(a),
2698
+ b_ = simde__m64_to_private(b);
2699
+
2700
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2701
+ r_.neon_i16 = vmax_s16(a_.neon_i16, b_.neon_i16);
2702
+ #else
2703
+ SIMDE_VECTORIZE
2704
+ for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
2705
+ r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
2706
+ }
2707
+ #endif
2708
+
2709
+ return simde__m64_from_private(r_);
2710
+ #endif
2711
+ }
2712
+ #define simde_m_pmaxsw(a, b) simde_mm_max_pi16(a, b)
2713
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2714
+ #define _mm_max_pi16(a, b) simde_mm_max_pi16(a, b)
2715
+ #define _m_pmaxsw(a, b) simde_mm_max_pi16(a, b)
2716
+ #endif
2717
+
2718
+ SIMDE_FUNCTION_ATTRIBUTES
2719
+ simde__m128 simde_mm_max_ps(simde__m128 a, simde__m128 b)
2720
+ {
2721
+ #if defined(SIMDE_X86_SSE_NATIVE)
2722
+ return _mm_max_ps(a, b);
2723
+ #else
2724
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
2725
+ b_ = simde__m128_to_private(b);
2726
+
2727
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_NANS)
2728
+ r_.neon_f32 = vmaxq_f32(a_.neon_f32, b_.neon_f32);
2729
+ #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2730
+ r_.neon_f32 = vbslq_f32(vcgtq_f32(a_.neon_f32, b_.neon_f32),
2731
+ a_.neon_f32, b_.neon_f32);
2732
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE) && defined(SIMDE_FAST_NANS)
2733
+ r_.wasm_v128 = wasm_f32x4_max(a_.wasm_v128, b_.wasm_v128);
2734
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2735
+ r_.wasm_v128 =
2736
+ wasm_v128_bitselect(a_.wasm_v128, b_.wasm_v128,
2737
+ wasm_f32x4_gt(a_.wasm_v128, b_.wasm_v128));
2738
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_FAST_NANS)
2739
+ r_.altivec_f32 = vec_max(a_.altivec_f32, b_.altivec_f32);
2740
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2741
+ r_.altivec_f32 = vec_sel(b_.altivec_f32, a_.altivec_f32,
2742
+ vec_cmpgt(a_.altivec_f32, b_.altivec_f32));
2743
+ #else
2744
+ SIMDE_VECTORIZE
2745
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2746
+ r_.f32[i] = (a_.f32[i] > b_.f32[i]) ? a_.f32[i] : b_.f32[i];
2747
+ }
2748
+ #endif
2749
+
2750
+ return simde__m128_from_private(r_);
2751
+ #endif
2752
+ }
2753
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2754
+ #define _mm_max_ps(a, b) simde_mm_max_ps((a), (b))
2755
+ #endif
2756
+
2757
+ SIMDE_FUNCTION_ATTRIBUTES
2758
+ simde__m64 simde_mm_max_pu8(simde__m64 a, simde__m64 b)
2759
+ {
2760
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2761
+ return _mm_max_pu8(a, b);
2762
+ #else
2763
+ simde__m64_private r_, a_ = simde__m64_to_private(a),
2764
+ b_ = simde__m64_to_private(b);
2765
+
2766
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2767
+ r_.neon_u8 = vmax_u8(a_.neon_u8, b_.neon_u8);
2768
+ #else
2769
+ SIMDE_VECTORIZE
2770
+ for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
2771
+ r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
2772
+ }
2773
+ #endif
2774
+
2775
+ return simde__m64_from_private(r_);
2776
+ #endif
2777
+ }
2778
+ #define simde_m_pmaxub(a, b) simde_mm_max_pu8(a, b)
2779
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2780
+ #define _mm_max_pu8(a, b) simde_mm_max_pu8(a, b)
2781
+ #define _m_pmaxub(a, b) simde_mm_max_pu8(a, b)
2782
+ #endif
2783
+
2784
+ SIMDE_FUNCTION_ATTRIBUTES
2785
+ simde__m128 simde_mm_max_ss(simde__m128 a, simde__m128 b)
2786
+ {
2787
+ #if defined(SIMDE_X86_SSE_NATIVE)
2788
+ return _mm_max_ss(a, b);
2789
+ #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2790
+ return simde_mm_move_ss(a, simde_mm_max_ps(a, b));
2791
+ #else
2792
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
2793
+ b_ = simde__m128_to_private(b);
2794
+
2795
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2796
+ float32_t value = vgetq_lane_f32(maxq_f32(a_.neon_f32, b_.neon_f32), 0);
2797
+ r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0);
2798
+ #else
2799
+ r_.f32[0] = (a_.f32[0] > b_.f32[0]) ? a_.f32[0] : b_.f32[0];
2800
+ r_.f32[1] = a_.f32[1];
2801
+ r_.f32[2] = a_.f32[2];
2802
+ r_.f32[3] = a_.f32[3];
2803
+ #endif
2804
+
2805
+ return simde__m128_from_private(r_);
2806
+ #endif
2807
+ }
2808
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2809
+ #define _mm_max_ss(a, b) simde_mm_max_ss((a), (b))
2810
+ #endif
2811
+
2812
+ SIMDE_FUNCTION_ATTRIBUTES
2813
+ simde__m64 simde_mm_min_pi16(simde__m64 a, simde__m64 b)
2814
+ {
2815
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2816
+ return _mm_min_pi16(a, b);
2817
+ #else
2818
+ simde__m64_private r_, a_ = simde__m64_to_private(a),
2819
+ b_ = simde__m64_to_private(b);
2820
+
2821
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2822
+ r_.neon_i16 = vmin_s16(a_.neon_i16, b_.neon_i16);
2823
+ #else
2824
+ SIMDE_VECTORIZE
2825
+ for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
2826
+ r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
2827
+ }
2828
+ #endif
2829
+
2830
+ return simde__m64_from_private(r_);
2831
+ #endif
2832
+ }
2833
+ #define simde_m_pminsw(a, b) simde_mm_min_pi16(a, b)
2834
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2835
+ #define _mm_min_pi16(a, b) simde_mm_min_pi16(a, b)
2836
+ #define _m_pminsw(a, b) simde_mm_min_pi16(a, b)
2837
+ #endif
2838
+
2839
+ SIMDE_FUNCTION_ATTRIBUTES
2840
+ simde__m128 simde_mm_min_ps(simde__m128 a, simde__m128 b)
2841
+ {
2842
+ #if defined(SIMDE_X86_SSE_NATIVE)
2843
+ return _mm_min_ps(a, b);
2844
+ #elif defined(SIMDE_FAST_NANS) && defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2845
+ return simde__m128_from_neon_f32(vminq_f32(simde__m128_to_neon_f32(a),
2846
+ simde__m128_to_neon_f32(b)));
2847
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2848
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
2849
+ b_ = simde__m128_to_private(b);
2850
+ #if defined(SIMDE_FAST_NANS)
2851
+ r_.wasm_v128 = wasm_f32x4_min(a_.wasm_v128, b_.wasm_v128);
2852
+ #else
2853
+ r_.wasm_v128 =
2854
+ wasm_v128_bitselect(a_.wasm_v128, b_.wasm_v128,
2855
+ wasm_f32x4_lt(a_.wasm_v128, b_.wasm_v128));
2856
+ #endif
2857
+ return simde__m128_from_private(r_);
2858
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2859
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
2860
+ b_ = simde__m128_to_private(b);
2861
+
2862
+ #if defined(SIMDE_FAST_NANS)
2863
+ r_.altivec_f32 = vec_min(a_.altivec_f32, b_.altivec_f32);
2864
+ #else
2865
+ r_.altivec_f32 = vec_sel(b_.altivec_f32, a_.altivec_f32,
2866
+ vec_cmpgt(b_.altivec_f32, a_.altivec_f32));
2867
+ #endif
2868
+
2869
+ return simde__m128_from_private(r_);
2870
+ #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2871
+ simde__m128 mask = simde_mm_cmplt_ps(a, b);
2872
+ return simde_mm_or_ps(simde_mm_and_ps(mask, a),
2873
+ simde_mm_andnot_ps(mask, b));
2874
+ #else
2875
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
2876
+ b_ = simde__m128_to_private(b);
2877
+
2878
+ SIMDE_VECTORIZE
2879
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2880
+ r_.f32[i] = (a_.f32[i] < b_.f32[i]) ? a_.f32[i] : b_.f32[i];
2881
+ }
2882
+
2883
+ return simde__m128_from_private(r_);
2884
+ #endif
2885
+ }
2886
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2887
+ #define _mm_min_ps(a, b) simde_mm_min_ps((a), (b))
2888
+ #endif
2889
+
2890
+ SIMDE_FUNCTION_ATTRIBUTES
2891
+ simde__m64 simde_mm_min_pu8(simde__m64 a, simde__m64 b)
2892
+ {
2893
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2894
+ return _mm_min_pu8(a, b);
2895
+ #else
2896
+ simde__m64_private r_, a_ = simde__m64_to_private(a),
2897
+ b_ = simde__m64_to_private(b);
2898
+
2899
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2900
+ r_.neon_u8 = vmin_u8(a_.neon_u8, b_.neon_u8);
2901
+ #else
2902
+ SIMDE_VECTORIZE
2903
+ for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
2904
+ r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
2905
+ }
2906
+ #endif
2907
+
2908
+ return simde__m64_from_private(r_);
2909
+ #endif
2910
+ }
2911
+ #define simde_m_pminub(a, b) simde_mm_min_pu8(a, b)
2912
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2913
+ #define _mm_min_pu8(a, b) simde_mm_min_pu8(a, b)
2914
+ #define _m_pminub(a, b) simde_mm_min_pu8(a, b)
2915
+ #endif
2916
+
2917
+ SIMDE_FUNCTION_ATTRIBUTES
2918
+ simde__m128 simde_mm_min_ss(simde__m128 a, simde__m128 b)
2919
+ {
2920
+ #if defined(SIMDE_X86_SSE_NATIVE)
2921
+ return _mm_min_ss(a, b);
2922
+ #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2923
+ return simde_mm_move_ss(a, simde_mm_min_ps(a, b));
2924
+ #else
2925
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
2926
+ b_ = simde__m128_to_private(b);
2927
+
2928
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2929
+ float32_t value =
2930
+ vgetq_lane_f32(vminq_f32(a_.neon_f32, b_.neon_f32), 0);
2931
+ r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0);
2932
+ #else
2933
+ r_.f32[0] = (a_.f32[0] < b_.f32[0]) ? a_.f32[0] : b_.f32[0];
2934
+ r_.f32[1] = a_.f32[1];
2935
+ r_.f32[2] = a_.f32[2];
2936
+ r_.f32[3] = a_.f32[3];
2937
+ #endif
2938
+
2939
+ return simde__m128_from_private(r_);
2940
+ #endif
2941
+ }
2942
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2943
+ #define _mm_min_ss(a, b) simde_mm_min_ss((a), (b))
2944
+ #endif
2945
+
2946
+ SIMDE_FUNCTION_ATTRIBUTES
2947
+ simde__m128 simde_mm_movehl_ps(simde__m128 a, simde__m128 b)
2948
+ {
2949
+ #if defined(SIMDE_X86_SSE_NATIVE)
2950
+ return _mm_movehl_ps(a, b);
2951
+ #else
2952
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
2953
+ b_ = simde__m128_to_private(b);
2954
+
2955
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2956
+ float32x2_t a32 = vget_high_f32(a_.neon_f32);
2957
+ float32x2_t b32 = vget_high_f32(b_.neon_f32);
2958
+ r_.neon_f32 = vcombine_f32(b32, a32);
2959
+ #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
2960
+ r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
2961
+ SIMDE_POWER_ALTIVEC_VECTOR(float),
2962
+ vec_mergel(b_.altivec_i64, a_.altivec_i64));
2963
+ #elif defined(SIMDE_SHUFFLE_VECTOR_)
2964
+ r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 6, 7, 2, 3);
2965
+ #else
2966
+ r_.f32[0] = b_.f32[2];
2967
+ r_.f32[1] = b_.f32[3];
2968
+ r_.f32[2] = a_.f32[2];
2969
+ r_.f32[3] = a_.f32[3];
2970
+ #endif
2971
+
2972
+ return simde__m128_from_private(r_);
2973
+ #endif
2974
+ }
2975
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2976
+ #define _mm_movehl_ps(a, b) simde_mm_movehl_ps((a), (b))
2977
+ #endif
2978
+
2979
+ SIMDE_FUNCTION_ATTRIBUTES
2980
+ simde__m128 simde_mm_movelh_ps(simde__m128 a, simde__m128 b)
2981
+ {
2982
+ #if defined(SIMDE_X86_SSE_NATIVE)
2983
+ return _mm_movelh_ps(a, b);
2984
+ #else
2985
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
2986
+ b_ = simde__m128_to_private(b);
2987
+
2988
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2989
+ float32x2_t a10 = vget_low_f32(a_.neon_f32);
2990
+ float32x2_t b10 = vget_low_f32(b_.neon_f32);
2991
+ r_.neon_f32 = vcombine_f32(a10, b10);
2992
+ #elif defined(SIMDE_SHUFFLE_VECTOR_)
2993
+ r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 1, 4, 5);
2994
+ #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
2995
+ r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
2996
+ SIMDE_POWER_ALTIVEC_VECTOR(float),
2997
+ vec_mergeh(a_.altivec_i64, b_.altivec_i64));
2998
+ #else
2999
+ r_.f32[0] = a_.f32[0];
3000
+ r_.f32[1] = a_.f32[1];
3001
+ r_.f32[2] = b_.f32[0];
3002
+ r_.f32[3] = b_.f32[1];
3003
+ #endif
3004
+
3005
+ return simde__m128_from_private(r_);
3006
+ #endif
3007
+ }
3008
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3009
+ #define _mm_movelh_ps(a, b) simde_mm_movelh_ps((a), (b))
3010
+ #endif
3011
+
3012
+ SIMDE_FUNCTION_ATTRIBUTES
3013
+ int simde_mm_movemask_pi8(simde__m64 a)
3014
+ {
3015
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3016
+ return _mm_movemask_pi8(a);
3017
+ #else
3018
+ simde__m64_private a_ = simde__m64_to_private(a);
3019
+ int r = 0;
3020
+
3021
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3022
+ uint8x8_t input = a_.neon_u8;
3023
+ const int8_t xr[8] = {-7, -6, -5, -4, -3, -2, -1, 0};
3024
+ const uint8x8_t mask_and = vdup_n_u8(0x80);
3025
+ const int8x8_t mask_shift = vld1_s8(xr);
3026
+ const uint8x8_t mask_result =
3027
+ vshl_u8(vand_u8(input, mask_and), mask_shift);
3028
+ uint8x8_t lo = mask_result;
3029
+ r = vaddv_u8(lo);
3030
+ #else
3031
+ const size_t nmemb = sizeof(a_.i8) / sizeof(a_.i8[0]);
3032
+ SIMDE_VECTORIZE_REDUCTION(| : r)
3033
+ for (size_t i = 0; i < nmemb; i++) {
3034
+ r |= (a_.u8[nmemb - 1 - i] >> 7) << (nmemb - 1 - i);
3035
+ }
3036
+ #endif
3037
+
3038
+ return r;
3039
+ #endif
3040
+ }
3041
+ #define simde_m_pmovmskb(a) simde_mm_movemask_pi8(a)
3042
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3043
+ #define _mm_movemask_pi8(a) simde_mm_movemask_pi8(a)
3044
+ #define _m_pmovmskb(a) simde_mm_movemask_pi8(a)
3045
+ #endif
3046
+
3047
+ SIMDE_FUNCTION_ATTRIBUTES
3048
+ int simde_mm_movemask_ps(simde__m128 a)
3049
+ {
3050
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3051
+ return _mm_movemask_ps(a);
3052
+ #else
3053
+ int r = 0;
3054
+ simde__m128_private a_ = simde__m128_to_private(a);
3055
+
3056
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3057
+ static const int32_t shift_amount[] = {0, 1, 2, 3};
3058
+ const int32x4_t shift = vld1q_s32(shift_amount);
3059
+ uint32x4_t tmp = vshrq_n_u32(a_.neon_u32, 31);
3060
+ return HEDLEY_STATIC_CAST(int, vaddvq_u32(vshlq_u32(tmp, shift)));
3061
+ #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3062
+ // Shift out everything but the sign bits with a 32-bit unsigned shift right.
3063
+ uint64x2_t high_bits =
3064
+ vreinterpretq_u64_u32(vshrq_n_u32(a_.neon_u32, 31));
3065
+ // Merge the two pairs together with a 64-bit unsigned shift right + add.
3066
+ uint8x16_t paired =
3067
+ vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
3068
+ // Extract the result.
3069
+ return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
3070
+ #else
3071
+ SIMDE_VECTORIZE_REDUCTION(| : r)
3072
+ for (size_t i = 0; i < sizeof(a_.u32) / sizeof(a_.u32[0]); i++) {
3073
+ r |= (a_.u32[i] >> ((sizeof(a_.u32[i]) * CHAR_BIT) - 1)) << i;
3074
+ }
3075
+ #endif
3076
+
3077
+ return r;
3078
+ #endif
3079
+ }
3080
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3081
+ #define _mm_movemask_ps(a) simde_mm_movemask_ps((a))
3082
+ #endif
3083
+
3084
+ SIMDE_FUNCTION_ATTRIBUTES
3085
+ simde__m128 simde_mm_mul_ps(simde__m128 a, simde__m128 b)
3086
+ {
3087
+ #if defined(SIMDE_X86_SSE_NATIVE)
3088
+ return _mm_mul_ps(a, b);
3089
+ #else
3090
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
3091
+ b_ = simde__m128_to_private(b);
3092
+
3093
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3094
+ r_.neon_f32 = vmulq_f32(a_.neon_f32, b_.neon_f32);
3095
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3096
+ r_.wasm_v128 = wasm_f32x4_mul(a_.wasm_v128, b_.wasm_v128);
3097
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3098
+ r_.f32 = a_.f32 * b_.f32;
3099
+ #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
3100
+ r_.altivec_f32 = vec_mul(a_.altivec_f32, b_.altivec_f32);
3101
+ #else
3102
+ SIMDE_VECTORIZE
3103
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
3104
+ r_.f32[i] = a_.f32[i] * b_.f32[i];
3105
+ }
3106
+ #endif
3107
+
3108
+ return simde__m128_from_private(r_);
3109
+ #endif
3110
+ }
3111
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3112
+ #define _mm_mul_ps(a, b) simde_mm_mul_ps((a), (b))
3113
+ #endif
3114
+
3115
+ SIMDE_FUNCTION_ATTRIBUTES
3116
+ simde__m128 simde_mm_mul_ss(simde__m128 a, simde__m128 b)
3117
+ {
3118
+ #if defined(SIMDE_X86_SSE_NATIVE)
3119
+ return _mm_mul_ss(a, b);
3120
+ #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
3121
+ return simde_mm_move_ss(a, simde_mm_mul_ps(a, b));
3122
+ #else
3123
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
3124
+ b_ = simde__m128_to_private(b);
3125
+
3126
+ r_.f32[0] = a_.f32[0] * b_.f32[0];
3127
+ r_.f32[1] = a_.f32[1];
3128
+ r_.f32[2] = a_.f32[2];
3129
+ r_.f32[3] = a_.f32[3];
3130
+
3131
+ return simde__m128_from_private(r_);
3132
+ #endif
3133
+ }
3134
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3135
+ #define _mm_mul_ss(a, b) simde_mm_mul_ss((a), (b))
3136
+ #endif
3137
+
3138
+ SIMDE_FUNCTION_ATTRIBUTES
3139
+ simde__m64 simde_mm_mulhi_pu16(simde__m64 a, simde__m64 b)
3140
+ {
3141
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3142
+ return _mm_mulhi_pu16(a, b);
3143
+ #else
3144
+ simde__m64_private r_, a_ = simde__m64_to_private(a),
3145
+ b_ = simde__m64_to_private(b);
3146
+
3147
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3148
+ const uint32x4_t t1 = vmull_u16(a_.neon_u16, b_.neon_u16);
3149
+ const uint32x4_t t2 = vshrq_n_u32(t1, 16);
3150
+ const uint16x4_t t3 = vmovn_u32(t2);
3151
+ r_.neon_u16 = t3;
3152
+ #else
3153
+ SIMDE_VECTORIZE
3154
+ for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
3155
+ r_.u16[i] = HEDLEY_STATIC_CAST(
3156
+ uint16_t, ((HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) *
3157
+ HEDLEY_STATIC_CAST(uint32_t, b_.u16[i])) >>
3158
+ UINT32_C(16)));
3159
+ }
3160
+ #endif
3161
+
3162
+ return simde__m64_from_private(r_);
3163
+ #endif
3164
+ }
3165
+ #define simde_m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b)
3166
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3167
+ #define _mm_mulhi_pu16(a, b) simde_mm_mulhi_pu16(a, b)
3168
+ #define _m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b)
3169
+ #endif
3170
+
3171
/* Locality-hint constants accepted by simde_mm_prefetch(). With a native
 * SSE build on a compiler that defines HEDLEY_GCC_VERSION the values are
 * cast to enum _mm_hint; otherwise they are plain integers. */
#if defined(SIMDE_X86_SSE_NATIVE) && defined(HEDLEY_GCC_VERSION)
#define SIMDE_MM_HINT_NTA HEDLEY_STATIC_CAST(enum _mm_hint, 0)
#define SIMDE_MM_HINT_T0 HEDLEY_STATIC_CAST(enum _mm_hint, 1)
#define SIMDE_MM_HINT_T1 HEDLEY_STATIC_CAST(enum _mm_hint, 2)
#define SIMDE_MM_HINT_T2 HEDLEY_STATIC_CAST(enum _mm_hint, 3)
#define SIMDE_MM_HINT_ENTA HEDLEY_STATIC_CAST(enum _mm_hint, 4)
#define SIMDE_MM_HINT_ET0 HEDLEY_STATIC_CAST(enum _mm_hint, 5)
#define SIMDE_MM_HINT_ET1 HEDLEY_STATIC_CAST(enum _mm_hint, 6)
#define SIMDE_MM_HINT_ET2 HEDLEY_STATIC_CAST(enum _mm_hint, 7)
#else
#define SIMDE_MM_HINT_NTA 0
#define SIMDE_MM_HINT_T0 1
#define SIMDE_MM_HINT_T1 2
#define SIMDE_MM_HINT_T2 3
#define SIMDE_MM_HINT_ENTA 4
#define SIMDE_MM_HINT_ET0 5
#define SIMDE_MM_HINT_ET1 6
#define SIMDE_MM_HINT_ET2 7
#endif

/* Native-alias spellings. Each alias is #undef'd first so that a definition
 * supplied by a system header is replaced rather than redefined. */
#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
HEDLEY_DIAGNOSTIC_PUSH
#if HEDLEY_HAS_WARNING("-Wreserved-id-macro")
_Pragma("clang diagnostic ignored \"-Wreserved-id-macro\"")
#endif
#undef _MM_HINT_NTA
#define _MM_HINT_NTA SIMDE_MM_HINT_NTA
#undef _MM_HINT_T0
#define _MM_HINT_T0 SIMDE_MM_HINT_T0
#undef _MM_HINT_T1
#define _MM_HINT_T1 SIMDE_MM_HINT_T1
#undef _MM_HINT_T2
#define _MM_HINT_T2 SIMDE_MM_HINT_T2
#undef _MM_HINT_ENTA
#define _MM_HINT_ENTA SIMDE_MM_HINT_ENTA
/* "ETNA" is a transposed spelling of ENTA; it is kept as an alias so the
 * name still exists, but it must expand to the constant that is actually
 * defined above (SIMDE_MM_HINT_ENTA). */
#undef _MM_HINT_ETNA
#define _MM_HINT_ETNA SIMDE_MM_HINT_ENTA
#undef _MM_HINT_ET0
#define _MM_HINT_ET0 SIMDE_MM_HINT_ET0
#undef _MM_HINT_ET1
#define _MM_HINT_ET1 SIMDE_MM_HINT_ET1
#undef _MM_HINT_ET2
#define _MM_HINT_ET2 SIMDE_MM_HINT_ET2
HEDLEY_DIAGNOSTIC_POP
#endif
3214
+
3215
+ SIMDE_FUNCTION_ATTRIBUTES void simde_mm_prefetch(char const *p, int i)
3216
+ {
3217
+ #if defined(HEDLEY_GCC_VERSION)
3218
+ __builtin_prefetch(p);
3219
+ #else
3220
+ (void)p;
3221
+ #endif
3222
+
3223
+ (void)i;
3224
+ }
3225
+ #if defined(SIMDE_X86_SSE_NATIVE)
3226
+ #if defined(__clang__) && \
3227
+ !SIMDE_DETECT_CLANG_VERSION_CHECK( \
3228
+ 10, 0, 0) /* https://reviews.llvm.org/D71718 */
3229
+ #define simde_mm_prefetch(p, i) \
3230
+ (__extension__({ \
3231
+ HEDLEY_DIAGNOSTIC_PUSH \
3232
+ HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \
3233
+ _mm_prefetch((p), (i)); \
3234
+ HEDLEY_DIAGNOSTIC_POP \
3235
+ }))
3236
+ #else
3237
+ #define simde_mm_prefetch(p, i) _mm_prefetch(p, i)
3238
+ #endif
3239
+ #endif
3240
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3241
+ #define _mm_prefetch(p, i) simde_mm_prefetch(p, i)
3242
+ #endif
3243
+
3244
+ SIMDE_FUNCTION_ATTRIBUTES
3245
+ simde__m128 simde_x_mm_negate_ps(simde__m128 a)
3246
+ {
3247
+ #if defined(SIMDE_X86_SSE_NATIVE)
3248
+ return simde_mm_xor_ps(a, _mm_set1_ps(SIMDE_FLOAT32_C(-0.0)));
3249
+ #else
3250
+ simde__m128_private r_, a_ = simde__m128_to_private(a);
3251
+
3252
+ #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && \
3253
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8, 1, 0))
3254
+ r_.altivec_f32 = vec_neg(a_.altivec_f32);
3255
+ #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3256
+ r_.neon_f32 = vnegq_f32(a_.neon_f32);
3257
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3258
+ r_.wasm_v128 = wasm_f32x4_neg(a_.wasm_v128);
3259
+ #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
3260
+ r_.altivec_f32 = vec_neg(a_.altivec_f32);
3261
+ #elif defined(SIMDE_VECTOR_NEGATE)
3262
+ r_.f32 = -a_.f32;
3263
+ #else
3264
+ SIMDE_VECTORIZE
3265
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
3266
+ r_.f32[i] = -a_.f32[i];
3267
+ }
3268
+ #endif
3269
+
3270
+ return simde__m128_from_private(r_);
3271
+ #endif
3272
+ }
3273
+
3274
+ SIMDE_FUNCTION_ATTRIBUTES
3275
+ simde__m128 simde_mm_rcp_ps(simde__m128 a)
3276
+ {
3277
+ #if defined(SIMDE_X86_SSE_NATIVE)
3278
+ return _mm_rcp_ps(a);
3279
+ #else
3280
+ simde__m128_private r_, a_ = simde__m128_to_private(a);
3281
+
3282
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3283
+ float32x4_t recip = vrecpeq_f32(a_.neon_f32);
3284
+
3285
+ #if SIMDE_ACCURACY_PREFERENCE > 0
3286
+ for (int i = 0; i < SIMDE_ACCURACY_PREFERENCE; ++i) {
3287
+ recip = vmulq_f32(recip, vrecpsq_f32(recip, a_.neon_f32));
3288
+ }
3289
+ #endif
3290
+
3291
+ r_.neon_f32 = recip;
3292
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3293
+ r_.wasm_v128 = wasm_f32x4_div(simde_mm_set1_ps(1.0f), a_.wasm_v128);
3294
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3295
+ r_.altivec_f32 = vec_re(a_.altivec_f32);
3296
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
3297
+ r_.f32 = 1.0f / a_.f32;
3298
+ #elif defined(SIMDE_IEEE754_STORAGE)
3299
+ /* https://stackoverflow.com/questions/12227126/division-as-multiply-and-lut-fast-float-division-reciprocal/12228234#12228234 */
3300
+ SIMDE_VECTORIZE
3301
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
3302
+ int32_t ix;
3303
+ simde_float32 fx = a_.f32[i];
3304
+ simde_memcpy(&ix, &fx, sizeof(ix));
3305
+ int32_t x = INT32_C(0x7EF311C3) - ix;
3306
+ simde_float32 temp;
3307
+ simde_memcpy(&temp, &x, sizeof(temp));
3308
+ r_.f32[i] = temp * (SIMDE_FLOAT32_C(2.0) - temp * fx);
3309
+ }
3310
+ #else
3311
+ SIMDE_VECTORIZE
3312
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
3313
+ r_.f32[i] = 1.0f / a_.f32[i];
3314
+ }
3315
+ #endif
3316
+
3317
+ return simde__m128_from_private(r_);
3318
+ #endif
3319
+ }
3320
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3321
+ #define _mm_rcp_ps(a) simde_mm_rcp_ps((a))
3322
+ #endif
3323
+
3324
+ SIMDE_FUNCTION_ATTRIBUTES
3325
+ simde__m128 simde_mm_rcp_ss(simde__m128 a)
3326
+ {
3327
+ #if defined(SIMDE_X86_SSE_NATIVE)
3328
+ return _mm_rcp_ss(a);
3329
+ #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
3330
+ return simde_mm_move_ss(a, simde_mm_rcp_ps(a));
3331
+ #else
3332
+ simde__m128_private r_, a_ = simde__m128_to_private(a);
3333
+
3334
+ r_.f32[0] = 1.0f / a_.f32[0];
3335
+ r_.f32[1] = a_.f32[1];
3336
+ r_.f32[2] = a_.f32[2];
3337
+ r_.f32[3] = a_.f32[3];
3338
+
3339
+ return simde__m128_from_private(r_);
3340
+ #endif
3341
+ }
3342
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3343
+ #define _mm_rcp_ss(a) simde_mm_rcp_ss((a))
3344
+ #endif
3345
+
3346
+ SIMDE_FUNCTION_ATTRIBUTES
3347
+ simde__m128 simde_mm_rsqrt_ps(simde__m128 a)
3348
+ {
3349
+ #if defined(SIMDE_X86_SSE_NATIVE)
3350
+ return _mm_rsqrt_ps(a);
3351
+ #else
3352
+ simde__m128_private r_, a_ = simde__m128_to_private(a);
3353
+
3354
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3355
+ r_.neon_f32 = vrsqrteq_f32(a_.neon_f32);
3356
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3357
+ r_.altivec_f32 = vec_rsqrte(a_.altivec_f32);
3358
+ #elif defined(SIMDE_IEEE754_STORAGE)
3359
+ /* https://basesandframes.files.wordpress.com/2020/04/even_faster_math_functions_green_2020.pdf
3360
+ Pages 100 - 103 */
3361
+ SIMDE_VECTORIZE
3362
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
3363
+ #if SIMDE_ACCURACY_PREFERENCE <= 0
3364
+ r_.i32[i] = INT32_C(0x5F37624F) - (a_.i32[i] >> 1);
3365
+ #else
3366
+ simde_float32 x = a_.f32[i];
3367
+ simde_float32 xhalf = SIMDE_FLOAT32_C(0.5) * x;
3368
+ int32_t ix;
3369
+
3370
+ simde_memcpy(&ix, &x, sizeof(ix));
3371
+
3372
+ #if SIMDE_ACCURACY_PREFERENCE == 1
3373
+ ix = INT32_C(0x5F375A82) - (ix >> 1);
3374
+ #else
3375
+ ix = INT32_C(0x5F37599E) - (ix >> 1);
3376
+ #endif
3377
+
3378
+ simde_memcpy(&x, &ix, sizeof(x));
3379
+
3380
+ #if SIMDE_ACCURACY_PREFERENCE >= 2
3381
+ x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x);
3382
+ #endif
3383
+ x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x);
3384
+
3385
+ r_.f32[i] = x;
3386
+ #endif
3387
+ }
3388
+ #elif defined(simde_math_sqrtf)
3389
+ SIMDE_VECTORIZE
3390
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
3391
+ r_.f32[i] = 1.0f / simde_math_sqrtf(a_.f32[i]);
3392
+ }
3393
+ #else
3394
+ HEDLEY_UNREACHABLE();
3395
+ #endif
3396
+
3397
+ return simde__m128_from_private(r_);
3398
+ #endif
3399
+ }
3400
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3401
+ #define _mm_rsqrt_ps(a) simde_mm_rsqrt_ps((a))
3402
+ #endif
3403
+
3404
+ SIMDE_FUNCTION_ATTRIBUTES
3405
+ simde__m128 simde_mm_rsqrt_ss(simde__m128 a)
3406
+ {
3407
+ #if defined(SIMDE_X86_SSE_NATIVE)
3408
+ return _mm_rsqrt_ss(a);
3409
+ #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
3410
+ return simde_mm_move_ss(a, simde_mm_rsqrt_ps(a));
3411
+ #else
3412
+ simde__m128_private r_, a_ = simde__m128_to_private(a);
3413
+
3414
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3415
+ r_.neon_f32 =
3416
+ vsetq_lane_f32(vgetq_lane_f32(simde_mm_rsqrt_ps(a).neon_f32, 0),
3417
+ a_.neon_f32, 0);
3418
+ #elif defined(SIMDE_IEEE754_STORAGE)
3419
+ {
3420
+ #if SIMDE_ACCURACY_PREFERENCE <= 0
3421
+ r_.i32[0] = INT32_C(0x5F37624F) - (a_.i32[0] >> 1);
3422
+ #else
3423
+ simde_float32 x = a_.f32[0];
3424
+ simde_float32 xhalf = SIMDE_FLOAT32_C(0.5) * x;
3425
+ int32_t ix;
3426
+
3427
+ simde_memcpy(&ix, &x, sizeof(ix));
3428
+
3429
+ #if SIMDE_ACCURACY_PREFERENCE == 1
3430
+ ix = INT32_C(0x5F375A82) - (ix >> 1);
3431
+ #else
3432
+ ix = INT32_C(0x5F37599E) - (ix >> 1);
3433
+ #endif
3434
+
3435
+ simde_memcpy(&x, &ix, sizeof(x));
3436
+
3437
+ #if SIMDE_ACCURACY_PREFERENCE >= 2
3438
+ x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x);
3439
+ #endif
3440
+ x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x);
3441
+
3442
+ r_.f32[0] = x;
3443
+ #endif
3444
+ }
3445
+ r_.f32[1] = a_.f32[1];
3446
+ r_.f32[2] = a_.f32[2];
3447
+ r_.f32[3] = a_.f32[3];
3448
+ #elif defined(simde_math_sqrtf)
3449
+ r_.f32[0] = 1.0f / simde_math_sqrtf(a_.f32[0]);
3450
+ r_.f32[1] = a_.f32[1];
3451
+ r_.f32[2] = a_.f32[2];
3452
+ r_.f32[3] = a_.f32[3];
3453
+ #else
3454
+ HEDLEY_UNREACHABLE();
3455
+ #endif
3456
+
3457
+ return simde__m128_from_private(r_);
3458
+ #endif
3459
+ }
3460
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3461
+ #define _mm_rsqrt_ss(a) simde_mm_rsqrt_ss((a))
3462
+ #endif
3463
+
3464
+ SIMDE_FUNCTION_ATTRIBUTES
3465
+ simde__m64 simde_mm_sad_pu8(simde__m64 a, simde__m64 b)
3466
+ {
3467
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3468
+ return _mm_sad_pu8(a, b);
3469
+ #else
3470
+ simde__m64_private r_, a_ = simde__m64_to_private(a),
3471
+ b_ = simde__m64_to_private(b);
3472
+
3473
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3474
+ uint16x4_t t = vpaddl_u8(vabd_u8(a_.neon_u8, b_.neon_u8));
3475
+ uint16_t r0 = t[0] + t[1] + t[2] + t[3];
3476
+ r_.neon_u16 = vset_lane_u16(r0, vdup_n_u16(0), 0);
3477
+ #else
3478
+ uint16_t sum = 0;
3479
+
3480
+ #if defined(SIMDE_HAVE_STDLIB_H)
3481
+ SIMDE_VECTORIZE_REDUCTION(+ : sum)
3482
+ for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
3483
+ sum += HEDLEY_STATIC_CAST(uint8_t, abs(a_.u8[i] - b_.u8[i]));
3484
+ }
3485
+
3486
+ r_.i16[0] = HEDLEY_STATIC_CAST(int16_t, sum);
3487
+ r_.i16[1] = 0;
3488
+ r_.i16[2] = 0;
3489
+ r_.i16[3] = 0;
3490
+ #else
3491
+ HEDLEY_UNREACHABLE();
3492
+ #endif
3493
+ #endif
3494
+
3495
+ return simde__m64_from_private(r_);
3496
+ #endif
3497
+ }
3498
+ #define simde_m_psadbw(a, b) simde_mm_sad_pu8(a, b)
3499
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3500
+ #define _mm_sad_pu8(a, b) simde_mm_sad_pu8(a, b)
3501
+ #define _m_psadbw(a, b) simde_mm_sad_pu8(a, b)
3502
+ #endif
3503
+
3504
+ SIMDE_FUNCTION_ATTRIBUTES
3505
+ simde__m128 simde_mm_set_ss(simde_float32 a)
3506
+ {
3507
+ #if defined(SIMDE_X86_SSE_NATIVE)
3508
+ return _mm_set_ss(a);
3509
+ #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3510
+ return vsetq_lane_f32(a, vdupq_n_f32(SIMDE_FLOAT32_C(0.0)), 0);
3511
+ #else
3512
+ return simde_mm_set_ps(SIMDE_FLOAT32_C(0.0), SIMDE_FLOAT32_C(0.0),
3513
+ SIMDE_FLOAT32_C(0.0), a);
3514
+ #endif
3515
+ }
3516
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3517
+ #define _mm_set_ss(a) simde_mm_set_ss(a)
3518
+ #endif
3519
+
3520
+ SIMDE_FUNCTION_ATTRIBUTES
3521
+ simde__m128 simde_mm_setr_ps(simde_float32 e3, simde_float32 e2,
3522
+ simde_float32 e1, simde_float32 e0)
3523
+ {
3524
+ #if defined(SIMDE_X86_SSE_NATIVE)
3525
+ return _mm_setr_ps(e3, e2, e1, e0);
3526
+ #else
3527
+ return simde_mm_set_ps(e0, e1, e2, e3);
3528
+ #endif
3529
+ }
3530
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3531
+ #define _mm_setr_ps(e3, e2, e1, e0) simde_mm_setr_ps(e3, e2, e1, e0)
3532
+ #endif
3533
+
3534
+ SIMDE_FUNCTION_ATTRIBUTES
3535
+ simde__m128 simde_mm_setzero_ps(void)
3536
+ {
3537
+ #if defined(SIMDE_X86_SSE_NATIVE)
3538
+ return _mm_setzero_ps();
3539
+ #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3540
+ return vdupq_n_f32(SIMDE_FLOAT32_C(0.0));
3541
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3542
+ return vec_splats(SIMDE_FLOAT32_C(0.0));
3543
+ #else
3544
+ simde__m128 r;
3545
+ simde_memset(&r, 0, sizeof(r));
3546
+ return r;
3547
+ #endif
3548
+ }
3549
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3550
+ #define _mm_setzero_ps() simde_mm_setzero_ps()
3551
+ #endif
3552
+
3553
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
3554
+ HEDLEY_DIAGNOSTIC_PUSH
3555
+ SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
3556
+ #endif
3557
+
3558
+ SIMDE_FUNCTION_ATTRIBUTES
3559
+ simde__m128 simde_mm_undefined_ps(void)
3560
+ {
3561
+ simde__m128_private r_;
3562
+
3563
+ #if defined(SIMDE_HAVE_UNDEFINED128)
3564
+ r_.n = _mm_undefined_ps();
3565
+ #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
3566
+ r_ = simde__m128_to_private(simde_mm_setzero_ps());
3567
+ #endif
3568
+
3569
+ return simde__m128_from_private(r_);
3570
+ }
3571
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3572
+ #define _mm_undefined_ps() simde_mm_undefined_ps()
3573
+ #endif
3574
+
3575
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
3576
+ HEDLEY_DIAGNOSTIC_POP
3577
+ #endif
3578
+
3579
+ SIMDE_FUNCTION_ATTRIBUTES
3580
+ simde__m128 simde_x_mm_setone_ps(void)
3581
+ {
3582
+ simde__m128 t = simde_mm_setzero_ps();
3583
+ return simde_mm_cmpeq_ps(t, t);
3584
+ }
3585
+
3586
+ SIMDE_FUNCTION_ATTRIBUTES
3587
+ void simde_mm_sfence(void)
3588
+ {
3589
+ /* TODO: Use Hedley. */
3590
+ #if defined(SIMDE_X86_SSE_NATIVE)
3591
+ _mm_sfence();
3592
+ #elif defined(__GNUC__) && \
3593
+ ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
3594
+ __atomic_thread_fence(__ATOMIC_SEQ_CST);
3595
+ #elif !defined(__INTEL_COMPILER) && defined(__STDC_VERSION__) && \
3596
+ (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__)
3597
+ #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ < 9)
3598
+ __atomic_thread_fence(__ATOMIC_SEQ_CST);
3599
+ #else
3600
+ atomic_thread_fence(memory_order_seq_cst);
3601
+ #endif
3602
+ #elif defined(_MSC_VER)
3603
+ MemoryBarrier();
3604
+ #elif HEDLEY_HAS_EXTENSION(c_atomic)
3605
+ __c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
3606
+ #elif defined(__GNUC__) && \
3607
+ ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
3608
+ __sync_synchronize();
3609
+ #elif defined(_OPENMP)
3610
+ #pragma omp critical(simde_mm_sfence_)
3611
+ {
3612
+ }
3613
+ #endif
3614
+ }
3615
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3616
+ #define _mm_sfence() simde_mm_sfence()
3617
+ #endif
3618
+
3619
/* Packs four 2-bit selectors into one 8-bit shuffle immediate:
 * `w` occupies bits 0-1, `x` bits 2-3, `y` bits 4-5 and `z` bits 6-7. */
#define SIMDE_MM_SHUFFLE(z, y, x, w) \
	((w) | ((x) << 2) | ((y) << 4) | ((z) << 6))
#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
#define _MM_SHUFFLE(z, y, x, w) SIMDE_MM_SHUFFLE(z, y, x, w)
#endif
3624
+
3625
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && \
3626
+ !defined(__PGI)
3627
+ #define simde_mm_shuffle_pi16(a, imm8) _mm_shuffle_pi16(a, imm8)
3628
+ #elif defined(SIMDE_SHUFFLE_VECTOR_)
3629
+ #define simde_mm_shuffle_pi16(a, imm8) \
3630
+ (__extension__({ \
3631
+ const simde__m64_private simde__tmp_a_ = \
3632
+ simde__m64_to_private(a); \
3633
+ simde__m64_from_private((simde__m64_private){ \
3634
+ .i16 = SIMDE_SHUFFLE_VECTOR_( \
3635
+ 16, 8, (simde__tmp_a_).i16, \
3636
+ (simde__tmp_a_).i16, (((imm8)) & 3), \
3637
+ (((imm8) >> 2) & 3), (((imm8) >> 4) & 3), \
3638
+ (((imm8) >> 6) & 3))}); \
3639
+ }))
3640
+ #else
3641
+ SIMDE_FUNCTION_ATTRIBUTES
3642
+ simde__m64 simde_mm_shuffle_pi16(simde__m64 a, const int imm8)
3643
+ SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)
3644
+ {
3645
+ simde__m64_private r_;
3646
+ simde__m64_private a_ = simde__m64_to_private(a);
3647
+
3648
+ for (size_t i = 0; i < sizeof(r_.i16) / sizeof(r_.i16[0]); i++) {
3649
+ r_.i16[i] = a_.i16[(imm8 >> (i * 2)) & 3];
3650
+ }
3651
+
3652
+ HEDLEY_DIAGNOSTIC_PUSH
3653
+ #if HEDLEY_HAS_WARNING("-Wconditional-uninitialized")
3654
+ #pragma clang diagnostic ignored "-Wconditional-uninitialized"
3655
+ #endif
3656
+ return simde__m64_from_private(r_);
3657
+ HEDLEY_DIAGNOSTIC_POP
3658
+ }
3659
+ #endif
3660
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && \
3661
+ !defined(__PGI)
3662
+ #define simde_m_pshufw(a, imm8) _m_pshufw(a, imm8)
3663
+ #else
3664
+ #define simde_m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8)
3665
+ #endif
3666
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3667
+ #define _mm_shuffle_pi16(a, imm8) simde_mm_shuffle_pi16(a, imm8)
3668
+ #define _m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8)
3669
+ #endif
3670
+
3671
+ #if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI)
3672
+ #define simde_mm_shuffle_ps(a, b, imm8) _mm_shuffle_ps(a, b, imm8)
3673
+ #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3674
+ #define simde_mm_shuffle_ps(a, b, imm8) \
3675
+ __extension__({ \
3676
+ float32x4_t ret; \
3677
+ ret = vmovq_n_f32(vgetq_lane_f32(a, (imm8) & (0x3))); \
3678
+ ret = vsetq_lane_f32(vgetq_lane_f32(a, ((imm8) >> 2) & 0x3), \
3679
+ ret, 1); \
3680
+ ret = vsetq_lane_f32(vgetq_lane_f32(b, ((imm8) >> 4) & 0x3), \
3681
+ ret, 2); \
3682
+ ret = vsetq_lane_f32(vgetq_lane_f32(b, ((imm8) >> 6) & 0x3), \
3683
+ ret, 3); \
3684
+ })
3685
+ #elif defined(SIMDE_SHUFFLE_VECTOR_)
3686
+ #define simde_mm_shuffle_ps(a, b, imm8) \
3687
+ (__extension__({ \
3688
+ simde__m128_from_private((simde__m128_private){ \
3689
+ .f32 = SIMDE_SHUFFLE_VECTOR_( \
3690
+ 32, 16, simde__m128_to_private(a).f32, \
3691
+ simde__m128_to_private(b).f32, (((imm8)) & 3), \
3692
+ (((imm8) >> 2) & 3), (((imm8) >> 4) & 3) + 4, \
3693
+ (((imm8) >> 6) & 3) + 4)}); \
3694
+ }))
3695
+ #else
3696
+ SIMDE_FUNCTION_ATTRIBUTES
3697
+ simde__m128 simde_mm_shuffle_ps(simde__m128 a, simde__m128 b, const int imm8)
3698
+ SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)
3699
+ {
3700
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
3701
+ b_ = simde__m128_to_private(b);
3702
+
3703
+ r_.f32[0] = a_.f32[(imm8 >> 0) & 3];
3704
+ r_.f32[1] = a_.f32[(imm8 >> 2) & 3];
3705
+ r_.f32[2] = b_.f32[(imm8 >> 4) & 3];
3706
+ r_.f32[3] = b_.f32[(imm8 >> 6) & 3];
3707
+
3708
+ return simde__m128_from_private(r_);
3709
+ }
3710
+ #endif
3711
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3712
+ #define _mm_shuffle_ps(a, b, imm8) simde_mm_shuffle_ps((a), (b), imm8)
3713
+ #endif
3714
+
3715
+ SIMDE_FUNCTION_ATTRIBUTES
3716
+ simde__m128 simde_mm_sqrt_ps(simde__m128 a)
3717
+ {
3718
+ #if defined(SIMDE_X86_SSE_NATIVE)
3719
+ return _mm_sqrt_ps(a);
3720
+ #else
3721
+ simde__m128_private r_, a_ = simde__m128_to_private(a);
3722
+
3723
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3724
+ r_.neon_f32 = vsqrtq_f32(a_.neon_f32);
3725
+ #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3726
+ float32x4_t est = vrsqrteq_f32(a_.neon_f32);
3727
+ for (int i = 0; i <= SIMDE_ACCURACY_PREFERENCE; i++) {
3728
+ est = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a_.neon_f32, est), est),
3729
+ est);
3730
+ }
3731
+ r_.neon_f32 = vmulq_f32(a_.neon_f32, est);
3732
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3733
+ r_.wasm_v128 = wasm_f32x4_sqrt(a_.wasm_v128);
3734
+ #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
3735
+ r_.altivec_f32 = vec_sqrt(a_.altivec_f32);
3736
+ #elif defined(simde_math_sqrt)
3737
+ SIMDE_VECTORIZE
3738
+ for (size_t i = 0; i < sizeof(r_.f32) / sizeof(r_.f32[0]); i++) {
3739
+ r_.f32[i] = simde_math_sqrtf(a_.f32[i]);
3740
+ }
3741
+ #else
3742
+ HEDLEY_UNREACHABLE();
3743
+ #endif
3744
+
3745
+ return simde__m128_from_private(r_);
3746
+ #endif
3747
+ }
3748
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3749
+ #define _mm_sqrt_ps(a) simde_mm_sqrt_ps((a))
3750
+ #endif
3751
+
3752
+ SIMDE_FUNCTION_ATTRIBUTES
3753
+ simde__m128 simde_mm_sqrt_ss(simde__m128 a)
3754
+ {
3755
+ #if defined(SIMDE_X86_SSE_NATIVE)
3756
+ return _mm_sqrt_ss(a);
3757
+ #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
3758
+ return simde_mm_move_ss(a, simde_mm_sqrt_ps(a));
3759
+ #else
3760
+ simde__m128_private r_, a_ = simde__m128_to_private(a);
3761
+
3762
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3763
+ float32_t value = vgetq_lane_f32(
3764
+ simde__m128_to_private(simde_mm_sqrt_ps(a)).neon_f32, 0);
3765
+ r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0);
3766
+ #elif defined(simde_math_sqrtf)
3767
+ r_.f32[0] = simde_math_sqrtf(a_.f32[0]);
3768
+ r_.f32[1] = a_.f32[1];
3769
+ r_.f32[2] = a_.f32[2];
3770
+ r_.f32[3] = a_.f32[3];
3771
+ #else
3772
+ HEDLEY_UNREACHABLE();
3773
+ #endif
3774
+
3775
+ return simde__m128_from_private(r_);
3776
+ #endif
3777
+ }
3778
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3779
+ #define _mm_sqrt_ss(a) simde_mm_sqrt_ss((a))
3780
+ #endif
3781
+
3782
+ SIMDE_FUNCTION_ATTRIBUTES
3783
+ void simde_mm_store_ps(simde_float32 mem_addr[4], simde__m128 a)
3784
+ {
3785
+ #if defined(SIMDE_X86_SSE_NATIVE)
3786
+ _mm_store_ps(mem_addr, a);
3787
+ #else
3788
+ simde__m128_private a_ = simde__m128_to_private(a);
3789
+
3790
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3791
+ vst1q_f32(mem_addr, a_.neon_f32);
3792
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3793
+ vec_st(a_.altivec_f32, 0, mem_addr);
3794
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3795
+ wasm_v128_store(mem_addr, a_.wasm_v128);
3796
+ #else
3797
+ simde_memcpy(mem_addr, &a_, sizeof(a));
3798
+ #endif
3799
+ #endif
3800
+ }
3801
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3802
+ #define _mm_store_ps(mem_addr, a) \
3803
+ simde_mm_store_ps(SIMDE_CHECKED_REINTERPRET_CAST( \
3804
+ float *, simde_float32 *, mem_addr), \
3805
+ (a))
3806
+ #endif
3807
+
3808
+ SIMDE_FUNCTION_ATTRIBUTES
3809
+ void simde_mm_store1_ps(simde_float32 mem_addr[4], simde__m128 a)
3810
+ {
3811
+ simde_float32 *mem_addr_ =
3812
+ SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128);
3813
+
3814
+ #if defined(SIMDE_X86_SSE_NATIVE)
3815
+ _mm_store_ps1(mem_addr_, a);
3816
+ #else
3817
+ simde__m128_private a_ = simde__m128_to_private(a);
3818
+
3819
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3820
+ vst1q_f32(mem_addr_, vdupq_lane_f32(vget_low_f32(a_.neon_f32), 0));
3821
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3822
+ wasm_v128_store(mem_addr_,
3823
+ wasm_v32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 0, 0,
3824
+ 0));
3825
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3826
+ vec_st(vec_splat(a_.altivec_f32, 0), 0, mem_addr_);
3827
+ #elif defined(SIMDE_SHUFFLE_VECTOR_)
3828
+ simde__m128_private tmp_;
3829
+ tmp_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 0, 0);
3830
+ simde_mm_store_ps(mem_addr_, tmp_.f32);
3831
+ #else
3832
+ SIMDE_VECTORIZE_ALIGNED(mem_addr_ : 16)
3833
+ for (size_t i = 0; i < sizeof(a_.f32) / sizeof(a_.f32[0]); i++) {
3834
+ mem_addr_[i] = a_.f32[0];
3835
+ }
3836
+ #endif
3837
+ #endif
3838
+ }
3839
+ #define simde_mm_store_ps1(mem_addr, a) simde_mm_store1_ps(mem_addr, a)
3840
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3841
+ #define _mm_store_ps1(mem_addr, a) \
3842
+ simde_mm_store1_ps(SIMDE_CHECKED_REINTERPRET_CAST( \
3843
+ float *, simde_float32 *, mem_addr), \
3844
+ (a))
3845
+ #define _mm_store1_ps(mem_addr, a) \
3846
+ simde_mm_store1_ps(SIMDE_CHECKED_REINTERPRET_CAST( \
3847
+ float *, simde_float32 *, mem_addr), \
3848
+ (a))
3849
+ #endif
3850
+
3851
+ SIMDE_FUNCTION_ATTRIBUTES
3852
+ void simde_mm_store_ss(simde_float32 *mem_addr, simde__m128 a)
3853
+ {
3854
+ #if defined(SIMDE_X86_SSE_NATIVE)
3855
+ _mm_store_ss(mem_addr, a);
3856
+ #else
3857
+ simde__m128_private a_ = simde__m128_to_private(a);
3858
+
3859
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3860
+ vst1q_lane_f32(mem_addr, a_.neon_f32, 0);
3861
+ #else
3862
+ *mem_addr = a_.f32[0];
3863
+ #endif
3864
+ #endif
3865
+ }
3866
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3867
+ #define _mm_store_ss(mem_addr, a) \
3868
+ simde_mm_store_ss(SIMDE_CHECKED_REINTERPRET_CAST( \
3869
+ float *, simde_float32 *, mem_addr), \
3870
+ (a))
3871
+ #endif
3872
+
3873
+ SIMDE_FUNCTION_ATTRIBUTES
3874
+ void simde_mm_storeh_pi(simde__m64 *mem_addr, simde__m128 a)
3875
+ {
3876
+ #if defined(SIMDE_X86_SSE_NATIVE)
3877
+ _mm_storeh_pi(HEDLEY_REINTERPRET_CAST(__m64 *, mem_addr), a);
3878
+ #else
3879
+ simde__m128_private a_ = simde__m128_to_private(a);
3880
+
3881
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3882
+ vst1_f32(HEDLEY_REINTERPRET_CAST(float32_t *, mem_addr),
3883
+ vget_high_f32(a_.neon_f32));
3884
+ #else
3885
+ simde_memcpy(mem_addr, &(a_.m64[1]), sizeof(a_.m64[1]));
3886
+ #endif
3887
+ #endif
3888
+ }
3889
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3890
+ #define _mm_storeh_pi(mem_addr, a) simde_mm_storeh_pi(mem_addr, (a))
3891
+ #endif
3892
+
3893
+ SIMDE_FUNCTION_ATTRIBUTES
3894
+ void simde_mm_storel_pi(simde__m64 *mem_addr, simde__m128 a)
3895
+ {
3896
+ #if defined(SIMDE_X86_SSE_NATIVE)
3897
+ _mm_storel_pi(HEDLEY_REINTERPRET_CAST(__m64 *, mem_addr), a);
3898
+ #else
3899
+ simde__m64_private *dest_ =
3900
+ HEDLEY_REINTERPRET_CAST(simde__m64_private *, mem_addr);
3901
+ simde__m128_private a_ = simde__m128_to_private(a);
3902
+
3903
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3904
+ dest_->neon_f32 = vget_low_f32(a_.neon_f32);
3905
+ #else
3906
+ dest_->f32[0] = a_.f32[0];
3907
+ dest_->f32[1] = a_.f32[1];
3908
+ #endif
3909
+ #endif
3910
+ }
3911
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3912
+ #define _mm_storel_pi(mem_addr, a) simde_mm_storel_pi(mem_addr, (a))
3913
+ #endif
3914
+
3915
+ SIMDE_FUNCTION_ATTRIBUTES
3916
+ void simde_mm_storer_ps(simde_float32 mem_addr[4], simde__m128 a)
3917
+ {
3918
+ #if defined(SIMDE_X86_SSE_NATIVE)
3919
+ _mm_storer_ps(mem_addr, a);
3920
+ #else
3921
+ simde__m128_private a_ = simde__m128_to_private(a);
3922
+
3923
+ #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3924
+ vec_st(vec_reve(a_.altivec_f32), 0, mem_addr);
3925
+ #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3926
+ float32x4_t tmp = vrev64q_f32(a_.neon_f32);
3927
+ vst1q_f32(mem_addr, vextq_f32(tmp, tmp, 2));
3928
+ #elif defined(SIMDE_SHUFFLE_VECTOR_)
3929
+ a_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 3, 2, 1, 0);
3930
+ simde_mm_store_ps(mem_addr, simde__m128_from_private(a_));
3931
+ #else
3932
+ SIMDE_VECTORIZE_ALIGNED(mem_addr : 16)
3933
+ for (size_t i = 0; i < sizeof(a_.f32) / sizeof(a_.f32[0]); i++) {
3934
+ mem_addr[i] =
3935
+ a_.f32[((sizeof(a_.f32) / sizeof(a_.f32[0])) - 1) - i];
3936
+ }
3937
+ #endif
3938
+ #endif
3939
+ }
3940
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3941
+ #define _mm_storer_ps(mem_addr, a) \
3942
+ simde_mm_storer_ps(SIMDE_CHECKED_REINTERPRET_CAST( \
3943
+ float *, simde_float32 *, mem_addr), \
3944
+ (a))
3945
+ #endif
3946
+
3947
+ SIMDE_FUNCTION_ATTRIBUTES
3948
+ void simde_mm_storeu_ps(simde_float32 mem_addr[4], simde__m128 a)
3949
+ {
3950
+ #if defined(SIMDE_X86_SSE_NATIVE)
3951
+ _mm_storeu_ps(mem_addr, a);
3952
+ #else
3953
+ simde__m128_private a_ = simde__m128_to_private(a);
3954
+
3955
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3956
+ vst1q_f32(mem_addr, a_.neon_f32);
3957
+ #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
3958
+ vec_vsx_st(a_.altivec_f32, 0, mem_addr);
3959
+ #else
3960
+ simde_memcpy(mem_addr, &a_, sizeof(a_));
3961
+ #endif
3962
+ #endif
3963
+ }
3964
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3965
+ #define _mm_storeu_ps(mem_addr, a) \
3966
+ simde_mm_storeu_ps(SIMDE_CHECKED_REINTERPRET_CAST( \
3967
+ float *, simde_float32 *, mem_addr), \
3968
+ (a))
3969
+ #endif
3970
+
3971
+ SIMDE_FUNCTION_ATTRIBUTES
3972
+ simde__m128 simde_mm_sub_ps(simde__m128 a, simde__m128 b)
3973
+ {
3974
+ #if defined(SIMDE_X86_SSE_NATIVE)
3975
+ return _mm_sub_ps(a, b);
3976
+ #else
3977
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
3978
+ b_ = simde__m128_to_private(b);
3979
+
3980
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3981
+ r_.neon_f32 = vsubq_f32(a_.neon_f32, b_.neon_f32);
3982
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3983
+ r_.wasm_v128 = wasm_f32x4_sub(a_.wasm_v128, b_.wasm_v128);
3984
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3985
+ r_.altivec_f32 = vec_sub(a_.altivec_f32, b_.altivec_f32);
3986
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3987
+ r_.f32 = a_.f32 - b_.f32;
3988
+ #else
3989
+ SIMDE_VECTORIZE
3990
+ for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
3991
+ r_.f32[i] = a_.f32[i] - b_.f32[i];
3992
+ }
3993
+ #endif
3994
+
3995
+ return simde__m128_from_private(r_);
3996
+ #endif
3997
+ }
3998
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3999
+ #define _mm_sub_ps(a, b) simde_mm_sub_ps((a), (b))
4000
+ #endif
4001
+
4002
+ SIMDE_FUNCTION_ATTRIBUTES
4003
+ simde__m128 simde_mm_sub_ss(simde__m128 a, simde__m128 b)
4004
+ {
4005
+ #if defined(SIMDE_X86_SSE_NATIVE)
4006
+ return _mm_sub_ss(a, b);
4007
+ #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
4008
+ return simde_mm_move_ss(a, simde_mm_sub_ps(a, b));
4009
+ #else
4010
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
4011
+ b_ = simde__m128_to_private(b);
4012
+
4013
+ r_.f32[0] = a_.f32[0] - b_.f32[0];
4014
+ r_.f32[1] = a_.f32[1];
4015
+ r_.f32[2] = a_.f32[2];
4016
+ r_.f32[3] = a_.f32[3];
4017
+
4018
+ return simde__m128_from_private(r_);
4019
+ #endif
4020
+ }
4021
+
4022
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4023
+ #define _mm_sub_ss(a, b) simde_mm_sub_ss((a), (b))
4024
+ #endif
4025
+
4026
+ SIMDE_FUNCTION_ATTRIBUTES
4027
+ int simde_mm_ucomieq_ss(simde__m128 a, simde__m128 b)
4028
+ {
4029
+ #if defined(SIMDE_X86_SSE_NATIVE)
4030
+ return _mm_ucomieq_ss(a, b);
4031
+ #else
4032
+ simde__m128_private a_ = simde__m128_to_private(a),
4033
+ b_ = simde__m128_to_private(b);
4034
+ int r;
4035
+
4036
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4037
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
4038
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
4039
+ uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
4040
+ uint32x4_t a_eq_b = vceqq_f32(a_.neon_f32, b_.neon_f32);
4041
+ r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0);
4042
+ #elif defined(SIMDE_HAVE_FENV_H)
4043
+ fenv_t envp;
4044
+ int x = feholdexcept(&envp);
4045
+ r = a_.f32[0] == b_.f32[0];
4046
+ if (HEDLEY_LIKELY(x == 0))
4047
+ fesetenv(&envp);
4048
+ #else
4049
+ r = a_.f32[0] == b_.f32[0];
4050
+ #endif
4051
+
4052
+ return r;
4053
+ #endif
4054
+ }
4055
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4056
+ #define _mm_ucomieq_ss(a, b) simde_mm_ucomieq_ss((a), (b))
4057
+ #endif
4058
+
4059
+ SIMDE_FUNCTION_ATTRIBUTES
4060
+ int simde_mm_ucomige_ss(simde__m128 a, simde__m128 b)
4061
+ {
4062
+ #if defined(SIMDE_X86_SSE_NATIVE)
4063
+ return _mm_ucomige_ss(a, b);
4064
+ #else
4065
+ simde__m128_private a_ = simde__m128_to_private(a),
4066
+ b_ = simde__m128_to_private(b);
4067
+ int r;
4068
+
4069
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4070
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
4071
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
4072
+ uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4073
+ uint32x4_t a_ge_b = vcgeq_f32(a_.neon_f32, b_.neon_f32);
4074
+ r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0);
4075
+ #elif defined(SIMDE_HAVE_FENV_H)
4076
+ fenv_t envp;
4077
+ int x = feholdexcept(&envp);
4078
+ r = a_.f32[0] >= b_.f32[0];
4079
+ if (HEDLEY_LIKELY(x == 0))
4080
+ fesetenv(&envp);
4081
+ #else
4082
+ r = a_.f32[0] >= b_.f32[0];
4083
+ #endif
4084
+
4085
+ return r;
4086
+ #endif
4087
+ }
4088
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4089
+ #define _mm_ucomige_ss(a, b) simde_mm_ucomige_ss((a), (b))
4090
+ #endif
4091
+
4092
+ SIMDE_FUNCTION_ATTRIBUTES
4093
+ int simde_mm_ucomigt_ss(simde__m128 a, simde__m128 b)
4094
+ {
4095
+ #if defined(SIMDE_X86_SSE_NATIVE)
4096
+ return _mm_ucomigt_ss(a, b);
4097
+ #else
4098
+ simde__m128_private a_ = simde__m128_to_private(a),
4099
+ b_ = simde__m128_to_private(b);
4100
+ int r;
4101
+
4102
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4103
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
4104
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
4105
+ uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4106
+ uint32x4_t a_gt_b = vcgtq_f32(a_.neon_f32, b_.neon_f32);
4107
+ r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0);
4108
+ #elif defined(SIMDE_HAVE_FENV_H)
4109
+ fenv_t envp;
4110
+ int x = feholdexcept(&envp);
4111
+ r = a_.f32[0] > b_.f32[0];
4112
+ if (HEDLEY_LIKELY(x == 0))
4113
+ fesetenv(&envp);
4114
+ #else
4115
+ r = a_.f32[0] > b_.f32[0];
4116
+ #endif
4117
+
4118
+ return r;
4119
+ #endif
4120
+ }
4121
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4122
+ #define _mm_ucomigt_ss(a, b) simde_mm_ucomigt_ss((a), (b))
4123
+ #endif
4124
+
4125
+ SIMDE_FUNCTION_ATTRIBUTES
4126
+ int simde_mm_ucomile_ss(simde__m128 a, simde__m128 b)
4127
+ {
4128
+ #if defined(SIMDE_X86_SSE_NATIVE)
4129
+ return _mm_ucomile_ss(a, b);
4130
+ #else
4131
+ simde__m128_private a_ = simde__m128_to_private(a),
4132
+ b_ = simde__m128_to_private(b);
4133
+ int r;
4134
+
4135
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4136
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
4137
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
4138
+ uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
4139
+ uint32x4_t a_le_b = vcleq_f32(a_.neon_f32, b_.neon_f32);
4140
+ r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0);
4141
+ #elif defined(SIMDE_HAVE_FENV_H)
4142
+ fenv_t envp;
4143
+ int x = feholdexcept(&envp);
4144
+ r = a_.f32[0] <= b_.f32[0];
4145
+ if (HEDLEY_LIKELY(x == 0))
4146
+ fesetenv(&envp);
4147
+ #else
4148
+ r = a_.f32[0] <= b_.f32[0];
4149
+ #endif
4150
+
4151
+ return r;
4152
+ #endif
4153
+ }
4154
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4155
+ #define _mm_ucomile_ss(a, b) simde_mm_ucomile_ss((a), (b))
4156
+ #endif
4157
+
4158
+ SIMDE_FUNCTION_ATTRIBUTES
4159
+ int simde_mm_ucomilt_ss(simde__m128 a, simde__m128 b)
4160
+ {
4161
+ #if defined(SIMDE_X86_SSE_NATIVE)
4162
+ return _mm_ucomilt_ss(a, b);
4163
+ #else
4164
+ simde__m128_private a_ = simde__m128_to_private(a),
4165
+ b_ = simde__m128_to_private(b);
4166
+ int r;
4167
+
4168
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4169
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
4170
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
4171
+ uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
4172
+ uint32x4_t a_lt_b = vcltq_f32(a_.neon_f32, b_.neon_f32);
4173
+ r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0);
4174
+ #elif defined(SIMDE_HAVE_FENV_H)
4175
+ fenv_t envp;
4176
+ int x = feholdexcept(&envp);
4177
+ r = a_.f32[0] < b_.f32[0];
4178
+ if (HEDLEY_LIKELY(x == 0))
4179
+ fesetenv(&envp);
4180
+ #else
4181
+ r = a_.f32[0] < b_.f32[0];
4182
+ #endif
4183
+
4184
+ return r;
4185
+ #endif
4186
+ }
4187
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4188
+ #define _mm_ucomilt_ss(a, b) simde_mm_ucomilt_ss((a), (b))
4189
+ #endif
4190
+
4191
+ SIMDE_FUNCTION_ATTRIBUTES
4192
+ int simde_mm_ucomineq_ss(simde__m128 a, simde__m128 b)
4193
+ {
4194
+ #if defined(SIMDE_X86_SSE_NATIVE)
4195
+ return _mm_ucomineq_ss(a, b);
4196
+ #else
4197
+ simde__m128_private a_ = simde__m128_to_private(a),
4198
+ b_ = simde__m128_to_private(b);
4199
+ int r;
4200
+
4201
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4202
+ uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
4203
+ uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
4204
+ uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4205
+ uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32));
4206
+ r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0);
4207
+ #elif defined(SIMDE_HAVE_FENV_H)
4208
+ fenv_t envp;
4209
+ int x = feholdexcept(&envp);
4210
+ r = a_.f32[0] != b_.f32[0];
4211
+ if (HEDLEY_LIKELY(x == 0))
4212
+ fesetenv(&envp);
4213
+ #else
4214
+ r = a_.f32[0] != b_.f32[0];
4215
+ #endif
4216
+
4217
+ return r;
4218
+ #endif
4219
+ }
4220
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4221
+ #define _mm_ucomineq_ss(a, b) simde_mm_ucomineq_ss((a), (b))
4222
+ #endif
4223
+
4224
/*
 * Decide whether SIMDE_HAVE_UNDEFINED128 may be defined.  Only considered
 * for native SSE builds:
 *   - compilers that provide __has_builtin must report
 *     __builtin_ia32_undef128;
 *   - otherwise it is enabled unless the compiler is PGI, MSVC, or a GCC
 *     flagged with SIMDE_BUG_GCC_REV_208793.
 * The __has_builtin(...) test stays in its own nested #if so that it is
 * never evaluated by a preprocessor that lacks __has_builtin.
 */
#ifdef SIMDE_X86_SSE_NATIVE
#ifdef __has_builtin
#if __has_builtin(__builtin_ia32_undef128)
#define SIMDE_HAVE_UNDEFINED128
#endif
#elif !defined(__PGI) && !defined(SIMDE_BUG_GCC_REV_208793) && \
	!defined(_MSC_VER)
#define SIMDE_HAVE_UNDEFINED128
#endif
#endif

/*
 * Silence "uninitialized variable" diagnostics for the code that follows,
 * when the project provides a way to do so.
 * NOTE(review): the matching HEDLEY_DIAGNOSTIC_POP is not in this part of
 * the file -- confirm it exists.
 */
#ifdef SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
#endif
4239
+
4240
+ SIMDE_FUNCTION_ATTRIBUTES
4241
+ simde__m128 simde_mm_unpackhi_ps(simde__m128 a, simde__m128 b)
4242
+ {
4243
+ #if defined(SIMDE_X86_SSE_NATIVE)
4244
+ return _mm_unpackhi_ps(a, b);
4245
+ #else
4246
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
4247
+ b_ = simde__m128_to_private(b);
4248
+
4249
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4250
+ r_.neon_f32 = vzip2q_f32(a_.neon_f32, b_.neon_f32);
4251
+ #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4252
+ float32x2_t a1 = vget_high_f32(a_.neon_f32);
4253
+ float32x2_t b1 = vget_high_f32(b_.neon_f32);
4254
+ float32x2x2_t result = vzip_f32(a1, b1);
4255
+ r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
4256
+ #elif defined(SIMDE_SHUFFLE_VECTOR_)
4257
+ r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 2, 6, 3, 7);
4258
+ #else
4259
+ r_.f32[0] = a_.f32[2];
4260
+ r_.f32[1] = b_.f32[2];
4261
+ r_.f32[2] = a_.f32[3];
4262
+ r_.f32[3] = b_.f32[3];
4263
+ #endif
4264
+
4265
+ return simde__m128_from_private(r_);
4266
+ #endif
4267
+ }
4268
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4269
+ #define _mm_unpackhi_ps(a, b) simde_mm_unpackhi_ps((a), (b))
4270
+ #endif
4271
+
4272
+ SIMDE_FUNCTION_ATTRIBUTES
4273
+ simde__m128 simde_mm_unpacklo_ps(simde__m128 a, simde__m128 b)
4274
+ {
4275
+ #if defined(SIMDE_X86_SSE_NATIVE)
4276
+ return _mm_unpacklo_ps(a, b);
4277
+ #else
4278
+ simde__m128_private r_, a_ = simde__m128_to_private(a),
4279
+ b_ = simde__m128_to_private(b);
4280
+
4281
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4282
+ r_.neon_f32 = vzip1q_f32(a_.neon_f32, b_.neon_f32);
4283
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
4284
+ r_.altivec_f32 = vec_mergeh(a_.altivec_f32, b_.altivec_f32);
4285
+ #elif defined(SIMDE_SHUFFLE_VECTOR_)
4286
+ r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 4, 1, 5);
4287
+ #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4288
+ float32x2_t a1 = vget_low_f32(a_.neon_f32);
4289
+ float32x2_t b1 = vget_low_f32(b_.neon_f32);
4290
+ float32x2x2_t result = vzip_f32(a1, b1);
4291
+ r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
4292
+ #else
4293
+ r_.f32[0] = a_.f32[0];
4294
+ r_.f32[1] = b_.f32[0];
4295
+ r_.f32[2] = a_.f32[1];
4296
+ r_.f32[3] = b_.f32[1];
4297
+ #endif
4298
+
4299
+ return simde__m128_from_private(r_);
4300
+ #endif
4301
+ }
4302
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4303
+ #define _mm_unpacklo_ps(a, b) simde_mm_unpacklo_ps((a), (b))
4304
+ #endif
4305
+
4306
+ SIMDE_FUNCTION_ATTRIBUTES
4307
+ void simde_mm_stream_pi(simde__m64 *mem_addr, simde__m64 a)
4308
+ {
4309
+ #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
4310
+ _mm_stream_pi(HEDLEY_REINTERPRET_CAST(__m64 *, mem_addr), a);
4311
+ #else
4312
+ simde__m64_private *dest = HEDLEY_REINTERPRET_CAST(simde__m64_private *,
4313
+ mem_addr),
4314
+ a_ = simde__m64_to_private(a);
4315
+
4316
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4317
+ dest->i64[0] = vget_lane_s64(a_.neon_i64, 0);
4318
+ #else
4319
+ dest->i64[0] = a_.i64[0];
4320
+ #endif
4321
+ #endif
4322
+ }
4323
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4324
+ #define _mm_stream_pi(mem_addr, a) simde_mm_stream_pi(mem_addr, (a))
4325
+ #endif
4326
+
4327
+ SIMDE_FUNCTION_ATTRIBUTES
4328
+ void simde_mm_stream_ps(simde_float32 mem_addr[4], simde__m128 a)
4329
+ {
4330
+ #if defined(SIMDE_X86_SSE_NATIVE)
4331
+ _mm_stream_ps(mem_addr, a);
4332
+ #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && \
4333
+ defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4334
+ simde__m128_private a_ = simde__m128_to_private(a);
4335
+ __builtin_nontemporal_store(
4336
+ a_.f32, SIMDE_ALIGN_CAST(__typeof__(a_.f32) *, mem_addr));
4337
+ #else
4338
+ simde_mm_store_ps(mem_addr, a);
4339
+ #endif
4340
+ }
4341
+ #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4342
+ #define _mm_stream_ps(mem_addr, a) \
4343
+ simde_mm_stream_ps(SIMDE_CHECKED_REINTERPRET_CAST( \
4344
+ float *, simde_float32 *, mem_addr), \
4345
+ (a))
4346
+ #endif
4347
+
4348
/*
 * SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3)
 *
 * In-place 4x4 transpose: the four arguments are both read and assigned,
 * so each must be a modifiable lvalue.
 *
 * NEON build: vtrnq_f32 on (row0,row1) and (row2,row3), then the low and
 * high halves of the results are recombined.  The arguments are handed to
 * NEON intrinsics directly.
 * NOTE(review): that presumably relies on simde__m128 being the NEON
 * float32x4_t type in this configuration -- confirm.
 *
 * Other builds: unpacklo/unpackhi on the row pairs followed by
 * movelh/movehl.
 *
 * The macro-local names (ROW01/ROW23, tmp0..tmp3) are visible to the
 * argument expressions, so callers must not pass variables with those
 * names.
 */
#ifdef SIMDE_ARM_NEON_A32V7_NATIVE
#define SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3)          \
	do {                                                    \
		float32x4x2_t ROW01 = vtrnq_f32(row0, row1);    \
		float32x4x2_t ROW23 = vtrnq_f32(row2, row3);    \
		row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
				    vget_low_f32(ROW23.val[0])); \
		row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
				    vget_low_f32(ROW23.val[1])); \
		row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
				    vget_high_f32(ROW23.val[0])); \
		row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
				    vget_high_f32(ROW23.val[1])); \
	} while (0)
#else
#define SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3)       \
	do {                                                 \
		simde__m128 tmp0, tmp1, tmp2, tmp3;          \
		tmp0 = simde_mm_unpacklo_ps((row0), (row1)); \
		tmp2 = simde_mm_unpacklo_ps((row2), (row3)); \
		tmp1 = simde_mm_unpackhi_ps((row0), (row1)); \
		tmp3 = simde_mm_unpackhi_ps((row2), (row3)); \
		row0 = simde_mm_movelh_ps(tmp0, tmp2);       \
		row1 = simde_mm_movehl_ps(tmp2, tmp0);       \
		row2 = simde_mm_movelh_ps(tmp1, tmp3);       \
		row3 = simde_mm_movehl_ps(tmp3, tmp1);       \
	} while (0)
#endif
#ifdef SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
	SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3)
#endif
4380
+
4381
/*
 * SIMDE_MM_EXCEPT_* constants.
 *
 * Each one reuses the matching _MM_EXCEPT_* macro when one is already
 * defined; otherwise it falls back to the literal given here.
 * SIMDE_MM_EXCEPT_MASK is the OR of the six individual bits unless
 * _MM_EXCEPT_MASK is provided.
 */
#ifndef _MM_EXCEPT_INVALID
#define SIMDE_MM_EXCEPT_INVALID (0x0001)
#else
#define SIMDE_MM_EXCEPT_INVALID _MM_EXCEPT_INVALID
#endif

#ifndef _MM_EXCEPT_DENORM
#define SIMDE_MM_EXCEPT_DENORM (0x0002)
#else
#define SIMDE_MM_EXCEPT_DENORM _MM_EXCEPT_DENORM
#endif

#ifndef _MM_EXCEPT_DIV_ZERO
#define SIMDE_MM_EXCEPT_DIV_ZERO (0x0004)
#else
#define SIMDE_MM_EXCEPT_DIV_ZERO _MM_EXCEPT_DIV_ZERO
#endif

#ifndef _MM_EXCEPT_OVERFLOW
#define SIMDE_MM_EXCEPT_OVERFLOW (0x0008)
#else
#define SIMDE_MM_EXCEPT_OVERFLOW _MM_EXCEPT_OVERFLOW
#endif

#ifndef _MM_EXCEPT_UNDERFLOW
#define SIMDE_MM_EXCEPT_UNDERFLOW (0x0010)
#else
#define SIMDE_MM_EXCEPT_UNDERFLOW _MM_EXCEPT_UNDERFLOW
#endif

#ifndef _MM_EXCEPT_INEXACT
#define SIMDE_MM_EXCEPT_INEXACT (0x0020)
#else
#define SIMDE_MM_EXCEPT_INEXACT _MM_EXCEPT_INEXACT
#endif

#ifndef _MM_EXCEPT_MASK
#define SIMDE_MM_EXCEPT_MASK                                  \
	(SIMDE_MM_EXCEPT_INVALID | SIMDE_MM_EXCEPT_DENORM |   \
	 SIMDE_MM_EXCEPT_DIV_ZERO | SIMDE_MM_EXCEPT_OVERFLOW | \
	 SIMDE_MM_EXCEPT_UNDERFLOW | SIMDE_MM_EXCEPT_INEXACT)
#else
#define SIMDE_MM_EXCEPT_MASK _MM_EXCEPT_MASK
#endif
4419
+
4420
/*
 * SIMDE_MM_MASK_* constants.
 *
 * Each one reuses the matching _MM_MASK_* macro when one is already
 * defined; otherwise it falls back to the literal given here.
 * SIMDE_MM_MASK_MASK is the OR of the six individual bits unless
 * _MM_MASK_MASK is provided.
 */
#ifndef _MM_MASK_INVALID
#define SIMDE_MM_MASK_INVALID (0x0080)
#else
#define SIMDE_MM_MASK_INVALID _MM_MASK_INVALID
#endif

#ifndef _MM_MASK_DENORM
#define SIMDE_MM_MASK_DENORM (0x0100)
#else
#define SIMDE_MM_MASK_DENORM _MM_MASK_DENORM
#endif

#ifndef _MM_MASK_DIV_ZERO
#define SIMDE_MM_MASK_DIV_ZERO (0x0200)
#else
#define SIMDE_MM_MASK_DIV_ZERO _MM_MASK_DIV_ZERO
#endif

#ifndef _MM_MASK_OVERFLOW
#define SIMDE_MM_MASK_OVERFLOW (0x0400)
#else
#define SIMDE_MM_MASK_OVERFLOW _MM_MASK_OVERFLOW
#endif

#ifndef _MM_MASK_UNDERFLOW
#define SIMDE_MM_MASK_UNDERFLOW (0x0800)
#else
#define SIMDE_MM_MASK_UNDERFLOW _MM_MASK_UNDERFLOW
#endif

#ifndef _MM_MASK_INEXACT
#define SIMDE_MM_MASK_INEXACT (0x1000)
#else
#define SIMDE_MM_MASK_INEXACT _MM_MASK_INEXACT
#endif

#ifndef _MM_MASK_MASK
#define SIMDE_MM_MASK_MASK                                \
	(SIMDE_MM_MASK_INVALID | SIMDE_MM_MASK_DENORM |   \
	 SIMDE_MM_MASK_DIV_ZERO | SIMDE_MM_MASK_OVERFLOW | \
	 SIMDE_MM_MASK_UNDERFLOW | SIMDE_MM_MASK_INEXACT)
#else
#define SIMDE_MM_MASK_MASK _MM_MASK_MASK
#endif
4458
+
4459
/*
 * SIMDE_MM_FLUSH_ZERO_* constants.
 *
 * Each one reuses the matching _MM_FLUSH_ZERO_* macro when one is already
 * defined; otherwise it falls back to the literal given here.  In the
 * fallback, ON equals the mask bit and OFF is zero.
 */
#ifndef _MM_FLUSH_ZERO_MASK
#define SIMDE_MM_FLUSH_ZERO_MASK (0x8000)
#else
#define SIMDE_MM_FLUSH_ZERO_MASK _MM_FLUSH_ZERO_MASK
#endif

#ifndef _MM_FLUSH_ZERO_ON
#define SIMDE_MM_FLUSH_ZERO_ON (0x8000)
#else
#define SIMDE_MM_FLUSH_ZERO_ON _MM_FLUSH_ZERO_ON
#endif

#ifndef _MM_FLUSH_ZERO_OFF
#define SIMDE_MM_FLUSH_ZERO_OFF (0x0000)
#else
#define SIMDE_MM_FLUSH_ZERO_OFF _MM_FLUSH_ZERO_OFF
#endif
4474
+
4475
+ SIMDE_END_DECLS_
4476
+
4477
+ HEDLEY_DIAGNOSTIC_POP
4478
+
4479
+ #endif /* !defined(SIMDE_X86_SSE_H) */