minimap2 0.2.24.3 → 0.2.24.6

Files changed (101)
  1. checksums.yaml +4 -4
  2. data/ext/minimap2/lib/simde/CONTRIBUTING.md +114 -0
  3. data/ext/minimap2/lib/simde/COPYING +20 -0
  4. data/ext/minimap2/lib/simde/README.md +333 -0
  5. data/ext/minimap2/lib/simde/amalgamate.py +58 -0
  6. data/ext/minimap2/lib/simde/meson.build +33 -0
  7. data/ext/minimap2/lib/simde/netlify.toml +20 -0
  8. data/ext/minimap2/lib/simde/simde/arm/neon/float32x2.h +140 -0
  9. data/ext/minimap2/lib/simde/simde/arm/neon/float32x4.h +137 -0
  10. data/ext/minimap2/lib/simde/simde/arm/neon/float64x1.h +142 -0
  11. data/ext/minimap2/lib/simde/simde/arm/neon/float64x2.h +145 -0
  12. data/ext/minimap2/lib/simde/simde/arm/neon/int16x4.h +140 -0
  13. data/ext/minimap2/lib/simde/simde/arm/neon/int16x8.h +145 -0
  14. data/ext/minimap2/lib/simde/simde/arm/neon/int32x2.h +140 -0
  15. data/ext/minimap2/lib/simde/simde/arm/neon/int32x4.h +143 -0
  16. data/ext/minimap2/lib/simde/simde/arm/neon/int64x1.h +137 -0
  17. data/ext/minimap2/lib/simde/simde/arm/neon/int64x2.h +141 -0
  18. data/ext/minimap2/lib/simde/simde/arm/neon/int8x16.h +147 -0
  19. data/ext/minimap2/lib/simde/simde/arm/neon/int8x8.h +141 -0
  20. data/ext/minimap2/lib/simde/simde/arm/neon/uint16x4.h +134 -0
  21. data/ext/minimap2/lib/simde/simde/arm/neon/uint16x8.h +138 -0
  22. data/ext/minimap2/lib/simde/simde/arm/neon/uint32x2.h +134 -0
  23. data/ext/minimap2/lib/simde/simde/arm/neon/uint32x4.h +137 -0
  24. data/ext/minimap2/lib/simde/simde/arm/neon/uint64x1.h +131 -0
  25. data/ext/minimap2/lib/simde/simde/arm/neon/uint64x2.h +135 -0
  26. data/ext/minimap2/lib/simde/simde/arm/neon/uint8x16.h +141 -0
  27. data/ext/minimap2/lib/simde/simde/arm/neon/uint8x8.h +135 -0
  28. data/ext/minimap2/lib/simde/simde/arm/neon.h +97 -0
  29. data/ext/minimap2/lib/simde/simde/check.h +267 -0
  30. data/ext/minimap2/lib/simde/simde/debug-trap.h +83 -0
  31. data/ext/minimap2/lib/simde/simde/hedley.h +1899 -0
  32. data/ext/minimap2/lib/simde/simde/simde-arch.h +445 -0
  33. data/ext/minimap2/lib/simde/simde/simde-common.h +697 -0
  34. data/ext/minimap2/lib/simde/simde/x86/avx.h +5385 -0
  35. data/ext/minimap2/lib/simde/simde/x86/avx2.h +2402 -0
  36. data/ext/minimap2/lib/simde/simde/x86/avx512bw.h +391 -0
  37. data/ext/minimap2/lib/simde/simde/x86/avx512f.h +3389 -0
  38. data/ext/minimap2/lib/simde/simde/x86/avx512vl.h +112 -0
  39. data/ext/minimap2/lib/simde/simde/x86/fma.h +659 -0
  40. data/ext/minimap2/lib/simde/simde/x86/mmx.h +2210 -0
  41. data/ext/minimap2/lib/simde/simde/x86/sse.h +3696 -0
  42. data/ext/minimap2/lib/simde/simde/x86/sse2.h +5991 -0
  43. data/ext/minimap2/lib/simde/simde/x86/sse3.h +343 -0
  44. data/ext/minimap2/lib/simde/simde/x86/sse4.1.h +1783 -0
  45. data/ext/minimap2/lib/simde/simde/x86/sse4.2.h +105 -0
  46. data/ext/minimap2/lib/simde/simde/x86/ssse3.h +1053 -0
  47. data/ext/minimap2/lib/simde/simde/x86/svml.h +543 -0
  48. data/ext/minimap2/lib/simde/test/CMakeLists.txt +166 -0
  49. data/ext/minimap2/lib/simde/test/arm/meson.build +4 -0
  50. data/ext/minimap2/lib/simde/test/arm/neon/meson.build +23 -0
  51. data/ext/minimap2/lib/simde/test/arm/neon/skel.c +871 -0
  52. data/ext/minimap2/lib/simde/test/arm/neon/test-neon-internal.h +134 -0
  53. data/ext/minimap2/lib/simde/test/arm/neon/test-neon.c +39 -0
  54. data/ext/minimap2/lib/simde/test/arm/neon/test-neon.h +10 -0
  55. data/ext/minimap2/lib/simde/test/arm/neon/vadd.c +1260 -0
  56. data/ext/minimap2/lib/simde/test/arm/neon/vdup_n.c +873 -0
  57. data/ext/minimap2/lib/simde/test/arm/neon/vmul.c +1084 -0
  58. data/ext/minimap2/lib/simde/test/arm/neon/vsub.c +1260 -0
  59. data/ext/minimap2/lib/simde/test/arm/test-arm-internal.h +18 -0
  60. data/ext/minimap2/lib/simde/test/arm/test-arm.c +20 -0
  61. data/ext/minimap2/lib/simde/test/arm/test-arm.h +8 -0
  62. data/ext/minimap2/lib/simde/test/cmake/AddCompilerFlags.cmake +171 -0
  63. data/ext/minimap2/lib/simde/test/cmake/ExtraWarningFlags.cmake +68 -0
  64. data/ext/minimap2/lib/simde/test/meson.build +64 -0
  65. data/ext/minimap2/lib/simde/test/munit/COPYING +21 -0
  66. data/ext/minimap2/lib/simde/test/munit/Makefile +55 -0
  67. data/ext/minimap2/lib/simde/test/munit/README.md +54 -0
  68. data/ext/minimap2/lib/simde/test/munit/example.c +351 -0
  69. data/ext/minimap2/lib/simde/test/munit/meson.build +37 -0
  70. data/ext/minimap2/lib/simde/test/munit/munit.c +2055 -0
  71. data/ext/minimap2/lib/simde/test/munit/munit.h +535 -0
  72. data/ext/minimap2/lib/simde/test/run-tests.c +20 -0
  73. data/ext/minimap2/lib/simde/test/run-tests.h +260 -0
  74. data/ext/minimap2/lib/simde/test/x86/avx.c +13752 -0
  75. data/ext/minimap2/lib/simde/test/x86/avx2.c +9977 -0
  76. data/ext/minimap2/lib/simde/test/x86/avx512bw.c +2664 -0
  77. data/ext/minimap2/lib/simde/test/x86/avx512f.c +10416 -0
  78. data/ext/minimap2/lib/simde/test/x86/avx512vl.c +210 -0
  79. data/ext/minimap2/lib/simde/test/x86/fma.c +2557 -0
  80. data/ext/minimap2/lib/simde/test/x86/meson.build +33 -0
  81. data/ext/minimap2/lib/simde/test/x86/mmx.c +2878 -0
  82. data/ext/minimap2/lib/simde/test/x86/skel.c +2984 -0
  83. data/ext/minimap2/lib/simde/test/x86/sse.c +5121 -0
  84. data/ext/minimap2/lib/simde/test/x86/sse2.c +9860 -0
  85. data/ext/minimap2/lib/simde/test/x86/sse3.c +486 -0
  86. data/ext/minimap2/lib/simde/test/x86/sse4.1.c +3446 -0
  87. data/ext/minimap2/lib/simde/test/x86/sse4.2.c +101 -0
  88. data/ext/minimap2/lib/simde/test/x86/ssse3.c +2084 -0
  89. data/ext/minimap2/lib/simde/test/x86/svml.c +1545 -0
  90. data/ext/minimap2/lib/simde/test/x86/test-avx.h +16 -0
  91. data/ext/minimap2/lib/simde/test/x86/test-avx512.h +25 -0
  92. data/ext/minimap2/lib/simde/test/x86/test-mmx.h +13 -0
  93. data/ext/minimap2/lib/simde/test/x86/test-sse.h +13 -0
  94. data/ext/minimap2/lib/simde/test/x86/test-sse2.h +13 -0
  95. data/ext/minimap2/lib/simde/test/x86/test-x86-internal.h +196 -0
  96. data/ext/minimap2/lib/simde/test/x86/test-x86.c +48 -0
  97. data/ext/minimap2/lib/simde/test/x86/test-x86.h +8 -0
  98. data/lib/minimap2/aligner.rb +2 -2
  99. data/lib/minimap2/ffi/constants.rb +3 -0
  100. data/lib/minimap2/version.rb +1 -1
  101. metadata +99 -3
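
Most of this release vendors SIMDe (SIMD Everywhere) under data/ext/minimap2/lib/simde, so the bundled minimap2 C sources can build their SSE-based kernels on non-x86 hosts. As a rough sketch of how these headers are consumed (the include path, compiler invocation, and demo values are assumptions for illustration, not taken from the gem):

    /* build sketch, assumed paths: cc -I data/ext/minimap2/lib/simde demo.c */
    #include <stdio.h>
    #include "simde/x86/mmx.h"

    int main(void) {
      simde__m64 a   = simde_mm_set_pi8(8, 7, 6, 5, 4, 3, 2, 1);
      simde__m64 b   = simde_mm_set1_pi8(10);
      simde__m64 sum = simde_mm_add_pi8(a, b);                    /* per-byte addition */
      printf("0x%08x\n", (unsigned) simde_mm_cvtsi64_si32(sum));  /* 0x0e0d0c0b on little-endian hosts */
      return 0;
    }

Defining SIMDE_ENABLE_NATIVE_ALIASES before the include additionally maps the usual Intel names (_mm_add_pi8 and friends) onto these portable implementations, as the mmx.h diff below shows.
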
data/ext/minimap2/lib/simde/simde/x86/mmx.h (new file)
@@ -0,0 +1,2210 @@
1
+ /* Copyright (c) 2017-2020 Evan Nemerson <evan@nemerson.com>
2
+ *
3
+ * Permission is hereby granted, free of charge, to any person
4
+ * obtaining a copy of this software and associated documentation
5
+ * files (the "Software"), to deal in the Software without
6
+ * restriction, including without limitation the rights to use, copy,
7
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
8
+ * of the Software, and to permit persons to whom the Software is
9
+ * furnished to do so, subject to the following conditions:
10
+ *
11
+ * The above copyright notice and this permission notice shall be
12
+ * included in all copies or substantial portions of the Software.
13
+ *
14
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ * SOFTWARE.
22
+ */
23
+
24
+ #if !defined(SIMDE__MMX_H)
25
+ # if !defined(SIMDE__MMX_H)
26
+ # define SIMDE__MMX_H
27
+ # endif
28
+ # include "../simde-common.h"
29
+
30
+ HEDLEY_DIAGNOSTIC_PUSH
31
+ SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
32
+
33
+ # if defined(SIMDE_MMX_FORCE_NATIVE)
34
+ # define SIMDE_MMX_NATIVE
35
+ # elif defined(SIMDE_ARCH_X86_MMX) && !defined(SIMDE_MMX_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
36
+ # define SIMDE_MMX_NATIVE
37
+ # elif defined(SIMDE_ARCH_ARM_NEON) && !defined(SIMDE_MMX_NO_NEON) && !defined(SIMDE_NO_NEON)
38
+ # define SIMDE_MMX_NEON
39
+ # endif
40
+
41
+ # if defined(SIMDE_MMX_NATIVE)
42
+ # define SIMDE_MMX_USE_NATIVE_TYPE
43
+ # elif defined(SIMDE_ARCH_X86_SSE)
44
+ # define SIMDE_MMX_USE_NATIVE_TYPE
45
+ # endif
46
+
47
+ # if defined(SIMDE_MMX_USE_NATIVE_TYPE)
48
+ # include <mmintrin.h>
49
+ # else
50
+ # if defined(SIMDE_MMX_NEON)
51
+ # include <arm_neon.h>
52
+ # endif
53
+ # endif
54
+ # include <stdint.h>
55
+ # include <limits.h>
56
+
57
+ SIMDE__BEGIN_DECLS
58
+
59
+ typedef union {
60
+ #if defined(SIMDE_VECTOR_SUBSCRIPT)
61
+ SIMDE_ALIGN(8) int8_t i8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
62
+ SIMDE_ALIGN(8) int16_t i16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
63
+ SIMDE_ALIGN(8) int32_t i32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
64
+ SIMDE_ALIGN(8) int64_t i64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
65
+ SIMDE_ALIGN(8) uint8_t u8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
66
+ SIMDE_ALIGN(8) uint16_t u16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
67
+ SIMDE_ALIGN(8) uint32_t u32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
68
+ SIMDE_ALIGN(8) uint64_t u64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
69
+ SIMDE_ALIGN(8) simde_float32 f32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
70
+ SIMDE_ALIGN(8) int_fast32_t i32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
71
+ SIMDE_ALIGN(8) uint_fast32_t u32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
72
+ #else
73
+ SIMDE_ALIGN(8) int8_t i8[8];
74
+ SIMDE_ALIGN(8) int16_t i16[4];
75
+ SIMDE_ALIGN(8) int32_t i32[2];
76
+ SIMDE_ALIGN(8) int64_t i64[1];
77
+ SIMDE_ALIGN(8) uint8_t u8[8];
78
+ SIMDE_ALIGN(8) uint16_t u16[4];
79
+ SIMDE_ALIGN(8) uint32_t u32[2];
80
+ SIMDE_ALIGN(8) uint64_t u64[1];
81
+ SIMDE_ALIGN(8) simde_float32 f32[2];
82
+ SIMDE_ALIGN(8) int_fast32_t i32f[8 / sizeof(int_fast32_t)];
83
+ SIMDE_ALIGN(8) uint_fast32_t u32f[8 / sizeof(uint_fast32_t)];
84
+ #endif
85
+
86
+ #if defined(SIMDE_MMX_USE_NATIVE_TYPE)
87
+ __m64 n;
88
+ #endif
89
+ #if defined(SIMDE_MMX_NEON)
90
+ int8x8_t neon_i8;
91
+ int16x4_t neon_i16;
92
+ int32x2_t neon_i32;
93
+ int64x1_t neon_i64;
94
+ uint8x8_t neon_u8;
95
+ uint16x4_t neon_u16;
96
+ uint32x2_t neon_u32;
97
+ uint64x1_t neon_u64;
98
+ float32x2_t neon_f32;
99
+ #endif
100
+ } simde__m64_private;
101
+
102
+ #if defined(SIMDE_MMX_USE_NATIVE_TYPE)
103
+ typedef __m64 simde__m64;
104
+ #elif defined(SIMDE_MMX_NEON)
105
+ typedef int32x2_t simde__m64;
106
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT)
107
+ typedef int32_t simde__m64 SIMDE_ALIGN(8) SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
108
+ #else
109
+ typedef simde__m64_private simde__m64;
110
+ #endif
111
+
112
+ #if !defined(SIMDE_MMX_USE_NATIVE_TYPE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
113
+ #define SIMDE_MMX_ENABLE_NATIVE_ALIASES
114
+ typedef simde__m64 __m64;
115
+ #endif
116
+
117
+ HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64), "__m64 size incorrect");
118
+ HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64_private), "__m64 size incorrect");
119
+ #if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
120
+ HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m64) == 8, "simde__m64 is not 8-byte aligned");
121
+ HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m64_private) == 8, "simde__m64_private is not 8-byte aligned");
122
+ #endif
123
+
124
+ SIMDE__FUNCTION_ATTRIBUTES
125
+ simde__m64
126
+ simde__m64_from_private(simde__m64_private v) {
127
+ simde__m64 r;
128
+ simde_memcpy(&r, &v, sizeof(r));
129
+ return r;
130
+ }
131
+
132
+ SIMDE__FUNCTION_ATTRIBUTES
133
+ simde__m64_private
134
+ simde__m64_to_private(simde__m64 v) {
135
+ simde__m64_private r;
136
+ simde_memcpy(&r, &v, sizeof(r));
137
+ return r;
138
+ }
139
+
140
+ SIMDE__FUNCTION_ATTRIBUTES
141
+ simde__m64
142
+ simde_mm_add_pi8 (simde__m64 a, simde__m64 b) {
143
+ #if defined(SIMDE_MMX_NATIVE)
144
+ return _mm_add_pi8(a, b);
145
+ #else
146
+ simde__m64_private r_;
147
+ simde__m64_private a_ = simde__m64_to_private(a);
148
+ simde__m64_private b_ = simde__m64_to_private(b);
149
+
150
+ #if defined(SIMDE_MMX_NEON)
151
+ r_.neon_i8 = vadd_s8(a_.neon_i8, b_.neon_i8);
152
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
153
+ r_.i8 = a_.i8 + b_.i8;
154
+ #else
155
+ SIMDE__VECTORIZE
156
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
157
+ r_.i8[i] = a_.i8[i] + b_.i8[i];
158
+ }
159
+ #endif
160
+
161
+ return simde__m64_from_private(r_);
162
+ #endif
163
+ }
164
+ #define simde_m_paddb(a, b) simde_mm_add_pi8(a, b)
165
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
166
+ # define _mm_add_pi8(a, b) simde_mm_add_pi8(a, b)
167
+ # define _m_paddb(a, b) simde_m_paddb(a, b)
168
+ #endif
169
+
170
+ SIMDE__FUNCTION_ATTRIBUTES
171
+ simde__m64
172
+ simde_mm_add_pi16 (simde__m64 a, simde__m64 b) {
173
+ #if defined(SIMDE_MMX_NATIVE)
174
+ return _mm_add_pi16(a, b);
175
+ #else
176
+ simde__m64_private r_;
177
+ simde__m64_private a_ = simde__m64_to_private(a);
178
+ simde__m64_private b_ = simde__m64_to_private(b);
179
+
180
+ #if defined(SIMDE_MMX_NEON)
181
+ r_.neon_i16 = vadd_s16(a_.neon_i16, b_.neon_i16);
182
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
183
+ r_.i16 = a_.i16 + b_.i16;
184
+ #else
185
+ SIMDE__VECTORIZE
186
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
187
+ r_.i16[i] = a_.i16[i] + b_.i16[i];
188
+ }
189
+ #endif
190
+
191
+ return simde__m64_from_private(r_);
192
+ #endif
193
+ }
194
+ #define simde_m_paddw(a, b) simde_mm_add_pi16(a, b)
195
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
196
+ # define _mm_add_pi16(a, b) simde_mm_add_pi16(a, b)
197
+ # define _m_add_paddw(a, b) simde_mm_add_pi16(a, b)
198
+ #endif
199
+
200
+ SIMDE__FUNCTION_ATTRIBUTES
201
+ simde__m64
202
+ simde_mm_add_pi32 (simde__m64 a, simde__m64 b) {
203
+ #if defined(SIMDE_MMX_NATIVE)
204
+ return _mm_add_pi32(a, b);
205
+ #else
206
+ simde__m64_private r_;
207
+ simde__m64_private a_ = simde__m64_to_private(a);
208
+ simde__m64_private b_ = simde__m64_to_private(b);
209
+
210
+ #if defined(SIMDE_MMX_NEON)
211
+ r_.neon_i32 = vadd_s32(a_.neon_i32, b_.neon_i32);
212
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
213
+ r_.i32 = a_.i32 + b_.i32;
214
+ #else
215
+ SIMDE__VECTORIZE
216
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
217
+ r_.i32[i] = a_.i32[i] + b_.i32[i];
218
+ }
219
+ #endif
220
+
221
+ return simde__m64_from_private(r_);
222
+ #endif
223
+ }
224
+ #define simde_m_paddd(a, b) simde_mm_add_pi32(a, b)
225
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
226
+ # define _mm_add_pi32(a, b) simde_mm_add_pi32(a, b)
227
+ # define _m_add_paddd(a, b) simde_mm_add_pi32(a, b)
228
+ #endif
229
+
230
+ SIMDE__FUNCTION_ATTRIBUTES
231
+ simde__m64
232
+ simde_mm_adds_pi8 (simde__m64 a, simde__m64 b) {
233
+ #if defined(SIMDE_MMX_NATIVE)
234
+ return _mm_adds_pi8(a, b);
235
+ #else
236
+ simde__m64_private r_;
237
+ simde__m64_private a_ = simde__m64_to_private(a);
238
+ simde__m64_private b_ = simde__m64_to_private(b);
239
+
240
+ #if defined(SIMDE_MMX_NEON)
241
+ r_.neon_i8 = vqadd_s8(a_.neon_i8, b_.neon_i8);
242
+ #else
243
+ SIMDE__VECTORIZE
244
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
245
+ if ((((b_.i8[i]) > 0) && ((a_.i8[i]) > (INT8_MAX - (b_.i8[i]))))) {
246
+ r_.i8[i] = INT8_MAX;
247
+ } else if ((((b_.i8[i]) < 0) && ((a_.i8[i]) < (INT8_MIN - (b_.i8[i]))))) {
248
+ r_.i8[i] = INT8_MIN;
249
+ } else {
250
+ r_.i8[i] = (a_.i8[i]) + (b_.i8[i]);
251
+ }
252
+ }
253
+ #endif
254
+
255
+ return simde__m64_from_private(r_);
256
+ #endif
257
+ }
258
+ #define simde_m_paddsb(a, b) simde_mm_adds_pi8(a, b)
259
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
260
+ # define _mm_adds_pi8(a, b) simde_mm_adds_pi8(a, b)
261
+ # define _m_add_paddsb(a, b) simde_mm_adds_pi8(a, b)
262
+ #endif
263
+
264
+ SIMDE__FUNCTION_ATTRIBUTES
265
+ simde__m64
266
+ simde_mm_adds_pu8 (simde__m64 a, simde__m64 b) {
267
+ #if defined(SIMDE_MMX_NATIVE)
268
+ return _mm_adds_pu8(a, b);
269
+ #else
270
+ simde__m64_private r_;
271
+ simde__m64_private a_ = simde__m64_to_private(a);
272
+ simde__m64_private b_ = simde__m64_to_private(b);
273
+
274
+ #if defined(SIMDE_MMX_NEON)
275
+ r_.neon_u8 = vqadd_u8(a_.neon_u8, b_.neon_u8);
276
+ #else
277
+ SIMDE__VECTORIZE
278
+ for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
279
+ const uint_fast16_t x = HEDLEY_STATIC_CAST(uint_fast16_t, a_.u8[i]) + HEDLEY_STATIC_CAST(uint_fast16_t, b_.u8[i]);
280
+ if (x > UINT8_MAX)
281
+ r_.u8[i] = UINT8_MAX;
282
+ else
283
+ r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
284
+ }
285
+ #endif
286
+
287
+ return simde__m64_from_private(r_);
288
+ #endif
289
+ }
290
+ #define simde_m_paddusb(a, b) simde_mm_adds_pu8(a, b)
291
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
292
+ # define _mm_adds_pu8(a, b) simde_mm_adds_pu8(a, b)
293
+ # define _m_paddusb(a, b) simde_mm_adds_pu8(a, b)
294
+ #endif
295
+
296
+ SIMDE__FUNCTION_ATTRIBUTES
297
+ simde__m64
298
+ simde_mm_adds_pi16 (simde__m64 a, simde__m64 b) {
299
+ #if defined(SIMDE_MMX_NATIVE)
300
+ return _mm_adds_pi16(a, b);
301
+ #else
302
+ simde__m64_private r_;
303
+ simde__m64_private a_ = simde__m64_to_private(a);
304
+ simde__m64_private b_ = simde__m64_to_private(b);
305
+
306
+ #if defined(SIMDE_MMX_NEON)
307
+ r_.neon_i16 = vqadd_s16(a_.neon_i16, b_.neon_i16);
308
+ #else
309
+ SIMDE__VECTORIZE
310
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
311
+ if ((((b_.i16[i]) > 0) && ((a_.i16[i]) > (INT16_MAX - (b_.i16[i]))))) {
312
+ r_.i16[i] = INT16_MAX;
313
+ } else if ((((b_.i16[i]) < 0) && ((a_.i16[i]) < (SHRT_MIN - (b_.i16[i]))))) {
314
+ r_.i16[i] = SHRT_MIN;
315
+ } else {
316
+ r_.i16[i] = (a_.i16[i]) + (b_.i16[i]);
317
+ }
318
+ }
319
+ #endif
320
+
321
+ return simde__m64_from_private(r_);
322
+ #endif
323
+ }
324
+ #define simde_m_paddsw(a, b) simde_mm_adds_pi16(a, b)
325
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
326
+ # define _mm_adds_pi16(a, b) simde_mm_adds_pi16(a, b)
327
+ # define _m_paddsw(a, b) simde_mm_adds_pi16(a, b)
328
+ #endif
329
+
330
+ SIMDE__FUNCTION_ATTRIBUTES
331
+ simde__m64
332
+ simde_mm_adds_pu16 (simde__m64 a, simde__m64 b) {
333
+ #if defined(SIMDE_MMX_NATIVE)
334
+ return _mm_adds_pu16(a, b);
335
+ #else
336
+ simde__m64_private r_;
337
+ simde__m64_private a_ = simde__m64_to_private(a);
338
+ simde__m64_private b_ = simde__m64_to_private(b);
339
+
340
+ #if defined(SIMDE_MMX_NEON)
341
+ r_.neon_u16 = vqadd_u16(a_.neon_u16, b_.neon_u16);
342
+ #else
343
+ SIMDE__VECTORIZE
344
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
345
+ const uint32_t x = a_.u16[i] + b_.u16[i];
346
+ if (x > UINT16_MAX)
347
+ r_.u16[i] = UINT16_MAX;
348
+ else
349
+ r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
350
+ }
351
+ #endif
352
+
353
+ return simde__m64_from_private(r_);
354
+ #endif
355
+ }
356
+ #define simde_m_paddusw(a, b) simde_mm_adds_pu16(a, b)
357
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
358
+ # define _mm_adds_pu16(a, b) simde_mm_adds_pu16(a, b)
359
+ # define _m_paddusw(a, b) simde_mm_adds_pu16(a, b)
360
+ #endif
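
The four simde_mm_adds_* functions above saturate instead of wrapping: the portable fallback either tests against INT8_MAX/INT8_MIN or widens before clamping. A small illustration with editor-chosen values (the function below is not part of the library):

    /* illustrative only; assumes the same include setup as the sketch above the diff */
    void adds_demo(void) {
      simde__m64 a    = simde_mm_set1_pi8((int8_t) 200);  /* bit pattern 0xC8, i.e. 200 as unsigned */
      simde__m64 b    = simde_mm_set1_pi8((int8_t) 100);
      simde__m64 wrap = simde_mm_add_pi8(a, b);   /* each byte wraps: (200 + 100) & 0xFF = 44 */
      simde__m64 sat  = simde_mm_adds_pu8(a, b);  /* each byte clamps at UINT8_MAX = 255 */
      (void) wrap; (void) sat;
    }
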
361
+
362
+ SIMDE__FUNCTION_ATTRIBUTES
363
+ simde__m64
364
+ simde_mm_and_si64 (simde__m64 a, simde__m64 b) {
365
+ #if defined(SIMDE_MMX_NATIVE)
366
+ return _mm_and_si64(a, b);
367
+ #else
368
+ simde__m64_private r_;
369
+ simde__m64_private a_ = simde__m64_to_private(a);
370
+ simde__m64_private b_ = simde__m64_to_private(b);
371
+
372
+ #if defined(SIMDE_MMX_NEON)
373
+ r_.neon_i32 = vand_s32(a_.neon_i32, b_.neon_i32);
374
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
375
+ r_.i64 = a_.i64 & b_.i64;
376
+ #else
377
+ r_.i64[0] = a_.i64[0] & b_.i64[0];
378
+ #endif
379
+
380
+ return simde__m64_from_private(r_);
381
+ #endif
382
+ }
383
+ #define simde_m_pand(a, b) simde_mm_and_si64(a, b)
384
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
385
+ # define _mm_and_si64(a, b) simde_mm_and_si64(a, b)
386
+ # define _m_pand(a, b) simde_mm_and_si64(a, b)
387
+ #endif
388
+
389
+ SIMDE__FUNCTION_ATTRIBUTES
390
+ simde__m64
391
+ simde_mm_andnot_si64 (simde__m64 a, simde__m64 b) {
392
+ #if defined(SIMDE_MMX_NATIVE)
393
+ return _mm_andnot_si64(a, b);
394
+ #else
395
+ simde__m64_private r_;
396
+ simde__m64_private a_ = simde__m64_to_private(a);
397
+ simde__m64_private b_ = simde__m64_to_private(b);
398
+
399
+ #if defined(SIMDE_MMX_NEON)
400
+ r_.neon_i32 = vbic_s32(b_.neon_i32, a_.neon_i32);
401
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
402
+ r_.i32f = ~a_.i32f & b_.i32f;
403
+ #else
404
+ r_.u64[0] = (~(a_.u64[0])) & (b_.u64[0]);
405
+ #endif
406
+
407
+ return simde__m64_from_private(r_);
408
+ #endif
409
+ }
410
+ #define simde_m_pandn(a, b) simde_mm_andnot_si64(a, b)
411
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
412
+ # define _mm_andnot_si64(a, b) simde_mm_andnot_si64(a, b)
413
+ # define _m_pandn(a, b) simde_mm_andnot_si64(a, b)
414
+ #endif
415
+
416
+ SIMDE__FUNCTION_ATTRIBUTES
417
+ simde__m64
418
+ simde_mm_cmpeq_pi8 (simde__m64 a, simde__m64 b) {
419
+ #if defined(SIMDE_MMX_NATIVE)
420
+ return _mm_cmpeq_pi8(a, b);
421
+ #else
422
+ simde__m64_private r_;
423
+ simde__m64_private a_ = simde__m64_to_private(a);
424
+ simde__m64_private b_ = simde__m64_to_private(b);
425
+
426
+ #if defined(SIMDE_MMX_NEON)
427
+ r_.neon_i8 = vreinterpret_s8_u8(vceq_s8(a_.neon_i8, b_.neon_i8));
428
+ #else
429
+ SIMDE__VECTORIZE
430
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
431
+ r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
432
+ }
433
+ #endif
434
+
435
+ return simde__m64_from_private(r_);
436
+ #endif
437
+ }
438
+ #define simde_m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b)
439
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
440
+ # define _mm_cmpeq_pi8(a, b) simde_mm_cmpeq_pi8(a, b)
441
+ # define _m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b)
442
+ #endif
443
+
444
+ SIMDE__FUNCTION_ATTRIBUTES
445
+ simde__m64
446
+ simde_mm_cmpeq_pi16 (simde__m64 a, simde__m64 b) {
447
+ #if defined(SIMDE_MMX_NATIVE)
448
+ return _mm_cmpeq_pi16(a, b);
449
+ #else
450
+ simde__m64_private r_;
451
+ simde__m64_private a_ = simde__m64_to_private(a);
452
+ simde__m64_private b_ = simde__m64_to_private(b);
453
+
454
+ #if defined(SIMDE_MMX_NEON)
455
+ r_.neon_i16 = vreinterpret_s16_u16(vceq_s16(a_.neon_i16, b_.neon_i16));
456
+ #else
457
+ SIMDE__VECTORIZE
458
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
459
+ r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
460
+ }
461
+ #endif
462
+
463
+ return simde__m64_from_private(r_);
464
+ #endif
465
+ }
466
+ #define simde_m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b)
467
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
468
+ # define _mm_cmpeq_pi16(a, b) simde_mm_cmpeq_pi16(a, b)
469
+ # define _m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b)
470
+ #endif
471
+
472
+ SIMDE__FUNCTION_ATTRIBUTES
473
+ simde__m64
474
+ simde_mm_cmpeq_pi32 (simde__m64 a, simde__m64 b) {
475
+ #if defined(SIMDE_MMX_NATIVE)
476
+ return _mm_cmpeq_pi32(a, b);
477
+ #else
478
+ simde__m64_private r_;
479
+ simde__m64_private a_ = simde__m64_to_private(a);
480
+ simde__m64_private b_ = simde__m64_to_private(b);
481
+
482
+ #if defined(SIMDE_MMX_NEON)
483
+ r_.neon_i32 = vreinterpret_s32_u32(vceq_s32(a_.neon_i32, b_.neon_i32));
484
+ #else
485
+ SIMDE__VECTORIZE
486
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
487
+ r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
488
+ }
489
+ #endif
490
+
491
+ return simde__m64_from_private(r_);
492
+ #endif
493
+ }
494
+ #define simde_m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b)
495
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
496
+ # define _mm_cmpeq_pi32(a, b) simde_mm_cmpeq_pi32(a, b)
497
+ # define _m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b)
498
+ #endif
499
+
500
+ SIMDE__FUNCTION_ATTRIBUTES
501
+ simde__m64
502
+ simde_mm_cmpgt_pi8 (simde__m64 a, simde__m64 b) {
503
+ #if defined(SIMDE_MMX_NATIVE)
504
+ return _mm_cmpgt_pi8(a, b);
505
+ #else
506
+ simde__m64_private r_;
507
+ simde__m64_private a_ = simde__m64_to_private(a);
508
+ simde__m64_private b_ = simde__m64_to_private(b);
509
+
510
+ #if defined(SIMDE_MMX_NEON)
511
+ r_.neon_i8 = vreinterpret_s8_u8(vcgt_s8(a_.neon_i8, b_.neon_i8));
512
+ #else
513
+ SIMDE__VECTORIZE
514
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
515
+ r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
516
+ }
517
+ #endif
518
+
519
+ return simde__m64_from_private(r_);
520
+ #endif
521
+ }
522
+ #define simde_m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b)
523
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
524
+ # define _mm_cmpgt_pi8(a, b) simde_mm_cmpgt_pi8(a, b)
525
+ # define _m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b)
526
+ #endif
527
+
528
+ SIMDE__FUNCTION_ATTRIBUTES
529
+ simde__m64
530
+ simde_mm_cmpgt_pi16 (simde__m64 a, simde__m64 b) {
531
+ #if defined(SIMDE_MMX_NATIVE)
532
+ return _mm_cmpgt_pi16(a, b);
533
+ #else
534
+ simde__m64_private r_;
535
+ simde__m64_private a_ = simde__m64_to_private(a);
536
+ simde__m64_private b_ = simde__m64_to_private(b);
537
+
538
+ #if defined(SIMDE_MMX_NEON)
539
+ r_.neon_i16 = vreinterpret_s16_u16(vcgt_s16(a_.neon_i16, b_.neon_i16));
540
+ #else
541
+ SIMDE__VECTORIZE
542
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
543
+ r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
544
+ }
545
+ #endif
546
+
547
+ return simde__m64_from_private(r_);
548
+ #endif
549
+ }
550
+ #define simde_m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b)
551
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
552
+ # define _mm_cmpgt_pi16(a, b) simde_mm_cmpgt_pi16(a, b)
553
+ # define _m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b)
554
+ #endif
555
+
556
+ SIMDE__FUNCTION_ATTRIBUTES
557
+ simde__m64
558
+ simde_mm_cmpgt_pi32 (simde__m64 a, simde__m64 b) {
559
+ #if defined(SIMDE_MMX_NATIVE)
560
+ return _mm_cmpgt_pi32(a, b);
561
+ #else
562
+ simde__m64_private r_;
563
+ simde__m64_private a_ = simde__m64_to_private(a);
564
+ simde__m64_private b_ = simde__m64_to_private(b);
565
+
566
+ #if defined(SIMDE_MMX_NEON)
567
+ r_.neon_i32 = vreinterpret_s32_u32(vcgt_s32(a_.neon_i32, b_.neon_i32));
568
+ #else
569
+ SIMDE__VECTORIZE
570
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
571
+ r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
572
+ }
573
+ #endif
574
+
575
+ return simde__m64_from_private(r_);
576
+ #endif
577
+ }
578
+ #define simde_m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b)
579
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
580
+ # define _mm_cmpgt_pi32(a, b) simde_mm_cmpgt_pi32(a, b)
581
+ # define _m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b)
582
+ #endif
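
As the scalar fallbacks above make explicit, the comparison functions return per-lane masks, all ones where the predicate holds and zero otherwise, and the pi-suffixed compares are signed. A hedged illustration with editor-chosen values:

    /* illustrative only; assumes the same include setup as the sketch above the diff */
    void cmp_demo(void) {
      simde__m64 a  = simde_mm_set_pi32(9, -1);   /* element 1 = 9, element 0 = -1 */
      simde__m64 b  = simde_mm_set_pi32(5,  3);
      simde__m64 eq = simde_mm_cmpeq_pi32(a, b);  /* both lanes 0x00000000: no element is equal */
      simde__m64 gt = simde_mm_cmpgt_pi32(a, b);  /* element 1: 0xFFFFFFFF (9 > 5); element 0: 0 (signed, -1 < 3) */
      (void) eq; (void) gt;
    }
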
583
+
584
+ SIMDE__FUNCTION_ATTRIBUTES
585
+ int64_t
586
+ simde_mm_cvtm64_si64 (simde__m64 a) {
587
+ #if defined(SIMDE_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI)
588
+ return _mm_cvtm64_si64(a);
589
+ #else
590
+ simde__m64_private a_ = simde__m64_to_private(a);
591
+
592
+ #if defined(SIMDE_MMX_NEON)
593
+ return vget_lane_s64(a_.neon_i64, 0);
594
+ #else
595
+ return a_.i64[0];
596
+ #endif
597
+ #endif
598
+ }
599
+ #define simde_m_to_int64(a) simde_mm_cvtm64_si64(a)
600
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
601
+ # define _mm_cvtm64_si64(a) simde_mm_cvtm64_si64(a)
602
+ # define _m_to_int64(a) simde_mm_cvtm64_si64(a)
603
+ #endif
604
+
605
+ SIMDE__FUNCTION_ATTRIBUTES
606
+ simde__m64
607
+ simde_mm_cvtsi32_si64 (int32_t a) {
608
+ #if defined(SIMDE_MMX_NATIVE)
609
+ return _mm_cvtsi32_si64(a);
610
+ #else
611
+ simde__m64_private r_;
612
+
613
+ #if defined(SIMDE_MMX_NEON)
614
+ const int32_t av[sizeof(r_.neon_i32) / sizeof(r_.neon_i32[0])] = { a, 0 };
615
+ r_.neon_i32 = vld1_s32(av);
616
+ #else
617
+ r_.i32[0] = a;
618
+ r_.i32[1] = 0;
619
+ #endif
620
+
621
+ return simde__m64_from_private(r_);
622
+ #endif
623
+ }
624
+ #define simde_m_from_int(a) simde_mm_cvtsi32_si64(a)
625
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
626
+ # define _mm_cvtsi32_si64(a) simde_mm_cvtsi32_si64(a)
627
+ # define _m_from_int(a) simde_mm_cvtsi32_si64(a)
628
+ #endif
629
+
630
+ SIMDE__FUNCTION_ATTRIBUTES
631
+ simde__m64
632
+ simde_mm_cvtsi64_m64 (int64_t a) {
633
+ #if defined(SIMDE_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI)
634
+ return _mm_cvtsi64_m64(a);
635
+ #else
636
+ simde__m64_private r_;
637
+
638
+ #if defined(SIMDE_MMX_NEON)
639
+ r_.neon_i64 = vld1_s64(&a);
640
+ #else
641
+ r_.i64[0] = a;
642
+ #endif
643
+
644
+ return simde__m64_from_private(r_);
645
+ #endif
646
+ }
647
+ #define simde_m_from_int64(a) simde_mm_cvtsi64_m64(a)
648
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
649
+ # define _mm_cvtsi64_m64(a) simde_mm_cvtsi64_m64(a)
650
+ # define _m_from_int64(a) simde_mm_cvtsi64_m64(a)
651
+ #endif
652
+
653
+ SIMDE__FUNCTION_ATTRIBUTES
654
+ int32_t
655
+ simde_mm_cvtsi64_si32 (simde__m64 a) {
656
+ #if defined(SIMDE_MMX_NATIVE)
657
+ return _mm_cvtsi64_si32(a);
658
+ #else
659
+ simde__m64_private a_ = simde__m64_to_private(a);
660
+
661
+ #if defined(SIMDE_MMX_NEON)
662
+ return vget_lane_s32(a_.neon_i32, 0);
663
+ #else
664
+ return a_.i32[0];
665
+ #endif
666
+ #endif
667
+ }
668
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
669
+ # define _mm_cvtsi64_si32(a) simde_mm_cvtsi64_si32(a)
670
+ #endif
671
+
672
+ SIMDE__FUNCTION_ATTRIBUTES
673
+ void
674
+ simde_mm_empty (void) {
675
+ #if defined(SIMDE_MMX_NATIVE)
676
+ _mm_empty();
677
+ #else
678
+ #endif
679
+ }
680
+ #define simde_m_empty() simde_mm_empty()
681
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
682
+ # define _mm_empty() simde_mm_empty()
683
+ # define _m_empty() simde_mm_empty()
684
+ #endif
685
+
686
+ SIMDE__FUNCTION_ATTRIBUTES
687
+ simde__m64
688
+ simde_mm_madd_pi16 (simde__m64 a, simde__m64 b) {
689
+ #if defined(SIMDE_MMX_NATIVE)
690
+ return _mm_madd_pi16(a, b);
691
+ #else
692
+ simde__m64_private r_;
693
+ simde__m64_private a_ = simde__m64_to_private(a);
694
+ simde__m64_private b_ = simde__m64_to_private(b);
695
+
696
+ #if defined(SIMDE_MMX_NEON)
697
+ int32x4_t i1 = vmull_s16(a_.neon_i16, b_.neon_i16);
698
+ r_.neon_i32 = vpadd_s32(vget_low_s32(i1), vget_high_s32(i1));
699
+ #else
700
+ SIMDE__VECTORIZE
701
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i += 2) {
702
+ r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) + (a_.i16[i + 1] * b_.i16[i + 1]);
703
+ }
704
+ #endif
705
+
706
+ return simde__m64_from_private(r_);
707
+ #endif
708
+ }
709
+ #define simde_m_pmaddwd(a, b) simde_mm_madd_pi16(a, b)
710
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
711
+ # define _mm_madd_pi16(a, b) simde_mm_madd_pi16(a, b)
712
+ # define _m_pmaddwd(a, b) simde_mm_madd_pi16(a, b)
713
+ #endif
714
+
715
+ SIMDE__FUNCTION_ATTRIBUTES
716
+ simde__m64
717
+ simde_mm_mulhi_pi16 (simde__m64 a, simde__m64 b) {
718
+ #if defined(SIMDE_MMX_NATIVE)
719
+ return _mm_mulhi_pi16(a, b);
720
+ #else
721
+ simde__m64_private r_;
722
+ simde__m64_private a_ = simde__m64_to_private(a);
723
+ simde__m64_private b_ = simde__m64_to_private(b);
724
+
725
+ #if defined(SIMDE_MMX_NEON)
726
+ const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16);
727
+ const uint32x4_t t2 = vshrq_n_u32(vreinterpretq_u32_s32(t1), 16);
728
+ const uint16x4_t t3 = vmovn_u32(t2);
729
+ r_.neon_i16 = vreinterpret_s16_u16(t3);
730
+ #else
731
+ SIMDE__VECTORIZE
732
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
733
+ r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, ((a_.i16[i] * b_.i16[i]) >> 16));
734
+ }
735
+ #endif
736
+
737
+ return simde__m64_from_private(r_);
738
+ #endif
739
+ }
740
+ #define simde_m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b)
741
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
742
+ # define _mm_mulhi_pi16(a, b) simde_mm_mulhi_pi16(a, b)
743
+ # define _m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b)
744
+ #endif
745
+
746
+ SIMDE__FUNCTION_ATTRIBUTES
747
+ simde__m64
748
+ simde_mm_mullo_pi16 (simde__m64 a, simde__m64 b) {
749
+ #if defined(SIMDE_MMX_NATIVE)
750
+ return _mm_mullo_pi16(a, b);
751
+ #else
752
+ simde__m64_private r_;
753
+ simde__m64_private a_ = simde__m64_to_private(a);
754
+ simde__m64_private b_ = simde__m64_to_private(b);
755
+
756
+ #if defined(SIMDE_MMX_NEON)
757
+ const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16);
758
+ const uint16x4_t t2 = vmovn_u32(vreinterpretq_u32_s32(t1));
759
+ r_.neon_i16 = vreinterpret_s16_u16(t2);
760
+ #else
761
+ SIMDE__VECTORIZE
762
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
763
+ r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, ((a_.i16[i] * b_.i16[i]) & 0xffff));
764
+ }
765
+ #endif
766
+
767
+ return simde__m64_from_private(r_);
768
+ #endif
769
+ }
770
+ #define simde_m_pmullw(a, b) simde_mm_mullo_pi16(a, b)
771
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
772
+ # define _mm_mullo_pi16(a, b) simde_mm_mullo_pi16(a, b)
773
+ # define _m_pmullw(a, b) simde_mm_mullo_pi16(a, b)
774
+ #endif
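
simde_mm_mullo_pi16 and simde_mm_mulhi_pi16 return the low and high 16 bits of each 32-bit product, and simde_mm_madd_pi16 sums adjacent products into 32-bit lanes. A worked example with editor-chosen values (not from the library):

    /* illustrative only; assumes the same include setup as the sketch above the diff */
    void mul_demo(void) {
      simde__m64 a  = simde_mm_set1_pi16(1000);
      simde__m64 b  = simde_mm_set1_pi16(2000);
      simde__m64 lo = simde_mm_mullo_pi16(a, b);  /* 1000 * 2000 = 0x001E8480; each lane keeps 0x8480 */
      simde__m64 hi = simde_mm_mulhi_pi16(a, b);  /* each lane keeps the high half, 0x001E */
      simde__m64 md = simde_mm_madd_pi16(a, b);   /* each i32 lane = 2 * 2,000,000 = 4,000,000 */
      (void) lo; (void) hi; (void) md;
    }
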
775
+
776
+ SIMDE__FUNCTION_ATTRIBUTES
777
+ simde__m64
778
+ simde_mm_or_si64 (simde__m64 a, simde__m64 b) {
779
+ #if defined(SIMDE_MMX_NATIVE)
780
+ return _mm_or_si64(a, b);
781
+ #else
782
+ simde__m64_private r_;
783
+ simde__m64_private a_ = simde__m64_to_private(a);
784
+ simde__m64_private b_ = simde__m64_to_private(b);
785
+
786
+ #if defined(SIMDE_MMX_NEON)
787
+ r_.neon_i32 = vorr_s32(a_.neon_i32, b_.neon_i32);
788
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
789
+ r_.i64 = a_.i64 | b_.i64;
790
+ #else
791
+ r_.i64[0] = a_.i64[0] | b_.i64[0];
792
+ #endif
793
+
794
+ return simde__m64_from_private(r_);
795
+ #endif
796
+ }
797
+ #define simde_m_por(a, b) simde_mm_or_si64(a, b)
798
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
799
+ # define _mm_or_si64(a, b) simde_mm_or_si64(a, b)
800
+ # define _m_por(a, b) simde_mm_or_si64(a, b)
801
+ #endif
802
+
803
+ SIMDE__FUNCTION_ATTRIBUTES
804
+ simde__m64
805
+ simde_mm_packs_pi16 (simde__m64 a, simde__m64 b) {
806
+ #if defined(SIMDE_MMX_NATIVE)
807
+ return _mm_packs_pi16(a, b);
808
+ #else
809
+ simde__m64_private r_;
810
+ simde__m64_private a_ = simde__m64_to_private(a);
811
+ simde__m64_private b_ = simde__m64_to_private(b);
812
+
813
+ #if defined(SIMDE_MMX_NEON)
814
+ r_.neon_i8 = vqmovn_s16(vcombine_s16(a_.neon_i16, b_.neon_i16));
815
+ #else
816
+ SIMDE__VECTORIZE
817
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
818
+ if (a_.i16[i] < INT8_MIN) {
819
+ r_.i8[i] = INT8_MIN;
820
+ } else if (a_.i16[i] > INT8_MAX) {
821
+ r_.i8[i] = INT8_MAX;
822
+ } else {
823
+ r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, a_.i16[i]);
824
+ }
825
+ }
826
+
827
+ SIMDE__VECTORIZE
828
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
829
+ if (b_.i16[i] < INT8_MIN) {
830
+ r_.i8[i + 4] = INT8_MIN;
831
+ } else if (b_.i16[i] > INT8_MAX) {
832
+ r_.i8[i + 4] = INT8_MAX;
833
+ } else {
834
+ r_.i8[i + 4] = HEDLEY_STATIC_CAST(int8_t, b_.i16[i]);
835
+ }
836
+ }
837
+ #endif
838
+
839
+ return simde__m64_from_private(r_);
840
+ #endif
841
+ }
842
+ #define simde_m_packsswb(a, b) simde_mm_packs_pi16(a, b)
843
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
844
+ # define _mm_packs_pi16(a, b) simde_mm_packs_pi16(a, b)
845
+ # define _m_packsswb(a, b) simde_mm_packs_pi16(a, b)
846
+ #endif
847
+
848
+ SIMDE__FUNCTION_ATTRIBUTES
849
+ simde__m64
850
+ simde_mm_packs_pi32 (simde__m64 a, simde__m64 b) {
851
+ #if defined(SIMDE_MMX_NATIVE)
852
+ return _mm_packs_pi32(a, b);
853
+ #else
854
+ simde__m64_private r_;
855
+ simde__m64_private a_ = simde__m64_to_private(a);
856
+ simde__m64_private b_ = simde__m64_to_private(b);
857
+
858
+ #if defined(SIMDE_MMX_NEON)
859
+ r_.neon_i16 = vqmovn_s32(vcombine_s32(a_.neon_i32, b_.neon_i32));
860
+ #else
861
+ SIMDE__VECTORIZE
862
+ for (size_t i = 0 ; i < (8 / sizeof(a_.i32[0])) ; i++) {
863
+ if (a_.i32[i] < SHRT_MIN) {
864
+ r_.i16[i] = SHRT_MIN;
865
+ } else if (a_.i32[i] > INT16_MAX) {
866
+ r_.i16[i] = INT16_MAX;
867
+ } else {
868
+ r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i32[i]);
869
+ }
870
+ }
871
+
872
+ SIMDE__VECTORIZE
873
+ for (size_t i = 0 ; i < (8 / sizeof(b_.i32[0])) ; i++) {
874
+ if (b_.i32[i] < SHRT_MIN) {
875
+ r_.i16[i + 2] = SHRT_MIN;
876
+ } else if (b_.i32[i] > INT16_MAX) {
877
+ r_.i16[i + 2] = INT16_MAX;
878
+ } else {
879
+ r_.i16[i + 2] = HEDLEY_STATIC_CAST(int16_t, b_.i32[i]);
880
+ }
881
+ }
882
+ #endif
883
+
884
+ return simde__m64_from_private(r_);
885
+ #endif
886
+ }
887
+ #define simde_m_packssdw(a, b) simde_mm_packs_pi32(a, b)
888
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
889
+ # define _mm_packs_pi32(a, b) simde_mm_packs_pi32(a, b)
890
+ # define _m_packssdw(a, b) simde_mm_packs_pi32(a, b)
891
+ #endif
892
+
893
+ SIMDE__FUNCTION_ATTRIBUTES
894
+ simde__m64
895
+ simde_mm_packs_pu16 (simde__m64 a, simde__m64 b) {
896
+ #if defined(SIMDE_MMX_NATIVE)
897
+ return _mm_packs_pu16(a, b);
898
+ #else
899
+ simde__m64_private r_;
900
+ simde__m64_private a_ = simde__m64_to_private(a);
901
+ simde__m64_private b_ = simde__m64_to_private(b);
902
+
903
+ #if defined(SIMDE_MMX_NEON) && defined(SIMDE_ARCH_AARCH64)
904
+ const int16x8_t t1 = vcombine_s16(a_.neon_i16, b_.neon_i16);
905
+
906
+ /* Set elements which are < 0 to 0 */
907
+ const int16x8_t t2 = vandq_s16(t1, vreinterpretq_s16_u16(vcgezq_s16(t1)));
908
+
909
+ /* Vector with all s16 elements set to UINT8_MAX */
910
+ const int16x8_t vmax = vmovq_n_s16((int16_t) UINT8_MAX);
911
+
912
+ /* Elements which are within the acceptable range */
913
+ const int16x8_t le_max = vandq_s16(t2, vreinterpretq_s16_u16(vcleq_s16(t2, vmax)));
914
+ const int16x8_t gt_max = vandq_s16(vmax, vreinterpretq_s16_u16(vcgtq_s16(t2, vmax)));
915
+
916
+ /* Final values as 16-bit integers */
917
+ const int16x8_t values = vorrq_s16(le_max, gt_max);
918
+
919
+ r_.neon_u8 = vmovn_u16(vreinterpretq_u16_s16(values));
920
+ #else
921
+ SIMDE__VECTORIZE
922
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
923
+ if (a_.i16[i] > UINT8_MAX) {
924
+ r_.u8[i] = UINT8_MAX;
925
+ } else if (a_.i16[i] < 0) {
926
+ r_.u8[i] = 0;
927
+ } else {
928
+ r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, a_.i16[i]);
929
+ }
930
+ }
931
+
932
+ SIMDE__VECTORIZE
933
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
934
+ if (b_.i16[i] > UINT8_MAX) {
935
+ r_.u8[i + 4] = UINT8_MAX;
936
+ } else if (b_.i16[i] < 0) {
937
+ r_.u8[i + 4] = 0;
938
+ } else {
939
+ r_.u8[i + 4] = HEDLEY_STATIC_CAST(uint8_t, b_.i16[i]);
940
+ }
941
+ }
942
+ #endif
943
+
944
+ return simde__m64_from_private(r_);
945
+ #endif
946
+ }
947
+ #define simde_m_packuswb(a, b) simde_mm_packs_pu16(a, b)
948
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
949
+ # define _mm_packs_pu16(a, b) simde_mm_packs_pu16(a, b)
950
+ # define _m_packuswb(a, b) simde_mm_packs_pu16(a, b)
951
+ #endif
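
The pack functions above narrow with saturation; simde_mm_packs_pu16, for example, takes four signed 16-bit lanes from each argument and clamps them into unsigned bytes. Illustrative values chosen by the editor:

    /* illustrative only; assumes the same include setup as the sketch above the diff */
    void pack_demo(void) {
      simde__m64 a = simde_mm_set_pi16(-5, 300, 17, 255);  /* elements 0..3 = 255, 17, 300, -5 */
      simde__m64 b = simde_mm_setzero_si64();
      simde__m64 p = simde_mm_packs_pu16(a, b);
      /* result bytes 0..3 = 255, 17, 255, 0 (clamped to [0, UINT8_MAX]); bytes 4..7 = 0 */
      (void) p;
    }
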
952
+
953
+ SIMDE__FUNCTION_ATTRIBUTES
954
+ simde__m64
955
+ simde_mm_set_pi8 (int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0) {
956
+ #if defined(SIMDE_MMX_NATIVE)
957
+ return _mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0);
958
+ #else
959
+ simde__m64_private r_;
960
+
961
+ #if defined(SIMDE_MMX_NEON)
962
+ const int8_t v[sizeof(r_.i8) / sizeof(r_.i8[0])] = { e0, e1, e2, e3, e4, e5, e6, e7 };
963
+ r_.neon_i8 = vld1_s8(v);
964
+ #else
965
+ r_.i8[0] = e0;
966
+ r_.i8[1] = e1;
967
+ r_.i8[2] = e2;
968
+ r_.i8[3] = e3;
969
+ r_.i8[4] = e4;
970
+ r_.i8[5] = e5;
971
+ r_.i8[6] = e6;
972
+ r_.i8[7] = e7;
973
+ #endif
974
+
975
+ return simde__m64_from_private(r_);
976
+ #endif
977
+ }
978
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
979
+ # define _mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0)
980
+ #endif
981
+
982
+ SIMDE__FUNCTION_ATTRIBUTES
983
+ simde__m64
984
+ simde_x_mm_set_pu8 (uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4, uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0) {
985
+ simde__m64_private r_;
986
+
987
+ #if defined(SIMDE_MMX_NATIVE)
988
+ r_.n = _mm_set_pi8(
989
+ HEDLEY_STATIC_CAST(int8_t, e7),
990
+ HEDLEY_STATIC_CAST(int8_t, e6),
991
+ HEDLEY_STATIC_CAST(int8_t, e5),
992
+ HEDLEY_STATIC_CAST(int8_t, e4),
993
+ HEDLEY_STATIC_CAST(int8_t, e3),
994
+ HEDLEY_STATIC_CAST(int8_t, e2),
995
+ HEDLEY_STATIC_CAST(int8_t, e1),
996
+ HEDLEY_STATIC_CAST(int8_t, e0));
997
+ #elif defined(SIMDE_MMX_NEON)
998
+ const uint8_t v[sizeof(r_.u8) / sizeof(r_.u8[0])] = { e0, e1, e2, e3, e4, e5, e6, e7 };
999
+ r_.neon_u8 = vld1_u8(v);
1000
+ #else
1001
+ r_.u8[0] = e0;
1002
+ r_.u8[1] = e1;
1003
+ r_.u8[2] = e2;
1004
+ r_.u8[3] = e3;
1005
+ r_.u8[4] = e4;
1006
+ r_.u8[5] = e5;
1007
+ r_.u8[6] = e6;
1008
+ r_.u8[7] = e7;
1009
+ #endif
1010
+
1011
+ return simde__m64_from_private(r_);
1012
+ }
1013
+
1014
+ SIMDE__FUNCTION_ATTRIBUTES
1015
+ simde__m64
1016
+ simde_mm_set_pi16 (int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
1017
+ #if defined(SIMDE_MMX_NATIVE)
1018
+ return _mm_set_pi16(e3, e2, e1, e0);
1019
+ #else
1020
+ simde__m64_private r_;
1021
+
1022
+ #if defined(SIMDE_MMX_NEON)
1023
+ const int16_t v[sizeof(r_.i16) / sizeof(r_.i16[0])] = { e0, e1, e2, e3 };
1024
+ r_.neon_i16 = vld1_s16(v);
1025
+ #else
1026
+ r_.i16[0] = e0;
1027
+ r_.i16[1] = e1;
1028
+ r_.i16[2] = e2;
1029
+ r_.i16[3] = e3;
1030
+ #endif
1031
+ return simde__m64_from_private(r_);
1032
+ #endif
1033
+ }
1034
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1035
+ # define _mm_set_pi16(e3, e2, e1, e0) simde_mm_set_pi16(e3, e2, e1, e0)
1036
+ #endif
1037
+
1038
+ SIMDE__FUNCTION_ATTRIBUTES
1039
+ simde__m64
1040
+ simde_x_mm_set_pu16 (uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) {
1041
+ simde__m64_private r_;
1042
+
1043
+ #if defined(SIMDE_MMX_NATIVE)
1044
+ r_.n = _mm_set_pi16(
1045
+ HEDLEY_STATIC_CAST(int16_t, e3),
1046
+ HEDLEY_STATIC_CAST(int16_t, e2),
1047
+ HEDLEY_STATIC_CAST(int16_t, e1),
1048
+ HEDLEY_STATIC_CAST(int16_t, e0)
1049
+ );
1050
+ #elif defined(SIMDE_MMX_NEON)
1051
+ const uint16_t v[sizeof(r_.u16) / sizeof(r_.u16[0])] = { e0, e1, e2, e3 };
1052
+ r_.neon_u16 = vld1_u16(v);
1053
+ #else
1054
+ r_.u16[0] = e0;
1055
+ r_.u16[1] = e1;
1056
+ r_.u16[2] = e2;
1057
+ r_.u16[3] = e3;
1058
+ #endif
1059
+
1060
+ return simde__m64_from_private(r_);
1061
+ }
1062
+
1063
+ SIMDE__FUNCTION_ATTRIBUTES
1064
+ simde__m64
1065
+ simde_x_mm_set_pu32 (uint32_t e1, uint32_t e0) {
1066
+ simde__m64_private r_;
1067
+
1068
+ #if defined(SIMDE_MMX_NATIVE)
1069
+ r_.n = _mm_set_pi32(
1070
+ HEDLEY_STATIC_CAST(int32_t, e1),
1071
+ HEDLEY_STATIC_CAST(int32_t, e0));
1072
+ #elif defined(SIMDE_MMX_NEON)
1073
+ const uint32_t v[sizeof(r_.u32) / sizeof(r_.u32[0])] = { e0, e1 };
1074
+ r_.neon_u32 = vld1_u32(v);
1075
+ #else
1076
+ r_.u32[0] = e0;
1077
+ r_.u32[1] = e1;
1078
+ #endif
1079
+
1080
+ return simde__m64_from_private(r_);
1081
+ }
1082
+
1083
+ SIMDE__FUNCTION_ATTRIBUTES
1084
+ simde__m64
1085
+ simde_mm_set_pi32 (int32_t e1, int32_t e0) {
1086
+ simde__m64_private r_;
1087
+
1088
+ #if defined(SIMDE_MMX_NATIVE)
1089
+ r_.n = _mm_set_pi32(e1, e0);
1090
+ #elif defined(SIMDE_MMX_NEON)
1091
+ const int32_t v[sizeof(r_.i32) / sizeof(r_.i32[0])] = { e0, e1 };
1092
+ r_.neon_i32 = vld1_s32(v);
1093
+ #else
1094
+ r_.i32[0] = e0;
1095
+ r_.i32[1] = e1;
1096
+ #endif
1097
+
1098
+ return simde__m64_from_private(r_);
1099
+ }
1100
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1101
+ # define _mm_set_pi32(e1, e0) simde_mm_set_pi32(e1, e0)
1102
+ #endif
1103
+
1104
+ SIMDE__FUNCTION_ATTRIBUTES
1105
+ simde__m64
1106
+ simde_x_mm_set_pi64 (int64_t e0) {
1107
+ simde__m64_private r_;
1108
+
1109
+ #if defined(SIMDE_MMX_NEON)
1110
+ const int64_t v[sizeof(r_.i64) / sizeof(r_.i64[0])] = { e0 };
1111
+ r_.neon_i64 = vld1_s64(v);
1112
+ #else
1113
+ r_.i64[0] = e0;
1114
+ #endif
1115
+
1116
+ return simde__m64_from_private(r_);
1117
+ }
1118
+
1119
+
1120
+ SIMDE__FUNCTION_ATTRIBUTES
1121
+ simde__m64
1122
+ simde_x_mm_set_f32x2 (simde_float32 e1, simde_float32 e0) {
1123
+ simde__m64_private r_;
1124
+
1125
+ #if defined(SIMDE_MMX_NEON)
1126
+ const simde_float32 v[sizeof(r_.f32) / sizeof(r_.f32[0])] = { e0, e1 };
1127
+ r_.neon_f32 = vld1_f32(v);
1128
+ #else
1129
+ r_.f32[0] = e0;
1130
+ r_.f32[1] = e1;
1131
+ #endif
1132
+
1133
+ return simde__m64_from_private(r_);
1134
+ }
1135
+
1136
+ SIMDE__FUNCTION_ATTRIBUTES
1137
+ simde__m64
1138
+ simde_mm_set1_pi8 (int8_t a) {
1139
+ #if defined(SIMDE_MMX_NATIVE)
1140
+ return _mm_set1_pi8(a);
1141
+ #elif defined(SIMDE_MMX_NEON)
1142
+ simde__m64_private r_;
1143
+ r_.neon_i8 = vmov_n_s8(a);
1144
+ return simde__m64_from_private(r_);
1145
+ #else
1146
+ return simde_mm_set_pi8(a, a, a, a, a, a, a, a);
1147
+ #endif
1148
+ }
1149
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1150
+ # define _mm_set1_pi8(a) simde_mm_set1_pi8(a)
1151
+ #endif
1152
+
1153
+ SIMDE__FUNCTION_ATTRIBUTES
1154
+ simde__m64
1155
+ simde_mm_set1_pi16 (int16_t a) {
1156
+ #if defined(SIMDE_MMX_NATIVE)
1157
+ return _mm_set1_pi16(a);
1158
+ #elif defined(SIMDE_MMX_NEON)
1159
+ simde__m64_private r_;
1160
+ r_.neon_i16 = vmov_n_s16(a);
1161
+ return simde__m64_from_private(r_);
1162
+ #else
1163
+ return simde_mm_set_pi16(a, a, a, a);
1164
+ #endif
1165
+ }
1166
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1167
+ # define _mm_set1_pi16(a) simde_mm_set1_pi16(a)
1168
+ #endif
1169
+
1170
+ SIMDE__FUNCTION_ATTRIBUTES
1171
+ simde__m64
1172
+ simde_mm_set1_pi32 (int32_t a) {
1173
+ #if defined(SIMDE_MMX_NATIVE)
1174
+ return _mm_set1_pi32(a);
1175
+ #elif defined(SIMDE_MMX_NEON)
1176
+ simde__m64_private r_;
1177
+ r_.neon_i32 = vmov_n_s32(a);
1178
+ return simde__m64_from_private(r_);
1179
+ #else
1180
+ return simde_mm_set_pi32(a, a);
1181
+ #endif
1182
+ }
1183
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1184
+ # define _mm_set1_pi32(a) simde_mm_set1_pi32(a)
1185
+ #endif
1186
+
1187
+ SIMDE__FUNCTION_ATTRIBUTES
1188
+ simde__m64
1189
+ simde_mm_setr_pi8 (int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0) {
1190
+ #if defined(SIMDE_MMX_NATIVE)
1191
+ return _mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0);
1192
+ #else
1193
+ return simde_mm_set_pi8(e0, e1, e2, e3, e4, e5, e6, e7);
1194
+ #endif
1195
+ }
1196
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1197
+ # define _mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0)
1198
+ #endif
1199
+
1200
+ SIMDE__FUNCTION_ATTRIBUTES
1201
+ simde__m64
1202
+ simde_mm_setr_pi16 (int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
1203
+ #if defined(SIMDE_MMX_NATIVE)
1204
+ return _mm_setr_pi16(e3, e2, e1, e0);
1205
+ #else
1206
+ return simde_mm_set_pi16(e0, e1, e2, e3);
1207
+ #endif
1208
+ }
1209
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1210
+ # define _mm_setr_pi16(e3, e2, e1, e0) simde_mm_setr_pi16(e3, e2, e1, e0)
1211
+ #endif
1212
+
1213
+ SIMDE__FUNCTION_ATTRIBUTES
1214
+ simde__m64
1215
+ simde_mm_setr_pi32 (int32_t e1, int32_t e0) {
1216
+ #if defined(SIMDE_MMX_NATIVE)
1217
+ return _mm_setr_pi32(e1, e0);
1218
+ #else
1219
+ return simde_mm_set_pi32(e0, e1);
1220
+ #endif
1221
+ }
1222
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1223
+ # define _mm_setr_pi32(e1, e0) simde_mm_setr_pi32(e1, e0)
1224
+ #endif
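
The set/setr pairs differ only in argument order: simde_mm_set_* lists elements from the highest index down, while simde_mm_setr_* ("reversed") lists them from element 0 up, as the one-line forwarding definitions above show. A minimal sketch:

    /* illustrative only; assumes the same include setup as the sketch above the diff */
    void set_order_demo(void) {
      simde__m64 hi_first = simde_mm_set_pi16(3, 2, 1, 0);   /* element 0 = 0, ..., element 3 = 3 */
      simde__m64 lo_first = simde_mm_setr_pi16(0, 1, 2, 3);  /* same vector, arguments reversed */
      (void) hi_first; (void) lo_first;
    }
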
1225
+
1226
+ SIMDE__FUNCTION_ATTRIBUTES
1227
+ simde__m64
1228
+ simde_mm_setzero_si64 (void) {
1229
+ #if defined(SIMDE_MMX_NATIVE)
1230
+ return _mm_setzero_si64();
1231
+ #elif defined(SIMDE_MMX_NEON)
1232
+ simde__m64_private r_;
1233
+ r_.neon_u32 = vmov_n_u32(0);
1234
+ return simde__m64_from_private(r_);
1235
+ #else
1236
+ return simde_mm_set_pi32(0, 0);
1237
+ #endif
1238
+ }
1239
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1240
+ # define _mm_setzero_si64() simde_mm_setzero_si64()
1241
+ #endif
1242
+
1243
+ SIMDE__FUNCTION_ATTRIBUTES
1244
+ simde__m64
1245
+ simde_mm_setone_si64 (void) {
1246
+ #if defined(SIMDE_SSE_NATIVE)
1247
+ __m64 t = _mm_undefined_ps();
1248
+ return _mm_andnot_ps(t, t);
1249
+ #else
1250
+ simde__m64 r;
1251
+ simde_memset(&r, ~0, sizeof(r));
1252
+ return r;
1253
+ #endif
1254
+ }
1255
+
1256
+ SIMDE__FUNCTION_ATTRIBUTES
1257
+ simde__m64
1258
+ simde_mm_sll_pi16 (simde__m64 a, simde__m64 count) {
1259
+ #if defined(SIMDE_MMX_NATIVE)
1260
+ return _mm_sll_pi16(a, count);
1261
+ #else
1262
+ simde__m64_private r_;
1263
+ simde__m64_private a_ = simde__m64_to_private(a);
1264
+ simde__m64_private count_ = simde__m64_to_private(count);
1265
+
1266
+ #if defined(SIMDE_MMX_NEON)
1267
+ r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16((int16_t) vget_lane_u64(count_.neon_u64, 0)));
1268
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1269
+ r_.i16 = a_.i16 << count_.u64[0];
1270
+ #else
1271
+ if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) {
1272
+ simde_memset(&r_, 0, sizeof(r_));
1273
+ return simde__m64_from_private(r_);
1274
+ }
1275
+
1276
+ SIMDE__VECTORIZE
1277
+ for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1278
+ r_.u16[i] = (uint16_t) (a_.u16[i] << count_.u64[0]);
1279
+ }
1280
+ #endif
1281
+
1282
+ return simde__m64_from_private(r_);
1283
+ #endif
1284
+ }
1285
+ #define simde_m_psllw(a, count) simde_mm_sll_pi16(a, count)
1286
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1287
+ # define _mm_sll_pi16(a, count) simde_mm_sll_pi16(a, count)
1288
+ # define _m_psllw(a, count) simde_mm_sll_pi16(a, count)
1289
+ #endif
1290
+
1291
+ SIMDE__FUNCTION_ATTRIBUTES
1292
+ simde__m64
1293
+ simde_mm_sll_pi32 (simde__m64 a, simde__m64 count) {
1294
+ #if defined(SIMDE_MMX_NATIVE)
1295
+ return _mm_sll_pi32(a, count);
1296
+ #else
1297
+ simde__m64_private r_;
1298
+ simde__m64_private a_ = simde__m64_to_private(a);
1299
+ simde__m64_private count_ = simde__m64_to_private(count);
1300
+
1301
+ #if defined(SIMDE_MMX_NEON)
1302
+ r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32((int32_t) vget_lane_u64(count_.neon_u64, 0)));
1303
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1304
+ r_.i32 = a_.i32 << count_.u64[0];
1305
+ #else
1306
+ if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) {
1307
+ simde_memset(&r_, 0, sizeof(r_));
1308
+ return simde__m64_from_private(r_);
1309
+ }
1310
+
1311
+ SIMDE__VECTORIZE
1312
+ for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
1313
+ r_.u32[i] = a_.u32[i] << count_.u64[0];
1314
+ }
1315
+ #endif
1316
+
1317
+ return simde__m64_from_private(r_);
1318
+ #endif
1319
+ }
1320
+ #define simde_m_pslld(a, count) simde_mm_sll_pi32(a, count)
1321
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1322
+ # define _mm_sll_pi32(a, count) simde_mm_sll_pi32(a, count)
1323
+ # define _m_pslld(a, count) simde_mm_sll_pi32(a, count)
1324
+ #endif
1325
+
1326
+ SIMDE__FUNCTION_ATTRIBUTES
1327
+ simde__m64
1328
+ simde_mm_slli_pi16 (simde__m64 a, int count) {
1329
+ #if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
1330
+ return _mm_slli_pi16(a, count);
1331
+ #else
1332
+ simde__m64_private r_;
1333
+ simde__m64_private a_ = simde__m64_to_private(a);
1334
+
1335
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1336
+ r_.i16 = a_.i16 << count;
1337
+ #elif defined(SIMDE_MMX_NEON)
1338
+ r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16((int16_t) count));
1339
+ #else
1340
+ SIMDE__VECTORIZE
1341
+ for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1342
+ r_.u16[i] = (uint16_t) (a_.u16[i] << count);
1343
+ }
1344
+ #endif
1345
+
1346
+ return simde__m64_from_private(r_);
1347
+ #endif
1348
+ }
1349
+ #define simde_m_psllwi(a, count) simde_mm_slli_pi16(a, count)
1350
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1351
+ # define _mm_slli_pi16(a, count) simde_mm_slli_pi16(a, count)
1352
+ # define _m_psllwi(a, count) simde_mm_slli_pi16(a, count)
1353
+ #endif
1354
+
1355
+ SIMDE__FUNCTION_ATTRIBUTES
1356
+ simde__m64
1357
+ simde_mm_slli_pi32 (simde__m64 a, int count) {
1358
+ #if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
1359
+ return _mm_slli_pi32(a, count);
1360
+ #else
1361
+ simde__m64_private r_;
1362
+ simde__m64_private a_ = simde__m64_to_private(a);
1363
+
1364
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1365
+ r_.i32 = a_.i32 << count;
1366
+ #elif defined(SIMDE_MMX_NEON)
1367
+ r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32((int32_t) count));
1368
+ #else
1369
+ SIMDE__VECTORIZE
1370
+ for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
1371
+ r_.u32[i] = a_.u32[i] << count;
1372
+ }
1373
+ #endif
1374
+
1375
+ return simde__m64_from_private(r_);
1376
+ #endif
1377
+ }
1378
+ #define simde_m_pslldi(a, b) simde_mm_slli_pi32(a, b)
1379
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1380
+ # define _mm_slli_pi32(a, count) simde_mm_slli_pi32(a, count)
1381
+ # define _m_pslldi(a, count) simde_mm_slli_pi32(a, count)
1382
+ #endif
1383
+
1384
+ SIMDE__FUNCTION_ATTRIBUTES
1385
+ simde__m64
1386
+ simde_mm_slli_si64 (simde__m64 a, int count) {
1387
+ #if defined(SIMDE_MMX_NATIVE)
1388
+ return _mm_slli_si64(a, count);
1389
+ #else
1390
+ simde__m64_private r_;
1391
+ simde__m64_private a_ = simde__m64_to_private(a);
1392
+
1393
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1394
+ r_.i64 = a_.i64 << count;
1395
+ #elif defined(SIMDE_MMX_NEON)
1396
+ r_.neon_i64 = vshl_s64(a_.neon_i64, vmov_n_s64((int64_t) count));
1397
+ #else
1398
+ r_.u64[0] = a_.u64[0] << count;
1399
+ #endif
1400
+
1401
+ return simde__m64_from_private(r_);
1402
+ #endif
1403
+ }
1404
+ #define simde_m_psllqi(a, count) simde_mm_slli_si64(a, count)
1405
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1406
+ # define _mm_slli_si64(a, count) simde_mm_slli_si64(a, count)
1407
+ # define _m_psllqi(a, count) simde_mm_slli_si64(a, count)
1408
+ #endif
1409
+
1410
+ SIMDE__FUNCTION_ATTRIBUTES
1411
+ simde__m64
1412
+ simde_mm_sll_si64 (simde__m64 a, simde__m64 count) {
1413
+ #if defined(SIMDE_MMX_NATIVE)
1414
+ return _mm_sll_si64(a, count);
1415
+ #else
1416
+ simde__m64_private r_;
1417
+ simde__m64_private a_ = simde__m64_to_private(a);
1418
+ simde__m64_private count_ = simde__m64_to_private(count);
1419
+
1420
+ #if defined(SIMDE_MMX_NEON)
1421
+ r_.neon_i64 = vshl_s64(a_.neon_i64, count_.neon_i64);
1422
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1423
+ r_.i64 = a_.i64 << count_.i64;
1424
+ #else
1425
+ if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) {
1426
+ simde_memset(&r_, 0, sizeof(r_));
1427
+ return simde__m64_from_private(r_);
1428
+ }
1429
+
1430
+ r_.u64[0] = a_.u64[0] << count_.u64[0];
1431
+ #endif
1432
+
1433
+ return simde__m64_from_private(r_);
1434
+ #endif
1435
+ }
1436
+ #define simde_m_psllq(a, count) simde_mm_sll_si64(a, count)
1437
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1438
+ # define _mm_sll_si64(a, count) simde_mm_sll_si64(a, count)
1439
+ # define _m_psllq(a, count) simde_mm_sll_si64(a, count)
1440
+ #endif
1441
+
1442
+ SIMDE__FUNCTION_ATTRIBUTES
1443
+ simde__m64
1444
+ simde_mm_srl_pi16 (simde__m64 a, simde__m64 count) {
1445
+ #if defined(SIMDE_MMX_NATIVE)
1446
+ return _mm_srl_pi16(a, count);
1447
+ #else
1448
+ simde__m64_private r_;
1449
+ simde__m64_private a_ = simde__m64_to_private(a);
1450
+ simde__m64_private count_ = simde__m64_to_private(count);
1451
+
1452
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1453
+ r_.u16 = a_.u16 >> count_.u64[0];
1454
+ #elif defined(SIMDE_MMX_NEON)
1455
+ r_.neon_u16 = vshl_u16(a_.neon_u16, vmov_n_s16(-((int16_t) vget_lane_u64(count_.neon_u64, 0))));
1456
+ #else
1457
+ if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) {
1458
+ simde_memset(&r_, 0, sizeof(r_));
1459
+ return simde__m64_from_private(r_);
1460
+ }
1461
+
1462
+ SIMDE__VECTORIZE
1463
+ for (size_t i = 0 ; i < sizeof(r_.u16) / sizeof(r_.u16[0]) ; i++) {
1464
+ r_.u16[i] = a_.u16[i] >> count_.u64[0];
1465
+ }
1466
+ #endif
1467
+
1468
+ return simde__m64_from_private(r_);
1469
+ #endif
1470
+ }
1471
+ #define simde_m_psrlw(a, count) simde_mm_srl_pi16(a, count)
1472
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1473
+ # define _mm_srl_pi16(a, count) simde_mm_srl_pi16(a, count)
1474
+ # define _m_psrlw(a, count) simde_mm_srl_pi16(a, count)
1475
+ #endif
1476
+
1477
+ SIMDE__FUNCTION_ATTRIBUTES
1478
+ simde__m64
1479
+ simde_mm_srl_pi32 (simde__m64 a, simde__m64 count) {
1480
+ #if defined(SIMDE_MMX_NATIVE)
1481
+ return _mm_srl_pi32(a, count);
1482
+ #else
1483
+ simde__m64_private r_;
1484
+ simde__m64_private a_ = simde__m64_to_private(a);
1485
+ simde__m64_private count_ = simde__m64_to_private(count);
1486
+
1487
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1488
+ r_.u32 = a_.u32 >> count_.u64[0];
1489
+ #elif defined(SIMDE_MMX_NEON)
1490
+ r_.neon_u32 = vshl_u32(a_.neon_u32, vmov_n_s32(-((int32_t) vget_lane_u64(count_.neon_u64, 0))));
1491
+ #else
1492
+ if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) {
1493
+ simde_memset(&r_, 0, sizeof(r_));
1494
+ return simde__m64_from_private(r_);
1495
+ }
1496
+
1497
+ SIMDE__VECTORIZE
1498
+ for (size_t i = 0 ; i < sizeof(r_.u32) / sizeof(r_.u32[0]) ; i++) {
1499
+ r_.u32[i] = a_.u32[i] >> count_.u64[0];
1500
+ }
1501
+ #endif
1502
+
1503
+ return simde__m64_from_private(r_);
1504
+ #endif
1505
+ }
1506
+ #define simde_m_psrld(a, count) simde_mm_srl_pi32(a, count)
1507
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1508
+ # define _mm_srl_pi32(a, count) simde_mm_srl_pi32(a, count)
1509
+ # define _m_psrld(a, count) simde_mm_srl_pi32(a, count)
1510
+ #endif
1511
+
1512
+ SIMDE__FUNCTION_ATTRIBUTES
1513
+ simde__m64
1514
+ simde_mm_srli_pi16 (simde__m64 a, int count) {
1515
+ #if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
1516
+ return _mm_srli_pi16(a, count);
1517
+ #else
1518
+ simde__m64_private r_;
1519
+ simde__m64_private a_ = simde__m64_to_private(a);
1520
+
1521
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1522
+ r_.u16 = a_.u16 >> count;
1523
+ #elif defined(SIMDE_MMX_NEON)
1524
+ r_.neon_u16 = vshl_u16(a_.neon_u16, vmov_n_s16(-((int16_t) count)));
1525
+ #else
1526
+ SIMDE__VECTORIZE
1527
+ for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1528
+ r_.u16[i] = a_.u16[i] >> count;
1529
+ }
1530
+ #endif
1531
+
1532
+ return simde__m64_from_private(r_);
1533
+ #endif
1534
+ }
1535
+ #define simde_m_psrlwi(a, count) simde_mm_srli_pi16(a, count)
1536
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1537
+ # define _mm_srli_pi16(a, count) simde_mm_srli_pi16(a, count)
1538
+ # define _m_psrlwi(a, count) simde_mm_srli_pi16(a, count)
1539
+ #endif
1540
+
1541
+ SIMDE__FUNCTION_ATTRIBUTES
1542
+ simde__m64
1543
+ simde_mm_srli_pi32 (simde__m64 a, int count) {
1544
+ #if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
1545
+ return _mm_srli_pi32(a, count);
1546
+ #else
1547
+ simde__m64_private r_;
1548
+ simde__m64_private a_ = simde__m64_to_private(a);
1549
+
1550
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1551
+ r_.u32 = a_.u32 >> count;
1552
+ #elif defined(SIMDE_MMX_NEON)
1553
+ r_.neon_u32 = vshl_u32(a_.neon_u32, vmov_n_s32(-((int32_t) count)));
1554
+ #else
1555
+ SIMDE__VECTORIZE
1556
+ for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
1557
+ r_.u32[i] = a_.u32[i] >> count;
1558
+ }
1559
+ #endif
1560
+
1561
+ return simde__m64_from_private(r_);
1562
+ #endif
1563
+ }
1564
+ #define simde_m_psrldi(a, count) simde_mm_srli_pi32(a, count)
1565
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1566
+ # define _mm_srli_pi32(a, count) simde_mm_srli_pi32(a, count)
1567
+ # define _m_psrldi(a, count) simde_mm_srli_pi32(a, count)
1568
+ #endif
1569
+
1570
+ SIMDE__FUNCTION_ATTRIBUTES
1571
+ simde__m64
1572
+ simde_mm_srli_si64 (simde__m64 a, int count) {
1573
+ #if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
1574
+ return _mm_srli_si64(a, count);
1575
+ #else
1576
+ simde__m64_private r_;
1577
+ simde__m64_private a_ = simde__m64_to_private(a);
1578
+
1579
+ #if defined(SIMDE_MMX_NEON)
1580
+ r_.neon_u64 = vshl_u64(a_.neon_u64, vmov_n_s64(-count));
1581
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1582
+ r_.u64 = a_.u64 >> count;
1583
+ #else
1584
+ r_.u64[0] = a_.u64[0] >> count;
1585
+ #endif
1586
+
1587
+ return simde__m64_from_private(r_);
1588
+ #endif
1589
+ }
1590
+ #define simde_m_psrlqi(a, count) simde_mm_srli_si64(a, count)
1591
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1592
+ # define _mm_srli_si64(a, count) simde_mm_srli_si64(a, count)
1593
+ # define _m_psrlqi(a, count) simde_mm_srli_si64(a, count)
1594
+ #endif
1595
+
1596
+ SIMDE__FUNCTION_ATTRIBUTES
1597
+ simde__m64
1598
+ simde_mm_srl_si64 (simde__m64 a, simde__m64 count) {
1599
+ #if defined(SIMDE_MMX_NATIVE)
1600
+ return _mm_srl_si64(a, count);
1601
+ #else
1602
+ simde__m64_private r_;
1603
+ simde__m64_private a_ = simde__m64_to_private(a);
1604
+ simde__m64_private count_ = simde__m64_to_private(count);
1605
+
1606
+ #if defined(SIMDE_MMX_NEON) && defined(SIMDE_ARCH_AARCH64)
1607
+ r_.neon_u64 = vshl_u64(a_.neon_u64, vneg_s64(count_.neon_i64));
1608
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1609
+ r_.u64 = a_.u64 >> count_.u64;
1610
+ #else
1611
+ if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) {
1612
+ simde_memset(&r_, 0, sizeof(r_));
1613
+ return simde__m64_from_private(r_);
1614
+ }
1615
+
1616
+ r_.u64[0] = a_.u64[0] >> count_.u64[0];
1617
+ #endif
1618
+
1619
+ return simde__m64_from_private(r_);
1620
+ #endif
1621
+ }
1622
+ #define simde_m_psrlq(a, count) simde_mm_srl_si64(a, count)
1623
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1624
+ # define _mm_srl_si64(a, count) simde_mm_srl_si64(a, count)
1625
+ # define _m_psrlq(a, count) simde_mm_srl_si64(a, count)
1626
+ #endif
1627
+
1628
+ SIMDE__FUNCTION_ATTRIBUTES
1629
+ simde__m64
1630
+ simde_mm_srai_pi16 (simde__m64 a, int count) {
1631
+ #if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
1632
+ return _mm_srai_pi16(a, count);
1633
+ #else
1634
+ simde__m64_private r_;
1635
+ simde__m64_private a_ = simde__m64_to_private(a);
1636
+
1637
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1638
+ r_.i16 = a_.i16 >> (count & 0xff);
1639
+ #elif defined(SIMDE_MMX_NEON)
1640
+ r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16(-HEDLEY_STATIC_CAST(int16_t, count)));
1641
+ #else
1642
+ SIMDE__VECTORIZE
1643
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1644
+ r_.i16[i] = a_.i16[i] >> (count & 0xff);
1645
+ }
1646
+ #endif
1647
+
1648
+ return simde__m64_from_private(r_);
1649
+ #endif
1650
+ }
1651
+ #define simde_m_psrawi(a, count) simde_mm_srai_pi16(a, count)
1652
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1653
+ # define _mm_srai_pi16(a, count) simde_mm_srai_pi16(a, count)
1654
+ # define _m_psrawi(a, count) simde_mm_srai_pi16(a, count)
1655
+ #endif
1656
+
1657
+ SIMDE__FUNCTION_ATTRIBUTES
1658
+ simde__m64
1659
+ simde_mm_srai_pi32 (simde__m64 a, int count) {
1660
+ #if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
1661
+ return _mm_srai_pi32(a, count);
1662
+ #else
1663
+ simde__m64_private r_;
1664
+ simde__m64_private a_ = simde__m64_to_private(a);
1665
+
1666
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1667
+ r_.i32 = a_.i32 >> (count & 0xff);
1668
+ #elif defined(SIMDE_MMX_NEON)
1669
+ r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32(-HEDLEY_STATIC_CAST(int32_t, count)));
1670
+ #else
1671
+ SIMDE__VECTORIZE
1672
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1673
+ r_.i32[i] = a_.i32[i] >> (count & 0xff);
1674
+ }
1675
+ #endif
1676
+
1677
+ return simde__m64_from_private(r_);
1678
+ #endif
1679
+ }
1680
+ #define simde_m_psradi(a, count) simde_mm_srai_pi32(a, count)
1681
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1682
+ # define _mm_srai_pi32(a, count) simde_mm_srai_pi32(a, count)
1683
+ # define _m_psradi(a, count) simde_mm_srai_pi32(a, count)
1684
+ #endif
1685
+
1686
+ SIMDE__FUNCTION_ATTRIBUTES
1687
+ simde__m64
1688
+ simde_mm_sra_pi16 (simde__m64 a, simde__m64 count) {
1689
+ #if defined(SIMDE_MMX_NATIVE)
1690
+ return _mm_sra_pi16(a, count);
1691
+ #else
1692
+ simde__m64_private r_;
1693
+ simde__m64_private a_ = simde__m64_to_private(a);
1694
+ simde__m64_private count_ = simde__m64_to_private(count);
1695
+ const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 15 ? 15 : count_.i64[0]));
1696
+
1697
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1698
+ r_.i16 = a_.i16 >> cnt;
1699
+ #elif defined(SIMDE_MMX_NEON)
1700
+ r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16(-HEDLEY_STATIC_CAST(int16_t, vget_lane_u64(count_.neon_u64, 0))));
1701
+ #else
1702
+ SIMDE__VECTORIZE
1703
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1704
+ r_.i16[i] = a_.i16[i] >> cnt;
1705
+ }
1706
+ #endif
1707
+
1708
+ return simde__m64_from_private(r_);
1709
+ #endif
1710
+ }
1711
+ #define simde_m_psraw(a, count) simde_mm_sra_pi16(a, count)
1712
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1713
+ # define _mm_sra_pi16(a, count) simde_mm_sra_pi16(a, count)
1714
+ # define _m_psraw(a, count) simde_mm_sra_pi16(a, count)
1715
+ #endif
1716
+
1717
+ SIMDE__FUNCTION_ATTRIBUTES
1718
+ simde__m64
1719
+ simde_mm_sra_pi32 (simde__m64 a, simde__m64 count) {
1720
+ #if defined(SIMDE_MMX_NATIVE)
1721
+ return _mm_sra_pi32(a, count);
1722
+ #else
1723
+ simde__m64_private r_;
1724
+ simde__m64_private a_ = simde__m64_to_private(a);
1725
+ simde__m64_private count_ = simde__m64_to_private(count);
1726
+ const int32_t cnt = (count_.u64[0] > 31) ? 31 : HEDLEY_STATIC_CAST(int32_t, count_.u64[0]);
1727
+
1728
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1729
+ r_.i32 = a_.i32 >> cnt;
1730
+ #elif defined(SIMDE_MMX_NEON)
1731
+ r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32(-HEDLEY_STATIC_CAST(int32_t, vget_lane_u64(count_.neon_u64, 0))));
1732
+ #else
1733
+ SIMDE__VECTORIZE
1734
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1735
+ r_.i32[i] = a_.i32[i] >> cnt;
1736
+ }
1737
+ #endif
1738
+
1739
+ return simde__m64_from_private(r_);
1740
+ #endif
1741
+ }
1742
+ #define simde_m_psrad(a, count) simde_mm_sra_pi32(a, count)
1743
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1744
+ # define _mm_sra_pi32(a, count) simde_mm_sra_pi32(a, count)
1745
+ # define _m_psrad(a, count) simde_mm_sra_pi32(a, count)
1746
+ #endif
1747
+
1748
+ SIMDE__FUNCTION_ATTRIBUTES
1749
+ simde__m64
1750
+ simde_mm_sub_pi8 (simde__m64 a, simde__m64 b) {
1751
+ #if defined(SIMDE_MMX_NATIVE)
1752
+ return _mm_sub_pi8(a, b);
1753
+ #else
1754
+ simde__m64_private r_;
1755
+ simde__m64_private a_ = simde__m64_to_private(a);
1756
+ simde__m64_private b_ = simde__m64_to_private(b);
1757
+
1758
+ #if defined(SIMDE_MMX_NEON)
1759
+ r_.neon_i8 = vsub_s8(a_.neon_i8, b_.neon_i8);
1760
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1761
+ r_.i8 = a_.i8 - b_.i8;
1762
+ #else
1763
+ SIMDE__VECTORIZE
1764
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1765
+ r_.i8[i] = a_.i8[i] - b_.i8[i];
1766
+ }
1767
+ #endif
1768
+
1769
+ return simde__m64_from_private(r_);
1770
+ #endif
1771
+ }
1772
+ #define simde_m_psubb(a, b) simde_mm_sub_pi8(a, b)
1773
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1774
+ # define _mm_sub_pi8(a, b) simde_mm_sub_pi8(a, b)
1775
+ # define _m_psubb(a, b) simde_mm_sub_pi8(a, b)
1776
+ #endif
1777
+
1778
+ SIMDE__FUNCTION_ATTRIBUTES
1779
+ simde__m64
1780
+ simde_mm_sub_pi16 (simde__m64 a, simde__m64 b) {
1781
+ #if defined(SIMDE_MMX_NATIVE)
1782
+ return _mm_sub_pi16(a, b);
1783
+ #else
1784
+ simde__m64_private r_;
1785
+ simde__m64_private a_ = simde__m64_to_private(a);
1786
+ simde__m64_private b_ = simde__m64_to_private(b);
1787
+
1788
+ #if defined(SIMDE_MMX_NEON)
1789
+ r_.neon_i16 = vsub_s16(a_.neon_i16, b_.neon_i16);
1790
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1791
+ r_.i16 = a_.i16 - b_.i16;
1792
+ #else
1793
+ SIMDE__VECTORIZE
1794
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1795
+ r_.i16[i] = a_.i16[i] - b_.i16[i];
1796
+ }
1797
+ #endif
1798
+
1799
+ return simde__m64_from_private(r_);
1800
+ #endif
1801
+ }
1802
+ #define simde_m_psubw(a, b) simde_mm_sub_pi16(a, b)
1803
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1804
+ # define _mm_sub_pi16(a, b) simde_mm_sub_pi16(a, b)
1805
+ # define _m_psubw(a, b) simde_mm_sub_pi16(a, b)
1806
+ #endif
1807
+
1808
+ SIMDE__FUNCTION_ATTRIBUTES
1809
+ simde__m64
1810
+ simde_mm_sub_pi32 (simde__m64 a, simde__m64 b) {
1811
+ #if defined(SIMDE_MMX_NATIVE)
1812
+ return _mm_sub_pi32(a, b);
1813
+ #else
1814
+ simde__m64_private r_;
1815
+ simde__m64_private a_ = simde__m64_to_private(a);
1816
+ simde__m64_private b_ = simde__m64_to_private(b);
1817
+
1818
+ #if defined(SIMDE_MMX_NEON)
1819
+ r_.neon_i32 = vsub_s32(a_.neon_i32, b_.neon_i32);
1820
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1821
+ r_.i32 = a_.i32 - b_.i32;
1822
+ #else
1823
+ SIMDE__VECTORIZE
1824
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1825
+ r_.i32[i] = a_.i32[i] - b_.i32[i];
1826
+ }
1827
+ #endif
1828
+
1829
+ return simde__m64_from_private(r_);
1830
+ #endif
1831
+ }
1832
+ #define simde_m_psubd(a, b) simde_mm_sub_pi32(a, b)
1833
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1834
+ # define _mm_sub_pi32(a, b) simde_mm_sub_pi32(a, b)
1835
+ # define _m_psubd(a, b) simde_mm_sub_pi32(a, b)
1836
+ #endif
1837
+
1838
+ SIMDE__FUNCTION_ATTRIBUTES
1839
+ simde__m64
1840
+ simde_mm_subs_pi8 (simde__m64 a, simde__m64 b) {
1841
+ #if defined(SIMDE_MMX_NATIVE)
1842
+ return _mm_subs_pi8(a, b);
1843
+ #else
1844
+ simde__m64_private r_;
1845
+ simde__m64_private a_ = simde__m64_to_private(a);
1846
+ simde__m64_private b_ = simde__m64_to_private(b);
1847
+
1848
+ #if defined(SIMDE_MMX_NEON)
1849
+ r_.neon_i8 = vqsub_s8(a_.neon_i8, b_.neon_i8);
1850
+ #else
1851
+ SIMDE__VECTORIZE
1852
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1853
+ if (((b_.i8[i]) > 0 && (a_.i8[i]) < INT8_MIN + (b_.i8[i]))) {
1854
+ r_.i8[i] = INT8_MIN;
1855
+ } else if ((b_.i8[i]) < 0 && (a_.i8[i]) > INT8_MAX + (b_.i8[i])) {
1856
+ r_.i8[i] = INT8_MAX;
1857
+ } else {
1858
+ r_.i8[i] = (a_.i8[i]) - (b_.i8[i]);
1859
+ }
1860
+ }
1861
+ #endif
1862
+
1863
+ return simde__m64_from_private(r_);
1864
+ #endif
1865
+ }
1866
+ #define simde_m_psubsb(a, b) simde_mm_subs_pi8(a, b)
1867
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1868
+ # define _mm_subs_pi8(a, b) simde_mm_subs_pi8(a, b)
1869
+ # define _m_psubsb(a, b) simde_mm_subs_pi8(a, b)
1870
+ #endif
1871
+
1872
+ SIMDE__FUNCTION_ATTRIBUTES
1873
+ simde__m64
1874
+ simde_mm_subs_pu8 (simde__m64 a, simde__m64 b) {
1875
+ #if defined(SIMDE_MMX_NATIVE)
1876
+ return _mm_subs_pu8(a, b);
1877
+ #else
1878
+ simde__m64_private r_;
1879
+ simde__m64_private a_ = simde__m64_to_private(a);
1880
+ simde__m64_private b_ = simde__m64_to_private(b);
1881
+
1882
+ #if defined(SIMDE_MMX_NEON)
1883
+ r_.neon_u8 = vqsub_u8(a_.neon_u8, b_.neon_u8);
1884
+ #else
1885
+ SIMDE__VECTORIZE
1886
+ for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
1887
+ const int32_t x = a_.u8[i] - b_.u8[i];
1888
+ if (x < 0) {
1889
+ r_.u8[i] = 0;
1890
+ } else if (x > UINT8_MAX) {
1891
+ r_.u8[i] = UINT8_MAX;
1892
+ } else {
1893
+ r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
1894
+ }
1895
+ }
1896
+ #endif
1897
+
1898
+ return simde__m64_from_private(r_);
1899
+ #endif
1900
+ }
1901
+ #define simde_m_psubusb(a, b) simde_mm_subs_pu8(a, b)
1902
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1903
+ # define _mm_subs_pu8(a, b) simde_mm_subs_pu8(a, b)
1904
+ # define _m_psubusb(a, b) simde_mm_subs_pu8(a, b)
1905
+ #endif
1906
+
1907
+ SIMDE__FUNCTION_ATTRIBUTES
1908
+ simde__m64
1909
+ simde_mm_subs_pi16 (simde__m64 a, simde__m64 b) {
1910
+ #if defined(SIMDE_MMX_NATIVE)
1911
+ return _mm_subs_pi16(a, b);
1912
+ #else
1913
+ simde__m64_private r_;
1914
+ simde__m64_private a_ = simde__m64_to_private(a);
1915
+ simde__m64_private b_ = simde__m64_to_private(b);
1916
+
1917
+ #if defined(SIMDE_MMX_NEON)
1918
+ r_.neon_i16 = vqsub_s16(a_.neon_i16, b_.neon_i16);
1919
+ #else
1920
+ SIMDE__VECTORIZE
1921
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1922
+ if (((b_.i16[i]) > 0 && (a_.i16[i]) < INT16_MIN + (b_.i16[i]))) {
1923
+ r_.i16[i] = INT16_MIN;
1924
+ } else if ((b_.i16[i]) < 0 && (a_.i16[i]) > INT16_MAX + (b_.i16[i])) {
1925
+ r_.i16[i] = INT16_MAX;
1926
+ } else {
1927
+ r_.i16[i] = (a_.i16[i]) - (b_.i16[i]);
1928
+ }
1929
+ }
1930
+ #endif
1931
+
1932
+ return simde__m64_from_private(r_);
1933
+ #endif
1934
+ }
1935
+ #define simde_m_psubsw(a, b) simde_mm_subs_pi16(a, b)
1936
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1937
+ # define _mm_subs_pi16(a, b) simde_mm_subs_pi16(a, b)
1938
+ # define _m_psubsw(a, b) simde_mm_subs_pi16(a, b)
1939
+ #endif
1940
+
1941
+ SIMDE__FUNCTION_ATTRIBUTES
1942
+ simde__m64
1943
+ simde_mm_subs_pu16 (simde__m64 a, simde__m64 b) {
1944
+ #if defined(SIMDE_MMX_NATIVE)
1945
+ return _mm_subs_pu16(a, b);
1946
+ #else
1947
+ simde__m64_private r_;
1948
+ simde__m64_private a_ = simde__m64_to_private(a);
1949
+ simde__m64_private b_ = simde__m64_to_private(b);
1950
+
1951
+ #if defined(SIMDE_MMX_NEON)
1952
+ r_.neon_u16 = vqsub_u16(a_.neon_u16, b_.neon_u16);
1953
+ #else
1954
+ SIMDE__VECTORIZE
1955
+ for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1956
+ const int32_t x = a_.u16[i] - b_.u16[i];
1957
+ if (x < 0) {
1958
+ r_.u16[i] = 0;
1959
+ } else if (x > UINT16_MAX) {
1960
+ r_.u16[i] = UINT16_MAX;
1961
+ } else {
1962
+ r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
1963
+ }
1964
+ }
1965
+ #endif
1966
+
1967
+ return simde__m64_from_private(r_);
1968
+ #endif
1969
+ }
1970
+ #define simde_m_psubusw(a, b) simde_mm_subs_pu16(a, b)
1971
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
1972
+ # define _mm_subs_pu16(a, b) simde_mm_subs_pu16(a, b)
1973
+ # define _m_psubusw(a, b) simde_mm_subs_pu16(a, b)
1974
+ #endif
1975
+
1976
+ SIMDE__FUNCTION_ATTRIBUTES
1977
+ simde__m64
1978
+ simde_mm_unpackhi_pi8 (simde__m64 a, simde__m64 b) {
1979
+ #if defined(SIMDE_MMX_NATIVE)
1980
+ return _mm_unpackhi_pi8(a, b);
1981
+ #else
1982
+ simde__m64_private r_;
1983
+ simde__m64_private a_ = simde__m64_to_private(a);
1984
+ simde__m64_private b_ = simde__m64_to_private(b);
1985
+
1986
+ #if defined(SIMDE_MMX_NEON) && defined(SIMDE_ARCH_AARCH64)
1987
+ r_.neon_i8 = vzip2_s8(a_.neon_i8, b_.neon_i8);
1988
+ #elif defined(SIMDE__SHUFFLE_VECTOR)
1989
+ r_.i8 = SIMDE__SHUFFLE_VECTOR(8, 8, a_.i8, b_.i8, 4, 12, 5, 13, 6, 14, 7, 15);
1990
+ #else
1991
+ r_.i8[0] = a_.i8[4];
1992
+ r_.i8[1] = b_.i8[4];
1993
+ r_.i8[2] = a_.i8[5];
1994
+ r_.i8[3] = b_.i8[5];
1995
+ r_.i8[4] = a_.i8[6];
1996
+ r_.i8[5] = b_.i8[6];
1997
+ r_.i8[6] = a_.i8[7];
1998
+ r_.i8[7] = b_.i8[7];
1999
+ #endif
2000
+
2001
+ return simde__m64_from_private(r_);
2002
+ #endif
2003
+ }
2004
+ #define simde_m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b)
2005
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
2006
+ # define _mm_unpackhi_pi8(a, b) simde_mm_unpackhi_pi8(a, b)
2007
+ # define _m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b)
2008
+ #endif
2009
+
2010
+ SIMDE__FUNCTION_ATTRIBUTES
2011
+ simde__m64
2012
+ simde_mm_unpackhi_pi16 (simde__m64 a, simde__m64 b) {
2013
+ #if defined(SIMDE_MMX_NATIVE)
2014
+ return _mm_unpackhi_pi16(a, b);
2015
+ #else
2016
+ simde__m64_private r_;
2017
+ simde__m64_private a_ = simde__m64_to_private(a);
2018
+ simde__m64_private b_ = simde__m64_to_private(b);
2019
+
2020
+ #if defined(SIMDE_MMX_NEON) && defined(SIMDE_ARCH_AARCH64)
2021
+ r_.neon_i16 = vzip2_s16(a_.neon_i16, b_.neon_i16);
2022
+ #elif defined(SIMDE__SHUFFLE_VECTOR)
2023
+ r_.i16 = SIMDE__SHUFFLE_VECTOR(16, 8, a_.i16, b_.i16, 2, 6, 3, 7);
2024
+ #else
2025
+ r_.i16[0] = a_.i16[2];
2026
+ r_.i16[1] = b_.i16[2];
2027
+ r_.i16[2] = a_.i16[3];
2028
+ r_.i16[3] = b_.i16[3];
2029
+ #endif
2030
+
2031
+ return simde__m64_from_private(r_);
2032
+ #endif
2033
+ }
2034
+ #define simde_m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b)
2035
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
2036
+ # define _mm_unpackhi_pi16(a, b) simde_mm_unpackhi_pi16(a, b)
2037
+ # define _m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b)
2038
+ #endif
2039
+
2040
+ SIMDE__FUNCTION_ATTRIBUTES
2041
+ simde__m64
2042
+ simde_mm_unpackhi_pi32 (simde__m64 a, simde__m64 b) {
2043
+ #if defined(SIMDE_MMX_NATIVE)
2044
+ return _mm_unpackhi_pi32(a, b);
2045
+ #else
2046
+ simde__m64_private r_;
2047
+ simde__m64_private a_ = simde__m64_to_private(a);
2048
+ simde__m64_private b_ = simde__m64_to_private(b);
2049
+
2050
+ #if defined(SIMDE_MMX_NEON) && defined(SIMDE_ARCH_AARCH64)
2051
+ r_.neon_i32 = vzip2_s32(a_.neon_i32, b_.neon_i32);
2052
+ #elif defined(SIMDE__SHUFFLE_VECTOR)
2053
+ r_.i32 = SIMDE__SHUFFLE_VECTOR(32, 8, a_.i32, b_.i32, 1, 3);
2054
+ #else
2055
+ r_.i32[0] = a_.i32[1];
2056
+ r_.i32[1] = b_.i32[1];
2057
+ #endif
2058
+
2059
+ return simde__m64_from_private(r_);
2060
+ #endif
2061
+ }
2062
+ #define simde_m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b)
2063
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
2064
+ # define _mm_unpackhi_pi32(a, b) simde_mm_unpackhi_pi32(a, b)
2065
+ # define _m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b)
2066
+ #endif
2067
+
2068
+ SIMDE__FUNCTION_ATTRIBUTES
2069
+ simde__m64
2070
+ simde_mm_unpacklo_pi8 (simde__m64 a, simde__m64 b) {
2071
+ #if defined(SIMDE_MMX_NATIVE)
2072
+ return _mm_unpacklo_pi8(a, b);
2073
+ #else
2074
+ simde__m64_private r_;
2075
+ simde__m64_private a_ = simde__m64_to_private(a);
2076
+ simde__m64_private b_ = simde__m64_to_private(b);
2077
+
2078
+ #if defined(SIMDE_MMX_NEON) && defined(SIMDE_ARCH_AARCH64)
2079
+ r_.neon_i8 = vzip1_s8(a_.neon_i8, b_.neon_i8);
2080
+ #elif defined(SIMDE__SHUFFLE_VECTOR)
2081
+ r_.i8 = SIMDE__SHUFFLE_VECTOR(8, 8, a_.i8, b_.i8, 0, 8, 1, 9, 2, 10, 3, 11);
2082
+ #else
2083
+ r_.i8[0] = a_.i8[0];
2084
+ r_.i8[1] = b_.i8[0];
2085
+ r_.i8[2] = a_.i8[1];
2086
+ r_.i8[3] = b_.i8[1];
2087
+ r_.i8[4] = a_.i8[2];
2088
+ r_.i8[5] = b_.i8[2];
2089
+ r_.i8[6] = a_.i8[3];
2090
+ r_.i8[7] = b_.i8[3];
2091
+ #endif
2092
+
2093
+ return simde__m64_from_private(r_);
2094
+ #endif
2095
+ }
2096
+ #define simde_m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b)
2097
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
2098
+ # define _mm_unpacklo_pi8(a, b) simde_mm_unpacklo_pi8(a, b)
2099
+ # define _m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b)
2100
+ #endif
2101
+
2102
+ SIMDE__FUNCTION_ATTRIBUTES
2103
+ simde__m64
2104
+ simde_mm_unpacklo_pi16 (simde__m64 a, simde__m64 b) {
2105
+ #if defined(SIMDE_MMX_NATIVE)
2106
+ return _mm_unpacklo_pi16(a, b);
2107
+ #else
2108
+ simde__m64_private r_;
2109
+ simde__m64_private a_ = simde__m64_to_private(a);
2110
+ simde__m64_private b_ = simde__m64_to_private(b);
2111
+
2112
+ #if defined(SIMDE_MMX_NEON) && defined(SIMDE_ARCH_AARCH64)
2113
+ r_.neon_i16 = vzip1_s16(a_.neon_i16, b_.neon_i16);
2114
+ #elif defined(SIMDE__SHUFFLE_VECTOR)
2115
+ r_.i16 = SIMDE__SHUFFLE_VECTOR(16, 8, a_.i16, b_.i16, 0, 4, 1, 5);
2116
+ #else
2117
+ r_.i16[0] = a_.i16[0];
2118
+ r_.i16[1] = b_.i16[0];
2119
+ r_.i16[2] = a_.i16[1];
2120
+ r_.i16[3] = b_.i16[1];
2121
+ #endif
2122
+
2123
+ return simde__m64_from_private(r_);
2124
+ #endif
2125
+ }
2126
+ #define simde_m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b)
2127
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
2128
+ # define _mm_unpacklo_pi16(a, b) simde_mm_unpacklo_pi16(a, b)
2129
+ # define _m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b)
2130
+ #endif
2131
+
2132
+ SIMDE__FUNCTION_ATTRIBUTES
2133
+ simde__m64
2134
+ simde_mm_unpacklo_pi32 (simde__m64 a, simde__m64 b) {
2135
+ #if defined(SIMDE_MMX_NATIVE)
2136
+ return _mm_unpacklo_pi32(a, b);
2137
+ #else
2138
+ simde__m64_private r_;
2139
+ simde__m64_private a_ = simde__m64_to_private(a);
2140
+ simde__m64_private b_ = simde__m64_to_private(b);
2141
+
2142
+ #if defined(SIMDE_MMX_NEON) && defined(SIMDE_ARCH_AARCH64)
2143
+ r_.neon_i32 = vzip1_s32(a_.neon_i32, b_.neon_i32);
2144
+ #elif defined(SIMDE__SHUFFLE_VECTOR)
2145
+ r_.i32 = SIMDE__SHUFFLE_VECTOR(32, 8, a_.i32, b_.i32, 0, 2);
2146
+ #else
2147
+ r_.i32[0] = a_.i32[0];
2148
+ r_.i32[1] = b_.i32[0];
2149
+ #endif
2150
+
2151
+ return simde__m64_from_private(r_);
2152
+ #endif
2153
+ }
2154
+ #define simde_m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b)
2155
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
2156
+ # define _mm_unpacklo_pi32(a, b) simde_mm_unpacklo_pi32(a, b)
2157
+ # define _m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b)
2158
+ #endif
2159
+
2160
+ SIMDE__FUNCTION_ATTRIBUTES
2161
+ simde__m64
2162
+ simde_mm_xor_si64 (simde__m64 a, simde__m64 b) {
2163
+ #if defined(SIMDE_MMX_NATIVE)
2164
+ return _mm_xor_si64(a, b);
2165
+ #else
2166
+ simde__m64_private r_;
2167
+ simde__m64_private a_ = simde__m64_to_private(a);
2168
+ simde__m64_private b_ = simde__m64_to_private(b);
2169
+
2170
+ #if defined(SIMDE_MMX_NEON)
2171
+ r_.neon_i32 = veor_s32(a_.neon_i32, b_.neon_i32);
2172
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2173
+ r_.i32f = a_.i32f ^ b_.i32f;
2174
+ #else
2175
+ r_.u64[0] = a_.u64[0] ^ b_.u64[0];
2176
+ #endif
2177
+
2178
+ return simde__m64_from_private(r_);
2179
+ #endif
2180
+ }
2181
+ #define simde_m_pxor(a, b) simde_mm_xor_si64(a, b)
2182
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
2183
+ # define _mm_xor_si64(a, b) simde_mm_xor_si64(a, b)
2184
+ # define _m_pxor(a, b) simde_mm_xor_si64(a, b)
2185
+ #endif
2186
+
2187
+ SIMDE__FUNCTION_ATTRIBUTES
2188
+ int32_t
2189
+ simde_m_to_int (simde__m64 a) {
2190
+ #if defined(SIMDE_MMX_NATIVE)
2191
+ return _m_to_int(a);
2192
+ #else
2193
+ simde__m64_private a_ = simde__m64_to_private(a);
2194
+
2195
+ #if defined(SIMDE_MMX_NEON)
2196
+ return vget_lane_s32(a_.neon_i32, 0);
2197
+ #else
2198
+ return a_.i32[0];
2199
+ #endif
2200
+ #endif
2201
+ }
2202
+ #if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES)
2203
+ # define _m_to_int(a) simde_m_to_int(a)
2204
+ #endif
2205
+
2206
+ SIMDE__END_DECLS
2207
+
2208
+ HEDLEY_DIAGNOSTIC_POP
2209
+
2210
+ #endif /* !defined(SIMDE__MMX_H) */
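
As a rough usage sketch (not part of the diff above): the portable shims defined in this header can be called directly from plain C. The include path "simde/x86/mmx.h" and a C99 toolchain are assumptions here, and the program only uses helpers that appear in the header itself (simde__m64_private, simde__m64_from_private, simde_mm_srli_pi16, simde_m_to_int).

  #include <stdio.h>
  #include "simde/x86/mmx.h"   /* assumed location of the header shown above */

  int main(void) {
    /* Build a simde__m64 through the private union helpers used throughout the header. */
    simde__m64_private v_;
    v_.u16[0] = 0x00f0; v_.u16[1] = 0x0f00; v_.u16[2] = 0x1234; v_.u16[3] = 0x8000;
    simde__m64 v = simde__m64_from_private(v_);

    /* Logical right shift of every 16-bit lane by 4; this resolves to native MMX,
     * NEON, or the scalar fallback loop depending on what the target supports. */
    simde__m64 r = simde_mm_srli_pi16(v, 4);

    /* Read the low 32 bits back out with simde_m_to_int. */
    printf("low 32 bits after shift: 0x%08x\n", (unsigned int) simde_m_to_int(r));
    return 0;
  }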