minimap2 0.2.24.3 → 0.2.24.6

Files changed (101)
  1. checksums.yaml +4 -4
  2. data/ext/minimap2/lib/simde/CONTRIBUTING.md +114 -0
  3. data/ext/minimap2/lib/simde/COPYING +20 -0
  4. data/ext/minimap2/lib/simde/README.md +333 -0
  5. data/ext/minimap2/lib/simde/amalgamate.py +58 -0
  6. data/ext/minimap2/lib/simde/meson.build +33 -0
  7. data/ext/minimap2/lib/simde/netlify.toml +20 -0
  8. data/ext/minimap2/lib/simde/simde/arm/neon/float32x2.h +140 -0
  9. data/ext/minimap2/lib/simde/simde/arm/neon/float32x4.h +137 -0
  10. data/ext/minimap2/lib/simde/simde/arm/neon/float64x1.h +142 -0
  11. data/ext/minimap2/lib/simde/simde/arm/neon/float64x2.h +145 -0
  12. data/ext/minimap2/lib/simde/simde/arm/neon/int16x4.h +140 -0
  13. data/ext/minimap2/lib/simde/simde/arm/neon/int16x8.h +145 -0
  14. data/ext/minimap2/lib/simde/simde/arm/neon/int32x2.h +140 -0
  15. data/ext/minimap2/lib/simde/simde/arm/neon/int32x4.h +143 -0
  16. data/ext/minimap2/lib/simde/simde/arm/neon/int64x1.h +137 -0
  17. data/ext/minimap2/lib/simde/simde/arm/neon/int64x2.h +141 -0
  18. data/ext/minimap2/lib/simde/simde/arm/neon/int8x16.h +147 -0
  19. data/ext/minimap2/lib/simde/simde/arm/neon/int8x8.h +141 -0
  20. data/ext/minimap2/lib/simde/simde/arm/neon/uint16x4.h +134 -0
  21. data/ext/minimap2/lib/simde/simde/arm/neon/uint16x8.h +138 -0
  22. data/ext/minimap2/lib/simde/simde/arm/neon/uint32x2.h +134 -0
  23. data/ext/minimap2/lib/simde/simde/arm/neon/uint32x4.h +137 -0
  24. data/ext/minimap2/lib/simde/simde/arm/neon/uint64x1.h +131 -0
  25. data/ext/minimap2/lib/simde/simde/arm/neon/uint64x2.h +135 -0
  26. data/ext/minimap2/lib/simde/simde/arm/neon/uint8x16.h +141 -0
  27. data/ext/minimap2/lib/simde/simde/arm/neon/uint8x8.h +135 -0
  28. data/ext/minimap2/lib/simde/simde/arm/neon.h +97 -0
  29. data/ext/minimap2/lib/simde/simde/check.h +267 -0
  30. data/ext/minimap2/lib/simde/simde/debug-trap.h +83 -0
  31. data/ext/minimap2/lib/simde/simde/hedley.h +1899 -0
  32. data/ext/minimap2/lib/simde/simde/simde-arch.h +445 -0
  33. data/ext/minimap2/lib/simde/simde/simde-common.h +697 -0
  34. data/ext/minimap2/lib/simde/simde/x86/avx.h +5385 -0
  35. data/ext/minimap2/lib/simde/simde/x86/avx2.h +2402 -0
  36. data/ext/minimap2/lib/simde/simde/x86/avx512bw.h +391 -0
  37. data/ext/minimap2/lib/simde/simde/x86/avx512f.h +3389 -0
  38. data/ext/minimap2/lib/simde/simde/x86/avx512vl.h +112 -0
  39. data/ext/minimap2/lib/simde/simde/x86/fma.h +659 -0
  40. data/ext/minimap2/lib/simde/simde/x86/mmx.h +2210 -0
  41. data/ext/minimap2/lib/simde/simde/x86/sse.h +3696 -0
  42. data/ext/minimap2/lib/simde/simde/x86/sse2.h +5991 -0
  43. data/ext/minimap2/lib/simde/simde/x86/sse3.h +343 -0
  44. data/ext/minimap2/lib/simde/simde/x86/sse4.1.h +1783 -0
  45. data/ext/minimap2/lib/simde/simde/x86/sse4.2.h +105 -0
  46. data/ext/minimap2/lib/simde/simde/x86/ssse3.h +1053 -0
  47. data/ext/minimap2/lib/simde/simde/x86/svml.h +543 -0
  48. data/ext/minimap2/lib/simde/test/CMakeLists.txt +166 -0
  49. data/ext/minimap2/lib/simde/test/arm/meson.build +4 -0
  50. data/ext/minimap2/lib/simde/test/arm/neon/meson.build +23 -0
  51. data/ext/minimap2/lib/simde/test/arm/neon/skel.c +871 -0
  52. data/ext/minimap2/lib/simde/test/arm/neon/test-neon-internal.h +134 -0
  53. data/ext/minimap2/lib/simde/test/arm/neon/test-neon.c +39 -0
  54. data/ext/minimap2/lib/simde/test/arm/neon/test-neon.h +10 -0
  55. data/ext/minimap2/lib/simde/test/arm/neon/vadd.c +1260 -0
  56. data/ext/minimap2/lib/simde/test/arm/neon/vdup_n.c +873 -0
  57. data/ext/minimap2/lib/simde/test/arm/neon/vmul.c +1084 -0
  58. data/ext/minimap2/lib/simde/test/arm/neon/vsub.c +1260 -0
  59. data/ext/minimap2/lib/simde/test/arm/test-arm-internal.h +18 -0
  60. data/ext/minimap2/lib/simde/test/arm/test-arm.c +20 -0
  61. data/ext/minimap2/lib/simde/test/arm/test-arm.h +8 -0
  62. data/ext/minimap2/lib/simde/test/cmake/AddCompilerFlags.cmake +171 -0
  63. data/ext/minimap2/lib/simde/test/cmake/ExtraWarningFlags.cmake +68 -0
  64. data/ext/minimap2/lib/simde/test/meson.build +64 -0
  65. data/ext/minimap2/lib/simde/test/munit/COPYING +21 -0
  66. data/ext/minimap2/lib/simde/test/munit/Makefile +55 -0
  67. data/ext/minimap2/lib/simde/test/munit/README.md +54 -0
  68. data/ext/minimap2/lib/simde/test/munit/example.c +351 -0
  69. data/ext/minimap2/lib/simde/test/munit/meson.build +37 -0
  70. data/ext/minimap2/lib/simde/test/munit/munit.c +2055 -0
  71. data/ext/minimap2/lib/simde/test/munit/munit.h +535 -0
  72. data/ext/minimap2/lib/simde/test/run-tests.c +20 -0
  73. data/ext/minimap2/lib/simde/test/run-tests.h +260 -0
  74. data/ext/minimap2/lib/simde/test/x86/avx.c +13752 -0
  75. data/ext/minimap2/lib/simde/test/x86/avx2.c +9977 -0
  76. data/ext/minimap2/lib/simde/test/x86/avx512bw.c +2664 -0
  77. data/ext/minimap2/lib/simde/test/x86/avx512f.c +10416 -0
  78. data/ext/minimap2/lib/simde/test/x86/avx512vl.c +210 -0
  79. data/ext/minimap2/lib/simde/test/x86/fma.c +2557 -0
  80. data/ext/minimap2/lib/simde/test/x86/meson.build +33 -0
  81. data/ext/minimap2/lib/simde/test/x86/mmx.c +2878 -0
  82. data/ext/minimap2/lib/simde/test/x86/skel.c +2984 -0
  83. data/ext/minimap2/lib/simde/test/x86/sse.c +5121 -0
  84. data/ext/minimap2/lib/simde/test/x86/sse2.c +9860 -0
  85. data/ext/minimap2/lib/simde/test/x86/sse3.c +486 -0
  86. data/ext/minimap2/lib/simde/test/x86/sse4.1.c +3446 -0
  87. data/ext/minimap2/lib/simde/test/x86/sse4.2.c +101 -0
  88. data/ext/minimap2/lib/simde/test/x86/ssse3.c +2084 -0
  89. data/ext/minimap2/lib/simde/test/x86/svml.c +1545 -0
  90. data/ext/minimap2/lib/simde/test/x86/test-avx.h +16 -0
  91. data/ext/minimap2/lib/simde/test/x86/test-avx512.h +25 -0
  92. data/ext/minimap2/lib/simde/test/x86/test-mmx.h +13 -0
  93. data/ext/minimap2/lib/simde/test/x86/test-sse.h +13 -0
  94. data/ext/minimap2/lib/simde/test/x86/test-sse2.h +13 -0
  95. data/ext/minimap2/lib/simde/test/x86/test-x86-internal.h +196 -0
  96. data/ext/minimap2/lib/simde/test/x86/test-x86.c +48 -0
  97. data/ext/minimap2/lib/simde/test/x86/test-x86.h +8 -0
  98. data/lib/minimap2/aligner.rb +2 -2
  99. data/lib/minimap2/ffi/constants.rb +3 -0
  100. data/lib/minimap2/version.rb +1 -1
  101. metadata +99 -3
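The bulk of this release vendors the SIMDe (SIMD Everywhere) headers under data/ext/minimap2/lib/simde, which map Intel SSE/SSSE3/AVX intrinsics onto native, NEON, or plain-C implementations so the bundled minimap2 C sources can build on non-x86 targets; the Ruby-side changes (aligner.rb, ffi/constants.rb, version.rb) are small. The single hunk reproduced below (1,053 added lines) appears to be data/ext/minimap2/lib/simde/simde/x86/ssse3.h. A minimal consumption sketch, not part of this diff, of how such a header is typically used:

    /* Hypothetical example: include the SIMDe header instead of <tmmintrin.h>.
       simde_mm_* calls compile to native SSSE3, NEON, or scalar C depending on
       the target; defining SIMDE_ENABLE_NATIVE_ALIASES first also restores the
       plain _mm_* names when no native SSSE3 is available. */
    #define SIMDE_ENABLE_NATIVE_ALIASES
    #include "simde/x86/ssse3.h"

    static simde__m128i abs16(simde__m128i v) {
      return simde_mm_abs_epi16(v);   /* lane-wise |v| over 8 x int16 */
    }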
@@ -0,0 +1,1053 @@
+ /* Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2017-2020 Evan Nemerson <evan@nemerson.com>
+ */
+
+ #if !defined(SIMDE__SSSE3_H)
+ # if !defined(SIMDE__SSSE3_H)
+ # define SIMDE__SSSE3_H
+ # endif
+ # include "sse3.h"
+
+ HEDLEY_DIAGNOSTIC_PUSH
+ SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+
+ # if defined(SIMDE_SSSE3_NATIVE)
+ # undef SIMDE_SSSE3_NATIVE
+ # endif
+ # if defined(SIMDE_ARCH_X86_SSSE3) && !defined(SIMDE_SSSE3_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
+ # define SIMDE_SSSE3_NATIVE
+ # elif defined(SIMDE_ARCH_ARM_NEON) && !defined(SIMDE_SSSE3_NO_NEON) && !defined(SIMDE_NO_NEON)
+ # define SIMDE_SSSE3_NEON
+ # elif defined(SIMDE_ARCH_POWER_ALTIVEC)
+ # define SIMDE_SSSE3_POWER_ALTIVEC
+ # endif
+
+ # if defined(SIMDE_SSSE3_NATIVE) && !defined(SIMDE_SSE3_NATIVE)
+ # if defined(SIMDE_SSSE3_FORCE_NATIVE)
+ # error Native SSSE3 support requires native SSE3 support
+ # else
+ HEDLEY_WARNING("Native SSSE3 support requires native SSE3 support, disabling")
+ # undef SIMDE_SSSE3_NATIVE
+ # endif
+ # elif defined(SIMDE_SSSE3_NEON) && !defined(SIMDE_SSE3_NEON)
+ HEDLEY_WARNING("SSSE3 NEON support requires SSE3 NEON support, disabling")
+ # undef SIMDE_SSSE3_NEON
+ # endif
+
+ # if defined(SIMDE_SSSE3_NATIVE)
+ # include <tmmintrin.h>
+ # else
+ # if defined(SIMDE_SSSE3_NEON)
+ # include <arm_neon.h>
+ # endif
+ # endif
+
+ #if !defined(SIMDE_SSSE3_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
+ # define SIMDE_SSSE3_ENABLE_NATIVE_ALIASES
+ #endif
+
+ SIMDE__BEGIN_DECLS
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_abs_epi8 (simde__m128i a) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_abs_epi8(a);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a);
+
+ #if defined(SIMDE_SSSE3_NEON)
+ r_.neon_i8 = vabsq_s8(a_.neon_i8);
+ #elif defined(SIMDE_SSSE3_POWER_ALTIVEC)
+ r_.altivec_i8 = vec_abs(a_.altivec_i8);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
+ r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, (a_.i8[i] < 0) ? (- a_.i8[i]) : a_.i8[i]);
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_abs_epi8(a) simde_mm_abs_epi8(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_abs_epi16 (simde__m128i a) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_abs_epi16(a);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a);
+
+ #if defined(SIMDE_SSSE3_NEON)
+ r_.neon_i16 = vabsq_s16(a_.neon_i16);
+ #elif defined(SIMDE_SSSE3_POWER_ALTIVEC)
+ r_.altivec_i16 = vec_abs(a_.altivec_i16);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
+ r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a_.i16[i] < 0) ? (- a_.i16[i]) : a_.i16[i]);
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_abs_epi16(a) simde_mm_abs_epi16(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_abs_epi32 (simde__m128i a) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_abs_epi32(a);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a);
+
+ #if defined(SIMDE_SSSE3_NEON)
+ r_.neon_i32 = vabsq_s32(a_.neon_i32);
+ #elif defined(SIMDE_SSSE3_POWER_ALTIVEC)
+ r_.altivec_i32 = vec_abs(a_.altivec_i32);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
+ #if defined(_MSC_VER)
+ HEDLEY_DIAGNOSTIC_PUSH
+ #pragma warning(disable:4146)
+ #endif
+ r_.u32[i] = (a_.i32[i] < 0) ? (- HEDLEY_STATIC_CAST(uint32_t, a_.i32[i])) : HEDLEY_STATIC_CAST(uint32_t, a_.i32[i]);
+ #if defined(_MSC_VER)
+ HEDLEY_DIAGNOSTIC_POP
+ #endif
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_abs_epi32(a) simde_mm_abs_epi32(a)
+ #endif
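    /* Worked note on the 32-bit fallback above (illustrative, not part of the
       header): for a_.i32[i] == INT32_MIN there is no positive counterpart, and
       negating it as a signed int32_t would overflow. The code therefore negates
       after converting to uint32_t, where -(uint32_t)0x80000000 wraps back to
       0x80000000, the same result _mm_abs_epi32 produces; the _MSC_VER pragma
       only silences warning C4146 about negating an unsigned value. */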
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m64
+ simde_mm_abs_pi8 (simde__m64 a) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_abs_pi8(a);
+ #else
+ simde__m64_private
+ r_,
+ a_ = simde__m64_to_private(a);
+
+ #if defined(SIMDE_SSSE3_NEON)
+ r_.neon_i8 = vabs_s8(a_.neon_i8);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
+ r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, (a_.i8[i] < 0) ? (- a_.i8[i]) : a_.i8[i]);
+ }
+ #endif
+
+ return simde__m64_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_abs_pi8(a) simde_mm_abs_pi8(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m64
+ simde_mm_abs_pi16 (simde__m64 a) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_abs_pi16(a);
+ #else
+ simde__m64_private
+ r_,
+ a_ = simde__m64_to_private(a);
+
+ #if defined(SIMDE_SSSE3_NEON)
+ r_.neon_i16 = vabs_s16(a_.neon_i16);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
+ r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a_.i16[i] < 0) ? (- a_.i16[i]) : a_.i16[i]);
+ }
+ #endif
+
+ return simde__m64_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_abs_pi16(a) simde_mm_abs_pi16(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m64
+ simde_mm_abs_pi32 (simde__m64 a) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_abs_pi32(a);
+ #else
+ simde__m64_private
+ r_,
+ a_ = simde__m64_to_private(a);
+
+ #if defined(SIMDE_SSSE3_NEON)
+ r_.neon_i32 = vabs_s32(a_.neon_i32);
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
+ r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (a_.i32[i] < 0) ? (- a_.i32[i]) : a_.i32[i]);
+ }
+ #endif
+
+ return simde__m64_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_abs_pi32(a) simde_mm_abs_pi32(a)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_alignr_epi8 (simde__m128i a, simde__m128i b, int count) {
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ #if 0 && defined(SIMDE_BYTE_ORDER_LE)
+ const int bits = (8 * count) % 64;
+ const int eo = count / 8;
+
+ switch (eo) {
+ case 0:
+ r_.u64[0] = b_.u64[0] >> bits;
+ r_.u64[0] |= b_.u64[1] << (64 - bits);
+ r_.u64[1] = b_.u64[1] >> bits;
+ r_.u64[1] |= a_.u64[0] << (64 - bits);
+ break;
+ case 1:
+ r_.u64[0] = b_.u64[1] >> bits;
+ r_.u64[0] |= a_.u64[0] << (64 - bits);
+ r_.u64[1] = a_.u64[0] >> bits;
+ r_.u64[1] |= a_.u64[1] << (64 - bits);
+ break;
+ case 2:
+ r_.u64[0] = a_.u64[0] >> bits;
+ r_.u64[0] |= a_.u64[1] << (64 - bits);
+ r_.u64[1] = a_.u64[1] >> bits;
+ break;
+ case 3:
+ r_.u64[0] = a_.u64[1] >> bits;
+ r_.u64[1] = 0;
+ break;
+ default:
+ HEDLEY_UNREACHABLE();
+ break;
+ }
+ #else
+ if (HEDLEY_UNLIKELY(count > 31))
+ return simde_mm_setzero_si128();
+
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
+ const int srcpos = count + HEDLEY_STATIC_CAST(int, i);
+ if (srcpos > 31) {
+ r_.i8[i] = 0;
+ } else if (srcpos > 15) {
+ r_.i8[i] = a_.i8[(srcpos) & 15];
+ } else {
+ r_.i8[i] = b_.i8[srcpos];
+ }
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ }
+ #if defined(SIMDE_SSSE3_NATIVE)
+ # define simde_mm_alignr_epi8(a, b, count) _mm_alignr_epi8(a, b, count)
+ #endif
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_alignr_epi8(a, b, count) simde_mm_alignr_epi8(a, b, count)
+ #endif
+
+ #if defined(simde_mm_alignr_pi8)
+ # undef simde_mm_alignr_pi8
+ #endif
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m64
+ simde_mm_alignr_pi8 (simde__m64 a, simde__m64 b, const int count) {
+ simde__m64_private
+ r_,
+ a_ = simde__m64_to_private(a),
+ b_ = simde__m64_to_private(b);
+
+ if (HEDLEY_UNLIKELY(count > 15))
+ return simde_mm_setzero_si64();
+
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
+ const int srcpos = count + HEDLEY_STATIC_CAST(int, i);
+ if (srcpos > 15) {
+ r_.i8[i] = 0;
+ } else if (srcpos > 7) {
+ r_.i8[i] = a_.i8[(srcpos) & 7];
+ } else {
+ r_.i8[i] = b_.i8[srcpos];
+ }
+ }
+
+ return simde__m64_from_private(r_);
+ }
+ #if defined(SIMDE_SSSE3_NATIVE)
+ # define simde_mm_alignr_pi8(a, b, count) _mm_alignr_pi8(a, b, count)
+ #endif
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_alignr_pi8(a, b, count) simde_mm_alignr_pi8(a, b, count)
+ #endif
+
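    /* Illustrative recap of the scalar fallback above: _mm_alignr_epi8(a, b, count)
       behaves as if a:b were one 32-byte buffer (b in the low 16 bytes) shifted
       right by `count` bytes. For count = 4:
         r[0..11]  = b[4..15]
         r[12..15] = a[0..3]
       and any count above 31 yields all zeros. */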
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_shuffle_epi8 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_shuffle_epi8(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ #if defined(SIMDE_SSSE3_NEON)
+ /* Mask out the bits we're not interested in. vtbl will result in 0
+ for any values outside of [0, 15], so if the high bit is set it
+ will return 0, just like in SSSE3. */
+ b_.neon_i8 = vandq_s8(b_.neon_i8, vdupq_n_s8((int8_t)((1 << 7) | 15)));
+
+ /* Convert a from an int8x16_t to an int8x8x2_t */
+ int8x8x2_t i = { .val = { vget_low_s8(a_.neon_i8), vget_high_s8(a_.neon_i8) } };
+
+ /* Table lookups */
+ int8x8_t l = vtbl2_s8(i, vget_low_s8(b_.neon_i8));
+ int8x8_t h = vtbl2_s8(i, vget_high_s8(b_.neon_i8));
+
+ r_.neon_i8 = vcombine_s8(l, h);
+ #else
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
+ r_.i8[i] = a_.i8[b_.i8[i] & 15] & (~(b_.i8[i]) >> 7);
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_shuffle_epi8(a, b) simde_mm_shuffle_epi8(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m64
+ simde_mm_shuffle_pi8 (simde__m64 a, simde__m64 b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_shuffle_pi8(a, b);
+ #else
+ simde__m64_private
+ r_,
+ a_ = simde__m64_to_private(a),
+ b_ = simde__m64_to_private(b);
+
+ #if defined(SIMDE_SSSE3_NEON)
+ b_.neon_i8 = vand_s8(b_.neon_i8, vdup_n_s8((int8_t)((1 << 7) | 7)));
+ r_.neon_i8 = vtbl1_s8(a_.neon_i8, b_.neon_i8);
+ #else
+ for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
+ r_.i8[i] = a_.i8[b_.i8[i] & 7] & (~(b_.i8[i]) >> 7);
+ }
+ #endif
+
+ return simde__m64_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_shuffle_pi8(a, b) simde_mm_shuffle_pi8(a, b)
+ #endif
+
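    /* Illustrative recap of the pshufb semantics implemented above: output byte i
       is a[b[i] & 15], except that a set high bit in b[i] forces the byte to zero
       (that is what `& (~b[i] >> 7)` does in the scalar loop). For example,
       b[i] = 0x03 copies a[3], while b[i] = 0x83 writes 0. */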
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_hadd_epi16 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_hadd_epi16(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ #if defined(SIMDE_SSSE3_NEON) && defined(SIMDE_ARCH_AARCH64)
+ r_.neon_i16 = vaddq_s16(vuzp1q_s16(a_.neon_i16, b_.neon_i16), vuzp2q_s16(a_.neon_i16, b_.neon_i16));
+ #elif defined(SIMDE_ASSUME_VECTORIZATION) && defined(SIMDE__SHUFFLE_VECTOR)
+ r_.i16 =
+ SIMDE__SHUFFLE_VECTOR(16, 16, a_.i16, b_.i16, 0, 2, 4, 6, 8, 10, 12, 14) +
+ SIMDE__SHUFFLE_VECTOR(16, 16, a_.i16, b_.i16, 1, 3, 5, 7, 9, 11, 13, 15);
+ #else
+ r_.i16[0] = a_.i16[0] + a_.i16[1];
+ r_.i16[1] = a_.i16[2] + a_.i16[3];
+ r_.i16[2] = a_.i16[4] + a_.i16[5];
+ r_.i16[3] = a_.i16[6] + a_.i16[7];
+ r_.i16[4] = b_.i16[0] + b_.i16[1];
+ r_.i16[5] = b_.i16[2] + b_.i16[3];
+ r_.i16[6] = b_.i16[4] + b_.i16[5];
+ r_.i16[7] = b_.i16[6] + b_.i16[7];
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_hadd_epi16(a, b) simde_mm_hadd_epi16(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_hadd_epi32 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_hadd_epi32(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ #if defined(SIMDE_SSSE3_NEON) && defined(SIMDE_ARCH_AARCH64)
+ r_.neon_i32 = vaddq_s32(vuzp1q_s32(a_.neon_i32, b_.neon_i32), vuzp2q_s32(a_.neon_i32, b_.neon_i32));
+ #elif defined(SIMDE_ASSUME_VECTORIZATION) && defined(SIMDE__SHUFFLE_VECTOR)
+ r_.i32 =
+ SIMDE__SHUFFLE_VECTOR(32, 16, a_.i32, b_.i32, 0, 2, 4, 6) +
+ SIMDE__SHUFFLE_VECTOR(32, 16, a_.i32, b_.i32, 1, 3, 5, 7);
+ #else
+ r_.i32[0] = a_.i32[0] + a_.i32[1];
+ r_.i32[1] = a_.i32[2] + a_.i32[3];
+ r_.i32[2] = b_.i32[0] + b_.i32[1];
+ r_.i32[3] = b_.i32[2] + b_.i32[3];
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_hadd_epi32(a, b) simde_mm_hadd_epi32(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m64
+ simde_mm_hadd_pi16 (simde__m64 a, simde__m64 b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_hadd_pi16(a, b);
+ #else
+ simde__m64_private
+ r_,
+ a_ = simde__m64_to_private(a),
+ b_ = simde__m64_to_private(b);
+
+ #if defined(SIMDE_SSSE3_NEON) && defined(SIMDE_ARCH_AARCH64)
+ r_.neon_i16 = vadd_s16(vuzp1_s16(a_.neon_i16, b_.neon_i16), vuzp2_s16(a_.neon_i16, b_.neon_i16));
+ #elif defined(SIMDE_ASSUME_VECTORIZATION) && defined(SIMDE__SHUFFLE_VECTOR)
+ r_.i16 =
+ SIMDE__SHUFFLE_VECTOR(16, 8, a_.i16, b_.i16, 0, 2, 4, 6) +
+ SIMDE__SHUFFLE_VECTOR(16, 8, a_.i16, b_.i16, 1, 3, 5, 7);
+ #else
+ r_.i16[0] = a_.i16[0] + a_.i16[1];
+ r_.i16[1] = a_.i16[2] + a_.i16[3];
+ r_.i16[2] = b_.i16[0] + b_.i16[1];
+ r_.i16[3] = b_.i16[2] + b_.i16[3];
+ #endif
+
+ return simde__m64_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_hadd_pi16(a, b) simde_mm_hadd_pi16(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m64
+ simde_mm_hadd_pi32 (simde__m64 a, simde__m64 b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_hadd_pi32(a, b);
+ #else
+ simde__m64_private
+ r_,
+ a_ = simde__m64_to_private(a),
+ b_ = simde__m64_to_private(b);
+
+ #if defined(SIMDE_SSSE3_NEON) && defined(SIMDE_ARCH_AARCH64)
+ r_.neon_i32 = vadd_s32(vuzp1_s32(a_.neon_i32, b_.neon_i32), vuzp2_s32(a_.neon_i32, b_.neon_i32));
+ #elif defined(SIMDE_ASSUME_VECTORIZATION) && defined(SIMDE__SHUFFLE_VECTOR)
+ r_.i32 =
+ SIMDE__SHUFFLE_VECTOR(32, 8, a_.i32, b_.i32, 0, 2) +
+ SIMDE__SHUFFLE_VECTOR(32, 8, a_.i32, b_.i32, 1, 3);
+ #else
+ r_.i32[0] = a_.i32[0] + a_.i32[1];
+ r_.i32[1] = b_.i32[0] + b_.i32[1];
+ #endif
+
+ return simde__m64_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_hadd_pi32(a, b) simde_mm_hadd_pi32(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_hadds_epi16 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_hadds_epi16(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ #if defined(SIMDE_SSSE3_NEON) && defined(SIMDE_ARCH_AARCH64)
+ r_.neon_i16 = vqaddq_s16(vuzp1q_s16(a_.neon_i16, b_.neon_i16), vuzp2q_s16(a_.neon_i16, b_.neon_i16));
+ #else
+ for (size_t i = 0 ; i < ((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2) ; i++) {
+ int32_t ta = HEDLEY_STATIC_CAST(int32_t, a_.i16[i * 2]) + HEDLEY_STATIC_CAST(int32_t, a_.i16[(i * 2) + 1]);
+ r_.i16[ i ] = HEDLEY_LIKELY(ta > INT16_MIN) ? (HEDLEY_LIKELY(ta < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ta) : INT16_MAX) : INT16_MIN;
+ int32_t tb = HEDLEY_STATIC_CAST(int32_t, b_.i16[i * 2]) + HEDLEY_STATIC_CAST(int32_t, b_.i16[(i * 2) + 1]);
+ r_.i16[i + 4] = HEDLEY_LIKELY(tb > INT16_MIN) ? (HEDLEY_LIKELY(tb < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, tb) : INT16_MAX) : INT16_MIN;
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_hadds_epi16(a, b) simde_mm_hadds_epi16(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m64
+ simde_mm_hadds_pi16 (simde__m64 a, simde__m64 b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_hadds_pi16(a, b);
+ #else
+ simde__m64_private
+ r_,
+ a_ = simde__m64_to_private(a),
+ b_ = simde__m64_to_private(b);
+
+ #if defined(SIMDE_SSSE3_NEON) && defined(SIMDE_ARCH_AARCH64)
+ r_.neon_i16 = vqadd_s16(vuzp1_s16(a_.neon_i16, b_.neon_i16), vuzp2_s16(a_.neon_i16, b_.neon_i16));
+ #else
+ for (size_t i = 0 ; i < ((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2) ; i++) {
+ int32_t ta = HEDLEY_STATIC_CAST(int32_t, a_.i16[i * 2]) + HEDLEY_STATIC_CAST(int32_t, a_.i16[(i * 2) + 1]);
+ r_.i16[ i ] = HEDLEY_LIKELY(ta > INT16_MIN) ? (HEDLEY_LIKELY(ta < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ta) : INT16_MAX) : INT16_MIN;
+ int32_t tb = HEDLEY_STATIC_CAST(int32_t, b_.i16[i * 2]) + HEDLEY_STATIC_CAST(int32_t, b_.i16[(i * 2) + 1]);
+ r_.i16[i + 2] = HEDLEY_LIKELY(tb > INT16_MIN) ? (HEDLEY_LIKELY(tb < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, tb) : INT16_MAX) : INT16_MIN;
+ }
+ #endif
+
+ return simde__m64_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_hadds_pi16(a, b) simde_mm_hadds_pi16(a, b)
+ #endif
+
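    /* Worked example of the saturating horizontal adds above: pairs are summed in
       32-bit and clamped to int16_t, so a_.i16[0] = 30000 and a_.i16[1] = 5000 give
       ta = 35000, stored as INT16_MAX (32767); -30000 + -5000 clamps to INT16_MIN
       (-32768). The non-saturating _mm_hadd_* variants simply wrap. */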
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_hsub_epi16 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_hsub_epi16(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ #if defined(SIMDE_SSSE3_NEON) && defined(SIMDE_ARCH_AARCH64)
+ r_.neon_i16 = vsubq_s16(vuzp1q_s16(a_.neon_i16, b_.neon_i16), vuzp2q_s16(a_.neon_i16, b_.neon_i16));
+ #elif defined(SIMDE_ASSUME_VECTORIZATION) && defined(SIMDE__SHUFFLE_VECTOR)
+ r_.i16 =
+ SIMDE__SHUFFLE_VECTOR(16, 16, a_.i16, b_.i16, 0, 2, 4, 6, 8, 10, 12, 14) -
+ SIMDE__SHUFFLE_VECTOR(16, 16, a_.i16, b_.i16, 1, 3, 5, 7, 9, 11, 13, 15);
+ #else
+ r_.i16[0] = a_.i16[0] - a_.i16[1];
+ r_.i16[1] = a_.i16[2] - a_.i16[3];
+ r_.i16[2] = a_.i16[4] - a_.i16[5];
+ r_.i16[3] = a_.i16[6] - a_.i16[7];
+ r_.i16[4] = b_.i16[0] - b_.i16[1];
+ r_.i16[5] = b_.i16[2] - b_.i16[3];
+ r_.i16[6] = b_.i16[4] - b_.i16[5];
+ r_.i16[7] = b_.i16[6] - b_.i16[7];
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_hsub_epi16(a, b) simde_mm_hsub_epi16(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_hsub_epi32 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_hsub_epi32(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ #if defined(SIMDE_SSSE3_NEON) && defined(SIMDE_ARCH_AARCH64)
+ r_.neon_i32 = vsubq_s32(vuzp1q_s32(a_.neon_i32, b_.neon_i32), vuzp2q_s32(a_.neon_i32, b_.neon_i32));
+ #elif defined(SIMDE_ASSUME_VECTORIZATION) && defined(SIMDE__SHUFFLE_VECTOR)
+ r_.i32 =
+ SIMDE__SHUFFLE_VECTOR(32, 16, a_.i32, b_.i32, 0, 2, 4, 6) -
+ SIMDE__SHUFFLE_VECTOR(32, 16, a_.i32, b_.i32, 1, 3, 5, 7);
+ #else
+ r_.i32[0] = a_.i32[0] - a_.i32[1];
+ r_.i32[1] = a_.i32[2] - a_.i32[3];
+ r_.i32[2] = b_.i32[0] - b_.i32[1];
+ r_.i32[3] = b_.i32[2] - b_.i32[3];
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_hsub_epi32(a, b) simde_mm_hsub_epi32(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m64
+ simde_mm_hsub_pi16 (simde__m64 a, simde__m64 b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_hsub_pi16(a, b);
+ #else
+ simde__m64_private
+ r_,
+ a_ = simde__m64_to_private(a),
+ b_ = simde__m64_to_private(b);
+
+ #if defined(SIMDE_SSSE3_NEON) && defined(SIMDE_ARCH_AARCH64)
+ r_.neon_i16 = vsub_s16(vuzp1_s16(a_.neon_i16, b_.neon_i16), vuzp2_s16(a_.neon_i16, b_.neon_i16));
+ #elif defined(SIMDE_ASSUME_VECTORIZATION) && defined(SIMDE__SHUFFLE_VECTOR)
+ r_.i16 =
+ SIMDE__SHUFFLE_VECTOR(16, 8, a_.i16, b_.i16, 0, 2, 4, 6) -
+ SIMDE__SHUFFLE_VECTOR(16, 8, a_.i16, b_.i16, 1, 3, 5, 7);
+ #else
+ r_.i16[0] = a_.i16[0] - a_.i16[1];
+ r_.i16[1] = a_.i16[2] - a_.i16[3];
+ r_.i16[2] = b_.i16[0] - b_.i16[1];
+ r_.i16[3] = b_.i16[2] - b_.i16[3];
+ #endif
+
+ return simde__m64_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_hsub_pi16(a, b) simde_mm_hsub_pi16(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m64
+ simde_mm_hsub_pi32 (simde__m64 a, simde__m64 b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_hsub_pi32(a, b);
+ #else
+ simde__m64_private
+ r_,
+ a_ = simde__m64_to_private(a),
+ b_ = simde__m64_to_private(b);
+
+ #if defined(SIMDE_SSSE3_NEON) && defined(SIMDE_ARCH_AARCH64)
+ r_.neon_i32 = vsub_s32(vuzp1_s32(a_.neon_i32, b_.neon_i32), vuzp2_s32(a_.neon_i32, b_.neon_i32));
+ #elif defined(SIMDE_ASSUME_VECTORIZATION) && defined(SIMDE__SHUFFLE_VECTOR)
+ r_.i32 =
+ SIMDE__SHUFFLE_VECTOR(32, 8, a_.i32, b_.i32, 0, 2) -
+ SIMDE__SHUFFLE_VECTOR(32, 8, a_.i32, b_.i32, 1, 3);
+ #else
+ r_.i32[0] = a_.i32[0] - a_.i32[1];
+ r_.i32[1] = b_.i32[0] - b_.i32[1];
+ #endif
+
+ return simde__m64_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_hsub_pi32(a, b) simde_mm_hsub_pi32(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_hsubs_epi16 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_hsubs_epi16(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ #if defined(SIMDE_SSSE3_NEON) && defined(SIMDE_ARCH_AARCH64)
+ r_.neon_i16 = vqsubq_s16(vuzp1q_s16(a_.neon_i16, b_.neon_i16), vuzp2q_s16(a_.neon_i16, b_.neon_i16));
+ #else
+ for (size_t i = 0 ; i < ((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2) ; i++) {
+ int32_t ta = HEDLEY_STATIC_CAST(int32_t, a_.i16[i * 2]) - HEDLEY_STATIC_CAST(int32_t, a_.i16[(i * 2) + 1]);
+ r_.i16[ i ] = HEDLEY_LIKELY(ta > INT16_MIN) ? (HEDLEY_LIKELY(ta < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ta) : INT16_MAX) : INT16_MIN;
+ int32_t tb = HEDLEY_STATIC_CAST(int32_t, b_.i16[i * 2]) - HEDLEY_STATIC_CAST(int32_t, b_.i16[(i * 2) + 1]);
+ r_.i16[i + 4] = HEDLEY_LIKELY(tb > INT16_MIN) ? (HEDLEY_LIKELY(tb < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, tb) : INT16_MAX) : INT16_MIN;
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_hsubs_epi16(a, b) simde_mm_hsubs_epi16(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m64
+ simde_mm_hsubs_pi16 (simde__m64 a, simde__m64 b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_hsubs_pi16(a, b);
+ #else
+ simde__m64_private
+ r_,
+ a_ = simde__m64_to_private(a),
+ b_ = simde__m64_to_private(b);
+
+ #if defined(SIMDE_SSSE3_NEON) && defined(SIMDE_ARCH_AARCH64)
+ r_.neon_i16 = vqsub_s16(vuzp1_s16(a_.neon_i16, b_.neon_i16), vuzp2_s16(a_.neon_i16, b_.neon_i16));
+ #else
+ for (size_t i = 0 ; i < ((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2) ; i++) {
+ int32_t ta = HEDLEY_STATIC_CAST(int32_t, a_.i16[i * 2]) - HEDLEY_STATIC_CAST(int32_t, a_.i16[(i * 2) + 1]);
+ r_.i16[ i ] = HEDLEY_LIKELY(ta > INT16_MIN) ? (HEDLEY_LIKELY(ta < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ta) : INT16_MAX) : INT16_MIN;
+ int32_t tb = HEDLEY_STATIC_CAST(int32_t, b_.i16[i * 2]) - HEDLEY_STATIC_CAST(int32_t, b_.i16[(i * 2) + 1]);
+ r_.i16[i + 2] = HEDLEY_LIKELY(tb > INT16_MIN) ? (HEDLEY_LIKELY(tb < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, tb) : INT16_MAX) : INT16_MIN;
+ }
+ #endif
+
+ return simde__m64_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_hsubs_pi16(a, b) simde_mm_hsubs_pi16(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_maddubs_epi16 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_maddubs_epi16(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ #if defined(SIMDE_SSSE3_NEON) && defined(SIMDE_ARCH_AARCH64)
+ int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a_.neon_u8))), vmovl_s8(vget_low_s8(b_.neon_i8)));
+ int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a_.neon_u8))), vmovl_s8(vget_high_s8(b_.neon_i8)));
+ r_.neon_i16 = vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th));
+ #else
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
+ const int idx = HEDLEY_STATIC_CAST(int, i) << 1;
+ int32_t ts =
+ (HEDLEY_STATIC_CAST(int16_t, a_.u8[ idx ]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[ idx ])) +
+ (HEDLEY_STATIC_CAST(int16_t, a_.u8[idx + 1]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[idx + 1]));
+ r_.i16[i] = HEDLEY_LIKELY(ts > INT16_MIN) ? (HEDLEY_LIKELY(ts < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ts) : INT16_MAX) : INT16_MIN;
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_maddubs_epi16(a, b) simde_mm_maddubs_epi16(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m64
+ simde_mm_maddubs_pi16 (simde__m64 a, simde__m64 b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_maddubs_pi16(a, b);
+ #else
+ simde__m64_private
+ r_,
+ a_ = simde__m64_to_private(a),
+ b_ = simde__m64_to_private(b);
+
+ #if defined(SIMDE_SSSE3_NEON) && defined(SIMDE_ARCH_AARCH64)
+ int16x8_t ai = vreinterpretq_s16_u16(vmovl_u8(a_.neon_u8));
+ int16x8_t bi = vmovl_s8(b_.neon_i8);
+ int16x8_t p = vmulq_s16(ai, bi);
+ int16x4_t l = vget_low_s16(p);
+ int16x4_t h = vget_high_s16(p);
+ r_.neon_i16 = vqadd_s16(vuzp1_s16(l, h), vuzp2_s16(l, h));
+ #else
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
+ const int idx = HEDLEY_STATIC_CAST(int, i) << 1;
+ int32_t ts =
+ (HEDLEY_STATIC_CAST(int16_t, a_.u8[ idx ]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[ idx ])) +
+ (HEDLEY_STATIC_CAST(int16_t, a_.u8[idx + 1]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[idx + 1]));
+ r_.i16[i] = HEDLEY_LIKELY(ts > INT16_MIN) ? (HEDLEY_LIKELY(ts < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ts) : INT16_MAX) : INT16_MIN;
+ }
+ #endif
+
+ return simde__m64_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_maddubs_pi16(a, b) simde_mm_maddubs_pi16(a, b)
+ #endif
+
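    /* Worked example of the fallback above: maddubs multiplies the unsigned bytes
       of a with the signed bytes of b and adds adjacent products with signed
       saturation. With a.u8 = {255, 255, ...} and b.i8 = {127, 127, ...} the first
       pair gives 255*127 + 255*127 = 64770, which saturates to INT16_MAX (32767). */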
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_mulhrs_epi16 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_mulhrs_epi16(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
+ r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, (((HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) + 0x4000) >> 15));
+ }
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_mulhrs_epi16(a, b) simde_mm_mulhrs_epi16(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m64
+ simde_mm_mulhrs_pi16 (simde__m64 a, simde__m64 b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_mulhrs_pi16(a, b);
+ #else
+ simde__m64_private
+ r_,
+ a_ = simde__m64_to_private(a),
+ b_ = simde__m64_to_private(b);
+
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
+ r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, (((HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) + 0x4000) >> 15));
+ }
+
+ return simde__m64_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_mulhrs_pi16(a, b) simde_mm_mulhrs_pi16(a, b)
+ #endif
+
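    /* Worked arithmetic for the Q15 rounding formula above, (a*b + 0x4000) >> 15:
       with a = 0x4000 (0.5 in Q15) and b = 0x2000 (0.25 in Q15), the product is
       0x08000000; adding the 0x4000 rounding bias and shifting right by 15 gives
       0x1000, i.e. 0.125 in Q15, the rounded high half of the fixed-point product. */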
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_sign_epi8 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_sign_epi8(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ #if defined(SIMDE_SSSE3_NEON) && defined(SIMDE_ARCH_AARCH64)
+ int8x16_t m = vreinterpretq_s8_u8(vcgezq_s8(b_.neon_i8));
+ r_.neon_i8 = veorq_s8(vandq_s8(a_.neon_i8, m), vandq_s8(vnegq_s8(a_.neon_i8), vmvnq_s8(m)));
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
+ r_.i8[i] = (b_.i8[i] < 0) ? (- a_.i8[i]) : ((b_.i8[i] > 0) ? (a_.i8[i]) : INT8_C(0));
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_sign_epi8(a, b) simde_mm_sign_epi8(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_sign_epi16 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_sign_epi16(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ #if defined(SIMDE_SSSE3_NEON) && defined(SIMDE_ARCH_AARCH64)
+ int16x8_t m = vreinterpretq_s16_u16(vcgezq_s16(b_.neon_i16));
+ r_.neon_i16 = veorq_s16(vandq_s16(a_.neon_i16, m), vandq_s16(vnegq_s16(a_.neon_i16), vmvnq_s16(m)));
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
+ r_.i16[i] = (b_.i16[i] < 0) ? (- a_.i16[i]) : ((b_.i16[i] > 0) ? (a_.i16[i]) : INT16_C(0));
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_sign_epi16(a, b) simde_mm_sign_epi16(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m128i
+ simde_mm_sign_epi32 (simde__m128i a, simde__m128i b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_sign_epi32(a, b);
+ #else
+ simde__m128i_private
+ r_,
+ a_ = simde__m128i_to_private(a),
+ b_ = simde__m128i_to_private(b);
+
+ #if defined(SIMDE_SSSE3_NEON) && defined(SIMDE_ARCH_AARCH64)
+ int32x4_t m = vreinterpretq_s32_u32(vcgezq_s32(b_.neon_i32));
+ r_.neon_i32 = veorq_s32(vandq_s32(a_.neon_i32, m), vandq_s32(vnegq_s32(a_.neon_i32), vmvnq_s32(m)));
+ #else
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
+ r_.i32[i] = (b_.i32[i] < 0) ? (- a_.i32[i]) : ((b_.i32[i] > 0) ? (a_.i32[i]) : INT32_C(0));
+ }
+ #endif
+
+ return simde__m128i_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_sign_epi32(a, b) simde_mm_sign_epi32(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m64
+ simde_mm_sign_pi8 (simde__m64 a, simde__m64 b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_sign_pi8(a, b);
+ #else
+ simde__m64_private
+ r_,
+ a_ = simde__m64_to_private(a),
+ b_ = simde__m64_to_private(b);
+
+ #if defined(SIMDE_SSSE3_NEON) && defined(SIMDE_ARCH_AARCH64)
+ int8x8_t m = vreinterpret_s8_u8(vcgez_s8(b_.neon_i8));
+ r_.neon_i8 = veor_s8(vand_s8(a_.neon_i8, m), vand_s8(vneg_s8(a_.neon_i8), vmvn_s8(m)));
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
+ r_.i8[i] = (b_.i8[i] < 0) ? (- a_.i8[i]) : ((b_.i8[i] > 0) ? (a_.i8[i]) : INT8_C(0));
+ }
+ #endif
+
+ return simde__m64_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_sign_pi8(a, b) simde_mm_sign_pi8(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m64
+ simde_mm_sign_pi16 (simde__m64 a, simde__m64 b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_sign_pi16(a, b);
+ #else
+ simde__m64_private
+ r_,
+ a_ = simde__m64_to_private(a),
+ b_ = simde__m64_to_private(b);
+
+ #if defined(SIMDE_SSSE3_NEON) && defined(SIMDE_ARCH_AARCH64)
+ int16x4_t m = vreinterpret_s16_u16(vcgez_s16(b_.neon_i16));
+ r_.neon_i16 = veor_s16(vand_s16(a_.neon_i16, m), vand_s16(vneg_s16(a_.neon_i16), vmvn_s16(m)));
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
+ r_.i16[i] = (b_.i16[i] < 0) ? (- a_.i16[i]) : ((b_.i16[i] > 0) ? (a_.i16[i]) : INT16_C(0));
+ }
+ #endif
+
+ return simde__m64_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_sign_pi16(a, b) simde_mm_sign_pi16(a, b)
+ #endif
+
+ SIMDE__FUNCTION_ATTRIBUTES
+ simde__m64
+ simde_mm_sign_pi32 (simde__m64 a, simde__m64 b) {
+ #if defined(SIMDE_SSSE3_NATIVE)
+ return _mm_sign_pi32(a, b);
+ #else
+ simde__m64_private
+ r_,
+ a_ = simde__m64_to_private(a),
+ b_ = simde__m64_to_private(b);
+
+ #if defined(SIMDE_SSSE3_NEON) && defined(SIMDE_ARCH_AARCH64)
+ int32x2_t m = vreinterpret_s32_u32(vcgez_s32(b_.neon_i32));
+ r_.neon_i32 = veor_s32(vand_s32(a_.neon_i32, m), vand_s32(vneg_s32(a_.neon_i32), vmvn_s32(m)));
+ #else
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
+ r_.i32[i] = (b_.i32[i] < 0) ? (- a_.i32[i]) : ((b_.i32[i] > 0) ? (a_.i32[i]) : INT32_C(0));
+ }
+ #endif
+
+ return simde__m64_from_private(r_);
+ #endif
+ }
+ #if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES)
+ # define _mm_sign_pi32(a, b) simde_mm_sign_pi32(a, b)
+ #endif
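    /* Illustrative summary of the _mm_sign_* family above: each lane of a is
       negated where the matching lane of b is negative, passed through where b is
       positive, and zeroed where b is zero, e.g.
       sign({5, 5, 5, ...}, {-1, 0, 2, ...}) -> {-5, 0, 5, ...}. */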
+
+ SIMDE__END_DECLS
+
+ HEDLEY_DIAGNOSTIC_POP
+
+ #endif /* !defined(SIMDE__SSSE3_H) */