minimap2 0.2.22.0 → 0.2.24.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +55 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +821 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +369 -0
  41. data/ext/minimap2/main.c +459 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +410 -0
  44. data/ext/minimap2/minimap2.1 +725 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +132 -0
  50. data/ext/minimap2/options.c +234 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/lib/minimap2/aligner.rb +4 -4
  93. data/lib/minimap2/alignment.rb +11 -11
  94. data/lib/minimap2/ffi/constants.rb +20 -16
  95. data/lib/minimap2/ffi/functions.rb +5 -0
  96. data/lib/minimap2/ffi.rb +4 -5
  97. data/lib/minimap2/version.rb +2 -2
  98. data/lib/minimap2.rb +51 -15
  99. metadata +97 -79
  100. data/lib/minimap2/ffi_helper.rb +0 -53
  101. data/vendor/libminimap2.so +0 -0
data/ext/minimap2/sse2neon/emmintrin.h
@@ -0,0 +1,1689 @@
1
+ #ifndef SSE2NEON_H
2
+ #define SSE2NEON_H
3
+
4
+ // This header file provides a simple API translation layer
5
+ // between SSE intrinsics and their corresponding ARM NEON versions
6
+ //
7
+ // This header file does not (yet) translate *all* of the SSE intrinsics.
8
+ // Since this is in support of a specific porting effort, I have only
9
+ // included the intrinsics I needed to get my port to work.
10
+ //
11
+ // Questions/Comments/Feedback send to: jratcliffscarab@gmail.com
12
+ //
13
+ // If you want to improve or add to this project, send me an
14
+ // email and I will probably approve your access to the depot.
15
+ //
16
+ // Project is located here:
17
+ //
18
+ // https://github.com/jratcliff63367/sse2neon
19
+ //
20
+ // Show your appreciation for open source by sending me a bitcoin tip to the following
21
+ // address.
22
+ //
23
+ // TipJar: 1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p :
24
+ // https://blockchain.info/address/1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p
25
+ //
26
+ //
27
+ // Contributors to this project are:
28
+ //
29
+ // John W. Ratcliff : jratcliffscarab@gmail.com
30
+ // Brandon Rowlett : browlett@nvidia.com
31
+ // Ken Fast : kfast@gdeb.com
32
+ // Eric van Beurden : evanbeurden@nvidia.com
33
+ // Alexander Potylitsin : apotylitsin@nvidia.com
34
+ // Hasindu Gamaarachchi : hasindu2008@gmail.com
35
+ //
36
+ //
37
+ // *********************************************************************************************************************
38
+ // apoty: March 17, 2017
39
+ // Most of the current version was changed to fix issues and potential issues.
40
+ // All unit tests were rewritten as a part of forge lib project to cover all implemented functions.
41
+ // *********************************************************************************************************************
42
+ // Release notes for January 20, 2017 version:
43
+ //
44
+ // The unit tests have been refactored. They no longer assert on an error, instead they return a pass/fail condition
45
+ // The unit-tests now test 10,000 random float and int values against each intrinsic.
46
+ //
47
+ // SSE2NEON now supports 95 SSE intrinsics. 39 of them have formal unit tests which have been implemented and
48
+ // fully tested on NEON/ARM. The remaining 56 still need unit tests implemented.
49
+ //
50
+ // A struct is now defined in this header file called 'SIMDVec' which can be used by applications which
51
+ // attempt to access the contents of an _m128 struct directly. It is important to note that accessing the __m128
52
+ // struct directly is bad coding practice by Microsoft: @see: https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
53
+ //
54
+ // However, some legacy source code may try to access the contents of an __m128 struct directly so the developer
55
+ // can use the SIMDVec as an alias for it. Any casting must be done manually by the developer, as you cannot
56
+ // cast or otherwise alias the base NEON data type for intrinsic operations.
57
+ //
58
+ // A bug was found with the _mm_shuffle_ps intrinsic. If the shuffle permutation was not one of the ones with
59
+ // a custom/unique implementation causing it to fall through to the default shuffle implementation it was failing
60
+ // to return the correct value. This is now fixed.
61
+ //
62
+ // A bug was found with the _mm_cvtps_epi32 intrinsic. This converts floating point values to integers.
63
+ // It was not honoring the correct rounding mode. In SSE the default rounding mode when converting from float to int
64
+ // is to use 'round to even' otherwise known as 'bankers rounding'. ARMv7 did not support this feature but ARMv8 does.
65
+ // As it stands today, this header file assumes ARMv8. If you are trying to target really old ARM devices, you may get
66
+ // a build error.
67
+ //
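For concreteness (editorial note, not part of the diff), round-to-even differs from the rounding most C code reaches for:

    /* round-to-even ("banker's rounding"), as SSE's _mm_cvtps_epi32 does by default:
       {0.5f, 1.5f, 2.5f, 3.5f}  ->  {0, 2, 2, 4}
       plain round-half-up would give {1, 2, 3, 4}; truncation would give {0, 1, 2, 3} */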
68
+ // Support for a number of new intrinsics was added, however, none of them yet have unit-tests to 100% confirm they are
69
+ // producing the correct results on NEON. These unit tests will be added as soon as possible.
70
+ //
71
+ // Here is the list of new intrinsics which have been added:
72
+ //
73
+ // _mm_cvtss_f32 : extracts the lower order floating point value from the parameter
74
+ // _mm_add_ss : adds the scalar single - precision floating point values of a and b
75
+ // _mm_div_ps : Divides the four single - precision, floating - point values of a and b.
76
+ // _mm_div_ss : Divides the scalar single - precision floating point value of a by b.
77
+ // _mm_sqrt_ss : Computes the approximation of the square root of the scalar single - precision floating point value of in.
78
+ // _mm_rsqrt_ps : Computes the approximations of the reciprocal square roots of the four single - precision floating point values of in.
79
+ // _mm_comilt_ss : Compares the lower single - precision floating point scalar values of a and b using a less than operation
80
+ // _mm_comigt_ss : Compares the lower single - precision floating point scalar values of a and b using a greater than operation.
81
+ // _mm_comile_ss : Compares the lower single - precision floating point scalar values of a and b using a less than or equal operation.
82
+ // _mm_comige_ss : Compares the lower single - precision floating point scalar values of a and b using a greater than or equal operation.
83
+ // _mm_comieq_ss : Compares the lower single - precision floating point scalar values of a and b using an equality operation.
84
+ // _mm_comineq_ss : Compares the lower single - precision floating point scalar values of a and b using an inequality operation.
85
+ // _mm_unpackhi_epi8 : Interleaves the upper 8 signed or unsigned 8 - bit integers in a with the upper 8 signed or unsigned 8 - bit integers in b.
86
+ // _mm_unpackhi_epi16: Interleaves the upper 4 signed or unsigned 16 - bit integers in a with the upper 4 signed or unsigned 16 - bit integers in b.
87
+ //
88
+ // *********************************************************************************************************************
89
+ /*
90
+ ** The MIT license:
91
+ **
92
+ ** Permission is hereby granted, free of charge, to any person obtaining a copy
93
+ ** of this software and associated documentation files (the "Software"), to deal
94
+ ** in the Software without restriction, including without limitation the rights
95
+ ** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
96
+ ** copies of the Software, and to permit persons to whom the Software is furnished
97
+ ** to do so, subject to the following conditions:
98
+ **
99
+ ** The above copyright notice and this permission notice shall be included in all
100
+ ** copies or substantial portions of the Software.
101
+
102
+ ** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
103
+ ** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
104
+ ** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
105
+ ** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
106
+ ** WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
107
+ ** CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
108
+ */
109
+
110
+ #define ENABLE_CPP_VERSION 0
111
+
112
+ #if defined(__GNUC__) || defined(__clang__)
113
+ # pragma push_macro("FORCE_INLINE")
114
+ # pragma push_macro("ALIGN_STRUCT")
115
+ # define FORCE_INLINE static inline __attribute__((always_inline))
116
+ # define ALIGN_STRUCT(x) __attribute__((aligned(x)))
117
+ #else
118
+ # error "Macro name collisions may happen with unknown compiler"
119
+ # define FORCE_INLINE static inline
120
+ # define ALIGN_STRUCT(x) __declspec(align(x))
121
+ #endif
122
+
123
+ #include <stdint.h>
124
+ #include "arm_neon.h"
125
+
126
+
127
+ /*******************************************************/
128
+ /* MACRO for shuffle parameter for _mm_shuffle_ps(). */
129
+ /* Argument fp3 is a digit[0123] that represents the fp*/
130
+ /* from argument "b" of mm_shuffle_ps that will be */
131
+ /* placed in fp3 of result. fp2 is the same for fp2 in */
132
+ /* result. fp1 is a digit[0123] that represents the fp */
133
+ /* from argument "a" of mm_shuffle_ps that will be */
134
+ /* places in fp1 of result. fp0 is the same for fp0 of */
135
+ /* result */
136
+ /*******************************************************/
137
+ #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
138
+ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
139
+
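To make the packing concrete, here is a minimal editorial sketch (not part of the diff), assuming the usual SSE lane order in which result lanes 0-1 come from a and lanes 2-3 from b:

    /* _MM_SHUFFLE(1, 0, 3, 2) == (1 << 6) | (0 << 4) | (3 << 2) | 2 == 0x4E */
    __m128 a = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);   /* lanes {0, 1, 2, 3} */
    __m128 b = _mm_set_ps(7.0f, 6.0f, 5.0f, 4.0f);   /* lanes {4, 5, 6, 7} */
    __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2));
    /* r = {a[2], a[3], b[0], b[1]} = {2, 3, 4, 5}; this selector is handled by the
       dedicated _mm_shuffle_ps_1032 helper defined further down in this header. */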
140
+ /* indicate immediate constant argument in a given range */
141
+ #define __constrange(a,b) \
142
+ const
143
+
144
+ typedef float32x4_t __m128;
145
+ typedef int32x4_t __m128i;
146
+
147
+
148
+ // ******************************************
149
+ // type-safe casting between types
150
+ // ******************************************
151
+
152
+ #define vreinterpretq_m128_f16(x) \
153
+ vreinterpretq_f32_f16(x)
154
+
155
+ #define vreinterpretq_m128_f32(x) \
156
+ (x)
157
+
158
+ #define vreinterpretq_m128_f64(x) \
159
+ vreinterpretq_f32_f64(x)
160
+
161
+
162
+ #define vreinterpretq_m128_u8(x) \
163
+ vreinterpretq_f32_u8(x)
164
+
165
+ #define vreinterpretq_m128_u16(x) \
166
+ vreinterpretq_f32_u16(x)
167
+
168
+ #define vreinterpretq_m128_u32(x) \
169
+ vreinterpretq_f32_u32(x)
170
+
171
+ #define vreinterpretq_m128_u64(x) \
172
+ vreinterpretq_f32_u64(x)
173
+
174
+
175
+ #define vreinterpretq_m128_s8(x) \
176
+ vreinterpretq_f32_s8(x)
177
+
178
+ #define vreinterpretq_m128_s16(x) \
179
+ vreinterpretq_f32_s16(x)
180
+
181
+ #define vreinterpretq_m128_s32(x) \
182
+ vreinterpretq_f32_s32(x)
183
+
184
+ #define vreinterpretq_m128_s64(x) \
185
+ vreinterpretq_f32_s64(x)
186
+
187
+
188
+ #define vreinterpretq_f16_m128(x) \
189
+ vreinterpretq_f16_f32(x)
190
+
191
+ #define vreinterpretq_f32_m128(x) \
192
+ (x)
193
+
194
+ #define vreinterpretq_f64_m128(x) \
195
+ vreinterpretq_f64_f32(x)
196
+
197
+
198
+ #define vreinterpretq_u8_m128(x) \
199
+ vreinterpretq_u8_f32(x)
200
+
201
+ #define vreinterpretq_u16_m128(x) \
202
+ vreinterpretq_u16_f32(x)
203
+
204
+ #define vreinterpretq_u32_m128(x) \
205
+ vreinterpretq_u32_f32(x)
206
+
207
+ #define vreinterpretq_u64_m128(x) \
208
+ vreinterpretq_u64_f32(x)
209
+
210
+
211
+ #define vreinterpretq_s8_m128(x) \
212
+ vreinterpretq_s8_f32(x)
213
+
214
+ #define vreinterpretq_s16_m128(x) \
215
+ vreinterpretq_s16_f32(x)
216
+
217
+ #define vreinterpretq_s32_m128(x) \
218
+ vreinterpretq_s32_f32(x)
219
+
220
+ #define vreinterpretq_s64_m128(x) \
221
+ vreinterpretq_s64_f32(x)
222
+
223
+
224
+ #define vreinterpretq_m128i_s8(x) \
225
+ vreinterpretq_s32_s8(x)
226
+
227
+ #define vreinterpretq_m128i_s16(x) \
228
+ vreinterpretq_s32_s16(x)
229
+
230
+ #define vreinterpretq_m128i_s32(x) \
231
+ (x)
232
+
233
+ #define vreinterpretq_m128i_s64(x) \
234
+ vreinterpretq_s32_s64(x)
235
+
236
+
237
+ #define vreinterpretq_m128i_u8(x) \
238
+ vreinterpretq_s32_u8(x)
239
+
240
+ #define vreinterpretq_m128i_u16(x) \
241
+ vreinterpretq_s32_u16(x)
242
+
243
+ #define vreinterpretq_m128i_u32(x) \
244
+ vreinterpretq_s32_u32(x)
245
+
246
+ #define vreinterpretq_m128i_u64(x) \
247
+ vreinterpretq_s32_u64(x)
248
+
249
+
250
+ #define vreinterpretq_s8_m128i(x) \
251
+ vreinterpretq_s8_s32(x)
252
+
253
+ #define vreinterpretq_s16_m128i(x) \
254
+ vreinterpretq_s16_s32(x)
255
+
256
+ #define vreinterpretq_s32_m128i(x) \
257
+ (x)
258
+
259
+ #define vreinterpretq_s64_m128i(x) \
260
+ vreinterpretq_s64_s32(x)
261
+
262
+
263
+ #define vreinterpretq_u8_m128i(x) \
264
+ vreinterpretq_u8_s32(x)
265
+
266
+ #define vreinterpretq_u16_m128i(x) \
267
+ vreinterpretq_u16_s32(x)
268
+
269
+ #define vreinterpretq_u32_m128i(x) \
270
+ vreinterpretq_u32_s32(x)
271
+
272
+ #define vreinterpretq_u64_m128i(x) \
273
+ vreinterpretq_u64_s32(x)
274
+
275
+
276
+ // union intended to allow direct access to an __m128 variable using the names that the MSVC
277
+ // compiler provides. This union should really only be used when trying to access the members
278
+ // of the vector as integer values. GCC/clang allow native access to the float members through
279
+ // a simple array access operator (in C since 4.6, in C++ since 4.8).
280
+ //
281
+ // Ideally direct accesses to SIMD vectors should not be used since it can cause a performance
282
+ // hit. If it really is needed however, the original __m128 variable can be aliased with a
283
+ // pointer to this union and used to access individual components. The use of this union should
284
+ // be hidden behind a macro that is used throughout the codebase to access the members instead
285
+ // of always declaring this type of variable.
286
+ typedef union ALIGN_STRUCT(16) SIMDVec
287
+ {
288
+ float m128_f32[4]; // as floats - do not use this. Added for convenience.
289
+ int8_t m128_i8[16]; // as signed 8-bit integers.
290
+ int16_t m128_i16[8]; // as signed 16-bit integers.
291
+ int32_t m128_i32[4]; // as signed 32-bit integers.
292
+ int64_t m128_i64[2]; // as signed 64-bit integers.
293
+ uint8_t m128_u8[16]; // as unsigned 8-bit integers.
294
+ uint16_t m128_u16[8]; // as unsigned 16-bit integers.
295
+ uint32_t m128_u32[4]; // as unsigned 32-bit integers.
296
+ uint64_t m128_u64[2]; // as unsigned 64-bit integers.
297
+ } SIMDVec;
298
+
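A minimal usage sketch (editorial illustration, not part of the diff) of the manual aliasing cast the comment above describes:

    __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);   /* lanes {1, 2, 3, 4} */
    const SIMDVec *sv = (const SIMDVec *)&v;          /* cast done by hand, as required */
    uint32_t lane0_bits = sv->m128_u32[0];            /* IEEE-754 bits of 1.0f: 0x3F800000 */
    int32_t  lane2_bits = sv->m128_i32[2];            /* raw bits of 3.0f, viewed as a signed int */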
299
+
300
+ // ******************************************
301
+ // Set/get methods
302
+ // ******************************************
303
+
304
+ // extracts the lower order floating point value from the parameter : https://msdn.microsoft.com/en-us/library/bb514059%28v=vs.120%29.aspx?f=255&MSPPError=-2147217396
305
+ FORCE_INLINE float _mm_cvtss_f32(__m128 a)
306
+ {
307
+ return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
308
+ }
309
+
310
+ // Sets the 128-bit value to zero https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
311
+ FORCE_INLINE __m128i _mm_setzero_si128()
312
+ {
313
+ return vreinterpretq_m128i_s32(vdupq_n_s32(0));
314
+ }
315
+
316
+ // Clears the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
317
+ FORCE_INLINE __m128 _mm_setzero_ps(void)
318
+ {
319
+ return vreinterpretq_m128_f32(vdupq_n_f32(0));
320
+ }
321
+
322
+ // Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
323
+ FORCE_INLINE __m128 _mm_set1_ps(float _w)
324
+ {
325
+ return vreinterpretq_m128_f32(vdupq_n_f32(_w));
326
+ }
327
+
328
+ // Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
329
+ FORCE_INLINE __m128 _mm_set_ps1(float _w)
330
+ {
331
+ return vreinterpretq_m128_f32(vdupq_n_f32(_w));
332
+ }
333
+
334
+ // Sets the four single-precision, floating-point values to the four inputs. https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
335
+ FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
336
+ {
337
+ float __attribute__((aligned(16))) data[4] = { x, y, z, w };
338
+ return vreinterpretq_m128_f32(vld1q_f32(data));
339
+ }
340
+
341
+ // Sets the four single-precision, floating-point values to the four inputs in reverse order. https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
342
+ FORCE_INLINE __m128 _mm_setr_ps(float w, float z , float y , float x )
343
+ {
344
+ float __attribute__ ((aligned (16))) data[4] = { w, z, y, x };
345
+ return vreinterpretq_m128_f32(vld1q_f32(data));
346
+ }
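Argument order is the usual stumbling block here, so a small editorial example (not part of the diff): _mm_set_ps lists lanes from high to low, _mm_setr_ps from low to high.

    float out[4];
    __m128 hi_to_lo = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);
    __m128 lo_to_hi = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    _mm_storeu_ps(out, hi_to_lo);   /* out = {1, 2, 3, 4} */
    _mm_storeu_ps(out, lo_to_hi);   /* out = {1, 2, 3, 4} as well */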
347
+
348
+
349
+ //added by hasindu
350
+ //Sets the 4 signed 32-bit integer values in reverse order https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
351
+ FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
352
+ {
353
+ int32_t __attribute__((aligned(16))) data[4] = { i3, i2, i1, i0 };
354
+ return vreinterpretq_m128i_s32(vld1q_s32(data));
355
+ }
356
+
357
+ //following added by hasindu
358
+ //Sets the 16 signed 8-bit integer values to b.https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
359
+ FORCE_INLINE __m128i _mm_set1_epi8(char w)
360
+ {
361
+ return vreinterpretq_m128i_s8(vdupq_n_s8(w));
362
+ }
363
+
364
+
365
+ //following added by hasindu
366
+ //Sets the 8 signed 16-bit integer values to w. https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
367
+ FORCE_INLINE __m128i _mm_set1_epi16(short w)
368
+ {
369
+ return vreinterpretq_m128i_s16(vdupq_n_s16(w));
370
+ }
371
+
372
+ //following added by hasindu
373
+ //Sets the 8 signed 16-bit integer values. https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
374
+ FORCE_INLINE __m128i _mm_set_epi16(short i7, short i6, short i5, short i4, short i3, short i2, short i1, short i0)
375
+ {
376
+ int16_t __attribute__((aligned(16))) data[8] = { i0, i1, i2, i3, i4, i5, i6, i7 };
377
+ return vreinterpretq_m128i_s16(vld1q_s16(data));
378
+ }
379
+
380
+
381
+ // Sets the 4 signed 32-bit integer values to i. https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
382
+ FORCE_INLINE __m128i _mm_set1_epi32(int _i)
383
+ {
384
+ return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
385
+ }
386
+
387
+ // Sets the 4 signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
388
+ FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
389
+ {
390
+ int32_t __attribute__((aligned(16))) data[4] = { i0, i1, i2, i3 };
391
+ return vreinterpretq_m128i_s32(vld1q_s32(data));
392
+ }
393
+
394
+ // Stores four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
395
+ FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
396
+ {
397
+ vst1q_f32(p, vreinterpretq_f32_m128(a));
398
+ }
399
+
400
+ // Stores four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
401
+ FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
402
+ {
403
+ vst1q_f32(p, vreinterpretq_f32_m128(a));
404
+ }
405
+
406
+ // Stores four 32-bit integer values (as a __m128i value) at the address p. https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
407
+ FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
408
+ {
409
+ vst1q_s32((int32_t*) p, vreinterpretq_s32_m128i(a));
410
+ }
411
+
412
+ //added by hasindu (verify this for requirement of alignment)
413
+ // Stores four 32-bit integer values (as a __m128i value) at the address p. https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
414
+ FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
415
+ {
416
+ vst1q_s32((int32_t*) p, vreinterpretq_s32_m128i(a));
417
+ }
418
+
419
+ // Stores the lower single - precision, floating - point value. https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
420
+ FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
421
+ {
422
+ vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
423
+ }
424
+
425
+ // Reads the lower 64 bits of b and stores them into the lower 64 bits of a. https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
426
+ FORCE_INLINE void _mm_storel_epi64(__m128i* a, __m128i b)
427
+ {
428
+ uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
429
+ uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
430
+ *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
431
+ }
432
+
433
+ // Loads a single single-precision, floating-point value, copying it into all four words https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
434
+ FORCE_INLINE __m128 _mm_load1_ps(const float * p)
435
+ {
436
+ return vreinterpretq_m128_f32(vld1q_dup_f32(p));
437
+ }
438
+
439
+ // Loads four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
440
+ FORCE_INLINE __m128 _mm_load_ps(const float * p)
441
+ {
442
+ return vreinterpretq_m128_f32(vld1q_f32(p));
443
+ }
444
+
445
+ // Loads four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
446
+ FORCE_INLINE __m128 _mm_loadu_ps(const float * p)
447
+ {
448
+ // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are equivalent for neon
449
+ return vreinterpretq_m128_f32(vld1q_f32(p));
450
+ }
451
+
452
+ // Loads a single-precision, floating-point value into the low word and clears the upper three words. https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
453
+ FORCE_INLINE __m128 _mm_load_ss(const float * p)
454
+ {
455
+ return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
456
+ }
457
+
458
+
459
+ // ******************************************
460
+ // Logic/Binary operations
461
+ // ******************************************
462
+
463
+ // Compares for inequality. https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
464
+ FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
465
+ {
466
+ return vreinterpretq_m128_u32( vmvnq_u32( vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)) ) );
467
+ }
468
+
469
+ // Computes the bitwise AND-NOT of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
470
+ FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
471
+ {
472
+ return vreinterpretq_m128_s32( vbicq_s32(vreinterpretq_s32_m128(b), vreinterpretq_s32_m128(a)) ); // *NOTE* argument swap
473
+ }
474
+
475
+ // Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the 128-bit value in a. https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
476
+ FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
477
+ {
478
+ return vreinterpretq_m128i_s32( vbicq_s32(vreinterpretq_s32_m128i(b), vreinterpretq_s32_m128i(a)) ); // *NOTE* argument swap
479
+ }
480
+
481
+ // Computes the bitwise AND of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
482
+ FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
483
+ {
484
+ return vreinterpretq_m128i_s32( vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)) );
485
+ }
486
+
487
+ // Computes the bitwise AND of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
488
+ FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
489
+ {
490
+ return vreinterpretq_m128_s32( vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)) );
491
+ }
492
+
493
+ // Computes the bitwise OR of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
494
+ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
495
+ {
496
+ return vreinterpretq_m128_s32( vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)) );
497
+ }
498
+
499
+ // Computes bitwise EXOR (exclusive-or) of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
500
+ FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
501
+ {
502
+ return vreinterpretq_m128_s32( veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)) );
503
+ }
504
+
505
+ // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
506
+ FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
507
+ {
508
+ return vreinterpretq_m128i_s32( vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)) );
509
+ }
510
+
511
+ // Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
512
+ FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
513
+ {
514
+ return vreinterpretq_m128i_s32( veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)) );
515
+ }
516
+
517
+ // NEON does not provide this method
518
+ // Creates a 4-bit mask from the most significant bits of the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
519
+ FORCE_INLINE int _mm_movemask_ps(__m128 a)
520
+ {
521
+ #if ENABLE_CPP_VERSION // I am not yet convinced that the NEON version is faster than the C version of this
522
+ uint32x4_t &ia = *(uint32x4_t *)&a;
523
+ return (ia[0] >> 31) | ((ia[1] >> 30) & 2) | ((ia[2] >> 29) & 4) | ((ia[3] >> 28) & 8);
524
+ #else
525
+ static const uint32x4_t movemask = { 1, 2, 4, 8 };
526
+ static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
527
+ uint32x4_t t0 = vreinterpretq_u32_m128(a);
528
+ uint32x4_t t1 = vtstq_u32(t0, highbit);
529
+ uint32x4_t t2 = vandq_u32(t1, movemask);
530
+ uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
531
+ return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
532
+ #endif
533
+ }
534
+
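A worked trace of the NEON path above (editorial illustration, not part of the diff):

    /* a = {-1.0f, 2.0f, -3.0f, 4.0f}        sign bits set in lanes 0 and 2
       vtstq_u32(t0, highbit)            ->  {all-ones, 0, all-ones, 0}
       vandq_u32(t1, movemask)           ->  {1, 0, 4, 0}
       vorr_u32(low half, high half)     ->  {1 | 4, 0 | 0} = {5, 0}
       vget_lane_u32(t3, 0) | (t3, 1)    ->  5, matching SSE's 0b0101 sign mask */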
535
+ // Takes the upper 64 bits of a and places it in the low end of the result
536
+ // Takes the lower 64 bits of b and places it into the high end of the result.
537
+ FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
538
+ {
539
+ float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
540
+ float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
541
+ return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
542
+ }
543
+
544
+ // takes the lower two 32-bit values from a and swaps them and places them in the low end of the result
545
+ // takes the higher two 32 bit values from b and swaps them and places them in the high end of the result.
546
+ FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
547
+ {
548
+ float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
549
+ float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
550
+ return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
551
+ }
552
+
553
+ FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
554
+ {
555
+ float32x2_t a21 = vget_high_f32(vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
556
+ float32x2_t b03 = vget_low_f32(vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
557
+ return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
558
+ }
559
+
560
+ FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
561
+ {
562
+ float32x2_t a03 = vget_low_f32(vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
563
+ float32x2_t b21 = vget_high_f32(vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
564
+ return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
565
+ }
566
+
567
+ FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
568
+ {
569
+ float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
570
+ float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
571
+ return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
572
+ }
573
+
574
+ FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
575
+ {
576
+ float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
577
+ float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
578
+ return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
579
+ }
580
+
581
+ FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
582
+ {
583
+ float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
584
+ float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
585
+ return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
586
+ }
587
+
588
+ // keeps the low 64 bits of a in the low and puts the high 64 bits of b in the high
589
+ FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
590
+ {
591
+ float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
592
+ float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
593
+ return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
594
+ }
595
+
596
+ FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
597
+ {
598
+ float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
599
+ float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
600
+ return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
601
+ }
602
+
603
+ FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
604
+ {
605
+ float32x2_t a22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
606
+ float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
607
+ return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
608
+ }
609
+
610
+ FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
611
+ {
612
+ float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
613
+ float32x2_t b22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
614
+ return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
615
+ }
616
+
617
+ FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
618
+ {
619
+ float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
620
+ float32x2_t a22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
621
+ float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* apoty: TODO: use vzip ?*/
622
+ float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
623
+ return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
624
+ }
625
+
626
+ FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
627
+ {
628
+ float32x2_t a33 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
629
+ float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
630
+ return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
631
+ }
632
+
633
+ FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
634
+ {
635
+ float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
636
+ float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
637
+ float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
638
+ float32x2_t b20 = vset_lane_f32(b2, b00, 1);
639
+ return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
640
+ }
641
+
642
+ FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
643
+ {
644
+ float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
645
+ float32_t b2 = vgetq_lane_f32(b, 2);
646
+ float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
647
+ float32x2_t b20 = vset_lane_f32(b2, b00, 1);
648
+ return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
649
+ }
650
+
651
+ FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
652
+ {
653
+ float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
654
+ float32_t b2 = vgetq_lane_f32(b, 2);
655
+ float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
656
+ float32x2_t b20 = vset_lane_f32(b2, b00, 1);
657
+ return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
658
+ }
659
+
660
+ // NEON does not support a general purpose permute intrinsic
661
+ // Currently I am not sure whether the C implementation is faster or slower than the NEON version.
662
+ // Note, this has to be expanded as a template because the shuffle value must be an immediate value.
663
+ // The same is true on SSE as well.
664
+ // Selects four specific single-precision, floating-point values from a and b, based on the mask i. https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
665
+ #if ENABLE_CPP_VERSION // I am not convinced that the NEON version is faster than the C version yet.
666
+ FORCE_INLINE __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, __constrange(0,255) int imm)
667
+ {
668
+ __m128 ret;
669
+ ret[0] = a[imm & 0x3];
670
+ ret[1] = a[(imm >> 2) & 0x3];
671
+ ret[2] = b[(imm >> 4) & 0x03];
672
+ ret[3] = b[(imm >> 6) & 0x03];
673
+ return ret;
674
+ }
675
+ #else
676
+ #define _mm_shuffle_ps_default(a, b, imm) \
677
+ ({ \
678
+ float32x4_t ret; \
679
+ ret = vmovq_n_f32(vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & 0x3)); \
680
+ ret = vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), ret, 1); \
681
+ ret = vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), ret, 2); \
682
+ ret = vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), ret, 3); \
683
+ vreinterpretq_m128_f32(ret); \
684
+ })
685
+ #endif
686
+
687
+ //FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) int imm)
688
+ #define _mm_shuffle_ps(a, b, imm) \
689
+ ({ \
690
+ __m128 ret; \
691
+ switch (imm) \
692
+ { \
693
+ case _MM_SHUFFLE(1, 0, 3, 2): ret = _mm_shuffle_ps_1032((a), (b)); break; \
694
+ case _MM_SHUFFLE(2, 3, 0, 1): ret = _mm_shuffle_ps_2301((a), (b)); break; \
695
+ case _MM_SHUFFLE(0, 3, 2, 1): ret = _mm_shuffle_ps_0321((a), (b)); break; \
696
+ case _MM_SHUFFLE(2, 1, 0, 3): ret = _mm_shuffle_ps_2103((a), (b)); break; \
697
+ case _MM_SHUFFLE(1, 0, 1, 0): ret = _mm_shuffle_ps_1010((a), (b)); break; \
698
+ case _MM_SHUFFLE(1, 0, 0, 1): ret = _mm_shuffle_ps_1001((a), (b)); break; \
699
+ case _MM_SHUFFLE(0, 1, 0, 1): ret = _mm_shuffle_ps_0101((a), (b)); break; \
700
+ case _MM_SHUFFLE(3, 2, 1, 0): ret = _mm_shuffle_ps_3210((a), (b)); break; \
701
+ case _MM_SHUFFLE(0, 0, 1, 1): ret = _mm_shuffle_ps_0011((a), (b)); break; \
702
+ case _MM_SHUFFLE(0, 0, 2, 2): ret = _mm_shuffle_ps_0022((a), (b)); break; \
703
+ case _MM_SHUFFLE(2, 2, 0, 0): ret = _mm_shuffle_ps_2200((a), (b)); break; \
704
+ case _MM_SHUFFLE(3, 2, 0, 2): ret = _mm_shuffle_ps_3202((a), (b)); break; \
705
+ case _MM_SHUFFLE(1, 1, 3, 3): ret = _mm_shuffle_ps_1133((a), (b)); break; \
706
+ case _MM_SHUFFLE(2, 0, 1, 0): ret = _mm_shuffle_ps_2010((a), (b)); break; \
707
+ case _MM_SHUFFLE(2, 0, 0, 1): ret = _mm_shuffle_ps_2001((a), (b)); break; \
708
+ case _MM_SHUFFLE(2, 0, 3, 2): ret = _mm_shuffle_ps_2032((a), (b)); break; \
709
+ default: ret = _mm_shuffle_ps_default((a), (b), (imm)); break; \
710
+ } \
711
+ ret; \
712
+ })
713
+
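Usage-wise (editorial note, not part of the diff): selectors with a dedicated case above compile to the matching helper, and anything else falls back to the generic lane-by-lane build.

    /* given any __m128 a, b: */
    __m128 r1 = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));  /* hits _mm_shuffle_ps_3210: {a[0], a[1], b[2], b[3]} */
    __m128 r2 = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 2, 0));  /* no matching case: _mm_shuffle_ps_default */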
714
+ // Takes the upper 64 bits of a and places it in the low end of the result
715
+ // Takes the lower 64 bits of a and places it into the high end of the result.
716
+ FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
717
+ {
718
+ int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
719
+ int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
720
+ return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
721
+ }
722
+
723
+ // takes the lower two 32-bit values from a and swaps them and places in low end of result
724
+ // takes the higher two 32 bit values from a and swaps them and places in high end of result.
725
+ FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
726
+ {
727
+ int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
728
+ int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
729
+ return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
730
+ }
731
+
732
+ // rotates the least significant 32 bits into the most significant 32 bits, and shifts the rest down
733
+ FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
734
+ {
735
+ return vreinterpretq_m128i_s32(vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
736
+ }
737
+
738
+ // rotates the most significant 32 bits into the least significant 32 bits, and shifts the rest up
739
+ FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
740
+ {
741
+ return vreinterpretq_m128i_s32(vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
742
+ }
743
+
744
+ // gets the lower 64 bits of a, and places it in the upper 64 bits
745
+ // gets the lower 64 bits of a and places it in the lower 64 bits
746
+ FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
747
+ {
748
+ int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
749
+ return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
750
+ }
751
+
752
+ // gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the lower 64 bits
753
+ // gets the lower 64 bits of a, and places it in the upper 64 bits
754
+ FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
755
+ {
756
+ int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
757
+ int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
758
+ return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
759
+ }
760
+
761
+ // gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the upper 64 bits
762
+ // gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the lower 64 bits
763
+ FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
764
+ {
765
+ int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
766
+ return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
767
+ }
768
+
769
+ FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
770
+ {
771
+ int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
772
+ int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
773
+ return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
774
+ }
775
+
776
+ FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
777
+ {
778
+ int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
779
+ int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
780
+ return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
781
+ }
782
+
783
+ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
784
+ {
785
+ int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
786
+ int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
787
+ return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
788
+ }
789
+
790
+ //FORCE_INLINE __m128i _mm_shuffle_epi32_default(__m128i a, __constrange(0,255) int imm)
791
+ #if ENABLE_CPP_VERSION
792
+ FORCE_INLINE __m128i _mm_shuffle_epi32_default(__m128i a, __constrange(0,255) int imm)
793
+ {
794
+ __m128i ret;
795
+ ret[0] = a[imm & 0x3];
796
+ ret[1] = a[(imm >> 2) & 0x3];
797
+ ret[2] = a[(imm >> 4) & 0x03];
798
+ ret[3] = a[(imm >> 6) & 0x03];
799
+ return ret;
800
+ }
801
+ #else
802
+ #define _mm_shuffle_epi32_default(a, imm) \
803
+ ({ \
804
+ int32x4_t ret; \
805
+ ret = vmovq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & 0x3)); \
806
+ ret = vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), ret, 1); \
807
+ ret = vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), ret, 2); \
808
+ ret = vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), ret, 3); \
809
+ vreinterpretq_m128i_s32(ret); \
810
+ })
811
+ #endif
812
+
813
+ //FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255) int imm)
814
+ #if defined(__aarch64__)
815
+ #define _mm_shuffle_epi32_splat(a, imm) \
816
+ ({ \
817
+ vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
818
+ })
819
+ #else
820
+ #define _mm_shuffle_epi32_splat(a, imm) \
821
+ ({ \
822
+ vreinterpretq_m128i_s32(vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
823
+ })
824
+ #endif
825
+
826
+ // Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
827
+ //FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, __constrange(0,255) int imm)
828
+ #define _mm_shuffle_epi32(a, imm) \
829
+ ({ \
830
+ __m128i ret; \
831
+ switch (imm) \
832
+ { \
833
+ case _MM_SHUFFLE(1, 0, 3, 2): ret = _mm_shuffle_epi_1032((a)); break; \
834
+ case _MM_SHUFFLE(2, 3, 0, 1): ret = _mm_shuffle_epi_2301((a)); break; \
835
+ case _MM_SHUFFLE(0, 3, 2, 1): ret = _mm_shuffle_epi_0321((a)); break; \
836
+ case _MM_SHUFFLE(2, 1, 0, 3): ret = _mm_shuffle_epi_2103((a)); break; \
837
+ case _MM_SHUFFLE(1, 0, 1, 0): ret = _mm_shuffle_epi_1010((a)); break; \
838
+ case _MM_SHUFFLE(1, 0, 0, 1): ret = _mm_shuffle_epi_1001((a)); break; \
839
+ case _MM_SHUFFLE(0, 1, 0, 1): ret = _mm_shuffle_epi_0101((a)); break; \
840
+ case _MM_SHUFFLE(2, 2, 1, 1): ret = _mm_shuffle_epi_2211((a)); break; \
841
+ case _MM_SHUFFLE(0, 1, 2, 2): ret = _mm_shuffle_epi_0122((a)); break; \
842
+ case _MM_SHUFFLE(3, 3, 3, 2): ret = _mm_shuffle_epi_3332((a)); break; \
843
+ case _MM_SHUFFLE(0, 0, 0, 0): ret = _mm_shuffle_epi32_splat((a),0); break; \
844
+ case _MM_SHUFFLE(1, 1, 1, 1): ret = _mm_shuffle_epi32_splat((a),1); break; \
845
+ case _MM_SHUFFLE(2, 2, 2, 2): ret = _mm_shuffle_epi32_splat((a),2); break; \
846
+ case _MM_SHUFFLE(3, 3, 3, 3): ret = _mm_shuffle_epi32_splat((a),3); break; \
847
+ default: ret = _mm_shuffle_epi32_default((a), (imm)); break; \
848
+ } \
849
+ ret; \
850
+ })
851
+
852
+ // Shuffles the upper 4 signed or unsigned 16 - bit integers in a as specified by imm. https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
853
+ //FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a, __constrange(0,255) int imm)
854
+ #define _mm_shufflehi_epi16_function(a, imm) \
855
+ ({ \
856
+ int16x8_t ret = vreinterpretq_s16_s32(a); \
857
+ int16x4_t highBits = vget_high_s16(ret); \
858
+ ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & 0x3), ret, 4); \
859
+ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, 5); \
860
+ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, 6); \
861
+ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, 7); \
862
+ vreinterpretq_s32_s16(ret); \
863
+ })
864
+
865
+ //FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, __constrange(0,255) int imm)
866
+ #define _mm_shufflehi_epi16(a, imm) \
867
+ _mm_shufflehi_epi16_function((a), (imm))
868
+
869
+
870
+ //added by hasindu
871
+ //Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while shifting in zeros. https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
872
+ #define _mm_slli_epi16(a, imm) \
873
+ ({ \
874
+ __m128i ret; \
875
+ if ((imm) <= 0) {\
876
+ ret = a; \
877
+ } \
878
+ else if ((imm) > 31) { \
879
+ ret = _mm_setzero_si128(); \
880
+ } \
881
+ else { \
882
+ ret = vreinterpretq_m128i_s16(vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
883
+ } \
884
+ ret; \
885
+ })
886
+
887
+
888
+
889
+ // Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while shifting in zeros. : https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
890
+ //FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
891
+ #define _mm_slli_epi32(a, imm) \
892
+ ({ \
893
+ __m128i ret; \
894
+ if ((imm) <= 0) {\
895
+ ret = a; \
896
+ } \
897
+ else if ((imm) > 31) { \
898
+ ret = _mm_setzero_si128(); \
899
+ } \
900
+ else { \
901
+ ret = vreinterpretq_m128i_s32(vshlq_n_s32(vreinterpretq_s32_m128i(a), (imm))); \
902
+ } \
903
+ ret; \
904
+ })
905
+
906
+
907
+ //added by hasindu
908
+ // Shifts the 8 signed or unsigned 16-bit integers in a right by count bits while shifting in zeros.
909
+ //https://msdn.microsoft.com/en-us/library/6tcwd38t(v=vs.90).aspx
910
+ #define _mm_srli_epi16(a, imm) \
911
+ ({ \
912
+ __m128i ret; \
913
+ if ((imm) <= 0) { \
914
+ ret = a; \
915
+ } \
916
+ else if ((imm)> 31) { \
917
+ ret = _mm_setzero_si128(); \
918
+ } \
919
+ else { \
920
+ ret = vreinterpretq_m128i_u16(vshrq_n_u16(vreinterpretq_u16_m128i(a), (imm))); \
921
+ } \
922
+ ret; \
923
+ })
924
+
925
+
926
+ //Shifts the 4 signed or unsigned 32-bit integers in a right by count bits while shifting in zeros. https://msdn.microsoft.com/en-us/library/w486zcfa(v=vs.100).aspx
927
+ //FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
928
+ #define _mm_srli_epi32(a, imm) \
929
+ ({ \
930
+ __m128i ret; \
931
+ if ((imm) <= 0) { \
932
+ ret = a; \
933
+ } \
934
+ else if ((imm)> 31) { \
935
+ ret = _mm_setzero_si128(); \
936
+ } \
937
+ else { \
938
+ ret = vreinterpretq_m128i_u32(vshrq_n_u32(vreinterpretq_u32_m128i(a), (imm))); \
939
+ } \
940
+ ret; \
941
+ })
942
+
943
+ // Shifts the 4 signed 32 - bit integers in a right by count bits while shifting in the sign bit. https://msdn.microsoft.com/en-us/library/z1939387(v=vs.100).aspx
944
+ //FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
945
+ #define _mm_srai_epi32(a, imm) \
946
+ ({ \
947
+ __m128i ret; \
948
+ if ((imm) <= 0) { \
949
+ ret = a; \
950
+ } \
951
+ else if ((imm) > 31) { \
952
+ ret = vreinterpretq_m128i_s32(vshrq_n_s32(vreinterpretq_s32_m128i(a), 16)); \
953
+ ret = vreinterpretq_m128i_s32(vshrq_n_s32(vreinterpretq_s32_m128i(ret), 16)); \
954
+ } \
955
+ else { \
956
+ ret = vreinterpretq_m128i_s32(vshrq_n_s32(vreinterpretq_s32_m128i(a), (imm))); \
957
+ } \
958
+ ret; \
959
+ })
960
+
961
+ // Shifts the 128 - bit value in a right by imm bytes while shifting in zeros.imm must be an immediate. https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
962
+ //FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
963
+ #define _mm_srli_si128(a, imm) \
964
+ ({ \
965
+ __m128i ret; \
966
+ if ((imm) <= 0) { \
967
+ ret = a; \
968
+ } \
969
+ else if ((imm) > 15) { \
970
+ ret = _mm_setzero_si128(); \
971
+ } \
972
+ else { \
973
+ ret = vreinterpretq_m128i_s8(vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
974
+ } \
975
+ ret; \
976
+ })
977
+
978
+ // Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm must be an immediate. https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
979
+ //FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
980
+ #define _mm_slli_si128(a, imm) \
981
+ ({ \
982
+ __m128i ret; \
983
+ if ((imm) <= 0) { \
984
+ ret = a; \
985
+ } \
986
+ else if ((imm) > 15) { \
987
+ ret = _mm_setzero_si128(); \
988
+ } \
989
+ else { \
990
+ ret = vreinterpretq_m128i_s8(vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
991
+ } \
992
+ ret; \
993
+ })
994
+
995
+ // NEON does not provide a version of this function, here is an article about some ways to repro the results.
996
+ // http://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
997
+ // Creates a 16-bit mask from the most significant bits of the 16 signed or unsigned 8-bit integers in a and zero extends the upper bits. https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
998
+ FORCE_INLINE int _mm_movemask_epi8(__m128i _a)
999
+ {
1000
+ uint8x16_t input = vreinterpretq_u8_m128i(_a);
1001
+ static const int8_t __attribute__((aligned(16))) xr[8] = { -7, -6, -5, -4, -3, -2, -1, 0 };
1002
+ uint8x8_t mask_and = vdup_n_u8(0x80);
1003
+ int8x8_t mask_shift = vld1_s8(xr);
1004
+
1005
+ uint8x8_t lo = vget_low_u8(input);
1006
+ uint8x8_t hi = vget_high_u8(input);
1007
+
1008
+ lo = vand_u8(lo, mask_and);
1009
+ lo = vshl_u8(lo, mask_shift);
1010
+
1011
+ hi = vand_u8(hi, mask_and);
1012
+ hi = vshl_u8(hi, mask_shift);
1013
+
1014
+ lo = vpadd_u8(lo, lo);
1015
+ lo = vpadd_u8(lo, lo);
1016
+ lo = vpadd_u8(lo, lo);
1017
+
1018
+ hi = vpadd_u8(hi, hi);
1019
+ hi = vpadd_u8(hi, hi);
1020
+ hi = vpadd_u8(hi, hi);
1021
+
1022
+ return ((hi[0] << 8) | (lo[0] & 0xFF));
1023
+ }
1024
+
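A short editorial trace of the low half (not part of the diff), showing why the shift-then-add trick works:

    /* low-half MSBs set in bytes 0 and 3:
       vand_u8 with 0x80       ->  {0x80, 0, 0, 0x80, 0, 0, 0, 0}
       vshl_u8 by {-7 .. 0}    ->  {0x01, 0, 0, 0x08, 0, 0, 0, 0}   byte i's MSB lands on bit i
       three vpadd_u8 passes   ->  lane 0 = 0x01 + 0x08 = 0x09      the set bits are distinct, so adding acts as OR
       the high half is reduced the same way, and the return value is (hi[0] << 8) | lo[0]. */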
1025
+
1026
+ // ******************************************
1027
+ // Math operations
1028
+ // ******************************************
1029
+
1030
+ // Subtracts the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
1031
+ FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
1032
+ {
1033
+ return vreinterpretq_m128_f32(vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1034
+ }
1035
+
1036
+ // Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or unsigned 32-bit integers of a. https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
1037
+ FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
1038
+ {
1039
+ return vreinterpretq_m128i_s32(vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1040
+ }
1041
+
1042
+ FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
1043
+ {
1044
+ return vreinterpretq_m128i_s16(vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
1045
+ }
1046
+
1047
+ //added by hasindu
1048
+ FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
1049
+ {
1050
+ return vreinterpretq_m128i_s8(vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
1051
+ }
1052
+
1053
+ //added by hasindu
1054
+ //Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit integers of a and saturates. https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
1055
+ FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
1056
+ {
1057
+ return vreinterpretq_m128i_u16(vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
1058
+ }
1059
+
1060
+ //added by hasindu
1061
+ //Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit integers of a and saturates. https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
1062
+ FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
1063
+ {
1064
+ return vreinterpretq_m128i_u8(vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
1065
+ }
1066
+
1067
+ // Adds the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
1068
+ FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
1069
+ {
1070
+ return vreinterpretq_m128_f32(vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1071
+ }
1072
+
1073
+ // adds the scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
1074
+ FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
1075
+ {
1076
+ float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
1077
+ float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
1078
+ //the upper values in the result must be the remnants of <a>.
1079
+ return vreinterpretq_m128_f32(vaddq_f32(a, value));
1080
+ }
1081
+
1082
+ // Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or unsigned 32-bit integers in b. https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
1083
+ FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
1084
+ {
1085
+ return vreinterpretq_m128i_s32(vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1086
+ }
1087
+
1088
+ // Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
1089
+ FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
1090
+ {
1091
+ return vreinterpretq_m128i_s16(vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
1092
+ }
1093
+
1094
+ //added by hasindu
1095
+ // Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or unsigned 8-bit integers in b. https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
1096
+ FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
1097
+ {
1098
+ return vreinterpretq_m128i_s8(vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
1099
+ }
1100
+
1101
+ //added by hasindu
1102
+ // Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b and saturates. https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
1103
+ FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
1104
+ {
1105
+ return vreinterpretq_m128i_s16(vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
1106
+ }
1107
+
1108
+ //added by hasindu
1109
+ //Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in b and saturates. https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
1110
+ FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
1111
+ {
1112
+ return vreinterpretq_m128i_u8(vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
1113
+ }
1114
+
1115
+
1116
+ // Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or unsigned 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
1117
+ FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
1118
+ {
1119
+ return vreinterpretq_m128i_s16(vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
1120
+ }
1121
+
1122
+ // Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or unsigned 32-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
1123
+ FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
1124
+ {
1125
+ return vreinterpretq_m128i_s32(vmulq_s32(vreinterpretq_s32_m128i(a),vreinterpretq_s32_m128i(b)));
1126
+ }
1127
+
1128
+ // Multiplies the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
1129
+ FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
1130
+ {
1131
+ return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1132
+ }
1133
+
1134
+ // Divides the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
1135
+ FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
1136
+ {
1137
+ float32x4_t recip0 = vrecpeq_f32(vreinterpretq_f32_m128(b));
1138
+ float32x4_t recip1 = vmulq_f32(recip0, vrecpsq_f32(recip0, vreinterpretq_f32_m128(b)));
1139
+ return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip1));
1140
+ }
1141
+
1142
+ // Divides the scalar single-precision floating point value of a by b. https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
1143
+ FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
1144
+ {
1145
+ float32_t value = vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
1146
+ return vreinterpretq_m128_f32(vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
1147
+ }
1148
+
1149
+ // This version performs n extra Newton-Raphson iterations to improve accuracy; between 1 and 4 is recommended.
1150
+ // Computes the approximations of reciprocals of the four single-precision, floating-point values of a. https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
1151
+ FORCE_INLINE __m128 recipq_newton(__m128 in, int n)
1152
+ {
1153
+ int i;
1154
+ float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
1155
+ for (i = 0; i < n; ++i)
1156
+ {
1157
+ recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
1158
+ }
1159
+ return vreinterpretq_m128_f32(recip);
1160
+ }
1161
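vrecpeq_f32 only gives a low-precision estimate of 1/x; every vrecpsq_f32 step above is one Newton-Raphson iteration, since vrecps(x, r) evaluates 2 - x*r and r * (2 - x*r) is the standard refinement formula. A scalar sketch of the same iteration (illustrative only; recip_step is not part of this header):

#include <stdio.h>

/* One Newton-Raphson step for r ~= 1/x: r' = r * (2 - x*r). */
static float recip_step(float x, float r)
{
    return r * (2.0f - x * r);
}

int main(void)
{
    float x = 3.0f, r = 0.3f;        /* crude initial estimate of 1/3 */
    for (int i = 0; i < 3; ++i)
        r = recip_step(x, r);
    printf("%.6f\n", r);             /* converges towards 0.333333 */
    return 0;
}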
+
1162
+ // Computes the approximations of reciprocals of the four single-precision, floating-point values of a. https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
1163
+ FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
1164
+ {
1165
+ float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
1166
+ recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
1167
+ return vreinterpretq_m128_f32(recip);
1168
+ }
1169
+
1170
+ // Computes the approximations of square roots of the four single-precision, floating-point values of a. First computes reciprocal square roots and then reciprocals of the four values. https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
1171
+ FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
1172
+ {
1173
+ float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
1174
+ float32x4_t sq = vrecpeq_f32(recipsq);
1175
+ // ??? use step versions of both sqrt and recip for better accuracy?
1176
+ return vreinterpretq_m128_f32(sq);
1177
+ }
1178
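One way to address the accuracy question in the comment above would be to refine the reciprocal-square-root estimate before using it; vrsqrts(a, b) evaluates (3 - a*b)/2, so one step of the usual idiom could look like the sketch below (an untested sketch assuming <arm_neon.h> is available; not part of this header):

#include <arm_neon.h>

/* sqrt(x) ~= x * rsqrt(x), with one Newton-Raphson step on the rsqrt estimate. */
static inline float32x4_t sqrtq_f32_refined(float32x4_t in)
{
    float32x4_t r = vrsqrteq_f32(in);                     /* rough 1/sqrt(in) */
    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(in, r), r));  /* one refinement step */
    return vmulq_f32(in, r);                              /* in * 1/sqrt(in) == sqrt(in) */
}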
+
1179
+ // Computes the approximation of the square root of the scalar single-precision floating point value of in. https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
1180
+ FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
1181
+ {
1182
+ float32_t value = vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
1183
+ return vreinterpretq_m128_f32(vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
1184
+ }
1185
+
1186
+ // Computes the approximations of the reciprocal square roots of the four single-precision floating point values of in. https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
1187
+ FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
1188
+ {
1189
+ return vreinterpretq_m128_f32(vrsqrteq_f32(vreinterpretq_f32_m128(in)));
1190
+ }
1191
+
1192
+ // Computes the maximums of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
1193
+ FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
1194
+ {
1195
+ return vreinterpretq_m128_f32(vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1196
+ }
1197
+
1198
+ // Computes the minima of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
1199
+ FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
1200
+ {
1201
+ return vreinterpretq_m128_f32(vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1202
+ }
1203
+
1204
+ // Computes the maximum of the two lower scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
1205
+ FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
1206
+ {
1207
+ float32_t value = vgetq_lane_f32(vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
1208
+ return vreinterpretq_m128_f32(vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
1209
+ }
1210
+
1211
+ // Computes the minimum of the two lower scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
1212
+ FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
1213
+ {
1214
+ float32_t value = vgetq_lane_f32(vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
1215
+ return vreinterpretq_m128_f32(vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
1216
+ }
1217
+
1218
+ //added by hasindu
1219
+ //Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the 16 unsigned 8-bit integers from b. https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
1220
+ FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
1221
+ {
1222
+ return vreinterpretq_m128i_u8(vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
1223
+ }
1224
+
1225
+ //added by hasindu
1226
+ //Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the 16 unsigned 8-bit integers from b. https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
1227
+ FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
1228
+ {
1229
+ return vreinterpretq_m128i_u8(vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
1230
+ }
1231
+
1232
+
1233
+ // Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 signed 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
1234
+ FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
1235
+ {
1236
+ return vreinterpretq_m128i_s16(vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
1237
+ }
1238
+
1239
+ //added by hasindu
1240
+ //Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8 signed 16-bit integers from b. https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
1241
+ FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
1242
+ {
1243
+ return vreinterpretq_m128i_s16(vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
1244
+ }
1245
+
1246
+
1247
+ // epi versions of min/max
1248
+ // Computes the pairwise maxima of the four signed 32-bit integer values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
1249
+ FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
1250
+ {
1251
+ return vreinterpretq_m128i_s32(vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1252
+ }
1253
+
1254
+ // Computes the pairwise minima of the four signed 32-bit integer values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
1255
+ FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
1256
+ {
1257
+ return vreinterpretq_m128i_s32(vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1258
+ }
1259
+
1260
+ // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
1261
+ FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
1262
+ {
1263
+ /* apoty: issue with large values because of result saturation */
1264
+ //int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)); /* =2*a*b */
1265
+ //return vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
1266
+ int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
1267
+ int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
1268
+ int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
1269
+ int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
1270
+ int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
1271
+ int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
1272
+ uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
1273
+ return vreinterpretq_m128i_u16(r.val[1]);
1274
+ }
1275
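The widening multiplies above build the full 32-bit products, and vuzpq then keeps only their high halves. Per lane, _mm_mulhi_epi16 is simply the upper 16 bits of a signed 16x16-bit product, as in this scalar sketch (mulhi_i16 is an illustrative name):

#include <stdint.h>
#include <stdio.h>

/* High 16 bits of the signed 16x16 -> 32-bit product, one lane at a time. */
static int16_t mulhi_i16(int16_t a, int16_t b)
{
    int32_t full = (int32_t)a * (int32_t)b;
    return (int16_t)(full >> 16);
}

int main(void)
{
    printf("%d\n", mulhi_i16(30000, 30000)); /* 900000000 >> 16 == 13732 */
    return 0;
}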
+
1276
+ // Computes the pairwise (horizontal) sums of the single-precision, floating-point values in a and b.
1277
+ //https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
1278
+ FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b )
1279
+ {
1280
+ #if defined(__aarch64__)
1281
+ return vreinterpretq_m128_f32(vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); //AArch64
1282
+ #else
1283
+ float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1284
+ float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
1285
+ float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1286
+ float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
1287
+ return vreinterpretq_m128_f32(vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
1288
+ #endif
1289
+ }
1290
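For reference, the lane layout produced above is {a0+a1, a2+a3, b0+b1, b2+b3}. A scalar model (hadd_ps_scalar is an illustrative name, not part of this header):

/* Scalar model of _mm_hadd_ps: pairwise sums of a, then pairwise sums of b. */
static void hadd_ps_scalar(const float a[4], const float b[4], float out[4])
{
    out[0] = a[0] + a[1];
    out[1] = a[2] + a[3];
    out[2] = b[0] + b[1];
    out[3] = b[2] + b[3];
}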
+
1291
+ // ******************************************
1292
+ // Compare operations
1293
+ // ******************************************
1294
+
1295
+ // Compares for less than. https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
1296
+ FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
1297
+ {
1298
+ return vreinterpretq_m128_u32(vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1299
+ }
1300
+
1301
+ // Compares for greater than. https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
1302
+ FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
1303
+ {
1304
+ return vreinterpretq_m128_u32(vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1305
+ }
1306
+
1307
+ // Compares for greater than or equal. https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
1308
+ FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
1309
+ {
1310
+ return vreinterpretq_m128_u32(vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1311
+ }
1312
+
1313
+ // Compares for less than or equal. https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
1314
+ FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
1315
+ {
1316
+ return vreinterpretq_m128_u32(vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1317
+ }
1318
+
1319
+ // Compares for equality. https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
1320
+ FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
1321
+ {
1322
+ return vreinterpretq_m128_u32(vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1323
+ }
1324
+
1325
+
1326
+ //added by hasindu
1327
+ //Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or unsigned 8-bit integers in b for equality. https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
1328
+ FORCE_INLINE __m128i _mm_cmpeq_epi8 (__m128i a, __m128i b)
1329
+ {
1330
+ return vreinterpretq_m128i_u8(vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
1331
+ }
1332
+
1333
+ //added by hasindu
1334
+ //Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or unsigned 16-bit integers in b for equality.
1335
+ //https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
1336
+ FORCE_INLINE __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b)
1337
+ {
1338
+ return vreinterpretq_m128i_u16(vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
1339
+ }
1340
+
1341
+ //added by hasindu
1342
+ //Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers in b for less than. https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
1343
+ FORCE_INLINE __m128i _mm_cmplt_epi8 (__m128i a, __m128i b)
1344
+ {
1345
+ return vreinterpretq_m128i_u8(vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
1346
+ }
1347
+
1348
+
1349
+ //added by hasindu
1350
+ //Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers in b for greater than. https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
1351
+ FORCE_INLINE __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b)
1352
+ {
1353
+ return vreinterpretq_m128i_u8(vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
1354
+ }
1355
+
1356
+ //added by hasindu
1357
+ //Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers in b for greater than. https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
1358
+ FORCE_INLINE __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b)
1359
+ {
1360
+ return vreinterpretq_m128i_u16(vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
1361
+ }
1362
+
1363
+
1364
+ // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for less than. https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
1365
+ FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
1366
+ {
1367
+ return vreinterpretq_m128i_u32(vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1368
+ }
1369
+
1370
+ // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for greater than. https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
1371
+ FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
1372
+ {
1373
+ return vreinterpretq_m128i_u32(vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1374
+ }
1375
+
1376
+ // Compares the four 32-bit floats in a and b lane by lane to check whether either value is NaN. An ordered compare returns true when both values are orderable and false when either is NaN ("not orderable"). https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx
1377
+ // see also:
1378
+ // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
1379
+ // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
1380
+ FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b )
1381
+ {
1382
+ // Note: NEON does not have ordered compare builtin
1383
+ // Need to compare a eq a and b eq b to check for NaN
1384
+ // Do AND of results to get final
1385
+ uint32x4_t ceqaa = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1386
+ uint32x4_t ceqbb = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1387
+ return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
1388
+ }
1389
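The trick above relies on NaN being the only value for which x == x is false, so ANDing the two self-comparisons produces the "both operands orderable" mask. A one-lane scalar sketch (cmpord_lane is an illustrative name):

#include <stdint.h>

/* A lane is "ordered" iff neither operand is NaN; NaN != NaN is what detects it. */
static uint32_t cmpord_lane(float a, float b)
{
    return (a == a && b == b) ? 0xFFFFFFFFu : 0u;
}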
+
1390
+ // Compares the lower single-precision floating point scalar values of a and b using a less than operation. : https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx
1391
+ // Important note!! The documentation on MSDN is incorrect! If either of the values is a NAN the docs say you will get a one, but in fact, it will return a zero!!
1392
+ FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
1393
+ {
1394
+ uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1395
+ uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1396
+ uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
1397
+ uint32x4_t a_lt_b = vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1398
+ return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0) ? 1 : 0;
1399
+ }
1400
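All six scalar comparisons in this group follow the same pattern: build "not NaN" masks from self-equality, combine them with the lane-0 comparison, and return 0 or 1. A scalar model of this particular wrapper (note that, as written above, it returns 1 when either input is NaN because the unordered mask is ORed in; comilt_scalar is an illustrative name):

/* Scalar model of the emulation above: NaN in either input -> 1. */
static int comilt_scalar(float a, float b)
{
    int either_nan = (a != a) || (b != b);
    return (either_nan || a < b) ? 1 : 0;
}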
+
1401
+ // Compares the lower single-precision floating point scalar values of a and b using a greater than operation. : https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
1402
+ FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
1403
+ {
1404
+ //return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
1405
+ uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1406
+ uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1407
+ uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1408
+ uint32x4_t a_gt_b = vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1409
+ return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
1410
+ }
1411
+
1412
+ // Compares the lower single-precision floating point scalar values of a and b using a less than or equal operation. : https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
1413
+ FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
1414
+ {
1415
+ //return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
1416
+ uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1417
+ uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1418
+ uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
1419
+ uint32x4_t a_le_b = vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1420
+ return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0) ? 1 : 0;
1421
+ }
1422
+
1423
+ // Compares the lower single-precision floating point scalar values of a and b using a greater than or equal operation. : https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
1424
+ FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
1425
+ {
1426
+ //return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
1427
+ uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1428
+ uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1429
+ uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1430
+ uint32x4_t a_ge_b = vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1431
+ return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
1432
+ }
1433
+
1434
+ // Compares the lower single-precision floating point scalar values of a and b using an equality operation. : https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
1435
+ FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
1436
+ {
1437
+ //return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
1438
+ uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1439
+ uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1440
+ uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
1441
+ uint32x4_t a_eq_b = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1442
+ return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0) ? 1 : 0;
1443
+ }
1444
+
1445
+ // Compares the lower single-precision floating point scalar values of a and b using an inequality operation. : https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
1446
+ FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
1447
+ {
1448
+ //return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
1449
+ uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1450
+ uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1451
+ uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1452
+ uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1453
+ return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0) ? 1 : 0;
1454
+ }
1455
+
1456
+ // According to the documentation, these intrinsics behave the same as the non-'u' versions, so we simply alias them here.
1457
+ #define _mm_ucomilt_ss _mm_comilt_ss
1458
+ #define _mm_ucomile_ss _mm_comile_ss
1459
+ #define _mm_ucomigt_ss _mm_comigt_ss
1460
+ #define _mm_ucomige_ss _mm_comige_ss
1461
+ #define _mm_ucomieq_ss _mm_comieq_ss
1462
+ #define _mm_ucomineq_ss _mm_comineq_ss
1463
+
1464
+ // ******************************************
1465
+ // Conversions
1466
+ // ******************************************
1467
+
1468
+ // Converts the four single-precision, floating-point values of a to signed 32-bit integer values using truncate. https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
1469
+ FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
1470
+ {
1471
+ return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
1472
+ }
1473
+
1474
+ // Converts the four signed 32-bit integer values of a to single-precision, floating-point values https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
1475
+ FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
1476
+ {
1477
+ return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
1478
+ }
1479
+
1480
+ // Converts the four unsigned 8-bit integers in the lower 32 bits to four unsigned 32-bit integers. https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
1481
+ FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
1482
+ {
1483
+ uint8x16_t u8x16 = vreinterpretq_u8_s32(a); /* xxxx xxxx xxxx DCBA */
1484
+ uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
1485
+ uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
1486
+ return vreinterpretq_s32_u32(u32x4);
1487
+ }
1488
+
1489
+ // Converts the four signed 16-bit integers in the lower 64 bits to four signed 32-bit integers. https://msdn.microsoft.com/en-us/library/bb514079%28v=vs.100%29.aspx
1490
+ FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
1491
+ {
1492
+ return vreinterpretq_m128i_s32(vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
1493
+ }
1494
+
1495
+ // Converts the four single-precision, floating-point values of a to signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
1496
+ // *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7 does not support!
1497
+ // It is supported on ARMv8 however.
1498
+ FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
1499
+ {
1500
+ #if defined(__aarch64__)
1501
+ return vcvtnq_s32_f32(a);
1502
+ #else
1503
+ uint32x4_t signmask = vdupq_n_u32(0x80000000);
1504
+ float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), vdupq_n_f32(0.5f)); /* +/- 0.5 */
1505
+ int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
1506
+ int32x4_t r_trunc = vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
1507
+ int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
1508
+ int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
1509
+ float32x4_t delta = vsubq_f32(vreinterpretq_f32_m128(a), vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
1510
+ uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
1511
+ return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal));
1512
+ #endif
1513
+ }
1514
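On ARMv7 the fallback above emulates round-half-to-even by hand: it computes both the "round half away from zero" result and a forced-even candidate, then selects the latter only when the fractional part is exactly +/-0.5. A scalar sketch of the same rule, valid for values well inside int range (round_half_even is an illustrative name):

#include <math.h>
#include <stdio.h>

/* Round to nearest, ties to even: 0.5 -> 0, 1.5 -> 2, 2.5 -> 2, -1.5 -> -2. */
static int round_half_even(float x)
{
    float t = truncf(x);                          /* [x], truncated toward zero */
    float frac = x - t;
    int lo = (int)t;
    int hi = lo + (x >= 0.0f ? 1 : -1);           /* the other neighbouring integer */
    if (frac == 0.5f || frac == -0.5f)            /* exact tie: pick the even neighbour */
        return (lo % 2 == 0) ? lo : hi;
    return (int)(x + (x >= 0.0f ? 0.5f : -0.5f)); /* otherwise round to nearest */
}

int main(void)
{
    printf("%d %d %d %d\n", round_half_even(0.5f), round_half_even(1.5f),
           round_half_even(2.5f), round_half_even(-1.5f)); /* 0 2 2 -2 */
    return 0;
}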
+
1515
+ // Moves the least significant 32 bits of a to a 32-bit integer. https://msdn.microsoft.com/en-us/library/5z7a9642%28v=vs.90%29.aspx
1516
+ FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
1517
+ {
1518
+ return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
1519
+ }
1520
+
1521
+ // Moves 32-bit integer a to the least significant 32 bits of an __m128i object, zero extending the upper bits. https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
1522
+ FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
1523
+ {
1524
+ return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
1525
+ }
1526
+
1527
+
1528
+ // Applies a type cast to reinterpret four 32-bit floating point values passed in as a 128-bit parameter as packed 32-bit integers. https://msdn.microsoft.com/en-us/library/bb514099.aspx
1529
+ FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
1530
+ {
1531
+ return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
1532
+ }
1533
+
1534
+ // Applies a type cast to reinterpret four 32-bit integers passed in as a 128-bit parameter as packed 32-bit floating point values. https://msdn.microsoft.com/en-us/library/bb514029.aspx
1535
+ FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
1536
+ {
1537
+ return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
1538
+ }
1539
+
1540
+ // Loads 128-bit value. : https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
1541
+ FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
1542
+ {
1543
+ return vreinterpretq_m128i_s32(vld1q_s32((int32_t *)p));
1544
+ }
1545
+
1546
+ //added by hasindu (verify whether 16-byte alignment is required here)
1547
+ // Loads 128-bit value. : https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
1548
+ FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
1549
+ {
1550
+ return vreinterpretq_m128i_s32(vld1q_s32((int32_t *)p));
1551
+ }
1552
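On the alignment question: vld1q_s32 does not require 16-byte alignment, so reusing the same NEON load for both _mm_load_si128 and _mm_loadu_si128 is safe for correctness (aligned access may still be faster). A hedged usage sketch, assuming this shim is included as "emmintrin.h" (the include path and load_from_offset are assumptions for illustration):

#include <stddef.h>
#include <stdint.h>
#include "emmintrin.h"   /* the sse2neon shim above */

/* With this emulation both loads lower to vld1q_s32; off need not be a multiple of 16. */
static __m128i load_from_offset(const uint8_t *buf, size_t off)
{
    return _mm_loadu_si128((const __m128i *)(buf + off));
}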
+
1553
+
1554
+ // ******************************************
1555
+ // Miscellaneous Operations
1556
+ // ******************************************
1557
+
1558
+ // Packs the 16 signed 16-bit integers from a and b into 8-bit integers and saturates. https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
1559
+ FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
1560
+ {
1561
+ return vreinterpretq_m128i_s8(vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), vqmovn_s16(vreinterpretq_s16_m128i(b))));
1562
+ }
1563
+
1564
+ // Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned integers and saturates. https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
1565
+ FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
1566
+ {
1567
+ return vreinterpretq_m128i_u8(vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)), vqmovun_s16(vreinterpretq_s16_m128i(b))));
1568
+ }
1569
+
1570
+ // Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers and saturates. https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
1571
+ FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
1572
+ {
1573
+ return vreinterpretq_m128i_s16(vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)), vqmovn_s32(vreinterpretq_s32_m128i(b))));
1574
+ }
1575
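All three pack wrappers narrow with saturation (vqmovn/vqmovun), so out-of-range lanes clamp rather than wrap. A scalar sketch of the signed-16-to-signed-8 narrowing used by _mm_packs_epi16 (packs_lane is an illustrative name):

#include <stdint.h>

/* Narrow one signed 16-bit lane to signed 8 bits with saturation (vqmovn_s16 semantics). */
static int8_t packs_lane(int16_t x)
{
    if (x > 127)  return 127;
    if (x < -128) return -128;
    return (int8_t)x;
}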
+
1576
+ // Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower 8 signed or unsigned 8-bit integers in b. https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
1577
+ FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
1578
+ {
1579
+ int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
1580
+ int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
1581
+ int8x8x2_t result = vzip_s8(a1, b1);
1582
+ return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
1583
+ }
1584
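The unpack wrappers all follow the same vzip pattern: take the low (or high) half of each operand, interleave, and recombine. In scalar terms the low-byte unpack yields a0, b0, a1, b1, ..., as in this sketch (unpacklo_u8 is an illustrative name):

#include <stdint.h>

/* Scalar model of _mm_unpacklo_epi8: interleave the low 8 bytes of a and b. */
static void unpacklo_u8(const uint8_t a[16], const uint8_t b[16], uint8_t out[16])
{
    for (int i = 0; i < 8; ++i) {
        out[2 * i]     = a[i];
        out[2 * i + 1] = b[i];
    }
}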
+
1585
+ // Interleaves the lower 4 signed or unsigned 16-bit integers in a with the lower 4 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
1586
+ FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
1587
+ {
1588
+ int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
1589
+ int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
1590
+ int16x4x2_t result = vzip_s16(a1, b1);
1591
+ return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
1592
+ }
1593
+
1594
+ // Interleaves the lower 2 signed or unsigned 32-bit integers in a with the lower 2 signed or unsigned 32-bit integers in b. https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
1595
+ FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
1596
+ {
1597
+ int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
1598
+ int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
1599
+ int32x2x2_t result = vzip_s32(a1, b1);
1600
+ return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
1601
+ }
1602
+
1603
+ // Selects and interleaves the lower two single-precision, floating-point values from a and b. https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
1604
+ FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
1605
+ {
1606
+ float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
1607
+ float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
1608
+ float32x2x2_t result = vzip_f32(a1, b1);
1609
+ return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
1610
+ }
1611
+
1612
+ // Selects and interleaves the upper two single-precision, floating-point values from a and b. https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
1613
+ FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
1614
+ {
1615
+ float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
1616
+ float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
1617
+ float32x2x2_t result = vzip_f32(a1, b1);
1618
+ return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
1619
+ }
1620
+
1621
+ // Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper 8 signed or unsigned 8-bit integers in b. https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
1622
+ FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
1623
+ {
1624
+ int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
1625
+ int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
1626
+ int8x8x2_t result = vzip_s8(a1, b1);
1627
+ return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
1628
+ }
1629
+
1630
+ // Interleaves the upper 4 signed or unsigned 16-bit integers in a with the upper 4 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
1631
+ FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
1632
+ {
1633
+ int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
1634
+ int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
1635
+ int16x4x2_t result = vzip_s16(a1, b1);
1636
+ return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
1637
+ }
1638
+
1639
+ // Interleaves the upper 2 signed or unsigned 32-bit integers in a with the upper 2 signed or unsigned 32-bit integers in b. https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
1640
+ FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
1641
+ {
1642
+ int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
1643
+ int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
1644
+ int32x2x2_t result = vzip_s32(a1, b1);
1645
+ return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
1646
+ }
1647
+
1648
+ // Extracts the selected signed or unsigned 16-bit integer from a and zero extends. https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
1649
+ //FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
1650
+ #define _mm_extract_epi16(a, imm) \
1651
+ ({ \
1652
+ (vgetq_lane_s16(vreinterpretq_s16_m128i(a), (imm)) & 0x0000ffffUL); \
1653
+ })
1654
+
1655
+ // Inserts the least significant 16 bits of b into the selected 16-bit integer of a. https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
1656
+ //FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, const int b, __constrange(0,8) int imm)
1657
+ #define _mm_insert_epi16(a, b, imm) \
1658
+ ({ \
1659
+ vreinterpretq_m128i_s16(vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
1660
+ })
1661
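Because these two are implemented as statement-expression macros around vgetq_lane/vsetq_lane, the lane index must be a compile-time constant, just as with the SSE intrinsics. A hedged usage sketch, assuming this header is already included (bump_lane3 is an illustrative name):

/* The lane index must be a literal/constant expression; a runtime index will not compile. */
static __m128i bump_lane3(__m128i v)
{
    int x = _mm_extract_epi16(v, 3);        /* read lane 3, zero-extended */
    return _mm_insert_epi16(v, x + 1, 3);   /* write lane 3 back, incremented */
}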
+
1662
+ // ******************************************
1663
+ // Streaming Extensions
1664
+ // ******************************************
1665
+
1666
+ // Guarantees that every preceding store is globally visible before any subsequent store. https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
1667
+ FORCE_INLINE void _mm_sfence(void)
1668
+ {
1669
+ __sync_synchronize();
1670
+ }
1671
+
1672
+ // Stores the data in a to the address p without polluting the caches. If the cache line containing address p is already in the cache, the cache will be updated. Address p must be 16-byte aligned. https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
1673
+ FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
1674
+ {
1675
+ *p = a;
1676
+ }
1677
+
1678
+ // Cache line containing p is flushed and invalidated from all caches in the coherency domain. : https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
1679
+ FORCE_INLINE void _mm_clflush(void const*p)
1680
+ {
1681
+ // no direct NEON equivalent, so this is a no-op
1682
+ }
1683
+
1684
+ #if defined(__GNUC__) || defined(__clang__)
1685
+ # pragma pop_macro("ALIGN_STRUCT")
1686
+ # pragma pop_macro("FORCE_INLINE")
1687
+ #endif
1688
+
1689
+ #endif