node-native-win-utils 1.1.1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174) hide show
  1. package/README.md +144 -27
  2. package/binding.gyp +18 -5
  3. package/dist/index.d.ts +146 -4
  4. package/dist/index.js +107 -3
  5. package/include/opencv2/core/affine.hpp +678 -0
  6. package/include/opencv2/core/async.hpp +105 -0
  7. package/include/opencv2/core/base.hpp +664 -0
  8. package/include/opencv2/core/bindings_utils.hpp +325 -0
  9. package/include/opencv2/core/bufferpool.hpp +40 -0
  10. package/include/opencv2/core/check.hpp +170 -0
  11. package/include/opencv2/core/core.hpp +48 -0
  12. package/include/opencv2/core/core_c.h +3128 -0
  13. package/include/opencv2/core/cuda/block.hpp +211 -0
  14. package/include/opencv2/core/cuda/border_interpolate.hpp +722 -0
  15. package/include/opencv2/core/cuda/color.hpp +309 -0
  16. package/include/opencv2/core/cuda/common.hpp +131 -0
  17. package/include/opencv2/core/cuda/datamov_utils.hpp +113 -0
  18. package/include/opencv2/core/cuda/detail/color_detail.hpp +2018 -0
  19. package/include/opencv2/core/cuda/detail/reduce.hpp +365 -0
  20. package/include/opencv2/core/cuda/detail/reduce_key_val.hpp +502 -0
  21. package/include/opencv2/core/cuda/detail/transform_detail.hpp +392 -0
  22. package/include/opencv2/core/cuda/detail/type_traits_detail.hpp +191 -0
  23. package/include/opencv2/core/cuda/detail/vec_distance_detail.hpp +121 -0
  24. package/include/opencv2/core/cuda/dynamic_smem.hpp +88 -0
  25. package/include/opencv2/core/cuda/emulation.hpp +269 -0
  26. package/include/opencv2/core/cuda/filters.hpp +293 -0
  27. package/include/opencv2/core/cuda/funcattrib.hpp +79 -0
  28. package/include/opencv2/core/cuda/functional.hpp +805 -0
  29. package/include/opencv2/core/cuda/limits.hpp +128 -0
  30. package/include/opencv2/core/cuda/reduce.hpp +209 -0
  31. package/include/opencv2/core/cuda/saturate_cast.hpp +292 -0
  32. package/include/opencv2/core/cuda/scan.hpp +258 -0
  33. package/include/opencv2/core/cuda/simd_functions.hpp +869 -0
  34. package/include/opencv2/core/cuda/transform.hpp +75 -0
  35. package/include/opencv2/core/cuda/type_traits.hpp +90 -0
  36. package/include/opencv2/core/cuda/utility.hpp +230 -0
  37. package/include/opencv2/core/cuda/vec_distance.hpp +232 -0
  38. package/include/opencv2/core/cuda/vec_math.hpp +923 -0
  39. package/include/opencv2/core/cuda/vec_traits.hpp +288 -0
  40. package/include/opencv2/core/cuda/warp.hpp +139 -0
  41. package/include/opencv2/core/cuda/warp_reduce.hpp +76 -0
  42. package/include/opencv2/core/cuda/warp_shuffle.hpp +162 -0
  43. package/include/opencv2/core/cuda.hpp +1279 -0
  44. package/include/opencv2/core/cuda.inl.hpp +763 -0
  45. package/include/opencv2/core/cuda_stream_accessor.hpp +86 -0
  46. package/include/opencv2/core/cuda_types.hpp +144 -0
  47. package/include/opencv2/core/cv_cpu_dispatch.h +381 -0
  48. package/include/opencv2/core/cv_cpu_helper.h +550 -0
  49. package/include/opencv2/core/cvdef.h +973 -0
  50. package/include/opencv2/core/cvstd.hpp +190 -0
  51. package/include/opencv2/core/cvstd.inl.hpp +197 -0
  52. package/include/opencv2/core/cvstd_wrapper.hpp +154 -0
  53. package/include/opencv2/core/detail/async_promise.hpp +71 -0
  54. package/include/opencv2/core/detail/dispatch_helper.impl.hpp +49 -0
  55. package/include/opencv2/core/detail/exception_ptr.hpp +27 -0
  56. package/include/opencv2/core/directx.hpp +184 -0
  57. package/include/opencv2/core/dualquaternion.hpp +979 -0
  58. package/include/opencv2/core/dualquaternion.inl.hpp +487 -0
  59. package/include/opencv2/core/eigen.hpp +402 -0
  60. package/include/opencv2/core/fast_math.hpp +433 -0
  61. package/include/opencv2/core/hal/hal.hpp +256 -0
  62. package/include/opencv2/core/hal/interface.h +190 -0
  63. package/include/opencv2/core/hal/intrin.hpp +939 -0
  64. package/include/opencv2/core/hal/intrin_avx.hpp +3177 -0
  65. package/include/opencv2/core/hal/intrin_avx512.hpp +3090 -0
  66. package/include/opencv2/core/hal/intrin_cpp.hpp +3321 -0
  67. package/include/opencv2/core/hal/intrin_forward.hpp +191 -0
  68. package/include/opencv2/core/hal/intrin_lasx.hpp +3236 -0
  69. package/include/opencv2/core/hal/intrin_msa.hpp +1887 -0
  70. package/include/opencv2/core/hal/intrin_neon.hpp +2610 -0
  71. package/include/opencv2/core/hal/intrin_rvv.hpp +3320 -0
  72. package/include/opencv2/core/hal/intrin_rvv071.hpp +2545 -0
  73. package/include/opencv2/core/hal/intrin_rvv_scalable.hpp +2080 -0
  74. package/include/opencv2/core/hal/intrin_sse.hpp +3467 -0
  75. package/include/opencv2/core/hal/intrin_sse_em.hpp +180 -0
  76. package/include/opencv2/core/hal/intrin_vsx.hpp +1608 -0
  77. package/include/opencv2/core/hal/intrin_wasm.hpp +2782 -0
  78. package/include/opencv2/core/hal/msa_macros.h +1558 -0
  79. package/include/opencv2/core/hal/simd_utils.impl.hpp +186 -0
  80. package/include/opencv2/core/llapi/llapi.h +102 -0
  81. package/include/opencv2/core/mat.hpp +3775 -0
  82. package/include/opencv2/core/mat.inl.hpp +3422 -0
  83. package/include/opencv2/core/matx.hpp +1536 -0
  84. package/include/opencv2/core/neon_utils.hpp +128 -0
  85. package/include/opencv2/core/ocl.hpp +917 -0
  86. package/include/opencv2/core/ocl_genbase.hpp +69 -0
  87. package/include/opencv2/core/opencl/ocl_defs.hpp +82 -0
  88. package/include/opencv2/core/opencl/opencl_info.hpp +212 -0
  89. package/include/opencv2/core/opencl/opencl_svm.hpp +81 -0
  90. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_clblas.hpp +602 -0
  91. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_clfft.hpp +146 -0
  92. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_core.hpp +371 -0
  93. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_core_wrappers.hpp +272 -0
  94. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl.hpp +62 -0
  95. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl_wrappers.hpp +42 -0
  96. package/include/opencv2/core/opencl/runtime/opencl_clblas.hpp +53 -0
  97. package/include/opencv2/core/opencl/runtime/opencl_clfft.hpp +53 -0
  98. package/include/opencv2/core/opencl/runtime/opencl_core.hpp +84 -0
  99. package/include/opencv2/core/opencl/runtime/opencl_core_wrappers.hpp +47 -0
  100. package/include/opencv2/core/opencl/runtime/opencl_gl.hpp +53 -0
  101. package/include/opencv2/core/opencl/runtime/opencl_gl_wrappers.hpp +47 -0
  102. package/include/opencv2/core/opencl/runtime/opencl_svm_20.hpp +48 -0
  103. package/include/opencv2/core/opencl/runtime/opencl_svm_definitions.hpp +42 -0
  104. package/include/opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp +166 -0
  105. package/include/opencv2/core/opengl.hpp +733 -0
  106. package/include/opencv2/core/openvx/ovx_defs.hpp +48 -0
  107. package/include/opencv2/core/operations.hpp +610 -0
  108. package/include/opencv2/core/optim.hpp +302 -0
  109. package/include/opencv2/core/ovx.hpp +28 -0
  110. package/include/opencv2/core/parallel/backend/parallel_for.openmp.hpp +72 -0
  111. package/include/opencv2/core/parallel/backend/parallel_for.tbb.hpp +153 -0
  112. package/include/opencv2/core/parallel/parallel_backend.hpp +90 -0
  113. package/include/opencv2/core/persistence.hpp +1350 -0
  114. package/include/opencv2/core/private/cv_cpu_include_simd_declarations.hpp +30 -0
  115. package/include/opencv2/core/private.cuda.hpp +169 -0
  116. package/include/opencv2/core/private.hpp +896 -0
  117. package/include/opencv2/core/quaternion.hpp +1696 -0
  118. package/include/opencv2/core/quaternion.inl.hpp +1063 -0
  119. package/include/opencv2/core/saturate.hpp +180 -0
  120. package/include/opencv2/core/simd_intrinsics.hpp +87 -0
  121. package/include/opencv2/core/softfloat.hpp +514 -0
  122. package/include/opencv2/core/sse_utils.hpp +652 -0
  123. package/include/opencv2/core/traits.hpp +417 -0
  124. package/include/opencv2/core/types.hpp +2457 -0
  125. package/include/opencv2/core/types_c.h +2126 -0
  126. package/include/opencv2/core/utility.hpp +1229 -0
  127. package/include/opencv2/core/utils/allocator_stats.hpp +29 -0
  128. package/include/opencv2/core/utils/allocator_stats.impl.hpp +158 -0
  129. package/include/opencv2/core/utils/buffer_area.private.hpp +136 -0
  130. package/include/opencv2/core/utils/configuration.private.hpp +22 -0
  131. package/include/opencv2/core/utils/filesystem.hpp +82 -0
  132. package/include/opencv2/core/utils/filesystem.private.hpp +66 -0
  133. package/include/opencv2/core/utils/fp_control.private.hpp +29 -0
  134. package/include/opencv2/core/utils/fp_control_utils.hpp +69 -0
  135. package/include/opencv2/core/utils/instrumentation.hpp +125 -0
  136. package/include/opencv2/core/utils/lock.private.hpp +119 -0
  137. package/include/opencv2/core/utils/logger.defines.hpp +42 -0
  138. package/include/opencv2/core/utils/logger.hpp +218 -0
  139. package/include/opencv2/core/utils/logtag.hpp +28 -0
  140. package/include/opencv2/core/utils/plugin_loader.private.hpp +165 -0
  141. package/include/opencv2/core/utils/tls.hpp +235 -0
  142. package/include/opencv2/core/utils/trace.hpp +252 -0
  143. package/include/opencv2/core/utils/trace.private.hpp +421 -0
  144. package/include/opencv2/core/va_intel.hpp +75 -0
  145. package/include/opencv2/core/version.hpp +26 -0
  146. package/include/opencv2/core/vsx_utils.hpp +1047 -0
  147. package/include/opencv2/core.hpp +3365 -0
  148. package/include/opencv2/imgcodecs/imgcodecs.hpp +48 -0
  149. package/include/opencv2/imgcodecs/imgcodecs_c.h +1 -0
  150. package/include/opencv2/imgcodecs/ios.h +59 -0
  151. package/include/opencv2/imgcodecs/legacy/constants_c.h +54 -0
  152. package/include/opencv2/imgcodecs/macosx.h +20 -0
  153. package/include/opencv2/imgcodecs.hpp +407 -0
  154. package/include/opencv2/imgproc/bindings.hpp +34 -0
  155. package/include/opencv2/imgproc/detail/gcgraph.hpp +395 -0
  156. package/include/opencv2/imgproc/hal/hal.hpp +246 -0
  157. package/include/opencv2/imgproc/hal/interface.h +46 -0
  158. package/include/opencv2/imgproc/imgproc.hpp +48 -0
  159. package/include/opencv2/imgproc/imgproc_c.h +1177 -0
  160. package/include/opencv2/imgproc/segmentation.hpp +141 -0
  161. package/include/opencv2/imgproc/types_c.h +659 -0
  162. package/include/opencv2/imgproc.hpp +5035 -0
  163. package/include/opencv2/opencv_modules.hpp +17 -0
  164. package/libs/libjpeg-turbo.lib +0 -0
  165. package/libs/libpng.lib +0 -0
  166. package/libs/opencv_core470.lib +0 -0
  167. package/libs/opencv_imgcodecs470.lib +0 -0
  168. package/libs/opencv_imgproc470.lib +0 -0
  169. package/libs/zlib.lib +0 -0
  170. package/package.json +8 -2
  171. package/prebuilds/win32-x64/node.napi.node +0 -0
  172. package/src/cpp/capturewindow.cpp +36 -46
  173. package/src/cpp/main.cpp +10 -2
  174. package/src/cpp/opencv.cpp +425 -0
@@ -0,0 +1,1047 @@
1
+ // This file is part of OpenCV project.
2
+ // It is subject to the license terms in the LICENSE file found in the top-level directory
3
+ // of this distribution and at http://opencv.org/license.html
4
+
5
+ #ifndef OPENCV_HAL_VSX_UTILS_HPP
6
+ #define OPENCV_HAL_VSX_UTILS_HPP
7
+
8
+ #include "opencv2/core/cvdef.h"
9
+
10
+ #ifndef SKIP_INCLUDES
11
+ # include <assert.h>
12
+ #endif
13
+
14
+ //! @addtogroup core_utils_vsx
15
+ //! @{
16
+ #if CV_VSX
17
+
18
+ #define __VSX_S16__(c, v) (c){v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}
19
+ #define __VSX_S8__(c, v) (c){v, v, v, v, v, v, v, v}
20
+ #define __VSX_S4__(c, v) (c){v, v, v, v}
21
+ #define __VSX_S2__(c, v) (c){v, v}
22
+
23
+ typedef __vector unsigned char vec_uchar16;
24
+ #define vec_uchar16_set(...) (vec_uchar16){__VA_ARGS__}
25
+ #define vec_uchar16_sp(c) (__VSX_S16__(vec_uchar16, (unsigned char)c))
26
+ #define vec_uchar16_c(v) ((vec_uchar16)(v))
27
+ #define vec_uchar16_z vec_uchar16_sp(0)
28
+
29
+ typedef __vector signed char vec_char16;
30
+ #define vec_char16_set(...) (vec_char16){__VA_ARGS__}
31
+ #define vec_char16_sp(c) (__VSX_S16__(vec_char16, (signed char)c))
32
+ #define vec_char16_c(v) ((vec_char16)(v))
33
+ #define vec_char16_z vec_char16_sp(0)
34
+
35
+ typedef __vector unsigned short vec_ushort8;
36
+ #define vec_ushort8_set(...) (vec_ushort8){__VA_ARGS__}
37
+ #define vec_ushort8_sp(c) (__VSX_S8__(vec_ushort8, (unsigned short)c))
38
+ #define vec_ushort8_c(v) ((vec_ushort8)(v))
39
+ #define vec_ushort8_z vec_ushort8_sp(0)
40
+
41
+ typedef __vector signed short vec_short8;
42
+ #define vec_short8_set(...) (vec_short8){__VA_ARGS__}
43
+ #define vec_short8_sp(c) (__VSX_S8__(vec_short8, (signed short)c))
44
+ #define vec_short8_c(v) ((vec_short8)(v))
45
+ #define vec_short8_z vec_short8_sp(0)
46
+
47
+ typedef __vector unsigned int vec_uint4;
48
+ #define vec_uint4_set(...) (vec_uint4){__VA_ARGS__}
49
+ #define vec_uint4_sp(c) (__VSX_S4__(vec_uint4, (unsigned int)c))
50
+ #define vec_uint4_c(v) ((vec_uint4)(v))
51
+ #define vec_uint4_z vec_uint4_sp(0)
52
+
53
+ typedef __vector signed int vec_int4;
54
+ #define vec_int4_set(...) (vec_int4){__VA_ARGS__}
55
+ #define vec_int4_sp(c) (__VSX_S4__(vec_int4, (signed int)c))
56
+ #define vec_int4_c(v) ((vec_int4)(v))
57
+ #define vec_int4_z vec_int4_sp(0)
58
+
59
+ typedef __vector float vec_float4;
60
+ #define vec_float4_set(...) (vec_float4){__VA_ARGS__}
61
+ #define vec_float4_sp(c) (__VSX_S4__(vec_float4, c))
62
+ #define vec_float4_c(v) ((vec_float4)(v))
63
+ #define vec_float4_z vec_float4_sp(0)
64
+
65
+ typedef __vector unsigned long long vec_udword2;
66
+ #define vec_udword2_set(...) (vec_udword2){__VA_ARGS__}
67
+ #define vec_udword2_sp(c) (__VSX_S2__(vec_udword2, (unsigned long long)c))
68
+ #define vec_udword2_c(v) ((vec_udword2)(v))
69
+ #define vec_udword2_z vec_udword2_sp(0)
70
+
71
+ typedef __vector signed long long vec_dword2;
72
+ #define vec_dword2_set(...) (vec_dword2){__VA_ARGS__}
73
+ #define vec_dword2_sp(c) (__VSX_S2__(vec_dword2, (signed long long)c))
74
+ #define vec_dword2_c(v) ((vec_dword2)(v))
75
+ #define vec_dword2_z vec_dword2_sp(0)
76
+
77
+ typedef __vector double vec_double2;
78
+ #define vec_double2_set(...) (vec_double2){__VA_ARGS__}
79
+ #define vec_double2_c(v) ((vec_double2)(v))
80
+ #define vec_double2_sp(c) (__VSX_S2__(vec_double2, c))
81
+ #define vec_double2_z vec_double2_sp(0)
82
+
83
+ #define vec_bchar16 __vector __bool char
84
+ #define vec_bchar16_set(...) (vec_bchar16){__VA_ARGS__}
85
+ #define vec_bchar16_c(v) ((vec_bchar16)(v))
86
+
87
+ #define vec_bshort8 __vector __bool short
88
+ #define vec_bshort8_set(...) (vec_bshort8){__VA_ARGS__}
89
+ #define vec_bshort8_c(v) ((vec_bshort8)(v))
90
+
91
+ #define vec_bint4 __vector __bool int
92
+ #define vec_bint4_set(...) (vec_bint4){__VA_ARGS__}
93
+ #define vec_bint4_c(v) ((vec_bint4)(v))
94
+
95
+ #define vec_bdword2 __vector __bool long long
96
+ #define vec_bdword2_set(...) (vec_bdword2){__VA_ARGS__}
97
+ #define vec_bdword2_c(v) ((vec_bdword2)(v))
98
+
99
+ #define VSX_FINLINE(tp) extern inline tp __attribute__((always_inline))
100
+
101
+ #define VSX_REDIRECT_1RG(rt, rg, fnm, fn2) \
102
+ VSX_FINLINE(rt) fnm(const rg& a) { return fn2(a); }
103
+
104
+ #define VSX_REDIRECT_2RG(rt, rg, fnm, fn2) \
105
+ VSX_FINLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); }
106
+
107
+ /*
108
+ * GCC VSX compatibility
109
+ **/
110
+ #if defined(__GNUG__) && !defined(__clang__)
111
+
112
+ // inline asm helper
113
+ #define VSX_IMPL_1RG(rt, rg, opc, fnm) \
114
+ VSX_FINLINE(rt) fnm(const rg& a) \
115
+ { rt rs; __asm__ __volatile__(#opc" %x0,%x1" : "=wa" (rs) : "wa" (a)); return rs; }
116
+
117
+ #define VSX_IMPL_1VRG(rt, rg, opc, fnm) \
118
+ VSX_FINLINE(rt) fnm(const rg& a) \
119
+ { rt rs; __asm__ __volatile__(#opc" %0,%1" : "=v" (rs) : "v" (a)); return rs; }
120
+
121
+ #define VSX_IMPL_2VRG_F(rt, rg, fopc, fnm) \
122
+ VSX_FINLINE(rt) fnm(const rg& a, const rg& b) \
123
+ { rt rs; __asm__ __volatile__(fopc : "=v" (rs) : "v" (a), "v" (b)); return rs; }
124
+
125
+ #define VSX_IMPL_2VRG(rt, rg, opc, fnm) VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%1,%2", fnm)
126
+
127
+ #if __GNUG__ < 8
128
+
129
+ // Support for int4 -> dword2 expanding multiply was added in GCC 8.
130
+ #ifdef vec_mule
131
+ #undef vec_mule
132
+ #endif
133
+ #ifdef vec_mulo
134
+ #undef vec_mulo
135
+ #endif
136
+
137
+ VSX_REDIRECT_2RG(vec_ushort8, vec_uchar16, vec_mule, __builtin_vec_mule)
138
+ VSX_REDIRECT_2RG(vec_short8, vec_char16, vec_mule, __builtin_vec_mule)
139
+ VSX_REDIRECT_2RG(vec_int4, vec_short8, vec_mule, __builtin_vec_mule)
140
+ VSX_REDIRECT_2RG(vec_uint4, vec_ushort8, vec_mule, __builtin_vec_mule)
141
+ VSX_REDIRECT_2RG(vec_ushort8, vec_uchar16, vec_mulo, __builtin_vec_mulo)
142
+ VSX_REDIRECT_2RG(vec_short8, vec_char16, vec_mulo, __builtin_vec_mulo)
143
+ VSX_REDIRECT_2RG(vec_int4, vec_short8, vec_mulo, __builtin_vec_mulo)
144
+ VSX_REDIRECT_2RG(vec_uint4, vec_ushort8, vec_mulo, __builtin_vec_mulo)
145
+
146
+ // dword2 support arrived in ISA 2.07 and GCC 8+
147
+ VSX_IMPL_2VRG(vec_dword2, vec_int4, vmulosw, vec_mule)
148
+ VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmulouw, vec_mule)
149
+ VSX_IMPL_2VRG(vec_dword2, vec_int4, vmulesw, vec_mulo)
150
+ VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmuleuw, vec_mulo)
151
+
152
+ #endif
153
+
154
+ #if __GNUG__ < 7
155
+ // up to GCC 6 vec_mul only supports precisions and llong
156
+ # ifdef vec_mul
157
+ # undef vec_mul
158
+ # endif
159
+ /*
160
+ * there is no direct instruction supporting 8-bit or 16-bit multiplication in ISA 2.07,
161
+ * so, as XLC does, implement it using the "multiply even", "multiply odd" and "permute" instructions
162
+ **/
163
+ # define VSX_IMPL_MULH(Tvec, cperm) \
164
+ VSX_FINLINE(Tvec) vec_mul(const Tvec& a, const Tvec& b) \
165
+ { \
166
+ static const vec_uchar16 ev_od = {cperm}; \
167
+ return vec_perm((Tvec)vec_mule(a, b), (Tvec)vec_mulo(a, b), ev_od); \
168
+ }
169
+ #define VSX_IMPL_MULH_P16 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
170
+ VSX_IMPL_MULH(vec_char16, VSX_IMPL_MULH_P16)
171
+ VSX_IMPL_MULH(vec_uchar16, VSX_IMPL_MULH_P16)
172
+ #define VSX_IMPL_MULH_P8 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29
173
+ VSX_IMPL_MULH(vec_short8, VSX_IMPL_MULH_P8)
174
+ VSX_IMPL_MULH(vec_ushort8, VSX_IMPL_MULH_P8)
175
+ // vmuluwm can be used for both unsigned and signed integers (per the ISA documentation)
176
+ VSX_IMPL_2VRG(vec_int4, vec_int4, vmuluwm, vec_mul)
177
+ VSX_IMPL_2VRG(vec_uint4, vec_uint4, vmuluwm, vec_mul)
178
+ // redirect to GCC builtin vec_mul, since it already supports precisions and llong
179
+ VSX_REDIRECT_2RG(vec_float4, vec_float4, vec_mul, __builtin_vec_mul)
180
+ VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mul, __builtin_vec_mul)
181
+ VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mul, __builtin_vec_mul)
182
+ VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mul, __builtin_vec_mul)
183
+ #endif // __GNUG__ < 7
184
+
185
+ #if __GNUG__ < 6
186
+ /*
187
+ * Instruction "compare greater than or equal" in ISA 2.07 only supports single
188
+ * and double precision.
189
+ * XLC and newer versions of GCC implement the integer variants using the "greater than" instruction followed by NOR.
190
+ **/
191
+ # ifdef vec_cmpge
192
+ # undef vec_cmpge
193
+ # endif
194
+ # ifdef vec_cmple
195
+ # undef vec_cmple
196
+ # endif
197
+ # define vec_cmple(a, b) vec_cmpge(b, a)
198
+ # define VSX_IMPL_CMPGE(rt, rg, opc, fnm) \
199
+ VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%2,%1\n\t xxlnor %x0,%x0,%x0", fnm)
200
+
201
+ VSX_IMPL_CMPGE(vec_bchar16, vec_char16, vcmpgtsb, vec_cmpge)
202
+ VSX_IMPL_CMPGE(vec_bchar16, vec_uchar16, vcmpgtub, vec_cmpge)
203
+ VSX_IMPL_CMPGE(vec_bshort8, vec_short8, vcmpgtsh, vec_cmpge)
204
+ VSX_IMPL_CMPGE(vec_bshort8, vec_ushort8, vcmpgtuh, vec_cmpge)
205
+ VSX_IMPL_CMPGE(vec_bint4, vec_int4, vcmpgtsw, vec_cmpge)
206
+ VSX_IMPL_CMPGE(vec_bint4, vec_uint4, vcmpgtuw, vec_cmpge)
207
+ VSX_IMPL_CMPGE(vec_bdword2, vec_dword2, vcmpgtsd, vec_cmpge)
208
+ VSX_IMPL_CMPGE(vec_bdword2, vec_udword2, vcmpgtud, vec_cmpge)
209
+
210
+ // redirect to GCC builtin cmpge, since it already supports precisions
211
+ VSX_REDIRECT_2RG(vec_bint4, vec_float4, vec_cmpge, __builtin_vec_cmpge)
212
+ VSX_REDIRECT_2RG(vec_bdword2, vec_double2, vec_cmpge, __builtin_vec_cmpge)
213
+
214
+ // up to gcc5 vec_nor doesn't support bool long long
215
+ # undef vec_nor
216
+ template<typename T>
217
+ VSX_REDIRECT_2RG(T, T, vec_nor, __builtin_vec_nor)
218
+
219
+ VSX_FINLINE(vec_bdword2) vec_nor(const vec_bdword2& a, const vec_bdword2& b)
220
+ { return vec_bdword2_c(__builtin_vec_nor(vec_dword2_c(a), vec_dword2_c(b))); }
221
+
222
+ // vec_packs doesn't support double words in gcc4 and old versions of gcc5
223
+ # undef vec_packs
224
+ VSX_REDIRECT_2RG(vec_char16, vec_short8, vec_packs, __builtin_vec_packs)
225
+ VSX_REDIRECT_2RG(vec_uchar16, vec_ushort8, vec_packs, __builtin_vec_packs)
226
+ VSX_REDIRECT_2RG(vec_short8, vec_int4, vec_packs, __builtin_vec_packs)
227
+ VSX_REDIRECT_2RG(vec_ushort8, vec_uint4, vec_packs, __builtin_vec_packs)
228
+
229
+ VSX_IMPL_2VRG_F(vec_int4, vec_dword2, "vpksdss %0,%2,%1", vec_packs)
230
+ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
231
+ #endif // __GNUG__ < 6
232
+
233
+ #if __GNUG__ < 5
234
+ // vec_xxpermdi in gcc4 is missing little-endian support, just like clang
235
+ # define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ (((c) & 1) << 1 | (c) >> 1)))
236
+ // same as vec_xxpermdi
237
+ # undef vec_vbpermq
238
+ VSX_IMPL_2VRG(vec_udword2, vec_uchar16, vbpermq, vec_vbpermq)
239
+ VSX_IMPL_2VRG(vec_dword2, vec_char16, vbpermq, vec_vbpermq)
240
+ #else
241
+ # define vec_permi vec_xxpermdi
242
+ #endif // __GNUG__ < 5
243
+
244
+ // shift left double by word immediate
245
+ #ifndef vec_sldw
246
+ # define vec_sldw __builtin_vsx_xxsldwi
247
+ #endif
248
+
249
+ // vector population count
250
+ VSX_IMPL_1VRG(vec_uchar16, vec_uchar16, vpopcntb, vec_popcntu)
251
+ VSX_IMPL_1VRG(vec_uchar16, vec_char16, vpopcntb, vec_popcntu)
252
+ VSX_IMPL_1VRG(vec_ushort8, vec_ushort8, vpopcnth, vec_popcntu)
253
+ VSX_IMPL_1VRG(vec_ushort8, vec_short8, vpopcnth, vec_popcntu)
254
+ VSX_IMPL_1VRG(vec_uint4, vec_uint4, vpopcntw, vec_popcntu)
255
+ VSX_IMPL_1VRG(vec_uint4, vec_int4, vpopcntw, vec_popcntu)
256
+ VSX_IMPL_1VRG(vec_udword2, vec_udword2, vpopcntd, vec_popcntu)
257
+ VSX_IMPL_1VRG(vec_udword2, vec_dword2, vpopcntd, vec_popcntu)
258
+
259
+ // converts between single and double-precision
260
+ VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvfo, __builtin_vsx_xvcvdpsp)
261
+ VSX_REDIRECT_1RG(vec_double2, vec_float4, vec_cvfo, __builtin_vsx_xvcvspdp)
262
+
263
+ // converts word and doubleword to double-precision
264
+ #undef vec_ctd
265
+ VSX_IMPL_1RG(vec_double2, vec_int4, xvcvsxwdp, vec_ctdo)
266
+ VSX_IMPL_1RG(vec_double2, vec_uint4, xvcvuxwdp, vec_ctdo)
267
+ VSX_IMPL_1RG(vec_double2, vec_dword2, xvcvsxddp, vec_ctd)
268
+ VSX_IMPL_1RG(vec_double2, vec_udword2, xvcvuxddp, vec_ctd)
269
+
270
+ // converts word and doubleword to single-precision
271
+ #undef vec_ctf
272
+ VSX_IMPL_1RG(vec_float4, vec_int4, xvcvsxwsp, vec_ctf)
273
+ VSX_IMPL_1RG(vec_float4, vec_uint4, xvcvuxwsp, vec_ctf)
274
+ VSX_IMPL_1RG(vec_float4, vec_dword2, xvcvsxdsp, vec_ctfo)
275
+ VSX_IMPL_1RG(vec_float4, vec_udword2, xvcvuxdsp, vec_ctfo)
276
+
277
+ // converts single and double precision to signed word
278
+ #undef vec_cts
279
+ VSX_IMPL_1RG(vec_int4, vec_double2, xvcvdpsxws, vec_ctso)
280
+ VSX_IMPL_1RG(vec_int4, vec_float4, xvcvspsxws, vec_cts)
281
+
282
+ // converts single and double precision to unsigned word
283
+ #undef vec_ctu
284
+ VSX_IMPL_1RG(vec_uint4, vec_double2, xvcvdpuxws, vec_ctuo)
285
+ VSX_IMPL_1RG(vec_uint4, vec_float4, xvcvspuxws, vec_ctu)
286
+
287
+ // converts single and double precision to signed doubleword
288
+ #undef vec_ctsl
289
+ VSX_IMPL_1RG(vec_dword2, vec_double2, xvcvdpsxds, vec_ctsl)
290
+ VSX_IMPL_1RG(vec_dword2, vec_float4, xvcvspsxds, vec_ctslo)
291
+
292
+ // converts single and double precision to unsigned doubleword
293
+ #undef vec_ctul
294
+ VSX_IMPL_1RG(vec_udword2, vec_double2, xvcvdpuxds, vec_ctul)
295
+ VSX_IMPL_1RG(vec_udword2, vec_float4, xvcvspuxds, vec_ctulo)
296
+
297
+ // just in case if GCC doesn't define it
298
+ #ifndef vec_xl
299
+ # define vec_xl vec_vsx_ld
300
+ # define vec_xst vec_vsx_st
301
+ #endif
302
+
303
+ #endif // GCC VSX compatibility
304
+
305
+ /*
306
+ * CLANG VSX compatibility
307
+ **/
308
+ #if defined(__clang__) && !defined(__IBMCPP__)
309
+
310
+ /*
311
+ * CLANG doesn't support %x<n> in the inline asm template which fixes register number
312
+ * when using any of the register constraints wa, wd, wf
313
+ *
314
+ * For more explanation checkout PowerPC and IBM RS6000 in https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html
315
+ * Also there's already an open bug https://bugs.llvm.org/show_bug.cgi?id=31837
316
+ *
317
+ * So we're not able to use inline asm and only use built-in functions that CLANG supports
318
+ * and use __builtin_convertvector if clang missing any of vector conversions built-in functions
319
+ *
320
+ * TODO: the clang asm-template bug has since been fixed; reconsider the current workarounds.
321
+ */
322
+
323
+ // convert vector helper
324
+ #define VSX_IMPL_CONVERT(rt, rg, fnm) \
325
+ VSX_FINLINE(rt) fnm(const rg& a) { return __builtin_convertvector(a, rt); }
326
+
327
+ #ifndef vec_permi
328
+ #if __clang_major__ < 5
329
+ // implement vec_permi in a dirty way
330
+ # define VSX_IMPL_CLANG_4_PERMI(Tvec) \
331
+ VSX_FINLINE(Tvec) vec_permi(const Tvec& a, const Tvec& b, unsigned const char c) \
332
+ { \
333
+ switch (c) \
334
+ { \
335
+ case 0: \
336
+ return vec_mergeh(a, b); \
337
+ case 1: \
338
+ return vec_mergel(vec_mergeh(a, a), b); \
339
+ case 2: \
340
+ return vec_mergeh(vec_mergel(a, a), b); \
341
+ default: \
342
+ return vec_mergel(a, b); \
343
+ } \
344
+ }
345
+ VSX_IMPL_CLANG_4_PERMI(vec_udword2)
346
+ VSX_IMPL_CLANG_4_PERMI(vec_dword2)
347
+ VSX_IMPL_CLANG_4_PERMI(vec_double2)
348
+
349
+ // vec_xxsldwi is missing in clang 4
350
+ # define vec_xxsldwi(a, b, c) vec_sld(a, b, (c) * 4)
351
+ #else
352
+ // vec_xxpermdi is missing little-endian support in clang 4, just like gcc4
353
+ # define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ (((c) & 1) << 1 | (c) >> 1)))
354
+ #endif // __clang_major__ < 5
355
+ #endif
356
+
357
+ // shift left double by word immediate
358
+ #ifndef vec_sldw
359
+ # define vec_sldw vec_xxsldwi
360
+ #endif
361
+
362
+ #if __clang_major__ < 13
363
+ // Implement vec_rsqrt since clang only supports vec_rsqrte
364
+ #ifndef vec_rsqrt
365
+ VSX_FINLINE(vec_float4) vec_rsqrt(const vec_float4& a)
366
+ { return vec_div(vec_float4_sp(1), vec_sqrt(a)); }
367
+
368
+ VSX_FINLINE(vec_double2) vec_rsqrt(const vec_double2& a)
369
+ { return vec_div(vec_double2_sp(1), vec_sqrt(a)); }
370
+ #endif
371
+
372
+ // vec_promote missing support for doubleword
373
+ VSX_FINLINE(vec_dword2) vec_promote(long long a, int b)
374
+ {
375
+ vec_dword2 ret = vec_dword2_z;
376
+ ret[b & 1] = a;
377
+ return ret;
378
+ }
379
+
380
+ VSX_FINLINE(vec_udword2) vec_promote(unsigned long long a, int b)
381
+ {
382
+ vec_udword2 ret = vec_udword2_z;
383
+ ret[b & 1] = a;
384
+ return ret;
385
+ }
386
+ #endif
387
+
388
+ // vec_popcnt should return an unsigned type, but clang disagrees, just like gcc's vec_vpopcnt
389
+ #define VSX_IMPL_POPCNTU(Tvec, Tvec2, ucast) \
390
+ VSX_FINLINE(Tvec) vec_popcntu(const Tvec2& a) \
391
+ { return ucast(vec_popcnt(a)); }
392
+ VSX_IMPL_POPCNTU(vec_uchar16, vec_char16, vec_uchar16_c);
393
+ VSX_IMPL_POPCNTU(vec_ushort8, vec_short8, vec_ushort8_c);
394
+ VSX_IMPL_POPCNTU(vec_uint4, vec_int4, vec_uint4_c);
395
+ VSX_IMPL_POPCNTU(vec_udword2, vec_dword2, vec_udword2_c);
396
+ // redirect unsigned types
397
+ VSX_REDIRECT_1RG(vec_uchar16, vec_uchar16, vec_popcntu, vec_popcnt)
398
+ VSX_REDIRECT_1RG(vec_ushort8, vec_ushort8, vec_popcntu, vec_popcnt)
399
+ VSX_REDIRECT_1RG(vec_uint4, vec_uint4, vec_popcntu, vec_popcnt)
400
+ VSX_REDIRECT_1RG(vec_udword2, vec_udword2, vec_popcntu, vec_popcnt)
401
+
402
+ // converts between single and double precision
403
+ VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvfo, __builtin_vsx_xvcvdpsp)
404
+ VSX_REDIRECT_1RG(vec_double2, vec_float4, vec_cvfo, __builtin_vsx_xvcvspdp)
405
+
406
+ // converts word and doubleword to double-precision
407
+ #ifdef vec_ctd
408
+ # undef vec_ctd
409
+ #endif
410
+ VSX_REDIRECT_1RG(vec_double2, vec_int4, vec_ctdo, __builtin_vsx_xvcvsxwdp)
411
+ VSX_REDIRECT_1RG(vec_double2, vec_uint4, vec_ctdo, __builtin_vsx_xvcvuxwdp)
412
+
413
+ VSX_IMPL_CONVERT(vec_double2, vec_dword2, vec_ctd)
414
+ VSX_IMPL_CONVERT(vec_double2, vec_udword2, vec_ctd)
415
+
416
+ // converts word and doubleword to single-precision
417
+ #if __clang_major__ > 4
418
+ # undef vec_ctf
419
+ #endif
420
+ VSX_IMPL_CONVERT(vec_float4, vec_int4, vec_ctf)
421
+ VSX_IMPL_CONVERT(vec_float4, vec_uint4, vec_ctf)
422
+ VSX_REDIRECT_1RG(vec_float4, vec_dword2, vec_ctfo, __builtin_vsx_xvcvsxdsp)
423
+ VSX_REDIRECT_1RG(vec_float4, vec_udword2, vec_ctfo, __builtin_vsx_xvcvuxdsp)
424
+
425
+ // converts single and double precision to signed word
426
+ #if __clang_major__ > 4
427
+ # undef vec_cts
428
+ #endif
429
+ VSX_REDIRECT_1RG(vec_int4, vec_double2, vec_ctso, __builtin_vsx_xvcvdpsxws)
430
+ VSX_IMPL_CONVERT(vec_int4, vec_float4, vec_cts)
431
+
432
+ // converts single and double precision to unsigned word
433
+ #if __clang_major__ > 4
434
+ # undef vec_ctu
435
+ #endif
436
+ VSX_REDIRECT_1RG(vec_uint4, vec_double2, vec_ctuo, __builtin_vsx_xvcvdpuxws)
437
+ VSX_IMPL_CONVERT(vec_uint4, vec_float4, vec_ctu)
438
+
439
+ // converts single and double precision to signed doubleword
440
+ #ifdef vec_ctsl
441
+ # undef vec_ctsl
442
+ #endif
443
+ VSX_IMPL_CONVERT(vec_dword2, vec_double2, vec_ctsl)
444
+ // __builtin_convertvector cannot perform this conversion; it lacks xvcvspsxds
445
+ VSX_FINLINE(vec_dword2) vec_ctslo(const vec_float4& a)
446
+ { return vec_ctsl(vec_cvfo(a)); }
447
+
448
+ // converts single and double precision to unsigned doubleword
449
+ #ifdef vec_ctul
450
+ # undef vec_ctul
451
+ #endif
452
+ VSX_IMPL_CONVERT(vec_udword2, vec_double2, vec_ctul)
453
+ // __builtin_convertvector cannot perform this conversion; it lacks xvcvspuxds
454
+ VSX_FINLINE(vec_udword2) vec_ctulo(const vec_float4& a)
455
+ { return vec_ctul(vec_cvfo(a)); }
456
+
457
+ #endif // CLANG VSX compatibility
458
+
459
+ /*
460
+ * Common GCC, CLANG compatibility
461
+ **/
462
+ #if defined(__GNUG__) && !defined(__IBMCPP__)
463
+
464
+ #ifdef vec_cvf
465
+ # undef vec_cvf
466
+ #endif
467
+
468
+ #define VSX_IMPL_CONV_EVEN_4_2(rt, rg, fnm, fn2) \
469
+ VSX_FINLINE(rt) fnm(const rg& a) \
470
+ { return fn2(vec_sldw(a, a, 1)); }
471
+
472
+ VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_float4, vec_cvf, vec_cvfo)
473
+ VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_int4, vec_ctd, vec_ctdo)
474
+ VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_uint4, vec_ctd, vec_ctdo)
475
+
476
+ VSX_IMPL_CONV_EVEN_4_2(vec_dword2, vec_float4, vec_ctsl, vec_ctslo)
477
+ VSX_IMPL_CONV_EVEN_4_2(vec_udword2, vec_float4, vec_ctul, vec_ctulo)
478
+
479
+ #define VSX_IMPL_CONV_EVEN_2_4(rt, rg, fnm, fn2) \
480
+ VSX_FINLINE(rt) fnm(const rg& a) \
481
+ { \
482
+ rt v4 = fn2(a); \
483
+ return vec_sldw(v4, v4, 3); \
484
+ }
485
+
486
+ VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_double2, vec_cvf, vec_cvfo)
487
+ VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_dword2, vec_ctf, vec_ctfo)
488
+ VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_udword2, vec_ctf, vec_ctfo)
489
+
490
+ VSX_IMPL_CONV_EVEN_2_4(vec_int4, vec_double2, vec_cts, vec_ctso)
491
+ VSX_IMPL_CONV_EVEN_2_4(vec_uint4, vec_double2, vec_ctu, vec_ctuo)
492
+
493
+ // Only for Eigen!
494
+ /*
495
+ * changing behavior of conversion intrinsics for gcc has effect on Eigen
496
+ * so we redefine old behavior again only on gcc, clang
497
+ */
498
+ #if !defined(__clang__) || __clang_major__ > 4
499
+ // ignoring second arg since Eigen only truncates toward zero
500
+ # define VSX_IMPL_CONV_2VARIANT(rt, rg, fnm, fn2) \
501
+ VSX_FINLINE(rt) fnm(const rg& a, int only_truncate) \
502
+ { \
503
+ assert(only_truncate == 0); \
504
+ CV_UNUSED(only_truncate); \
505
+ return fn2(a); \
506
+ }
507
+ VSX_IMPL_CONV_2VARIANT(vec_int4, vec_float4, vec_cts, vec_cts)
508
+ VSX_IMPL_CONV_2VARIANT(vec_uint4, vec_float4, vec_ctu, vec_ctu)
509
+ VSX_IMPL_CONV_2VARIANT(vec_float4, vec_int4, vec_ctf, vec_ctf)
510
+ VSX_IMPL_CONV_2VARIANT(vec_float4, vec_uint4, vec_ctf, vec_ctf)
511
+ // define vec_cts for converting double precision to signed doubleword
512
+ // which isn't compatible with xlc but its okay since Eigen only uses it for gcc
513
+ VSX_IMPL_CONV_2VARIANT(vec_dword2, vec_double2, vec_cts, vec_ctsl)
514
+ #endif // Eigen
515
+
516
+ #endif // Common GCC, CLANG compatibility
517
+
518
+ /*
519
+ * XLC VSX compatibility
520
+ **/
521
+ #if defined(__IBMCPP__)
522
+
523
+ // vector population count
524
+ #define vec_popcntu vec_popcnt
525
+
526
+ // overload and redirect with setting second arg to zero
527
+ // since we only support conversions without the second arg
528
+ #define VSX_IMPL_OVERLOAD_Z2(rt, rg, fnm) \
529
+ VSX_FINLINE(rt) fnm(const rg& a) { return fnm(a, 0); }
530
+
531
+ VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_int4, vec_ctd)
532
+ VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_uint4, vec_ctd)
533
+ VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_dword2, vec_ctd)
534
+ VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_udword2, vec_ctd)
535
+
536
+ VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_int4, vec_ctf)
537
+ VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_uint4, vec_ctf)
538
+ VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_dword2, vec_ctf)
539
+ VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_udword2, vec_ctf)
540
+
541
+ VSX_IMPL_OVERLOAD_Z2(vec_int4, vec_double2, vec_cts)
542
+ VSX_IMPL_OVERLOAD_Z2(vec_int4, vec_float4, vec_cts)
543
+
544
+ VSX_IMPL_OVERLOAD_Z2(vec_uint4, vec_double2, vec_ctu)
545
+ VSX_IMPL_OVERLOAD_Z2(vec_uint4, vec_float4, vec_ctu)
546
+
547
+ VSX_IMPL_OVERLOAD_Z2(vec_dword2, vec_double2, vec_ctsl)
548
+ VSX_IMPL_OVERLOAD_Z2(vec_dword2, vec_float4, vec_ctsl)
549
+
550
+ VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_double2, vec_ctul)
551
+ VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_float4, vec_ctul)
552
+
553
+ // fixme: implement conversions of odd-numbered elements in a dirty way
554
+ // since xlc doesn't support VSX registers operand in inline asm.
555
+ #define VSX_IMPL_CONV_ODD_4_2(rt, rg, fnm, fn2) \
556
+ VSX_FINLINE(rt) fnm(const rg& a) { return fn2(vec_sldw(a, a, 3)); }
557
+
558
+ VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_float4, vec_cvfo, vec_cvf)
559
+ VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_int4, vec_ctdo, vec_ctd)
560
+ VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_uint4, vec_ctdo, vec_ctd)
561
+
562
+ VSX_IMPL_CONV_ODD_4_2(vec_dword2, vec_float4, vec_ctslo, vec_ctsl)
563
+ VSX_IMPL_CONV_ODD_4_2(vec_udword2, vec_float4, vec_ctulo, vec_ctul)
564
+
565
+ #define VSX_IMPL_CONV_ODD_2_4(rt, rg, fnm, fn2) \
566
+ VSX_FINLINE(rt) fnm(const rg& a) \
567
+ { \
568
+ rt v4 = fn2(a); \
569
+ return vec_sldw(v4, v4, 1); \
570
+ }
571
+
572
+ VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_double2, vec_cvfo, vec_cvf)
573
+ VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_dword2, vec_ctfo, vec_ctf)
574
+ VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_udword2, vec_ctfo, vec_ctf)
575
+
576
+ VSX_IMPL_CONV_ODD_2_4(vec_int4, vec_double2, vec_ctso, vec_cts)
577
+ VSX_IMPL_CONV_ODD_2_4(vec_uint4, vec_double2, vec_ctuo, vec_ctu)
578
+
579
+ #endif // XLC VSX compatibility
580
+
581
+ // ignore GCC warning that caused by -Wunused-but-set-variable in rare cases
582
+ #if defined(__GNUG__) && !defined(__clang__)
583
+ # define VSX_UNUSED(Tvec) Tvec __attribute__((__unused__))
584
+ #else // CLANG, XLC
585
+ # define VSX_UNUSED(Tvec) Tvec
586
+ #endif
587
+
588
+ // gcc can find his way in casting log int and XLC, CLANG ambiguous
589
+ #if defined(__clang__) || defined(__IBMCPP__)
590
+ VSX_FINLINE(vec_udword2) vec_splats(uint64 v)
591
+ { return vec_splats((unsigned long long) v); }
592
+
593
+ VSX_FINLINE(vec_dword2) vec_splats(int64 v)
594
+ { return vec_splats((long long) v); }
595
+
596
+ VSX_FINLINE(vec_udword2) vec_promote(uint64 a, int b)
597
+ { return vec_promote((unsigned long long) a, b); }
598
+
599
+ VSX_FINLINE(vec_dword2) vec_promote(int64 a, int b)
600
+ { return vec_promote((long long) a, b); }
601
+ #endif
602
+
603
+ /*
604
+ * implement vsx_ld(offset, pointer), vsx_st(vector, offset, pointer)
605
+ * load and set using offset depend on the pointer type
606
+ *
607
+ * implement vsx_ldf(offset, pointer), vsx_stf(vector, offset, pointer)
608
+ * load and set using offset depend on fixed bytes size
609
+ *
610
+ * Note: In clang vec_xl and vec_xst fails to load unaligned addresses
611
+ * so we are using vec_vsx_ld, vec_vsx_st instead
612
+ */
613
+
614
+ #if defined(__clang__) && !defined(__IBMCPP__)
615
+ # define vsx_ldf vec_vsx_ld
616
+ # define vsx_stf vec_vsx_st
617
+ #else // GCC , XLC
618
+ # define vsx_ldf vec_xl
619
+ # define vsx_stf vec_xst
620
+ #endif
621
+
622
+ #define VSX_OFFSET(o, p) ((o) * sizeof(*(p)))
623
+ #define vsx_ld(o, p) vsx_ldf(VSX_OFFSET(o, p), p)
624
+ #define vsx_st(v, o, p) vsx_stf(v, VSX_OFFSET(o, p), p)
625
+
626
+ /*
627
+ * implement vsx_ld2(offset, pointer), vsx_st2(vector, offset, pointer) to load and store double words
628
+ * In GCC vec_xl and vec_xst it maps to vec_vsx_ld, vec_vsx_st which doesn't support long long
629
+ * and in CLANG we are using vec_vsx_ld, vec_vsx_st because vec_xl, vec_xst fails to load unaligned addresses
630
+ *
631
+ * In XLC vec_xl and vec_xst fail to cast int64(long int) to long long
632
+ */
633
+ #if (defined(__GNUG__) || defined(__clang__)) && !defined(__IBMCPP__)
634
+ VSX_FINLINE(vec_udword2) vsx_ld2(long o, const uint64* p)
635
+ { return vec_udword2_c(vsx_ldf(VSX_OFFSET(o, p), (unsigned int*)p)); }
636
+
637
+ VSX_FINLINE(vec_dword2) vsx_ld2(long o, const int64* p)
638
+ { return vec_dword2_c(vsx_ldf(VSX_OFFSET(o, p), (int*)p)); }
639
+
640
+ VSX_FINLINE(void) vsx_st2(const vec_udword2& vec, long o, uint64* p)
641
+ { vsx_stf(vec_uint4_c(vec), VSX_OFFSET(o, p), (unsigned int*)p); }
642
+
643
+ VSX_FINLINE(void) vsx_st2(const vec_dword2& vec, long o, int64* p)
644
+ { vsx_stf(vec_int4_c(vec), VSX_OFFSET(o, p), (int*)p); }
645
+ #else // XLC
646
+ VSX_FINLINE(vec_udword2) vsx_ld2(long o, const uint64* p)
647
+ { return vsx_ldf(VSX_OFFSET(o, p), (unsigned long long*)p); }
648
+
649
+ VSX_FINLINE(vec_dword2) vsx_ld2(long o, const int64* p)
650
+ { return vsx_ldf(VSX_OFFSET(o, p), (long long*)p); }
651
+
652
+ VSX_FINLINE(void) vsx_st2(const vec_udword2& vec, long o, uint64* p)
653
+ { vsx_stf(vec, VSX_OFFSET(o, p), (unsigned long long*)p); }
654
+
655
+ VSX_FINLINE(void) vsx_st2(const vec_dword2& vec, long o, int64* p)
656
+ { vsx_stf(vec, VSX_OFFSET(o, p), (long long*)p); }
657
+ #endif
658
+
659
+ // Store lower 8 byte
660
+ #define vec_st_l8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 0)
661
+
662
+ // Store higher 8 byte
663
+ #define vec_st_h8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 1)
664
+
665
+ // Load 64-bits of integer data to lower part
666
+ #define VSX_IMPL_LOAD_L8(Tvec, Tp) \
667
+ VSX_FINLINE(Tvec) vec_ld_l8(const Tp *p) \
668
+ { return ((Tvec)vec_promote(*((uint64*)p), 0)); }
669
+
670
+ VSX_IMPL_LOAD_L8(vec_uchar16, uchar)
671
+ VSX_IMPL_LOAD_L8(vec_char16, schar)
672
+ VSX_IMPL_LOAD_L8(vec_ushort8, ushort)
673
+ VSX_IMPL_LOAD_L8(vec_short8, short)
674
+ VSX_IMPL_LOAD_L8(vec_uint4, uint)
675
+ VSX_IMPL_LOAD_L8(vec_int4, int)
676
+ VSX_IMPL_LOAD_L8(vec_float4, float)
677
+ VSX_IMPL_LOAD_L8(vec_udword2, uint64)
678
+ VSX_IMPL_LOAD_L8(vec_dword2, int64)
679
+ VSX_IMPL_LOAD_L8(vec_double2, double)
680
+
681
+ // logical not
682
+ #define vec_not(a) vec_nor(a, a)
683
+
684
+ // power9 yaya
685
+ // not equal
686
+ #ifndef vec_cmpne
687
+ # define vec_cmpne(a, b) vec_not(vec_cmpeq(a, b))
688
+ #endif
689
+
690
+ // absolute difference
691
+ #ifndef _ARCH_PWR9
692
+ # undef vec_absd
693
+ # define vec_absd(a, b) vec_sub(vec_max(a, b), vec_min(a, b))
694
+ #endif
695
+
696
+ /*
697
+ * Implement vec_unpacklu and vec_unpackhu
698
+ * since vec_unpackl, vec_unpackh only support signed integers
699
+ **/
700
+ #define VSX_IMPL_UNPACKU(rt, rg, zero) \
701
+ VSX_FINLINE(rt) vec_unpacklu(const rg& a) \
702
+ { return (rt)(vec_mergel(a, zero)); } \
703
+ VSX_FINLINE(rt) vec_unpackhu(const rg& a) \
704
+ { return (rt)(vec_mergeh(a, zero)); }
705
+
706
+ VSX_IMPL_UNPACKU(vec_ushort8, vec_uchar16, vec_uchar16_z)
707
+ VSX_IMPL_UNPACKU(vec_uint4, vec_ushort8, vec_ushort8_z)
708
+ VSX_IMPL_UNPACKU(vec_udword2, vec_uint4, vec_uint4_z)
709
+
710
+ /*
711
+ * Implement vec_mergesqe and vec_mergesqo
712
+ * Merges the sequence values of even and odd elements of two vectors
713
+ */
714
+ #define VSX_IMPL_PERM(rt, fnm, ...) \
715
+ VSX_FINLINE(rt) fnm(const rt& a, const rt& b) \
716
+ { static const vec_uchar16 perm = {__VA_ARGS__}; return vec_perm(a, b, perm); }
717
+
718
+ // 16
719
+ #define perm16_mergesqe 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
720
+ #define perm16_mergesqo 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
721
+ VSX_IMPL_PERM(vec_uchar16, vec_mergesqe, perm16_mergesqe)
722
+ VSX_IMPL_PERM(vec_uchar16, vec_mergesqo, perm16_mergesqo)
723
+ VSX_IMPL_PERM(vec_char16, vec_mergesqe, perm16_mergesqe)
724
+ VSX_IMPL_PERM(vec_char16, vec_mergesqo, perm16_mergesqo)
725
+ // 8
726
+ #define perm8_mergesqe 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
727
+ #define perm8_mergesqo 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
728
+ VSX_IMPL_PERM(vec_ushort8, vec_mergesqe, perm8_mergesqe)
729
+ VSX_IMPL_PERM(vec_ushort8, vec_mergesqo, perm8_mergesqo)
730
+ VSX_IMPL_PERM(vec_short8, vec_mergesqe, perm8_mergesqe)
731
+ VSX_IMPL_PERM(vec_short8, vec_mergesqo, perm8_mergesqo)
732
+ // 4
733
+ #define perm4_mergesqe 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
734
+ #define perm4_mergesqo 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
735
+ VSX_IMPL_PERM(vec_uint4, vec_mergesqe, perm4_mergesqe)
736
+ VSX_IMPL_PERM(vec_uint4, vec_mergesqo, perm4_mergesqo)
737
+ VSX_IMPL_PERM(vec_int4, vec_mergesqe, perm4_mergesqe)
738
+ VSX_IMPL_PERM(vec_int4, vec_mergesqo, perm4_mergesqo)
739
+ VSX_IMPL_PERM(vec_float4, vec_mergesqe, perm4_mergesqe)
740
+ VSX_IMPL_PERM(vec_float4, vec_mergesqo, perm4_mergesqo)
741
+ // 2
742
+ VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqe, vec_mergeh)
743
+ VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqo, vec_mergel)
744
+ VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqe, vec_mergeh)
745
+ VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqo, vec_mergel)
746
+ VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqe, vec_mergeh)
747
+ VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqo, vec_mergel)
748
+
749
+ /*
750
+ * Implement vec_mergesqh and vec_mergesql
751
+ * Merges the sequence most and least significant halves of two vectors
752
+ */
753
+ #define VSX_IMPL_MERGESQHL(Tvec) \
754
+ VSX_FINLINE(Tvec) vec_mergesqh(const Tvec& a, const Tvec& b) \
755
+ { return (Tvec)vec_mergeh(vec_udword2_c(a), vec_udword2_c(b)); } \
756
+ VSX_FINLINE(Tvec) vec_mergesql(const Tvec& a, const Tvec& b) \
757
+ { return (Tvec)vec_mergel(vec_udword2_c(a), vec_udword2_c(b)); }
758
+ VSX_IMPL_MERGESQHL(vec_uchar16)
759
+ VSX_IMPL_MERGESQHL(vec_char16)
760
+ VSX_IMPL_MERGESQHL(vec_ushort8)
761
+ VSX_IMPL_MERGESQHL(vec_short8)
762
+ VSX_IMPL_MERGESQHL(vec_uint4)
763
+ VSX_IMPL_MERGESQHL(vec_int4)
764
+ VSX_IMPL_MERGESQHL(vec_float4)
765
+ VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqh, vec_mergeh)
766
+ VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesql, vec_mergel)
767
+ VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqh, vec_mergeh)
768
+ VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesql, vec_mergel)
769
+ VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqh, vec_mergeh)
770
+ VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesql, vec_mergel)
771
+
772
+
773
+ // 2 and 4 channels interleave for all types except 2 lanes
774
+ #define VSX_IMPL_ST_INTERLEAVE(Tp, Tvec) \
775
+ VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, Tp* ptr) \
776
+ { \
777
+ vsx_stf(vec_mergeh(a, b), 0, ptr); \
778
+ vsx_stf(vec_mergel(a, b), 16, ptr); \
779
+ } \
780
+ VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
781
+ const Tvec& c, const Tvec& d, Tp* ptr) \
782
+ { \
783
+ Tvec ac = vec_mergeh(a, c); \
784
+ Tvec bd = vec_mergeh(b, d); \
785
+ vsx_stf(vec_mergeh(ac, bd), 0, ptr); \
786
+ vsx_stf(vec_mergel(ac, bd), 16, ptr); \
787
+ ac = vec_mergel(a, c); \
788
+ bd = vec_mergel(b, d); \
789
+ vsx_stf(vec_mergeh(ac, bd), 32, ptr); \
790
+ vsx_stf(vec_mergel(ac, bd), 48, ptr); \
791
+ }
792
+ VSX_IMPL_ST_INTERLEAVE(uchar, vec_uchar16)
793
+ VSX_IMPL_ST_INTERLEAVE(schar, vec_char16)
794
+ VSX_IMPL_ST_INTERLEAVE(ushort, vec_ushort8)
795
+ VSX_IMPL_ST_INTERLEAVE(short, vec_short8)
796
+ VSX_IMPL_ST_INTERLEAVE(uint, vec_uint4)
797
+ VSX_IMPL_ST_INTERLEAVE(int, vec_int4)
798
+ VSX_IMPL_ST_INTERLEAVE(float, vec_float4)
799
+
800
+ // 2 and 4 channels deinterleave for 16 lanes
801
+ #define VSX_IMPL_ST_DINTERLEAVE_8(Tp, Tvec) \
802
+ VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
803
+ { \
804
+ Tvec v0 = vsx_ld(0, ptr); \
805
+ Tvec v1 = vsx_ld(16, ptr); \
806
+ a = vec_mergesqe(v0, v1); \
807
+ b = vec_mergesqo(v0, v1); \
808
+ } \
809
+ VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
810
+ Tvec& c, Tvec& d) \
811
+ { \
812
+ Tvec v0 = vsx_ld(0, ptr); \
813
+ Tvec v1 = vsx_ld(16, ptr); \
814
+ Tvec v2 = vsx_ld(32, ptr); \
815
+ Tvec v3 = vsx_ld(48, ptr); \
816
+ Tvec m0 = vec_mergesqe(v0, v1); \
817
+ Tvec m1 = vec_mergesqe(v2, v3); \
818
+ a = vec_mergesqe(m0, m1); \
819
+ c = vec_mergesqo(m0, m1); \
820
+ m0 = vec_mergesqo(v0, v1); \
821
+ m1 = vec_mergesqo(v2, v3); \
822
+ b = vec_mergesqe(m0, m1); \
823
+ d = vec_mergesqo(m0, m1); \
824
+ }
825
+ VSX_IMPL_ST_DINTERLEAVE_8(uchar, vec_uchar16)
826
+ VSX_IMPL_ST_DINTERLEAVE_8(schar, vec_char16)
827
+
828
+ // 2 and 4 channels deinterleave for 8 lanes
829
+ #define VSX_IMPL_ST_DINTERLEAVE_16(Tp, Tvec) \
830
+ VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
831
+ { \
832
+ Tvec v0 = vsx_ld(0, ptr); \
833
+ Tvec v1 = vsx_ld(8, ptr); \
834
+ a = vec_mergesqe(v0, v1); \
835
+ b = vec_mergesqo(v0, v1); \
836
+ } \
837
+ VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
838
+ Tvec& c, Tvec& d) \
839
+ { \
840
+ Tvec v0 = vsx_ld(0, ptr); \
841
+ Tvec v1 = vsx_ld(8, ptr); \
842
+ Tvec m0 = vec_mergeh(v0, v1); \
843
+ Tvec m1 = vec_mergel(v0, v1); \
844
+ Tvec ab0 = vec_mergeh(m0, m1); \
845
+ Tvec cd0 = vec_mergel(m0, m1); \
846
+ v0 = vsx_ld(16, ptr); \
847
+ v1 = vsx_ld(24, ptr); \
848
+ m0 = vec_mergeh(v0, v1); \
849
+ m1 = vec_mergel(v0, v1); \
850
+ Tvec ab1 = vec_mergeh(m0, m1); \
851
+ Tvec cd1 = vec_mergel(m0, m1); \
852
+ a = vec_mergesqh(ab0, ab1); \
853
+ b = vec_mergesql(ab0, ab1); \
854
+ c = vec_mergesqh(cd0, cd1); \
855
+ d = vec_mergesql(cd0, cd1); \
856
+ }
857
+ VSX_IMPL_ST_DINTERLEAVE_16(ushort, vec_ushort8)
858
+ VSX_IMPL_ST_DINTERLEAVE_16(short, vec_short8)
859
+
860
+ // 2 and 4 channels deinterleave for 4 lanes
861
+ #define VSX_IMPL_ST_DINTERLEAVE_32(Tp, Tvec) \
862
+ VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
863
+ { \
864
+ a = vsx_ld(0, ptr); \
865
+ b = vsx_ld(4, ptr); \
866
+ Tvec m0 = vec_mergeh(a, b); \
867
+ Tvec m1 = vec_mergel(a, b); \
868
+ a = vec_mergeh(m0, m1); \
869
+ b = vec_mergel(m0, m1); \
870
+ } \
871
+ VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
872
+ Tvec& c, Tvec& d) \
873
+ { \
874
+ Tvec v0 = vsx_ld(0, ptr); \
875
+ Tvec v1 = vsx_ld(4, ptr); \
876
+ Tvec v2 = vsx_ld(8, ptr); \
877
+ Tvec v3 = vsx_ld(12, ptr); \
878
+ Tvec m0 = vec_mergeh(v0, v2); \
879
+ Tvec m1 = vec_mergeh(v1, v3); \
880
+ a = vec_mergeh(m0, m1); \
881
+ b = vec_mergel(m0, m1); \
882
+ m0 = vec_mergel(v0, v2); \
883
+ m1 = vec_mergel(v1, v3); \
884
+ c = vec_mergeh(m0, m1); \
885
+ d = vec_mergel(m0, m1); \
886
+ }
887
+ VSX_IMPL_ST_DINTERLEAVE_32(uint, vec_uint4)
888
+ VSX_IMPL_ST_DINTERLEAVE_32(int, vec_int4)
889
+ VSX_IMPL_ST_DINTERLEAVE_32(float, vec_float4)
890
+
891
+ // 2 and 4 channels interleave and deinterleave for 2 lanes
892
+ #define VSX_IMPL_ST_D_INTERLEAVE_64(Tp, Tvec, ld_func, st_func) \
893
+ VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, Tp* ptr) \
894
+ { \
895
+ st_func(vec_mergeh(a, b), 0, ptr); \
896
+ st_func(vec_mergel(a, b), 2, ptr); \
897
+ } \
898
+ VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
899
+ const Tvec& c, const Tvec& d, Tp* ptr) \
900
+ { \
901
+ st_func(vec_mergeh(a, b), 0, ptr); \
902
+ st_func(vec_mergeh(c, d), 2, ptr); \
903
+ st_func(vec_mergel(a, b), 4, ptr); \
904
+ st_func(vec_mergel(c, d), 6, ptr); \
905
+ } \
906
+ VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
907
+ { \
908
+ Tvec m0 = ld_func(0, ptr); \
909
+ Tvec m1 = ld_func(2, ptr); \
910
+ a = vec_mergeh(m0, m1); \
911
+ b = vec_mergel(m0, m1); \
912
+ } \
913
+ VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
914
+ Tvec& c, Tvec& d) \
915
+ { \
916
+ Tvec v0 = ld_func(0, ptr); \
917
+ Tvec v1 = ld_func(2, ptr); \
918
+ Tvec v2 = ld_func(4, ptr); \
919
+ Tvec v3 = ld_func(6, ptr); \
920
+ a = vec_mergeh(v0, v2); \
921
+ b = vec_mergel(v0, v2); \
922
+ c = vec_mergeh(v1, v3); \
923
+ d = vec_mergel(v1, v3); \
924
+ }
925
+ VSX_IMPL_ST_D_INTERLEAVE_64(int64, vec_dword2, vsx_ld2, vsx_st2)
926
+ VSX_IMPL_ST_D_INTERLEAVE_64(uint64, vec_udword2, vsx_ld2, vsx_st2)
927
+ VSX_IMPL_ST_D_INTERLEAVE_64(double, vec_double2, vsx_ld, vsx_st)
928
+
929
+ /* 3 channels */
930
+ #define VSX_IMPL_ST_INTERLEAVE_3CH_16(Tp, Tvec) \
931
+ VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
932
+ const Tvec& c, Tp* ptr) \
933
+ { \
934
+ static const vec_uchar16 a12 = {0, 16, 0, 1, 17, 0, 2, 18, 0, 3, 19, 0, 4, 20, 0, 5}; \
935
+ static const vec_uchar16 a123 = {0, 1, 16, 3, 4, 17, 6, 7, 18, 9, 10, 19, 12, 13, 20, 15}; \
936
+ vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr); \
937
+ static const vec_uchar16 b12 = {21, 0, 6, 22, 0, 7, 23, 0, 8, 24, 0, 9, 25, 0, 10, 26}; \
938
+ static const vec_uchar16 b123 = {0, 21, 2, 3, 22, 5, 6, 23, 8, 9, 24, 11, 12, 25, 14, 15}; \
939
+ vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 16, ptr); \
940
+ static const vec_uchar16 c12 = {0, 11, 27, 0, 12, 28, 0, 13, 29, 0, 14, 30, 0, 15, 31, 0}; \
941
+ static const vec_uchar16 c123 = {26, 1, 2, 27, 4, 5, 28, 7, 8, 29, 10, 11, 30, 13, 14, 31}; \
942
+ vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 32, ptr); \
943
+ } \
944
+ VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c) \
945
+ { \
946
+ Tvec v1 = vsx_ld(0, ptr); \
947
+ Tvec v2 = vsx_ld(16, ptr); \
948
+ Tvec v3 = vsx_ld(32, ptr); \
949
+ static const vec_uchar16 a12_perm = {0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0, 0, 0, 0, 0}; \
950
+ static const vec_uchar16 a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 20, 23, 26, 29}; \
951
+ a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm); \
952
+ static const vec_uchar16 b12_perm = {1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 0, 0, 0, 0, 0}; \
953
+ static const vec_uchar16 b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 21, 24, 27, 30}; \
954
+ b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm); \
955
+ static const vec_uchar16 c12_perm = {2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 0, 0, 0, 0, 0}; \
956
+ static const vec_uchar16 c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 19, 22, 25, 28, 31}; \
957
+ c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm); \
958
+ }
959
+ VSX_IMPL_ST_INTERLEAVE_3CH_16(uchar, vec_uchar16)
960
+ VSX_IMPL_ST_INTERLEAVE_3CH_16(schar, vec_char16)
961
+
962
+ #define VSX_IMPL_ST_INTERLEAVE_3CH_8(Tp, Tvec) \
963
+ VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
964
+ const Tvec& c, Tp* ptr) \
965
+ { \
966
+ static const vec_uchar16 a12 = {0, 1, 16, 17, 0, 0, 2, 3, 18, 19, 0, 0, 4, 5, 20, 21}; \
967
+ static const vec_uchar16 a123 = {0, 1, 2, 3, 16, 17, 6, 7, 8, 9, 18, 19, 12, 13, 14, 15}; \
968
+ vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr); \
969
+ static const vec_uchar16 b12 = {0, 0, 6, 7, 22, 23, 0, 0, 8, 9, 24, 25, 0, 0, 10, 11}; \
970
+ static const vec_uchar16 b123 = {20, 21, 2, 3, 4, 5, 22, 23, 8, 9, 10, 11, 24, 25, 14, 15}; \
971
+ vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 8, ptr); \
972
+ static const vec_uchar16 c12 = {26, 27, 0, 0, 12, 13, 28, 29, 0, 0, 14, 15, 30, 31, 0, 0}; \
973
+ static const vec_uchar16 c123 = {0, 1, 26, 27, 4, 5, 6, 7, 28, 29, 10, 11, 12, 13, 30, 31}; \
974
+ vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 16, ptr); \
975
+ } \
976
+ VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c) \
977
+ { \
978
+ Tvec v1 = vsx_ld(0, ptr); \
979
+ Tvec v2 = vsx_ld(8, ptr); \
980
+ Tvec v3 = vsx_ld(16, ptr); \
981
+ static const vec_uchar16 a12_perm = {0, 1, 6, 7, 12, 13, 18, 19, 24, 25, 30, 31, 0, 0, 0, 0}; \
982
+ static const vec_uchar16 a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 20, 21, 26, 27}; \
983
+ a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm); \
984
+ static const vec_uchar16 b12_perm = {2, 3, 8, 9, 14, 15, 20, 21, 26, 27, 0, 0, 0, 0, 0, 0}; \
985
+ static const vec_uchar16 b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 22, 23, 28, 29}; \
986
+ b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm); \
987
+ static const vec_uchar16 c12_perm = {4, 5, 10, 11, 16, 17, 22, 23, 28, 29, 0, 0, 0, 0, 0, 0}; \
988
+ static const vec_uchar16 c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 18, 19, 24, 25, 30, 31}; \
989
+ c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm); \
990
+ }
991
+ VSX_IMPL_ST_INTERLEAVE_3CH_8(ushort, vec_ushort8)
992
+ VSX_IMPL_ST_INTERLEAVE_3CH_8(short, vec_short8)
993
+
994
+ #define VSX_IMPL_ST_INTERLEAVE_3CH_4(Tp, Tvec) \
995
+ VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
996
+ const Tvec& c, Tp* ptr) \
997
+ { \
998
+ Tvec hbc = vec_mergeh(b, c); \
999
+ static const vec_uchar16 ahbc = {0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 4, 5, 6, 7}; \
1000
+ vsx_st(vec_perm(a, hbc, ahbc), 0, ptr); \
1001
+ Tvec lab = vec_mergel(a, b); \
1002
+ vsx_st(vec_sld(lab, hbc, 8), 4, ptr); \
1003
+ static const vec_uchar16 clab = {8, 9, 10, 11, 24, 25, 26, 27, 28, 29, 30, 31, 12, 13, 14, 15};\
1004
+ vsx_st(vec_perm(c, lab, clab), 8, ptr); \
1005
+ } \
1006
+ VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c) \
1007
+ { \
1008
+ Tvec v1 = vsx_ld(0, ptr); \
1009
+ Tvec v2 = vsx_ld(4, ptr); \
1010
+ Tvec v3 = vsx_ld(8, ptr); \
1011
+ static const vec_uchar16 flp = {0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31}; \
1012
+ a = vec_perm(v1, vec_sld(v3, v2, 8), flp); \
1013
+ static const vec_uchar16 flp2 = {28, 29, 30, 31, 0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19}; \
1014
+ b = vec_perm(v2, vec_sld(v1, v3, 8), flp2); \
1015
+ c = vec_perm(vec_sld(v2, v1, 8), v3, flp); \
1016
+ }
1017
+ VSX_IMPL_ST_INTERLEAVE_3CH_4(uint, vec_uint4)
1018
+ VSX_IMPL_ST_INTERLEAVE_3CH_4(int, vec_int4)
1019
+ VSX_IMPL_ST_INTERLEAVE_3CH_4(float, vec_float4)
1020
+
1021
+ #define VSX_IMPL_ST_INTERLEAVE_3CH_2(Tp, Tvec, ld_func, st_func) \
1022
+ VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
1023
+ const Tvec& c, Tp* ptr) \
1024
+ { \
1025
+ st_func(vec_mergeh(a, b), 0, ptr); \
1026
+ st_func(vec_permi(c, a, 1), 2, ptr); \
1027
+ st_func(vec_mergel(b, c), 4, ptr); \
1028
+ } \
1029
+ VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, \
1030
+ Tvec& b, Tvec& c) \
1031
+ { \
1032
+ Tvec v1 = ld_func(0, ptr); \
1033
+ Tvec v2 = ld_func(2, ptr); \
1034
+ Tvec v3 = ld_func(4, ptr); \
1035
+ a = vec_permi(v1, v2, 1); \
1036
+ b = vec_permi(v1, v3, 2); \
1037
+ c = vec_permi(v2, v3, 1); \
1038
+ }
1039
+ VSX_IMPL_ST_INTERLEAVE_3CH_2(int64, vec_dword2, vsx_ld2, vsx_st2)
1040
+ VSX_IMPL_ST_INTERLEAVE_3CH_2(uint64, vec_udword2, vsx_ld2, vsx_st2)
1041
+ VSX_IMPL_ST_INTERLEAVE_3CH_2(double, vec_double2, vsx_ld, vsx_st)
1042
+
1043
+ #endif // CV_VSX
1044
+
1045
+ //! @}
1046
+
1047
+ #endif // OPENCV_HAL_VSX_UTILS_HPP