node-native-win-utils 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174) hide show
  1. package/README.md +144 -27
  2. package/binding.gyp +18 -5
  3. package/dist/index.d.ts +146 -4
  4. package/dist/index.js +107 -3
  5. package/include/opencv2/core/affine.hpp +678 -0
  6. package/include/opencv2/core/async.hpp +105 -0
  7. package/include/opencv2/core/base.hpp +664 -0
  8. package/include/opencv2/core/bindings_utils.hpp +325 -0
  9. package/include/opencv2/core/bufferpool.hpp +40 -0
  10. package/include/opencv2/core/check.hpp +170 -0
  11. package/include/opencv2/core/core.hpp +48 -0
  12. package/include/opencv2/core/core_c.h +3128 -0
  13. package/include/opencv2/core/cuda/block.hpp +211 -0
  14. package/include/opencv2/core/cuda/border_interpolate.hpp +722 -0
  15. package/include/opencv2/core/cuda/color.hpp +309 -0
  16. package/include/opencv2/core/cuda/common.hpp +131 -0
  17. package/include/opencv2/core/cuda/datamov_utils.hpp +113 -0
  18. package/include/opencv2/core/cuda/detail/color_detail.hpp +2018 -0
  19. package/include/opencv2/core/cuda/detail/reduce.hpp +365 -0
  20. package/include/opencv2/core/cuda/detail/reduce_key_val.hpp +502 -0
  21. package/include/opencv2/core/cuda/detail/transform_detail.hpp +392 -0
  22. package/include/opencv2/core/cuda/detail/type_traits_detail.hpp +191 -0
  23. package/include/opencv2/core/cuda/detail/vec_distance_detail.hpp +121 -0
  24. package/include/opencv2/core/cuda/dynamic_smem.hpp +88 -0
  25. package/include/opencv2/core/cuda/emulation.hpp +269 -0
  26. package/include/opencv2/core/cuda/filters.hpp +293 -0
  27. package/include/opencv2/core/cuda/funcattrib.hpp +79 -0
  28. package/include/opencv2/core/cuda/functional.hpp +805 -0
  29. package/include/opencv2/core/cuda/limits.hpp +128 -0
  30. package/include/opencv2/core/cuda/reduce.hpp +209 -0
  31. package/include/opencv2/core/cuda/saturate_cast.hpp +292 -0
  32. package/include/opencv2/core/cuda/scan.hpp +258 -0
  33. package/include/opencv2/core/cuda/simd_functions.hpp +869 -0
  34. package/include/opencv2/core/cuda/transform.hpp +75 -0
  35. package/include/opencv2/core/cuda/type_traits.hpp +90 -0
  36. package/include/opencv2/core/cuda/utility.hpp +230 -0
  37. package/include/opencv2/core/cuda/vec_distance.hpp +232 -0
  38. package/include/opencv2/core/cuda/vec_math.hpp +923 -0
  39. package/include/opencv2/core/cuda/vec_traits.hpp +288 -0
  40. package/include/opencv2/core/cuda/warp.hpp +139 -0
  41. package/include/opencv2/core/cuda/warp_reduce.hpp +76 -0
  42. package/include/opencv2/core/cuda/warp_shuffle.hpp +162 -0
  43. package/include/opencv2/core/cuda.hpp +1279 -0
  44. package/include/opencv2/core/cuda.inl.hpp +763 -0
  45. package/include/opencv2/core/cuda_stream_accessor.hpp +86 -0
  46. package/include/opencv2/core/cuda_types.hpp +144 -0
  47. package/include/opencv2/core/cv_cpu_dispatch.h +381 -0
  48. package/include/opencv2/core/cv_cpu_helper.h +550 -0
  49. package/include/opencv2/core/cvdef.h +973 -0
  50. package/include/opencv2/core/cvstd.hpp +190 -0
  51. package/include/opencv2/core/cvstd.inl.hpp +197 -0
  52. package/include/opencv2/core/cvstd_wrapper.hpp +154 -0
  53. package/include/opencv2/core/detail/async_promise.hpp +71 -0
  54. package/include/opencv2/core/detail/dispatch_helper.impl.hpp +49 -0
  55. package/include/opencv2/core/detail/exception_ptr.hpp +27 -0
  56. package/include/opencv2/core/directx.hpp +184 -0
  57. package/include/opencv2/core/dualquaternion.hpp +979 -0
  58. package/include/opencv2/core/dualquaternion.inl.hpp +487 -0
  59. package/include/opencv2/core/eigen.hpp +402 -0
  60. package/include/opencv2/core/fast_math.hpp +433 -0
  61. package/include/opencv2/core/hal/hal.hpp +256 -0
  62. package/include/opencv2/core/hal/interface.h +190 -0
  63. package/include/opencv2/core/hal/intrin.hpp +939 -0
  64. package/include/opencv2/core/hal/intrin_avx.hpp +3177 -0
  65. package/include/opencv2/core/hal/intrin_avx512.hpp +3090 -0
  66. package/include/opencv2/core/hal/intrin_cpp.hpp +3321 -0
  67. package/include/opencv2/core/hal/intrin_forward.hpp +191 -0
  68. package/include/opencv2/core/hal/intrin_lasx.hpp +3236 -0
  69. package/include/opencv2/core/hal/intrin_msa.hpp +1887 -0
  70. package/include/opencv2/core/hal/intrin_neon.hpp +2610 -0
  71. package/include/opencv2/core/hal/intrin_rvv.hpp +3320 -0
  72. package/include/opencv2/core/hal/intrin_rvv071.hpp +2545 -0
  73. package/include/opencv2/core/hal/intrin_rvv_scalable.hpp +2080 -0
  74. package/include/opencv2/core/hal/intrin_sse.hpp +3467 -0
  75. package/include/opencv2/core/hal/intrin_sse_em.hpp +180 -0
  76. package/include/opencv2/core/hal/intrin_vsx.hpp +1608 -0
  77. package/include/opencv2/core/hal/intrin_wasm.hpp +2782 -0
  78. package/include/opencv2/core/hal/msa_macros.h +1558 -0
  79. package/include/opencv2/core/hal/simd_utils.impl.hpp +186 -0
  80. package/include/opencv2/core/llapi/llapi.h +102 -0
  81. package/include/opencv2/core/mat.hpp +3775 -0
  82. package/include/opencv2/core/mat.inl.hpp +3422 -0
  83. package/include/opencv2/core/matx.hpp +1536 -0
  84. package/include/opencv2/core/neon_utils.hpp +128 -0
  85. package/include/opencv2/core/ocl.hpp +917 -0
  86. package/include/opencv2/core/ocl_genbase.hpp +69 -0
  87. package/include/opencv2/core/opencl/ocl_defs.hpp +82 -0
  88. package/include/opencv2/core/opencl/opencl_info.hpp +212 -0
  89. package/include/opencv2/core/opencl/opencl_svm.hpp +81 -0
  90. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_clblas.hpp +602 -0
  91. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_clfft.hpp +146 -0
  92. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_core.hpp +371 -0
  93. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_core_wrappers.hpp +272 -0
  94. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl.hpp +62 -0
  95. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl_wrappers.hpp +42 -0
  96. package/include/opencv2/core/opencl/runtime/opencl_clblas.hpp +53 -0
  97. package/include/opencv2/core/opencl/runtime/opencl_clfft.hpp +53 -0
  98. package/include/opencv2/core/opencl/runtime/opencl_core.hpp +84 -0
  99. package/include/opencv2/core/opencl/runtime/opencl_core_wrappers.hpp +47 -0
  100. package/include/opencv2/core/opencl/runtime/opencl_gl.hpp +53 -0
  101. package/include/opencv2/core/opencl/runtime/opencl_gl_wrappers.hpp +47 -0
  102. package/include/opencv2/core/opencl/runtime/opencl_svm_20.hpp +48 -0
  103. package/include/opencv2/core/opencl/runtime/opencl_svm_definitions.hpp +42 -0
  104. package/include/opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp +166 -0
  105. package/include/opencv2/core/opengl.hpp +733 -0
  106. package/include/opencv2/core/openvx/ovx_defs.hpp +48 -0
  107. package/include/opencv2/core/operations.hpp +610 -0
  108. package/include/opencv2/core/optim.hpp +302 -0
  109. package/include/opencv2/core/ovx.hpp +28 -0
  110. package/include/opencv2/core/parallel/backend/parallel_for.openmp.hpp +72 -0
  111. package/include/opencv2/core/parallel/backend/parallel_for.tbb.hpp +153 -0
  112. package/include/opencv2/core/parallel/parallel_backend.hpp +90 -0
  113. package/include/opencv2/core/persistence.hpp +1350 -0
  114. package/include/opencv2/core/private/cv_cpu_include_simd_declarations.hpp +30 -0
  115. package/include/opencv2/core/private.cuda.hpp +169 -0
  116. package/include/opencv2/core/private.hpp +896 -0
  117. package/include/opencv2/core/quaternion.hpp +1696 -0
  118. package/include/opencv2/core/quaternion.inl.hpp +1063 -0
  119. package/include/opencv2/core/saturate.hpp +180 -0
  120. package/include/opencv2/core/simd_intrinsics.hpp +87 -0
  121. package/include/opencv2/core/softfloat.hpp +514 -0
  122. package/include/opencv2/core/sse_utils.hpp +652 -0
  123. package/include/opencv2/core/traits.hpp +417 -0
  124. package/include/opencv2/core/types.hpp +2457 -0
  125. package/include/opencv2/core/types_c.h +2126 -0
  126. package/include/opencv2/core/utility.hpp +1229 -0
  127. package/include/opencv2/core/utils/allocator_stats.hpp +29 -0
  128. package/include/opencv2/core/utils/allocator_stats.impl.hpp +158 -0
  129. package/include/opencv2/core/utils/buffer_area.private.hpp +136 -0
  130. package/include/opencv2/core/utils/configuration.private.hpp +22 -0
  131. package/include/opencv2/core/utils/filesystem.hpp +82 -0
  132. package/include/opencv2/core/utils/filesystem.private.hpp +66 -0
  133. package/include/opencv2/core/utils/fp_control.private.hpp +29 -0
  134. package/include/opencv2/core/utils/fp_control_utils.hpp +69 -0
  135. package/include/opencv2/core/utils/instrumentation.hpp +125 -0
  136. package/include/opencv2/core/utils/lock.private.hpp +119 -0
  137. package/include/opencv2/core/utils/logger.defines.hpp +42 -0
  138. package/include/opencv2/core/utils/logger.hpp +218 -0
  139. package/include/opencv2/core/utils/logtag.hpp +28 -0
  140. package/include/opencv2/core/utils/plugin_loader.private.hpp +165 -0
  141. package/include/opencv2/core/utils/tls.hpp +235 -0
  142. package/include/opencv2/core/utils/trace.hpp +252 -0
  143. package/include/opencv2/core/utils/trace.private.hpp +421 -0
  144. package/include/opencv2/core/va_intel.hpp +75 -0
  145. package/include/opencv2/core/version.hpp +26 -0
  146. package/include/opencv2/core/vsx_utils.hpp +1047 -0
  147. package/include/opencv2/core.hpp +3365 -0
  148. package/include/opencv2/imgcodecs/imgcodecs.hpp +48 -0
  149. package/include/opencv2/imgcodecs/imgcodecs_c.h +1 -0
  150. package/include/opencv2/imgcodecs/ios.h +59 -0
  151. package/include/opencv2/imgcodecs/legacy/constants_c.h +54 -0
  152. package/include/opencv2/imgcodecs/macosx.h +20 -0
  153. package/include/opencv2/imgcodecs.hpp +407 -0
  154. package/include/opencv2/imgproc/bindings.hpp +34 -0
  155. package/include/opencv2/imgproc/detail/gcgraph.hpp +395 -0
  156. package/include/opencv2/imgproc/hal/hal.hpp +246 -0
  157. package/include/opencv2/imgproc/hal/interface.h +46 -0
  158. package/include/opencv2/imgproc/imgproc.hpp +48 -0
  159. package/include/opencv2/imgproc/imgproc_c.h +1177 -0
  160. package/include/opencv2/imgproc/segmentation.hpp +141 -0
  161. package/include/opencv2/imgproc/types_c.h +659 -0
  162. package/include/opencv2/imgproc.hpp +5035 -0
  163. package/include/opencv2/opencv_modules.hpp +17 -0
  164. package/libs/libjpeg-turbo.lib +0 -0
  165. package/libs/libpng.lib +0 -0
  166. package/libs/opencv_core470.lib +0 -0
  167. package/libs/opencv_imgcodecs470.lib +0 -0
  168. package/libs/opencv_imgproc470.lib +0 -0
  169. package/libs/zlib.lib +0 -0
  170. package/package.json +14 -3
  171. package/prebuilds/win32-x64/node.napi.node +0 -0
  172. package/src/cpp/capturewindow.cpp +36 -46
  173. package/src/cpp/main.cpp +10 -2
  174. package/src/cpp/opencv.cpp +425 -0
@@ -0,0 +1,3321 @@
1
+ /*M///////////////////////////////////////////////////////////////////////////////////////
2
+ //
3
+ // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4
+ //
5
+ // By downloading, copying, installing or using the software you agree to this license.
6
+ // If you do not agree to this license, do not download, install,
7
+ // copy or use the software.
8
+ //
9
+ //
10
+ // License Agreement
11
+ // For Open Source Computer Vision Library
12
+ //
13
+ // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14
+ // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15
+ // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16
+ // Copyright (C) 2015, Itseez Inc., all rights reserved.
17
+ // Third party copyrights are property of their respective owners.
18
+ //
19
+ // Redistribution and use in source and binary forms, with or without modification,
20
+ // are permitted provided that the following conditions are met:
21
+ //
22
+ // * Redistribution's of source code must retain the above copyright notice,
23
+ // this list of conditions and the following disclaimer.
24
+ //
25
+ // * Redistribution's in binary form must reproduce the above copyright notice,
26
+ // this list of conditions and the following disclaimer in the documentation
27
+ // and/or other materials provided with the distribution.
28
+ //
29
+ // * The name of the copyright holders may not be used to endorse or promote products
30
+ // derived from this software without specific prior written permission.
31
+ //
32
+ // This software is provided by the copyright holders and contributors "as is" and
33
+ // any express or implied warranties, including, but not limited to, the implied
34
+ // warranties of merchantability and fitness for a particular purpose are disclaimed.
35
+ // In no event shall the Intel Corporation or contributors be liable for any direct,
36
+ // indirect, incidental, special, exemplary, or consequential damages
37
+ // (including, but not limited to, procurement of substitute goods or services;
38
+ // loss of use, data, or profits; or business interruption) however caused
39
+ // and on any theory of liability, whether in contract, strict liability,
40
+ // or tort (including negligence or otherwise) arising in any way out of
41
+ // the use of this software, even if advised of the possibility of such damage.
42
+ //
43
+ //M*/
44
+
45
+ #ifndef OPENCV_HAL_INTRIN_CPP_HPP
46
+ #define OPENCV_HAL_INTRIN_CPP_HPP
47
+
48
+ #include <limits>
49
+ #include <cstring>
50
+ #include <algorithm>
51
+ #include "opencv2/core/utility.hpp"
52
+ #include "opencv2/core/saturate.hpp"
53
+
54
+ //! @cond IGNORED
55
+ #define CV_SIMD128_CPP 1
56
+ #if defined(CV_FORCE_SIMD128_CPP)
57
+ #define CV_SIMD128 1
58
+ #define CV_SIMD128_64F 1
59
+ #endif
60
+ #if defined(CV_DOXYGEN)
61
+ #define CV_SIMD128 1
62
+ #define CV_SIMD128_64F 1
63
+ #define CV_SIMD256 1
64
+ #define CV_SIMD256_64F 1
65
+ #define CV_SIMD512 1
66
+ #define CV_SIMD512_64F 1
67
+ #else
68
+ #define CV_SIMD256 0 // Explicitly disable SIMD256 and SIMD512 support for scalar intrinsic implementation
69
+ #define CV_SIMD512 0 // to avoid warnings during compilation
70
+ #endif
71
+ //! @endcond
72
+
73
+ namespace cv
74
+ {
75
+
76
+ #ifndef CV_DOXYGEN
77
+ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
78
+ #endif
79
+
80
+ /** @addtogroup core_hal_intrin
81
+
82
+ "Universal intrinsics" is a set of types and functions intended to simplify vectorization of code on
83
+ different platforms. Currently a few different SIMD extensions on different architectures are supported.
84
+ 128 bit registers of various types support is implemented for a wide range of architectures
85
+ including x86(__SSE/SSE2/SSE4.2__), ARM(__NEON__), PowerPC(__VSX__), MIPS(__MSA__).
86
+ 256 bit long registers are supported on x86(__AVX2__) and 512 bit long registers are supported on x86(__AVX512__).
87
+ In case when there is no SIMD extension available during compilation, fallback C++ implementation of intrinsics
88
+ will be chosen and code will work as expected although it could be slower.
89
+
90
+ ### Types
91
+
92
+ There are several types representing vector registers of packed values, each type is
93
+ implemented as a structure based on a one SIMD register.
94
+
95
+ - cv::v_uint8 and cv::v_int8: 8-bit integer values (unsigned/signed) - char
96
+ - cv::v_uint16 and cv::v_int16: 16-bit integer values (unsigned/signed) - short
97
+ - cv::v_uint32 and cv::v_int32: 32-bit integer values (unsigned/signed) - int
98
+ - cv::v_uint64 and cv::v_int64: 64-bit integer values (unsigned/signed) - int64
99
+ - cv::v_float32: 32-bit floating point values (signed) - float
100
+ - cv::v_float64: 64-bit floating point values (signed) - double
101
+
102
+ Exact bit length(and value quantity) of listed types is compile time deduced and depends on architecture SIMD
103
+ capabilities chosen as available during compilation of the library. All the types contain an __nlanes__ enumeration
104
+ to check for exact value quantity of the type.
105
+
106
+ In case the exact bit length of the type is important it is possible to use specific fixed length register types.
107
+
108
+ There are several types representing 128-bit registers.
109
+
110
+ - cv::v_uint8x16 and cv::v_int8x16: sixteen 8-bit integer values (unsigned/signed) - char
111
+ - cv::v_uint16x8 and cv::v_int16x8: eight 16-bit integer values (unsigned/signed) - short
112
+ - cv::v_uint32x4 and cv::v_int32x4: four 32-bit integer values (unsigned/signed) - int
113
+ - cv::v_uint64x2 and cv::v_int64x2: two 64-bit integer values (unsigned/signed) - int64
114
+ - cv::v_float32x4: four 32-bit floating point values (signed) - float
115
+ - cv::v_float64x2: two 64-bit floating point values (signed) - double
116
+
117
+ There are several types representing 256-bit registers.
118
+
119
+ - cv::v_uint8x32 and cv::v_int8x32: thirty two 8-bit integer values (unsigned/signed) - char
120
+ - cv::v_uint16x16 and cv::v_int16x16: sixteen 16-bit integer values (unsigned/signed) - short
121
+ - cv::v_uint32x8 and cv::v_int32x8: eight 32-bit integer values (unsigned/signed) - int
122
+ - cv::v_uint64x4 and cv::v_int64x4: four 64-bit integer values (unsigned/signed) - int64
123
+ - cv::v_float32x8: eight 32-bit floating point values (signed) - float
124
+ - cv::v_float64x4: four 64-bit floating point values (signed) - double
125
+
126
+ @note
127
+ 256 bit registers at the moment implemented for AVX2 SIMD extension only, if you want to use this type directly,
128
+ don't forget to check the CV_SIMD256 preprocessor definition:
129
+ @code
130
+ #if CV_SIMD256
131
+ //...
132
+ #endif
133
+ @endcode
134
+
135
+ There are several types representing 512-bit registers.
136
+
137
+ - cv::v_uint8x64 and cv::v_int8x64: sixty four 8-bit integer values (unsigned/signed) - char
138
+ - cv::v_uint16x32 and cv::v_int16x32: thirty two 16-bit integer values (unsigned/signed) - short
139
+ - cv::v_uint32x16 and cv::v_int32x16: sixteen 32-bit integer values (unsigned/signed) - int
140
+ - cv::v_uint64x8 and cv::v_int64x8: eight 64-bit integer values (unsigned/signed) - int64
141
+ - cv::v_float32x16: sixteen 32-bit floating point values (signed) - float
142
+ - cv::v_float64x8: eight 64-bit floating point values (signed) - double
143
+ @note
144
+ 512 bit registers at the moment implemented for AVX512 SIMD extension only, if you want to use this type directly,
145
+ don't forget to check the CV_SIMD512 preprocessor definition.
146
+
147
+ @note
148
+ cv::v_float64x2 is not implemented in NEON variant, if you want to use this type, don't forget to
149
+ check the CV_SIMD128_64F preprocessor definition.
150
+
151
+ ### Load and store operations
152
+
153
+ These operations allow setting the contents of the register explicitly or by loading it from some memory
154
+ block and to save contents of the register to memory block.
155
+
156
+ There are variable size register load operations that provide result of maximum available size
157
+ depending on chosen platform capabilities.
158
+ - Constructors:
159
+ @ref v_reg::v_reg(const _Tp *ptr) "from memory",
160
+ - Other create methods:
161
+ vx_setall_s8, vx_setall_u8, ...,
162
+ vx_setzero_u8, vx_setzero_s8, ...
163
+ - Memory load operations:
164
+ vx_load, vx_load_aligned, vx_load_low, vx_load_halves,
165
+ - Memory operations with expansion of values:
166
+ vx_load_expand, vx_load_expand_q
167
+
168
+ Also there are fixed size register load/store operations.
169
+
170
+ For 128 bit registers
171
+ - Constructors:
172
+ @ref v_reg::v_reg(const _Tp *ptr) "from memory",
173
+ @ref v_reg::v_reg(_Tp s0, _Tp s1) "from two values", ...
174
+ - Other create methods:
175
+ @ref v_setall_s8, @ref v_setall_u8, ...,
176
+ @ref v_setzero_u8, @ref v_setzero_s8, ...
177
+ - Memory load operations:
178
+ @ref v_load, @ref v_load_aligned, @ref v_load_low, @ref v_load_halves,
179
+ - Memory operations with expansion of values:
180
+ @ref v_load_expand, @ref v_load_expand_q
181
+
182
+ For 256 bit registers(check CV_SIMD256 preprocessor definition)
183
+ - Constructors:
184
+ @ref v_reg::v_reg(const _Tp *ptr) "from memory",
185
+ @ref v_reg::v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) "from four values", ...
186
+ - Other create methods:
187
+ @ref v256_setall_s8, @ref v256_setall_u8, ...,
188
+ @ref v256_setzero_u8, @ref v256_setzero_s8, ...
189
+ - Memory load operations:
190
+ @ref v256_load, @ref v256_load_aligned, @ref v256_load_low, @ref v256_load_halves,
191
+ - Memory operations with expansion of values:
192
+ @ref v256_load_expand, @ref v256_load_expand_q
193
+
194
+ For 512 bit registers(check CV_SIMD512 preprocessor definition)
195
+ - Constructors:
196
+ @ref v_reg::v_reg(const _Tp *ptr) "from memory",
197
+ @ref v_reg::v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, _Tp s4, _Tp s5, _Tp s6, _Tp s7) "from eight values", ...
198
+ - Other create methods:
199
+ @ref v512_setall_s8, @ref v512_setall_u8, ...,
200
+ @ref v512_setzero_u8, @ref v512_setzero_s8, ...
201
+ - Memory load operations:
202
+ @ref v512_load, @ref v512_load_aligned, @ref v512_load_low, @ref v512_load_halves,
203
+ - Memory operations with expansion of values:
204
+ @ref v512_load_expand, @ref v512_load_expand_q
205
+
206
+ Store to memory operations are similar across different platform capabilities:
207
+ @ref v_store, @ref v_store_aligned,
208
+ @ref v_store_high, @ref v_store_low
209
+
210
+ ### Value reordering
211
+
212
+ These operations allow to reorder or recombine elements in one or multiple vectors.
213
+
214
+ - Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
215
+ - Expand: @ref v_expand, @ref v_expand_low, @ref v_expand_high
216
+ - Pack: @ref v_pack, @ref v_pack_u, @ref v_pack_b, @ref v_rshr_pack, @ref v_rshr_pack_u,
217
+ @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
218
+ - Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
219
+ - Reverse: @ref v_reverse
220
+ - Extract: @ref v_extract
221
+
222
+
223
+ ### Arithmetic, bitwise and comparison operations
224
+
225
+ Element-wise binary and unary operations.
226
+
227
+ - Arithmetics:
228
+ @ref operator +(const v_reg &a, const v_reg &b) "+",
229
+ @ref operator -(const v_reg &a, const v_reg &b) "-",
230
+ @ref operator *(const v_reg &a, const v_reg &b) "*",
231
+ @ref operator /(const v_reg &a, const v_reg &b) "/",
232
+ @ref v_mul_expand
233
+
234
+ - Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap
235
+
236
+ - Bitwise shifts:
237
+ @ref operator <<(const v_reg &a, int s) "<<",
238
+ @ref operator >>(const v_reg &a, int s) ">>",
239
+ @ref v_shl, @ref v_shr
240
+
241
+ - Bitwise logic:
242
+ @ref operator &(const v_reg &a, const v_reg &b) "&",
243
+ @ref operator |(const v_reg &a, const v_reg &b) "|",
244
+ @ref operator ^(const v_reg &a, const v_reg &b) "^",
245
+ @ref operator ~(const v_reg &a) "~"
246
+
247
+ - Comparison:
248
+ @ref operator >(const v_reg &a, const v_reg &b) ">",
249
+ @ref operator >=(const v_reg &a, const v_reg &b) ">=",
250
+ @ref operator <(const v_reg &a, const v_reg &b) "<",
251
+ @ref operator <=(const v_reg &a, const v_reg &b) "<=",
252
+ @ref operator ==(const v_reg &a, const v_reg &b) "==",
253
+ @ref operator !=(const v_reg &a, const v_reg &b) "!="
254
+
255
+ - min/max: @ref v_min, @ref v_max
256
+
257
+ ### Reduce and mask
258
+
259
+ Most of these operations return only one value.
260
+
261
+ - Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum, @ref v_popcount
262
+ - Mask: @ref v_signmask, @ref v_check_all, @ref v_check_any, @ref v_select
263
+
264
+ ### Other math
265
+
266
+ - Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
267
+ - Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs
268
+
269
+ ### Conversions
270
+
271
+ Different type conversions and casts:
272
+
273
+ - Rounding: @ref v_round, @ref v_floor, @ref v_ceil, @ref v_trunc,
274
+ - To float: @ref v_cvt_f32, @ref v_cvt_f64
275
+ - Reinterpret: @ref v_reinterpret_as_u8, @ref v_reinterpret_as_s8, ...
276
+
277
+ ### Matrix operations
278
+
279
+ In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_dotprod_fast,
280
+ @ref v_dotprod_expand, @ref v_dotprod_expand_fast, @ref v_matmul, @ref v_transpose4x4
281
+
282
+ ### Usability
283
+
284
+ Most operations are implemented only for some subset of the available types; the following matrices
285
+ show the applicability of different operations to the types.
286
+
287
+ Regular integers:
288
+
289
+ | Operations\\Types | uint 8 | int 8 | uint 16 | int 16 | uint 32 | int 32 |
290
+ |-------------------|:-:|:-:|:-:|:-:|:-:|:-:|
291
+ |load, store | x | x | x | x | x | x |
292
+ |interleave | x | x | x | x | x | x |
293
+ |expand | x | x | x | x | x | x |
294
+ |expand_low | x | x | x | x | x | x |
295
+ |expand_high | x | x | x | x | x | x |
296
+ |expand_q | x | x | | | | |
297
+ |add, sub | x | x | x | x | x | x |
298
+ |add_wrap, sub_wrap | x | x | x | x | | |
299
+ |mul_wrap | x | x | x | x | | |
300
+ |mul | x | x | x | x | x | x |
301
+ |mul_expand | x | x | x | x | x | |
302
+ |compare | x | x | x | x | x | x |
303
+ |shift | | | x | x | x | x |
304
+ |dotprod | | | | x | | x |
305
+ |dotprod_fast | | | | x | | x |
306
+ |dotprod_expand | x | x | x | x | | x |
307
+ |dotprod_expand_fast| x | x | x | x | | x |
308
+ |logical | x | x | x | x | x | x |
309
+ |min, max | x | x | x | x | x | x |
310
+ |absdiff | x | x | x | x | x | x |
311
+ |absdiffs | | x | | x | | |
312
+ |reduce | x | x | x | x | x | x |
313
+ |mask | x | x | x | x | x | x |
314
+ |pack | x | x | x | x | x | x |
315
+ |pack_u | x | | x | | | |
316
+ |pack_b | x | | | | | |
317
+ |unpack | x | x | x | x | x | x |
318
+ |extract | x | x | x | x | x | x |
319
+ |rotate (lanes) | x | x | x | x | x | x |
320
+ |cvt_flt32 | | | | | | x |
321
+ |cvt_flt64 | | | | | | x |
322
+ |transpose4x4 | | | | | x | x |
323
+ |reverse | x | x | x | x | x | x |
324
+ |extract_n | x | x | x | x | x | x |
325
+ |broadcast_element | | | | | x | x |
326
+
327
+ Big integers:
328
+
329
+ | Operations\\Types | uint 64 | int 64 |
330
+ |-------------------|:-:|:-:|
331
+ |load, store | x | x |
332
+ |add, sub | x | x |
333
+ |shift | x | x |
334
+ |logical | x | x |
335
+ |reverse | x | x |
336
+ |extract | x | x |
337
+ |rotate (lanes) | x | x |
338
+ |cvt_flt64 | | x |
339
+ |extract_n | x | x |
340
+
341
+ Floating point:
342
+
343
+ | Operations\\Types | float 32 | float 64 |
344
+ |-------------------|:-:|:-:|
345
+ |load, store | x | x |
346
+ |interleave | x | |
347
+ |add, sub | x | x |
348
+ |mul | x | x |
349
+ |div | x | x |
350
+ |compare | x | x |
351
+ |min, max | x | x |
352
+ |absdiff | x | x |
353
+ |reduce | x | |
354
+ |mask | x | x |
355
+ |unpack | x | x |
356
+ |cvt_flt32 | | x |
357
+ |cvt_flt64 | x | |
358
+ |sqrt, abs | x | x |
359
+ |float math | x | x |
360
+ |transpose4x4 | x | |
361
+ |extract | x | x |
362
+ |rotate (lanes) | x | x |
363
+ |reverse | x | x |
364
+ |extract_n | x | x |
365
+ |broadcast_element | x | |
366
+
367
+ @{ */
368
+
369
+ template<typename _Tp, int n> struct v_reg
370
+ {
371
+ //! @cond IGNORED
372
+ typedef _Tp lane_type;
373
+ enum { nlanes = n };
374
+ //! @endcond
375
+
376
+ /** @brief Constructor
377
+
378
+ Initializes register with data from memory
379
+ @param ptr pointer to memory block with data for register */
380
+ explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }
381
+
382
+ /** @brief Constructor
383
+
384
+ Initializes register with two 64-bit values */
385
+ v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }
386
+
387
+ /** @brief Constructor
388
+
389
+ Initializes register with four 32-bit values */
390
+ v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }
391
+
392
+ /** @brief Constructor
393
+
394
+ Initializes register with eight 16-bit values */
395
+ v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
396
+ _Tp s4, _Tp s5, _Tp s6, _Tp s7)
397
+ {
398
+ s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
399
+ s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
400
+ }
401
+
402
+ /** @brief Constructor
403
+
404
+ Initializes register with sixteen 8-bit values */
405
+ v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
406
+ _Tp s4, _Tp s5, _Tp s6, _Tp s7,
407
+ _Tp s8, _Tp s9, _Tp s10, _Tp s11,
408
+ _Tp s12, _Tp s13, _Tp s14, _Tp s15)
409
+ {
410
+ s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
411
+ s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
412
+ s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11;
413
+ s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
414
+ }
415
+
416
+ /** @brief Default constructor
417
+
418
+ Does not initialize anything*/
419
+ v_reg() {}
420
+
421
+ /** @brief Copy constructor */
422
+ v_reg(const v_reg<_Tp, n> & r)
423
+ {
424
+ for( int i = 0; i < n; i++ )
425
+ s[i] = r.s[i];
426
+ }
427
+ /** @brief Access first value
428
+
429
+ Returns value of the first lane according to register type, for example:
430
+ @code{.cpp}
431
+ v_int32x4 r(1, 2, 3, 4);
432
+ int v = r.get0(); // returns 1
433
+ v_uint64x2 r(1, 2);
434
+ uint64_t v = r.get0(); // returns 1
435
+ @endcode
436
+ */
437
+ _Tp get0() const { return s[0]; }
438
+
439
+ //! @cond IGNORED
440
+ _Tp get(const int i) const { return s[i]; }
441
+ v_reg<_Tp, n> high() const
442
+ {
443
+ v_reg<_Tp, n> c;
444
+ int i;
445
+ for( i = 0; i < n/2; i++ )
446
+ {
447
+ c.s[i] = s[i+(n/2)];
448
+ c.s[i+(n/2)] = 0;
449
+ }
450
+ return c;
451
+ }
452
+
453
+ static v_reg<_Tp, n> zero()
454
+ {
455
+ v_reg<_Tp, n> c;
456
+ for( int i = 0; i < n; i++ )
457
+ c.s[i] = (_Tp)0;
458
+ return c;
459
+ }
460
+
461
+ static v_reg<_Tp, n> all(_Tp s)
462
+ {
463
+ v_reg<_Tp, n> c;
464
+ for( int i = 0; i < n; i++ )
465
+ c.s[i] = s;
466
+ return c;
467
+ }
468
+
469
+ template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const
470
+ {
471
+ size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n);
472
+ v_reg<_Tp2, n2> c;
473
+ std::memcpy(&c.s[0], &s[0], bytes);
474
+ return c;
475
+ }
476
+
477
+ v_reg& operator=(const v_reg<_Tp, n> & r)
478
+ {
479
+ for( int i = 0; i < n; i++ )
480
+ s[i] = r.s[i];
481
+ return *this;
482
+ }
483
+
484
+ _Tp s[n];
485
+ //! @endcond
486
+ };
487
+
488
+ /** @brief Sixteen 8-bit unsigned integer values */
489
+ typedef v_reg<uchar, 16> v_uint8x16;
490
+ /** @brief Sixteen 8-bit signed integer values */
491
+ typedef v_reg<schar, 16> v_int8x16;
492
+ /** @brief Eight 16-bit unsigned integer values */
493
+ typedef v_reg<ushort, 8> v_uint16x8;
494
+ /** @brief Eight 16-bit signed integer values */
495
+ typedef v_reg<short, 8> v_int16x8;
496
+ /** @brief Four 32-bit unsigned integer values */
497
+ typedef v_reg<unsigned, 4> v_uint32x4;
498
+ /** @brief Four 32-bit signed integer values */
499
+ typedef v_reg<int, 4> v_int32x4;
500
+ /** @brief Four 32-bit floating point values (single precision) */
501
+ typedef v_reg<float, 4> v_float32x4;
502
+ /** @brief Two 64-bit floating point values (double precision) */
503
+ typedef v_reg<double, 2> v_float64x2;
504
+ /** @brief Two 64-bit unsigned integer values */
505
+ typedef v_reg<uint64, 2> v_uint64x2;
506
+ /** @brief Two 64-bit signed integer values */
507
+ typedef v_reg<int64, 2> v_int64x2;
508
+
509
+ #if CV_SIMD256
510
+ /** @brief Thirty two 8-bit unsigned integer values */
511
+ typedef v_reg<uchar, 32> v_uint8x32;
512
+ /** @brief Thirty two 8-bit signed integer values */
513
+ typedef v_reg<schar, 32> v_int8x32;
514
+ /** @brief Sixteen 16-bit unsigned integer values */
515
+ typedef v_reg<ushort, 16> v_uint16x16;
516
+ /** @brief Sixteen 16-bit signed integer values */
517
+ typedef v_reg<short, 16> v_int16x16;
518
+ /** @brief Eight 32-bit unsigned integer values */
519
+ typedef v_reg<unsigned, 8> v_uint32x8;
520
+ /** @brief Eight 32-bit signed integer values */
521
+ typedef v_reg<int, 8> v_int32x8;
522
+ /** @brief Eight 32-bit floating point values (single precision) */
523
+ typedef v_reg<float, 8> v_float32x8;
524
+ /** @brief Four 64-bit floating point values (double precision) */
525
+ typedef v_reg<double, 4> v_float64x4;
526
+ /** @brief Four 64-bit unsigned integer values */
527
+ typedef v_reg<uint64, 4> v_uint64x4;
528
+ /** @brief Four 64-bit signed integer values */
529
+ typedef v_reg<int64, 4> v_int64x4;
530
+ #endif
531
+
532
+ #if CV_SIMD512
533
+ /** @brief Sixty four 8-bit unsigned integer values */
534
+ typedef v_reg<uchar, 64> v_uint8x64;
535
+ /** @brief Sixty four 8-bit signed integer values */
536
+ typedef v_reg<schar, 64> v_int8x64;
537
+ /** @brief Thirty two 16-bit unsigned integer values */
538
+ typedef v_reg<ushort, 32> v_uint16x32;
539
+ /** @brief Thirty two 16-bit signed integer values */
540
+ typedef v_reg<short, 32> v_int16x32;
541
+ /** @brief Sixteen 32-bit unsigned integer values */
542
+ typedef v_reg<unsigned, 16> v_uint32x16;
543
+ /** @brief Sixteen 32-bit signed integer values */
544
+ typedef v_reg<int, 16> v_int32x16;
545
+ /** @brief Sixteen 32-bit floating point values (single precision) */
546
+ typedef v_reg<float, 16> v_float32x16;
547
+ /** @brief Eight 64-bit floating point values (double precision) */
548
+ typedef v_reg<double, 8> v_float64x8;
549
+ /** @brief Eight 64-bit unsigned integer values */
550
+ typedef v_reg<uint64, 8> v_uint64x8;
551
+ /** @brief Eight 64-bit signed integer values */
552
+ typedef v_reg<int64, 8> v_int64x8;
553
+ #endif
554
+
555
+ enum {
556
+ simd128_width = 16,
557
+ #if CV_SIMD256
558
+ simd256_width = 32,
559
+ #endif
560
+ #if CV_SIMD512
561
+ simd512_width = 64,
562
+ simdmax_width = simd512_width
563
+ #elif CV_SIMD256
564
+ simdmax_width = simd256_width
565
+ #else
566
+ simdmax_width = simd128_width
567
+ #endif
568
+ };
569
+
570
+ /** @brief Add values
571
+
572
+ For all types. */
573
+ template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator+(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
574
+ template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator+=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
575
+
576
+ /** @brief Subtract values
577
+
578
+ For all types. */
579
+ template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator-(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
580
+ template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator-=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
581
+
582
+ /** @brief Multiply values
583
+
584
+ For 16- and 32-bit integer types and floating types. */
585
+ template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator*(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
586
+ template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator*=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
587
+
588
+ /** @brief Divide values
589
+
590
+ For floating types only. */
591
+ template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator/(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
592
+ template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator/=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
593
+
594
+
595
+ /** @brief Bitwise AND
596
+
597
+ Only for integer types. */
598
+ template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator&(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
599
+ template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator&=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
600
+
601
+ /** @brief Bitwise OR
602
+
603
+ Only for integer types. */
604
+ template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
605
+ template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
606
+
607
+ /** @brief Bitwise XOR
608
+
609
+ Only for integer types.*/
610
+ template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator^(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
611
+ template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator^=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
612
+
613
+ /** @brief Bitwise NOT
614
+
615
+ Only for integer types.*/
616
+ template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator~(const v_reg<_Tp, n>& a);
617
+
618
+
619
+ #ifndef CV_DOXYGEN
620
+
621
+ #define CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, ...) \
622
+ __CV_EXPAND(macro_name(uchar, __VA_ARGS__)) \
623
+ __CV_EXPAND(macro_name(schar, __VA_ARGS__)) \
624
+ __CV_EXPAND(macro_name(ushort, __VA_ARGS__)) \
625
+ __CV_EXPAND(macro_name(short, __VA_ARGS__)) \
626
+ __CV_EXPAND(macro_name(unsigned, __VA_ARGS__)) \
627
+ __CV_EXPAND(macro_name(int, __VA_ARGS__)) \
628
+ __CV_EXPAND(macro_name(uint64, __VA_ARGS__)) \
629
+ __CV_EXPAND(macro_name(int64, __VA_ARGS__)) \
630
+
631
+ #define CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, ...) \
632
+ __CV_EXPAND(macro_name(float, __VA_ARGS__)) \
633
+ __CV_EXPAND(macro_name(double, __VA_ARGS__)) \
634
+
635
+ #define CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(macro_name, ...) \
636
+ CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, __VA_ARGS__) \
637
+ CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, __VA_ARGS__) \
638
+
639
+ #define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op) \
640
+ template<int n> inline \
641
+ v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
642
+ { \
643
+ v_reg<_Tp, n> c; \
644
+ for( int i = 0; i < n; i++ ) \
645
+ c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
646
+ return c; \
647
+ } \
648
+ template<int n> inline \
649
+ v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
650
+ { \
651
+ for( int i = 0; i < n; i++ ) \
652
+ a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
653
+ return a; \
654
+ }
655
+
656
+ #define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op)
657
+
658
+ CV__HAL_INTRIN_IMPL_BIN_OP(+)
659
+ CV__HAL_INTRIN_IMPL_BIN_OP(-)
660
+ CV__HAL_INTRIN_IMPL_BIN_OP(*)
661
+ CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /)
662
+
663
+ #define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op) \
664
+ template<int n> CV_INLINE \
665
+ v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
666
+ { \
667
+ v_reg<_Tp, n> c; \
668
+ typedef typename V_TypeTraits<_Tp>::int_type itype; \
669
+ for( int i = 0; i < n; i++ ) \
670
+ c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
671
+ V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
672
+ return c; \
673
+ } \
674
+ template<int n> CV_INLINE \
675
+ v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
676
+ { \
677
+ typedef typename V_TypeTraits<_Tp>::int_type itype; \
678
+ for( int i = 0; i < n; i++ ) \
679
+ a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
680
+ V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
681
+ return a; \
682
+ }
683
+
684
+ #define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op) \
685
+ CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) \
686
+ CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) /* TODO: FIXIT remove this after masks refactoring */
687
+
688
+
689
+ CV__HAL_INTRIN_IMPL_BIT_OP(&)
690
+ CV__HAL_INTRIN_IMPL_BIT_OP(|)
691
+ CV__HAL_INTRIN_IMPL_BIT_OP(^)
692
+
693
+ #define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy) \
694
+ template<int n> CV_INLINE \
695
+ v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \
696
+ { \
697
+ v_reg<_Tp, n> c; \
698
+ for( int i = 0; i < n; i++ ) \
699
+ c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i])); \
700
+ return c; \
701
+ } \
702
+
703
+ CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~)
704
+
705
+ #endif // !CV_DOXYGEN
706
+
707
+
708
+ //! @brief Helper macro
709
+ //! @ingroup core_hal_intrin_impl
710
+ #define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
711
+ template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
712
+ { \
713
+ v_reg<_Tp2, n> c; \
714
+ for( int i = 0; i < n; i++ ) \
715
+ c.s[i] = cfunc(a.s[i]); \
716
+ return c; \
717
+ }
718
+
719
+ /** @brief Square root of elements
720
+
721
+ Only for floating point types.*/
722
+ OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
723
+
724
+ //! @cond IGNORED
725
+ OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
726
+ OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
727
+ OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
728
+ OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
729
+ //! @endcond
730
+
731
+ /** @brief Absolute value of elements
732
+
733
+ Only for floating point types.*/
734
+ OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
735
+ typename V_TypeTraits<_Tp>::abs_type)
736
+
737
+ //! @brief Helper macro
738
+ //! @ingroup core_hal_intrin_impl
739
+ #define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \
740
+ template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
741
+ { \
742
+ v_reg<_Tp, n> c; \
743
+ for( int i = 0; i < n; i++ ) \
744
+ c.s[i] = cfunc(a.s[i], b.s[i]); \
745
+ return c; \
746
+ }
747
+
748
+ //! @brief Helper macro
749
+ //! @ingroup core_hal_intrin_impl
750
+ #define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \
751
+ template<typename _Tp, int n> inline _Tp func(const v_reg<_Tp, n>& a) \
752
+ { \
753
+ _Tp c = a.s[0]; \
754
+ for( int i = 1; i < n; i++ ) \
755
+ c = cfunc(c, a.s[i]); \
756
+ return c; \
757
+ }
758
+
759
+ /** @brief Choose min values for each pair
760
+
761
+ Scheme:
762
+ @code
763
+ {A1 A2 ...}
764
+ {B1 B2 ...}
765
+ --------------
766
+ {min(A1,B1) min(A2,B2) ...}
767
+ @endcode
768
+ For all types except 64-bit integer. */
769
+ OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min)
770
+
771
+ /** @brief Choose max values for each pair
772
+
773
+ Scheme:
774
+ @code
775
+ {A1 A2 ...}
776
+ {B1 B2 ...}
777
+ --------------
778
+ {max(A1,B1) max(A2,B2) ...}
779
+ @endcode
780
+ For all types except 64-bit integer. */
781
+ OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max)
782
+
783
+ /** @brief Find one min value
784
+
785
+ Scheme:
786
+ @code
787
+ {A1 A2 A3 ...} => min(A1,A2,A3,...)
788
+ @endcode
789
+ For all types except 64-bit integer and 64-bit floating point types. */
790
+ OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min)
791
+
792
+ /** @brief Find one max value
793
+
794
+ Scheme:
795
+ @code
796
+ {A1 A2 A3 ...} => max(A1,A2,A3,...)
797
+ @endcode
798
+ For all types except 64-bit integer and 64-bit floating point types. */
799
+ OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max)
800
+
801
// Lookup table: popCountTable[b] is the number of set bits in byte value b
// (0..255). Used by v_popcount to count bits one byte at a time.
static const unsigned char popCountTable[] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};
820
+ /** @brief Count the 1 bits in the vector lanes and return result as corresponding unsigned type
821
+
822
+ Scheme:
823
+ @code
824
+ {A1 A2 A3 ...} => {popcount(A1), popcount(A2), popcount(A3), ...}
825
+ @endcode
826
+ For all integer types. */
827
+ template<typename _Tp, int n>
828
+ inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_popcount(const v_reg<_Tp, n>& a)
829
+ {
830
+ v_reg<typename V_TypeTraits<_Tp>::abs_type, n> b = v_reg<typename V_TypeTraits<_Tp>::abs_type, n>::zero();
831
+ for (int i = 0; i < n*(int)sizeof(_Tp); i++)
832
+ b.s[i/sizeof(_Tp)] += popCountTable[v_reinterpret_as_u8(a).s[i]];
833
+ return b;
834
+ }
835
+
836
+
837
+ //! @cond IGNORED
838
+ template<typename _Tp, int n>
839
+ inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
840
+ v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
841
+ {
842
+ for( int i = 0; i < n; i++ )
843
+ {
844
+ minval.s[i] = std::min(a.s[i], b.s[i]);
845
+ maxval.s[i] = std::max(a.s[i], b.s[i]);
846
+ }
847
+ }
848
+ //! @endcond
849
+
850
+ //! @brief Helper macro
851
+ //! @ingroup core_hal_intrin_impl
852
+ #define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
853
+ template<typename _Tp, int n> \
854
+ inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
855
+ { \
856
+ typedef typename V_TypeTraits<_Tp>::int_type itype; \
857
+ v_reg<_Tp, n> c; \
858
+ for( int i = 0; i < n; i++ ) \
859
+ c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \
860
+ return c; \
861
+ }
862
+
863
+ /** @brief Less-than comparison
864
+
865
+ For all types except 64-bit integer values. */
866
+ OPENCV_HAL_IMPL_CMP_OP(<)
867
+
868
+ /** @brief Greater-than comparison
869
+
870
+ For all types except 64-bit integer values. */
871
+ OPENCV_HAL_IMPL_CMP_OP(>)
872
+
873
+ /** @brief Less-than or equal comparison
874
+
875
+ For all types except 64-bit integer values. */
876
+ OPENCV_HAL_IMPL_CMP_OP(<=)
877
+
878
+ /** @brief Greater-than or equal comparison
879
+
880
+ For all types except 64-bit integer values. */
881
+ OPENCV_HAL_IMPL_CMP_OP(>=)
882
+
883
+ /** @brief Equal comparison
884
+
885
+ For all types except 64-bit integer values. */
886
+ OPENCV_HAL_IMPL_CMP_OP(==)
887
+
888
+ /** @brief Not equal comparison
889
+
890
+ For all types except 64-bit integer values. */
891
+ OPENCV_HAL_IMPL_CMP_OP(!=)
892
+
893
+ template<int n>
894
+ inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
895
+ {
896
+ typedef typename V_TypeTraits<float>::int_type itype;
897
+ v_reg<float, n> c;
898
+ for (int i = 0; i < n; i++)
899
+ c.s[i] = V_TypeTraits<float>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
900
+ return c;
901
+ }
902
+ template<int n>
903
+ inline v_reg<double, n> v_not_nan(const v_reg<double, n>& a)
904
+ {
905
+ typedef typename V_TypeTraits<double>::int_type itype;
906
+ v_reg<double, n> c;
907
+ for (int i = 0; i < n; i++)
908
+ c.s[i] = V_TypeTraits<double>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
909
+ return c;
910
+ }
911
+
912
+ //! @brief Helper macro
913
+ //! @ingroup core_hal_intrin_impl
914
+ #define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \
915
+ template<typename _Tp, int n> \
916
+ inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
917
+ { \
918
+ typedef _Tp2 rtype; \
919
+ v_reg<rtype, n> c; \
920
+ for( int i = 0; i < n; i++ ) \
921
+ c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \
922
+ return c; \
923
+ }
924
+
925
+ /** @brief Add values without saturation
926
+
927
+ For 8- and 16-bit integer values. */
928
+ OPENCV_HAL_IMPL_ARITHM_OP(v_add_wrap, +, (_Tp), _Tp)
929
+
930
+ /** @brief Subtract values without saturation
931
+
932
+ For 8- and 16-bit integer values. */
933
+ OPENCV_HAL_IMPL_ARITHM_OP(v_sub_wrap, -, (_Tp), _Tp)
934
+
935
+ /** @brief Multiply values without saturation
936
+
937
+ For 8- and 16-bit integer values. */
938
+ OPENCV_HAL_IMPL_ARITHM_OP(v_mul_wrap, *, (_Tp), _Tp)
939
+
940
//! @cond IGNORED
// Scalar absolute difference |a - b|, written to avoid overflow/underflow
// of unsigned types (subtracts the smaller from the larger).
template<typename T> inline T _absdiff(T a, T b)
{
    return a > b ? a - b : b - a;
}
//! @endcond
946
+
947
+ /** @brief Absolute difference
948
+
949
+ Returns \f$ |a - b| \f$ converted to corresponding unsigned type.
950
+ Example:
951
+ @code{.cpp}
952
+ v_int32x4 a, b; // {1, 2, 3, 4} and {4, 3, 2, 1}
953
+ v_uint32x4 c = v_absdiff(a, b); // result is {3, 1, 1, 3}
954
+ @endcode
955
+ For 8-, 16-, 32-bit integer source types. */
956
+ template<typename _Tp, int n>
957
+ inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b)
958
+ {
959
+ typedef typename V_TypeTraits<_Tp>::abs_type rtype;
960
+ v_reg<rtype, n> c;
961
+ const rtype mask = (rtype)(std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0);
962
+ for( int i = 0; i < n; i++ )
963
+ {
964
+ rtype ua = a.s[i] ^ mask;
965
+ rtype ub = b.s[i] ^ mask;
966
+ c.s[i] = _absdiff(ua, ub);
967
+ }
968
+ return c;
969
+ }
970
+
971
+ /** @overload
972
+
973
+ For 32-bit floating point values */
974
+ template<int n> inline v_reg<float, n> v_absdiff(const v_reg<float, n>& a, const v_reg<float, n>& b)
975
+ {
976
+ v_reg<float, n> c;
977
+ for( int i = 0; i < c.nlanes; i++ )
978
+ c.s[i] = _absdiff(a.s[i], b.s[i]);
979
+ return c;
980
+ }
981
+
982
+ /** @overload
983
+
984
+ For 64-bit floating point values */
985
+ template<int n> inline v_reg<double, n> v_absdiff(const v_reg<double, n>& a, const v_reg<double, n>& b)
986
+ {
987
+ v_reg<double, n> c;
988
+ for( int i = 0; i < c.nlanes; i++ )
989
+ c.s[i] = _absdiff(a.s[i], b.s[i]);
990
+ return c;
991
+ }
992
+
993
+ /** @brief Saturating absolute difference
994
+
995
+ Returns \f$ saturate(|a - b|) \f$ .
996
+ For 8-, 16-bit signed integer source types. */
997
+ template<typename _Tp, int n>
998
+ inline v_reg<_Tp, n> v_absdiffs(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
999
+ {
1000
+ v_reg<_Tp, n> c;
1001
+ for( int i = 0; i < n; i++)
1002
+ c.s[i] = saturate_cast<_Tp>(std::abs(a.s[i] - b.s[i]));
1003
+ return c;
1004
+ }
1005
+
1006
+ /** @brief Inversed square root
1007
+
1008
+ Returns \f$ 1/sqrt(a) \f$
1009
+ For floating point types only. */
1010
+ template<typename _Tp, int n>
1011
+ inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
1012
+ {
1013
+ v_reg<_Tp, n> c;
1014
+ for( int i = 0; i < n; i++ )
1015
+ c.s[i] = 1.f/std::sqrt(a.s[i]);
1016
+ return c;
1017
+ }
1018
+
1019
+ /** @brief Magnitude
1020
+
1021
+ Returns \f$ sqrt(a^2 + b^2) \f$
1022
+ For floating point types only. */
1023
+ template<typename _Tp, int n>
1024
+ inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1025
+ {
1026
+ v_reg<_Tp, n> c;
1027
+ for( int i = 0; i < n; i++ )
1028
+ c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]);
1029
+ return c;
1030
+ }
1031
+
1032
+ /** @brief Square of the magnitude
1033
+
1034
+ Returns \f$ a^2 + b^2 \f$
1035
+ For floating point types only. */
1036
+ template<typename _Tp, int n>
1037
+ inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1038
+ {
1039
+ v_reg<_Tp, n> c;
1040
+ for( int i = 0; i < n; i++ )
1041
+ c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i];
1042
+ return c;
1043
+ }
1044
+
1045
+ /** @brief Multiply and add
1046
+
1047
+ Returns \f$ a*b + c \f$
1048
+ For floating point types and signed 32bit int only. */
1049
+ template<typename _Tp, int n>
1050
+ inline v_reg<_Tp, n> v_fma(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
1051
+ const v_reg<_Tp, n>& c)
1052
+ {
1053
+ v_reg<_Tp, n> d;
1054
+ for( int i = 0; i < n; i++ )
1055
+ d.s[i] = a.s[i]*b.s[i] + c.s[i];
1056
+ return d;
1057
+ }
1058
+
1059
+ /** @brief A synonym for v_fma */
1060
+ template<typename _Tp, int n>
1061
+ inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
1062
+ const v_reg<_Tp, n>& c)
1063
+ {
1064
+ return v_fma(a, b, c);
1065
+ }
1066
+
1067
+ /** @brief Dot product of elements
1068
+
1069
+ Multiply values in two registers and sum adjacent result pairs.
1070
+
1071
+ Scheme:
1072
+ @code
1073
+ {A1 A2 ...} // 16-bit
1074
+ x {B1 B2 ...} // 16-bit
1075
+ -------------
1076
+ {A1B1+A2B2 ...} // 32-bit
1077
+
1078
+ @endcode
1079
+ */
1080
+ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
1081
+ v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1082
+ {
1083
+ typedef typename V_TypeTraits<_Tp>::w_type w_type;
1084
+ v_reg<w_type, n/2> c;
1085
+ for( int i = 0; i < (n/2); i++ )
1086
+ c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1];
1087
+ return c;
1088
+ }
1089
+
1090
+ /** @brief Dot product of elements
1091
+
1092
+ Same as cv::v_dotprod, but add a third element to the sum of adjacent pairs.
1093
+ Scheme:
1094
+ @code
1095
+ {A1 A2 ...} // 16-bit
1096
+ x {B1 B2 ...} // 16-bit
1097
+ -------------
1098
+ {A1B1+A2B2+C1 ...} // 32-bit
1099
+ @endcode
1100
+ */
1101
+ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
1102
+ v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
1103
+ const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
1104
+ {
1105
+ typedef typename V_TypeTraits<_Tp>::w_type w_type;
1106
+ v_reg<w_type, n/2> s;
1107
+ for( int i = 0; i < (n/2); i++ )
1108
+ s.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1] + c.s[i];
1109
+ return s;
1110
+ }
1111
+
1112
+ /** @brief Fast Dot product of elements
1113
+
1114
+ Same as cv::v_dotprod, but it may perform unorder sum between result pairs in some platforms,
1115
+ this intrinsic can be used if the sum among all lanes is only matters
1116
+ and also it should be yielding better performance on the affected platforms.
1117
+
1118
+ */
1119
+ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
1120
+ v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1121
+ { return v_dotprod(a, b); }
1122
+
1123
+ /** @brief Fast Dot product of elements
1124
+
1125
+ Same as cv::v_dotprod_fast, but add a third element to the sum of adjacent pairs.
1126
+ */
1127
+ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
1128
+ v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
1129
+ const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
1130
+ { return v_dotprod(a, b, c); }
1131
+
1132
+ /** @brief Dot product of elements and expand
1133
+
1134
+ Multiply values in two registers and expand the sum of adjacent result pairs.
1135
+
1136
+ Scheme:
1137
+ @code
1138
+ {A1 A2 A3 A4 ...} // 8-bit
1139
+ x {B1 B2 B3 B4 ...} // 8-bit
1140
+ -------------
1141
+ {A1B1+A2B2+A3B3+A4B4 ...} // 32-bit
1142
+
1143
+ @endcode
1144
+ */
1145
+ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
1146
+ v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1147
+ {
1148
+ typedef typename V_TypeTraits<_Tp>::q_type q_type;
1149
+ v_reg<q_type, n/4> s;
1150
+ for( int i = 0; i < (n/4); i++ )
1151
+ s.s[i] = (q_type)a.s[i*4 ]*b.s[i*4 ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
1152
+ (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3];
1153
+ return s;
1154
+ }
1155
+
1156
+ /** @brief Dot product of elements
1157
+
1158
+ Same as cv::v_dotprod_expand, but add a third element to the sum of adjacent pairs.
1159
+ Scheme:
1160
+ @code
1161
+ {A1 A2 A3 A4 ...} // 8-bit
1162
+ x {B1 B2 B3 B4 ...} // 8-bit
1163
+ -------------
1164
+ {A1B1+A2B2+A3B3+A4B4+C1 ...} // 32-bit
1165
+ @endcode
1166
+ */
1167
+ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
1168
+ v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
1169
+ const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
1170
+ {
1171
+ typedef typename V_TypeTraits<_Tp>::q_type q_type;
1172
+ v_reg<q_type, n/4> s;
1173
+ for( int i = 0; i < (n/4); i++ )
1174
+ s.s[i] = (q_type)a.s[i*4 ]*b.s[i*4 ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
1175
+ (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3] + c.s[i];
1176
+ return s;
1177
+ }
1178
+
1179
+ /** @brief Fast Dot product of elements and expand
1180
+
1181
+ Multiply values in two registers and expand the sum of adjacent result pairs.
1182
+
1183
+ Same as cv::v_dotprod_expand, but it may perform unorder sum between result pairs in some platforms,
1184
+ this intrinsic can be used if the sum among all lanes is only matters
1185
+ and also it should be yielding better performance on the affected platforms.
1186
+
1187
+ */
1188
+ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
1189
+ v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1190
+ { return v_dotprod_expand(a, b); }
1191
+
1192
+ /** @brief Fast Dot product of elements
1193
+
1194
+ Same as cv::v_dotprod_expand_fast, but add a third element to the sum of adjacent pairs.
1195
+ */
1196
+ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
1197
+ v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
1198
+ const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
1199
+ { return v_dotprod_expand(a, b, c); }
1200
+
1201
+ /** @brief Multiply and expand
1202
+
1203
+ Multiply values two registers and store results in two registers with wider pack type.
1204
+ Scheme:
1205
+ @code
1206
+ {A B C D} // 32-bit
1207
+ x {E F G H} // 32-bit
1208
+ ---------------
1209
+ {AE BF} // 64-bit
1210
+ {CG DH} // 64-bit
1211
+ @endcode
1212
+ Example:
1213
+ @code{.cpp}
1214
+ v_uint32x4 a, b; // {1,2,3,4} and {2,2,2,2}
1215
+ v_uint64x2 c, d; // results
1216
+ v_mul_expand(a, b, c, d); // c, d = {2,4}, {6, 8}
1217
+ @endcode
1218
+ Implemented only for 16- and unsigned 32-bit source types (v_int16x8, v_uint16x8, v_uint32x4).
1219
+ */
1220
+ template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
1221
+ v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
1222
+ v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
1223
+ {
1224
+ typedef typename V_TypeTraits<_Tp>::w_type w_type;
1225
+ for( int i = 0; i < (n/2); i++ )
1226
+ {
1227
+ c.s[i] = (w_type)a.s[i]*b.s[i];
1228
+ d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)];
1229
+ }
1230
+ }
1231
+
1232
+ /** @brief Multiply and extract high part
1233
+
1234
+ Multiply values two registers and store high part of the results.
1235
+ Implemented only for 16-bit source types (v_int16x8, v_uint16x8). Returns \f$ a*b >> 16 \f$
1236
+ */
1237
+ template<typename _Tp, int n> inline v_reg<_Tp, n> v_mul_hi(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1238
+ {
1239
+ typedef typename V_TypeTraits<_Tp>::w_type w_type;
1240
+ v_reg<_Tp, n> c;
1241
+ for (int i = 0; i < n; i++)
1242
+ c.s[i] = (_Tp)(((w_type)a.s[i] * b.s[i]) >> sizeof(_Tp)*8);
1243
+ return c;
1244
+ }
1245
+
1246
+ //! @cond IGNORED
1247
+ template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
1248
+ v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
1249
+ {
1250
+ typedef typename V_TypeTraits<_Tp>::w_type w_type;
1251
+ for( int i = 0; i < (n/2); i++ )
1252
+ {
1253
+ c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1];
1254
+ }
1255
+ }
1256
+ //! @endcond
1257
+
1258
+ //! @brief Helper macro
1259
+ //! @ingroup core_hal_intrin_impl
1260
+ #define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
1261
+ template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
1262
+ { \
1263
+ v_reg<_Tp, n> c; \
1264
+ for( int i = 0; i < n; i++ ) \
1265
+ c.s[i] = (_Tp)(a.s[i] shift_op imm); \
1266
+ return c; \
1267
+ }
1268
+
1269
+ /** @brief Bitwise shift left
1270
+
1271
+ For 16-, 32- and 64-bit integer values. */
1272
+ OPENCV_HAL_IMPL_SHIFT_OP(<< )
1273
+
1274
+ /** @brief Bitwise shift right
1275
+
1276
+ For 16-, 32- and 64-bit integer values. */
1277
+ OPENCV_HAL_IMPL_SHIFT_OP(>> )
1278
+
1279
+ //! @brief Helper macro
1280
+ //! @ingroup core_hal_intrin_impl
1281
+ #define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix,opA,opB) \
1282
+ template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a) \
1283
+ { \
1284
+ v_reg<_Tp, n> b; \
1285
+ for (int i = 0; i < n; i++) \
1286
+ { \
1287
+ int sIndex = i opA imm; \
1288
+ if (0 <= sIndex && sIndex < n) \
1289
+ { \
1290
+ b.s[i] = a.s[sIndex]; \
1291
+ } \
1292
+ else \
1293
+ { \
1294
+ b.s[i] = 0; \
1295
+ } \
1296
+ } \
1297
+ return b; \
1298
+ } \
1299
+ template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
1300
+ { \
1301
+ v_reg<_Tp, n> c; \
1302
+ for (int i = 0; i < n; i++) \
1303
+ { \
1304
+ int aIndex = i opA imm; \
1305
+ int bIndex = i opA imm opB n; \
1306
+ if (0 <= bIndex && bIndex < n) \
1307
+ { \
1308
+ c.s[i] = b.s[bIndex]; \
1309
+ } \
1310
+ else if (0 <= aIndex && aIndex < n) \
1311
+ { \
1312
+ c.s[i] = a.s[aIndex]; \
1313
+ } \
1314
+ else \
1315
+ { \
1316
+ c.s[i] = 0; \
1317
+ } \
1318
+ } \
1319
+ return c; \
1320
+ }
1321
+
1322
+ /** @brief Element shift left among vector
1323
+
1324
+ For all type */
1325
+ OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(left, -, +)
1326
+
1327
+ /** @brief Element shift right among vector
1328
+
1329
+ For all type */
1330
+ OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(right, +, -)
1331
+
1332
+ /** @brief Sum packed values
1333
+
1334
+ Scheme:
1335
+ @code
1336
+ {A1 A2 A3 ...} => sum{A1,A2,A3,...}
1337
+ @endcode
1338
+ */
1339
+ template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
1340
+ {
1341
+ typename V_TypeTraits<_Tp>::sum_type c = a.s[0];
1342
+ for( int i = 1; i < n; i++ )
1343
+ c += a.s[i];
1344
+ return c;
1345
+ }
1346
+
1347
+ /** @brief Sums all elements of each input vector, returns the vector of sums
1348
+
1349
+ Scheme:
1350
+ @code
1351
+ result[0] = a[0] + a[1] + a[2] + a[3]
1352
+ result[1] = b[0] + b[1] + b[2] + b[3]
1353
+ result[2] = c[0] + c[1] + c[2] + c[3]
1354
+ result[3] = d[0] + d[1] + d[2] + d[3]
1355
+ @endcode
1356
+ */
1357
+ template<int n> inline v_reg<float, n> v_reduce_sum4(const v_reg<float, n>& a, const v_reg<float, n>& b,
1358
+ const v_reg<float, n>& c, const v_reg<float, n>& d)
1359
+ {
1360
+ v_reg<float, n> r;
1361
+ for(int i = 0; i < (n/4); i++)
1362
+ {
1363
+ r.s[i*4 + 0] = a.s[i*4 + 0] + a.s[i*4 + 1] + a.s[i*4 + 2] + a.s[i*4 + 3];
1364
+ r.s[i*4 + 1] = b.s[i*4 + 0] + b.s[i*4 + 1] + b.s[i*4 + 2] + b.s[i*4 + 3];
1365
+ r.s[i*4 + 2] = c.s[i*4 + 0] + c.s[i*4 + 1] + c.s[i*4 + 2] + c.s[i*4 + 3];
1366
+ r.s[i*4 + 3] = d.s[i*4 + 0] + d.s[i*4 + 1] + d.s[i*4 + 2] + d.s[i*4 + 3];
1367
+ }
1368
+ return r;
1369
+ }
1370
+
1371
+ /** @brief Sum absolute differences of values
1372
+
1373
+ Scheme:
1374
+ @code
1375
+ {A1 A2 A3 ...} {B1 B2 B3 ...} => sum{ABS(A1-B1),abs(A2-B2),abs(A3-B3),...}
1376
+ @endcode
1377
+ For all types except 64-bit types.*/
1378
+ template<typename _Tp, int n> inline typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type v_reduce_sad(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1379
+ {
1380
+ typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type c = _absdiff(a.s[0], b.s[0]);
1381
+ for (int i = 1; i < n; i++)
1382
+ c += _absdiff(a.s[i], b.s[i]);
1383
+ return c;
1384
+ }
1385
+
1386
+ /** @brief Get negative values mask
1387
+ @deprecated v_signmask depends on a lane count heavily and therefore isn't universal enough
1388
+
1389
+ Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.
1390
+ Example:
1391
+ @code{.cpp}
1392
+ v_int32x4 r; // set to {-1, -1, 1, 1}
1393
+ int mask = v_signmask(r); // mask = 3 <== 00000000 00000000 00000000 00000011
1394
+ @endcode
1395
+ */
1396
+ template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
1397
+ {
1398
+ int mask = 0;
1399
+ for( int i = 0; i < n; i++ )
1400
+ mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i;
1401
+ return mask;
1402
+ }
1403
+
1404
+ /** @brief Get first negative lane index
1405
+
1406
+ Returned value is an index of first negative lane (undefined for input of all positive values)
1407
+ Example:
1408
+ @code{.cpp}
1409
+ v_int32x4 r; // set to {0, 0, -1, -1}
1410
+ int idx = v_heading_zeros(r); // idx = 2
1411
+ @endcode
1412
+ */
1413
+ template <typename _Tp, int n> inline int v_scan_forward(const v_reg<_Tp, n>& a)
1414
+ {
1415
+ for (int i = 0; i < n; i++)
1416
+ if(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0)
1417
+ return i;
1418
+ return 0;
1419
+ }
1420
+
1421
+ /** @brief Check if all packed values are less than zero
1422
+
1423
+ Unsigned values will be casted to signed: `uchar 254 => char -2`.
1424
+ */
1425
+ template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
1426
+ {
1427
+ for( int i = 0; i < n; i++ )
1428
+ if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 )
1429
+ return false;
1430
+ return true;
1431
+ }
1432
+
1433
+ /** @brief Check if any of packed values is less than zero
1434
+
1435
+ Unsigned values will be casted to signed: `uchar 254 => char -2`.
1436
+ */
1437
+ template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
1438
+ {
1439
+ for( int i = 0; i < n; i++ )
1440
+ if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 )
1441
+ return true;
1442
+ return false;
1443
+ }
1444
+
1445
+ /** @brief Per-element select (blend operation)
1446
+
1447
+ Return value will be built by combining values _a_ and _b_ using the following scheme:
1448
+ result[i] = mask[i] ? a[i] : b[i];
1449
+
1450
+ @note: _mask_ element values are restricted to these values:
1451
+ - 0: select element from _b_
1452
+ - 0xff/0xffff/etc: select element from _a_
1453
+ (fully compatible with bitwise-based operator)
1454
+ */
1455
+ template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
1456
+ const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1457
+ {
1458
+ typedef V_TypeTraits<_Tp> Traits;
1459
+ typedef typename Traits::int_type int_type;
1460
+ v_reg<_Tp, n> c;
1461
+ for( int i = 0; i < n; i++ )
1462
+ {
1463
+ int_type m = Traits::reinterpret_int(mask.s[i]);
1464
+ CV_DbgAssert(m == 0 || m == (~(int_type)0)); // restrict mask values: 0 or 0xff/0xffff/etc
1465
+ c.s[i] = m ? a.s[i] : b.s[i];
1466
+ }
1467
+ return c;
1468
+ }
1469
+
1470
+ /** @brief Expand values to the wider pack type
1471
+
1472
+ Copy contents of register to two registers with 2x wider pack type.
1473
+ Scheme:
1474
+ @code
1475
+ int32x4 int64x2 int64x2
1476
+ {A B C D} ==> {A B} , {C D}
1477
+ @endcode */
1478
+ template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
1479
+ v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
1480
+ v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
1481
+ {
1482
+ for( int i = 0; i < (n/2); i++ )
1483
+ {
1484
+ b0.s[i] = a.s[i];
1485
+ b1.s[i] = a.s[i+(n/2)];
1486
+ }
1487
+ }
1488
+
1489
+ /** @brief Expand lower values to the wider pack type
1490
+
1491
+ Same as cv::v_expand, but return lower half of the vector.
1492
+
1493
+ Scheme:
1494
+ @code
1495
+ int32x4 int64x2
1496
+ {A B C D} ==> {A B}
1497
+ @endcode */
1498
+ template<typename _Tp, int n>
1499
+ inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
1500
+ v_expand_low(const v_reg<_Tp, n>& a)
1501
+ {
1502
+ v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
1503
+ for( int i = 0; i < (n/2); i++ )
1504
+ b.s[i] = a.s[i];
1505
+ return b;
1506
+ }
1507
+
1508
+ /** @brief Expand higher values to the wider pack type
1509
+
1510
+ Same as cv::v_expand_low, but expand higher half of the vector instead.
1511
+
1512
+ Scheme:
1513
+ @code
1514
+ int32x4 int64x2
1515
+ {A B C D} ==> {C D}
1516
+ @endcode */
1517
+ template<typename _Tp, int n>
1518
+ inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
1519
+ v_expand_high(const v_reg<_Tp, n>& a)
1520
+ {
1521
+ v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
1522
+ for( int i = 0; i < (n/2); i++ )
1523
+ b.s[i] = a.s[i+(n/2)];
1524
+ return b;
1525
+ }
1526
+
1527
+ //! @cond IGNORED
1528
+ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
1529
+ v_reinterpret_as_int(const v_reg<_Tp, n>& a)
1530
+ {
1531
+ v_reg<typename V_TypeTraits<_Tp>::int_type, n> c;
1532
+ for( int i = 0; i < n; i++ )
1533
+ c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]);
1534
+ return c;
1535
+ }
1536
+
1537
+ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n>
1538
+ v_reinterpret_as_uint(const v_reg<_Tp, n>& a)
1539
+ {
1540
+ v_reg<typename V_TypeTraits<_Tp>::uint_type, n> c;
1541
+ for( int i = 0; i < n; i++ )
1542
+ c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]);
1543
+ return c;
1544
+ }
1545
+ //! @endcond
1546
+
1547
+ /** @brief Interleave two vectors
1548
+
1549
+ Scheme:
1550
+ @code
1551
+ {A1 A2 A3 A4}
1552
+ {B1 B2 B3 B4}
1553
+ ---------------
1554
+ {A1 B1 A2 B2} and {A3 B3 A4 B4}
1555
+ @endcode
1556
+ For all types except 64-bit.
1557
+ */
1558
+ template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
1559
+ v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
1560
+ {
1561
+ int i;
1562
+ for( i = 0; i < n/2; i++ )
1563
+ {
1564
+ b0.s[i*2] = a0.s[i];
1565
+ b0.s[i*2+1] = a1.s[i];
1566
+ }
1567
+ for( ; i < n; i++ )
1568
+ {
1569
+ b1.s[i*2-n] = a0.s[i];
1570
+ b1.s[i*2-n+1] = a1.s[i];
1571
+ }
1572
+ }
1573
+
1574
+ /** @brief Load register contents from memory
1575
+
1576
+ @param ptr pointer to memory block with data
1577
+ @return register object
1578
+
1579
+ @note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc.
1580
+
1581
+ @note Use vx_load version to get maximum available register length result
1582
+
1583
+ @note Alignment requirement:
1584
+ if CV_STRONG_ALIGNMENT=1 then passed pointer must be aligned (`sizeof(lane type)` should be enough).
1585
+ Do not cast pointer types without runtime check for pointer alignment (like `uchar*` => `int*`).
1586
+ */
1587
+ template<typename _Tp>
1588
+ inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load(const _Tp* ptr)
1589
+ {
1590
+ #if CV_STRONG_ALIGNMENT
1591
+ CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1592
+ #endif
1593
+ return v_reg<_Tp, simd128_width / sizeof(_Tp)>(ptr);
1594
+ }
1595
+
1596
#if CV_SIMD256
/** @brief Load 256-bit length register contents from memory

@param ptr pointer to memory block with data
@return register object

@note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x32, int ==> cv::v_int32x8, etc.

@note Check CV_SIMD256 preprocessor definition prior to use.
Use vx_load version to get maximum available register length result

@note Alignment requirement:
if CV_STRONG_ALIGNMENT=1 then passed pointer must be aligned (`sizeof(lane type)` should be enough).
Do not cast pointer types without runtime check for pointer alignment (like `uchar*` => `int*`).
*/
template<typename _Tp>
inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load(const _Tp* ptr)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    typedef v_reg<_Tp, simd256_width / sizeof(_Tp)> reg_type;
    return reg_type(ptr);
}
#endif

#if CV_SIMD512
/** @brief Load 512-bit length register contents from memory

@param ptr pointer to memory block with data
@return register object

@note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x64, int ==> cv::v_int32x16, etc.

@note Check CV_SIMD512 preprocessor definition prior to use.
Use vx_load version to get maximum available register length result

@note Alignment requirement:
if CV_STRONG_ALIGNMENT=1 then passed pointer must be aligned (`sizeof(lane type)` should be enough).
Do not cast pointer types without runtime check for pointer alignment (like `uchar*` => `int*`).
*/
template<typename _Tp>
inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load(const _Tp* ptr)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    typedef v_reg<_Tp, simd512_width / sizeof(_Tp)> reg_type;
    return reg_type(ptr);
}
#endif
1645
+
1646
+ /** @brief Load register contents from memory (aligned)
1647
+
1648
+ similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary in case of SIMD128, 32-byte - SIMD256, etc)
1649
+
1650
+ @note Use vx_load_aligned version to get maximum available register length result
1651
+ */
1652
+ template<typename _Tp>
1653
+ inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_aligned(const _Tp* ptr)
1654
+ {
1655
+ CV_Assert(isAligned<sizeof(v_reg<_Tp, simd128_width / sizeof(_Tp)>)>(ptr));
1656
+ return v_reg<_Tp, simd128_width / sizeof(_Tp)>(ptr);
1657
+ }
1658
+
1659
+ #if CV_SIMD256
1660
+ /** @brief Load register contents from memory (aligned)
1661
+
1662
+ similar to cv::v256_load, but source memory block should be aligned (to 32-byte boundary in case of SIMD256, 64-byte - SIMD512, etc)
1663
+
1664
+ @note Check CV_SIMD256 preprocessor definition prior to use.
1665
+ Use vx_load_aligned version to get maximum available register length result
1666
+ */
1667
+ template<typename _Tp>
1668
+ inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_aligned(const _Tp* ptr)
1669
+ {
1670
+ CV_Assert(isAligned<sizeof(v_reg<_Tp, simd256_width / sizeof(_Tp)>)>(ptr));
1671
+ return v_reg<_Tp, simd256_width / sizeof(_Tp)>(ptr);
1672
+ }
1673
+ #endif
1674
+
1675
+ #if CV_SIMD512
1676
+ /** @brief Load register contents from memory (aligned)
1677
+
1678
+ similar to cv::v512_load, but source memory block should be aligned (to 64-byte boundary in case of SIMD512, etc)
1679
+
1680
+ @note Check CV_SIMD512 preprocessor definition prior to use.
1681
+ Use vx_load_aligned version to get maximum available register length result
1682
+ */
1683
+ template<typename _Tp>
1684
+ inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_aligned(const _Tp* ptr)
1685
+ {
1686
+ CV_Assert(isAligned<sizeof(v_reg<_Tp, simd512_width / sizeof(_Tp)>)>(ptr));
1687
+ return v_reg<_Tp, simd512_width / sizeof(_Tp)>(ptr);
1688
+ }
1689
+ #endif
1690
+
1691
+ /** @brief Load 64-bits of data to lower part (high part is undefined).
1692
+
1693
+ @param ptr memory block containing data for first half (0..n/2)
1694
+
1695
+ @code{.cpp}
1696
+ int lo[2] = { 1, 2 };
1697
+ v_int32x4 r = v_load_low(lo);
1698
+ @endcode
1699
+
1700
+ @note Use vx_load_low version to get maximum available register length result
1701
+ */
1702
+ template<typename _Tp>
1703
+ inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_low(const _Tp* ptr)
1704
+ {
1705
+ #if CV_STRONG_ALIGNMENT
1706
+ CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1707
+ #endif
1708
+ v_reg<_Tp, simd128_width / sizeof(_Tp)> c;
1709
+ for( int i = 0; i < c.nlanes/2; i++ )
1710
+ {
1711
+ c.s[i] = ptr[i];
1712
+ }
1713
+ return c;
1714
+ }
1715
+
1716
#if CV_SIMD256
/** @brief Load 128-bits of data to lower part (high part is undefined).

@param ptr memory block containing data for first half (0..n/2)

@code{.cpp}
int lo[4] = { 1, 2, 3, 4 };
v_int32x8 r = v256_load_low(lo);
@endcode

@note Check CV_SIMD256 preprocessor definition prior to use.
Use vx_load_low version to get maximum available register length result
*/
template<typename _Tp>
inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_low(const _Tp* ptr)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    v_reg<_Tp, simd256_width / sizeof(_Tp)> res;
    const int half = res.nlanes / 2;
    // Only the lower half is initialized; upper lanes stay undefined.
    for (int lane = 0; lane < half; lane++)
        res.s[lane] = ptr[lane];
    return res;
}
#endif

#if CV_SIMD512
/** @brief Load 256-bits of data to lower part (high part is undefined).

@param ptr memory block containing data for first half (0..n/2)

@code{.cpp}
int lo[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
v_int32x16 r = v512_load_low(lo);
@endcode

@note Check CV_SIMD512 preprocessor definition prior to use.
Use vx_load_low version to get maximum available register length result
*/
template<typename _Tp>
inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_low(const _Tp* ptr)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    v_reg<_Tp, simd512_width / sizeof(_Tp)> res;
    const int half = res.nlanes / 2;
    // Only the lower half is initialized; upper lanes stay undefined.
    for (int lane = 0; lane < half; lane++)
        res.s[lane] = ptr[lane];
    return res;
}
#endif
1771
+
1772
+ /** @brief Load register contents from two memory blocks
1773
+
1774
+ @param loptr memory block containing data for first half (0..n/2)
1775
+ @param hiptr memory block containing data for second half (n/2..n)
1776
+
1777
+ @code{.cpp}
1778
+ int lo[2] = { 1, 2 }, hi[2] = { 3, 4 };
1779
+ v_int32x4 r = v_load_halves(lo, hi);
1780
+ @endcode
1781
+
1782
+ @note Use vx_load_halves version to get maximum available register length result
1783
+ */
1784
+ template<typename _Tp>
1785
+ inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
1786
+ {
1787
+ #if CV_STRONG_ALIGNMENT
1788
+ CV_Assert(isAligned<sizeof(_Tp)>(loptr));
1789
+ CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
1790
+ #endif
1791
+ v_reg<_Tp, simd128_width / sizeof(_Tp)> c;
1792
+ for( int i = 0; i < c.nlanes/2; i++ )
1793
+ {
1794
+ c.s[i] = loptr[i];
1795
+ c.s[i+c.nlanes/2] = hiptr[i];
1796
+ }
1797
+ return c;
1798
+ }
1799
+
1800
#if CV_SIMD256
/** @brief Load register contents from two memory blocks

@param loptr memory block containing data for first half (0..n/2)
@param hiptr memory block containing data for second half (n/2..n)

@code{.cpp}
int lo[4] = { 1, 2, 3, 4 }, hi[4] = { 5, 6, 7, 8 };
v_int32x8 r = v256_load_halves(lo, hi);
@endcode

@note Check CV_SIMD256 preprocessor definition prior to use.
Use vx_load_halves version to get maximum available register length result
*/
template<typename _Tp>
inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_halves(const _Tp* loptr, const _Tp* hiptr)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(loptr));
    CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
#endif
    v_reg<_Tp, simd256_width / sizeof(_Tp)> c;
    // Lower half comes from loptr, upper half from hiptr.
    for (int i = 0; i < c.nlanes / 2; i++)
    {
        c.s[i] = loptr[i];
        c.s[i + c.nlanes / 2] = hiptr[i];
    }
    return c;
}
#endif

#if CV_SIMD512
/** @brief Load register contents from two memory blocks

@param loptr memory block containing data for first half (0..n/2)
@param hiptr memory block containing data for second half (n/2..n)

@code{.cpp}
int lo[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }, hi[8] = { 9, 10, 11, 12, 13, 14, 15, 16 };
v_int32x16 r = v512_load_halves(lo, hi);
@endcode

@note Check CV_SIMD512 preprocessor definition prior to use.
Use vx_load_halves version to get maximum available register length result
*/
template<typename _Tp>
inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_halves(const _Tp* loptr, const _Tp* hiptr)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(loptr));
    CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
#endif
    v_reg<_Tp, simd512_width / sizeof(_Tp)> c;
    // Lower half comes from loptr, upper half from hiptr.
    for (int i = 0; i < c.nlanes / 2; i++)
    {
        c.s[i] = loptr[i];
        c.s[i + c.nlanes / 2] = hiptr[i];
    }
    return c;
}
#endif
1861
+
1862
+ /** @brief Load register contents from memory with double expand
1863
+
1864
+ Same as cv::v_load, but result pack type will be 2x wider than memory type.
1865
+
1866
+ @code{.cpp}
1867
+ short buf[4] = {1, 2, 3, 4}; // type is int16
1868
+ v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32
1869
+ @endcode
1870
+ For 8-, 16-, 32-bit integer source types.
1871
+
1872
+ @note Use vx_load_expand version to get maximum available register length result
1873
+ */
1874
+ template<typename _Tp>
1875
+ inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd128_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
1876
+ v_load_expand(const _Tp* ptr)
1877
+ {
1878
+ #if CV_STRONG_ALIGNMENT
1879
+ CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1880
+ #endif
1881
+ typedef typename V_TypeTraits<_Tp>::w_type w_type;
1882
+ v_reg<w_type, simd128_width / sizeof(w_type)> c;
1883
+ for( int i = 0; i < c.nlanes; i++ )
1884
+ {
1885
+ c.s[i] = ptr[i];
1886
+ }
1887
+ return c;
1888
+ }
1889
+
1890
#if CV_SIMD256
/** @brief Load register contents from memory with double expand

Same as cv::v256_load, but result pack type will be 2x wider than memory type.

@code{.cpp}
short buf[8] = {1, 2, 3, 4, 5, 6, 7, 8}; // type is int16
v_int32x8 r = v256_load_expand(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8} - type is int32
@endcode
For 8-, 16-, 32-bit integer source types.

@note Check CV_SIMD256 preprocessor definition prior to use.
Use vx_load_expand version to get maximum available register length result
*/
template<typename _Tp>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd256_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
v256_load_expand(const _Tp* ptr)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    typedef typename V_TypeTraits<_Tp>::w_type w_type;
    v_reg<w_type, simd256_width / sizeof(w_type)> c;
    // Each narrow source element widens into one w_type lane.
    for (int i = 0; i < c.nlanes; i++)
    {
        c.s[i] = ptr[i];
    }
    return c;
}
#endif

#if CV_SIMD512
/** @brief Load register contents from memory with double expand

Same as cv::v512_load, but result pack type will be 2x wider than memory type.

@code{.cpp}
short buf[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; // type is int16
v_int32x16 r = v512_load_expand(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} - type is int32
@endcode
For 8-, 16-, 32-bit integer source types.

@note Check CV_SIMD512 preprocessor definition prior to use.
Use vx_load_expand version to get maximum available register length result
*/
template<typename _Tp>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd512_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
v512_load_expand(const _Tp* ptr)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    typedef typename V_TypeTraits<_Tp>::w_type w_type;
    v_reg<w_type, simd512_width / sizeof(w_type)> c;
    // Each narrow source element widens into one w_type lane.
    for (int i = 0; i < c.nlanes; i++)
    {
        c.s[i] = ptr[i];
    }
    return c;
}
#endif
1951
+
1952
+ /** @brief Load register contents from memory with quad expand
1953
+
1954
+ Same as cv::v_load_expand, but result type is 4 times wider than source.
1955
+ @code{.cpp}
1956
+ char buf[4] = {1, 2, 3, 4}; // type is int8
1957
+ v_int32x4 r = v_load_expand_q(buf); // r = {1, 2, 3, 4} - type is int32
1958
+ @endcode
1959
+ For 8-bit integer source types.
1960
+
1961
+ @note Use vx_load_expand_q version to get maximum available register length result
1962
+ */
1963
+ template<typename _Tp>
1964
+ inline v_reg<typename V_TypeTraits<_Tp>::q_type, simd128_width / sizeof(typename V_TypeTraits<_Tp>::q_type)>
1965
+ v_load_expand_q(const _Tp* ptr)
1966
+ {
1967
+ #if CV_STRONG_ALIGNMENT
1968
+ CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1969
+ #endif
1970
+ typedef typename V_TypeTraits<_Tp>::q_type q_type;
1971
+ v_reg<q_type, simd128_width / sizeof(q_type)> c;
1972
+ for( int i = 0; i < c.nlanes; i++ )
1973
+ {
1974
+ c.s[i] = ptr[i];
1975
+ }
1976
+ return c;
1977
+ }
1978
+
1979
#if CV_SIMD256
/** @brief Load register contents from memory with quad expand

Same as cv::v256_load_expand, but result type is 4 times wider than source.
@code{.cpp}
char buf[8] = {1, 2, 3, 4, 5, 6, 7, 8}; // type is int8
v_int32x8 r = v256_load_expand_q(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8} - type is int32
@endcode
For 8-bit integer source types.

@note Check CV_SIMD256 preprocessor definition prior to use.
Use vx_load_expand_q version to get maximum available register length result
*/
template<typename _Tp>
inline v_reg<typename V_TypeTraits<_Tp>::q_type, simd256_width / sizeof(typename V_TypeTraits<_Tp>::q_type)>
v256_load_expand_q(const _Tp* ptr)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    typedef typename V_TypeTraits<_Tp>::q_type q_type;
    v_reg<q_type, simd256_width / sizeof(q_type)> res;
    for (int lane = 0; lane < res.nlanes; lane++)
        res.s[lane] = ptr[lane];
    return res;
}
#endif

#if CV_SIMD512
/** @brief Load register contents from memory with quad expand

Same as cv::v512_load_expand, but result type is 4 times wider than source.
@code{.cpp}
char buf[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; // type is int8
v_int32x16 r = v512_load_expand_q(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} - type is int32
@endcode
For 8-bit integer source types.

@note Check CV_SIMD512 preprocessor definition prior to use.
Use vx_load_expand_q version to get maximum available register length result
*/
template<typename _Tp>
inline v_reg<typename V_TypeTraits<_Tp>::q_type, simd512_width / sizeof(typename V_TypeTraits<_Tp>::q_type)>
v512_load_expand_q(const _Tp* ptr)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    typedef typename V_TypeTraits<_Tp>::q_type q_type;
    v_reg<q_type, simd512_width / sizeof(q_type)> res;
    for (int lane = 0; lane < res.nlanes; lane++)
        res.s[lane] = ptr[lane];
    return res;
}
#endif
2038
+
2039
+ /** @brief Load and deinterleave (2 channels)
2040
+
2041
+ Load data from memory deinterleave and store to 2 registers.
2042
+ Scheme:
2043
+ @code
2044
+ {A1 B1 A2 B2 ...} ==> {A1 A2 ...}, {B1 B2 ...}
2045
+ @endcode
2046
+ For all types except 64-bit. */
2047
+ template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
2048
+ v_reg<_Tp, n>& b)
2049
+ {
2050
+ #if CV_STRONG_ALIGNMENT
2051
+ CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2052
+ #endif
2053
+ int i, i2;
2054
+ for( i = i2 = 0; i < n; i++, i2 += 2 )
2055
+ {
2056
+ a.s[i] = ptr[i2];
2057
+ b.s[i] = ptr[i2+1];
2058
+ }
2059
+ }
2060
+
2061
+ /** @brief Load and deinterleave (3 channels)
2062
+
2063
+ Load data from memory deinterleave and store to 3 registers.
2064
+ Scheme:
2065
+ @code
2066
+ {A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
2067
+ @endcode
2068
+ For all types except 64-bit. */
2069
+ template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
2070
+ v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
2071
+ {
2072
+ #if CV_STRONG_ALIGNMENT
2073
+ CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2074
+ #endif
2075
+ int i, i3;
2076
+ for( i = i3 = 0; i < n; i++, i3 += 3 )
2077
+ {
2078
+ a.s[i] = ptr[i3];
2079
+ b.s[i] = ptr[i3+1];
2080
+ c.s[i] = ptr[i3+2];
2081
+ }
2082
+ }
2083
+
2084
+ /** @brief Load and deinterleave (4 channels)
2085
+
2086
+ Load data from memory deinterleave and store to 4 registers.
2087
+ Scheme:
2088
+ @code
2089
+ {A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
2090
+ @endcode
2091
+ For all types except 64-bit. */
2092
+ template<typename _Tp, int n>
2093
+ inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
2094
+ v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
2095
+ v_reg<_Tp, n>& d)
2096
+ {
2097
+ #if CV_STRONG_ALIGNMENT
2098
+ CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2099
+ #endif
2100
+ int i, i4;
2101
+ for( i = i4 = 0; i < n; i++, i4 += 4 )
2102
+ {
2103
+ a.s[i] = ptr[i4];
2104
+ b.s[i] = ptr[i4+1];
2105
+ c.s[i] = ptr[i4+2];
2106
+ d.s[i] = ptr[i4+3];
2107
+ }
2108
+ }
2109
+
2110
+ /** @brief Interleave and store (2 channels)
2111
+
2112
+ Interleave and store data from 2 registers to memory.
2113
+ Scheme:
2114
+ @code
2115
+ {A1 A2 ...}, {B1 B2 ...} ==> {A1 B1 A2 B2 ...}
2116
+ @endcode
2117
+ For all types except 64-bit. */
2118
+ template<typename _Tp, int n>
2119
+ inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
2120
+ const v_reg<_Tp, n>& b,
2121
+ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2122
+ {
2123
+ #if CV_STRONG_ALIGNMENT
2124
+ CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2125
+ #endif
2126
+ int i, i2;
2127
+ for( i = i2 = 0; i < n; i++, i2 += 2 )
2128
+ {
2129
+ ptr[i2] = a.s[i];
2130
+ ptr[i2+1] = b.s[i];
2131
+ }
2132
+ }
2133
+
2134
+ /** @brief Interleave and store (3 channels)
2135
+
2136
+ Interleave and store data from 3 registers to memory.
2137
+ Scheme:
2138
+ @code
2139
+ {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...}
2140
+ @endcode
2141
+ For all types except 64-bit. */
2142
+ template<typename _Tp, int n>
2143
+ inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
2144
+ const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
2145
+ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2146
+ {
2147
+ #if CV_STRONG_ALIGNMENT
2148
+ CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2149
+ #endif
2150
+ int i, i3;
2151
+ for( i = i3 = 0; i < n; i++, i3 += 3 )
2152
+ {
2153
+ ptr[i3] = a.s[i];
2154
+ ptr[i3+1] = b.s[i];
2155
+ ptr[i3+2] = c.s[i];
2156
+ }
2157
+ }
2158
+
2159
+ /** @brief Interleave and store (4 channels)
2160
+
2161
+ Interleave and store data from 4 registers to memory.
2162
+ Scheme:
2163
+ @code
2164
+ {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
2165
+ @endcode
2166
+ For all types except 64-bit. */
2167
+ template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
2168
+ const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
2169
+ const v_reg<_Tp, n>& d,
2170
+ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2171
+ {
2172
+ #if CV_STRONG_ALIGNMENT
2173
+ CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2174
+ #endif
2175
+ int i, i4;
2176
+ for( i = i4 = 0; i < n; i++, i4 += 4 )
2177
+ {
2178
+ ptr[i4] = a.s[i];
2179
+ ptr[i4+1] = b.s[i];
2180
+ ptr[i4+2] = c.s[i];
2181
+ ptr[i4+3] = d.s[i];
2182
+ }
2183
+ }
2184
+
2185
+ /** @brief Store data to memory
2186
+
2187
+ Store register contents to memory.
2188
+ Scheme:
2189
+ @code
2190
+ REG {A B C D} ==> MEM {A B C D}
2191
+ @endcode
2192
+ Pointer can be unaligned. */
2193
+ template<typename _Tp, int n>
2194
+ inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
2195
+ {
2196
+ #if CV_STRONG_ALIGNMENT
2197
+ CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2198
+ #endif
2199
+ for( int i = 0; i < n; i++ )
2200
+ ptr[i] = a.s[i];
2201
+ }
2202
+
2203
+ template<typename _Tp, int n>
2204
+ inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
2205
+ {
2206
+ #if CV_STRONG_ALIGNMENT
2207
+ CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2208
+ #endif
2209
+ v_store(ptr, a);
2210
+ }
2211
+
2212
+ /** @brief Store data to memory (lower half)
2213
+
2214
+ Store lower half of register contents to memory.
2215
+ Scheme:
2216
+ @code
2217
+ REG {A B C D} ==> MEM {A B}
2218
+ @endcode */
2219
+ template<typename _Tp, int n>
2220
+ inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
2221
+ {
2222
+ #if CV_STRONG_ALIGNMENT
2223
+ CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2224
+ #endif
2225
+ for( int i = 0; i < (n/2); i++ )
2226
+ ptr[i] = a.s[i];
2227
+ }
2228
+
2229
+ /** @brief Store data to memory (higher half)
2230
+
2231
+ Store higher half of register contents to memory.
2232
+ Scheme:
2233
+ @code
2234
+ REG {A B C D} ==> MEM {C D}
2235
+ @endcode */
2236
+ template<typename _Tp, int n>
2237
+ inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
2238
+ {
2239
+ #if CV_STRONG_ALIGNMENT
2240
+ CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2241
+ #endif
2242
+ for( int i = 0; i < (n/2); i++ )
2243
+ ptr[i] = a.s[i+(n/2)];
2244
+ }
2245
+
2246
+ /** @brief Store data to memory (aligned)
2247
+
2248
+ Store register contents to memory.
2249
+ Scheme:
2250
+ @code
2251
+ REG {A B C D} ==> MEM {A B C D}
2252
+ @endcode
2253
+ Pointer __should__ be aligned by 16-byte boundary. */
2254
+ template<typename _Tp, int n>
2255
+ inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
2256
+ {
2257
+ CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
2258
+ v_store(ptr, a);
2259
+ }
2260
+
2261
+ template<typename _Tp, int n>
2262
+ inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a)
2263
+ {
2264
+ CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
2265
+ v_store(ptr, a);
2266
+ }
2267
+
2268
+ template<typename _Tp, int n>
2269
+ inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
2270
+ {
2271
+ CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
2272
+ v_store(ptr, a);
2273
+ }
2274
+
2275
+ /** @brief Combine vector from first elements of two vectors
2276
+
2277
+ Scheme:
2278
+ @code
2279
+ {A1 A2 A3 A4}
2280
+ {B1 B2 B3 B4}
2281
+ ---------------
2282
+ {A1 A2 B1 B2}
2283
+ @endcode
2284
+ For all types except 64-bit. */
2285
+ template<typename _Tp, int n>
2286
+ inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
2287
+ {
2288
+ v_reg<_Tp, n> c;
2289
+ for( int i = 0; i < (n/2); i++ )
2290
+ {
2291
+ c.s[i] = a.s[i];
2292
+ c.s[i+(n/2)] = b.s[i];
2293
+ }
2294
+ return c;
2295
+ }
2296
+
2297
+ /** @brief Combine vector from last elements of two vectors
2298
+
2299
+ Scheme:
2300
+ @code
2301
+ {A1 A2 A3 A4}
2302
+ {B1 B2 B3 B4}
2303
+ ---------------
2304
+ {A3 A4 B3 B4}
2305
+ @endcode
2306
+ For all types except 64-bit. */
2307
+ template<typename _Tp, int n>
2308
+ inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
2309
+ {
2310
+ v_reg<_Tp, n> c;
2311
+ for( int i = 0; i < (n/2); i++ )
2312
+ {
2313
+ c.s[i] = a.s[i+(n/2)];
2314
+ c.s[i+(n/2)] = b.s[i+(n/2)];
2315
+ }
2316
+ return c;
2317
+ }
2318
+
2319
+ /** @brief Combine two vectors from lower and higher parts of two other vectors
2320
+
2321
+ @code{.cpp}
2322
+ low = cv::v_combine_low(a, b);
2323
+ high = cv::v_combine_high(a, b);
2324
+ @endcode */
2325
+ template<typename _Tp, int n>
2326
+ inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
2327
+ v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
2328
+ {
2329
+ for( int i = 0; i < (n/2); i++ )
2330
+ {
2331
+ low.s[i] = a.s[i];
2332
+ low.s[i+(n/2)] = b.s[i];
2333
+ high.s[i] = a.s[i+(n/2)];
2334
+ high.s[i+(n/2)] = b.s[i+(n/2)];
2335
+ }
2336
+ }
2337
+
2338
+ /** @brief Vector reverse order
2339
+
2340
+ Reverse the order of the vector
2341
+ Scheme:
2342
+ @code
2343
+ REG {A1 ... An} ==> REG {An ... A1}
2344
+ @endcode
2345
+ For all types. */
2346
+ template<typename _Tp, int n>
2347
+ inline v_reg<_Tp, n> v_reverse(const v_reg<_Tp, n>& a)
2348
+ {
2349
+ v_reg<_Tp, n> c;
2350
+ for( int i = 0; i < n; i++ )
2351
+ c.s[i] = a.s[n-i-1];
2352
+ return c;
2353
+ }
2354
+
2355
+ /** @brief Vector extract
2356
+
2357
+ Scheme:
2358
+ @code
2359
+ {A1 A2 A3 A4}
2360
+ {B1 B2 B3 B4}
2361
+ ========================
2362
+ shift = 1 {A2 A3 A4 B1}
2363
+ shift = 2 {A3 A4 B1 B2}
2364
+ shift = 3 {A4 B1 B2 B3}
2365
+ @endcode
2366
+ Restriction: 0 <= shift < nlanes
2367
+
2368
+ Usage:
2369
+ @code
2370
+ v_int32x4 a, b, c;
2371
+ c = v_extract<2>(a, b);
2372
+ @endcode
2373
+ For all types. */
2374
+ template<int s, typename _Tp, int n>
2375
+ inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
2376
+ {
2377
+ v_reg<_Tp, n> r;
2378
+ const int shift = n - s;
2379
+ int i = 0;
2380
+ for (; i < shift; ++i)
2381
+ r.s[i] = a.s[i+s];
2382
+ for (; i < n; ++i)
2383
+ r.s[i] = b.s[i-shift];
2384
+ return r;
2385
+ }
2386
+
2387
+ /** @brief Vector extract
2388
+
2389
+ Scheme:
2390
+ Return the s-th element of v.
2391
+ Restriction: 0 <= s < nlanes
2392
+
2393
+ Usage:
2394
+ @code
2395
+ v_int32x4 a;
2396
+ int r;
2397
+ r = v_extract_n<2>(a);
2398
+ @endcode
2399
+ For all types. */
2400
+ template<int s, typename _Tp, int n>
2401
+ inline _Tp v_extract_n(const v_reg<_Tp, n>& v)
2402
+ {
2403
+ CV_DbgAssert(s >= 0 && s < n);
2404
+ return v.s[s];
2405
+ }
2406
+
2407
+ /** @brief Broadcast i-th element of vector
2408
+
2409
+ Scheme:
2410
+ @code
2411
+ { v[0] v[1] v[2] ... v[SZ] } => { v[i], v[i], v[i] ... v[i] }
2412
+ @endcode
2413
+ Restriction: 0 <= i < nlanes
2414
+ Supported types: 32-bit integers and floats (s32/u32/f32)
2415
+ */
2416
+ template<int i, typename _Tp, int n>
2417
+ inline v_reg<_Tp, n> v_broadcast_element(const v_reg<_Tp, n>& a)
2418
+ {
2419
+ CV_DbgAssert(i >= 0 && i < n);
2420
+ return v_reg<_Tp, n>::all(a.s[i]);
2421
+ }
2422
+
2423
+ /** @brief Round elements
2424
+
2425
+ Rounds each value. Input type is float vector ==> output type is int vector.
2426
+ @note Only for floating point types.
2427
+ */
2428
+ template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
2429
+ {
2430
+ v_reg<int, n> c;
2431
+ for( int i = 0; i < n; i++ )
2432
+ c.s[i] = cvRound(a.s[i]);
2433
+ return c;
2434
+ }
2435
+
2436
+ /** @overload */
2437
+ template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a, const v_reg<double, n>& b)
2438
+ {
2439
+ v_reg<int, n*2> c;
2440
+ for( int i = 0; i < n; i++ )
2441
+ {
2442
+ c.s[i] = cvRound(a.s[i]);
2443
+ c.s[i+n] = cvRound(b.s[i]);
2444
+ }
2445
+ return c;
2446
+ }
2447
+
2448
+ /** @brief Floor elements
2449
+
2450
+ Floor each value. Input type is float vector ==> output type is int vector.
2451
+ @note Only for floating point types.
2452
+ */
2453
+ template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
2454
+ {
2455
+ v_reg<int, n> c;
2456
+ for( int i = 0; i < n; i++ )
2457
+ c.s[i] = cvFloor(a.s[i]);
2458
+ return c;
2459
+ }
2460
+
2461
+ /** @brief Ceil elements
2462
+
2463
+ Ceil each value. Input type is float vector ==> output type is int vector.
2464
+ @note Only for floating point types.
2465
+ */
2466
+ template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
2467
+ {
2468
+ v_reg<int, n> c;
2469
+ for( int i = 0; i < n; i++ )
2470
+ c.s[i] = cvCeil(a.s[i]);
2471
+ return c;
2472
+ }
2473
+
2474
+ /** @brief Truncate elements
2475
+
2476
+ Truncate each value. Input type is float vector ==> output type is int vector.
2477
+ @note Only for floating point types.
2478
+ */
2479
+ template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
2480
+ {
2481
+ v_reg<int, n> c;
2482
+ for( int i = 0; i < n; i++ )
2483
+ c.s[i] = (int)(a.s[i]);
2484
+ return c;
2485
+ }
2486
+
2487
+ /** @overload */
2488
+ template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
2489
+ {
2490
+ v_reg<int, n*2> c;
2491
+ for( int i = 0; i < n; i++ )
2492
+ {
2493
+ c.s[i] = cvRound(a.s[i]);
2494
+ c.s[i+n] = 0;
2495
+ }
2496
+ return c;
2497
+ }
2498
+
2499
+ /** @overload */
2500
+ template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
2501
+ {
2502
+ v_reg<int, n*2> c;
2503
+ for( int i = 0; i < n; i++ )
2504
+ {
2505
+ c.s[i] = cvFloor(a.s[i]);
2506
+ c.s[i+n] = 0;
2507
+ }
2508
+ return c;
2509
+ }
2510
+
2511
+ /** @overload */
2512
+ template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
2513
+ {
2514
+ v_reg<int, n*2> c;
2515
+ for( int i = 0; i < n; i++ )
2516
+ {
2517
+ c.s[i] = cvCeil(a.s[i]);
2518
+ c.s[i+n] = 0;
2519
+ }
2520
+ return c;
2521
+ }
2522
+
2523
+ /** @overload */
2524
+ template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
2525
+ {
2526
+ v_reg<int, n*2> c;
2527
+ for( int i = 0; i < n; i++ )
2528
+ {
2529
+ c.s[i] = (int)(a.s[i]);
2530
+ c.s[i+n] = 0;
2531
+ }
2532
+ return c;
2533
+ }
2534
+
2535
+ /** @brief Convert to float
2536
+
2537
+ Supported input type is cv::v_int32. */
2538
+ template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
2539
+ {
2540
+ v_reg<float, n> c;
2541
+ for( int i = 0; i < n; i++ )
2542
+ c.s[i] = (float)a.s[i];
2543
+ return c;
2544
+ }
2545
+
2546
+ /** @brief Convert lower half to float
2547
+
2548
+ Supported input type is cv::v_float64. */
2549
+ template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a)
2550
+ {
2551
+ v_reg<float, n*2> c;
2552
+ for( int i = 0; i < n; i++ )
2553
+ {
2554
+ c.s[i] = (float)a.s[i];
2555
+ c.s[i+n] = 0;
2556
+ }
2557
+ return c;
2558
+ }
2559
+
2560
+ /** @brief Convert to float
2561
+
2562
+ Supported input type is cv::v_float64. */
2563
+ template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a, const v_reg<double, n>& b)
2564
+ {
2565
+ v_reg<float, n*2> c;
2566
+ for( int i = 0; i < n; i++ )
2567
+ {
2568
+ c.s[i] = (float)a.s[i];
2569
+ c.s[i+n] = (float)b.s[i];
2570
+ }
2571
+ return c;
2572
+ }
2573
+
2574
+ /** @brief Convert lower half to double
2575
+
2576
+ Supported input type is cv::v_int32. */
2577
+ template<int n> CV_INLINE v_reg<double, n/2> v_cvt_f64(const v_reg<int, n>& a)
2578
+ {
2579
+ v_reg<double, (n/2)> c;
2580
+ for( int i = 0; i < (n/2); i++ )
2581
+ c.s[i] = (double)a.s[i];
2582
+ return c;
2583
+ }
2584
+
2585
+ /** @brief Convert to double high part of vector
2586
+
2587
+ Supported input type is cv::v_int32. */
2588
+ template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64_high(const v_reg<int, n>& a)
2589
+ {
2590
+ v_reg<double, (n/2)> c;
2591
+ for( int i = 0; i < (n/2); i++ )
2592
+ c.s[i] = (double)a.s[i + (n/2)];
2593
+ return c;
2594
+ }
2595
+
2596
+ /** @brief Convert lower half to double
2597
+
2598
+ Supported input type is cv::v_float32. */
2599
+ template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64(const v_reg<float, n>& a)
2600
+ {
2601
+ v_reg<double, (n/2)> c;
2602
+ for( int i = 0; i < (n/2); i++ )
2603
+ c.s[i] = (double)a.s[i];
2604
+ return c;
2605
+ }
2606
+
2607
+ /** @brief Convert to double high part of vector
2608
+
2609
+ Supported input type is cv::v_float32. */
2610
+ template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64_high(const v_reg<float, n>& a)
2611
+ {
2612
+ v_reg<double, (n/2)> c;
2613
+ for( int i = 0; i < (n/2); i++ )
2614
+ c.s[i] = (double)a.s[i + (n/2)];
2615
+ return c;
2616
+ }
2617
+
2618
+ /** @brief Convert to double
2619
+
2620
+ Supported input type is cv::v_int64. */
2621
+ template<int n> CV_INLINE v_reg<double, n> v_cvt_f64(const v_reg<int64, n>& a)
2622
+ {
2623
+ v_reg<double, n> c;
2624
+ for( int i = 0; i < n; i++ )
2625
+ c.s[i] = (double)a.s[i];
2626
+ return c;
2627
+ }
2628
+
2629
+
2630
+ template<typename _Tp> inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut(const _Tp* tab, const int* idx)
2631
+ {
2632
+ v_reg<_Tp, simd128_width / sizeof(_Tp)> c;
2633
+ for (int i = 0; i < c.nlanes; i++)
2634
+ c.s[i] = tab[idx[i]];
2635
+ return c;
2636
+ }
2637
+ template<typename _Tp> inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut_pairs(const _Tp* tab, const int* idx)
2638
+ {
2639
+ v_reg<_Tp, simd128_width / sizeof(_Tp)> c;
2640
+ for (int i = 0; i < c.nlanes; i++)
2641
+ c.s[i] = tab[idx[i / 2] + i % 2];
2642
+ return c;
2643
+ }
2644
+ template<typename _Tp> inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut_quads(const _Tp* tab, const int* idx)
2645
+ {
2646
+ v_reg<_Tp, simd128_width / sizeof(_Tp)> c;
2647
+ for (int i = 0; i < c.nlanes; i++)
2648
+ c.s[i] = tab[idx[i / 4] + i % 4];
2649
+ return c;
2650
+ }
2651
+
2652
+ template<int n> inline v_reg<int, n> v_lut(const int* tab, const v_reg<int, n>& idx)
2653
+ {
2654
+ v_reg<int, n> c;
2655
+ for( int i = 0; i < n; i++ )
2656
+ c.s[i] = tab[idx.s[i]];
2657
+ return c;
2658
+ }
2659
+
2660
+ template<int n> inline v_reg<unsigned, n> v_lut(const unsigned* tab, const v_reg<int, n>& idx)
2661
+ {
2662
+ v_reg<int, n> c;
2663
+ for (int i = 0; i < n; i++)
2664
+ c.s[i] = tab[idx.s[i]];
2665
+ return c;
2666
+ }
2667
+
2668
+ template<int n> inline v_reg<float, n> v_lut(const float* tab, const v_reg<int, n>& idx)
2669
+ {
2670
+ v_reg<float, n> c;
2671
+ for( int i = 0; i < n; i++ )
2672
+ c.s[i] = tab[idx.s[i]];
2673
+ return c;
2674
+ }
2675
+
2676
+ template<int n> inline v_reg<double, n/2> v_lut(const double* tab, const v_reg<int, n>& idx)
2677
+ {
2678
+ v_reg<double, n/2> c;
2679
+ for( int i = 0; i < n/2; i++ )
2680
+ c.s[i] = tab[idx.s[i]];
2681
+ return c;
2682
+ }
2683
+
2684
+
2685
+ template<int n> inline void v_lut_deinterleave(const float* tab, const v_reg<int, n>& idx,
2686
+ v_reg<float, n>& x, v_reg<float, n>& y)
2687
+ {
2688
+ for( int i = 0; i < n; i++ )
2689
+ {
2690
+ int j = idx.s[i];
2691
+ x.s[i] = tab[j];
2692
+ y.s[i] = tab[j+1];
2693
+ }
2694
+ }
2695
+
2696
+ template<int n> inline void v_lut_deinterleave(const double* tab, const v_reg<int, n*2>& idx,
2697
+ v_reg<double, n>& x, v_reg<double, n>& y)
2698
+ {
2699
+ for( int i = 0; i < n; i++ )
2700
+ {
2701
+ int j = idx.s[i];
2702
+ x.s[i] = tab[j];
2703
+ y.s[i] = tab[j+1];
2704
+ }
2705
+ }
2706
+
2707
+ template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_pairs(const v_reg<_Tp, n>& vec)
2708
+ {
2709
+ v_reg<_Tp, n> c;
2710
+ for (int i = 0; i < n/4; i++)
2711
+ {
2712
+ c.s[4*i ] = vec.s[4*i ];
2713
+ c.s[4*i+1] = vec.s[4*i+2];
2714
+ c.s[4*i+2] = vec.s[4*i+1];
2715
+ c.s[4*i+3] = vec.s[4*i+3];
2716
+ }
2717
+ return c;
2718
+ }
2719
+
2720
+ template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_quads(const v_reg<_Tp, n>& vec)
2721
+ {
2722
+ v_reg<_Tp, n> c;
2723
+ for (int i = 0; i < n/8; i++)
2724
+ {
2725
+ c.s[8*i ] = vec.s[8*i ];
2726
+ c.s[8*i+1] = vec.s[8*i+4];
2727
+ c.s[8*i+2] = vec.s[8*i+1];
2728
+ c.s[8*i+3] = vec.s[8*i+5];
2729
+ c.s[8*i+4] = vec.s[8*i+2];
2730
+ c.s[8*i+5] = vec.s[8*i+6];
2731
+ c.s[8*i+6] = vec.s[8*i+3];
2732
+ c.s[8*i+7] = vec.s[8*i+7];
2733
+ }
2734
+ return c;
2735
+ }
2736
+
2737
+ template<typename _Tp, int n> inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec)
2738
+ {
2739
+ v_reg<_Tp, n> c;
2740
+ for (int i = 0; i < n/4; i++)
2741
+ {
2742
+ c.s[3*i ] = vec.s[4*i ];
2743
+ c.s[3*i+1] = vec.s[4*i+1];
2744
+ c.s[3*i+2] = vec.s[4*i+2];
2745
+ }
2746
+ return c;
2747
+ }
2748
+
2749
+ /** @brief Transpose 4x4 matrix
2750
+
2751
+ Scheme:
2752
+ @code
2753
+ a0 {A1 A2 A3 A4}
2754
+ a1 {B1 B2 B3 B4}
2755
+ a2 {C1 C2 C3 C4}
2756
+ a3 {D1 D2 D3 D4}
2757
+ ===============
2758
+ b0 {A1 B1 C1 D1}
2759
+ b1 {A2 B2 C2 D2}
2760
+ b2 {A3 B3 C3 D3}
2761
+ b3 {A4 B4 C4 D4}
2762
+ @endcode
2763
+ */
2764
+ template<typename _Tp, int n>
2765
+ inline void v_transpose4x4( v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
2766
+ const v_reg<_Tp, n>& a2, const v_reg<_Tp, n>& a3,
2767
+ v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1,
2768
+ v_reg<_Tp, n>& b2, v_reg<_Tp, n>& b3 )
2769
+ {
2770
+ for (int i = 0; i < n / 4; i++)
2771
+ {
2772
+ b0.s[0 + i*4] = a0.s[0 + i*4]; b0.s[1 + i*4] = a1.s[0 + i*4];
2773
+ b0.s[2 + i*4] = a2.s[0 + i*4]; b0.s[3 + i*4] = a3.s[0 + i*4];
2774
+ b1.s[0 + i*4] = a0.s[1 + i*4]; b1.s[1 + i*4] = a1.s[1 + i*4];
2775
+ b1.s[2 + i*4] = a2.s[1 + i*4]; b1.s[3 + i*4] = a3.s[1 + i*4];
2776
+ b2.s[0 + i*4] = a0.s[2 + i*4]; b2.s[1 + i*4] = a1.s[2 + i*4];
2777
+ b2.s[2 + i*4] = a2.s[2 + i*4]; b2.s[3 + i*4] = a3.s[2 + i*4];
2778
+ b3.s[0 + i*4] = a0.s[3 + i*4]; b3.s[1 + i*4] = a1.s[3 + i*4];
2779
+ b3.s[2 + i*4] = a2.s[3 + i*4]; b3.s[3 + i*4] = a3.s[3 + i*4];
2780
+ }
2781
+ }
2782
+
2783
+ //! @brief Helper macro
2784
+ //! @ingroup core_hal_intrin_impl
2785
+ #define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, prefix, suffix) \
2786
+ inline _Tpvec prefix##_setzero_##suffix() { return _Tpvec::zero(); }
2787
+
2788
+ //! @name Init with zero
2789
+ //! @{
2790
+ //! @brief Create new vector with zero elements
2791
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, v, u8)
2792
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, v, s8)
2793
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, v, u16)
2794
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, v, s16)
2795
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, v, u32)
2796
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, v, s32)
2797
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, v, f32)
2798
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, v, f64)
2799
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, v, u64)
2800
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, v, s64)
2801
+
2802
+ #if CV_SIMD256
2803
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x32, v256, u8)
2804
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x32, v256, s8)
2805
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x16, v256, u16)
2806
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x16, v256, s16)
2807
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x8, v256, u32)
2808
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x8, v256, s32)
2809
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x8, v256, f32)
2810
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x4, v256, f64)
2811
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x4, v256, u64)
2812
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x4, v256, s64)
2813
+ #endif
2814
+
2815
+ #if CV_SIMD512
2816
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x64, v512, u8)
2817
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x64, v512, s8)
2818
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x32, v512, u16)
2819
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x32, v512, s16)
2820
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x16, v512, u32)
2821
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x16, v512, s32)
2822
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x16, v512, f32)
2823
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x8, v512, f64)
2824
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x8, v512, u64)
2825
+ OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x8, v512, s64)
2826
+ #endif
2827
+ //! @}
2828
+
2829
+ //! @brief Helper macro
2830
+ //! @ingroup core_hal_intrin_impl
2831
+ #define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, prefix, suffix) \
2832
+ inline _Tpvec prefix##_setall_##suffix(_Tp val) { return _Tpvec::all(val); }
2833
+
2834
+ //! @name Init with value
2835
+ //! @{
2836
+ //! @brief Create new vector with elements set to a specific value
2837
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, v, u8)
2838
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, v, s8)
2839
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, v, u16)
2840
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, v, s16)
2841
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, v, u32)
2842
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, v, s32)
2843
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, v, f32)
2844
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, v, f64)
2845
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, v, u64)
2846
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, v, s64)
2847
+
2848
+ #if CV_SIMD256
2849
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x32, uchar, v256, u8)
2850
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x32, schar, v256, s8)
2851
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x16, ushort, v256, u16)
2852
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x16, short, v256, s16)
2853
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x8, unsigned, v256, u32)
2854
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x8, int, v256, s32)
2855
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x8, float, v256, f32)
2856
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x4, double, v256, f64)
2857
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x4, uint64, v256, u64)
2858
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x4, int64, v256, s64)
2859
+ #endif
2860
+
2861
+ #if CV_SIMD512
2862
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x64, uchar, v512, u8)
2863
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x64, schar, v512, s8)
2864
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x32, ushort, v512, u16)
2865
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x32, short, v512, s16)
2866
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x16, unsigned, v512, u32)
2867
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x16, int, v512, s32)
2868
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x16, float, v512, f32)
2869
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x8, double, v512, f64)
2870
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x8, uint64, v512, u64)
2871
+ OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x8, int64, v512, s64)
2872
+ #endif
2873
+ //! @}
2874
+
2875
+ //! @brief Helper macro
2876
+ //! @ingroup core_hal_intrin_impl
2877
+ #define OPENCV_HAL_IMPL_C_REINTERPRET(_Tp, suffix) \
2878
+ template<typename _Tp0, int n0> inline v_reg<_Tp, n0*sizeof(_Tp0)/sizeof(_Tp)> \
2879
+ v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
2880
+ { return a.template reinterpret_as<_Tp, n0*sizeof(_Tp0)/sizeof(_Tp)>(); }
2881
+
2882
+ //! @name Reinterpret
2883
+ //! @{
2884
+ //! @brief Convert vector to different type without modifying underlying data.
2885
+ OPENCV_HAL_IMPL_C_REINTERPRET(uchar, u8)
2886
+ OPENCV_HAL_IMPL_C_REINTERPRET(schar, s8)
2887
+ OPENCV_HAL_IMPL_C_REINTERPRET(ushort, u16)
2888
+ OPENCV_HAL_IMPL_C_REINTERPRET(short, s16)
2889
+ OPENCV_HAL_IMPL_C_REINTERPRET(unsigned, u32)
2890
+ OPENCV_HAL_IMPL_C_REINTERPRET(int, s32)
2891
+ OPENCV_HAL_IMPL_C_REINTERPRET(float, f32)
2892
+ OPENCV_HAL_IMPL_C_REINTERPRET(double, f64)
2893
+ OPENCV_HAL_IMPL_C_REINTERPRET(uint64, u64)
2894
+ OPENCV_HAL_IMPL_C_REINTERPRET(int64, s64)
2895
+ //! @}
2896
+
2897
+ //! @brief Helper macro
2898
+ //! @ingroup core_hal_intrin_impl
2899
+ #define OPENCV_HAL_IMPL_C_SHIFTL(_Tp) \
2900
+ template<int shift, int n> inline v_reg<_Tp, n> v_shl(const v_reg<_Tp, n>& a) \
2901
+ { return a << shift; }
2902
+
2903
+ //! @name Left shift
2904
+ //! @{
2905
+ //! @brief Shift left
2906
+ OPENCV_HAL_IMPL_C_SHIFTL(ushort)
2907
+ OPENCV_HAL_IMPL_C_SHIFTL(short)
2908
+ OPENCV_HAL_IMPL_C_SHIFTL(unsigned)
2909
+ OPENCV_HAL_IMPL_C_SHIFTL(int)
2910
+ OPENCV_HAL_IMPL_C_SHIFTL(uint64)
2911
+ OPENCV_HAL_IMPL_C_SHIFTL(int64)
2912
+ //! @}
2913
+
2914
+ //! @brief Helper macro
2915
+ //! @ingroup core_hal_intrin_impl
2916
+ #define OPENCV_HAL_IMPL_C_SHIFTR(_Tp) \
2917
+ template<int shift, int n> inline v_reg<_Tp, n> v_shr(const v_reg<_Tp, n>& a) \
2918
+ { return a >> shift; }
2919
+
2920
+ //! @name Right shift
2921
+ //! @{
2922
+ //! @brief Shift right
2923
+ OPENCV_HAL_IMPL_C_SHIFTR(ushort)
2924
+ OPENCV_HAL_IMPL_C_SHIFTR(short)
2925
+ OPENCV_HAL_IMPL_C_SHIFTR(unsigned)
2926
+ OPENCV_HAL_IMPL_C_SHIFTR(int)
2927
+ OPENCV_HAL_IMPL_C_SHIFTR(uint64)
2928
+ OPENCV_HAL_IMPL_C_SHIFTR(int64)
2929
+ //! @}
2930
+
2931
+ //! @brief Helper macro
2932
+ //! @ingroup core_hal_intrin_impl
2933
+ #define OPENCV_HAL_IMPL_C_RSHIFTR(_Tp) \
2934
+ template<int shift, int n> inline v_reg<_Tp, n> v_rshr(const v_reg<_Tp, n>& a) \
2935
+ { \
2936
+ v_reg<_Tp, n> c; \
2937
+ for( int i = 0; i < n; i++ ) \
2938
+ c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
2939
+ return c; \
2940
+ }
2941
+
2942
+ //! @name Rounding shift
2943
+ //! @{
2944
+ //! @brief Rounding shift right
2945
+ OPENCV_HAL_IMPL_C_RSHIFTR(ushort)
2946
+ OPENCV_HAL_IMPL_C_RSHIFTR(short)
2947
+ OPENCV_HAL_IMPL_C_RSHIFTR(unsigned)
2948
+ OPENCV_HAL_IMPL_C_RSHIFTR(int)
2949
+ OPENCV_HAL_IMPL_C_RSHIFTR(uint64)
2950
+ OPENCV_HAL_IMPL_C_RSHIFTR(int64)
2951
+ //! @}
2952
+
2953
+ //! @brief Helper macro
2954
+ //! @ingroup core_hal_intrin_impl
2955
+ #define OPENCV_HAL_IMPL_C_PACK(_Tp, _Tpn, pack_suffix, cast) \
2956
+ template<int n> inline v_reg<_Tpn, 2*n> v_##pack_suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
2957
+ { \
2958
+ v_reg<_Tpn, 2*n> c; \
2959
+ for( int i = 0; i < n; i++ ) \
2960
+ { \
2961
+ c.s[i] = cast<_Tpn>(a.s[i]); \
2962
+ c.s[i+n] = cast<_Tpn>(b.s[i]); \
2963
+ } \
2964
+ return c; \
2965
+ }
2966
+
2967
+ //! @name Pack
2968
+ //! @{
2969
+ //! @brief Pack values from two vectors to one
2970
+ //!
2971
+ //! Return vector type have twice more elements than input vector types. Variant with _u_ suffix also
2972
+ //! converts to corresponding unsigned type.
2973
+ //!
2974
+ //! - pack: for 16-, 32- and 64-bit integer input types
2975
+ //! - pack_u: for 16- and 32-bit signed integer input types
2976
+ //!
2977
+ //! @note All variants except 64-bit use saturation.
2978
+ OPENCV_HAL_IMPL_C_PACK(ushort, uchar, pack, saturate_cast)
2979
+ OPENCV_HAL_IMPL_C_PACK(short, schar, pack, saturate_cast)
2980
+ OPENCV_HAL_IMPL_C_PACK(unsigned, ushort, pack, saturate_cast)
2981
+ OPENCV_HAL_IMPL_C_PACK(int, short, pack, saturate_cast)
2982
+ OPENCV_HAL_IMPL_C_PACK(uint64, unsigned, pack, static_cast)
2983
+ OPENCV_HAL_IMPL_C_PACK(int64, int, pack, static_cast)
2984
+ OPENCV_HAL_IMPL_C_PACK(short, uchar, pack_u, saturate_cast)
2985
+ OPENCV_HAL_IMPL_C_PACK(int, ushort, pack_u, saturate_cast)
2986
+ //! @}
2987
+
2988
+ //! @brief Helper macro
2989
+ //! @ingroup core_hal_intrin_impl
2990
+ #define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tp, _Tpn, pack_suffix, cast) \
2991
+ template<int shift, int n> inline v_reg<_Tpn, 2*n> v_rshr_##pack_suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
2992
+ { \
2993
+ v_reg<_Tpn, 2*n> c; \
2994
+ for( int i = 0; i < n; i++ ) \
2995
+ { \
2996
+ c.s[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
2997
+ c.s[i+n] = cast<_Tpn>((b.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
2998
+ } \
2999
+ return c; \
3000
+ }
3001
+
3002
+ //! @name Pack with rounding shift
3003
+ //! @{
3004
+ //! @brief Pack values from two vectors to one with rounding shift
3005
+ //!
3006
+ //! Values from the input vectors will be shifted right by _n_ bits with rounding, converted to narrower
3007
+ //! type and returned in the result vector. Variant with _u_ suffix converts to unsigned type.
3008
+ //!
3009
+ //! - pack: for 16-, 32- and 64-bit integer input types
3010
+ //! - pack_u: for 16- and 32-bit signed integer input types
3011
+ //!
3012
+ //! @note All variants except 64-bit use saturation.
3013
+ OPENCV_HAL_IMPL_C_RSHR_PACK(ushort, uchar, pack, saturate_cast)
3014
+ OPENCV_HAL_IMPL_C_RSHR_PACK(short, schar, pack, saturate_cast)
3015
+ OPENCV_HAL_IMPL_C_RSHR_PACK(unsigned, ushort, pack, saturate_cast)
3016
+ OPENCV_HAL_IMPL_C_RSHR_PACK(int, short, pack, saturate_cast)
3017
+ OPENCV_HAL_IMPL_C_RSHR_PACK(uint64, unsigned, pack, static_cast)
3018
+ OPENCV_HAL_IMPL_C_RSHR_PACK(int64, int, pack, static_cast)
3019
+ OPENCV_HAL_IMPL_C_RSHR_PACK(short, uchar, pack_u, saturate_cast)
3020
+ OPENCV_HAL_IMPL_C_RSHR_PACK(int, ushort, pack_u, saturate_cast)
3021
+ //! @}
3022
+
3023
+ //! @brief Helper macro
3024
+ //! @ingroup core_hal_intrin_impl
3025
+ #define OPENCV_HAL_IMPL_C_PACK_STORE(_Tp, _Tpn, pack_suffix, cast) \
3026
+ template<int n> inline void v_##pack_suffix##_store(_Tpn* ptr, const v_reg<_Tp, n>& a) \
3027
+ { \
3028
+ for( int i = 0; i < n; i++ ) \
3029
+ ptr[i] = cast<_Tpn>(a.s[i]); \
3030
+ }
3031
+
3032
+ //! @name Pack and store
3033
+ //! @{
3034
+ //! @brief Store values from the input vector into memory with pack
3035
+ //!
3036
+ //! Values will be stored into memory with conversion to narrower type.
3037
+ //! Variant with _u_ suffix converts to corresponding unsigned type.
3038
+ //!
3039
+ //! - pack: for 16-, 32- and 64-bit integer input types
3040
+ //! - pack_u: for 16- and 32-bit signed integer input types
3041
+ //!
3042
+ //! @note All variants except 64-bit use saturation.
3043
+ OPENCV_HAL_IMPL_C_PACK_STORE(ushort, uchar, pack, saturate_cast)
3044
+ OPENCV_HAL_IMPL_C_PACK_STORE(short, schar, pack, saturate_cast)
3045
+ OPENCV_HAL_IMPL_C_PACK_STORE(unsigned, ushort, pack, saturate_cast)
3046
+ OPENCV_HAL_IMPL_C_PACK_STORE(int, short, pack, saturate_cast)
3047
+ OPENCV_HAL_IMPL_C_PACK_STORE(uint64, unsigned, pack, static_cast)
3048
+ OPENCV_HAL_IMPL_C_PACK_STORE(int64, int, pack, static_cast)
3049
+ OPENCV_HAL_IMPL_C_PACK_STORE(short, uchar, pack_u, saturate_cast)
3050
+ OPENCV_HAL_IMPL_C_PACK_STORE(int, ushort, pack_u, saturate_cast)
3051
+ //! @}
3052
+
3053
+ //! @brief Helper macro
3054
+ //! @ingroup core_hal_intrin_impl
3055
+ #define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tp, _Tpn, pack_suffix, cast) \
3056
+ template<int shift, int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const v_reg<_Tp, n>& a) \
3057
+ { \
3058
+ for( int i = 0; i < n; i++ ) \
3059
+ ptr[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
3060
+ }
3061
+
3062
+ //! @name Pack and store with rounding shift
3063
+ //! @{
3064
+ //! @brief Store values from the input vector into memory with pack
3065
+ //!
3066
+ //! Values will be shifted _n_ bits right with rounding, converted to narrower type and stored into
3067
+ //! memory. Variant with _u_ suffix converts to unsigned type.
3068
+ //!
3069
+ //! - pack: for 16-, 32- and 64-bit integer input types
3070
+ //! - pack_u: for 16- and 32-bit signed integer input types
3071
+ //!
3072
+ //! @note All variants except 64-bit use saturation.
3073
+ OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(ushort, uchar, pack, saturate_cast)
3074
+ OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(short, schar, pack, saturate_cast)
3075
+ OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(unsigned, ushort, pack, saturate_cast)
3076
+ OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int, short, pack, saturate_cast)
3077
+ OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(uint64, unsigned, pack, static_cast)
3078
+ OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int64, int, pack, static_cast)
3079
+ OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(short, uchar, pack_u, saturate_cast)
3080
+ OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int, ushort, pack_u, saturate_cast)
3081
+ //! @}
3082
+
3083
+ //! @cond IGNORED
3084
+ template<typename _Tpm, typename _Tp, int n>
3085
+ inline void _pack_b(_Tpm* mptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
3086
+ {
3087
+ for (int i = 0; i < n; ++i)
3088
+ {
3089
+ mptr[i] = (_Tpm)a.s[i];
3090
+ mptr[i + n] = (_Tpm)b.s[i];
3091
+ }
3092
+ }
3093
+ //! @endcond
3094
+
3095
+ //! @name Pack boolean values
3096
+ //! @{
3097
+ //! @brief Pack boolean values from multiple vectors to one unsigned 8-bit integer vector
3098
+ //!
3099
+ //! @note Must provide valid boolean values to guarantee same result for all architectures.
3100
+
3101
+ /** @brief
3102
+ //! For 16-bit boolean values
3103
+
3104
+ Scheme:
3105
+ @code
3106
+ a {0xFFFF 0 0 0xFFFF 0 0xFFFF 0xFFFF 0}
3107
+ b {0xFFFF 0 0xFFFF 0 0 0xFFFF 0 0xFFFF}
3108
+ ===============
3109
+ {
3110
+ 0xFF 0 0 0xFF 0 0xFF 0xFF 0
3111
+ 0xFF 0 0xFF 0 0 0xFF 0 0xFF
3112
+ }
3113
+ @endcode */
3114
+
3115
+ template<int n> inline v_reg<uchar, 2*n> v_pack_b(const v_reg<ushort, n>& a, const v_reg<ushort, n>& b)
3116
+ {
3117
+ v_reg<uchar, 2*n> mask;
3118
+ _pack_b(mask.s, a, b);
3119
+ return mask;
3120
+ }
3121
+
3122
+ /** @overload
3123
+ For 32-bit boolean values
3124
+
3125
+ Scheme:
3126
+ @code
3127
+ a {0xFFFF.. 0 0 0xFFFF..}
3128
+ b {0 0xFFFF.. 0xFFFF.. 0}
3129
+ c {0xFFFF.. 0 0xFFFF.. 0}
3130
+ d {0 0xFFFF.. 0 0xFFFF..}
3131
+ ===============
3132
+ {
3133
+ 0xFF 0 0 0xFF 0 0xFF 0xFF 0
3134
+ 0xFF 0 0xFF 0 0 0xFF 0 0xFF
3135
+ }
3136
+ @endcode */
3137
+
3138
+ template<int n> inline v_reg<uchar, 4*n> v_pack_b(const v_reg<unsigned, n>& a, const v_reg<unsigned, n>& b,
3139
+ const v_reg<unsigned, n>& c, const v_reg<unsigned, n>& d)
3140
+ {
3141
+ v_reg<uchar, 4*n> mask;
3142
+ _pack_b(mask.s, a, b);
3143
+ _pack_b(mask.s + 2*n, c, d);
3144
+ return mask;
3145
+ }
3146
+
3147
+ /** @overload
3148
+ For 64-bit boolean values
3149
+
3150
+ Scheme:
3151
+ @code
3152
+ a {0xFFFF.. 0}
3153
+ b {0 0xFFFF..}
3154
+ c {0xFFFF.. 0}
3155
+ d {0 0xFFFF..}
3156
+
3157
+ e {0xFFFF.. 0}
3158
+ f {0xFFFF.. 0}
3159
+ g {0 0xFFFF..}
3160
+ h {0 0xFFFF..}
3161
+ ===============
3162
+ {
3163
+ 0xFF 0 0 0xFF 0xFF 0 0 0xFF
3164
+ 0xFF 0 0xFF 0 0 0xFF 0 0xFF
3165
+ }
3166
+ @endcode */
3167
+ template<int n> inline v_reg<uchar, 8*n> v_pack_b(const v_reg<uint64, n>& a, const v_reg<uint64, n>& b,
3168
+ const v_reg<uint64, n>& c, const v_reg<uint64, n>& d,
3169
+ const v_reg<uint64, n>& e, const v_reg<uint64, n>& f,
3170
+ const v_reg<uint64, n>& g, const v_reg<uint64, n>& h)
3171
+ {
3172
+ v_reg<uchar, 8*n> mask;
3173
+ _pack_b(mask.s, a, b);
3174
+ _pack_b(mask.s + 2*n, c, d);
3175
+ _pack_b(mask.s + 4*n, e, f);
3176
+ _pack_b(mask.s + 6*n, g, h);
3177
+ return mask;
3178
+ }
3179
+ //! @}
3180
+
3181
+ /** @brief Matrix multiplication
3182
+
3183
+ Scheme:
3184
+ @code
3185
+ {A0 A1 A2 A3} |V0|
3186
+ {B0 B1 B2 B3} |V1|
3187
+ {C0 C1 C2 C3} |V2|
3188
+ {D0 D1 D2 D3} x |V3|
3189
+ ====================
3190
+ {R0 R1 R2 R3}, where:
3191
+ R0 = A0V0 + B0V1 + C0V2 + D0V3,
3192
+ R1 = A1V0 + B1V1 + C1V2 + D1V3
3193
+ ...
3194
+ @endcode
3195
+ */
3196
+ template<int n>
3197
+ inline v_reg<float, n> v_matmul(const v_reg<float, n>& v,
3198
+ const v_reg<float, n>& a, const v_reg<float, n>& b,
3199
+ const v_reg<float, n>& c, const v_reg<float, n>& d)
3200
+ {
3201
+ v_reg<float, n> res;
3202
+ for (int i = 0; i < n / 4; i++)
3203
+ {
3204
+ res.s[0 + i*4] = v.s[0 + i*4] * a.s[0 + i*4] + v.s[1 + i*4] * b.s[0 + i*4] + v.s[2 + i*4] * c.s[0 + i*4] + v.s[3 + i*4] * d.s[0 + i*4];
3205
+ res.s[1 + i*4] = v.s[0 + i*4] * a.s[1 + i*4] + v.s[1 + i*4] * b.s[1 + i*4] + v.s[2 + i*4] * c.s[1 + i*4] + v.s[3 + i*4] * d.s[1 + i*4];
3206
+ res.s[2 + i*4] = v.s[0 + i*4] * a.s[2 + i*4] + v.s[1 + i*4] * b.s[2 + i*4] + v.s[2 + i*4] * c.s[2 + i*4] + v.s[3 + i*4] * d.s[2 + i*4];
3207
+ res.s[3 + i*4] = v.s[0 + i*4] * a.s[3 + i*4] + v.s[1 + i*4] * b.s[3 + i*4] + v.s[2 + i*4] * c.s[3 + i*4] + v.s[3 + i*4] * d.s[3 + i*4];
3208
+ }
3209
+ return res;
3210
+ }
3211
+
3212
+ /** @brief Matrix multiplication and add
3213
+
3214
+ Scheme:
3215
+ @code
3216
+ {A0 A1 A2 A3} |V0| |D0|
3217
+ {B0 B1 B2 B3} |V1| |D1|
3218
+ {C0 C1 C2 C3} x |V2| + |D2|
3219
+ ==================== |D3|
3220
+ {R0 R1 R2 R3}, where:
3221
+ R0 = A0V0 + B0V1 + C0V2 + D0,
3222
+ R1 = A1V0 + B1V1 + C1V2 + D1
3223
+ ...
3224
+ @endcode
3225
+ */
3226
+ template<int n>
3227
+ inline v_reg<float, n> v_matmuladd(const v_reg<float, n>& v,
3228
+ const v_reg<float, n>& a, const v_reg<float, n>& b,
3229
+ const v_reg<float, n>& c, const v_reg<float, n>& d)
3230
+ {
3231
+ v_reg<float, n> res;
3232
+ for (int i = 0; i < n / 4; i++)
3233
+ {
3234
+ res.s[0 + i * 4] = v.s[0 + i * 4] * a.s[0 + i * 4] + v.s[1 + i * 4] * b.s[0 + i * 4] + v.s[2 + i * 4] * c.s[0 + i * 4] + d.s[0 + i * 4];
3235
+ res.s[1 + i * 4] = v.s[0 + i * 4] * a.s[1 + i * 4] + v.s[1 + i * 4] * b.s[1 + i * 4] + v.s[2 + i * 4] * c.s[1 + i * 4] + d.s[1 + i * 4];
3236
+ res.s[2 + i * 4] = v.s[0 + i * 4] * a.s[2 + i * 4] + v.s[1 + i * 4] * b.s[2 + i * 4] + v.s[2 + i * 4] * c.s[2 + i * 4] + d.s[2 + i * 4];
3237
+ res.s[3 + i * 4] = v.s[0 + i * 4] * a.s[3 + i * 4] + v.s[1 + i * 4] * b.s[3 + i * 4] + v.s[2 + i * 4] * c.s[3 + i * 4] + d.s[3 + i * 4];
3238
+ }
3239
+ return res;
3240
+ }
3241
+
3242
+
3243
+ template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b)
3244
+ { return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
3245
+ template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b,
3246
+ const v_reg<double, n/2>& c)
3247
+ { return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }
3248
+
3249
+ template<int n> inline v_reg<double, n/2> v_dotprod_expand_fast(const v_reg<int, n>& a, const v_reg<int, n>& b)
3250
+ { return v_dotprod_expand(a, b); }
3251
+ template<int n> inline v_reg<double, n/2> v_dotprod_expand_fast(const v_reg<int, n>& a, const v_reg<int, n>& b,
3252
+ const v_reg<double, n/2>& c)
3253
+ { return v_dotprod_expand(a, b, c); }
3254
+
3255
+ ////// FP16 support ///////
3256
+
3257
+ inline v_reg<float, simd128_width / sizeof(float)>
3258
+ v_load_expand(const float16_t* ptr)
3259
+ {
3260
+ v_reg<float, simd128_width / sizeof(float)> v;
3261
+ for( int i = 0; i < v.nlanes; i++ )
3262
+ {
3263
+ v.s[i] = ptr[i];
3264
+ }
3265
+ return v;
3266
+ }
3267
+ #if CV_SIMD256
3268
+ inline v_reg<float, simd256_width / sizeof(float)>
3269
+ v256_load_expand(const float16_t* ptr)
3270
+ {
3271
+ v_reg<float, simd256_width / sizeof(float)> v;
3272
+ for (int i = 0; i < v.nlanes; i++)
3273
+ {
3274
+ v.s[i] = ptr[i];
3275
+ }
3276
+ return v;
3277
+ }
3278
+ #endif
3279
+ #if CV_SIMD512
3280
+ inline v_reg<float, simd512_width / sizeof(float)>
3281
+ v512_load_expand(const float16_t* ptr)
3282
+ {
3283
+ v_reg<float, simd512_width / sizeof(float)> v;
3284
+ for (int i = 0; i < v.nlanes; i++)
3285
+ {
3286
+ v.s[i] = ptr[i];
3287
+ }
3288
+ return v;
3289
+ }
3290
+ #endif
3291
+
3292
+ template<int n> inline void
3293
+ v_pack_store(float16_t* ptr, const v_reg<float, n>& v)
3294
+ {
3295
+ for( int i = 0; i < v.nlanes; i++ )
3296
+ {
3297
+ ptr[i] = float16_t(v.s[i]);
3298
+ }
3299
+ }
3300
+
3301
+ inline void v_cleanup() {}
3302
+ #if CV_SIMD256
3303
+ inline void v256_cleanup() {}
3304
+ #endif
3305
+ #if CV_SIMD512
3306
+ inline void v512_cleanup() {}
3307
+ #endif
3308
+
3309
+ //! @}
3310
+
3311
+ #ifndef CV_DOXYGEN
3312
+ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
3313
+ #endif
3314
+ }
3315
+
3316
+ #if !defined(CV_DOXYGEN)
3317
+ #undef CV_SIMD256
3318
+ #undef CV_SIMD512
3319
+ #endif
3320
+
3321
+ #endif