node-native-win-utils 1.1.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +144 -27
- package/binding.gyp +18 -5
- package/dist/index.d.ts +146 -4
- package/dist/index.js +107 -3
- package/include/opencv2/core/affine.hpp +678 -0
- package/include/opencv2/core/async.hpp +105 -0
- package/include/opencv2/core/base.hpp +664 -0
- package/include/opencv2/core/bindings_utils.hpp +325 -0
- package/include/opencv2/core/bufferpool.hpp +40 -0
- package/include/opencv2/core/check.hpp +170 -0
- package/include/opencv2/core/core.hpp +48 -0
- package/include/opencv2/core/core_c.h +3128 -0
- package/include/opencv2/core/cuda/block.hpp +211 -0
- package/include/opencv2/core/cuda/border_interpolate.hpp +722 -0
- package/include/opencv2/core/cuda/color.hpp +309 -0
- package/include/opencv2/core/cuda/common.hpp +131 -0
- package/include/opencv2/core/cuda/datamov_utils.hpp +113 -0
- package/include/opencv2/core/cuda/detail/color_detail.hpp +2018 -0
- package/include/opencv2/core/cuda/detail/reduce.hpp +365 -0
- package/include/opencv2/core/cuda/detail/reduce_key_val.hpp +502 -0
- package/include/opencv2/core/cuda/detail/transform_detail.hpp +392 -0
- package/include/opencv2/core/cuda/detail/type_traits_detail.hpp +191 -0
- package/include/opencv2/core/cuda/detail/vec_distance_detail.hpp +121 -0
- package/include/opencv2/core/cuda/dynamic_smem.hpp +88 -0
- package/include/opencv2/core/cuda/emulation.hpp +269 -0
- package/include/opencv2/core/cuda/filters.hpp +293 -0
- package/include/opencv2/core/cuda/funcattrib.hpp +79 -0
- package/include/opencv2/core/cuda/functional.hpp +805 -0
- package/include/opencv2/core/cuda/limits.hpp +128 -0
- package/include/opencv2/core/cuda/reduce.hpp +209 -0
- package/include/opencv2/core/cuda/saturate_cast.hpp +292 -0
- package/include/opencv2/core/cuda/scan.hpp +258 -0
- package/include/opencv2/core/cuda/simd_functions.hpp +869 -0
- package/include/opencv2/core/cuda/transform.hpp +75 -0
- package/include/opencv2/core/cuda/type_traits.hpp +90 -0
- package/include/opencv2/core/cuda/utility.hpp +230 -0
- package/include/opencv2/core/cuda/vec_distance.hpp +232 -0
- package/include/opencv2/core/cuda/vec_math.hpp +923 -0
- package/include/opencv2/core/cuda/vec_traits.hpp +288 -0
- package/include/opencv2/core/cuda/warp.hpp +139 -0
- package/include/opencv2/core/cuda/warp_reduce.hpp +76 -0
- package/include/opencv2/core/cuda/warp_shuffle.hpp +162 -0
- package/include/opencv2/core/cuda.hpp +1279 -0
- package/include/opencv2/core/cuda.inl.hpp +763 -0
- package/include/opencv2/core/cuda_stream_accessor.hpp +86 -0
- package/include/opencv2/core/cuda_types.hpp +144 -0
- package/include/opencv2/core/cv_cpu_dispatch.h +381 -0
- package/include/opencv2/core/cv_cpu_helper.h +550 -0
- package/include/opencv2/core/cvdef.h +973 -0
- package/include/opencv2/core/cvstd.hpp +190 -0
- package/include/opencv2/core/cvstd.inl.hpp +197 -0
- package/include/opencv2/core/cvstd_wrapper.hpp +154 -0
- package/include/opencv2/core/detail/async_promise.hpp +71 -0
- package/include/opencv2/core/detail/dispatch_helper.impl.hpp +49 -0
- package/include/opencv2/core/detail/exception_ptr.hpp +27 -0
- package/include/opencv2/core/directx.hpp +184 -0
- package/include/opencv2/core/dualquaternion.hpp +979 -0
- package/include/opencv2/core/dualquaternion.inl.hpp +487 -0
- package/include/opencv2/core/eigen.hpp +402 -0
- package/include/opencv2/core/fast_math.hpp +433 -0
- package/include/opencv2/core/hal/hal.hpp +256 -0
- package/include/opencv2/core/hal/interface.h +190 -0
- package/include/opencv2/core/hal/intrin.hpp +939 -0
- package/include/opencv2/core/hal/intrin_avx.hpp +3177 -0
- package/include/opencv2/core/hal/intrin_avx512.hpp +3090 -0
- package/include/opencv2/core/hal/intrin_cpp.hpp +3321 -0
- package/include/opencv2/core/hal/intrin_forward.hpp +191 -0
- package/include/opencv2/core/hal/intrin_lasx.hpp +3236 -0
- package/include/opencv2/core/hal/intrin_msa.hpp +1887 -0
- package/include/opencv2/core/hal/intrin_neon.hpp +2610 -0
- package/include/opencv2/core/hal/intrin_rvv.hpp +3320 -0
- package/include/opencv2/core/hal/intrin_rvv071.hpp +2545 -0
- package/include/opencv2/core/hal/intrin_rvv_scalable.hpp +2080 -0
- package/include/opencv2/core/hal/intrin_sse.hpp +3467 -0
- package/include/opencv2/core/hal/intrin_sse_em.hpp +180 -0
- package/include/opencv2/core/hal/intrin_vsx.hpp +1608 -0
- package/include/opencv2/core/hal/intrin_wasm.hpp +2782 -0
- package/include/opencv2/core/hal/msa_macros.h +1558 -0
- package/include/opencv2/core/hal/simd_utils.impl.hpp +186 -0
- package/include/opencv2/core/llapi/llapi.h +102 -0
- package/include/opencv2/core/mat.hpp +3775 -0
- package/include/opencv2/core/mat.inl.hpp +3422 -0
- package/include/opencv2/core/matx.hpp +1536 -0
- package/include/opencv2/core/neon_utils.hpp +128 -0
- package/include/opencv2/core/ocl.hpp +917 -0
- package/include/opencv2/core/ocl_genbase.hpp +69 -0
- package/include/opencv2/core/opencl/ocl_defs.hpp +82 -0
- package/include/opencv2/core/opencl/opencl_info.hpp +212 -0
- package/include/opencv2/core/opencl/opencl_svm.hpp +81 -0
- package/include/opencv2/core/opencl/runtime/autogenerated/opencl_clblas.hpp +602 -0
- package/include/opencv2/core/opencl/runtime/autogenerated/opencl_clfft.hpp +146 -0
- package/include/opencv2/core/opencl/runtime/autogenerated/opencl_core.hpp +371 -0
- package/include/opencv2/core/opencl/runtime/autogenerated/opencl_core_wrappers.hpp +272 -0
- package/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl.hpp +62 -0
- package/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl_wrappers.hpp +42 -0
- package/include/opencv2/core/opencl/runtime/opencl_clblas.hpp +53 -0
- package/include/opencv2/core/opencl/runtime/opencl_clfft.hpp +53 -0
- package/include/opencv2/core/opencl/runtime/opencl_core.hpp +84 -0
- package/include/opencv2/core/opencl/runtime/opencl_core_wrappers.hpp +47 -0
- package/include/opencv2/core/opencl/runtime/opencl_gl.hpp +53 -0
- package/include/opencv2/core/opencl/runtime/opencl_gl_wrappers.hpp +47 -0
- package/include/opencv2/core/opencl/runtime/opencl_svm_20.hpp +48 -0
- package/include/opencv2/core/opencl/runtime/opencl_svm_definitions.hpp +42 -0
- package/include/opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp +166 -0
- package/include/opencv2/core/opengl.hpp +733 -0
- package/include/opencv2/core/openvx/ovx_defs.hpp +48 -0
- package/include/opencv2/core/operations.hpp +610 -0
- package/include/opencv2/core/optim.hpp +302 -0
- package/include/opencv2/core/ovx.hpp +28 -0
- package/include/opencv2/core/parallel/backend/parallel_for.openmp.hpp +72 -0
- package/include/opencv2/core/parallel/backend/parallel_for.tbb.hpp +153 -0
- package/include/opencv2/core/parallel/parallel_backend.hpp +90 -0
- package/include/opencv2/core/persistence.hpp +1350 -0
- package/include/opencv2/core/private/cv_cpu_include_simd_declarations.hpp +30 -0
- package/include/opencv2/core/private.cuda.hpp +169 -0
- package/include/opencv2/core/private.hpp +896 -0
- package/include/opencv2/core/quaternion.hpp +1696 -0
- package/include/opencv2/core/quaternion.inl.hpp +1063 -0
- package/include/opencv2/core/saturate.hpp +180 -0
- package/include/opencv2/core/simd_intrinsics.hpp +87 -0
- package/include/opencv2/core/softfloat.hpp +514 -0
- package/include/opencv2/core/sse_utils.hpp +652 -0
- package/include/opencv2/core/traits.hpp +417 -0
- package/include/opencv2/core/types.hpp +2457 -0
- package/include/opencv2/core/types_c.h +2126 -0
- package/include/opencv2/core/utility.hpp +1229 -0
- package/include/opencv2/core/utils/allocator_stats.hpp +29 -0
- package/include/opencv2/core/utils/allocator_stats.impl.hpp +158 -0
- package/include/opencv2/core/utils/buffer_area.private.hpp +136 -0
- package/include/opencv2/core/utils/configuration.private.hpp +22 -0
- package/include/opencv2/core/utils/filesystem.hpp +82 -0
- package/include/opencv2/core/utils/filesystem.private.hpp +66 -0
- package/include/opencv2/core/utils/fp_control.private.hpp +29 -0
- package/include/opencv2/core/utils/fp_control_utils.hpp +69 -0
- package/include/opencv2/core/utils/instrumentation.hpp +125 -0
- package/include/opencv2/core/utils/lock.private.hpp +119 -0
- package/include/opencv2/core/utils/logger.defines.hpp +42 -0
- package/include/opencv2/core/utils/logger.hpp +218 -0
- package/include/opencv2/core/utils/logtag.hpp +28 -0
- package/include/opencv2/core/utils/plugin_loader.private.hpp +165 -0
- package/include/opencv2/core/utils/tls.hpp +235 -0
- package/include/opencv2/core/utils/trace.hpp +252 -0
- package/include/opencv2/core/utils/trace.private.hpp +421 -0
- package/include/opencv2/core/va_intel.hpp +75 -0
- package/include/opencv2/core/version.hpp +26 -0
- package/include/opencv2/core/vsx_utils.hpp +1047 -0
- package/include/opencv2/core.hpp +3365 -0
- package/include/opencv2/imgcodecs/imgcodecs.hpp +48 -0
- package/include/opencv2/imgcodecs/imgcodecs_c.h +1 -0
- package/include/opencv2/imgcodecs/ios.h +59 -0
- package/include/opencv2/imgcodecs/legacy/constants_c.h +54 -0
- package/include/opencv2/imgcodecs/macosx.h +20 -0
- package/include/opencv2/imgcodecs.hpp +407 -0
- package/include/opencv2/imgproc/bindings.hpp +34 -0
- package/include/opencv2/imgproc/detail/gcgraph.hpp +395 -0
- package/include/opencv2/imgproc/hal/hal.hpp +246 -0
- package/include/opencv2/imgproc/hal/interface.h +46 -0
- package/include/opencv2/imgproc/imgproc.hpp +48 -0
- package/include/opencv2/imgproc/imgproc_c.h +1177 -0
- package/include/opencv2/imgproc/segmentation.hpp +141 -0
- package/include/opencv2/imgproc/types_c.h +659 -0
- package/include/opencv2/imgproc.hpp +5035 -0
- package/include/opencv2/opencv_modules.hpp +17 -0
- package/libs/libjpeg-turbo.lib +0 -0
- package/libs/libpng.lib +0 -0
- package/libs/opencv_core470.lib +0 -0
- package/libs/opencv_imgcodecs470.lib +0 -0
- package/libs/opencv_imgproc470.lib +0 -0
- package/libs/zlib.lib +0 -0
- package/package.json +8 -2
- package/prebuilds/win32-x64/node.napi.node +0 -0
- package/src/cpp/capturewindow.cpp +36 -46
- package/src/cpp/main.cpp +10 -2
- package/src/cpp/opencv.cpp +425 -0
|
@@ -0,0 +1,2080 @@
|
|
|
1
|
+
// This file is part of OpenCV project.
|
|
2
|
+
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
|
3
|
+
// of this distribution and at http://opencv.org/license.html.
|
|
4
|
+
|
|
5
|
+
// The original implementation is contributed by HAN Liutong.
|
|
6
|
+
// Copyright (C) 2022, Institute of Software, Chinese Academy of Sciences.
|
|
7
|
+
|
|
8
|
+
#ifndef OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP
|
|
9
|
+
#define OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP
|
|
10
|
+
|
|
11
|
+
#include <initializer_list>
|
|
12
|
+
#include <assert.h>
|
|
13
|
+
#include <vector>
|
|
14
|
+
#include <opencv2/core/check.hpp>
|
|
15
|
+
|
|
16
|
+
#if defined(__GNUC__) && !defined(__clang__)
|
|
17
|
+
// FIXIT: eliminate massive warnigs from templates
|
|
18
|
+
// GCC from 'rvv-next': riscv64-unknown-linux-gnu-g++ (g42df3464463) 12.0.1 20220505 (prerelease)
|
|
19
|
+
// doesn't work: #pragma GCC diagnostic push
|
|
20
|
+
#pragma GCC diagnostic ignored "-Wignored-attributes"
|
|
21
|
+
#endif
|
|
22
|
+
|
|
23
|
+
#ifndef CV_RVV_MAX_VLEN
|
|
24
|
+
#define CV_RVV_MAX_VLEN 1024
|
|
25
|
+
#endif
|
|
26
|
+
|
|
27
|
+
namespace cv
|
|
28
|
+
{
|
|
29
|
+
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
|
|
30
|
+
|
|
31
|
+
#define CV_SIMD_SCALABLE 1
|
|
32
|
+
#define CV_SIMD_SCALABLE_64F 1
|
|
33
|
+
|
|
34
|
+
using v_uint8 = vuint8m1_t;
|
|
35
|
+
using v_int8 = vint8m1_t;
|
|
36
|
+
using v_uint16 = vuint16m1_t;
|
|
37
|
+
using v_int16 = vint16m1_t;
|
|
38
|
+
using v_uint32 = vuint32m1_t;
|
|
39
|
+
using v_int32 = vint32m1_t;
|
|
40
|
+
using v_uint64 = vuint64m1_t;
|
|
41
|
+
using v_int64 = vint64m1_t;
|
|
42
|
+
|
|
43
|
+
using v_float32 = vfloat32m1_t;
|
|
44
|
+
#if CV_SIMD_SCALABLE_64F
|
|
45
|
+
using v_float64 = vfloat64m1_t;
|
|
46
|
+
#endif
|
|
47
|
+
|
|
48
|
+
using uchar = unsigned char;
|
|
49
|
+
using schar = signed char;
|
|
50
|
+
using ushort = unsigned short;
|
|
51
|
+
using uint = unsigned int;
|
|
52
|
+
using uint64 = unsigned long int;
|
|
53
|
+
using int64 = long int;
|
|
54
|
+
|
|
55
|
+
static const int __cv_rvv_e8_nlanes = vsetvlmax_e8m1();
|
|
56
|
+
static const int __cv_rvv_e16_nlanes = vsetvlmax_e16m1();
|
|
57
|
+
static const int __cv_rvv_e32_nlanes = vsetvlmax_e32m1();
|
|
58
|
+
static const int __cv_rvv_e64_nlanes = vsetvlmax_e64m1();
|
|
59
|
+
|
|
60
|
+
template <class T>
|
|
61
|
+
struct VTraits;
|
|
62
|
+
|
|
63
|
+
template <>
|
|
64
|
+
struct VTraits<v_uint8>
|
|
65
|
+
{
|
|
66
|
+
static inline int vlanes() { return __cv_rvv_e8_nlanes; }
|
|
67
|
+
using lane_type = uchar;
|
|
68
|
+
static const int max_nlanes = CV_RVV_MAX_VLEN/8;
|
|
69
|
+
};
|
|
70
|
+
|
|
71
|
+
template <>
|
|
72
|
+
struct VTraits<v_int8>
|
|
73
|
+
{
|
|
74
|
+
static inline int vlanes() { return __cv_rvv_e8_nlanes; }
|
|
75
|
+
using lane_type = schar;
|
|
76
|
+
static const int max_nlanes = CV_RVV_MAX_VLEN/8;
|
|
77
|
+
};
|
|
78
|
+
template <>
|
|
79
|
+
struct VTraits<v_uint16>
|
|
80
|
+
{
|
|
81
|
+
static inline int vlanes() { return __cv_rvv_e16_nlanes; }
|
|
82
|
+
using lane_type = ushort;
|
|
83
|
+
static const int max_nlanes = CV_RVV_MAX_VLEN/16;
|
|
84
|
+
};
|
|
85
|
+
template <>
|
|
86
|
+
struct VTraits<v_int16>
|
|
87
|
+
{
|
|
88
|
+
static inline int vlanes() { return __cv_rvv_e16_nlanes; }
|
|
89
|
+
using lane_type = short;
|
|
90
|
+
static const int max_nlanes = CV_RVV_MAX_VLEN/16;
|
|
91
|
+
};
|
|
92
|
+
template <>
|
|
93
|
+
struct VTraits<v_uint32>
|
|
94
|
+
{
|
|
95
|
+
static inline int vlanes() { return __cv_rvv_e32_nlanes; }
|
|
96
|
+
using lane_type = uint;
|
|
97
|
+
static const int max_nlanes = CV_RVV_MAX_VLEN/32;
|
|
98
|
+
};
|
|
99
|
+
template <>
|
|
100
|
+
struct VTraits<v_int32>
|
|
101
|
+
{
|
|
102
|
+
static inline int vlanes() { return __cv_rvv_e32_nlanes; }
|
|
103
|
+
using lane_type = int;
|
|
104
|
+
static const int max_nlanes = CV_RVV_MAX_VLEN/32;
|
|
105
|
+
};
|
|
106
|
+
|
|
107
|
+
template <>
|
|
108
|
+
struct VTraits<v_float32>
|
|
109
|
+
{
|
|
110
|
+
static inline int vlanes() { return __cv_rvv_e32_nlanes; }
|
|
111
|
+
using lane_type = float;
|
|
112
|
+
static const int max_nlanes = CV_RVV_MAX_VLEN/32;
|
|
113
|
+
};
|
|
114
|
+
template <>
|
|
115
|
+
struct VTraits<v_uint64>
|
|
116
|
+
{
|
|
117
|
+
static inline int vlanes() { return __cv_rvv_e64_nlanes; }
|
|
118
|
+
using lane_type = uint64;
|
|
119
|
+
static const int max_nlanes = CV_RVV_MAX_VLEN/64;
|
|
120
|
+
};
|
|
121
|
+
template <>
|
|
122
|
+
struct VTraits<v_int64>
|
|
123
|
+
{
|
|
124
|
+
static inline int vlanes() { return __cv_rvv_e64_nlanes; }
|
|
125
|
+
using lane_type = int64;
|
|
126
|
+
static const int max_nlanes = CV_RVV_MAX_VLEN/64;
|
|
127
|
+
};
|
|
128
|
+
#if CV_SIMD_SCALABLE_64F
|
|
129
|
+
template <>
|
|
130
|
+
struct VTraits<v_float64>
|
|
131
|
+
{
|
|
132
|
+
static inline int vlanes() { return __cv_rvv_e64_nlanes; }
|
|
133
|
+
using lane_type = double;
|
|
134
|
+
static const int max_nlanes = CV_RVV_MAX_VLEN/64;
|
|
135
|
+
};
|
|
136
|
+
#endif
|
|
137
|
+
|
|
138
|
+
//////////// get0 ////////////
|
|
139
|
+
#define OPENCV_HAL_IMPL_RVV_GRT0_INT(_Tpvec, _Tp) \
|
|
140
|
+
inline _Tp v_get0(const v_##_Tpvec& v) \
|
|
141
|
+
{ \
|
|
142
|
+
return vmv_x(v); \
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
OPENCV_HAL_IMPL_RVV_GRT0_INT(uint8, uchar)
|
|
146
|
+
OPENCV_HAL_IMPL_RVV_GRT0_INT(int8, schar)
|
|
147
|
+
OPENCV_HAL_IMPL_RVV_GRT0_INT(uint16, ushort)
|
|
148
|
+
OPENCV_HAL_IMPL_RVV_GRT0_INT(int16, short)
|
|
149
|
+
OPENCV_HAL_IMPL_RVV_GRT0_INT(uint32, unsigned)
|
|
150
|
+
OPENCV_HAL_IMPL_RVV_GRT0_INT(int32, int)
|
|
151
|
+
OPENCV_HAL_IMPL_RVV_GRT0_INT(uint64, uint64)
|
|
152
|
+
OPENCV_HAL_IMPL_RVV_GRT0_INT(int64, int64)
|
|
153
|
+
|
|
154
|
+
inline float v_get0(const v_float32& v) \
|
|
155
|
+
{ \
|
|
156
|
+
return vfmv_f(v); \
|
|
157
|
+
}
|
|
158
|
+
#if CV_SIMD_SCALABLE_64F
|
|
159
|
+
inline double v_get0(const v_float64& v) \
|
|
160
|
+
{ \
|
|
161
|
+
return vfmv_f(v); \
|
|
162
|
+
}
|
|
163
|
+
#endif
|
|
164
|
+
|
|
165
|
+
//////////// Initial ////////////
|
|
166
|
+
|
|
167
|
+
#define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, suffix1, suffix2, vl) \
|
|
168
|
+
inline v_##_Tpvec v_setzero_##suffix1() \
|
|
169
|
+
{ \
|
|
170
|
+
return vmv_v_x_##suffix2##m1(0, vl); \
|
|
171
|
+
} \
|
|
172
|
+
inline v_##_Tpvec v_setall_##suffix1(_Tp v) \
|
|
173
|
+
{ \
|
|
174
|
+
return vmv_v_x_##suffix2##m1(v, vl); \
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8, uchar, u8, u8, VTraits<v_uint8>::vlanes())
|
|
178
|
+
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8, schar, s8, i8, VTraits<v_int8>::vlanes())
|
|
179
|
+
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16, ushort, u16, u16, VTraits<v_uint16>::vlanes())
|
|
180
|
+
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16, short, s16, i16, VTraits<v_int16>::vlanes())
|
|
181
|
+
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32, uint, u32, u32, VTraits<v_uint32>::vlanes())
|
|
182
|
+
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32, int, s32, i32, VTraits<v_int32>::vlanes())
|
|
183
|
+
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64, uint64, u64, u64, VTraits<v_uint64>::vlanes())
|
|
184
|
+
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64, int64, s64, i64, VTraits<v_int64>::vlanes())
|
|
185
|
+
|
|
186
|
+
#define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, suffix, vl) \
|
|
187
|
+
inline v_##_Tpv v_setzero_##suffix() \
|
|
188
|
+
{ \
|
|
189
|
+
return vfmv_v_f_##suffix##m1(0, vl); \
|
|
190
|
+
} \
|
|
191
|
+
inline v_##_Tpv v_setall_##suffix(_Tp v) \
|
|
192
|
+
{ \
|
|
193
|
+
return vfmv_v_f_##suffix##m1(v, vl); \
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
OPENCV_HAL_IMPL_RVV_INIT_FP(float32, float, f32, VTraits<v_float32>::vlanes())
|
|
197
|
+
#if CV_SIMD_SCALABLE_64F
|
|
198
|
+
OPENCV_HAL_IMPL_RVV_INIT_FP(float64, double, f64, VTraits<v_float64>::vlanes())
|
|
199
|
+
#endif
|
|
200
|
+
|
|
201
|
+
//////////// Reinterpret ////////////
|
|
202
|
+
#define OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(_Tpvec1, suffix1) \
|
|
203
|
+
inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec1& v) \
|
|
204
|
+
{ \
|
|
205
|
+
return v;\
|
|
206
|
+
}
|
|
207
|
+
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint8, u8)
|
|
208
|
+
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint16, u16)
|
|
209
|
+
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint32, u32)
|
|
210
|
+
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint64, u64)
|
|
211
|
+
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int8, s8)
|
|
212
|
+
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int16, s16)
|
|
213
|
+
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int32, s32)
|
|
214
|
+
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int64, s64)
|
|
215
|
+
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float32, f32)
|
|
216
|
+
#if CV_SIMD_SCALABLE_64F
|
|
217
|
+
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float64, f64)
|
|
218
|
+
#endif
|
|
219
|
+
// TODO: can be simplified by using overloaded RV intrinsic
|
|
220
|
+
#define OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2) \
|
|
221
|
+
inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
|
|
222
|
+
{ \
|
|
223
|
+
return v_##_Tpvec1(vreinterpret_v_##nsuffix2##m1_##nsuffix1##m1(v));\
|
|
224
|
+
} \
|
|
225
|
+
inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
|
|
226
|
+
{ \
|
|
227
|
+
return v_##_Tpvec2(vreinterpret_v_##nsuffix1##m1_##nsuffix2##m1(v));\
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, int8, u8, s8, u8, i8)
|
|
231
|
+
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, int16, u16, s16, u16, i16)
|
|
232
|
+
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, int32, u32, s32, u32, i32)
|
|
233
|
+
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, float32, u32, f32, u32, f32)
|
|
234
|
+
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, float32, s32, f32, i32, f32)
|
|
235
|
+
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, int64, u64, s64, u64, i64)
|
|
236
|
+
#if CV_SIMD_SCALABLE_64F
|
|
237
|
+
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, float64, u64, f64, u64, f64)
|
|
238
|
+
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int64, float64, s64, f64, i64, f64)
|
|
239
|
+
#endif
|
|
240
|
+
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint16, u8, u16, u8, u16)
|
|
241
|
+
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint32, u8, u32, u8, u32)
|
|
242
|
+
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint64, u8, u64, u8, u64)
|
|
243
|
+
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, uint32, u16, u32, u16, u32)
|
|
244
|
+
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, uint64, u16, u64, u16, u64)
|
|
245
|
+
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, uint64, u32, u64, u32, u64)
|
|
246
|
+
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int16, s8, s16, i8, i16)
|
|
247
|
+
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int32, s8, s32, i8, i32)
|
|
248
|
+
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int64, s8, s64, i8, i64)
|
|
249
|
+
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, int32, s16, s32, i16, i32)
|
|
250
|
+
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, int64, s16, s64, i16, i64)
|
|
251
|
+
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, int64, s32, s64, i32, i64)
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
#define OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \
|
|
255
|
+
inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
|
|
256
|
+
{ \
|
|
257
|
+
return vreinterpret_v_##nsuffix1##width2##m1_##nsuffix1##width1##m1(vreinterpret_v_##nsuffix2##width2##m1_##nsuffix1##width2##m1(v));\
|
|
258
|
+
} \
|
|
259
|
+
inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
|
|
260
|
+
{ \
|
|
261
|
+
return vreinterpret_v_##nsuffix1##width2##m1_##nsuffix2##width2##m1(vreinterpret_v_##nsuffix1##width1##m1_##nsuffix1##width2##m1(v));\
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int16, u8, s16, u, i, 8, 16)
|
|
265
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int32, u8, s32, u, i, 8, 32)
|
|
266
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int64, u8, s64, u, i, 8, 64)
|
|
267
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int8, u16, s8, u, i, 16, 8)
|
|
268
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int32, u16, s32, u, i, 16, 32)
|
|
269
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int64, u16, s64, u, i, 16, 64)
|
|
270
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int8, u32, s8, u, i, 32, 8)
|
|
271
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int16, u32, s16, u, i, 32, 16)
|
|
272
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int64, u32, s64, u, i, 32, 64)
|
|
273
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int8, u64, s8, u, i, 64, 8)
|
|
274
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int16, u64, s16, u, i, 64, 16)
|
|
275
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int32, u64, s32, u, i, 64, 32)
|
|
276
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float32, u8, f32, u, f, 8, 32)
|
|
277
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float32, u16, f32, u, f, 16, 32)
|
|
278
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, float32, u64, f32, u, f, 64, 32)
|
|
279
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float32, s8, f32, i, f, 8, 32)
|
|
280
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float32, s16, f32, i, f, 16, 32)
|
|
281
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int64, float32, s64, f32, i, f, 64, 32)
|
|
282
|
+
#if CV_SIMD_SCALABLE_64F
|
|
283
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float64, u8, f64, u, f, 8, 64)
|
|
284
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float64, u16, f64, u, f, 16, 64)
|
|
285
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, float64, u32, f64, u, f, 32, 64)
|
|
286
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float64, s8, f64, i, f, 8, 64)
|
|
287
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float64, s16, f64, i, f, 16, 64)
|
|
288
|
+
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32, float64, s32, f64, i, f, 32, 64)
|
|
289
|
+
// Three times reinterpret
|
|
290
|
+
inline v_float32 v_reinterpret_as_f32(const v_float64& v) \
|
|
291
|
+
{ \
|
|
292
|
+
return vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v)));\
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
inline v_float64 v_reinterpret_as_f64(const v_float32& v) \
|
|
296
|
+
{ \
|
|
297
|
+
return vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v)));\
|
|
298
|
+
}
|
|
299
|
+
#endif
|
|
300
|
+
|
|
301
|
+
//////////// Extract //////////////
|
|
302
|
+
|
|
303
|
+
#define OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(_Tpvec, _Tp, suffix, vl) \
|
|
304
|
+
template <int s = 0> \
|
|
305
|
+
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b, int i = s) \
|
|
306
|
+
{ \
|
|
307
|
+
return vslideup(vslidedown(v_setzero_##suffix(), a, i, vl), b, VTraits<_Tpvec>::vlanes() - i, vl); \
|
|
308
|
+
} \
|
|
309
|
+
template<int s = 0> inline _Tp v_extract_n(_Tpvec v, int i = s) \
|
|
310
|
+
{ \
|
|
311
|
+
return vmv_x(vslidedown(v_setzero_##suffix(), v, i, vl)); \
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint8, uchar, u8, VTraits<v_uint8>::vlanes())
|
|
316
|
+
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int8, schar, s8, VTraits<v_int8>::vlanes())
|
|
317
|
+
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint16, ushort, u16, VTraits<v_uint16>::vlanes())
|
|
318
|
+
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int16, short, s16, VTraits<v_int16>::vlanes())
|
|
319
|
+
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint32, unsigned int, u32, VTraits<v_uint32>::vlanes())
|
|
320
|
+
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int32, int, s32, VTraits<v_int32>::vlanes())
|
|
321
|
+
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint64, uint64, u64, VTraits<v_uint64>::vlanes())
|
|
322
|
+
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int64, int64, s64, VTraits<v_int64>::vlanes())
|
|
323
|
+
|
|
324
|
+
#define OPENCV_HAL_IMPL_RVV_EXTRACT_FP(_Tpvec, _Tp, suffix, vl) \
|
|
325
|
+
template <int s = 0> \
|
|
326
|
+
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b, int i = s) \
|
|
327
|
+
{ \
|
|
328
|
+
return vslideup(vslidedown(v_setzero_##suffix(), a, i, vl), b, VTraits<_Tpvec>::vlanes() - i, vl); \
|
|
329
|
+
} \
|
|
330
|
+
template<int s = 0> inline _Tp v_extract_n(_Tpvec v, int i = s) \
|
|
331
|
+
{ \
|
|
332
|
+
return vfmv_f(vslidedown(v_setzero_##suffix(), v, i, vl)); \
|
|
333
|
+
}
|
|
334
|
+
|
|
335
|
+
OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float32, float, f32, VTraits<v_float32>::vlanes())
|
|
336
|
+
#if CV_SIMD_SCALABLE_64F
|
|
337
|
+
OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float64, double, f64, VTraits<v_float64>::vlanes())
|
|
338
|
+
#endif
|
|
339
|
+
|
|
340
|
+
#define OPENCV_HAL_IMPL_RVV_EXTRACT(_Tpvec, _Tp, vl) \
|
|
341
|
+
inline _Tp v_extract_highest(_Tpvec v) \
|
|
342
|
+
{ \
|
|
343
|
+
return v_extract_n(v, vl-1); \
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint8, uchar, VTraits<v_uint8>::vlanes())
|
|
347
|
+
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int8, schar, VTraits<v_int8>::vlanes())
|
|
348
|
+
OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint16, ushort, VTraits<v_uint16>::vlanes())
|
|
349
|
+
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int16, short, VTraits<v_int16>::vlanes())
|
|
350
|
+
OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint32, unsigned int, VTraits<v_uint32>::vlanes())
|
|
351
|
+
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int32, int, VTraits<v_int32>::vlanes())
|
|
352
|
+
OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint64, uint64, VTraits<v_uint64>::vlanes())
|
|
353
|
+
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int64, int64, VTraits<v_int64>::vlanes())
|
|
354
|
+
OPENCV_HAL_IMPL_RVV_EXTRACT(v_float32, float, VTraits<v_float32>::vlanes())
|
|
355
|
+
#if CV_SIMD_SCALABLE_64F
|
|
356
|
+
OPENCV_HAL_IMPL_RVV_EXTRACT(v_float64, double, VTraits<v_float64>::vlanes())
|
|
357
|
+
#endif
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
////////////// Load/Store //////////////
|
|
361
|
+
#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix, vmv) \
|
|
362
|
+
inline _Tpvec v_load(const _Tp* ptr) \
|
|
363
|
+
{ \
|
|
364
|
+
return vle##width##_v_##suffix##m1(ptr, vl); \
|
|
365
|
+
} \
|
|
366
|
+
inline _Tpvec v_load_aligned(const _Tp* ptr) \
|
|
367
|
+
{ \
|
|
368
|
+
return vle##width##_v_##suffix##m1(ptr, vl); \
|
|
369
|
+
} \
|
|
370
|
+
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
|
|
371
|
+
{ \
|
|
372
|
+
vse##width##_v_##suffix##m1(ptr, a, vl); \
|
|
373
|
+
} \
|
|
374
|
+
inline _Tpvec v_load_low(const _Tp* ptr) \
|
|
375
|
+
{ \
|
|
376
|
+
return vle##width##_v_##suffix##m1(ptr, hvl); \
|
|
377
|
+
} \
|
|
378
|
+
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
|
|
379
|
+
{ \
|
|
380
|
+
return vslideup(vle##width##_v_##suffix##m1(ptr0, hvl), vle##width##_v_##suffix##m1(ptr1, hvl), hvl, vl); \
|
|
381
|
+
} \
|
|
382
|
+
inline void v_store(_Tp* ptr, const _Tpvec& a) \
|
|
383
|
+
{ \
|
|
384
|
+
vse##width(ptr, a, vl); \
|
|
385
|
+
} \
|
|
386
|
+
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
|
|
387
|
+
{ \
|
|
388
|
+
vse##width(ptr, a, vl); \
|
|
389
|
+
} \
|
|
390
|
+
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
|
|
391
|
+
{ \
|
|
392
|
+
vse##width(ptr, a, vl); \
|
|
393
|
+
} \
|
|
394
|
+
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
|
|
395
|
+
{ \
|
|
396
|
+
vse##width(ptr, a, hvl); \
|
|
397
|
+
} \
|
|
398
|
+
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
|
|
399
|
+
{ \
|
|
400
|
+
vse##width(ptr, vslidedown_vx_##suffix##m1(vmv(0, vl), a, hvl, vl), hvl); \
|
|
401
|
+
} \
|
|
402
|
+
inline _Tpvec v_load(std::initializer_list<_Tp> nScalars) \
|
|
403
|
+
{ \
|
|
404
|
+
assert(nScalars.size() == vl); \
|
|
405
|
+
return vle##width##_v_##suffix##m1(nScalars.begin(), nScalars.size()); \
|
|
406
|
+
} \
|
|
407
|
+
template<typename... Targs> \
|
|
408
|
+
_Tpvec v_load_##suffix(Targs... nScalars) \
|
|
409
|
+
{ \
|
|
410
|
+
return v_load({nScalars...}); \
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
// Instantiate load/store ops for every element type.
// Args: universal type, native RVV m1 type, scalar type, hvl (half vector
// length, used by the low-half store in the macro), vl, element width in
// bits, type suffix, and the broadcast intrinsic (vmv_v_x / vfmv_v_f).
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8, vuint8m1_t, uchar, VTraits<v_uint8>::vlanes() / 2, VTraits<v_uint8>::vlanes(), 8, u8, vmv_v_x_u8m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8, vint8m1_t, schar, VTraits<v_int8>::vlanes() / 2, VTraits<v_int8>::vlanes(), 8, i8, vmv_v_x_i8m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16, vuint16m1_t, ushort, VTraits<v_uint16>::vlanes() / 2, VTraits<v_uint16>::vlanes(), 16, u16, vmv_v_x_u16m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16, vint16m1_t, short, VTraits<v_int16>::vlanes() / 2, VTraits<v_int16>::vlanes(), 16, i16, vmv_v_x_i16m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32, vuint32m1_t, unsigned int, VTraits<v_uint32>::vlanes() / 2, VTraits<v_uint32>::vlanes(), 32, u32, vmv_v_x_u32m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32, vint32m1_t, int, VTraits<v_int32>::vlanes() / 2, VTraits<v_int32>::vlanes(), 32, i32, vmv_v_x_i32m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64, vuint64m1_t, uint64, VTraits<v_uint64>::vlanes() / 2, VTraits<v_uint64>::vlanes(), 64, u64, vmv_v_x_u64m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64, vint64m1_t, int64, VTraits<v_int64>::vlanes() / 2, VTraits<v_int64>::vlanes(), 64, i64, vmv_v_x_i64m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32, vfloat32m1_t, float, VTraits<v_float32>::vlanes() /2 , VTraits<v_float32>::vlanes(), 32, f32, vfmv_v_f_f32m1)

#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64, vfloat64m1_t, double, VTraits<v_float64>::vlanes() / 2, VTraits<v_float64>::vlanes(), 64, f64, vfmv_v_f_f64m1)
#endif
|
|
427
|
+
|
|
428
|
+
////////////// Lookup table access ////////////////////
|
|
429
|
+
#define OPENCV_HAL_IMPL_RVV_LUT(_Tpvec, _Tp, suffix) \
|
|
430
|
+
inline _Tpvec v_lut(const _Tp* tab, const int* idx) \
|
|
431
|
+
{ \
|
|
432
|
+
vuint32##suffix##_t vidx = vmul(vreinterpret_u32##suffix(vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
|
|
433
|
+
return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
|
|
434
|
+
} \
|
|
435
|
+
inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \
|
|
436
|
+
{ \
|
|
437
|
+
std::vector<uint> idx_; \
|
|
438
|
+
for (size_t i = 0; i < VTraits<v_int16>::vlanes(); ++i) { \
|
|
439
|
+
idx_.push_back(idx[i]); \
|
|
440
|
+
idx_.push_back(idx[i]+1); \
|
|
441
|
+
} \
|
|
442
|
+
vuint32##suffix##_t vidx = vmul(vle32_v_u32##suffix(idx_.data(), VTraits<_Tpvec>::vlanes()), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
|
|
443
|
+
return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
|
|
444
|
+
} \
|
|
445
|
+
inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \
|
|
446
|
+
{ \
|
|
447
|
+
std::vector<uint> idx_; \
|
|
448
|
+
for (size_t i = 0; i < VTraits<v_int32>::vlanes(); ++i) { \
|
|
449
|
+
idx_.push_back(idx[i]); \
|
|
450
|
+
idx_.push_back(idx[i]+1); \
|
|
451
|
+
idx_.push_back(idx[i]+2); \
|
|
452
|
+
idx_.push_back(idx[i]+3); \
|
|
453
|
+
} \
|
|
454
|
+
vuint32##suffix##_t vidx = vmul(vle32_v_u32##suffix(idx_.data(), VTraits<_Tpvec>::vlanes()), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
|
|
455
|
+
return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
|
|
456
|
+
}
|
|
457
|
+
OPENCV_HAL_IMPL_RVV_LUT(v_int8, schar, m4)
|
|
458
|
+
OPENCV_HAL_IMPL_RVV_LUT(v_int16, short, m2)
|
|
459
|
+
OPENCV_HAL_IMPL_RVV_LUT(v_int32, int, m1)
|
|
460
|
+
OPENCV_HAL_IMPL_RVV_LUT(v_int64, int64_t, mf2)
|
|
461
|
+
OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1)
|
|
462
|
+
#if CV_SIMD_SCALABLE_64F
|
|
463
|
+
OPENCV_HAL_IMPL_RVV_LUT(v_float64, double, mf2)
|
|
464
|
+
#endif
|
|
465
|
+
|
|
466
|
+
// Unsigned LUT variants: identical bit patterns, so reuse the signed
// implementations and reinterpret the result back to the unsigned type.
inline v_uint8 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
inline v_uint8 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
inline v_uint8 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
inline v_uint16 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
inline v_uint16 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
inline v_uint16 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
inline v_uint32 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
inline v_uint32 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
inline v_uint32 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
inline v_uint64 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
inline v_uint64 v_lut_quads(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_quads((const int64_t*)tab, idx)); }
|
|
478
|
+
|
|
479
|
+
////////////// Pack boolean ////////////////////
|
|
480
|
+
// Pack boolean masks into a v_uint8. Inputs are concatenated into one
// wide register group (vlmul_ext + vset), then narrowed with vnsrl
// (narrowing shift-right by 0) once per halving step.

// 2x u16 -> u8: one narrowing step.
inline v_uint8 v_pack_b(const v_uint16& a, const v_uint16& b)
{
    return vnsrl(vset(vlmul_ext_u16m2(a),1,b), 0, VTraits<v_uint8>::vlanes());
}

// 4x u32 -> u8: two narrowing steps (u32 -> u16 -> u8).
inline v_uint8 v_pack_b(const v_uint32& a, const v_uint32& b,
                        const v_uint32& c, const v_uint32& d)
{
    return vnsrl(vnsrl(vset(vset(vset(vlmul_ext_u32m4(a),1,b),2,c),3,d), 0, VTraits<v_uint8>::vlanes()), 0, VTraits<v_uint8>::vlanes());
}

// 8x u64 -> u8: three narrowing steps (u64 -> u32 -> u16 -> u8).
inline v_uint8 v_pack_b(const v_uint64& a, const v_uint64& b, const v_uint64& c,
                        const v_uint64& d, const v_uint64& e, const v_uint64& f,
                        const v_uint64& g, const v_uint64& h)
{
    return vnsrl(vnsrl(vnsrl(
        vset(vset(vset(vset(vset(vset(vset(vlmul_ext_u64m8(a),
        1,b),2,c),3,d),4,e),5,f),6,g),7,h),
        0, VTraits<v_uint8>::vlanes()), 0, VTraits<v_uint8>::vlanes()), 0, VTraits<v_uint8>::vlanes());
}
|
|
501
|
+
|
|
502
|
+
////////////// Arithmetics //////////////
|
|
503
|
+
// Maps one OpenCV universal-intrinsic binary op onto one RVV intrinsic.
// 8/16-bit add/sub are instantiated with the saturating forms
// (vsadd/vsaddu/vssub/vssubu); 32/64-bit integer ops wrap (vadd/vsub/vmul).
#define OPENCV_HAL_IMPL_RVV_BIN_OP(_Tpvec, ocv_intrin, rvv_intrin) \
inline _Tpvec v_##ocv_intrin(const _Tpvec& a, const _Tpvec& b) \
{ \
    return rvv_intrin(a, b, VTraits<_Tpvec>::vlanes()); \
}

OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, add, vsaddu)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, sub, vssubu)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, add, vsadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, sub, vssub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, add, vsaddu)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, sub, vssubu)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, add, vsadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, sub, vssub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, add, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, sub, vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, mul, vmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, add, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, sub, vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, mul, vmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, add, vfadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, sub, vfsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, mul, vfmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, div, vfdiv)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint64, add, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint64, sub, vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int64, add, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int64, sub, vsub)

#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, add, vfadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, sub, vfsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, mul, vfmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, div, vfdiv)
#endif
|
|
538
|
+
|
|
539
|
+
// Variadic v_add / v_mul: fold any number of operands pairwise, left to
// right. The two-argument base case comes from OPENCV_HAL_IMPL_RVV_BIN_OP
// (or the saturating v_mul for 8/16-bit types).
#define OPENCV_HAL_IMPL_RVV_BIN_MADD(_Tpvec, rvv_add) \
template<typename... Args> \
inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
    return v_add(rvv_add(f1, f2, VTraits<_Tpvec>::vlanes()), vf...); \
}
#define OPENCV_HAL_IMPL_RVV_BIN_MMUL(_Tpvec, rvv_mul) \
template<typename... Args> \
inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
    return v_mul(rvv_mul(f1, f2, VTraits<_Tpvec>::vlanes()), vf...); \
}
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint8, vsaddu)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int8, vsadd)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint16, vsaddu)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int16, vsadd)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint32, vadd)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int32, vadd)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_float32, vfadd)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint64, vadd)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int64, vadd)

OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_uint32, vmul)
OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_int32, vmul)
OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float32, vfmul)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_float64, vfadd)
OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float64, vfmul)
#endif
|
|
566
|
+
|
|
567
|
+
// v_mul_expand: widening multiply. The m2-sized double-width product is
// split into its low (c) and high (d) m1 halves with vget.
#define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _TpwvecM2, suffix, wmul) \
inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \
{ \
    _TpwvecM2 temp = wmul(a, b, VTraits<_Tpvec>::vlanes()); \
    c = vget_##suffix##m1(temp, 0); \
    d = vget_##suffix##m1(temp, 1); \
}

OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8, v_uint16, vuint16m2_t, u16, vwmulu)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8, v_int16, vint16m2_t, i16, vwmul)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16, v_uint32, vuint32m2_t, u32, vwmulu)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16, v_int32, vint32m2_t, i32, vwmul)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32, v_uint64, vuint64m2_t, u64, vwmulu)
|
|
580
|
+
|
|
581
|
+
// v_mul_hi: element-wise multiply keeping only the high half of the
// double-width product (vmulh for signed, vmulhu for unsigned).
inline v_int16 v_mul_hi(const v_int16& a, const v_int16& b)
{
    return vmulh(a, b, VTraits<v_int16>::vlanes());
}
inline v_uint16 v_mul_hi(const v_uint16& a, const v_uint16& b)
{
    return vmulhu(a, b, VTraits<v_uint16>::vlanes());
}
|
|
589
|
+
|
|
590
|
+
////////////// Arithmetics (wrap)//////////////
|
|
591
|
+
// Wrapping (modular) add/sub/mul for 8/16-bit types — unlike v_add/v_sub
// above these deliberately use the non-saturating intrinsics.
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, add_wrap, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, add_wrap, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, add_wrap, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, add_wrap, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, sub_wrap, vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, sub_wrap, vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, sub_wrap, vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, sub_wrap, vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, mul_wrap, vmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, mul_wrap, vmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, mul_wrap, vmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, mul_wrap, vmul)
|
|
603
|
+
|
|
604
|
+
//////// Saturating Multiply ////////
|
|
605
|
+
// Saturating v_mul for 8/16-bit types: widen (vwmul/vwmulu), then narrow
// back with saturation (vnclip/vnclipu, shift amount 0). The variadic
// overload folds additional operands pairwise.
#define OPENCV_HAL_IMPL_RVV_MUL_SAT(_Tpvec, _clip, _wmul) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _clip(_wmul(a, b, VTraits<_Tpvec>::vlanes()), 0, VTraits<_Tpvec>::vlanes()); \
} \
template<typename... Args> \
inline _Tpvec v_mul(const _Tpvec& a1, const _Tpvec& a2, const Args&... va) { \
    return v_mul(_clip(_wmul(a1, a2, VTraits<_Tpvec>::vlanes()), 0, VTraits<_Tpvec>::vlanes()), va...); \
}

OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint8, vnclipu, vwmulu)
OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int8, vnclip, vwmul)
OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint16, vnclipu, vwmulu)
OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int16, vnclip, vwmul)
|
|
619
|
+
|
|
620
|
+
////////////// Bitwise logic //////////////
|
|
621
|
+
|
|
622
|
+
// Bitwise and/or/xor/not for all integer vector types.
#define OPENCV_HAL_IMPL_RVV_LOGIC_OP(_Tpvec, vl) \
inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \
{ \
    return vand(a, b, vl); \
} \
inline _Tpvec v_or(const _Tpvec& a, const _Tpvec& b) \
{ \
    return vor(a, b, vl); \
} \
inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \
{ \
    return vxor(a, b, vl); \
} \
inline _Tpvec v_not (const _Tpvec& a) \
{ \
    return vnot(a, vl); \
}

OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint8, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int8, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint16, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int16, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint32, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64, VTraits<v_int64>::vlanes())
|
|
648
|
+
|
|
649
|
+
// Bitwise ops on v_float32: RVV has no float bit ops, so operate on the
// i32 view of the register and reinterpret back (pure bit manipulation,
// no value conversion).
#define OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(intrin) \
inline v_float32 intrin (const v_float32& a, const v_float32& b) \
{ \
    return vreinterpret_f32m1(intrin(vreinterpret_i32m1(a), vreinterpret_i32m1(b))); \
}
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_and)
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_or)
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_xor)
|
|
657
|
+
|
|
658
|
+
// Bitwise NOT for v_float32: flip all bits through the i32 view of the
// register (no float "not" exists in RVV).
// FIX: removed stray '\' line-continuations left over from copying the
// macro above — harmless here, but a '//' comment appended to one of
// those lines would splice the following line into the comment.
inline v_float32 v_not (const v_float32& a)
{
    return vreinterpret_f32m1(v_not(vreinterpret_i32m1(a)));
}
|
|
662
|
+
|
|
663
|
+
#if CV_SIMD_SCALABLE_64F
// Bitwise ops on v_float64, implemented on the i64 bit-pattern view
// exactly like the f32 versions.
#define OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(intrin) \
inline v_float64 intrin (const v_float64& a, const v_float64& b) \
{ \
    return vreinterpret_f64m1(intrin(vreinterpret_i64m1(a), vreinterpret_i64m1(b))); \
}
OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(v_and)
OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(v_or)
OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(v_xor)

// Bitwise NOT for v_float64.
// FIX: removed stray '\' line-continuations (macro copy-paste residue)
// that would splice a following line into any '//' comment added here.
inline v_float64 v_not (const v_float64& a)
{
    return vreinterpret_f64m1(v_not(vreinterpret_i64m1(a)));
}
#endif
|
|
678
|
+
|
|
679
|
+
|
|
680
|
+
////////////// Bitwise shifts //////////////
|
|
681
|
+
|
|
682
|
+
// Shift by compile-time immediate n. v_shl is vsll for both families;
// v_shr is logical (vsrl) for unsigned types and arithmetic (vsra,
// sign-propagating) for signed types. n is passed as a uint8_t shamt.
#define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, vl) \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ \
    return _Tpvec(vsll(a, uint8_t(n), vl)); \
} \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ \
    return _Tpvec(vsrl(a, uint8_t(n), vl)); \
}

#define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, vl) \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ \
    return _Tpvec(vsll(a, uint8_t(n), vl)); \
} \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ \
    return _Tpvec(vsra(a, uint8_t(n), vl)); \
}

OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64, VTraits<v_int64>::vlanes())
|
|
708
|
+
|
|
709
|
+
////////////// Comparison //////////////
|
|
710
|
+
// Integer compare: the RVV compare intrinsic yields a mask register;
// vmerge expands it into the universal-intrinsic convention of all-ones
// (true) / all-zeros (false) lanes.
#define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix, vl) \
inline _Tpvec v_##op(const _Tpvec& a, const _Tpvec& b) \
{ \
    uint64_t ones = -1; \
    return vmerge(intrin(a, b, vl), vmv_v_x_##suffix##m1(0, vl), ones, vl); \
}

// Float compare: same expansion, but vfmerge takes a float scalar, so the
// all-ones pattern is smuggled in as a double through a union. NOTE: for
// f32 this relies on the implicit double->float NaN narrowing keeping the
// top payload bits (all-ones in -> all-ones out) — presumably the reason
// for the TODO below; verify on the target toolchain.
#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix, vl) \
inline _Tpvec v_##op (const _Tpvec& a, const _Tpvec& b) \
{ \
    union { uint64 u; double d; } ones; ones.u = -1; \
    return _Tpvec(vfmerge(intrin(a, b, vl), vfmv_v_f_##suffix##m1(0, vl), ones.d, vl)); \
} //TODO
|
|
723
|
+
|
|
724
|
+
// Instantiate the full comparison suite (eq/ne/lt/gt/le/ge) per type,
// choosing unsigned (vmsltu...), signed (vmslt...) or float (vmflt...)
// compare intrinsics.
#define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, eq, vmseq, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ne, vmsne, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, lt, vmsltu, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, gt, vmsgtu, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, le, vmsleu, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ge, vmsgeu, suffix, vl)

#define OPENCV_HAL_IMPL_RVV_SIGNED_CMP(_Tpvec, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, eq, vmseq, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ne, vmsne, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, lt, vmslt, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, gt, vmsgt, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, le, vmsle, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ge, vmsge, suffix, vl)

#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP(_Tpvec, suffix, vl) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, eq, vmfeq, suffix, vl) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ne, vmfne, suffix, vl) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, lt, vmflt, suffix, vl) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, gt, vmfgt, suffix, vl) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, le, vmfle, suffix, vl) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ge, vmfge, suffix, vl)


OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8, u8, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16, u16, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint32, u32, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint64, u64, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8, i8, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16, i16, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32, i32, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64, i64, VTraits<v_int64>::vlanes())
OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32, f32, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64, f64, VTraits<v_float64>::vlanes())
#endif
|
|
761
|
+
|
|
762
|
+
// v_not_nan: lane is all-ones where the value is not NaN — relies on
// x == x holding for every non-NaN value.
inline v_float32 v_not_nan(const v_float32& a)
{ return v_eq(a, a); }

#if CV_SIMD_SCALABLE_64F
inline v_float64 v_not_nan(const v_float64& a)
{ return v_eq(a, a); }
#endif
|
|
769
|
+
|
|
770
|
+
////////////// Min/Max //////////////
|
|
771
|
+
|
|
772
|
+
// Generic element-wise binary function wrapper (used here for min/max).
#define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, vl) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return intrin(a, b, vl); \
}

OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8, v_min, vminu, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8, v_max, vmaxu, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8, v_min, vmin, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8, v_max, vmax, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16, v_min, vminu, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16, v_max, vmaxu, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16, v_min, vmin, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16, v_max, vmax, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_min, vminu, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_max, vmaxu, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_min, vmin, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_max, vmax, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_min, vfmin, VTraits<v_float32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_max, vfmax, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_min, vfmin, VTraits<v_float64>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_max, vfmax, VTraits<v_float64>::vlanes())
#endif
|
|
796
|
+
|
|
797
|
+
////////////// Transpose4x4 //////////////
|
|
798
|
+
// v_zip4: interleave the first four lanes of a0 and a1:
//   b0 = {a0[0], a1[0], a0[1], a1[1]}, b1 = {a0[2], a1[2], a0[3], a1[3]}
// Implemented on unsigned bit-pattern views: vzext_vf2 spreads a0's lanes
// into even u32 slots of an m2 group, a1's lanes are slid into the odd
// slots, and vor combines them. vl is hard-coded to 4 — per the disabled
// variant further down this is the "128-bit only" version.
#define OPENCV_HAL_IMPL_RVV_ZIP4(_Tpvec, _wTpvec, suffix, convert2u, convert) \
inline void v_zip4(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) { \
    int vl = 4; \
    _wTpvec temp = vreinterpret_##suffix##m2(convert2u( \
        vor(vzext_vf2(convert(a0), vl), \
        vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(convert(a1), vl)), 0, vl*2)), \
        vl))); \
    b0 = vget_##suffix##m1(temp, 0); \
    b1 = vget_##suffix##m1(vrgather(temp, vadd(vid_v_u32m2(vl), 4, vl)/*{4,5,6,7} */, vl) ,0); \
}

OPENCV_HAL_IMPL_RVV_ZIP4(v_uint32, vuint32m2_t, u32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_ZIP4(v_int32, vint32m2_t, i32, vreinterpret_u32m2, vreinterpret_u32m1)
OPENCV_HAL_IMPL_RVV_ZIP4(v_float32, vfloat32m2_t, f32, vreinterpret_u32m2, vreinterpret_u32m1)
|
|
812
|
+
|
|
813
|
+
#if 0
|
|
814
|
+
// this is v_zip4 and v_tranpose4x4 for scalable VLEN, costs more instruction than current 128-bit only version.
|
|
815
|
+
inline void v_zip4(const v_float32& a0, const v_float32& a1, v_float32& b0, v_float32& b1) {
|
|
816
|
+
vuint64m1_t vid1 = vid_v_u64m1(VTraits<vuint64m1_t>::vlanes());
|
|
817
|
+
vuint16m1_t t1 = vreinterpret_u16m1(vid1);
|
|
818
|
+
vuint16m1_t t2 = vslide1up(t1, 0, VTraits<vuint16m1_t>::vlanes());
|
|
819
|
+
vuint16m1_t t3 = vslide1up(t2, 0, VTraits<vuint16m1_t>::vlanes());
|
|
820
|
+
vuint16m1_t t4 = vslide1up(t3, 0, VTraits<vuint16m1_t>::vlanes());
|
|
821
|
+
t1 = vor(
|
|
822
|
+
vor(t1, t2, VTraits<vuint16m1_t>::vlanes()),
|
|
823
|
+
vor(t3, t4, VTraits<vuint16m1_t>::vlanes()),
|
|
824
|
+
VTraits<vuint16m1_t>::vlanes()
|
|
825
|
+
);
|
|
826
|
+
vuint32m2_t vidx0 = vwmulu(t1, 4, VTraits<vuint32m1_t>::vlanes());
|
|
827
|
+
vidx0 = vadd(vidx0, vid_v_u32m2(VTraits<vuint32m1_t>::vlanes()), VTraits<vuint32m1_t>::vlanes());
|
|
828
|
+
vuint32m2_t vidx1 = vadd(vidx0, 4, VTraits<vuint32m1_t>::vlanes());
|
|
829
|
+
vfloat32m2_t temp = vreinterpret_f32m2(vreinterpret_u32m2(
|
|
830
|
+
vor(vzext_vf2(vreinterpret_u32m1(a0), VTraits<vuint16m1_t>::vlanes()),
|
|
831
|
+
vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(vreinterpret_u32m1(a1), VTraits<vuint16m1_t>::vlanes())), 0, VTraits<vfloat32m1_t>::vlanes()*2)),
|
|
832
|
+
VTraits<vfloat32m1_t>::vlanes())));
|
|
833
|
+
b0 = vlmul_trunc_f32m1(vrgather(temp, vidx0, VTraits<vuint16m1_t>::vlanes()));
|
|
834
|
+
b1 = vlmul_trunc_f32m1(vrgather(temp, vidx1, VTraits<vuint16m1_t>::vlanes()));
|
|
835
|
+
}
|
|
836
|
+
|
|
837
|
+
inline void v_transpose4x4(const v_float32& a0, const v_float32& a1, const v_float32& a2, const v_float32& a3,\
|
|
838
|
+
v_float32& b0, v_float32& b1, v_float32& b2, v_float32& b3) { \
|
|
839
|
+
vuint64m2_t vid1 = vid_v_u64m2(VTraits<vuint32m1_t>::vlanes());
|
|
840
|
+
vuint16m2_t t1 = vreinterpret_u16m2(vid1);
|
|
841
|
+
vuint16m2_t t2 = vslide1up(t1, 0, VTraits<vuint8m1_t>::vlanes());
|
|
842
|
+
vuint16m2_t t3 = vslide1up(t2, 0, VTraits<vuint8m1_t>::vlanes());
|
|
843
|
+
vuint16m2_t t4 = vslide1up(t3, 0, VTraits<vuint8m1_t>::vlanes());
|
|
844
|
+
t1 = vor(
|
|
845
|
+
vor(t1, t2, VTraits<vuint8m1_t>::vlanes()),
|
|
846
|
+
vor(t3, t4, VTraits<vuint8m1_t>::vlanes()),
|
|
847
|
+
VTraits<vuint8m1_t>::vlanes()
|
|
848
|
+
);
|
|
849
|
+
vuint16m2_t vidx0 = vmul(t1, 12, VTraits<vuint8m1_t>::vlanes());
|
|
850
|
+
vidx0 = vadd(vidx0, vid_v_u16m2(VTraits<vuint8m1_t>::vlanes()), VTraits<vuint8m1_t>::vlanes());
|
|
851
|
+
vuint16m2_t vidx1 = vadd(vidx0, 4, VTraits<vuint8m1_t>::vlanes());
|
|
852
|
+
vuint16m2_t vidx2 = vadd(vidx0, 8, VTraits<vuint8m1_t>::vlanes());
|
|
853
|
+
vuint16m2_t vidx3 = vadd(vidx0, 12, VTraits<vuint8m1_t>::vlanes());
|
|
854
|
+
vuint32m2_t tempA = vreinterpret_u32m2( \
|
|
855
|
+
vor(vzext_vf2(vreinterpret_u32m1(a0), VTraits<vuint16m1_t>::vlanes()), \
|
|
856
|
+
vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(vreinterpret_u32m1(a2), VTraits<vuint16m1_t>::vlanes())), 0, VTraits<vuint16m1_t>::vlanes())), \
|
|
857
|
+
VTraits<vuint32m1_t>::vlanes())); \
|
|
858
|
+
vuint32m2_t tempB = vreinterpret_u32m2( \
|
|
859
|
+
vor(vzext_vf2(vreinterpret_u32m1(a1), VTraits<vuint16m1_t>::vlanes()), \
|
|
860
|
+
vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(vreinterpret_u32m1(a3), VTraits<vuint16m1_t>::vlanes())), 0, VTraits<vuint16m1_t>::vlanes())), \
|
|
861
|
+
VTraits<vuint32m1_t>::vlanes())); \
|
|
862
|
+
vfloat32m4_t temp = vreinterpret_f32m4(vreinterpret_u32m4( \
|
|
863
|
+
vor(vzext_vf2(tempA, VTraits<vuint8m1_t>::vlanes()), \
|
|
864
|
+
vreinterpret_u64m4(vslide1up(vreinterpret_u32m4(vzext_vf2(tempB, VTraits<vuint8m1_t>::vlanes())), 0, VTraits<vuint8m1_t>::vlanes())), \
|
|
865
|
+
VTraits<vuint16m1_t>::vlanes()))); \
|
|
866
|
+
b0 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx0, VTraits<vuint8m1_t>::vlanes()));
|
|
867
|
+
b1 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx1, VTraits<vuint8m1_t>::vlanes()));
|
|
868
|
+
b2 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx2, VTraits<vuint8m1_t>::vlanes()));
|
|
869
|
+
b3 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx3, VTraits<vuint8m1_t>::vlanes()));
|
|
870
|
+
}
|
|
871
|
+
#endif
|
|
872
|
+
|
|
873
|
+
// 4x4 transpose built from three rounds of v_zip4 interleaves
// (classic butterfly: zip rows 0/2 and 1/3, then zip the results).
#define OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(_Tpvec, suffix) \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, const _Tpvec& a2, const _Tpvec& a3, _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) { \
    _Tpvec t0,t1,t2,t3; \
    v_zip4(a0, a2, t0, t2); \
    v_zip4(a1, a3, t1, t3); \
    v_zip4(t0, t1, b0, b1); \
    v_zip4(t2, t3, b2, b3); \
}

OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(v_uint32, u32)
OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(v_int32, i32)
OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(v_float32, f32)
|
|
885
|
+
|
|
886
|
+
////////////// Reduce //////////////
|
|
887
|
+
|
|
888
|
+
// v_reduce_sum via RVV reduction intrinsics. "red" selects the widening
// reduction (vwredsum/vwredsumu: accumulates into a double-width lane,
// avoiding overflow for the narrow types) or the plain vredsum for 64-bit
// types. The scalar is read back from lane 0 with v_get0.
// (_wTpvec is not used by the macro body; kept for signature symmetry.)
#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM(_Tpvec, _wTpvec, _nwTpvec, scalartype, wsuffix, vl, red) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    _nwTpvec zero = vmv_v_x_##wsuffix##m1(0, vl); \
    _nwTpvec res = vmv_v_x_##wsuffix##m1(0, vl); \
    res = v##red(res, a, zero, vl); \
    return (scalartype)v_get0(res); \
}
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint8, v_uint16, vuint16m1_t, unsigned, u16, VTraits<v_uint8>::vlanes(), wredsumu)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int8, v_int16, vint16m1_t, int, i16, VTraits<v_int8>::vlanes(), wredsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16, v_uint32, vuint32m1_t, unsigned, u32, VTraits<v_uint16>::vlanes(), wredsumu)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16, v_int32, vint32m1_t, int, i32, VTraits<v_int16>::vlanes(), wredsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32, v_uint64, vuint64m1_t, unsigned, u64, VTraits<v_uint32>::vlanes(), wredsumu)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32, v_int64, vint64m1_t, int, i64, VTraits<v_int32>::vlanes(), wredsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint64, v_uint64, vuint64m1_t, uint64, u64, VTraits<v_uint64>::vlanes(), redsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int64, v_int64, vint64m1_t, int64, i64, VTraits<v_int64>::vlanes(), redsum)
|
|
904
|
+
|
|
905
|
+
|
|
906
|
+
// Float v_reduce_sum using the ORDERED reduction vfredosum (sequential,
// lane-order summation -> reproducible results across vector lengths).
#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(_Tpvec, _wTpvec, _nwTpvec, scalartype, wsuffix, vl) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    _nwTpvec zero = vfmv_v_f_##wsuffix##m1(0, vl); \
    _nwTpvec res = vfmv_v_f_##wsuffix##m1(0, vl); \
    res = vfredosum(res, a, zero, vl); \
    return (scalartype)v_get0(res); \
}
OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float32, v_float32, vfloat32m1_t, float, f32, VTraits<v_float32>::vlanes())
|
|
915
|
+
|
|
916
|
+
// v_reduce_min / v_reduce_max. The source vector itself is passed as the
// reduction init value, which is safe for min/max (idempotent), and the
// result is read from lane 0.
#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, vl, red) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    _Tpvec res = _Tpvec(v##red(a, a, a, vl)); \
    return (scalartype)v_get0(res); \
}

OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8, min, uchar, u8, VTraits<v_uint8>::vlanes(), redminu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int8, min, schar, i8, VTraits<v_int8>::vlanes(), redmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16, min, ushort, u16, VTraits<v_uint16>::vlanes(), redminu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int16, min, short, i16, VTraits<v_int16>::vlanes(), redmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32, min, unsigned, u32, VTraits<v_uint32>::vlanes(), redminu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int32, min, int, i32, VTraits<v_int32>::vlanes(), redmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_float32, min, float, f32, VTraits<v_float32>::vlanes(), fredmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8, max, uchar, u8, VTraits<v_uint8>::vlanes(), redmaxu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int8, max, schar, i8, VTraits<v_int8>::vlanes(), redmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16, max, ushort, u16, VTraits<v_uint16>::vlanes(), redmaxu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int16, max, short, i16, VTraits<v_int16>::vlanes(), redmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32, max, unsigned, u32, VTraits<v_uint32>::vlanes(), redmaxu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int32, max, int, i32, VTraits<v_int32>::vlanes(), redmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_float32, max, float, f32, VTraits<v_float32>::vlanes(), fredmax)
|
|
937
|
+
|
|
938
|
+
// v_reduce_sum4: returns {sum(a), sum(b), sum(c), sum(d)} in the first
// four result lanes. Strategy: build gather indices, zip the four inputs
// into one wide register, transpose the 4x4 lane block, then add the four
// transposed rows. Statement order is load-bearing; code kept verbatim.
inline v_float32 v_reduce_sum4(const v_float32& a, const v_float32& b,
                     const v_float32& c, const v_float32& d)
{
    // 0000 1111 2222 3333 ....
    vuint64m2_t vid1 = vid_v_u64m2(VTraits<vuint32m1_t>::vlanes());
    vuint16m2_t t1 = vreinterpret_u16m2(vid1);
    vuint16m2_t t2 = vslide1up(t1, 0, VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t t3 = vslide1up(t2, 0, VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t t4 = vslide1up(t3, 0, VTraits<vuint8m1_t>::vlanes());
    t1 = vor(
        vor(t1, t2, VTraits<vuint8m1_t>::vlanes()),
        vor(t3, t4, VTraits<vuint8m1_t>::vlanes()),
        VTraits<vuint8m1_t>::vlanes()
    );

    // index for transpose4X4
    vuint16m2_t vidx0 = vmul(t1, 12, VTraits<vuint8m1_t>::vlanes());
    vidx0 = vadd(vidx0, vid_v_u16m2(VTraits<vuint8m1_t>::vlanes()), VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t vidx1 = vadd(vidx0, 4, VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t vidx2 = vadd(vidx0, 8, VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t vidx3 = vadd(vidx0, 12, VTraits<vuint8m1_t>::vlanes());

    // zip: interleave a/c and b/d lane-wise, then interleave the results
    vuint32m2_t tempA = vreinterpret_u32m2( \
        vor(vzext_vf2(vreinterpret_u32m1(a), VTraits<vuint16m1_t>::vlanes()), \
        vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(vreinterpret_u32m1(c), VTraits<vuint16m1_t>::vlanes())), 0, VTraits<vuint16m1_t>::vlanes())), \
        VTraits<vuint32m1_t>::vlanes())); \
    vuint32m2_t tempB = vreinterpret_u32m2( \
        vor(vzext_vf2(vreinterpret_u32m1(b), VTraits<vuint16m1_t>::vlanes()), \
        vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(vreinterpret_u32m1(d), VTraits<vuint16m1_t>::vlanes())), 0, VTraits<vuint16m1_t>::vlanes())), \
        VTraits<vuint32m1_t>::vlanes())); \
    vfloat32m4_t temp = vreinterpret_f32m4(vreinterpret_u32m4( \
        vor(vzext_vf2(tempA, VTraits<vuint8m1_t>::vlanes()), \
        vreinterpret_u64m4(vslide1up(vreinterpret_u32m4(vzext_vf2(tempB, VTraits<vuint8m1_t>::vlanes())), 0, VTraits<vuint8m1_t>::vlanes())), \
        VTraits<vuint16m1_t>::vlanes())));

    // transpose
    vfloat32m1_t b0 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx0, VTraits<vuint8m1_t>::vlanes()));
    vfloat32m1_t b1 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx1, VTraits<vuint8m1_t>::vlanes()));
    vfloat32m1_t b2 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx2, VTraits<vuint8m1_t>::vlanes()));
    vfloat32m1_t b3 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx3, VTraits<vuint8m1_t>::vlanes()));

    // vector add
    v_float32 res = vfadd(
        vfadd(b0, b1, VTraits<vfloat32m1_t>::vlanes()),
        vfadd(b2, b3, VTraits<vfloat32m1_t>::vlanes()),
        VTraits<vfloat32m1_t>::vlanes()
    );
    return res;
}
|
|
988
|
+
|
|
989
|
+
////////////// Square-Root //////////////

// Lane-wise single-precision square root.
inline v_float32 v_sqrt(const v_float32& x)
{
    return vfsqrt(x, VTraits<v_float32>::vlanes());
}
|
|
995
|
+
|
|
996
|
+
// Lane-wise reciprocal square root: 1 / sqrt(x), single precision.
inline v_float32 v_invsqrt(const v_float32& x)
{
    return v_div(v_setall_f32(1.0f), v_sqrt(x));
}
|
|
1001
|
+
|
|
1002
|
+
#if CV_SIMD_SCALABLE_64F
// Lane-wise double-precision square root.
inline v_float64 v_sqrt(const v_float64& x)
{
    return vfsqrt(x, VTraits<v_float64>::vlanes());
}

// Lane-wise double-precision reciprocal square root: 1 / sqrt(x).
inline v_float64 v_invsqrt(const v_float64& x)
{
    // Fix: use a double literal (was 1.0f) so the f64 splat is not fed a
    // float-typed constant; 1.0 is exactly representable either way, but the
    // literal type should match the element type.
    v_float64 one = v_setall_f64(1.0);
    return v_div(one, v_sqrt(x));
}
#endif
|
|
1014
|
+
|
|
1015
|
+
// sqrt(a^2 + b^2) per lane, single precision (fused multiply-add for b*b).
inline v_float32 v_magnitude(const v_float32& a, const v_float32& b)
{
    v_float32 x = vfmacc(vfmul(a, a, VTraits<v_float32>::vlanes()), b, b, VTraits<v_float32>::vlanes());
    return v_sqrt(x);
}

// a^2 + b^2 per lane, single precision.
inline v_float32 v_sqr_magnitude(const v_float32& a, const v_float32& b)
{
    return v_float32(vfmacc(vfmul(a, a, VTraits<v_float32>::vlanes()), b, b, VTraits<v_float32>::vlanes()));
}

#if CV_SIMD_SCALABLE_64F
// sqrt(a^2 + b^2) per lane, double precision.
inline v_float64 v_magnitude(const v_float64& a, const v_float64& b)
{
    v_float64 x = vfmacc(vfmul(a, a, VTraits<v_float64>::vlanes()), b, b, VTraits<v_float64>::vlanes());
    return v_sqrt(x);
}

// a^2 + b^2 per lane, double precision.
inline v_float64 v_sqr_magnitude(const v_float64& a, const v_float64& b)
{
    return vfmacc(vfmul(a, a, VTraits<v_float64>::vlanes()), b, b, VTraits<v_float64>::vlanes());
}
#endif

////////////// Multiply-Add //////////////

// Fused multiply-add: a*b + c per lane (single rounding via vfmacc).
inline v_float32 v_fma(const v_float32& a, const v_float32& b, const v_float32& c)
{
    return vfmacc(c, a, b, VTraits<v_float32>::vlanes());
}
|
|
1045
|
+
// Integer multiply-add: a*b + c per lane.
inline v_int32 v_fma(const v_int32& a, const v_int32& b, const v_int32& c)
{
    // Fix: query the lane count from the i32 traits. The original used
    // VTraits<v_float32>::vlanes() — numerically identical (both are 32-bit
    // element types), but a copy-paste slip from the float overload and
    // misleading/fragile if the traits ever diverge.
    return vmacc(c, a, b, VTraits<v_int32>::vlanes());
}
|
|
1049
|
+
|
|
1050
|
+
// v_muladd is defined as an alias of v_fma for all supported element types
// (on RVV the fused form is always available, so no separate non-fused path).
inline v_float32 v_muladd(const v_float32& a, const v_float32& b, const v_float32& c)
{
    return v_fma(a, b, c);
}

inline v_int32 v_muladd(const v_int32& a, const v_int32& b, const v_int32& c)
{
    return v_fma(a, b, c);
}

#if CV_SIMD_SCALABLE_64F
// Fused multiply-add: a*b + c per lane, double precision.
inline v_float64 v_fma(const v_float64& a, const v_float64& b, const v_float64& c)
{
    return vfmacc_vv_f64m1(c, a, b, VTraits<v_float64>::vlanes());
}

inline v_float64 v_muladd(const v_float64& a, const v_float64& b, const v_float64& c)
{
    return v_fma(a, b, c);
}
#endif
|
|
1071
|
+
|
|
1072
|
+
////////////// Check all/any //////////////

// A lane counts as "set" when its sign bit is on (mask lanes are produced as
// all-ones / all-zeros, so `< 0` tests the sign bit). vcpop counts set mask
// bits: all() requires every lane, any() requires at least one.
#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, vl) \
inline bool v_check_all(const _Tpvec& a) \
{ \
    return vcpop(vmslt(a, 0, vl), vl) == vl; \
} \
inline bool v_check_any(const _Tpvec& a) \
{ \
    return vcpop(vmslt(a, 0, vl), vl) != 0; \
}

OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int8, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int16, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int32, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int64, VTraits<v_int64>::vlanes())

// Unsigned and float variants reinterpret to the same-width signed type and
// reuse the signed implementations (sign-bit test is bit-pattern based).
inline bool v_check_all(const v_uint8& a)
{ return v_check_all(v_reinterpret_as_s8(a)); }
inline bool v_check_any(const v_uint8& a)
{ return v_check_any(v_reinterpret_as_s8(a)); }

inline bool v_check_all(const v_uint16& a)
{ return v_check_all(v_reinterpret_as_s16(a)); }
inline bool v_check_any(const v_uint16& a)
{ return v_check_any(v_reinterpret_as_s16(a)); }

inline bool v_check_all(const v_uint32& a)
{ return v_check_all(v_reinterpret_as_s32(a)); }
inline bool v_check_any(const v_uint32& a)
{ return v_check_any(v_reinterpret_as_s32(a)); }

inline bool v_check_all(const v_float32& a)
{ return v_check_all(v_reinterpret_as_s32(a)); }
inline bool v_check_any(const v_float32& a)
{ return v_check_any(v_reinterpret_as_s32(a)); }

inline bool v_check_all(const v_uint64& a)
{ return v_check_all(v_reinterpret_as_s64(a)); }
inline bool v_check_any(const v_uint64& a)
{ return v_check_any(v_reinterpret_as_s64(a)); }

#if CV_SIMD_SCALABLE_64F
inline bool v_check_all(const v_float64& a)
{ return v_check_all(v_reinterpret_as_s64(a)); }
inline bool v_check_any(const v_float64& a)
{ return v_check_any(v_reinterpret_as_s64(a)); }
#endif
|
|
1121
|
+
|
|
1122
|
+
////////////// abs //////////////

// |a - b| as max(a,b) - min(a,b). Correct for unsigned and float types; for the
// signed 8/16-bit *saturating* variant (absdiffs) the subtraction is v_sub on
// already-ordered operands, so the result never needs to wrap.
#define OPENCV_HAL_IMPL_RVV_ABSDIFF(_Tpvec, abs) \
inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \
{ \
    return v_sub(v_max(a, b), v_min(a, b)); \
}

OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint8, absdiff)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint16, absdiff)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint32, absdiff)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float32, absdiff)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64, absdiff)
#endif
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8, absdiffs)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16, absdiffs)

// Signed absdiff returning the unsigned type: widen the subtraction so it
// cannot overflow, then narrow back with unsigned clipping (vnclipu, shift 0).
#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, width) \
inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
{ \
    return vnclipu(vreinterpret_u##width##m2(vwsub_vv(v_max(a, b), v_min(a, b), VTraits<_Tpvec>::vlanes())), 0, VTraits<_Tpvec>::vlanes()); \
}

OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8, v_uint8, 16)
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16, v_uint16, 32)
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32, v_uint32, 64)

// |a| implemented as absdiff against zero.
#define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) \
inline _Tprvec v_abs(const _Tpvec& a) \
{ \
    return v_absdiff(a, v_setzero_##suffix()); \
}

OPENCV_HAL_IMPL_RVV_ABS(v_uint8, v_int8, s8)
OPENCV_HAL_IMPL_RVV_ABS(v_uint16, v_int16, s16)
OPENCV_HAL_IMPL_RVV_ABS(v_uint32, v_int32, s32)
OPENCV_HAL_IMPL_RVV_ABS(v_float32, v_float32, f32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_ABS(v_float64, v_float64, f64)
#endif
|
|
1163
|
+
|
|
1164
|
+
|
|
1165
|
+
// Sum of absolute differences reduced to a scalar.
#define OPENCV_HAL_IMPL_RVV_REDUCE_SAD(_Tpvec, scalartype) \
inline scalartype v_reduce_sad(const _Tpvec& a, const _Tpvec& b) \
{ \
    return v_reduce_sum(v_absdiff(a, b)); \
}

OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint8, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int8, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint16, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int16, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint32, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int32, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_float32, float)

////////////// Select //////////////

// Per-lane select: result lane = (mask lane != 0) ? a : b. vmsne builds the
// mask register; note vmerge takes (false_val, true_val) in that order.
#define OPENCV_HAL_IMPL_RVV_SELECT(_Tpvec, vl) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return vmerge(vmsne(mask, 0, vl), b, a, vl); \
}

OPENCV_HAL_IMPL_RVV_SELECT(v_uint8, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_SELECT(v_uint16, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_SELECT(v_uint32, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_SELECT(v_int8, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_SELECT(v_int16, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_SELECT(v_int32, VTraits<v_int32>::vlanes())

// Float variants use the FP not-equal compare (vmfne) against 0.
// NOTE(review): the trailing backslashes are leftover line continuations from
// the macro this was copied from; they are harmless outside a #define.
inline v_float32 v_select(const v_float32& mask, const v_float32& a, const v_float32& b) \
{ \
    return vmerge(vmfne(mask, 0, VTraits<v_float32>::vlanes()), b, a, VTraits<v_float32>::vlanes()); \
}

#if CV_SIMD_SCALABLE_64F
inline v_float64 v_select(const v_float64& mask, const v_float64& a, const v_float64& b) \
{ \
    return vmerge(vmfne(mask, 0, VTraits<v_float64>::vlanes()), b, a, VTraits<v_float64>::vlanes()); \
}
#endif
|
|
1205
|
+
|
|
1206
|
+
////////////// Rotate shift //////////////
|
|
1207
|
+
|
|
1208
|
+
////////////// Rotate shift //////////////

// "Rotate" here means whole-vector lane shifts with zero fill (single-operand
// forms) or fill from a second vector (two-operand forms). Implemented with
// vslideup/vslidedown over a zero-splat destination; the n == 0 left-rotation
// is specialized because vslideup with offset 0 would be an identity anyway.
#define OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(_Tpvec, suffix, vl) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ \
    return vslidedown(vmv_v_x_##suffix##m1(0, vl), a, n, vl); \
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ \
    return vslideup(vmv_v_x_##suffix##m1(0, vl), a, n, vl); \
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ return a; } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ \
    return vslideup(vslidedown(vmv_v_x_##suffix##m1(0, vl), a, n, vl), b, VTraits<_Tpvec>::vlanes() - n, vl); \
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ \
    return vslideup(vslidedown(vmv_v_x_##suffix##m1(0, vl), b, VTraits<_Tpvec>::vlanes() - n, vl), a, n, vl); \
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ CV_UNUSED(b); return a; }

OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint8, u8, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int8, i8, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint16, u16, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int16, i16, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint32, u32, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int32, i32, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint64, u64, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int64, i64, VTraits<v_int64>::vlanes())

// Floating-point version: identical structure, but the zero splat uses the
// FP move (vfmv_v_f) instead of the integer one.
#define OPENCV_HAL_IMPL_RVV_ROTATE_FP(_Tpvec, suffix, vl) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ \
    return vslidedown(vfmv_v_f_##suffix##m1(0, vl), a, n, vl); \
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ \
    return vslideup(vfmv_v_f_##suffix##m1(0, vl), a, n, vl); \
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ return a; } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ \
    return vslideup(vslidedown(vfmv_v_f_##suffix##m1(0, vl), a, n, vl), b, VTraits<_Tpvec>::vlanes() - n, vl); \
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ \
    return vslideup(vslidedown(vfmv_v_f_##suffix##m1(0, vl), b, VTraits<_Tpvec>::vlanes() - n, vl), a, n, vl); \
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ CV_UNUSED(b); return a; }

OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float32, f32, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float64, f64, VTraits<v_float64>::vlanes())
#endif
|
|
1265
|
+
|
|
1266
|
+
////////////// Convert to float //////////////
// i32 -> f32, lane-wise.
inline v_float32 v_cvt_f32(const v_int32& a)
{
    return vfcvt_f_x_v_f32m1(a, VTraits<v_float32>::vlanes());
}

#if CV_SIMD_SCALABLE_64F
// f64 -> f32 narrowing: only the low half of the result is populated.
inline v_float32 v_cvt_f32(const v_float64& a)
{
    return vfncvt_f(vlmul_ext_f64m2(a), VTraits<v_float64>::vlanes());
}

// (a, b) f64 pair -> one f32 vector: a fills the low half, b the high half.
inline v_float32 v_cvt_f32(const v_float64& a, const v_float64& b)
{
    return vfncvt_f(vset(vlmul_ext_f64m2(a),1,b), VTraits<v_float32>::vlanes());
}

// Low half of i32 widened to f64.
inline v_float64 v_cvt_f64(const v_int32& a)
{
    return vget_f64m1(vfwcvt_f(a, VTraits<v_int32>::vlanes()), 0);
}

// High half of i32 widened to f64.
inline v_float64 v_cvt_f64_high(const v_int32& a)
{
    return vget_f64m1(vfwcvt_f(a, VTraits<v_int32>::vlanes()), 1);
}

// Low half of f32 widened to f64.
inline v_float64 v_cvt_f64(const v_float32& a)
{
    return vget_f64m1(vfwcvt_f(a, VTraits<v_float32>::vlanes()), 0);
}

// High half of f32 widened to f64.
inline v_float64 v_cvt_f64_high(const v_float32& a)
{
    return vget_f64m1(vfwcvt_f(a, VTraits<v_float32>::vlanes()), 1);
}

// i64 -> f64, lane-wise.
inline v_float64 v_cvt_f64(const v_int64& a)
{
    return vfcvt_f(a, VTraits<v_int64>::vlanes());
}
#endif
|
|
1308
|
+
|
|
1309
|
+
//////////// Broadcast //////////////
|
|
1310
|
+
|
|
1311
|
+
//////////// Broadcast //////////////

// Splat lane i (template default s, runtime-overridable) or the highest lane
// of the vector to every lane.
#define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) \
template<int s = 0> inline _Tpvec v_broadcast_element(_Tpvec v, int i = s) \
{ \
    return v_setall_##suffix(v_extract_n(v, i)); \
} \
inline _Tpvec v_broadcast_highest(_Tpvec v) \
{ \
    return v_setall_##suffix(v_extract_n(v, VTraits<_Tpvec>::vlanes()-1)); \
}

OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32, u32)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32, s32)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_float32, f32)


////////////// Reverse //////////////
// Reverse lane order: build the index vector (vlanes-1, ..., 1, 0) with
// vid + vrsub, then permute with vrgather.
#define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, width) \
inline _Tpvec v_reverse(const _Tpvec& a) \
{ \
    vuint##width##m1_t vidx = vrsub(vid_v_u##width##m1(VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()-1, VTraits<_Tpvec>::vlanes()); \
    return vrgather(a, vidx, VTraits<_Tpvec>::vlanes()); \
}
OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8, 8)
OPENCV_HAL_IMPL_RVV_REVERSE(v_int8, 8)
OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16, 16)
OPENCV_HAL_IMPL_RVV_REVERSE(v_int16, 16)
OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32, 32)
OPENCV_HAL_IMPL_RVV_REVERSE(v_int32, 32)
OPENCV_HAL_IMPL_RVV_REVERSE(v_float32, 32)
OPENCV_HAL_IMPL_RVV_REVERSE(v_uint64, 64)
OPENCV_HAL_IMPL_RVV_REVERSE(v_int64, 64)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_REVERSE(v_float64, 64)
#endif
|
|
1345
|
+
|
|
1346
|
+
//////////// Value reordering ////////////
|
|
1347
|
+
|
|
1348
|
+
//////////// Value reordering ////////////

// Widening expansion: cvt is a widening convert (vwcvtu_x / vwcvt_x) producing
// an m2 register; vget splits it into low (0) and high (1) m1 halves.
// v_load_expand loads an mf2 vector from memory and widens it in one step.
#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tp, _Tpwvec, _Tpwvec_m2, _Tpvec, width, suffix, suffix2, cvt) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
    _Tpwvec_m2 temp = cvt(a, vsetvlmax_e##width##m1()); \
    b0 = vget_##suffix##m1(temp, 0); \
    b1 = vget_##suffix##m1(temp, 1); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ \
    _Tpwvec_m2 temp = cvt(a, vsetvlmax_e##width##m1()); \
    return vget_##suffix##m1(temp, 0); \
} \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ \
    _Tpwvec_m2 temp = cvt(a, vsetvlmax_e##width##m1()); \
    return vget_##suffix##m1(temp, 1); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    return cvt(vle##width##_v_##suffix2##mf2(ptr, vsetvlmax_e##width##m1()), vsetvlmax_e##width##m1()); \
}

OPENCV_HAL_IMPL_RVV_EXPAND(uchar, v_uint16, vuint16m2_t, v_uint8, 8, u16, u8, vwcvtu_x)
OPENCV_HAL_IMPL_RVV_EXPAND(schar, v_int16, vint16m2_t, v_int8, 8, i16, i8, vwcvt_x)
OPENCV_HAL_IMPL_RVV_EXPAND(ushort, v_uint32, vuint32m2_t, v_uint16, 16, u32, u16, vwcvtu_x)
OPENCV_HAL_IMPL_RVV_EXPAND(short, v_int32, vint32m2_t, v_int16, 16, i32, i16, vwcvt_x)
OPENCV_HAL_IMPL_RVV_EXPAND(uint, v_uint64, vuint64m2_t, v_uint32, 32, u64, u32, vwcvtu_x)
OPENCV_HAL_IMPL_RVV_EXPAND(int, v_int64, vint64m2_t, v_int32, 32, i64, i32, vwcvt_x)

// Quad expansion u8 -> u32: two successive widening converts from an mf4 load.
inline v_uint32 v_load_expand_q(const uchar* ptr)
{
    return vwcvtu_x(vwcvtu_x(vle8_v_u8mf4(ptr, VTraits<v_uint32>::vlanes()), VTraits<v_uint32>::vlanes()), VTraits<v_uint32>::vlanes());
}

// Quad expansion i8 -> i32 (sign-extending).
inline v_int32 v_load_expand_q(const schar* ptr)
{
    return vwcvt_x(vwcvt_x(vle8_v_i8mf4(ptr, VTraits<v_int32>::vlanes()), VTraits<v_int32>::vlanes()), VTraits<v_int32>::vlanes());
}
|
|
1386
|
+
|
|
1387
|
+
// Narrowing packs. v_pack concatenates two wide m1 vectors into an m2 register
// (vlmul_ext + vset) and narrows with `shr` (saturating vnclip/vnclipu, or
// plain vnsrl/vnsra for the 64->32 case where OpenCV's pack truncates).
// v_rshr_* variants apply a rounding right shift by N before narrowing.
#define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, hwidth, hsuffix, suffix, rshr, shr) \
inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \
{ \
    return shr(vset(vlmul_ext_##suffix##m2(a), 1, b), 0, VTraits<_Tpvec>::vlanes()); \
} \
inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \
{ \
    vse##hwidth##_v_##hsuffix##mf2(ptr, shr(a, 0, VTraits<_Tpvec>::vlanes()), VTraits<_wTpvec>::vlanes()); \
} \
template<int n = 0> inline \
_Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b, int N = n) \
{ \
    return rshr(vset(vlmul_ext_##suffix##m2(a), 1, b), N, VTraits<_Tpvec>::vlanes()); \
} \
template<int n = 0> inline \
void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a, int N = n) \
{ \
    vse##hwidth##_v_##hsuffix##mf2(ptr, rshr(a, N, VTraits<_Tpvec>::vlanes()), VTraits<_wTpvec>::vlanes()); \
}

OPENCV_HAL_IMPL_RVV_PACK(v_uint8, uchar, v_uint16, 8, u8, u16, vnclipu, vnclipu)
OPENCV_HAL_IMPL_RVV_PACK(v_int8, schar, v_int16, 8, i8, i16, vnclip, vnclip)
OPENCV_HAL_IMPL_RVV_PACK(v_uint16, ushort, v_uint32, 16, u16, u32, vnclipu, vnclipu)
OPENCV_HAL_IMPL_RVV_PACK(v_int16, short, v_int32, 16, i16, i32, vnclip, vnclip)
OPENCV_HAL_IMPL_RVV_PACK(v_uint32, unsigned, v_uint64, 32, u32, u64, vnclipu, vnsrl)
OPENCV_HAL_IMPL_RVV_PACK(v_int32, int, v_int64, 32, i32, i64, vnclip, vnsra)

// Signed-to-unsigned packs: clamp negatives to 0 with vmax first, reinterpret
// as unsigned, then narrow with unsigned saturation (vnclipu).
#define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, cast, hvl, vl) \
inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \
{ \
    return vnclipu(cast(vmax(vset(vlmul_ext_##suffix##m2(a), 1, b), 0, vl)), 0, vl); \
} \
inline void v_pack_u_store(_Tp* ptr, const _wTpvec& a) \
{ \
    vse##hwidth##_v_##hsuffix##mf2(ptr, vnclipu(vreinterpret_u##width##m1(vmax(a, 0, vl)), 0, vl), hvl); \
} \
template<int N = 0> inline \
_Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b, int n = N) \
{ \
    return vnclipu(cast(vmax(vset(vlmul_ext_##suffix##m2(a), 1, b), 0, vl)), n, vl); \
} \
template<int N = 0> inline \
void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a, int n = N) \
{ \
    vse##hwidth##_v_##hsuffix##mf2(ptr, vnclipu(vreinterpret_u##width##m1(vmax(a, 0, vl)), n, vl), hvl); \
}

OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8, uchar, v_int16, short, 8, 16, u8, i16, vnclipu_wx_u8m1, vreinterpret_v_i16m2_u16m2, VTraits<v_int16>::vlanes(), VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16, ushort, v_int32, int, 16, 32, u16, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2, VTraits<v_int32>::vlanes(), VTraits<v_uint16>::vlanes())
|
|
1436
|
+
|
|
1437
|
+
|
|
1438
|
+
/* void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1)
   a0 = {A1 A2 A3 A4}
   a1 = {B1 B2 B3 B4}
   ---------------
   {A1 B1 A2 B2} and {A3 B3 A4 B4}
*/

// Interleave two vectors lane-by-lane. Technique: zero-extend a0 into the even
// double-width lanes, slide a1's zero-extended copy up one lane into the odd
// positions, OR the two halves together, then split the m2 result into low
// (b0) and high (b1) m1 halves. convert2um1/convert2um2 reinterpret to/from
// unsigned so the bitwise trick works for signed and float types too.
#define OPENCV_HAL_IMPL_RVV_ZIP(_Tpvec, _wTpvec, suffix, width, width2, convert2um2, convert2um1) \
inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) { \
    _wTpvec temp = vreinterpret_##suffix##m2(convert2um2( \
        vor(vzext_vf2(convert2um1(a0), VTraits<_Tpvec>::vlanes()*2), \
        vreinterpret_u##width2##m2(vslide1up(vreinterpret_u##width##m2(vzext_vf2(convert2um1(a1), VTraits<_Tpvec>::vlanes()*2)), 0, VTraits<_Tpvec>::vlanes()*2)), \
        VTraits<_Tpvec>::vlanes()))); \
    b0 = vget_##suffix##m1(temp, 0); \
    b1 = vget_##suffix##m1(temp, 1); \
}
OPENCV_HAL_IMPL_RVV_ZIP(v_uint8, vuint8m2_t, u8, 8, 16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_ZIP(v_int8, vint8m2_t, i8, 8, 16, vreinterpret_u8m2, vreinterpret_u8m1)
OPENCV_HAL_IMPL_RVV_ZIP(v_uint16, vuint16m2_t, u16, 16, 32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_ZIP(v_int16, vint16m2_t, i16, 16, 32, vreinterpret_u16m2, vreinterpret_u16m1)
OPENCV_HAL_IMPL_RVV_ZIP(v_uint32, vuint32m2_t, u32, 32, 64, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_ZIP(v_int32, vint32m2_t, i32, 32, 64, vreinterpret_u32m2, vreinterpret_u32m1)
OPENCV_HAL_IMPL_RVV_ZIP(v_float32, vfloat32m2_t, f32, 32, 64, vreinterpret_u32m2, vreinterpret_u32m1)
|
|
1461
|
+
|
|
1462
|
+
// combine_low keeps the low half of a and slides b's low half into the upper
// lanes; combine_high first slides each input's high half down to position 0,
// then merges them the same way. recombine produces both at once.
#define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, width) \
inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
{ \
    return vslideup(a, b, VTraits<_Tpvec>::vlanes()/2, VTraits<_Tpvec>::vlanes());\
} \
inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
{ \
    return vslideup( \
        vslidedown(a, a, VTraits<_Tpvec>::vlanes()/2, VTraits<_Tpvec>::vlanes()), \
        vslidedown(b, b, VTraits<_Tpvec>::vlanes()/2, VTraits<_Tpvec>::vlanes()), \
        VTraits<_Tpvec>::vlanes()/2, \
        VTraits<_Tpvec>::vlanes()); \
} \
inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
{ \
    c = v_combine_low(a, b); \
    d = v_combine_high(a, b); \
}

OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint8, 8)
OPENCV_HAL_IMPL_RVV_UNPACKS(v_int8, 8)
OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint16, 16)
OPENCV_HAL_IMPL_RVV_UNPACKS(v_int16, 16)
OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint32, 32)
OPENCV_HAL_IMPL_RVV_UNPACKS(v_int32, 32)
OPENCV_HAL_IMPL_RVV_UNPACKS(v_float32, 32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_UNPACKS(v_float64, 64)
#endif
|
|
1491
|
+
|
|
1492
|
+
// Interleaved (AoS) loads/stores for 2-, 3- and 4-channel data, implemented
// with strided memory ops: channel k of an N-channel layout lives at
// ptr + k with stride sizeof(_Tp)*N. The StoreMode argument is accepted for
// interface compatibility but ignored (strided stores have no aligned form).
#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp, suffix, width, hwidth, vl) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
{ \
    a = vlse##width##_v_##suffix##m1(ptr , sizeof(_Tp)*2, VTraits<v_##_Tpvec>::vlanes()); \
    b = vlse##width##_v_##suffix##m1(ptr+1, sizeof(_Tp)*2, VTraits<v_##_Tpvec>::vlanes()); \
}\
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
    a = vlse##width##_v_##suffix##m1(ptr , sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
    b = vlse##width##_v_##suffix##m1(ptr+1, sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
    c = vlse##width##_v_##suffix##m1(ptr+2, sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
                                v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
 \
    a = vlse##width##_v_##suffix##m1(ptr , sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
    b = vlse##width##_v_##suffix##m1(ptr+1, sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
    c = vlse##width##_v_##suffix##m1(ptr+2, sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
    d = vlse##width##_v_##suffix##m1(ptr+3, sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    vsse##width(ptr, sizeof(_Tp)*2, a, VTraits<v_##_Tpvec>::vlanes()); \
    vsse##width(ptr+1, sizeof(_Tp)*2, b, VTraits<v_##_Tpvec>::vlanes()); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    vsse##width(ptr, sizeof(_Tp)*3, a, VTraits<v_##_Tpvec>::vlanes()); \
    vsse##width(ptr+1, sizeof(_Tp)*3, b, VTraits<v_##_Tpvec>::vlanes()); \
    vsse##width(ptr+2, sizeof(_Tp)*3, c, VTraits<v_##_Tpvec>::vlanes()); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, const v_##_Tpvec& d, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
{ \
    vsse##width(ptr, sizeof(_Tp)*4, a, VTraits<v_##_Tpvec>::vlanes()); \
    vsse##width(ptr+1, sizeof(_Tp)*4, b, VTraits<v_##_Tpvec>::vlanes()); \
    vsse##width(ptr+2, sizeof(_Tp)*4, c, VTraits<v_##_Tpvec>::vlanes()); \
    vsse##width(ptr+3, sizeof(_Tp)*4, d, VTraits<v_##_Tpvec>::vlanes()); \
}

OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8, uchar, u8, 8, 4, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8, schar, i8, 8, 4, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16, ushort, u16, 16, 8, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16, short, i16, 16, 8, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32, unsigned, u32, 32, 16, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32, int, i32, 32, 16, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32, float, f32, 32, 16, VTraits<v_float32>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64, uint64, u64, 64, 32, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64, int64, i64, 64, 32, VTraits<v_int64>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64, double, f64, 64, 32, VTraits<v_float64>::vlanes())
#endif
|
|
1548
|
+
|
|
1549
|
+
static uint64_t idx_interleave_pairs[] = { \
|
|
1550
|
+
0x0705060403010200, 0x0f0d0e0c0b090a08, 0x1715161413111210, 0x1f1d1e1c1b191a18, \
|
|
1551
|
+
0x2725262423212220, 0x2f2d2e2c2b292a28, 0x3735363433313230, 0x3f3d3e3c3b393a38, \
|
|
1552
|
+
0x4745464443414240, 0x4f4d4e4c4b494a48, 0x5755565453515250, 0x5f5d5e5c5b595a58, \
|
|
1553
|
+
0x6765666463616260, 0x6f6d6e6c6b696a68, 0x7775767473717270, 0x7f7d7e7c7b797a78};
|
|
1554
|
+
|
|
1555
|
+
static uint64_t idx_interleave_quads[] = { \
|
|
1556
|
+
0x0703060205010400, 0x0f0b0e0a0d090c08, 0x1713161215111410, 0x1f1b1e1a1d191c18, \
|
|
1557
|
+
0x2723262225212420, 0x2f2b2e2a2d292c28, 0x3733363235313430, 0x3f3b3e3a3d393c38, \
|
|
1558
|
+
0x4743464245414440, 0x4f4b4e4a4d494c48, 0x5753565255515450, 0x5f5b5e5a5d595c58, \
|
|
1559
|
+
0x6763666265616460, 0x6f6b6e6a6d696c68, 0x7773767275717470, 0x7f7b7e7a7d797c78};
|
|
1560
|
+
|
|
1561
|
+
#define OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(_Tpvec, func) \
|
|
1562
|
+
inline _Tpvec v_interleave_##func(const _Tpvec& vec) { \
|
|
1563
|
+
CV_CheckLE(VTraits<_Tpvec>::vlanes(), VTraits<_Tpvec>::max_nlanes, "RVV implementation only supports VLEN in the range [128, 1024]"); \
|
|
1564
|
+
vuint8m1_t vidx = vundefined_u8m1();\
|
|
1565
|
+
vidx = vreinterpret_u8m1(vle64_v_u64m1(idx_interleave_##func, 16)); \
|
|
1566
|
+
return vrgather(vec, vidx, VTraits<v_uint8>::vlanes()); \
|
|
1567
|
+
}
|
|
1568
|
+
OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_uint8, pairs)
|
|
1569
|
+
OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_int8, pairs)
|
|
1570
|
+
OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_uint8, quads)
|
|
1571
|
+
OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_int8, quads)
|
|
1572
|
+
|
|
1573
|
+
// v_interleave_pairs / v_interleave_quads for 16- and 32-bit lane vectors.
// The byte-index table is zero-extended (vzext_vf2 / vzext_vf4) to the lane
// width, then the low m1 part is used as a lane-index vector for vrgather.
// Same VLEN <= 1024 restriction as the 8-bit variant (fixed 16-entry table).
#define OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(_Tpvec, width, vzext_vfx, func) \
inline _Tpvec v_interleave_##func(const _Tpvec& vec) { \
    CV_CheckLE(VTraits<_Tpvec>::vlanes(), VTraits<_Tpvec>::max_nlanes, "RVV implementation only supports VLEN in the range [128, 1024]"); \
    vuint##width##m1_t vidx = vundefined_u##width##m1();\
    vidx = vget_u##width##m1(vzext_vfx(vreinterpret_u8m1(vle64_v_u64m1(idx_interleave_##func, 16)), VTraits<v_uint8>::vlanes()), 0); \
    return vrgather(vec, vidx, VTraits<_Tpvec>::vlanes()); \
}

OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint16, 16, vzext_vf2, pairs)
OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int16, 16, vzext_vf2, pairs)
OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint32, 32, vzext_vf4, pairs)
OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int32, 32, vzext_vf4, pairs)
OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_float32, 32, vzext_vf4, pairs)

OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint16, 16, vzext_vf2, quads)
OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int16, 16, vzext_vf2, quads)
OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint32, 32, vzext_vf4, quads)
OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int32, 32, vzext_vf4, quads)
OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_float32, 32, vzext_vf4, quads)
//////////// PopCount //////////
// 8-bit population-count lookup table: popCountTable[b] is the number of set
// bits in byte b. Used below via an indexed gather (vloxei8).
static const unsigned char popCountTable[256] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};
// v_hadd: horizontal add of adjacent lane pairs, producing a vector of the
// next-wider element type. res[i] = a[i] + a[i+1] (via vslide1down + widening
// add), then vcompress keeps every other sum — the mask comes from
// reinterpreting a vector of width2-sized ones as width-sized lanes, which on
// little-endian RV marks the even lanes. The packed result fits in one m1
// register (returned via vget_*m1(..., 0)).
#define OPENCV_HAL_IMPL_RVV_HADD(_Tpvec, _Tpvec2, _Tm2, width, width2, suffix, add) \
static inline _Tpvec2 v_hadd(_Tpvec a) { \
    vuint##width2##m1_t oneX2 = vmv_v_x_u##width2##m1(1, VTraits<v_uint##width2>::vlanes()); \
    vuint##width##m1_t one = vreinterpret_u##width##m1(oneX2); \
    _Tm2 res = add(a, vslide1down(a, 0, VTraits<v_uint##width>::vlanes()), VTraits<v_uint##width>::vlanes()); \
    return vget_##suffix##m1(vcompress(vmseq(one, 1, VTraits<v_uint##width>::vlanes()), res, res, VTraits<v_uint##width>::vlanes()), 0); \
}
OPENCV_HAL_IMPL_RVV_HADD(v_uint8, v_uint16, vuint16m2_t, 8, 16, u16, vwaddu_vv)
OPENCV_HAL_IMPL_RVV_HADD(v_uint16, v_uint32, vuint32m2_t, 16, 32, u32, vwaddu_vv)
OPENCV_HAL_IMPL_RVV_HADD(v_uint32, v_uint64, vuint64m2_t, 32, 64, u64, vwaddu_vv)
OPENCV_HAL_IMPL_RVV_HADD(v_int8, v_int16, vint16m2_t, 8, 16, i16, vwadd_vv)
OPENCV_HAL_IMPL_RVV_HADD(v_int16, v_int32, vint32m2_t, 16, 32, i32, vwadd_vv)
OPENCV_HAL_IMPL_RVV_HADD(v_int32, v_int64, vint64m2_t, 32, 64, i64, vwadd_vv)

// m2-register overloads (non-widening vadd): used by v_dotprod below to fold a
// double-width product register down to a single m1 result.
OPENCV_HAL_IMPL_RVV_HADD(vint32m2_t, v_int32, vint32m2_t, 16, 32, i32, vadd)
OPENCV_HAL_IMPL_RVV_HADD(vint64m2_t, v_int64, vint64m2_t, 32, 64, i64, vadd)
// Per-lane population count.
// u8: one gather from the byte LUT (each lane value is its own table offset).
// u16/u32: popcount the underlying bytes, then widen-sum adjacent lanes with
// v_hadd once (16-bit) or twice (32-bit).
inline v_uint8 v_popcount(const v_uint8& a)
{
    return vloxei8(popCountTable, a, VTraits<v_uint8>::vlanes());
}
inline v_uint16 v_popcount(const v_uint16& a)
{
    return v_hadd(v_popcount(vreinterpret_u8m1(a)));
}
inline v_uint32 v_popcount(const v_uint32& a)
{
    return v_hadd(v_hadd(v_popcount(vreinterpret_u8m1(a))));
}

// Signed variants: popcount of the absolute value (matches the universal
// intrinsics contract for signed popcount in the other backends of this file).
inline v_uint8 v_popcount(const v_int8& a)
{
    return v_popcount(v_abs(a));
}
inline v_uint16 v_popcount(const v_int16& a)
{
    return v_popcount(v_abs(a));
}
inline v_uint32 v_popcount(const v_int32& a)
{
    return v_popcount(v_abs(a));
}
//////////// SignMask ////////////
|
|
1658
|
+
#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec) \
|
|
1659
|
+
inline int v_signmask(const _Tpvec& a) \
|
|
1660
|
+
{ \
|
|
1661
|
+
uint8_t ans[4] = {0}; \
|
|
1662
|
+
vsm(ans, vmslt(a, 0, VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()); \
|
|
1663
|
+
return *(reinterpret_cast<int*>(ans)) & (((__int128_t)1 << VTraits<_Tpvec>::vlanes()) - 1); \
|
|
1664
|
+
} \
|
|
1665
|
+
inline int v_scan_forward(const _Tpvec& a) \
|
|
1666
|
+
{ \
|
|
1667
|
+
return (int)vfirst(vmslt(a, 0, VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()); \
|
|
1668
|
+
}
|
|
1669
|
+
|
|
1670
|
+
OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int8)
|
|
1671
|
+
OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int16)
|
|
1672
|
+
OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int32)
|
|
1673
|
+
OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int64)
|
|
1674
|
+
|
|
1675
|
+
// Unsigned/float sign masks: forward to the signed overloads via bit-level
// reinterpretation (the sign bit position is unchanged).
// NOTE(review): the u8/u16 overloads declare int64 while the underlying signed
// implementation returns int — the wider declared type does not add precision;
// kept as-is since callers may depend on these exact signatures.
inline int64 v_signmask(const v_uint8& a)
{ return v_signmask(v_reinterpret_as_s8(a)); }
inline int64 v_signmask(const v_uint16& a)
{ return v_signmask(v_reinterpret_as_s16(a)); }
inline int v_signmask(const v_uint32& a)
{ return v_signmask(v_reinterpret_as_s32(a)); }
inline int v_signmask(const v_float32& a)
{ return v_signmask(v_reinterpret_as_s32(a)); }
inline int v_signmask(const v_uint64& a)
{ return v_signmask(v_reinterpret_as_s64(a)); }
#if CV_SIMD_SCALABLE_64F
inline int v_signmask(const v_float64& a)
{ return v_signmask(v_reinterpret_as_s64(a)); }
#endif
//////////// Scan forward ////////////
// Index of the first lane with the sign bit set; forwards to the signed
// overloads generated by OPENCV_HAL_IMPL_RVV_SIGNMASK_OP above.
inline int v_scan_forward(const v_uint8& a)
{ return v_scan_forward(v_reinterpret_as_s8(a)); }
inline int v_scan_forward(const v_uint16& a)
{ return v_scan_forward(v_reinterpret_as_s16(a)); }
inline int v_scan_forward(const v_uint32& a)
{ return v_scan_forward(v_reinterpret_as_s32(a)); }
inline int v_scan_forward(const v_float32& a)
{ return v_scan_forward(v_reinterpret_as_s32(a)); }
inline int v_scan_forward(const v_uint64& a)
{ return v_scan_forward(v_reinterpret_as_s64(a)); }
#if CV_SIMD_SCALABLE_64F
inline int v_scan_forward(const v_float64& a)
{ return v_scan_forward(v_reinterpret_as_s64(a)); }
#endif
//////////// Pack triplets ////////////
// {A0, A1, A2, A3, B0, B1, B2, B3, C0 ...} --> {A0, A1, A2, B0, B1, B2, C0 ...}
// mask: {0,0,0,1, ...} -> {T,T,T,F, ...}
// Builds a keep-mask that is true for three of every four lanes (a u32 '1'
// reinterpreted as bytes marks every 4th byte; vslideup by 3 aligns it to the
// lane to drop; vlmul_trunc narrows the byte mask to the element count for
// wider lane types), then vcompress packs the kept lanes together.
#define OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(_Tpvec, v_trunc) \
inline _Tpvec v_pack_triplets(const _Tpvec& vec) { \
    size_t vl = vsetvlmax_e8m1(); \
    vuint32m1_t one = vmv_v_x_u32m1(1, vl/4); \
    vuint8m1_t zero = vmv_v_x_u8m1(0, vl); \
    vuint8m1_t mask = vreinterpret_u8m1(one); \
    return vcompress(vmseq(v_trunc(vslideup(zero, mask, 3, vl)), 0, vl), vec, vec, VTraits<_Tpvec>::vlanes()); \
}

OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint8, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int8, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint16, vlmul_trunc_u8mf2)
OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int16, vlmul_trunc_u8mf2)
OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint32, vlmul_trunc_u8mf4)
OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int32, vlmul_trunc_u8mf4)
OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float32, vlmul_trunc_u8mf4)
OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint64, vlmul_trunc_u8mf8)
OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int64, vlmul_trunc_u8mf8)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float64, vlmul_trunc_u8mf8)
#endif
////// FP16 support ///////
|
|
1733
|
+
|
|
1734
|
+
#if defined(__riscv_zfh) && __riscv_zfh
|
|
1735
|
+
inline v_float32 v_load_expand(const float16_t* ptr)
|
|
1736
|
+
{
|
|
1737
|
+
return vfwcvt_f(vle16_v_f16mf2((_Float16*)ptr, VTraits<v_float32>::vlanes()) ,VTraits<v_float32>::vlanes());;
|
|
1738
|
+
}
|
|
1739
|
+
|
|
1740
|
+
inline void v_pack_store(float16_t* ptr, const v_float32& v)
|
|
1741
|
+
{
|
|
1742
|
+
vse16_v_f16mf2((_Float16*)ptr, vfncvt_f_f_w_f16mf2(v, VTraits<v_float32>::vlanes()), VTraits<v_float32>::vlanes());
|
|
1743
|
+
}
|
|
1744
|
+
#else
|
|
1745
|
+
inline v_float32 v_load_expand(const float16_t* ptr)
|
|
1746
|
+
{
|
|
1747
|
+
float buf[32];
|
|
1748
|
+
for( int i = 0; i < VTraits<v_float32>::vlanes(); i++ ) buf[i] = (float)ptr[i];
|
|
1749
|
+
return v_load(buf);
|
|
1750
|
+
}
|
|
1751
|
+
|
|
1752
|
+
inline void v_pack_store(float16_t* ptr, const v_float32& v)
|
|
1753
|
+
{
|
|
1754
|
+
float buf[32];
|
|
1755
|
+
v_store(buf, v);
|
|
1756
|
+
for( int i = 0; i < VTraits<v_float32>::vlanes(); i++ ) ptr[i] = float16_t(buf[i]);
|
|
1757
|
+
}
|
|
1758
|
+
#endif
|
|
1759
|
+
////////////// Rounding //////////////
// float32 -> int32 rounding family.
// v_round uses vfcvt_x, which rounds with the current dynamic rounding mode
// (round-to-nearest-even by default — NOTE(review): assumes frm is not changed
// by the caller). v_floor/v_ceil are implemented by biasing toward -inf/+inf
// by (0.5 - epsilon) before the nearest-int convert; this is approximate for
// values where the epsilon is below the float ulp. v_trunc uses the dedicated
// round-toward-zero convert.
inline v_int32 v_round(const v_float32& a)
{
    // return vfcvt_x(vfadd(a, 1e-6, VTraits<v_float32>::vlanes()), VTraits<v_float32>::vlanes());
    return vfcvt_x(a, VTraits<v_float32>::vlanes());
}

inline v_int32 v_floor(const v_float32& a)
{
    return vfcvt_x(vfsub(a, 0.5f - 1e-5, VTraits<v_float32>::vlanes()), VTraits<v_float32>::vlanes());
    // return vfcvt_x(a, VTraits<v_float32>::vlanes());
}

inline v_int32 v_ceil(const v_float32& a)
{
    return vfcvt_x(vfadd(a, 0.5f - 1e-5, VTraits<v_float32>::vlanes()), VTraits<v_float32>::vlanes());
}

inline v_int32 v_trunc(const v_float32& a)
{
    return vfcvt_rtz_x(a, VTraits<v_float32>::vlanes());
}
#if CV_SIMD_SCALABLE_64F
// float64 -> int32: widen the f64 m1 register into an m2 group (upper half
// undefined, or filled with b in the two-argument v_round) and narrow-convert.
inline v_int32 v_round(const v_float64& a)
{
    return vfncvt_x(vlmul_ext_f64m2(vfadd(a, 1e-6, VTraits<v_float64>::vlanes())), VTraits<v_float32>::vlanes());
}

inline v_int32 v_round(const v_float64& a, const v_float64& b)
{
    return vfncvt_x(vset(vlmul_ext_f64m2(vfadd(a, 1e-6, VTraits<v_float64>::vlanes())), 1, b), VTraits<v_float32>::vlanes());
}

inline v_int32 v_floor(const v_float64& a)
{
    return vfncvt_x(vlmul_ext_f64m2(vfsub(a, 0.5f - 1e-6, VTraits<v_float64>::vlanes())), VTraits<v_float32>::vlanes());
}

inline v_int32 v_ceil(const v_float64& a)
{
    return vfncvt_x(vlmul_ext_f64m2(vfadd(a, 0.5f - 1e-6, VTraits<v_float64>::vlanes())), VTraits<v_float32>::vlanes());
}

inline v_int32 v_trunc(const v_float64& a)
{
    return vfncvt_rtz_x(vlmul_ext_f64m2(a), VTraits<v_float32>::vlanes());
}
#endif
//////// Dot Product ////////

// 16 >> 32: widening multiply into an m2 register, then v_hadd (m2 overload
// above) sums adjacent product pairs into a single m1 s32 vector.
inline v_int32 v_dotprod(const v_int16& a, const v_int16& b)
{
    vint32m2_t temp1 = vwmul(a, b, VTraits<v_int16>::vlanes());
    return v_hadd(temp1);
}

// Same, plus elementwise accumulation of c.
inline v_int32 v_dotprod(const v_int16& a, const v_int16& b, const v_int32& c)
{
    vint32m2_t temp1 = vwmul(a, b, VTraits<v_int16>::vlanes());
    return vadd(v_hadd(temp1), c, VTraits<v_int32>::vlanes());
}
// 32 >> 64: widening multiply, add each product to its right neighbor
// (vslide1down), then vcompress with an every-other-lane mask (from a u64 '1'
// pattern viewed as u32 lanes, little-endian) keeps the pair sums and packs
// them into the low m1 half.
inline v_int64 v_dotprod(const v_int32& a, const v_int32& b)
{
    vuint64m1_t one64 = vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes());
    vuint32m1_t one32 = vreinterpret_u32m1(one64);
    vbool32_t mask = vmseq(one32, 1, VTraits<v_uint32>::vlanes());
    vint64m2_t temp1 = vwmul(a, b, VTraits<v_int32>::vlanes());
    vint64m2_t temp2 = vslide1down(temp1, 0, VTraits<v_int32>::vlanes());
    vint64m2_t res = vadd(temp1, temp2, VTraits<v_int32>::vlanes());
    res = vcompress(mask, res, res, VTraits<v_int32>::vlanes());
    return vlmul_trunc_i64m1(res);
}
// Same, plus elementwise accumulation of c.
inline v_int64 v_dotprod(const v_int32& a, const v_int32& b, const v_int64& c)
{
    vuint64m1_t one64 = vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes());
    vuint32m1_t one32 = vreinterpret_u32m1(one64);
    vbool32_t mask = vmseq(one32, 1, VTraits<v_uint32>::vlanes());
    vint64m2_t temp1 = vwmul(a, b, VTraits<v_int32>::vlanes());
    vint64m2_t temp2 = vslide1down(temp1, 0, VTraits<v_int32>::vlanes());
    vint64m2_t res = vadd(temp1, temp2, VTraits<v_int32>::vlanes());
    res = vcompress(mask, res, res, VTraits<v_int32>::vlanes());
    return vadd(vlmul_trunc_i64m1(res), c, VTraits<v_int64>::vlanes());
}
// 8 >> 32: sum groups of four adjacent 16-bit products into 32-bit lanes.
// Three vslide1down shifts expose products i+1..i+3; two widening adds plus a
// vadd combine them, and vcompress with an every-4th-lane mask (u32 '1'
// pattern seen as bytes, little-endian) packs the group sums into the low m1
// part. NOTE(review): the vl argument VTraits<v_uint8>::vlanes() on the 32-bit
// final adds exceeds the 32-bit lane count; RVV clamps vl to VLMAX, so the
// extra count is harmless — confirm against the upstream OpenCV source.
inline v_uint32 v_dotprod_expand(const v_uint8& a, const v_uint8& b)
{
    vuint32m1_t one32 = vmv_v_x_u32m1(1, VTraits<v_uint32>::vlanes());
    vuint8m1_t one8 = vreinterpret_u8m1(one32);
    vbool8_t mask = vmseq(one8, 1, VTraits<v_uint8>::vlanes());
    vuint16m2_t t0 = vwmulu(a, b, VTraits<v_uint8>::vlanes());
    vuint16m2_t t1= vslide1down(t0, 0, VTraits<v_uint8>::vlanes());
    vuint16m2_t t2= vslide1down(t1, 0, VTraits<v_uint8>::vlanes());
    vuint16m2_t t3= vslide1down(t2, 0, VTraits<v_uint8>::vlanes());
    vuint32m4_t res = vadd(vwaddu_vv(t2, t3, VTraits<v_uint8>::vlanes()), vwaddu_vv(t0, t1, VTraits<v_uint8>::vlanes()), VTraits<v_uint8>::vlanes());
    res = vcompress(mask, res, res, VTraits<v_uint8>::vlanes());
    return vlmul_trunc_u32m1(res);
}

// Same, plus elementwise accumulation of c.
inline v_uint32 v_dotprod_expand(const v_uint8& a, const v_uint8& b,
                                 const v_uint32& c)
{
    vuint32m1_t one32 = vmv_v_x_u32m1(1, VTraits<v_uint32>::vlanes());
    vuint8m1_t one8 = vreinterpret_u8m1(one32);
    vbool8_t mask = vmseq(one8, 1, VTraits<v_uint8>::vlanes());
    vuint16m2_t t0 = vwmulu(a, b, VTraits<v_uint8>::vlanes());
    vuint16m2_t t1= vslide1down(t0, 0, VTraits<v_uint8>::vlanes());
    vuint16m2_t t2= vslide1down(t1, 0, VTraits<v_uint8>::vlanes());
    vuint16m2_t t3= vslide1down(t2, 0, VTraits<v_uint8>::vlanes());
    vuint32m4_t res = vadd(vwaddu_vv(t2, t3, VTraits<v_uint8>::vlanes()), vwaddu_vv(t0, t1, VTraits<v_uint8>::vlanes()), VTraits<v_uint8>::vlanes());
    res = vcompress(mask, res, res, VTraits<v_uint8>::vlanes());
    return vadd(vlmul_trunc_u32m1(res), c, VTraits<v_uint8>::vlanes());
}

// Signed 8 >> 32, same structure with signed widening ops.
inline v_int32 v_dotprod_expand(const v_int8& a, const v_int8& b)
{
    vuint32m1_t one32 = vmv_v_x_u32m1(1, VTraits<v_uint32>::vlanes());
    vuint8m1_t one8 = vreinterpret_u8m1(one32);
    vbool8_t mask = vmseq(one8, 1, VTraits<v_uint8>::vlanes());
    vint16m2_t t0 = vwmul(a, b, VTraits<v_int8>::vlanes());
    vint16m2_t t1= vslide1down(t0, 0, VTraits<v_int8>::vlanes());
    vint16m2_t t2= vslide1down(t1, 0, VTraits<v_int8>::vlanes());
    vint16m2_t t3= vslide1down(t2, 0, VTraits<v_int8>::vlanes());
    vint32m4_t res = vadd(vwadd_vv(t2, t3, VTraits<v_int8>::vlanes()), vwadd_vv(t0, t1, VTraits<v_int8>::vlanes()), VTraits<v_int8>::vlanes());
    res = vcompress(mask, res, res, VTraits<v_int8>::vlanes());
    return vlmul_trunc_i32m1(res);
}

// Same, plus elementwise accumulation of c.
inline v_int32 v_dotprod_expand(const v_int8& a, const v_int8& b,
                                const v_int32& c)
{
    vuint32m1_t one32 = vmv_v_x_u32m1(1, VTraits<v_uint32>::vlanes());
    vuint8m1_t one8 = vreinterpret_u8m1(one32);
    vbool8_t mask = vmseq(one8, 1, VTraits<v_uint8>::vlanes());
    vint16m2_t t0 = vwmul(a, b, VTraits<v_int8>::vlanes());
    vint16m2_t t1= vslide1down(t0, 0, VTraits<v_int8>::vlanes());
    vint16m2_t t2= vslide1down(t1, 0, VTraits<v_int8>::vlanes());
    vint16m2_t t3= vslide1down(t2, 0, VTraits<v_int8>::vlanes());
    vint32m4_t res = vadd(vwadd_vv(t2, t3, VTraits<v_int8>::vlanes()), vwadd_vv(t0, t1, VTraits<v_int8>::vlanes()), VTraits<v_int8>::vlanes());
    res = vcompress(mask, res, res, VTraits<v_int8>::vlanes());
    return vadd(vlmul_trunc_i32m1(res), c, VTraits<v_int8>::vlanes());
}
// 16 >> 64: same group-of-four reduction scheme as the 8 >> 32 variants above,
// one element width up (u16/s16 products widened to 32 then 64 bits; keep-mask
// marks every 4th 16-bit lane via a u64 '1' pattern, little-endian).
inline v_uint64 v_dotprod_expand(const v_uint16& a, const v_uint16& b)
{
    vuint64m1_t one64 = vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes());
    vuint16m1_t one16 = vreinterpret_u16m1(one64);
    vbool16_t mask = vmseq(one16, 1, VTraits<v_uint16>::vlanes());
    vuint32m2_t t0 = vwmulu(a, b, VTraits<v_uint16>::vlanes());
    vuint32m2_t t1= vslide1down(t0, 0, VTraits<v_uint16>::vlanes());
    vuint32m2_t t2= vslide1down(t1, 0, VTraits<v_uint16>::vlanes());
    vuint32m2_t t3= vslide1down(t2, 0, VTraits<v_uint16>::vlanes());
    vuint64m4_t res = vadd(vwaddu_vv(t2, t3, VTraits<v_uint16>::vlanes()), vwaddu_vv(t0, t1, VTraits<v_uint16>::vlanes()), VTraits<v_uint16>::vlanes());
    res = vcompress(mask, res, res, VTraits<v_uint16>::vlanes());
    return vlmul_trunc_u64m1(res);
}
// Same, plus elementwise accumulation of c.
inline v_uint64 v_dotprod_expand(const v_uint16& a, const v_uint16& b, const v_uint64& c)
{
    vuint64m1_t one64 = vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes());
    vuint16m1_t one16 = vreinterpret_u16m1(one64);
    vbool16_t mask = vmseq(one16, 1, VTraits<v_uint16>::vlanes());
    vuint32m2_t t0 = vwmulu(a, b, VTraits<v_uint16>::vlanes());
    vuint32m2_t t1= vslide1down(t0, 0, VTraits<v_uint16>::vlanes());
    vuint32m2_t t2= vslide1down(t1, 0, VTraits<v_uint16>::vlanes());
    vuint32m2_t t3= vslide1down(t2, 0, VTraits<v_uint16>::vlanes());
    vuint64m4_t res = vadd(vwaddu_vv(t2, t3, VTraits<v_uint16>::vlanes()), vwaddu_vv(t0, t1, VTraits<v_uint16>::vlanes()), VTraits<v_uint16>::vlanes());
    res = vcompress(mask, res, res, VTraits<v_uint16>::vlanes());
    return vadd(vlmul_trunc_u64m1(res), c, VTraits<v_uint16>::vlanes());
}

// Signed 16 >> 64.
inline v_int64 v_dotprod_expand(const v_int16& a, const v_int16& b)
{
    vuint64m1_t one64 = vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes());
    vuint16m1_t one16 = vreinterpret_u16m1(one64);
    vbool16_t mask = vmseq(one16, 1, VTraits<v_uint16>::vlanes());
    vint32m2_t t0 = vwmul(a, b, VTraits<v_int16>::vlanes());
    vint32m2_t t1= vslide1down(t0, 0, VTraits<v_int16>::vlanes());
    vint32m2_t t2= vslide1down(t1, 0, VTraits<v_int16>::vlanes());
    vint32m2_t t3= vslide1down(t2, 0, VTraits<v_int16>::vlanes());
    vint64m4_t res = vadd(vwadd_vv(t2, t3, VTraits<v_int16>::vlanes()), vwadd_vv(t0, t1, VTraits<v_int16>::vlanes()), VTraits<v_int16>::vlanes());
    res = vcompress(mask, res, res, VTraits<v_int16>::vlanes());
    return vlmul_trunc_i64m1(res);
}
// Same, plus elementwise accumulation of c.
inline v_int64 v_dotprod_expand(const v_int16& a, const v_int16& b,
                                const v_int64& c)
{
    vuint64m1_t one64 = vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes());
    vuint16m1_t one16 = vreinterpret_u16m1(one64);
    vbool16_t mask = vmseq(one16, 1, VTraits<v_uint16>::vlanes());
    vint32m2_t t0 = vwmul(a, b, VTraits<v_int16>::vlanes());
    vint32m2_t t1= vslide1down(t0, 0, VTraits<v_int16>::vlanes());
    vint32m2_t t2= vslide1down(t1, 0, VTraits<v_int16>::vlanes());
    vint32m2_t t3= vslide1down(t2, 0, VTraits<v_int16>::vlanes());
    vint64m4_t res = vadd(vwadd_vv(t2, t3, VTraits<v_int16>::vlanes()), vwadd_vv(t0, t1, VTraits<v_int16>::vlanes()), VTraits<v_int16>::vlanes());
    res = vcompress(mask, res, res, VTraits<v_int16>::vlanes());
    return vadd(vlmul_trunc_i64m1(res), c, VTraits<v_int16>::vlanes());
}
// 32 >> 64f: reuse the integer 32 >> 64 dot product and convert to double.
#if CV_SIMD_SCALABLE_64F
inline v_float64 v_dotprod_expand(const v_int32& a, const v_int32& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64 v_dotprod_expand(const v_int32& a, const v_int32& b,
                                  const v_float64& c)
{ return v_add(v_dotprod_expand(a, b) , c); }
#endif
//////// Fast Dot Product ////////
// "Fast" variants may distribute the sum across lanes differently from the
// exact-layout v_dotprod above: here the whole product vector is reduced into
// lane 0 with vredsum (the accumulator c, when present, is reduced likewise),
// so only the total is meaningful — consistent with the universal-intrinsics
// contract for *_fast.

// 16 >> 32
inline v_int32 v_dotprod_fast(const v_int16& a, const v_int16& b)
{
    v_int32 zero = v_setzero_s32();
    return vredsum(zero, vwmul(a, b, VTraits<v_int16>::vlanes()), zero, VTraits<v_int16>::vlanes());
}
inline v_int32 v_dotprod_fast(const v_int16& a, const v_int16& b, const v_int32& c)
{
    v_int32 zero = v_setzero_s32();
    return vredsum(zero, vwmul(a, b, VTraits<v_int16>::vlanes()), vredsum(zero, c, zero, VTraits<v_int32>::vlanes()), VTraits<v_int16>::vlanes());
}

// 32 >> 64
inline v_int64 v_dotprod_fast(const v_int32& a, const v_int32& b)
{
    v_int64 zero = v_setzero_s64();
    return vredsum(zero, vwmul(a, b, VTraits<v_int32>::vlanes()), zero, VTraits<v_int32>::vlanes());
}
inline v_int64 v_dotprod_fast(const v_int32& a, const v_int32& b, const v_int64& c)
{
    v_int64 zero = v_setzero_s64();
    return vadd(vredsum(zero, vwmul(a, b, VTraits<v_int32>::vlanes()), zero, VTraits<v_int32>::vlanes()) , vredsum(zero, c, zero, VTraits<v_int64>::vlanes()), VTraits<v_int64>::vlanes());
}
// 8 >> 32 (fast): widening multiply then widening reduction straight into a
// single lane (vwredsumu/vwredsum handle the 16 -> 32 widening during the sum).
inline v_uint32 v_dotprod_expand_fast(const v_uint8& a, const v_uint8& b)
{
    v_uint32 zero = v_setzero_u32();
    return vwredsumu(zero, vwmulu(a, b, VTraits<v_uint8>::vlanes()), zero, VTraits<v_uint8>::vlanes());
}
inline v_uint32 v_dotprod_expand_fast(const v_uint8& a, const v_uint8& b, const v_uint32& c)
{
    v_uint32 zero = v_setzero_u32();
    return vadd(vwredsumu(zero, vwmulu(a, b, VTraits<v_uint8>::vlanes()), zero, VTraits<v_uint8>::vlanes()) , vredsum(zero, c, zero, VTraits<v_uint32>::vlanes()), VTraits<v_uint32>::vlanes());
}
inline v_int32 v_dotprod_expand_fast(const v_int8& a, const v_int8& b)
{
    v_int32 zero = v_setzero_s32();
    return vwredsum(zero, vwmul(a, b, VTraits<v_int8>::vlanes()), zero, VTraits<v_int8>::vlanes());
}
inline v_int32 v_dotprod_expand_fast(const v_int8& a, const v_int8& b, const v_int32& c)
{
    v_int32 zero = v_setzero_s32();
    return vadd(vwredsum(zero, vwmul(a, b, VTraits<v_int8>::vlanes()), zero, VTraits<v_int8>::vlanes()) , vredsum(zero, c, zero, VTraits<v_int32>::vlanes()), VTraits<v_int32>::vlanes());
}

// 16 >> 64 (fast): same scheme, one width up.
inline v_uint64 v_dotprod_expand_fast(const v_uint16& a, const v_uint16& b)
{
    v_uint64 zero = v_setzero_u64();
    return vwredsumu(zero, vwmulu(a, b, VTraits<v_uint16>::vlanes()), zero, VTraits<v_uint16>::vlanes());
}
inline v_uint64 v_dotprod_expand_fast(const v_uint16& a, const v_uint16& b, const v_uint64& c)
{
    v_uint64 zero = v_setzero_u64();
    return vadd(vwredsumu(zero, vwmulu(a, b, VTraits<v_uint16>::vlanes()), zero, VTraits<v_uint16>::vlanes()), vredsum(zero, c, zero, VTraits<v_uint64>::vlanes()), VTraits<v_uint64>::vlanes());
}
inline v_int64 v_dotprod_expand_fast(const v_int16& a, const v_int16& b)
{
    v_int64 zero = v_setzero_s64();
    return vwredsum(zero, vwmul(a, b, VTraits<v_int16>::vlanes()), zero, VTraits<v_int16>::vlanes());
}
inline v_int64 v_dotprod_expand_fast(const v_int16& a, const v_int16& b, const v_int64& c)
{
    v_int64 zero = v_setzero_s64();
    return vadd(vwredsum(zero, vwmul(a, b, VTraits<v_int16>::vlanes()), zero, VTraits<v_int16>::vlanes()), vredsum(zero, c, zero, VTraits<v_int64>::vlanes()), VTraits<v_int64>::vlanes());
}
// 32 >> 64f (fast): integer fast dot product converted to double.
#if CV_SIMD_SCALABLE_64F
inline v_float64 v_dotprod_expand_fast(const v_int32& a, const v_int32& b)
{ return v_cvt_f64(v_dotprod_fast(a, b)); }
inline v_float64 v_dotprod_expand_fast(const v_int32& a, const v_int32& b, const v_float64& c)
{ return v_add(v_dotprod_expand_fast(a, b) , c); }
#endif
// TODO: only 128 bit now.
// v_matmul: res = v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3 — multiplies the
// 4-lane vector v by the matrix whose rows are m0..m3 (scalar-broadcast
// multiply-accumulate per row).
inline v_float32 v_matmul(const v_float32& v, const v_float32& m0,
                          const v_float32& m1, const v_float32& m2,
                          const v_float32& m3)
{
    vfloat32m1_t res;
    res = vfmul_vf_f32m1(m0, v_extract_n(v, 0), VTraits<v_float32>::vlanes());
    res = vfmacc_vf_f32m1(res, v_extract_n(v, 1), m1, VTraits<v_float32>::vlanes());
    res = vfmacc_vf_f32m1(res, v_extract_n(v, 2), m2, VTraits<v_float32>::vlanes());
    res = vfmacc_vf_f32m1(res, v_extract_n(v, 3), m3, VTraits<v_float32>::vlanes());
    return res;
}

// TODO: only 128 bit now.
// v_matmuladd: res = v[0]*m0 + v[1]*m1 + v[2]*m2 + a (3x3 matrix plus an
// additive vector; lane 3 of v is ignored).
inline v_float32 v_matmuladd(const v_float32& v, const v_float32& m0,
                             const v_float32& m1, const v_float32& m2,
                             const v_float32& a)
{
    vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n(v,0), VTraits<v_float32>::vlanes());
    res = vfmacc_vf_f32m1(res, v_extract_n(v,1), m1, VTraits<v_float32>::vlanes());
    res = vfmacc_vf_f32m1(res, v_extract_n(v,2), m2, VTraits<v_float32>::vlanes());
    return vfadd(res, a, VTraits<v_float32>::vlanes());
}
// No-op on RVV; present to satisfy the universal-intrinsics API, where some
// backends (e.g. x86) need state cleanup after a SIMD section.
inline void v_cleanup() {}
|
|
2076
|
+
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
|
|
2077
|
+
|
|
2078
|
+
} //namespace cv
|
|
2079
|
+
|
|
2080
|
+
#endif //OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP
|