node-native-win-utils 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. package/README.md +144 -27
  2. package/binding.gyp +18 -5
  3. package/dist/index.d.ts +146 -4
  4. package/dist/index.js +107 -3
  5. package/include/opencv2/core/affine.hpp +678 -0
  6. package/include/opencv2/core/async.hpp +105 -0
  7. package/include/opencv2/core/base.hpp +664 -0
  8. package/include/opencv2/core/bindings_utils.hpp +325 -0
  9. package/include/opencv2/core/bufferpool.hpp +40 -0
  10. package/include/opencv2/core/check.hpp +170 -0
  11. package/include/opencv2/core/core.hpp +48 -0
  12. package/include/opencv2/core/core_c.h +3128 -0
  13. package/include/opencv2/core/cuda/block.hpp +211 -0
  14. package/include/opencv2/core/cuda/border_interpolate.hpp +722 -0
  15. package/include/opencv2/core/cuda/color.hpp +309 -0
  16. package/include/opencv2/core/cuda/common.hpp +131 -0
  17. package/include/opencv2/core/cuda/datamov_utils.hpp +113 -0
  18. package/include/opencv2/core/cuda/detail/color_detail.hpp +2018 -0
  19. package/include/opencv2/core/cuda/detail/reduce.hpp +365 -0
  20. package/include/opencv2/core/cuda/detail/reduce_key_val.hpp +502 -0
  21. package/include/opencv2/core/cuda/detail/transform_detail.hpp +392 -0
  22. package/include/opencv2/core/cuda/detail/type_traits_detail.hpp +191 -0
  23. package/include/opencv2/core/cuda/detail/vec_distance_detail.hpp +121 -0
  24. package/include/opencv2/core/cuda/dynamic_smem.hpp +88 -0
  25. package/include/opencv2/core/cuda/emulation.hpp +269 -0
  26. package/include/opencv2/core/cuda/filters.hpp +293 -0
  27. package/include/opencv2/core/cuda/funcattrib.hpp +79 -0
  28. package/include/opencv2/core/cuda/functional.hpp +805 -0
  29. package/include/opencv2/core/cuda/limits.hpp +128 -0
  30. package/include/opencv2/core/cuda/reduce.hpp +209 -0
  31. package/include/opencv2/core/cuda/saturate_cast.hpp +292 -0
  32. package/include/opencv2/core/cuda/scan.hpp +258 -0
  33. package/include/opencv2/core/cuda/simd_functions.hpp +869 -0
  34. package/include/opencv2/core/cuda/transform.hpp +75 -0
  35. package/include/opencv2/core/cuda/type_traits.hpp +90 -0
  36. package/include/opencv2/core/cuda/utility.hpp +230 -0
  37. package/include/opencv2/core/cuda/vec_distance.hpp +232 -0
  38. package/include/opencv2/core/cuda/vec_math.hpp +923 -0
  39. package/include/opencv2/core/cuda/vec_traits.hpp +288 -0
  40. package/include/opencv2/core/cuda/warp.hpp +139 -0
  41. package/include/opencv2/core/cuda/warp_reduce.hpp +76 -0
  42. package/include/opencv2/core/cuda/warp_shuffle.hpp +162 -0
  43. package/include/opencv2/core/cuda.hpp +1279 -0
  44. package/include/opencv2/core/cuda.inl.hpp +763 -0
  45. package/include/opencv2/core/cuda_stream_accessor.hpp +86 -0
  46. package/include/opencv2/core/cuda_types.hpp +144 -0
  47. package/include/opencv2/core/cv_cpu_dispatch.h +381 -0
  48. package/include/opencv2/core/cv_cpu_helper.h +550 -0
  49. package/include/opencv2/core/cvdef.h +973 -0
  50. package/include/opencv2/core/cvstd.hpp +190 -0
  51. package/include/opencv2/core/cvstd.inl.hpp +197 -0
  52. package/include/opencv2/core/cvstd_wrapper.hpp +154 -0
  53. package/include/opencv2/core/detail/async_promise.hpp +71 -0
  54. package/include/opencv2/core/detail/dispatch_helper.impl.hpp +49 -0
  55. package/include/opencv2/core/detail/exception_ptr.hpp +27 -0
  56. package/include/opencv2/core/directx.hpp +184 -0
  57. package/include/opencv2/core/dualquaternion.hpp +979 -0
  58. package/include/opencv2/core/dualquaternion.inl.hpp +487 -0
  59. package/include/opencv2/core/eigen.hpp +402 -0
  60. package/include/opencv2/core/fast_math.hpp +433 -0
  61. package/include/opencv2/core/hal/hal.hpp +256 -0
  62. package/include/opencv2/core/hal/interface.h +190 -0
  63. package/include/opencv2/core/hal/intrin.hpp +939 -0
  64. package/include/opencv2/core/hal/intrin_avx.hpp +3177 -0
  65. package/include/opencv2/core/hal/intrin_avx512.hpp +3090 -0
  66. package/include/opencv2/core/hal/intrin_cpp.hpp +3321 -0
  67. package/include/opencv2/core/hal/intrin_forward.hpp +191 -0
  68. package/include/opencv2/core/hal/intrin_lasx.hpp +3236 -0
  69. package/include/opencv2/core/hal/intrin_msa.hpp +1887 -0
  70. package/include/opencv2/core/hal/intrin_neon.hpp +2610 -0
  71. package/include/opencv2/core/hal/intrin_rvv.hpp +3320 -0
  72. package/include/opencv2/core/hal/intrin_rvv071.hpp +2545 -0
  73. package/include/opencv2/core/hal/intrin_rvv_scalable.hpp +2080 -0
  74. package/include/opencv2/core/hal/intrin_sse.hpp +3467 -0
  75. package/include/opencv2/core/hal/intrin_sse_em.hpp +180 -0
  76. package/include/opencv2/core/hal/intrin_vsx.hpp +1608 -0
  77. package/include/opencv2/core/hal/intrin_wasm.hpp +2782 -0
  78. package/include/opencv2/core/hal/msa_macros.h +1558 -0
  79. package/include/opencv2/core/hal/simd_utils.impl.hpp +186 -0
  80. package/include/opencv2/core/llapi/llapi.h +102 -0
  81. package/include/opencv2/core/mat.hpp +3775 -0
  82. package/include/opencv2/core/mat.inl.hpp +3422 -0
  83. package/include/opencv2/core/matx.hpp +1536 -0
  84. package/include/opencv2/core/neon_utils.hpp +128 -0
  85. package/include/opencv2/core/ocl.hpp +917 -0
  86. package/include/opencv2/core/ocl_genbase.hpp +69 -0
  87. package/include/opencv2/core/opencl/ocl_defs.hpp +82 -0
  88. package/include/opencv2/core/opencl/opencl_info.hpp +212 -0
  89. package/include/opencv2/core/opencl/opencl_svm.hpp +81 -0
  90. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_clblas.hpp +602 -0
  91. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_clfft.hpp +146 -0
  92. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_core.hpp +371 -0
  93. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_core_wrappers.hpp +272 -0
  94. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl.hpp +62 -0
  95. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl_wrappers.hpp +42 -0
  96. package/include/opencv2/core/opencl/runtime/opencl_clblas.hpp +53 -0
  97. package/include/opencv2/core/opencl/runtime/opencl_clfft.hpp +53 -0
  98. package/include/opencv2/core/opencl/runtime/opencl_core.hpp +84 -0
  99. package/include/opencv2/core/opencl/runtime/opencl_core_wrappers.hpp +47 -0
  100. package/include/opencv2/core/opencl/runtime/opencl_gl.hpp +53 -0
  101. package/include/opencv2/core/opencl/runtime/opencl_gl_wrappers.hpp +47 -0
  102. package/include/opencv2/core/opencl/runtime/opencl_svm_20.hpp +48 -0
  103. package/include/opencv2/core/opencl/runtime/opencl_svm_definitions.hpp +42 -0
  104. package/include/opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp +166 -0
  105. package/include/opencv2/core/opengl.hpp +733 -0
  106. package/include/opencv2/core/openvx/ovx_defs.hpp +48 -0
  107. package/include/opencv2/core/operations.hpp +610 -0
  108. package/include/opencv2/core/optim.hpp +302 -0
  109. package/include/opencv2/core/ovx.hpp +28 -0
  110. package/include/opencv2/core/parallel/backend/parallel_for.openmp.hpp +72 -0
  111. package/include/opencv2/core/parallel/backend/parallel_for.tbb.hpp +153 -0
  112. package/include/opencv2/core/parallel/parallel_backend.hpp +90 -0
  113. package/include/opencv2/core/persistence.hpp +1350 -0
  114. package/include/opencv2/core/private/cv_cpu_include_simd_declarations.hpp +30 -0
  115. package/include/opencv2/core/private.cuda.hpp +169 -0
  116. package/include/opencv2/core/private.hpp +896 -0
  117. package/include/opencv2/core/quaternion.hpp +1696 -0
  118. package/include/opencv2/core/quaternion.inl.hpp +1063 -0
  119. package/include/opencv2/core/saturate.hpp +180 -0
  120. package/include/opencv2/core/simd_intrinsics.hpp +87 -0
  121. package/include/opencv2/core/softfloat.hpp +514 -0
  122. package/include/opencv2/core/sse_utils.hpp +652 -0
  123. package/include/opencv2/core/traits.hpp +417 -0
  124. package/include/opencv2/core/types.hpp +2457 -0
  125. package/include/opencv2/core/types_c.h +2126 -0
  126. package/include/opencv2/core/utility.hpp +1229 -0
  127. package/include/opencv2/core/utils/allocator_stats.hpp +29 -0
  128. package/include/opencv2/core/utils/allocator_stats.impl.hpp +158 -0
  129. package/include/opencv2/core/utils/buffer_area.private.hpp +136 -0
  130. package/include/opencv2/core/utils/configuration.private.hpp +22 -0
  131. package/include/opencv2/core/utils/filesystem.hpp +82 -0
  132. package/include/opencv2/core/utils/filesystem.private.hpp +66 -0
  133. package/include/opencv2/core/utils/fp_control.private.hpp +29 -0
  134. package/include/opencv2/core/utils/fp_control_utils.hpp +69 -0
  135. package/include/opencv2/core/utils/instrumentation.hpp +125 -0
  136. package/include/opencv2/core/utils/lock.private.hpp +119 -0
  137. package/include/opencv2/core/utils/logger.defines.hpp +42 -0
  138. package/include/opencv2/core/utils/logger.hpp +218 -0
  139. package/include/opencv2/core/utils/logtag.hpp +28 -0
  140. package/include/opencv2/core/utils/plugin_loader.private.hpp +165 -0
  141. package/include/opencv2/core/utils/tls.hpp +235 -0
  142. package/include/opencv2/core/utils/trace.hpp +252 -0
  143. package/include/opencv2/core/utils/trace.private.hpp +421 -0
  144. package/include/opencv2/core/va_intel.hpp +75 -0
  145. package/include/opencv2/core/version.hpp +26 -0
  146. package/include/opencv2/core/vsx_utils.hpp +1047 -0
  147. package/include/opencv2/core.hpp +3365 -0
  148. package/include/opencv2/imgcodecs/imgcodecs.hpp +48 -0
  149. package/include/opencv2/imgcodecs/imgcodecs_c.h +1 -0
  150. package/include/opencv2/imgcodecs/ios.h +59 -0
  151. package/include/opencv2/imgcodecs/legacy/constants_c.h +54 -0
  152. package/include/opencv2/imgcodecs/macosx.h +20 -0
  153. package/include/opencv2/imgcodecs.hpp +407 -0
  154. package/include/opencv2/imgproc/bindings.hpp +34 -0
  155. package/include/opencv2/imgproc/detail/gcgraph.hpp +395 -0
  156. package/include/opencv2/imgproc/hal/hal.hpp +246 -0
  157. package/include/opencv2/imgproc/hal/interface.h +46 -0
  158. package/include/opencv2/imgproc/imgproc.hpp +48 -0
  159. package/include/opencv2/imgproc/imgproc_c.h +1177 -0
  160. package/include/opencv2/imgproc/segmentation.hpp +141 -0
  161. package/include/opencv2/imgproc/types_c.h +659 -0
  162. package/include/opencv2/imgproc.hpp +5035 -0
  163. package/include/opencv2/opencv_modules.hpp +17 -0
  164. package/libs/libjpeg-turbo.lib +0 -0
  165. package/libs/libpng.lib +0 -0
  166. package/libs/opencv_core470.lib +0 -0
  167. package/libs/opencv_imgcodecs470.lib +0 -0
  168. package/libs/opencv_imgproc470.lib +0 -0
  169. package/libs/zlib.lib +0 -0
  170. package/package.json +14 -3
  171. package/prebuilds/win32-x64/node.napi.node +0 -0
  172. package/src/cpp/capturewindow.cpp +36 -46
  173. package/src/cpp/main.cpp +10 -2
  174. package/src/cpp/opencv.cpp +425 -0
@@ -0,0 +1,869 @@
1
+ /*M///////////////////////////////////////////////////////////////////////////////////////
2
+ //
3
+ // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4
+ //
5
+ // By downloading, copying, installing or using the software you agree to this license.
6
+ // If you do not agree to this license, do not download, install,
7
+ // copy or use the software.
8
+ //
9
+ //
10
+ // License Agreement
11
+ // For Open Source Computer Vision Library
12
+ //
13
+ // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14
+ // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15
+ // Third party copyrights are property of their respective owners.
16
+ //
17
+ // Redistribution and use in source and binary forms, with or without modification,
18
+ // are permitted provided that the following conditions are met:
19
+ //
20
+ // * Redistribution's of source code must retain the above copyright notice,
21
+ // this list of conditions and the following disclaimer.
22
+ //
23
+ // * Redistribution's in binary form must reproduce the above copyright notice,
24
+ // this list of conditions and the following disclaimer in the documentation
25
+ // and/or other materials provided with the distribution.
26
+ //
27
+ // * The name of the copyright holders may not be used to endorse or promote products
28
+ // derived from this software without specific prior written permission.
29
+ //
30
+ // This software is provided by the copyright holders and contributors "as is" and
31
+ // any express or implied warranties, including, but not limited to, the implied
32
+ // warranties of merchantability and fitness for a particular purpose are disclaimed.
33
+ // In no event shall the Intel Corporation or contributors be liable for any direct,
34
+ // indirect, incidental, special, exemplary, or consequential damages
35
+ // (including, but not limited to, procurement of substitute goods or services;
36
+ // loss of use, data, or profits; or business interruption) however caused
37
+ // and on any theory of liability, whether in contract, strict liability,
38
+ // or tort (including negligence or otherwise) arising in any way out of
39
+ // the use of this software, even if advised of the possibility of such damage.
40
+ //
41
+ //M*/
42
+
43
+ /*
44
+ * Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
45
+ *
46
+ * Redistribution and use in source and binary forms, with or without
47
+ * modification, are permitted provided that the following conditions are met:
48
+ *
49
+ * Redistributions of source code must retain the above copyright notice,
50
+ * this list of conditions and the following disclaimer.
51
+ *
52
+ * Redistributions in binary form must reproduce the above copyright notice,
53
+ * this list of conditions and the following disclaimer in the documentation
54
+ * and/or other materials provided with the distribution.
55
+ *
56
+ * Neither the name of NVIDIA Corporation nor the names of its contributors
57
+ * may be used to endorse or promote products derived from this software
58
+ * without specific prior written permission.
59
+ *
60
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
61
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
62
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
63
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
64
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
65
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
66
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
67
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
68
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
69
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
70
+ * POSSIBILITY OF SUCH DAMAGE.
71
+ */
72
+
73
+ #ifndef OPENCV_CUDA_SIMD_FUNCTIONS_HPP
74
+ #define OPENCV_CUDA_SIMD_FUNCTIONS_HPP
75
+
76
+ #include "common.hpp"
77
+
78
+ /** @file
79
+ * @deprecated Use @ref cudev instead.
80
+ */
81
+
82
+ //! @cond IGNORED
83
+
84
+ namespace cv { namespace cuda { namespace device
85
+ {
86
+ // 2
87
+
88
    // Per-halfword addition: treats a and b as two independent 16-bit lanes and
    // adds lane-wise, with no carry crossing the halfword boundary.
    // NOTE(review): the SM20+/SM30+ PTX paths request .sat (saturating) forms,
    // while the emulation below wraps modulo 2^16 per lane — confirm the paths
    // are intended to differ for overflowing inputs.
    static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD instruction.
        asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        // SM20: scalar video instructions, one per halfword lane.
        asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // Pure-ALU emulation: do a full 32-bit add, then cancel the carry that
        // leaked from the low halfword into the high halfword.
        unsigned int s;
        s = a ^ b;          // sum bits
        r = a + b;          // actual sum
        s = s ^ r;          // determine carry-ins for each bit position
        s = s & 0x00010000; // carry-in to high word (= carry-out from low word)
        r = r - s;          // subtract out carry-out from low word
    #endif

        return r;
    }
108
+
109
    // Per-halfword subtraction: treats a and b as two independent 16-bit lanes
    // and subtracts lane-wise, with no borrow crossing the halfword boundary.
    // NOTE(review): PTX paths use .sat while the emulation wraps per lane —
    // confirm intended semantics for underflowing inputs.
    static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD instruction.
        asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        // SM20: scalar video instructions, one per halfword lane.
        asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // Pure-ALU emulation: full 32-bit subtract, then undo the borrow that
        // propagated from the low halfword into the high halfword.
        unsigned int s;
        s = a ^ b;          // sum bits
        r = a - b;          // actual sum
        s = s ^ r;          // determine carry-ins for each bit position
        s = s & 0x00010000; // borrow to high word
        r = r + s;          // compensate for borrow from low word
    #endif

        return r;
    }
129
+
130
    // Per-halfword absolute difference: |a - b| computed independently on each
    // unsigned 16-bit lane, via max(lane) - min(lane) in the emulation path.
    static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD instruction.
        asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        // SM20: scalar video instructions, one per halfword lane.
        asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // Emulation: the high-halfword comparisons work on the masked values
        // with the mask bits left in place, which preserves their ordering.
        unsigned int s, t, u, v;
        s = a & 0x0000ffff; // extract low halfword
        r = b & 0x0000ffff; // extract low halfword
        u = ::max(r, s);    // maximum of low halfwords
        v = ::min(r, s);    // minimum of low halfwords
        s = a & 0xffff0000; // extract high halfword
        r = b & 0xffff0000; // extract high halfword
        t = ::max(r, s);    // maximum of high halfwords
        s = ::min(r, s);    // minimum of high halfwords
        r = u | t;          // maximum of both halfwords
        s = v | s;          // minimum of both halfwords
        r = r - s;          // |a - b| = max(a,b) - min(a,b);
    #endif

        return r;
    }
156
+
157
+ static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b)
158
+ {
159
+ unsigned int r, s;
160
+
161
+ // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
162
+ // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
163
+ s = a ^ b;
164
+ r = a & b;
165
+ s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
166
+ s = s >> 1;
167
+ s = r + s;
168
+
169
+ return s;
170
+ }
171
+
172
    // Per-halfword unsigned average, rounded up: (a + b + 1) / 2 evaluated
    // independently on each 16-bit lane without intermediate overflow.
    static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD instruction.
        asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
        unsigned int s;
        s = a ^ b;
        r = a | b;
        s = s & 0xfffefffe; // ensure shift doesn't cross half-word boundaries
        s = s >> 1;
        r = r - s;
    #endif

        return r;
    }
191
+
192
    // Per-halfword equality test: returns 0x0001 in each 16-bit lane where the
    // lanes of a and b are equal, 0x0000 otherwise (boolean result per lane).
    static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD compare.
        asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r & ~c;         // msb = 1, if r was 0x0000
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }
212
+
213
    // Per-halfword equality mask: returns 0xffff in each 16-bit lane where the
    // lanes of a and b are equal, 0x0000 otherwise (full mask per lane).
    static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        // SM30+: get the 0/1 per-lane bool and widen it to a full lane mask.
        r = vseteq2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r & ~c;         // msb = 1, if r was 0x0000
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }
236
+
237
    // Per-halfword unsigned >= test: returns 0x0001 in each 16-bit lane where
    // lane(a) >= lane(b), 0x0000 otherwise.
    // The emulation inverts the local copy of b in place via inline asm
    // (pass-by-value, so the caller's argument is unaffected).
    static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD compare.
        asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // a >= b  <=>  the per-lane computation of (a - b) / 2 carries out of
        // the lane's msb; vavrg2 of a and ~b exposes exactly those carry-outs.
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }
253
+
254
    // Per-halfword unsigned >= mask: returns 0xffff in each 16-bit lane where
    // lane(a) >= lane(b), 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        // SM30+: per-lane bool widened to a full lane mask.
        r = vsetge2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        // Same carry-out trick as vsetge2, then spread each msb into a mask.
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }
273
+
274
    // Per-halfword unsigned > test: returns 0x0001 in each 16-bit lane where
    // lane(a) > lane(b), 0x0000 otherwise. Uses the round-down average (vavg2)
    // where vsetge2 uses the round-up one, turning >= into strict >.
    static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD compare.
        asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80008000; // msbs = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }
290
+
291
    // Per-halfword unsigned > mask: returns 0xffff in each 16-bit lane where
    // lane(a) > lane(b), 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        // SM30+: per-lane bool widened to a full lane mask.
        r = vsetgt2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        // Same carry-out trick as vsetgt2, then spread each msb into a mask.
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80008000; // msbs = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }
310
+
311
    // Per-halfword unsigned <= test: returns 0x0001 in each 16-bit lane where
    // lane(a) <= lane(b), 0x0000 otherwise. Mirror of vsetge2 with the
    // operands' roles swapped (a is the one inverted).
    static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD compare.
        asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }
327
+
328
    // Per-halfword unsigned <= mask: returns 0xffff in each 16-bit lane where
    // lane(a) <= lane(b), 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        // SM30+: per-lane bool widened to a full lane mask.
        r = vsetle2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        // Same carry-out trick as vsetle2, then spread each msb into a mask.
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }
347
+
348
    // Per-halfword unsigned < test: returns 0x0001 in each 16-bit lane where
    // lane(a) < lane(b), 0x0000 otherwise. Round-down average turns the
    // <= test of vsetle2 into strict <.
    static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD compare.
        asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }
364
+
365
    // Per-halfword unsigned < mask: returns 0xffff in each 16-bit lane where
    // lane(a) < lane(b), 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        // SM30+: per-lane bool widened to a full lane mask.
        r = vsetlt2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        // Same carry-out trick as vsetlt2, then spread each msb into a mask.
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }
384
+
385
    // Per-halfword inequality test: returns 0x0001 in each 16-bit lane where
    // the lanes of a and b differ, 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD compare.
        asm ("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r | c;          // msb = 1, if r was not 0x0000
        c = c & 0x80008000; // extract msbs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }
405
+
406
    // Per-halfword inequality mask: returns 0xffff in each 16-bit lane where
    // the lanes of a and b differ, 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        // SM30+: per-lane bool widened to a full lane mask.
        r = vsetne2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r | c;          // msb = 1, if r was not 0x0000
        c = c & 0x80008000; // extract msbs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }
429
+
430
    // Per-halfword unsigned maximum: max(lane(a), lane(b)) independently on
    // each 16-bit lane.
    static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD instruction.
        asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        // SM20: scalar video instructions, one per halfword lane.
        asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // Emulation: comparing the masked high halfwords with the mask bits in
        // place preserves their ordering, so no shifting is required.
        unsigned int s, t, u;
        r = a & 0x0000ffff; // extract low halfword
        s = b & 0x0000ffff; // extract low halfword
        t = ::max(r, s);    // maximum of low halfwords
        r = a & 0xffff0000; // extract high halfword
        s = b & 0xffff0000; // extract high halfword
        u = ::max(r, s);    // maximum of high halfwords
        r = t | u;          // combine halfword maximums
    #endif

        return r;
    }
452
+
453
    // Per-halfword unsigned minimum: min(lane(a), lane(b)) independently on
    // each 16-bit lane.
    static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD instruction.
        asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        // SM20: scalar video instructions, one per halfword lane.
        asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // Emulation mirrors vmax2: masked lanes compare correctly in place.
        unsigned int s, t, u;
        r = a & 0x0000ffff; // extract low halfword
        s = b & 0x0000ffff; // extract low halfword
        t = ::min(r, s);    // minimum of low halfwords
        r = a & 0xffff0000; // extract high halfword
        s = b & 0xffff0000; // extract high halfword
        u = ::min(r, s);    // minimum of high halfwords
        r = t | u;          // combine halfword minimums
    #endif

        return r;
    }
475
+
476
+ // 4
477
+
478
    // Per-byte addition: treats a and b as four independent 8-bit lanes and
    // adds lane-wise, with no carry crossing byte boundaries.
    // NOTE(review): PTX paths use .sat while the emulation wraps modulo 2^8
    // per lane — confirm intended semantics for overflowing inputs.
    static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD instruction.
        asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        // SM20: scalar video instructions, one per byte lane.
        asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // Emulation: add the low 7 bits of every byte, then fold the original
        // msb sum bits back in with xor so carries never cross lanes.
        unsigned int s, t;
        s = a ^ b;          // sum bits
        r = a & 0x7f7f7f7f; // clear msbs
        t = b & 0x7f7f7f7f; // clear msbs
        s = s & 0x80808080; // msb sum bits
        r = r + t;          // add without msbs, record carry-out in msbs
        r = r ^ s;          // sum of msb sum and carry-in bits, w/o carry-out
    #endif /* __CUDA_ARCH__ >= 300 */

        return r;
    }
501
+
502
    // Per-byte subtraction: treats a and b as four independent 8-bit lanes and
    // subtracts lane-wise, with no borrow crossing byte boundaries.
    // NOTE(review): PTX paths use .sat while the emulation wraps per lane —
    // confirm intended semantics for underflowing inputs.
    static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD instruction.
        asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        // SM20: scalar video instructions, one per byte lane.
        asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // Emulation: set each byte's msb in a so per-byte borrows are absorbed
        // there, subtract the low 7 bits of b, then restore msbs with xor.
        unsigned int s, t;
        s = a ^ ~b;         // inverted sum bits
        r = a | 0x80808080; // set msbs
        t = b & 0x7f7f7f7f; // clear msbs
        s = s & 0x80808080; // inverted msb sum bits
        r = r - t;          // subtract w/o msbs, record inverted borrows in msb
        r = r ^ s;          // combine inverted msb sum bits and borrows
    #endif

        return r;
    }
525
+
526
+ static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b)
527
+ {
528
+ unsigned int r, s;
529
+
530
+ // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
531
+ // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
532
+ s = a ^ b;
533
+ r = a & b;
534
+ s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
535
+ s = s >> 1;
536
+ s = r + s;
537
+
538
+ return s;
539
+ }
540
+
541
    // Per-byte unsigned average, rounded up: (a + b + 1) / 2 evaluated
    // independently on each 8-bit lane without intermediate overflow.
    static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD instruction.
        asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
        unsigned int c;
        c = a ^ b;
        r = a | b;
        c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
        c = c >> 1;
        r = r - c;
    #endif

        return r;
    }
560
+
561
    // Per-byte equality test: returns 0x01 in each 8-bit lane where the lanes
    // of a and b are equal, 0x00 otherwise (boolean result per lane).
    static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD compare.
        asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x80
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r & ~c;         // msb = 1, if r was 0x00
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }
581
+
582
+ // vcmpeq4: per-byte equality mask. Each 8-bit lane of the result is 0xff where
+ // the corresponding lanes of a and b are equal, 0x00 otherwise (mask form of
+ // vseteq4; mask = (bool << 8) - bool expands 0x01 to 0xff per lane).
+ static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
583
+ {
584
+ unsigned int r, t;
585
+
586
+ #if __CUDA_ARCH__ >= 300
587
+ r = vseteq4(a, b);
588
+ t = r << 8; // convert bool
589
+ r = t - r; // to mask
590
+ #else
591
+ // inspired by Alan Mycroft's null-byte detection algorithm:
592
+ // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
593
+ t = a ^ b; // 0x00 if a == b
594
+ r = t | 0x80808080; // set msbs, to catch carry out
595
+ t = t ^ r; // extract msbs, msb = 1 if t < 0x80
596
+ r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80
597
+ r = t & ~r; // msb = 1, if t was 0x00
598
+ t = r >> 7; // build mask
599
+ t = r - t; // from
600
+ r = t | r; // msbs
601
+ #endif
602
+
603
+ return r;
604
+ }
605
+
606
+ // vsetle4: per-byte unsigned comparison a <= b. Each 8-bit lane of the result
+ // is 0x01 where the test holds, 0x00 otherwise. The fallback complements the
+ // local copy of a so vavrg4 computes (b - a) / 2, whose lane msbs are the
+ // carry-outs of b - a (i.e. a <= b).
+ static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b)
607
+ {
608
+ unsigned int r = 0;
609
+
610
+ #if __CUDA_ARCH__ >= 300
611
+ asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
612
+ #else
613
+ unsigned int c;
614
+ asm("not.b32 %0, %0;" : "+r"(a));
615
+ c = vavrg4(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
616
+ c = c & 0x80808080; // msb = carry-outs
617
+ r = c >> 7; // convert to bool
618
+ #endif
619
+
620
+ return r;
621
+ }
622
+
623
+ // vcmple4: per-byte unsigned comparison a <= b, mask form. Each 8-bit lane of
+ // the result is 0xff where the test holds, 0x00 otherwise.
+ static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b)
624
+ {
625
+ unsigned int r, c;
626
+
627
+ #if __CUDA_ARCH__ >= 300
628
+ r = vsetle4(a, b);
629
+ c = r << 8; // convert bool
630
+ r = c - r; // to mask
631
+ #else
632
+ asm("not.b32 %0, %0;" : "+r"(a));
633
+ c = vavrg4(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
634
+ c = c & 0x80808080; // msbs = carry-outs
635
+ r = c >> 7; // convert
636
+ r = c - r; // msbs to
637
+ r = c | r; // mask
638
+ #endif
639
+
640
+ return r;
641
+ }
642
+
643
+ // vsetlt4: per-byte unsigned comparison a < b. Each 8-bit lane of the result
+ // is 0x01 where the test holds, 0x00 otherwise. Like vsetle4, but uses the
+ // round-down average vavg4 so the lane msbs encode strict less-than.
+ static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b)
644
+ {
645
+ unsigned int r = 0;
646
+
647
+ #if __CUDA_ARCH__ >= 300
648
+ asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
649
+ #else
650
+ unsigned int c;
651
+ asm("not.b32 %0, %0;" : "+r"(a));
652
+ c = vavg4(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
653
+ c = c & 0x80808080; // msb = carry-outs
654
+ r = c >> 7; // convert to bool
655
+ #endif
656
+
657
+ return r;
658
+ }
659
+
660
+ // vcmplt4: per-byte unsigned comparison a < b, mask form. Each 8-bit lane of
+ // the result is 0xff where the test holds, 0x00 otherwise.
+ static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b)
661
+ {
662
+ unsigned int r, c;
663
+
664
+ #if __CUDA_ARCH__ >= 300
665
+ r = vsetlt4(a, b);
666
+ c = r << 8; // convert bool
667
+ r = c - r; // to mask
668
+ #else
669
+ asm("not.b32 %0, %0;" : "+r"(a));
670
+ c = vavg4(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
671
+ c = c & 0x80808080; // msbs = carry-outs
672
+ r = c >> 7; // convert
673
+ r = c - r; // msbs to
674
+ r = c | r; // mask
675
+ #endif
676
+
677
+ return r;
678
+ }
679
+
680
+ // vsetge4: per-byte unsigned comparison a >= b. Each 8-bit lane of the result
+ // is 0x01 where the test holds, 0x00 otherwise. Mirror of vsetle4 with the
+ // operands' roles swapped (complements the local copy of b).
+ static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b)
681
+ {
682
+ unsigned int r = 0;
683
+
684
+ #if __CUDA_ARCH__ >= 300
685
+ asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
686
+ #else
687
+ unsigned int c;
688
+ asm("not.b32 %0, %0;" : "+r"(b));
689
+ c = vavrg4(a, b); // (a + ~b + 1) / 2 = (a - b) / 2
690
+ c = c & 0x80808080; // msb = carry-outs
691
+ r = c >> 7; // convert to bool
692
+ #endif
693
+
694
+ return r;
695
+ }
696
+
697
+ // vcmpge4: per-byte unsigned comparison a >= b, mask form. Each 8-bit lane of
+ // the result is 0xff where the test holds, 0x00 otherwise. Used by the
+ // pre-sm_20 fallbacks of vabsdiff4 / vmax4 / vmin4 below for byte selection.
+ static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b)
698
+ {
699
+ unsigned int r, s;
700
+
701
+ #if __CUDA_ARCH__ >= 300
702
+ r = vsetge4(a, b);
703
+ s = r << 8; // convert bool
704
+ r = s - r; // to mask
705
+ #else
706
+ asm ("not.b32 %0,%0;" : "+r"(b));
707
+ r = vavrg4 (a, b); // (a + ~b + 1) / 2 = (a - b) / 2
708
+ r = r & 0x80808080; // msb = carry-outs
709
+ s = r >> 7; // build mask
710
+ s = r - s; // from
711
+ r = s | r; // msbs
712
+ #endif
713
+
714
+ return r;
715
+ }
716
+
717
+ // vsetgt4: per-byte unsigned comparison a > b. Each 8-bit lane of the result
+ // is 0x01 where the test holds, 0x00 otherwise (round-down average variant of
+ // vsetge4, giving the strict inequality).
+ static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b)
718
+ {
719
+ unsigned int r = 0;
720
+
721
+ #if __CUDA_ARCH__ >= 300
722
+ asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
723
+ #else
724
+ unsigned int c;
725
+ asm("not.b32 %0, %0;" : "+r"(b));
726
+ c = vavg4(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
727
+ c = c & 0x80808080; // msb = carry-outs
728
+ r = c >> 7; // convert to bool
729
+ #endif
730
+
731
+ return r;
732
+ }
733
+
734
+ // vcmpgt4: per-byte unsigned comparison a > b, mask form. Each 8-bit lane of
+ // the result is 0xff where the test holds, 0x00 otherwise.
+ static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b)
735
+ {
736
+ unsigned int r, c;
737
+
738
+ #if __CUDA_ARCH__ >= 300
739
+ r = vsetgt4(a, b);
740
+ c = r << 8; // convert bool
741
+ r = c - r; // to mask
742
+ #else
743
+ asm("not.b32 %0, %0;" : "+r"(b));
744
+ c = vavg4(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
745
+ c = c & 0x80808080; // msb = carry-outs
746
+ r = c >> 7; // convert
747
+ r = c - r; // msbs to
748
+ r = c | r; // mask
749
+ #endif
750
+
751
+ return r;
752
+ }
753
+
754
+ // vsetne4: per-byte inequality test. Each 8-bit lane of the result is 0x01
+ // where the corresponding lanes of a and b differ, 0x00 otherwise (complement
+ // of vseteq4).
+ static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
755
+ {
756
+ unsigned int r = 0;
757
+
758
+ #if __CUDA_ARCH__ >= 300
759
+ asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
760
+ #else
761
+ // inspired by Alan Mycroft's null-byte detection algorithm:
762
+ // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
763
+ unsigned int c;
764
+ r = a ^ b; // 0x00 if a == b
765
+ c = r | 0x80808080; // set msbs, to catch carry out
766
+ c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
767
+ c = r | c; // msb = 1, if r was not 0x00
768
+ c = c & 0x80808080; // extract msbs
769
+ r = c >> 7; // convert to bool
770
+ #endif
771
+
772
+ return r;
773
+ }
774
+
775
+ // vcmpne4: per-byte inequality mask. Each 8-bit lane of the result is 0xff
+ // where the corresponding lanes of a and b differ, 0x00 otherwise (mask form
+ // of vsetne4).
+ static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
776
+ {
777
+ unsigned int r, c;
778
+
779
+ #if __CUDA_ARCH__ >= 300
780
+ r = vsetne4(a, b);
781
+ c = r << 8; // convert bool
782
+ r = c - r; // to mask
783
+ #else
784
+ // inspired by Alan Mycroft's null-byte detection algorithm:
785
+ // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
786
+ r = a ^ b; // 0x00 if a == b
787
+ c = r | 0x80808080; // set msbs, to catch carry out
788
+ c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
789
+ c = r | c; // msb = 1, if r was not 0x00
790
+ c = c & 0x80808080; // extract msbs
791
+ r = c >> 7; // convert
792
+ r = c - r; // msbs to
793
+ r = c | r; // mask
794
+ #endif
795
+
796
+ return r;
797
+ }
798
+
799
+ // vabsdiff4: per-byte unsigned absolute difference |a - b| for the four packed
+ // 8-bit lanes. Uses hardware vabsdiff4 on sm_30+, per-byte vabsdiff on
+ // sm_20..sm_2x, and a mask-select max/min construction otherwise.
+ static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b)
800
+ {
801
+ unsigned int r = 0;
802
+
803
+ #if __CUDA_ARCH__ >= 300
804
+ asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
805
+ #elif __CUDA_ARCH__ >= 200
806
+ asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
807
+ asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
808
+ asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
809
+ asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
810
+ #else
811
+ unsigned int s;
812
+ s = vcmpge4(a, b); // mask = 0xff if a >= b
813
+ r = a ^ b; //
814
+ s = (r & s) ^ b; // select a when a >= b, else select b => max(a,b)
815
+ r = s ^ r; // select a when b >= a, else select b => min(a,b)
816
+ r = s - r; // |a - b| = max(a,b) - min(a,b);
817
+ #endif
818
+
819
+ return r;
820
+ }
821
+
822
+ // vmax4: per-byte unsigned maximum of the four packed 8-bit lanes of a and b.
+ // NOTE(review): the fallback's two selection comments were swapped relative to
+ // the code (s = vcmpge4(a, b) masks lanes where a >= b, so a & s keeps a on
+ // those lanes); corrected below.
+ static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b)
823
+ {
824
+ unsigned int r = 0;
825
+
826
+ #if __CUDA_ARCH__ >= 300
827
+ asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
828
+ #elif __CUDA_ARCH__ >= 200
829
+ asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
830
+ asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
831
+ asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
832
+ asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
833
+ #else
834
+ unsigned int s;
835
+ s = vcmpge4(a, b); // mask = 0xff if a >= b
836
+ r = a & s; // select a when a >= b
837
+ s = b & ~s; // select b when a < b
838
+ r = r | s; // combine byte selections
839
+ #endif
840
+
841
+ return r; // byte-wise unsigned maximum
842
+ }
843
+
844
+ // vmin4: per-byte unsigned minimum of the four packed 8-bit lanes of a and b.
+ // NOTE(review): the fallback's mask comment misstated the comparison direction
+ // (s = vcmpge4(b, a) masks lanes where b >= a); corrected below.
+ static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b)
845
+ {
846
+ unsigned int r = 0;
847
+
848
+ #if __CUDA_ARCH__ >= 300
849
+ asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
850
+ #elif __CUDA_ARCH__ >= 200
851
+ asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
852
+ asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
853
+ asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
854
+ asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
855
+ #else
856
+ unsigned int s;
857
+ s = vcmpge4(b, a); // mask = 0xff if b >= a
858
+ r = a & s; // select a when b >= a
859
+ s = b & ~s; // select b when b < a
860
+ r = r | s; // combine byte selections
861
+ #endif
862
+
863
+ return r; // byte-wise unsigned minimum
864
+ }
865
+ }}}
866
+
867
+ //! @endcond
868
+
869
+ #endif // OPENCV_CUDA_SIMD_FUNCTIONS_HPP