node-native-win-utils 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. package/README.md +144 -27
  2. package/binding.gyp +18 -5
  3. package/dist/index.d.ts +146 -4
  4. package/dist/index.js +107 -3
  5. package/include/opencv2/core/affine.hpp +678 -0
  6. package/include/opencv2/core/async.hpp +105 -0
  7. package/include/opencv2/core/base.hpp +664 -0
  8. package/include/opencv2/core/bindings_utils.hpp +325 -0
  9. package/include/opencv2/core/bufferpool.hpp +40 -0
  10. package/include/opencv2/core/check.hpp +170 -0
  11. package/include/opencv2/core/core.hpp +48 -0
  12. package/include/opencv2/core/core_c.h +3128 -0
  13. package/include/opencv2/core/cuda/block.hpp +211 -0
  14. package/include/opencv2/core/cuda/border_interpolate.hpp +722 -0
  15. package/include/opencv2/core/cuda/color.hpp +309 -0
  16. package/include/opencv2/core/cuda/common.hpp +131 -0
  17. package/include/opencv2/core/cuda/datamov_utils.hpp +113 -0
  18. package/include/opencv2/core/cuda/detail/color_detail.hpp +2018 -0
  19. package/include/opencv2/core/cuda/detail/reduce.hpp +365 -0
  20. package/include/opencv2/core/cuda/detail/reduce_key_val.hpp +502 -0
  21. package/include/opencv2/core/cuda/detail/transform_detail.hpp +392 -0
  22. package/include/opencv2/core/cuda/detail/type_traits_detail.hpp +191 -0
  23. package/include/opencv2/core/cuda/detail/vec_distance_detail.hpp +121 -0
  24. package/include/opencv2/core/cuda/dynamic_smem.hpp +88 -0
  25. package/include/opencv2/core/cuda/emulation.hpp +269 -0
  26. package/include/opencv2/core/cuda/filters.hpp +293 -0
  27. package/include/opencv2/core/cuda/funcattrib.hpp +79 -0
  28. package/include/opencv2/core/cuda/functional.hpp +805 -0
  29. package/include/opencv2/core/cuda/limits.hpp +128 -0
  30. package/include/opencv2/core/cuda/reduce.hpp +209 -0
  31. package/include/opencv2/core/cuda/saturate_cast.hpp +292 -0
  32. package/include/opencv2/core/cuda/scan.hpp +258 -0
  33. package/include/opencv2/core/cuda/simd_functions.hpp +869 -0
  34. package/include/opencv2/core/cuda/transform.hpp +75 -0
  35. package/include/opencv2/core/cuda/type_traits.hpp +90 -0
  36. package/include/opencv2/core/cuda/utility.hpp +230 -0
  37. package/include/opencv2/core/cuda/vec_distance.hpp +232 -0
  38. package/include/opencv2/core/cuda/vec_math.hpp +923 -0
  39. package/include/opencv2/core/cuda/vec_traits.hpp +288 -0
  40. package/include/opencv2/core/cuda/warp.hpp +139 -0
  41. package/include/opencv2/core/cuda/warp_reduce.hpp +76 -0
  42. package/include/opencv2/core/cuda/warp_shuffle.hpp +162 -0
  43. package/include/opencv2/core/cuda.hpp +1279 -0
  44. package/include/opencv2/core/cuda.inl.hpp +763 -0
  45. package/include/opencv2/core/cuda_stream_accessor.hpp +86 -0
  46. package/include/opencv2/core/cuda_types.hpp +144 -0
  47. package/include/opencv2/core/cv_cpu_dispatch.h +381 -0
  48. package/include/opencv2/core/cv_cpu_helper.h +550 -0
  49. package/include/opencv2/core/cvdef.h +973 -0
  50. package/include/opencv2/core/cvstd.hpp +190 -0
  51. package/include/opencv2/core/cvstd.inl.hpp +197 -0
  52. package/include/opencv2/core/cvstd_wrapper.hpp +154 -0
  53. package/include/opencv2/core/detail/async_promise.hpp +71 -0
  54. package/include/opencv2/core/detail/dispatch_helper.impl.hpp +49 -0
  55. package/include/opencv2/core/detail/exception_ptr.hpp +27 -0
  56. package/include/opencv2/core/directx.hpp +184 -0
  57. package/include/opencv2/core/dualquaternion.hpp +979 -0
  58. package/include/opencv2/core/dualquaternion.inl.hpp +487 -0
  59. package/include/opencv2/core/eigen.hpp +402 -0
  60. package/include/opencv2/core/fast_math.hpp +433 -0
  61. package/include/opencv2/core/hal/hal.hpp +256 -0
  62. package/include/opencv2/core/hal/interface.h +190 -0
  63. package/include/opencv2/core/hal/intrin.hpp +939 -0
  64. package/include/opencv2/core/hal/intrin_avx.hpp +3177 -0
  65. package/include/opencv2/core/hal/intrin_avx512.hpp +3090 -0
  66. package/include/opencv2/core/hal/intrin_cpp.hpp +3321 -0
  67. package/include/opencv2/core/hal/intrin_forward.hpp +191 -0
  68. package/include/opencv2/core/hal/intrin_lasx.hpp +3236 -0
  69. package/include/opencv2/core/hal/intrin_msa.hpp +1887 -0
  70. package/include/opencv2/core/hal/intrin_neon.hpp +2610 -0
  71. package/include/opencv2/core/hal/intrin_rvv.hpp +3320 -0
  72. package/include/opencv2/core/hal/intrin_rvv071.hpp +2545 -0
  73. package/include/opencv2/core/hal/intrin_rvv_scalable.hpp +2080 -0
  74. package/include/opencv2/core/hal/intrin_sse.hpp +3467 -0
  75. package/include/opencv2/core/hal/intrin_sse_em.hpp +180 -0
  76. package/include/opencv2/core/hal/intrin_vsx.hpp +1608 -0
  77. package/include/opencv2/core/hal/intrin_wasm.hpp +2782 -0
  78. package/include/opencv2/core/hal/msa_macros.h +1558 -0
  79. package/include/opencv2/core/hal/simd_utils.impl.hpp +186 -0
  80. package/include/opencv2/core/llapi/llapi.h +102 -0
  81. package/include/opencv2/core/mat.hpp +3775 -0
  82. package/include/opencv2/core/mat.inl.hpp +3422 -0
  83. package/include/opencv2/core/matx.hpp +1536 -0
  84. package/include/opencv2/core/neon_utils.hpp +128 -0
  85. package/include/opencv2/core/ocl.hpp +917 -0
  86. package/include/opencv2/core/ocl_genbase.hpp +69 -0
  87. package/include/opencv2/core/opencl/ocl_defs.hpp +82 -0
  88. package/include/opencv2/core/opencl/opencl_info.hpp +212 -0
  89. package/include/opencv2/core/opencl/opencl_svm.hpp +81 -0
  90. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_clblas.hpp +602 -0
  91. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_clfft.hpp +146 -0
  92. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_core.hpp +371 -0
  93. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_core_wrappers.hpp +272 -0
  94. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl.hpp +62 -0
  95. package/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl_wrappers.hpp +42 -0
  96. package/include/opencv2/core/opencl/runtime/opencl_clblas.hpp +53 -0
  97. package/include/opencv2/core/opencl/runtime/opencl_clfft.hpp +53 -0
  98. package/include/opencv2/core/opencl/runtime/opencl_core.hpp +84 -0
  99. package/include/opencv2/core/opencl/runtime/opencl_core_wrappers.hpp +47 -0
  100. package/include/opencv2/core/opencl/runtime/opencl_gl.hpp +53 -0
  101. package/include/opencv2/core/opencl/runtime/opencl_gl_wrappers.hpp +47 -0
  102. package/include/opencv2/core/opencl/runtime/opencl_svm_20.hpp +48 -0
  103. package/include/opencv2/core/opencl/runtime/opencl_svm_definitions.hpp +42 -0
  104. package/include/opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp +166 -0
  105. package/include/opencv2/core/opengl.hpp +733 -0
  106. package/include/opencv2/core/openvx/ovx_defs.hpp +48 -0
  107. package/include/opencv2/core/operations.hpp +610 -0
  108. package/include/opencv2/core/optim.hpp +302 -0
  109. package/include/opencv2/core/ovx.hpp +28 -0
  110. package/include/opencv2/core/parallel/backend/parallel_for.openmp.hpp +72 -0
  111. package/include/opencv2/core/parallel/backend/parallel_for.tbb.hpp +153 -0
  112. package/include/opencv2/core/parallel/parallel_backend.hpp +90 -0
  113. package/include/opencv2/core/persistence.hpp +1350 -0
  114. package/include/opencv2/core/private/cv_cpu_include_simd_declarations.hpp +30 -0
  115. package/include/opencv2/core/private.cuda.hpp +169 -0
  116. package/include/opencv2/core/private.hpp +896 -0
  117. package/include/opencv2/core/quaternion.hpp +1696 -0
  118. package/include/opencv2/core/quaternion.inl.hpp +1063 -0
  119. package/include/opencv2/core/saturate.hpp +180 -0
  120. package/include/opencv2/core/simd_intrinsics.hpp +87 -0
  121. package/include/opencv2/core/softfloat.hpp +514 -0
  122. package/include/opencv2/core/sse_utils.hpp +652 -0
  123. package/include/opencv2/core/traits.hpp +417 -0
  124. package/include/opencv2/core/types.hpp +2457 -0
  125. package/include/opencv2/core/types_c.h +2126 -0
  126. package/include/opencv2/core/utility.hpp +1229 -0
  127. package/include/opencv2/core/utils/allocator_stats.hpp +29 -0
  128. package/include/opencv2/core/utils/allocator_stats.impl.hpp +158 -0
  129. package/include/opencv2/core/utils/buffer_area.private.hpp +136 -0
  130. package/include/opencv2/core/utils/configuration.private.hpp +22 -0
  131. package/include/opencv2/core/utils/filesystem.hpp +82 -0
  132. package/include/opencv2/core/utils/filesystem.private.hpp +66 -0
  133. package/include/opencv2/core/utils/fp_control.private.hpp +29 -0
  134. package/include/opencv2/core/utils/fp_control_utils.hpp +69 -0
  135. package/include/opencv2/core/utils/instrumentation.hpp +125 -0
  136. package/include/opencv2/core/utils/lock.private.hpp +119 -0
  137. package/include/opencv2/core/utils/logger.defines.hpp +42 -0
  138. package/include/opencv2/core/utils/logger.hpp +218 -0
  139. package/include/opencv2/core/utils/logtag.hpp +28 -0
  140. package/include/opencv2/core/utils/plugin_loader.private.hpp +165 -0
  141. package/include/opencv2/core/utils/tls.hpp +235 -0
  142. package/include/opencv2/core/utils/trace.hpp +252 -0
  143. package/include/opencv2/core/utils/trace.private.hpp +421 -0
  144. package/include/opencv2/core/va_intel.hpp +75 -0
  145. package/include/opencv2/core/version.hpp +26 -0
  146. package/include/opencv2/core/vsx_utils.hpp +1047 -0
  147. package/include/opencv2/core.hpp +3365 -0
  148. package/include/opencv2/imgcodecs/imgcodecs.hpp +48 -0
  149. package/include/opencv2/imgcodecs/imgcodecs_c.h +1 -0
  150. package/include/opencv2/imgcodecs/ios.h +59 -0
  151. package/include/opencv2/imgcodecs/legacy/constants_c.h +54 -0
  152. package/include/opencv2/imgcodecs/macosx.h +20 -0
  153. package/include/opencv2/imgcodecs.hpp +407 -0
  154. package/include/opencv2/imgproc/bindings.hpp +34 -0
  155. package/include/opencv2/imgproc/detail/gcgraph.hpp +395 -0
  156. package/include/opencv2/imgproc/hal/hal.hpp +246 -0
  157. package/include/opencv2/imgproc/hal/interface.h +46 -0
  158. package/include/opencv2/imgproc/imgproc.hpp +48 -0
  159. package/include/opencv2/imgproc/imgproc_c.h +1177 -0
  160. package/include/opencv2/imgproc/segmentation.hpp +141 -0
  161. package/include/opencv2/imgproc/types_c.h +659 -0
  162. package/include/opencv2/imgproc.hpp +5035 -0
  163. package/include/opencv2/opencv_modules.hpp +17 -0
  164. package/libs/libjpeg-turbo.lib +0 -0
  165. package/libs/libpng.lib +0 -0
  166. package/libs/opencv_core470.lib +0 -0
  167. package/libs/opencv_imgcodecs470.lib +0 -0
  168. package/libs/opencv_imgproc470.lib +0 -0
  169. package/libs/zlib.lib +0 -0
  170. package/package.json +14 -3
  171. package/prebuilds/win32-x64/node.napi.node +0 -0
  172. package/src/cpp/capturewindow.cpp +36 -46
  173. package/src/cpp/main.cpp +10 -2
  174. package/src/cpp/opencv.cpp +425 -0
@@ -0,0 +1,869 @@
1
+ /*M///////////////////////////////////////////////////////////////////////////////////////
2
+ //
3
+ // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4
+ //
5
+ // By downloading, copying, installing or using the software you agree to this license.
6
+ // If you do not agree to this license, do not download, install,
7
+ // copy or use the software.
8
+ //
9
+ //
10
+ // License Agreement
11
+ // For Open Source Computer Vision Library
12
+ //
13
+ // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14
+ // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15
+ // Third party copyrights are property of their respective owners.
16
+ //
17
+ // Redistribution and use in source and binary forms, with or without modification,
18
+ // are permitted provided that the following conditions are met:
19
+ //
20
+ // * Redistribution's of source code must retain the above copyright notice,
21
+ // this list of conditions and the following disclaimer.
22
+ //
23
+ // * Redistribution's in binary form must reproduce the above copyright notice,
24
+ // this list of conditions and the following disclaimer in the documentation
25
+ // and/or other materials provided with the distribution.
26
+ //
27
+ // * The name of the copyright holders may not be used to endorse or promote products
28
+ // derived from this software without specific prior written permission.
29
+ //
30
+ // This software is provided by the copyright holders and contributors "as is" and
31
+ // any express or implied warranties, including, but not limited to, the implied
32
+ // warranties of merchantability and fitness for a particular purpose are disclaimed.
33
+ // In no event shall the Intel Corporation or contributors be liable for any direct,
34
+ // indirect, incidental, special, exemplary, or consequential damages
35
+ // (including, but not limited to, procurement of substitute goods or services;
36
+ // loss of use, data, or profits; or business interruption) however caused
37
+ // and on any theory of liability, whether in contract, strict liability,
38
+ // or tort (including negligence or otherwise) arising in any way out of
39
+ // the use of this software, even if advised of the possibility of such damage.
40
+ //
41
+ //M*/
42
+
43
+ /*
44
+ * Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
45
+ *
46
+ * Redistribution and use in source and binary forms, with or without
47
+ * modification, are permitted provided that the following conditions are met:
48
+ *
49
+ * Redistributions of source code must retain the above copyright notice,
50
+ * this list of conditions and the following disclaimer.
51
+ *
52
+ * Redistributions in binary form must reproduce the above copyright notice,
53
+ * this list of conditions and the following disclaimer in the documentation
54
+ * and/or other materials provided with the distribution.
55
+ *
56
+ * Neither the name of NVIDIA Corporation nor the names of its contributors
57
+ * may be used to endorse or promote products derived from this software
58
+ * without specific prior written permission.
59
+ *
60
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
61
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
62
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
63
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
64
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
65
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
66
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
67
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
68
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
69
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
70
+ * POSSIBILITY OF SUCH DAMAGE.
71
+ */
72
+
73
+ #ifndef OPENCV_CUDA_SIMD_FUNCTIONS_HPP
74
+ #define OPENCV_CUDA_SIMD_FUNCTIONS_HPP
75
+
76
+ #include "common.hpp"
77
+
78
+ /** @file
79
+ * @deprecated Use @ref cudev instead.
80
+ */
81
+
82
+ //! @cond IGNORED
83
+
84
+ namespace cv { namespace cuda { namespace device
85
+ {
86
+ // 2
87
+
88
    // Per-halfword addition: treats a and b as two independent 16-bit lanes and
    // adds lane-wise, with no carry crossing the halfword boundary.
    // NOTE(review): the SM20+/SM30+ PTX paths request .sat (saturating) forms,
    // while the emulation below wraps modulo 2^16 per lane — confirm the paths
    // are intended to differ for overflowing inputs.
    static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD instruction.
        asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        // SM20: scalar video instructions, one per halfword lane.
        asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // Pure-ALU emulation: do a full 32-bit add, then cancel the carry that
        // leaked from the low halfword into the high halfword.
        unsigned int s;
        s = a ^ b;          // sum bits
        r = a + b;          // actual sum
        s = s ^ r;          // determine carry-ins for each bit position
        s = s & 0x00010000; // carry-in to high word (= carry-out from low word)
        r = r - s;          // subtract out carry-out from low word
    #endif

        return r;
    }
108
+
109
    // Per-halfword subtraction: treats a and b as two independent 16-bit lanes
    // and subtracts lane-wise, with no borrow crossing the halfword boundary.
    // NOTE(review): PTX paths use .sat while the emulation wraps per lane —
    // confirm intended semantics for underflowing inputs.
    static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD instruction.
        asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        // SM20: scalar video instructions, one per halfword lane.
        asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // Pure-ALU emulation: full 32-bit subtract, then undo the borrow that
        // propagated from the low halfword into the high halfword.
        unsigned int s;
        s = a ^ b;          // sum bits
        r = a - b;          // actual sum
        s = s ^ r;          // determine carry-ins for each bit position
        s = s & 0x00010000; // borrow to high word
        r = r + s;          // compensate for borrow from low word
    #endif

        return r;
    }
129
+
130
    // Per-halfword absolute difference: |a - b| computed independently on each
    // unsigned 16-bit lane, via max(lane) - min(lane) in the emulation path.
    static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD instruction.
        asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        // SM20: scalar video instructions, one per halfword lane.
        asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // Emulation: the high-halfword comparisons work on the masked values
        // with the mask bits left in place, which preserves their ordering.
        unsigned int s, t, u, v;
        s = a & 0x0000ffff; // extract low halfword
        r = b & 0x0000ffff; // extract low halfword
        u = ::max(r, s);    // maximum of low halfwords
        v = ::min(r, s);    // minimum of low halfwords
        s = a & 0xffff0000; // extract high halfword
        r = b & 0xffff0000; // extract high halfword
        t = ::max(r, s);    // maximum of high halfwords
        s = ::min(r, s);    // minimum of high halfwords
        r = u | t;          // maximum of both halfwords
        s = v | s;          // minimum of both halfwords
        r = r - s;          // |a - b| = max(a,b) - min(a,b);
    #endif

        return r;
    }
156
+
157
+ static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b)
158
+ {
159
+ unsigned int r, s;
160
+
161
+ // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
162
+ // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
163
+ s = a ^ b;
164
+ r = a & b;
165
+ s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
166
+ s = s >> 1;
167
+ s = r + s;
168
+
169
+ return s;
170
+ }
171
+
172
    // Per-halfword unsigned average, rounded up: (a + b + 1) / 2 evaluated
    // independently on each 16-bit lane without intermediate overflow.
    static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD instruction.
        asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
        unsigned int s;
        s = a ^ b;
        r = a | b;
        s = s & 0xfffefffe; // ensure shift doesn't cross half-word boundaries
        s = s >> 1;
        r = r - s;
    #endif

        return r;
    }
191
+
192
    // Per-halfword equality test: returns 0x0001 in each 16-bit lane where the
    // lanes of a and b are equal, 0x0000 otherwise (boolean result per lane).
    static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD compare.
        asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r & ~c;         // msb = 1, if r was 0x0000
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }
212
+
213
    // Per-halfword equality mask: returns 0xffff in each 16-bit lane where the
    // lanes of a and b are equal, 0x0000 otherwise (full mask per lane).
    static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        // SM30+: get the 0/1 per-lane bool and widen it to a full lane mask.
        r = vseteq2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r & ~c;         // msb = 1, if r was 0x0000
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }
236
+
237
    // Per-halfword unsigned >= test: returns 0x0001 in each 16-bit lane where
    // lane(a) >= lane(b), 0x0000 otherwise.
    // The emulation inverts the local copy of b in place via inline asm
    // (pass-by-value, so the caller's argument is unaffected).
    static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD compare.
        asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // a >= b  <=>  the per-lane computation of (a - b) / 2 carries out of
        // the lane's msb; vavrg2 of a and ~b exposes exactly those carry-outs.
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }
253
+
254
    // Per-halfword unsigned >= mask: returns 0xffff in each 16-bit lane where
    // lane(a) >= lane(b), 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        // SM30+: per-lane bool widened to a full lane mask.
        r = vsetge2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        // Same carry-out trick as vsetge2, then spread each msb into a mask.
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }
273
+
274
    // Per-halfword unsigned > test: returns 0x0001 in each 16-bit lane where
    // lane(a) > lane(b), 0x0000 otherwise. Uses the round-down average (vavg2)
    // where vsetge2 uses the round-up one, turning >= into strict >.
    static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD compare.
        asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80008000; // msbs = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }
290
+
291
    // Per-halfword unsigned > mask: returns 0xffff in each 16-bit lane where
    // lane(a) > lane(b), 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        // SM30+: per-lane bool widened to a full lane mask.
        r = vsetgt2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        // Same carry-out trick as vsetgt2, then spread each msb into a mask.
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80008000; // msbs = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }
310
+
311
    // Per-halfword unsigned <= test: returns 0x0001 in each 16-bit lane where
    // lane(a) <= lane(b), 0x0000 otherwise. Mirror of vsetge2 with the
    // operands' roles swapped (a is the one inverted).
    static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD compare.
        asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }
327
+
328
    // Per-halfword unsigned <= mask: returns 0xffff in each 16-bit lane where
    // lane(a) <= lane(b), 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        // SM30+: per-lane bool widened to a full lane mask.
        r = vsetle2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        // Same carry-out trick as vsetle2, then spread each msb into a mask.
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }
347
+
348
    // Per-halfword unsigned < test: returns 0x0001 in each 16-bit lane where
    // lane(a) < lane(b), 0x0000 otherwise. Round-down average turns the
    // <= test of vsetle2 into strict <.
    static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD compare.
        asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }
364
+
365
    // Per-halfword unsigned < mask: returns 0xffff in each 16-bit lane where
    // lane(a) < lane(b), 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        // SM30+: per-lane bool widened to a full lane mask.
        r = vsetlt2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        // Same carry-out trick as vsetlt2, then spread each msb into a mask.
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }
384
+
385
    // Per-halfword inequality test: returns 0x0001 in each 16-bit lane where
    // the lanes of a and b differ, 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD compare.
        asm ("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r | c;          // msb = 1, if r was not 0x0000
        c = c & 0x80008000; // extract msbs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }
405
+
406
    // Per-halfword inequality mask: returns 0xffff in each 16-bit lane where
    // the lanes of a and b differ, 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        // SM30+: per-lane bool widened to a full lane mask.
        r = vsetne2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r | c;          // msb = 1, if r was not 0x0000
        c = c & 0x80008000; // extract msbs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }
429
+
430
    // Per-halfword unsigned maximum: max(lane(a), lane(b)) independently on
    // each 16-bit lane.
    static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD instruction.
        asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        // SM20: scalar video instructions, one per halfword lane.
        asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // Emulation: comparing the masked high halfwords with the mask bits in
        // place preserves their ordering, so no shifting is required.
        unsigned int s, t, u;
        r = a & 0x0000ffff; // extract low halfword
        s = b & 0x0000ffff; // extract low halfword
        t = ::max(r, s);    // maximum of low halfwords
        r = a & 0xffff0000; // extract high halfword
        s = b & 0xffff0000; // extract high halfword
        u = ::max(r, s);    // maximum of high halfwords
        r = t | u;          // combine halfword maximums
    #endif

        return r;
    }
452
+
453
    // Per-halfword unsigned minimum: min(lane(a), lane(b)) independently on
    // each 16-bit lane.
    static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD instruction.
        asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        // SM20: scalar video instructions, one per halfword lane.
        asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // Emulation mirrors vmax2: masked lanes compare correctly in place.
        unsigned int s, t, u;
        r = a & 0x0000ffff; // extract low halfword
        s = b & 0x0000ffff; // extract low halfword
        t = ::min(r, s);    // minimum of low halfwords
        r = a & 0xffff0000; // extract high halfword
        s = b & 0xffff0000; // extract high halfword
        u = ::min(r, s);    // minimum of high halfwords
        r = t | u;          // combine halfword minimums
    #endif

        return r;
    }
475
+
476
+ // 4
477
+
478
    // Per-byte addition: treats a and b as four independent 8-bit lanes and
    // adds lane-wise, with no carry crossing byte boundaries.
    // NOTE(review): PTX paths use .sat while the emulation wraps modulo 2^8
    // per lane — confirm intended semantics for overflowing inputs.
    static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD instruction.
        asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        // SM20: scalar video instructions, one per byte lane.
        asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // Emulation: add the low 7 bits of every byte, then fold the original
        // msb sum bits back in with xor so carries never cross lanes.
        unsigned int s, t;
        s = a ^ b;          // sum bits
        r = a & 0x7f7f7f7f; // clear msbs
        t = b & 0x7f7f7f7f; // clear msbs
        s = s & 0x80808080; // msb sum bits
        r = r + t;          // add without msbs, record carry-out in msbs
        r = r ^ s;          // sum of msb sum and carry-in bits, w/o carry-out
    #endif /* __CUDA_ARCH__ >= 300 */

        return r;
    }
501
+
502
    // Per-byte subtraction: treats a and b as four independent 8-bit lanes and
    // subtracts lane-wise, with no borrow crossing byte boundaries.
    // NOTE(review): PTX paths use .sat while the emulation wraps per lane —
    // confirm intended semantics for underflowing inputs.
    static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD instruction.
        asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        // SM20: scalar video instructions, one per byte lane.
        asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // Emulation: set each byte's msb in a so per-byte borrows are absorbed
        // there, subtract the low 7 bits of b, then restore msbs with xor.
        unsigned int s, t;
        s = a ^ ~b;         // inverted sum bits
        r = a | 0x80808080; // set msbs
        t = b & 0x7f7f7f7f; // clear msbs
        s = s & 0x80808080; // inverted msb sum bits
        r = r - t;          // subtract w/o msbs, record inverted borrows in msb
        r = r ^ s;          // combine inverted msb sum bits and borrows
    #endif

        return r;
    }
525
+
526
+ static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b)
527
+ {
528
+ unsigned int r, s;
529
+
530
+ // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
531
+ // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
532
+ s = a ^ b;
533
+ r = a & b;
534
+ s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
535
+ s = s >> 1;
536
+ s = r + s;
537
+
538
+ return s;
539
+ }
540
+
541
    // Per-byte unsigned average, rounded up: (a + b + 1) / 2 evaluated
    // independently on each 8-bit lane without intermediate overflow.
    static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD instruction.
        asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
        unsigned int c;
        c = a ^ b;
        r = a | b;
        c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
        c = c >> 1;
        r = r - c;
    #endif

        return r;
    }
560
+
561
    // Per-byte equality test: returns 0x01 in each 8-bit lane where the lanes
    // of a and b are equal, 0x00 otherwise (boolean result per lane).
    static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        // SM30+: single hardware SIMD compare.
        asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x80
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r & ~c;         // msb = 1, if r was 0x00
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }
581
+
582
+ // vcmpeq4: per-byte equality mask. Each 8-bit lane of the result is 0xff where
+ // the corresponding lanes of a and b are equal, 0x00 otherwise (mask form of
+ // vseteq4; mask = (bool << 8) - bool expands 0x01 to 0xff per lane).
+ static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
583
+ {
584
+ unsigned int r, t;
585
+
586
+ #if __CUDA_ARCH__ >= 300
587
+ r = vseteq4(a, b);
588
+ t = r << 8; // convert bool
589
+ r = t - r; // to mask
590
+ #else
591
+ // inspired by Alan Mycroft's null-byte detection algorithm:
592
+ // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
593
+ t = a ^ b; // 0x00 if a == b
594
+ r = t | 0x80808080; // set msbs, to catch carry out
595
+ t = t ^ r; // extract msbs, msb = 1 if t < 0x80
596
+ r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80
597
+ r = t & ~r; // msb = 1, if t was 0x00
598
+ t = r >> 7; // build mask
599
+ t = r - t; // from
600
+ r = t | r; // msbs
601
+ #endif
602
+
603
+ return r;
604
+ }
605
+
606
+ // vsetle4: per-byte unsigned comparison a <= b. Each 8-bit lane of the result
+ // is 0x01 where the test holds, 0x00 otherwise. The fallback complements the
+ // local copy of a so vavrg4 computes (b - a) / 2, whose lane msbs are the
+ // carry-outs of b - a (i.e. a <= b).
+ static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b)
607
+ {
608
+ unsigned int r = 0;
609
+
610
+ #if __CUDA_ARCH__ >= 300
611
+ asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
612
+ #else
613
+ unsigned int c;
614
+ asm("not.b32 %0, %0;" : "+r"(a));
615
+ c = vavrg4(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
616
+ c = c & 0x80808080; // msb = carry-outs
617
+ r = c >> 7; // convert to bool
618
+ #endif
619
+
620
+ return r;
621
+ }
622
+
623
+ // vcmple4: per-byte unsigned comparison a <= b, mask form. Each 8-bit lane of
+ // the result is 0xff where the test holds, 0x00 otherwise.
+ static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b)
624
+ {
625
+ unsigned int r, c;
626
+
627
+ #if __CUDA_ARCH__ >= 300
628
+ r = vsetle4(a, b);
629
+ c = r << 8; // convert bool
630
+ r = c - r; // to mask
631
+ #else
632
+ asm("not.b32 %0, %0;" : "+r"(a));
633
+ c = vavrg4(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
634
+ c = c & 0x80808080; // msbs = carry-outs
635
+ r = c >> 7; // convert
636
+ r = c - r; // msbs to
637
+ r = c | r; // mask
638
+ #endif
639
+
640
+ return r;
641
+ }
642
+
643
+ // vsetlt4: per-byte unsigned comparison a < b. Each 8-bit lane of the result
+ // is 0x01 where the test holds, 0x00 otherwise. Like vsetle4, but uses the
+ // round-down average vavg4 so the lane msbs encode strict less-than.
+ static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b)
644
+ {
645
+ unsigned int r = 0;
646
+
647
+ #if __CUDA_ARCH__ >= 300
648
+ asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
649
+ #else
650
+ unsigned int c;
651
+ asm("not.b32 %0, %0;" : "+r"(a));
652
+ c = vavg4(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
653
+ c = c & 0x80808080; // msb = carry-outs
654
+ r = c >> 7; // convert to bool
655
+ #endif
656
+
657
+ return r;
658
+ }
659
+
660
+ // vcmplt4: per-byte unsigned comparison a < b, mask form. Each 8-bit lane of
+ // the result is 0xff where the test holds, 0x00 otherwise.
+ static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b)
661
+ {
662
+ unsigned int r, c;
663
+
664
+ #if __CUDA_ARCH__ >= 300
665
+ r = vsetlt4(a, b);
666
+ c = r << 8; // convert bool
667
+ r = c - r; // to mask
668
+ #else
669
+ asm("not.b32 %0, %0;" : "+r"(a));
670
+ c = vavg4(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
671
+ c = c & 0x80808080; // msbs = carry-outs
672
+ r = c >> 7; // convert
673
+ r = c - r; // msbs to
674
+ r = c | r; // mask
675
+ #endif
676
+
677
+ return r;
678
+ }
679
+
680
+ // vsetge4: per-byte unsigned comparison a >= b. Each 8-bit lane of the result
+ // is 0x01 where the test holds, 0x00 otherwise. Mirror of vsetle4 with the
+ // operands' roles swapped (complements the local copy of b).
+ static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b)
681
+ {
682
+ unsigned int r = 0;
683
+
684
+ #if __CUDA_ARCH__ >= 300
685
+ asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
686
+ #else
687
+ unsigned int c;
688
+ asm("not.b32 %0, %0;" : "+r"(b));
689
+ c = vavrg4(a, b); // (a + ~b + 1) / 2 = (a - b) / 2
690
+ c = c & 0x80808080; // msb = carry-outs
691
+ r = c >> 7; // convert to bool
692
+ #endif
693
+
694
+ return r;
695
+ }
696
+
697
+ // vcmpge4: per-byte unsigned comparison a >= b, mask form. Each 8-bit lane of
+ // the result is 0xff where the test holds, 0x00 otherwise. Used by the
+ // pre-sm_20 fallbacks of vabsdiff4 / vmax4 / vmin4 below for byte selection.
+ static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b)
698
+ {
699
+ unsigned int r, s;
700
+
701
+ #if __CUDA_ARCH__ >= 300
702
+ r = vsetge4(a, b);
703
+ s = r << 8; // convert bool
704
+ r = s - r; // to mask
705
+ #else
706
+ asm ("not.b32 %0,%0;" : "+r"(b));
707
+ r = vavrg4 (a, b); // (a + ~b + 1) / 2 = (a - b) / 2
708
+ r = r & 0x80808080; // msb = carry-outs
709
+ s = r >> 7; // build mask
710
+ s = r - s; // from
711
+ r = s | r; // msbs
712
+ #endif
713
+
714
+ return r;
715
+ }
716
+
717
+ // vsetgt4: per-byte unsigned comparison a > b. Each 8-bit lane of the result
+ // is 0x01 where the test holds, 0x00 otherwise (round-down average variant of
+ // vsetge4, giving the strict inequality).
+ static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b)
718
+ {
719
+ unsigned int r = 0;
720
+
721
+ #if __CUDA_ARCH__ >= 300
722
+ asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
723
+ #else
724
+ unsigned int c;
725
+ asm("not.b32 %0, %0;" : "+r"(b));
726
+ c = vavg4(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
727
+ c = c & 0x80808080; // msb = carry-outs
728
+ r = c >> 7; // convert to bool
729
+ #endif
730
+
731
+ return r;
732
+ }
733
+
734
+ // vcmpgt4: per-byte unsigned comparison a > b, mask form. Each 8-bit lane of
+ // the result is 0xff where the test holds, 0x00 otherwise.
+ static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b)
735
+ {
736
+ unsigned int r, c;
737
+
738
+ #if __CUDA_ARCH__ >= 300
739
+ r = vsetgt4(a, b);
740
+ c = r << 8; // convert bool
741
+ r = c - r; // to mask
742
+ #else
743
+ asm("not.b32 %0, %0;" : "+r"(b));
744
+ c = vavg4(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
745
+ c = c & 0x80808080; // msb = carry-outs
746
+ r = c >> 7; // convert
747
+ r = c - r; // msbs to
748
+ r = c | r; // mask
749
+ #endif
750
+
751
+ return r;
752
+ }
753
+
754
+ // vsetne4: per-byte inequality test. Each 8-bit lane of the result is 0x01
+ // where the corresponding lanes of a and b differ, 0x00 otherwise (complement
+ // of vseteq4).
+ static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
755
+ {
756
+ unsigned int r = 0;
757
+
758
+ #if __CUDA_ARCH__ >= 300
759
+ asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
760
+ #else
761
+ // inspired by Alan Mycroft's null-byte detection algorithm:
762
+ // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
763
+ unsigned int c;
764
+ r = a ^ b; // 0x00 if a == b
765
+ c = r | 0x80808080; // set msbs, to catch carry out
766
+ c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
767
+ c = r | c; // msb = 1, if r was not 0x00
768
+ c = c & 0x80808080; // extract msbs
769
+ r = c >> 7; // convert to bool
770
+ #endif
771
+
772
+ return r;
773
+ }
774
+
775
+ // vcmpne4: per-byte inequality mask. Each 8-bit lane of the result is 0xff
+ // where the corresponding lanes of a and b differ, 0x00 otherwise (mask form
+ // of vsetne4).
+ static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
776
+ {
777
+ unsigned int r, c;
778
+
779
+ #if __CUDA_ARCH__ >= 300
780
+ r = vsetne4(a, b);
781
+ c = r << 8; // convert bool
782
+ r = c - r; // to mask
783
+ #else
784
+ // inspired by Alan Mycroft's null-byte detection algorithm:
785
+ // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
786
+ r = a ^ b; // 0x00 if a == b
787
+ c = r | 0x80808080; // set msbs, to catch carry out
788
+ c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
789
+ c = r | c; // msb = 1, if r was not 0x00
790
+ c = c & 0x80808080; // extract msbs
791
+ r = c >> 7; // convert
792
+ r = c - r; // msbs to
793
+ r = c | r; // mask
794
+ #endif
795
+
796
+ return r;
797
+ }
798
+
799
+ // vabsdiff4: per-byte unsigned absolute difference |a - b| for the four packed
+ // 8-bit lanes. Uses hardware vabsdiff4 on sm_30+, per-byte vabsdiff on
+ // sm_20..sm_2x, and a mask-select max/min construction otherwise.
+ static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b)
800
+ {
801
+ unsigned int r = 0;
802
+
803
+ #if __CUDA_ARCH__ >= 300
804
+ asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
805
+ #elif __CUDA_ARCH__ >= 200
806
+ asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
807
+ asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
808
+ asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
809
+ asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
810
+ #else
811
+ unsigned int s;
812
+ s = vcmpge4(a, b); // mask = 0xff if a >= b
813
+ r = a ^ b; //
814
+ s = (r & s) ^ b; // select a when a >= b, else select b => max(a,b)
815
+ r = s ^ r; // select a when b >= a, else select b => min(a,b)
816
+ r = s - r; // |a - b| = max(a,b) - min(a,b);
817
+ #endif
818
+
819
+ return r;
820
+ }
821
+
822
+ // vmax4: per-byte unsigned maximum of the four packed 8-bit lanes of a and b.
+ // NOTE(review): the fallback's two selection comments were swapped relative to
+ // the code (s = vcmpge4(a, b) masks lanes where a >= b, so a & s keeps a on
+ // those lanes); corrected below.
+ static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b)
823
+ {
824
+ unsigned int r = 0;
825
+
826
+ #if __CUDA_ARCH__ >= 300
827
+ asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
828
+ #elif __CUDA_ARCH__ >= 200
829
+ asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
830
+ asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
831
+ asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
832
+ asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
833
+ #else
834
+ unsigned int s;
835
+ s = vcmpge4(a, b); // mask = 0xff if a >= b
836
+ r = a & s; // select a when a >= b
837
+ s = b & ~s; // select b when a < b
838
+ r = r | s; // combine byte selections
839
+ #endif
840
+
841
+ return r; // byte-wise unsigned maximum
842
+ }
843
+
844
+ // vmin4: per-byte unsigned minimum of the four packed 8-bit lanes of a and b.
+ // NOTE(review): the fallback's mask comment misstated the comparison direction
+ // (s = vcmpge4(b, a) masks lanes where b >= a); corrected below.
+ static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b)
845
+ {
846
+ unsigned int r = 0;
847
+
848
+ #if __CUDA_ARCH__ >= 300
849
+ asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
850
+ #elif __CUDA_ARCH__ >= 200
851
+ asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
852
+ asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
853
+ asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
854
+ asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
855
+ #else
856
+ unsigned int s;
857
+ s = vcmpge4(b, a); // mask = 0xff if b >= a
858
+ r = a & s; // select a when b >= a
859
+ s = b & ~s; // select b when b < a
860
+ r = r | s; // combine byte selections
861
+ #endif
862
+
863
+ return r; // byte-wise unsigned minimum
864
+ }
865
+ }}}
866
+
867
+ //! @endcond
868
+
869
+ #endif // OPENCV_CUDA_SIMD_FUNCTIONS_HPP