numba-cuda 0.21.1__cp313-cp313-win_amd64.whl → 0.23.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/api.py +4 -1
  3. numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
  4. numba_cuda/numba/cuda/cext/_dispatcher.cpp +0 -38
  5. numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
  6. numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
  7. numba_cuda/numba/cuda/cext/_typeof.cpp +0 -111
  8. numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
  9. numba_cuda/numba/cuda/codegen.py +42 -10
  10. numba_cuda/numba/cuda/compiler.py +10 -4
  11. numba_cuda/numba/cuda/core/analysis.py +29 -21
  12. numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
  13. numba_cuda/numba/cuda/core/base.py +6 -1
  14. numba_cuda/numba/cuda/core/consts.py +1 -1
  15. numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
  16. numba_cuda/numba/cuda/core/errors.py +4 -912
  17. numba_cuda/numba/cuda/core/inline_closurecall.py +71 -57
  18. numba_cuda/numba/cuda/core/interpreter.py +79 -64
  19. numba_cuda/numba/cuda/core/ir.py +191 -119
  20. numba_cuda/numba/cuda/core/ir_utils.py +142 -112
  21. numba_cuda/numba/cuda/core/postproc.py +8 -8
  22. numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
  23. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
  24. numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
  25. numba_cuda/numba/cuda/core/ssa.py +3 -3
  26. numba_cuda/numba/cuda/core/transforms.py +25 -10
  27. numba_cuda/numba/cuda/core/typed_passes.py +9 -9
  28. numba_cuda/numba/cuda/core/typeinfer.py +39 -24
  29. numba_cuda/numba/cuda/core/untyped_passes.py +71 -55
  30. numba_cuda/numba/cuda/cudadecl.py +0 -13
  31. numba_cuda/numba/cuda/cudadrv/devicearray.py +6 -5
  32. numba_cuda/numba/cuda/cudadrv/driver.py +132 -511
  33. numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
  34. numba_cuda/numba/cuda/cudadrv/nvrtc.py +16 -0
  35. numba_cuda/numba/cuda/cudaimpl.py +0 -12
  36. numba_cuda/numba/cuda/debuginfo.py +104 -10
  37. numba_cuda/numba/cuda/descriptor.py +1 -1
  38. numba_cuda/numba/cuda/device_init.py +4 -7
  39. numba_cuda/numba/cuda/dispatcher.py +36 -32
  40. numba_cuda/numba/cuda/intrinsics.py +150 -1
  41. numba_cuda/numba/cuda/lowering.py +64 -29
  42. numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
  43. numba_cuda/numba/cuda/np/arrayobj.py +54 -0
  44. numba_cuda/numba/cuda/np/numpy_support.py +26 -0
  45. numba_cuda/numba/cuda/printimpl.py +20 -0
  46. numba_cuda/numba/cuda/serialize.py +10 -0
  47. numba_cuda/numba/cuda/stubs.py +0 -11
  48. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
  49. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
  50. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +130 -48
  51. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
  52. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
  53. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +5 -6
  54. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
  55. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +27 -19
  56. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
  57. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +10 -0
  58. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +89 -0
  59. numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
  60. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
  61. numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
  62. numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
  63. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +116 -1
  64. numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
  65. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
  66. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
  67. numba_cuda/numba/cuda/typing/context.py +3 -1
  68. numba_cuda/numba/cuda/typing/typeof.py +56 -0
  69. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/METADATA +1 -1
  70. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/RECORD +74 -74
  71. numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
  72. numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
  73. numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
  74. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
  75. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/WHEEL +0 -0
  76. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/licenses/LICENSE +0 -0
  77. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/licenses/LICENSE.numba +0 -0
  78. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/cudadrv/dummyarray.py
@@ -279,6 +279,10 @@ class Array(object):
         if not self.dims:
             return {"C_CONTIGUOUS": True, "F_CONTIGUOUS": True}
 
+        # All 0-size arrays are considered contiguous, even if they are multidimensional
+        if self.size == 0:
+            return {"C_CONTIGUOUS": True, "F_CONTIGUOUS": True}
+
         # If this is a broadcast array then it is not contiguous
         if any([dim.stride == 0 for dim in self.dims]):
             return {"C_CONTIGUOUS": False, "F_CONTIGUOUS": False}
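The early return brings the dummy array's flag computation in line with NumPy, which reports every 0-size array as both C- and F-contiguous regardless of dimensionality. A minimal sketch of the behavior being mirrored (assuming a modern NumPy with relaxed strides):

import numpy as np

# NumPy reports 0-size arrays as both C- and F-contiguous, whatever
# their dimensionality; the hunk above mirrors this for device arrays.
a = np.empty((0, 3))
assert a.flags["C_CONTIGUOUS"] and a.flags["F_CONTIGUOUS"]

b = np.empty((4, 0, 2))
assert b.flags["C_CONTIGUOUS"] and b.flags["F_CONTIGUOUS"]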
numba_cuda/numba/cuda/cudadrv/nvrtc.py
@@ -109,6 +109,22 @@ def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False):
 
     includes = [numba_include, *cuda_includes, nrt_include, *extra_includes]
 
+    # TODO: move all this into Program/ProgramOptions
+    # logsz = config.CUDA_LOG_SIZE
+    #
+    # jitinfo = bytearray(logsz)
+    # jiterrors = bytearray(logsz)
+    #
+    # jit_option = binding.CUjit_option
+    # options = {
+    #     jit_option.CU_JIT_INFO_LOG_BUFFER: jitinfo,
+    #     jit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: logsz,
+    #     jit_option.CU_JIT_ERROR_LOG_BUFFER: jiterrors,
+    #     jit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: logsz,
+    #     jit_option.CU_JIT_LOG_VERBOSE: config.CUDA_VERBOSE_JIT_LOG,
+    # }
+    # info_log = jitinfo.decode("utf-8")
+
     options = ProgramOptions(
         arch=arch,
         include_path=includes,
numba_cuda/numba/cuda/cudaimpl.py
@@ -280,18 +280,6 @@ def ptx_syncwarp_mask(context, builder, sig, args):
     return context.get_dummy_value()
 
 
-@lower(stubs.vote_sync_intrinsic, types.i4, types.i4, types.boolean)
-def ptx_vote_sync(context, builder, sig, args):
-    fname = "llvm.nvvm.vote.sync"
-    lmod = builder.module
-    fnty = ir.FunctionType(
-        ir.LiteralStructType((ir.IntType(32), ir.IntType(1))),
-        (ir.IntType(32), ir.IntType(32), ir.IntType(1)),
-    )
-    func = cgutils.get_or_insert_function(lmod, fnty, fname)
-    return builder.call(func, args)
-
-
 @lower(stubs.match_any_sync, types.i4, types.i4)
 @lower(stubs.match_any_sync, types.i4, types.i8)
 @lower(stubs.match_any_sync, types.i4, types.f4)
numba_cuda/numba/cuda/debuginfo.py
@@ -4,6 +4,7 @@
 import abc
 import os
 from contextlib import contextmanager
+from enum import IntEnum
 
 import llvmlite
 from llvmlite import ir
@@ -71,6 +72,16 @@ if not hasattr(config, "CUDA_DEBUG_POLY_USE_TYPED_CONST"):
     config.CUDA_DEBUG_POLY_USE_TYPED_CONST = DEBUG_POLY_USE_TYPED_CONST
 
 
+class DwarfAddressClass(IntEnum):
+    GENERIC = 0x00
+    GLOBAL = 0x01
+    REGISTER = 0x02
+    CONSTANT = 0x05
+    LOCAL = 0x06
+    PARAMETER = 0x07
+    SHARED = 0x08
+
+
 @contextmanager
 def suspend_emission(builder):
     """Suspends the emission of debug_metadata for the duration of the context
@@ -179,6 +190,19 @@ class DIBuilder(AbstractDIBuilder):
         # constructing subprograms
         self.dicompileunit = self._di_compile_unit()
 
+    def get_dwarf_address_class(self, addrspace):
+        # Map NVVM address space to DWARF address class.
+        from numba.cuda.cudadrv import nvvm
+
+        addrspace_to_addrclass_dict = {
+            nvvm.ADDRSPACE_GENERIC: None,
+            nvvm.ADDRSPACE_GLOBAL: DwarfAddressClass.GLOBAL,
+            nvvm.ADDRSPACE_SHARED: DwarfAddressClass.SHARED,
+            nvvm.ADDRSPACE_CONSTANT: DwarfAddressClass.CONSTANT,
+            nvvm.ADDRSPACE_LOCAL: DwarfAddressClass.LOCAL,
+        }
+        return addrspace_to_addrclass_dict.get(addrspace)
+
     def _var_type(self, lltype, size, datamodel=None):
         if self._DEBUG:
             print(
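NVVM address space numbers and DWARF address classes use different numbering, which is why an explicit table is needed. A self-contained sketch of the same mapping (the ADDRSPACE_* values below are assumed from numba.cuda.cudadrv.nvvm; generic pointers deliberately map to None so no dwarfAddressSpace is emitted for them):

from enum import IntEnum

class DwarfAddressClass(IntEnum):  # subset of the enum added above
    GLOBAL = 0x01
    CONSTANT = 0x05
    LOCAL = 0x06
    SHARED = 0x08

# NVVM address space numbers, as defined in numba.cuda.cudadrv.nvvm
ADDRSPACE_GENERIC = 0
ADDRSPACE_GLOBAL = 1
ADDRSPACE_SHARED = 3
ADDRSPACE_CONSTANT = 4
ADDRSPACE_LOCAL = 5

def get_dwarf_address_class(addrspace):
    return {
        ADDRSPACE_GLOBAL: DwarfAddressClass.GLOBAL,
        ADDRSPACE_SHARED: DwarfAddressClass.SHARED,
        ADDRSPACE_CONSTANT: DwarfAddressClass.CONSTANT,
        ADDRSPACE_LOCAL: DwarfAddressClass.LOCAL,
    }.get(addrspace)  # generic (0) and unknown spaces return None

assert get_dwarf_address_class(ADDRSPACE_SHARED) == DwarfAddressClass.SHARED
assert get_dwarf_address_class(ADDRSPACE_GENERIC) is None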
@@ -622,6 +646,11 @@ class CUDADIBuilder(DIBuilder):
         super().__init__(module, filepath, cgctx, directives_only)
         # Cache for local variable metadata type and line deduplication
         self._vartypelinemap = {}
+        # Variable address space dictionary
+        self._var_addrspace_map = {}
+
+    def _set_addrspace_map(self, map):
+        self._var_addrspace_map = map
 
     def _var_type(self, lltype, size, datamodel=None):
         is_bool = False
@@ -796,6 +825,65 @@ class CUDADIBuilder(DIBuilder):
             },
             is_distinct=True,
         )
+
+        # Check if there's actually address space info to handle
+        addrspace = getattr(self, "_addrspace", None)
+        if (
+            isinstance(lltype, ir.LiteralStructType)
+            and datamodel is not None
+            and datamodel.inner_models()
+            and addrspace not in (None, 0)
+        ):
+            # Process struct with datamodel that has address space info
+            meta = []
+            offset = 0
+            for element, field, model in zip(
+                lltype.elements, datamodel._fields, datamodel.inner_models()
+            ):
+                size_field = self.cgctx.get_abi_sizeof(element)
+                if isinstance(element, ir.PointerType) and field == "data":
+                    # Create pointer type with correct address space
+                    pointee_size = self.cgctx.get_abi_sizeof(element.pointee)
+                    pointee_model = getattr(model, "_pointee_model", None)
+                    pointee_type = self._var_type(
+                        element.pointee, pointee_size, datamodel=pointee_model
+                    )
+                    meta_ptr = {
+                        "tag": ir.DIToken("DW_TAG_pointer_type"),
+                        "baseType": pointee_type,
+                        "size": _BYTE_SIZE * size_field,
+                    }
+                    dwarf_addrclass = self.get_dwarf_address_class(addrspace)
+                    if dwarf_addrclass is not None:
+                        meta_ptr["dwarfAddressSpace"] = int(dwarf_addrclass)
+                    basetype = m.add_debug_info("DIDerivedType", meta_ptr)
+                else:
+                    basetype = self._var_type(
+                        element, size_field, datamodel=model
+                    )
+                derived_type = m.add_debug_info(
+                    "DIDerivedType",
+                    {
+                        "tag": ir.DIToken("DW_TAG_member"),
+                        "name": field,
+                        "baseType": basetype,
+                        "size": _BYTE_SIZE * size_field,
+                        "offset": offset,
+                    },
+                )
+                meta.append(derived_type)
+                offset += _BYTE_SIZE * size_field
+
+            return m.add_debug_info(
+                "DICompositeType",
+                {
+                    "tag": ir.DIToken("DW_TAG_structure_type"),
+                    "name": f"{datamodel.fe_type}",
+                    "elements": m.add_metadata(meta),
+                    "size": offset,
+                },
+                is_distinct=True,
+            )
         # For other cases, use upstream Numba implementation
         return super()._var_type(lltype, size, datamodel=datamodel)
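This path only runs when debug info is generated for struct-typed values such as array arguments. A minimal sketch of a kernel that would exercise it (the kernel itself is illustrative; compiling with debug=True is what engages the CUDADIBuilder):

from numba import cuda
import numpy as np

# With debug=True, the array argument `arr` is described as a DWARF
# structure whose `data` member is a pointer carrying a DWARF address
# class, so debuggers like cuda-gdb can dereference it correctly.
@cuda.jit(debug=True, opt=False)
def scale(arr, factor):
    i = cuda.grid(1)
    if i < arr.size:
        arr[i] *= factor

arr = np.arange(16, dtype=np.float32)
scale[1, 16](arr, np.float32(2.0))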
@@ -848,16 +936,22 @@ class CUDADIBuilder(DIBuilder):
                 # to llvm.dbg.value
                 return
         else:
-            return super().mark_variable(
-                builder,
-                allocavalue,
-                name,
-                lltype,
-                size,
-                line,
-                datamodel,
-                argidx,
-            )
+            # Look up address space for this variable
+            self._addrspace = self._var_addrspace_map.get(name)
+            try:
+                return super().mark_variable(
+                    builder,
+                    allocavalue,
+                    name,
+                    lltype,
+                    size,
+                    line,
+                    datamodel,
+                    argidx,
+                )
+            finally:
+                # Clean up address space info
+                self._addrspace = None
 
     def update_variable(
         self,
numba_cuda/numba/cuda/descriptor.py
@@ -28,7 +28,7 @@ class CUDATarget:
     @property
     def target_context(self):
         if self._targetctx is None:
-            self._targetctx = CUDATargetContext(self._typingctx)
+            self._targetctx = CUDATargetContext(self.typing_context)
         return self._targetctx
 
 
numba_cuda/numba/cuda/device_init.py
@@ -27,7 +27,6 @@ from .stubs import (
     local,
     const,
     atomic,
-    vote_sync_intrinsic,
     match_any_sync,
     match_all_sync,
     threadfence_block,
@@ -56,6 +55,10 @@ from .intrinsics import (
     shfl_up_sync,
     shfl_down_sync,
     shfl_xor_sync,
+    all_sync,
+    any_sync,
+    eq_sync,
+    ballot_sync,
 )
 from .cudadrv.error import CudaSupportError
 from numba.cuda.cudadrv.driver import (
@@ -79,12 +82,6 @@ from .api import *
 from .api import _auto_device
 from .args import In, Out, InOut
 
-from .intrinsic_wrapper import (
-    all_sync,
-    any_sync,
-    eq_sync,
-    ballot_sync,
-)
 
 from .kernels import reduction
 from numba.cuda.cudadrv.linkable_code import (
numba_cuda/numba/cuda/dispatcher.py
@@ -15,6 +15,8 @@ import uuid
 import re
 from warnings import warn
 
+from cuda.core.experimental import launch
+
 from numba.cuda.core import errors
 from numba.cuda import serialize, utils
 from numba import cuda
@@ -39,6 +41,7 @@ from numba.cuda.compiler import (
 from numba.cuda.core import sigutils, config, entrypoints
 from numba.cuda.flags import Flags
 from numba.cuda.cudadrv import driver, nvvm
+from cuda.core.experimental import LaunchConfig
 from numba.cuda.locks import module_init_lock
 from numba.cuda.core.caching import Cache, CacheImpl, NullCache
 from numba.cuda.descriptor import cuda_target
@@ -475,18 +478,15 @@ class _Kernel(serialize.ReduceMixin):
         for t, v in zip(self.argument_types, args):
             self._prepare_args(t, v, stream, retr, kernelargs)
 
-        stream_handle = driver._stream_handle(stream)
-
         # Invoke kernel
-        driver.launch_kernel(
-            cufunc.handle,
-            *griddim,
-            *blockdim,
-            sharedmem,
-            stream_handle,
-            kernelargs,
-            cooperative=self.cooperative,
+        config = LaunchConfig(
+            grid=griddim,
+            block=blockdim,
+            shmem_size=sharedmem,
+            cooperative_launch=self.cooperative,
         )
+        kernel = cufunc.kernel
+        launch(stream, config, kernel, *kernelargs)
 
         if self.debug:
             driver.device_to_host(ctypes.addressof(excval), excmem, excsz)
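Kernel launches now go through cuda.core's launch API instead of a hand-rolled cuLaunchKernel wrapper. A rough standalone sketch of that API, following cuda.core's documented Program/launch flow (the trivial kernel source here is illustrative, not from numba-cuda):

from cuda.core.experimental import (
    Device, LaunchConfig, Program, ProgramOptions, launch
)

src = 'extern "C" __global__ void noop() {}'

dev = Device()
dev.set_current()
stream = dev.create_stream()

arch = "sm_" + "".join(str(d) for d in dev.compute_capability)
prog = Program(src, code_type="c++", options=ProgramOptions(arch=arch))
kernel = prog.compile("cubin").get_kernel("noop")

# Same call shape as the dispatcher now uses:
# launch(stream, config, kernel, *kernelargs)
config = LaunchConfig(grid=(1,), block=(32,), shmem_size=0)
launch(stream, config, kernel)
stream.sync()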
@@ -540,30 +540,26 @@ class _Kernel(serialize.ReduceMixin):
 
         if isinstance(ty, types.Array):
             devary = wrap_arg(val).to_device(retr, stream)
-            c_intp = ctypes.c_ssize_t
-
-            meminfo = ctypes.c_void_p(0)
-            parent = ctypes.c_void_p(0)
-            nitems = c_intp(devary.size)
-            itemsize = c_intp(devary.dtype.itemsize)
-
-            ptr = driver.device_pointer(devary)
-
-            ptr = int(ptr)
 
-            data = ctypes.c_void_p(ptr)
+            meminfo = 0
+            parent = 0
 
             kernelargs.append(meminfo)
             kernelargs.append(parent)
-            kernelargs.append(nitems)
-            kernelargs.append(itemsize)
-            kernelargs.append(data)
-            kernelargs.extend(map(c_intp, devary.shape))
-            kernelargs.extend(map(c_intp, devary.strides))
+
+            # Non-pointer arguments without ctypes might be dicey, since
+            # we're assuming shape, strides, size, and itemsize fit into
+            # intptr_t. However, this saves a noticeable amount of overhead
+            # in kernel invocation.
+            kernelargs.append(devary.size)
+            kernelargs.append(devary.dtype.itemsize)
+            kernelargs.append(devary.device_ctypes_pointer.value)
+            kernelargs.extend(devary.shape)
+            kernelargs.extend(devary.strides)
 
         elif isinstance(ty, types.CPointer):
             # Pointer arguments should be a pointer-sized integer
-            kernelargs.append(ctypes.c_uint64(val))
+            kernelargs.append(val)
 
         elif isinstance(ty, types.Integer):
             cval = getattr(ctypes, "c_%s" % ty)(val)
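The net effect is that an N-dimensional array argument is flattened into plain Python integers: meminfo, parent, nitems, itemsize, the data pointer, then shape and strides. An illustration for a hypothetical 4x8 float32 array (the pointer value is made up):

import numpy as np

arr = np.zeros((4, 8), dtype=np.float32)
data_ptr = 0x7F0000000000  # stands in for devary.device_ctypes_pointer.value

kernelargs = [
    0,                   # meminfo (NULL: the kernel does not own the data)
    0,                   # parent (NULL)
    arr.size,            # nitems = 32
    arr.dtype.itemsize,  # itemsize = 4
    data_ptr,            # device data pointer
    *arr.shape,          # 4, 8
    *arr.strides,        # 32, 4 (in bytes, C-contiguous)
]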
@@ -582,8 +578,7 @@
             kernelargs.append(cval)
 
         elif ty == types.boolean:
-            cval = ctypes.c_uint8(int(val))
-            kernelargs.append(cval)
+            kernelargs.append(val)
 
         elif ty == types.complex64:
             kernelargs.append(ctypes.c_float(val.real))
@@ -598,8 +593,7 @@
 
         elif isinstance(ty, types.Record):
             devrec = wrap_arg(val).to_device(retr, stream)
-            ptr = devrec.device_ctypes_pointer
-            kernelargs.append(ptr)
+            kernelargs.append(devrec.device_ctypes_pointer.value)
 
         elif isinstance(ty, types.BaseTuple):
             assert len(ty) == len(val)
@@ -671,7 +665,7 @@ class _LaunchConfiguration:
         self.dispatcher = dispatcher
         self.griddim = griddim
         self.blockdim = blockdim
-        self.stream = stream
+        self.stream = driver._to_core_stream(stream)
         self.sharedmem = sharedmem
 
         if (
@@ -700,6 +694,16 @@ class _LaunchConfiguration:
             args, self.griddim, self.blockdim, self.stream, self.sharedmem
         )
 
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["stream"] = int(state["stream"].handle)
+        return state
+
+    def __setstate__(self, state):
+        handle = state.pop("stream")
+        self.__dict__.update(state)
+        self.stream = driver._to_core_stream(handle)
+
 
 class CUDACacheImpl(CacheImpl):
     def reduce(self, kernel):
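The __getstate__/__setstate__ pair exists because the cuda.core Stream held in self.stream is not picklable, so it is swapped for its integer handle during serialization and rebuilt on load. A generic, runnable sketch of the same pattern (FakeStream and to_stream stand in for the core Stream and driver._to_core_stream):

import pickle

class FakeStream:
    """Stand-in for a cuda.core Stream: exposes .handle, not picklable."""
    def __init__(self, handle):
        self.handle = handle
    def __reduce__(self):
        raise TypeError("streams are not picklable")

def to_stream(handle):  # stand-in for driver._to_core_stream
    return FakeStream(handle)

class Config:
    def __init__(self, stream):
        self.stream = stream

    def __getstate__(self):
        state = self.__dict__.copy()
        state["stream"] = int(state["stream"].handle)  # keep only the handle
        return state

    def __setstate__(self, state):
        handle = state.pop("stream")
        self.__dict__.update(state)
        self.stream = to_stream(handle)  # rebuild the stream wrapper

cfg = pickle.loads(pickle.dumps(Config(FakeStream(42))))
assert cfg.stream.handle == 42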
numba_cuda/numba/cuda/intrinsics.py
@@ -6,7 +6,11 @@ from llvmlite import ir
 from numba import cuda
 from numba.cuda import types
 from numba.cuda import cgutils
-from numba.cuda.core.errors import RequireLiteralValue, TypingError
+from numba.cuda.core.errors import (
+    RequireLiteralValue,
+    TypingError,
+    NumbaTypeError,
+)
 from numba.cuda.typing import signature
 from numba.cuda.extending import overload_attribute, overload_method
 from numba.cuda import nvvmutils
@@ -380,3 +384,148 @@ def shfl_sync_intrinsic(
     sig = signature(a_type, membermask_type, a_type, b_type)
 
     return sig, codegen
+
+
+# -------------------------------------------------------------------------------
+# Warp vote functions
+#
+# References:
+#
+# - https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-vote-functions
+# - https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html?highlight=data%2520movement#vote
+#
+# Notes:
+#
+# - The NVVM IR specification requires the mode parameter to be a
+#   constant. It's therefore essential that we pass mode values to
+#   vote_sync_intrinsic as compile-time constants.
+
+
+@intrinsic
+def all_sync(typingctx, mask_type, predicate_type):
+    """
+    If for all threads in the masked warp the predicate is true, then
+    a non-zero value is returned, otherwise 0 is returned.
+    """
+    mode_value = 0
+    sig, codegen_inner = vote_sync_intrinsic(
+        typingctx, mask_type, mode_value, predicate_type
+    )
+
+    def codegen(context, builder, sig_outer, args):
+        # Call vote_sync_intrinsic and extract the boolean result (index 1)
+        result_tuple = codegen_inner(context, builder, sig, args)
+        return builder.extract_value(result_tuple, 1)
+
+    sig_outer = signature(types.b1, mask_type, predicate_type)
+    return sig_outer, codegen
+
+
+@intrinsic
+def any_sync(typingctx, mask_type, predicate_type):
+    """
+    If for any thread in the masked warp the predicate is true, then
+    a non-zero value is returned, otherwise 0 is returned.
+    """
+    mode_value = 1
+    sig, codegen_inner = vote_sync_intrinsic(
+        typingctx, mask_type, mode_value, predicate_type
+    )
+
+    def codegen(context, builder, sig_outer, args):
+        result_tuple = codegen_inner(context, builder, sig, args)
+        return builder.extract_value(result_tuple, 1)
+
+    sig_outer = signature(types.b1, mask_type, predicate_type)
+    return sig_outer, codegen
+
+
+@intrinsic
+def eq_sync(typingctx, mask_type, predicate_type):
+    """
+    If for all threads in the masked warp the boolean predicate is the same,
+    then a non-zero value is returned, otherwise 0 is returned.
+    """
+    mode_value = 2
+    sig, codegen_inner = vote_sync_intrinsic(
+        typingctx, mask_type, mode_value, predicate_type
+    )
+
+    def codegen(context, builder, sig_outer, args):
+        result_tuple = codegen_inner(context, builder, sig, args)
+        return builder.extract_value(result_tuple, 1)
+
+    sig_outer = signature(types.b1, mask_type, predicate_type)
+    return sig_outer, codegen
+
+
+@intrinsic
+def ballot_sync(typingctx, mask_type, predicate_type):
+    """
+    Returns a mask of all threads in the warp whose predicate is true,
+    and are within the given mask.
+    """
+    mode_value = 3
+    sig, codegen_inner = vote_sync_intrinsic(
+        typingctx, mask_type, mode_value, predicate_type
+    )
+
+    def codegen(context, builder, sig_outer, args):
+        result_tuple = codegen_inner(context, builder, sig, args)
+        return builder.extract_value(
+            result_tuple, 0
+        )  # Extract ballot result (index 0)
+
+    sig_outer = signature(types.i4, mask_type, predicate_type)
+    return sig_outer, codegen
+
+
+def vote_sync_intrinsic(typingctx, mask_type, mode_value, predicate_type):
+    # Validate mode value
+    if mode_value not in (0, 1, 2, 3):
+        raise ValueError("Mode must be 0 (all), 1 (any), 2 (eq), or 3 (ballot)")
+
+    if types.unliteral(mask_type) not in types.integer_domain:
+        raise NumbaTypeError(f"Mask type must be an integer. Got {mask_type}")
+    predicate_types = types.integer_domain | {types.boolean}
+
+    if types.unliteral(predicate_type) not in predicate_types:
+        raise NumbaTypeError(
+            f"Predicate must be an integer or boolean. Got {predicate_type}"
+        )
+
+    def codegen(context, builder, sig, args):
+        mask, predicate = args
+
+        # Types
+        i1 = ir.IntType(1)
+        i32 = ir.IntType(32)
+
+        # NVVM intrinsic definition
+        arg_types = (i32, i32, i1)
+        vote_return_type = ir.LiteralStructType((i32, i1))
+        fnty = ir.FunctionType(vote_return_type, arg_types)
+
+        fname = "llvm.nvvm.vote.sync"
+        lmod = builder.module
+        vote_sync = cgutils.get_or_insert_function(lmod, fnty, fname)
+
+        # Intrinsic arguments
+        mode = ir.Constant(i32, mode_value)
+        mask_i32 = builder.trunc(mask, i32)
+
+        # Convert predicate to i1
+        if predicate.type != ir.IntType(1):
+            predicate_bool = builder.icmp_signed(
+                "!=", predicate, ir.Constant(predicate.type, 0)
+            )
+        else:
+            predicate_bool = predicate
+
+        return builder.call(vote_sync, [mask_i32, mode, predicate_bool])
+
+    sig = signature(
+        types.Tuple((types.i4, types.b1)), mask_type, predicate_type
+    )
+
+    return sig, codegen
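With the lowering above and the device_init changes earlier in this diff, the vote operations keep their public names on the cuda module. A short usage sketch (expected results follow the CUDA warp-vote semantics):

from numba import cuda
import numpy as np

@cuda.jit
def vote_demo(flags, out_all, out_any, out_ballot):
    i = cuda.grid(1)
    mask = 0xFFFFFFFF
    pred = flags[i] != 0
    out_all[i] = cuda.all_sync(mask, pred)
    out_any[i] = cuda.any_sync(mask, pred)
    out_ballot[i] = cuda.ballot_sync(mask, pred)

flags = np.array([1] * 31 + [0], dtype=np.int32)
out_all = np.zeros(32, dtype=np.int32)
out_any = np.zeros(32, dtype=np.int32)
out_ballot = np.zeros(32, dtype=np.int32)

vote_demo[1, 32](flags, out_all, out_any, out_ballot)
# all_sync    -> 0 everywhere (lane 31's predicate is false)
# any_sync    -> 1 everywhere
# ballot_sync -> bitmask of lanes whose predicate is true (0x7FFFFFFF)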