numba-cuda 0.0.17__py3-none-any.whl → 0.0.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/codegen.py +15 -3
  3. numba_cuda/numba/cuda/cuda_paths.py +68 -0
  4. numba_cuda/numba/cuda/cudadrv/devicearray.py +4 -1
  5. numba_cuda/numba/cuda/cudadrv/driver.py +209 -47
  6. numba_cuda/numba/cuda/cudadrv/enums.py +3 -0
  7. numba_cuda/numba/cuda/cudadrv/libs.py +38 -0
  8. numba_cuda/numba/cuda/cudadrv/linkable_code.py +63 -0
  9. numba_cuda/numba/cuda/cudadrv/mappings.py +24 -0
  10. numba_cuda/numba/cuda/cudadrv/nvrtc.py +9 -4
  11. numba_cuda/numba/cuda/device_init.py +3 -0
  12. numba_cuda/numba/cuda/dispatcher.py +48 -8
  13. numba_cuda/numba/cuda/intrinsics.py +6 -1
  14. numba_cuda/numba/cuda/runtime/nrt.cu +190 -0
  15. numba_cuda/numba/cuda/simulator/api.py +14 -0
  16. numba_cuda/numba/cuda/target.py +8 -2
  17. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +199 -0
  18. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +44 -4
  19. numba_cuda/numba/cuda/tests/cudapy/test_print.py +2 -2
  20. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +48 -0
  21. numba_cuda/numba/cuda/tests/nrt/__init__.py +8 -0
  22. numba_cuda/numba/cuda/tests/nrt/mock_numpy.py +42 -0
  23. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +110 -0
  24. numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +51 -0
  25. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +170 -0
  26. numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu +19 -0
  27. numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu +3 -0
  28. {numba_cuda-0.0.17.dist-info → numba_cuda-0.0.19.dist-info}/METADATA +1 -1
  29. {numba_cuda-0.0.17.dist-info → numba_cuda-0.0.19.dist-info}/RECORD +32 -20
  30. {numba_cuda-0.0.17.dist-info → numba_cuda-0.0.19.dist-info}/WHEEL +1 -1
  31. {numba_cuda-0.0.17.dist-info → numba_cuda-0.0.19.dist-info}/LICENSE +0 -0
  32. {numba_cuda-0.0.17.dist-info → numba_cuda-0.0.19.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/dispatcher.py

@@ -1,5 +1,6 @@
  import numpy as np
  import os
+ import re
  import sys
  import ctypes
  import functools
@@ -43,10 +44,25 @@ class _Kernel(serialize.ReduceMixin):
      object launches the kernel on the device.
      '''

+     NRT_functions = [
+         "NRT_Allocate",
+         "NRT_MemInfo_init",
+         "NRT_MemInfo_new",
+         "NRT_Free",
+         "NRT_dealloc",
+         "NRT_MemInfo_destroy",
+         "NRT_MemInfo_call_dtor",
+         "NRT_MemInfo_data_fast",
+         "NRT_MemInfo_alloc_aligned",
+         "NRT_Allocate_External",
+         "NRT_decref",
+         "NRT_incref"
+     ]
+
      @global_compiler_lock
      def __init__(self, py_func, argtypes, link=None, debug=False,
                   lineinfo=False, inline=False, fastmath=False, extensions=None,
-                  max_registers=None, opt=True, device=False):
+                  max_registers=None, lto=False, opt=True, device=False):

          if device:
              raise RuntimeError('Cannot compile a device function as a kernel')
@@ -94,7 +110,7 @@ class _Kernel(serialize.ReduceMixin):
          lib, kernel = tgt_ctx.prepare_cuda_kernel(cres.library, cres.fndesc,
                                                    debug, lineinfo, nvvm_options,
                                                    filename, linenum,
-                                                   max_registers)
+                                                   max_registers, lto)

          if not link:
              link = []
@@ -105,16 +121,20 @@
          if self.cooperative:
              lib.needs_cudadevrt = True

+         basedir = os.path.dirname(os.path.abspath(__file__))
+         asm = lib.get_asm_str()
+
          res = [fn for fn in cuda_fp16_math_funcs
-                if (f'__numba_wrapper_{fn}' in lib.get_asm_str())]
+                if (f'__numba_wrapper_{fn}' in asm)]

          if res:
              # Path to the source containing the foreign function
-             basedir = os.path.dirname(os.path.abspath(__file__))
              functions_cu_path = os.path.join(basedir,
                                               'cpp_function_wrappers.cu')
              link.append(functions_cu_path)

+         link = self.maybe_link_nrt(link, tgt_ctx, asm)
+
          for filepath in link:
              lib.add_linking_file(filepath)

@@ -136,6 +156,25 @@
          self.lifted = []
          self.reload_init = []

+     def maybe_link_nrt(self, link, tgt_ctx, asm):
+         if not tgt_ctx.enable_nrt:
+             return link
+
+         all_nrt = "|".join(self.NRT_functions)
+         pattern = (
+             r'\.extern\s+\.func\s+(?:\s*\(.+\)\s*)?('
+             + all_nrt + r')\s*\([^)]*\)\s*;'
+         )
+
+         nrt_in_asm = re.findall(pattern, asm)
+
+         basedir = os.path.dirname(os.path.abspath(__file__))
+         if nrt_in_asm:
+             nrt_path = os.path.join(basedir, 'runtime', 'nrt.cu')
+             link.append(nrt_path)
+
+         return link
+
      @property
      def library(self):
          return self._codelibrary
@@ -385,7 +424,6 @@ class _Kernel(serialize.ReduceMixin):

          if isinstance(ty, types.Array):
              devary = wrap_arg(val).to_device(retr, stream)
-
              c_intp = ctypes.c_ssize_t

              meminfo = ctypes.c_void_p(0)
@@ -519,7 +557,10 @@ class _LaunchConfiguration:
          self.stream = stream
          self.sharedmem = sharedmem

-         if config.CUDA_LOW_OCCUPANCY_WARNINGS:
+         if (
+             config.CUDA_LOW_OCCUPANCY_WARNINGS
+             and not config.DISABLE_PERFORMANCE_WARNINGS
+         ):
              # Warn when the grid has fewer than 128 blocks. This number is
              # chosen somewhat heuristically - ideally the minimum is 2 times
              # the number of SMs, but the number of SMs varies between devices -
@@ -708,8 +749,7 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin):
          *args*.
          '''
          cc = get_current_device().compute_capability
-         argtypes = tuple(
-             [self.typingctx.resolve_argument_type(a) for a in args])
+         argtypes = tuple(self.typeof_pyval(a) for a in args)
          if self.specialized:
              raise RuntimeError('Dispatcher already specialized')

numba_cuda/numba/cuda/intrinsics.py

@@ -4,7 +4,7 @@ from numba import cuda, types
  from numba.core import cgutils
  from numba.core.errors import RequireLiteralValue
  from numba.core.typing import signature
- from numba.core.extending import overload_attribute
+ from numba.core.extending import overload_attribute, overload_method
  from numba.cuda import nvvmutils
  from numba.cuda.extending import intrinsic

@@ -196,3 +196,8 @@ def syncthreads_or(typingctx, predicate):
      '''
      fname = 'llvm.nvvm.barrier0.or'
      return _syncthreads_predicate(typingctx, predicate, fname)
+
+
+ @overload_method(types.Integer, 'bit_count', target='cuda')
+ def integer_bit_count(i):
+     return lambda i: cuda.popc(i)
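
The new overload makes Python's int.bit_count() method usable inside device code by lowering it to cuda.popc. A minimal usage sketch, mirroring the new tests in test_intrinsics.py (assumes a CUDA-capable device and the overload registered above):

    from numba import cuda
    import numpy as np

    @cuda.jit("void(int32[:], uint32)")
    def count_bits(out, value):
        # On the CUDA target, bit_count() on an integer lowers to cuda.popc.
        out[0] = value.bit_count()

    out = np.zeros(1, dtype=np.int32)
    count_bits[1, 1](out, np.uint32(0xFFFFFFFF))
    assert out[0] == 32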
numba_cuda/numba/cuda/runtime/nrt.cu (new file)

@@ -0,0 +1,190 @@
+ #ifndef _NRT_H
+ #define _NRT_H
+
+ #include <cuda/atomic>
+
+ typedef void (*NRT_dtor_function)(void* ptr, size_t size, void* info);
+ typedef void (*NRT_dealloc_func)(void* ptr, void* dealloc_info);
+
+ typedef struct MemInfo NRT_MemInfo;
+
+ extern "C" {
+ struct MemInfo {
+     cuda::atomic<size_t, cuda::thread_scope_device> refct;
+     NRT_dtor_function dtor;
+     void* dtor_info;
+     void* data;
+     size_t size;
+ };
+ }
+
+ // Globally needed variables
+ struct NRT_MemSys {
+     struct {
+         bool enabled;
+         cuda::atomic<size_t, cuda::thread_scope_device> alloc;
+         cuda::atomic<size_t, cuda::thread_scope_device> free;
+         cuda::atomic<size_t, cuda::thread_scope_device> mi_alloc;
+         cuda::atomic<size_t, cuda::thread_scope_device> mi_free;
+     } stats;
+ };
+
+ static __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo **mi);
+ static __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out);
+ extern "C" __device__ void* NRT_Allocate_External(size_t size);
+
+ /* The Memory System object */
+ __device__ NRT_MemSys* TheMSys;
+
+ extern "C" __device__ void* NRT_Allocate(size_t size)
+ {
+     void* ptr = NULL;
+     ptr = malloc(size);
+     // if (TheMSys->stats.enabled) { TheMSys->stats.alloc++; }
+     return ptr;
+ }
+
+ extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
+                                             void* data,
+                                             size_t size,
+                                             NRT_dtor_function dtor,
+                                             void* dtor_info)
+                                             // NRT_MemSys* TheMSys)
+ {
+     mi->refct = 1; /* starts with 1 refct */
+     mi->dtor = dtor;
+     mi->dtor_info = dtor_info;
+     mi->data = data;
+     mi->size = size;
+     // if (TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; }
+ }
+
+ extern "C"
+ __device__ NRT_MemInfo* NRT_MemInfo_new(
+     void* data, size_t size, NRT_dtor_function dtor, void* dtor_info)
+ {
+     NRT_MemInfo* mi = (NRT_MemInfo*)NRT_Allocate(sizeof(NRT_MemInfo));
+     if (mi != NULL) { NRT_MemInfo_init(mi, data, size, dtor, dtor_info); }
+     return mi;
+ }
+
+ extern "C" __device__ void NRT_Free(void* ptr)
+ {
+     free(ptr);
+     //if (TheMSys->stats.enabled) { TheMSys->stats.free++; }
+ }
+
+ extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
+ {
+     NRT_Free(mi);
+ }
+
+ extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi)
+ {
+     NRT_dealloc(mi);
+     //if (TheMSys->stats.enabled) { TheMSys->stats.mi_free++; }
+ }
+ extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi)
+ {
+     if (mi->dtor) /* We have a destructor */
+         mi->dtor(mi->data, mi->size, NULL);
+     /* Clear and release MemInfo */
+     NRT_MemInfo_destroy(mi);
+ }
+
+ extern "C" __device__ void* NRT_MemInfo_data_fast(NRT_MemInfo *mi)
+ {
+     return mi->data;
+ }
+
+ extern "C" __device__ NRT_MemInfo *NRT_MemInfo_alloc_aligned(size_t size, unsigned align) {
+     NRT_MemInfo *mi = NULL;
+     void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi);
+     if (data == NULL) {
+         return NULL; /* return early as allocation failed */
+     }
+     //NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_aligned %p\n", data));
+     NRT_MemInfo_init(mi, data, size, NULL, NULL);
+     return mi;
+ }
+
+ static
+ __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align,
+                                                      NRT_MemInfo **mi)
+ {
+     size_t offset = 0, intptr = 0, remainder = 0;
+     //NRT_Debug(nrt_debug_print("nrt_allocate_meminfo_and_data_align %p\n", allocator));
+     char *base = (char *)nrt_allocate_meminfo_and_data(size + 2 * align, mi);
+     if (base == NULL) {
+         return NULL; /* return early as allocation failed */
+     }
+     intptr = (size_t) base;
+     /*
+      * See if the allocation is aligned already...
+      * Check if align is a power of 2, if so the modulo can be avoided.
+      */
+     if((align & (align - 1)) == 0)
+     {
+         remainder = intptr & (align - 1);
+     }
+     else
+     {
+         remainder = intptr % align;
+     }
+     if (remainder == 0){ /* Yes */
+         offset = 0;
+     } else { /* No, move forward `offset` bytes */
+         offset = align - remainder;
+     }
+     return (void*)((char *)base + offset);
+ }
+
+ static
+ __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out) {
+     NRT_MemInfo *mi = NULL;
+     //NRT_Debug(nrt_debug_print("nrt_allocate_meminfo_and_data %p\n", allocator));
+     char *base = (char *)NRT_Allocate_External(sizeof(NRT_MemInfo) + size);
+     if (base == NULL) {
+         *mi_out = NULL; /* set meminfo to NULL as allocation failed */
+         return NULL; /* return early as allocation failed */
+     }
+     mi = (NRT_MemInfo *) base;
+     *mi_out = mi;
+     return (void*)((char *)base + sizeof(NRT_MemInfo));
+ }
+
+ extern "C" __device__ void* NRT_Allocate_External(size_t size) {
+     void *ptr = NULL;
+     ptr = malloc(size);
+     //NRT_Debug(nrt_debug_print("NRT_Allocate_External bytes=%zu ptr=%p\n", size, ptr));
+
+     //if (TheMSys.stats.enabled)
+     //{
+     //    TheMSys.stats.alloc++;
+     //}
+     return ptr;
+ }
+
+
+ /*
+ c++ version of the NRT_decref function that usually is added to
+ the final kernel link in PTX form by numba. This version may be
+ used by c++ APIs that accept ownership of live objects and must
+ manage them going forward.
+ */
+ extern "C" __device__ void NRT_decref(NRT_MemInfo* mi)
+ {
+     if (mi != NULL) {
+         mi->refct--;
+         if (mi->refct == 0) { NRT_MemInfo_call_dtor(mi); }
+     }
+ }
+
+ #endif
+
+ extern "C" __device__ void NRT_incref(NRT_MemInfo* mi)
+ {
+     if (mi != NULL) {
+         mi->refct++;
+     }
+ }
numba_cuda/numba/cuda/simulator/api.py

@@ -35,6 +35,20 @@ class stream(object):
      pass


+ # Default stream APIs. Since execution from the perspective of the host is
+ # synchronous in the simulator, these can be the same as the stream class.
+ default_stream = stream
+ legacy_default_stream = stream
+ per_thread_default_stream = stream
+
+
+ # There is no way to use external streams with the simulator. Since the
+ # implementation is not really using streams, we can't meaningfully interact
+ # with external ones.
+ def external_stream(ptr):
+     raise RuntimeError("External streams are unsupported in the simulator")
+
+
  def synchronize():
      pass

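These additions let code written against the stream-creation API also run under the simulator. A short sketch of the calls involved (names follow the main numba.cuda namespace; under the simulator every call returns the same no-op stream object, and external_stream raises):

    from numba import cuda

    s_default = cuda.default_stream()
    s_legacy = cuda.legacy_default_stream()
    s_per_thread = cuda.per_thread_default_stream()

    # Wrapping a foreign stream handle is only meaningful on real hardware;
    # under the simulator this raises RuntimeError.
    try:
        cuda.external_stream(0)
    except RuntimeError:
        pass
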
numba_cuda/numba/cuda/target.py

@@ -74,6 +74,10 @@ class CUDATargetContext(BaseContext):
              datamodel.default_manager
          )

+     @property
+     def enable_nrt(self):
+         return getattr(config, 'CUDA_ENABLE_NRT', False)
+
      @property
      def DIBuilder(self):
          return debuginfo.DIBuilder
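
This property keeps NRT linking opt-in: _Kernel.maybe_link_nrt only attaches runtime/nrt.cu when it returns a truthy value. A hedged sketch of flipping the switch (config.CUDA_ENABLE_NRT is what the code above reads; the NUMBA_CUDA_ENABLE_NRT environment-variable spelling is an assumption based on Numba's usual NUMBA_* mapping, not something this diff confirms):

    import os

    # Assumed environment-variable name (Numba convention: NUMBA_<config attr>).
    os.environ["NUMBA_CUDA_ENABLE_NRT"] = "1"

    from numba import config, cuda

    # Equivalent programmatic switch; this attribute is what enable_nrt reads.
    config.CUDA_ENABLE_NRT = True

    # Kernels compiled from here on will link runtime/nrt.cu whenever their
    # PTX references any of the NRT_* functions listed in _Kernel.NRT_functions.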
@@ -148,7 +152,7 @@ class CUDATargetContext(BaseContext):

      def prepare_cuda_kernel(self, codelib, fndesc, debug, lineinfo,
                              nvvm_options, filename, linenum,
-                             max_registers=None):
+                             max_registers=None, lto=False):
          """
          Adapt a code library ``codelib`` with the numba compiled CUDA kernel
          with name ``fname`` and arguments ``argtypes`` for NVVM.
@@ -175,7 +179,9 @@ class CUDATargetContext(BaseContext):
          library = self.codegen().create_library(f'{codelib.name}_kernel_',
                                                  entry_name=kernel_name,
                                                  nvvm_options=nvvm_options,
-                                                 max_registers=max_registers)
+                                                 max_registers=max_registers,
+                                                 lto=lto
+                                                 )
          library.add_linking_library(codelib)
          wrapper = self.generate_kernel_wrapper(library, fndesc, kernel_name,
                                                 debug, lineinfo, filename,
numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py (new file)

@@ -0,0 +1,199 @@
+ from numba.cuda.testing import unittest
+ from numba.cuda.testing import skip_on_cudasim
+ from numba.cuda.testing import CUDATestCase
+ from numba.cuda.cudadrv.driver import PyNvJitLinker
+
+ import itertools
+ import os
+ from numba.cuda import get_current_device
+ from numba import cuda
+ from numba import config
+
+ TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
+ if TEST_BIN_DIR:
+     test_device_functions_a = os.path.join(
+         TEST_BIN_DIR, "test_device_functions.a"
+     )
+     test_device_functions_cubin = os.path.join(
+         TEST_BIN_DIR, "test_device_functions.cubin"
+     )
+     test_device_functions_cu = os.path.join(
+         TEST_BIN_DIR, "test_device_functions.cu"
+     )
+     test_device_functions_fatbin = os.path.join(
+         TEST_BIN_DIR, "test_device_functions.fatbin"
+     )
+     test_device_functions_o = os.path.join(
+         TEST_BIN_DIR, "test_device_functions.o"
+     )
+     test_device_functions_ptx = os.path.join(
+         TEST_BIN_DIR, "test_device_functions.ptx"
+     )
+     test_device_functions_ltoir = os.path.join(
+         TEST_BIN_DIR, "test_device_functions.ltoir"
+     )
+
+
+ @unittest.skipIf(
+     not config.CUDA_ENABLE_PYNVJITLINK or not TEST_BIN_DIR,
+     "pynvjitlink not enabled"
+ )
+ @skip_on_cudasim("Linking unsupported in the simulator")
+ class TestLinker(CUDATestCase):
+     _NUMBA_NVIDIA_BINDING_0_ENV = {"NUMBA_CUDA_USE_NVIDIA_BINDING": "0"}
+
+     def test_nvjitlink_create(self):
+         patched_linker = PyNvJitLinker(cc=(7, 5))
+         assert "-arch=sm_75" in patched_linker.options
+
+     def test_nvjitlink_create_no_cc_error(self):
+         # nvJitLink expects at least the architecture to be specified.
+         with self.assertRaisesRegex(
+             RuntimeError, "PyNvJitLinker requires CC to be specified"
+         ):
+             PyNvJitLinker()
+
+     def test_nvjitlink_invalid_arch_error(self):
+         from pynvjitlink.api import NvJitLinkError
+
+         # CC 0.0 is not a valid compute capability
+         with self.assertRaisesRegex(
+             NvJitLinkError, "NVJITLINK_ERROR_UNRECOGNIZED_OPTION error"
+         ):
+             PyNvJitLinker(cc=(0, 0))
+
+     def test_nvjitlink_invalid_cc_type_error(self):
+         with self.assertRaisesRegex(
+             TypeError, "`cc` must be a list or tuple of length 2"
+         ):
+             PyNvJitLinker(cc=0)
+
+     def test_nvjitlink_ptx_compile_options(self):
+
+         max_registers = (None, 32)
+         lineinfo = (False, True)
+         lto = (False, True)
+         additional_flags = (None, ("-g",), ("-g", "-time"))
+         for (
+             max_registers_i,
+             line_info_i,
+             lto_i,
+             additional_flags_i,
+         ) in itertools.product(max_registers, lineinfo, lto, additional_flags):
+             with self.subTest(
+                 max_registers=max_registers_i,
+                 lineinfo=line_info_i,
+                 lto=lto_i,
+                 additional_flags=additional_flags_i,
+             ):
+                 patched_linker = PyNvJitLinker(
+                     cc=(7, 5),
+                     max_registers=max_registers_i,
+                     lineinfo=line_info_i,
+                     lto=lto_i,
+                     additional_flags=additional_flags_i,
+                 )
+                 assert "-arch=sm_75" in patched_linker.options
+
+                 if max_registers_i:
+                     assert (
+                         f"-maxrregcount={max_registers_i}"
+                         in patched_linker.options
+                     )
+                 else:
+                     assert "-maxrregcount" not in patched_linker.options
+
+                 if line_info_i:
+                     assert "-lineinfo" in patched_linker.options
+                 else:
+                     assert "-lineinfo" not in patched_linker.options
+
+                 if lto_i:
+                     assert "-lto" in patched_linker.options
+                 else:
+                     assert "-lto" not in patched_linker.options
+
+                 if additional_flags_i:
+                     for flag in additional_flags_i:
+                         assert flag in patched_linker.options
+
+     def test_nvjitlink_add_file_guess_ext_linkable_code(self):
+         files = (
+             test_device_functions_a,
+             test_device_functions_cubin,
+             test_device_functions_cu,
+             test_device_functions_fatbin,
+             test_device_functions_o,
+             test_device_functions_ptx,
+         )
+         for file in files:
+             with self.subTest(file=file):
+                 patched_linker = PyNvJitLinker(
+                     cc=get_current_device().compute_capability
+                 )
+                 patched_linker.add_file_guess_ext(file)
+
+     def test_nvjitlink_test_add_file_guess_ext_invalid_input(self):
+         with open(test_device_functions_cubin, "rb") as f:
+             content = f.read()
+
+         patched_linker = PyNvJitLinker(
+             cc=get_current_device().compute_capability
+         )
+         with self.assertRaisesRegex(
+             TypeError, "Expected path to file or a LinkableCode"
+         ):
+             # Feeding raw data as bytes to add_file_guess_ext should raise,
+             # because there's no way to know what kind of file to treat it as
+             patched_linker.add_file_guess_ext(content)
+
+     def test_nvjitlink_jit_with_linkable_code(self):
+         files = (
+             test_device_functions_a,
+             test_device_functions_cubin,
+             test_device_functions_cu,
+             test_device_functions_fatbin,
+             test_device_functions_o,
+             test_device_functions_ptx,
+         )
+         for file in files:
+             with self.subTest(file=file):
+                 sig = "uint32(uint32, uint32)"
+                 add_from_numba = cuda.declare_device("add_from_numba", sig)
+
+                 @cuda.jit(link=[file])
+                 def kernel(result):
+                     result[0] = add_from_numba(1, 2)
+
+                 result = cuda.device_array(1)
+                 kernel[1, 1](result)
+                 assert result[0] == 3
+
+     def test_nvjitlink_jit_with_linkable_code_lto(self):
+         file = test_device_functions_ltoir
+
+         sig = "uint32(uint32, uint32)"
+         add_from_numba = cuda.declare_device("add_from_numba", sig)
+
+         @cuda.jit(link=[file], lto=True)
+         def kernel(result):
+             result[0] = add_from_numba(1, 2)
+
+         result = cuda.device_array(1)
+         kernel[1, 1](result)
+         assert result[0] == 3
+
+     def test_nvjitlink_jit_with_invalid_linkable_code(self):
+         with open(test_device_functions_cubin, "rb") as f:
+             content = f.read()
+         with self.assertRaisesRegex(
+             TypeError, "Expected path to file or a LinkableCode"
+         ):
+
+             @cuda.jit("void()", link=[content])
+             def kernel():
+                 pass
+
+
+ if __name__ == "__main__":
+     unittest.main()
numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py

@@ -68,6 +68,10 @@ def simple_popc(ary, c):
      ary[0] = cuda.popc(c)


+ def simple_bit_count(ary, c):
+     ary[0] = c.bit_count()
+
+
  def simple_fma(ary, a, b, c):
      ary[0] = cuda.fma(a, b, c)

@@ -550,17 +554,53 @@ class TestCudaIntrinsic(CUDATestCase):

          self.assertTrue(np.all(arr))

+     def test_popc_u1(self):
+         compiled = cuda.jit("void(int32[:], uint8)")(simple_popc)
+         ary = np.zeros(1, dtype=np.int8)
+         compiled[1, 1](ary, np.uint8(0xFF))
+         self.assertEqual(ary[0], 8)
+
+     def test_popc_u2(self):
+         compiled = cuda.jit("void(int32[:], uint16)")(simple_popc)
+         ary = np.zeros(1, dtype=np.int16)
+         compiled[1, 1](ary, np.uint16(0xFFFF))
+         self.assertEqual(ary[0], 16)
+
      def test_popc_u4(self):
          compiled = cuda.jit("void(int32[:], uint32)")(simple_popc)
          ary = np.zeros(1, dtype=np.int32)
-         compiled[1, 1](ary, 0xF0)
-         self.assertEqual(ary[0], 4)
+         compiled[1, 1](ary, np.uint32(0xFFFFFFFF))
+         self.assertEqual(ary[0], 32)

      def test_popc_u8(self):
          compiled = cuda.jit("void(int32[:], uint64)")(simple_popc)
          ary = np.zeros(1, dtype=np.int32)
-         compiled[1, 1](ary, 0xF00000000000)
-         self.assertEqual(ary[0], 4)
+         compiled[1, 1](ary, np.uint64(0xFFFFFFFFFFFFFFFF))
+         self.assertEqual(ary[0], 64)
+
+     def test_bit_count_u1(self):
+         compiled = cuda.jit("void(int32[:], uint8)")(simple_bit_count)
+         ary = np.zeros(1, dtype=np.int8)
+         compiled[1, 1](ary, np.uint8(0xFF))
+         self.assertEqual(ary[0], 8)
+
+     def test_bit_count_u2(self):
+         compiled = cuda.jit("void(int32[:], uint16)")(simple_bit_count)
+         ary = np.zeros(1, dtype=np.int16)
+         compiled[1, 1](ary, np.uint16(0xFFFF))
+         self.assertEqual(ary[0], 16)
+
+     def test_bit_count_u4(self):
+         compiled = cuda.jit("void(int32[:], uint32)")(simple_bit_count)
+         ary = np.zeros(1, dtype=np.int32)
+         compiled[1, 1](ary, np.uint32(0xFFFFFFFF))
+         self.assertEqual(ary[0], 32)
+
+     def test_bit_count_u8(self):
+         compiled = cuda.jit("void(int32[:], uint64)")(simple_bit_count)
+         ary = np.zeros(1, dtype=np.int32)
+         compiled[1, 1](ary, np.uint64(0xFFFFFFFFFFFFFFFF))
+         self.assertEqual(ary[0], 64)

      def test_fma_f4(self):
          compiled = cuda.jit("void(f4[:], f4, f4, f4)")(simple_fma)
numba_cuda/numba/cuda/tests/cudapy/test_print.py

@@ -126,8 +126,8 @@ class TestPrint(CUDATestCase):

      def test_bool(self):
          output, _ = self.run_code(printbool_usecase)
-         expected = "True\nFalse\nTrue\nTrue\nFalse\nFalse"
-         self.assertEqual(output.strip(), expected)
+         expected = "True\r?\nFalse\r?\nTrue\r?\nTrue\r?\nFalse\r?\nFalse"
+         self.assertRegex(output.strip(), expected)

      def test_printempty(self):
          output, _ = self.run_code(printempty_usecase)