numba-cuda 0.0.17__py3-none-any.whl → 0.0.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/codegen.py +15 -3
  3. numba_cuda/numba/cuda/cuda_paths.py +68 -0
  4. numba_cuda/numba/cuda/cudadrv/devicearray.py +4 -1
  5. numba_cuda/numba/cuda/cudadrv/driver.py +209 -47
  6. numba_cuda/numba/cuda/cudadrv/enums.py +3 -0
  7. numba_cuda/numba/cuda/cudadrv/libs.py +38 -0
  8. numba_cuda/numba/cuda/cudadrv/linkable_code.py +63 -0
  9. numba_cuda/numba/cuda/cudadrv/mappings.py +24 -0
  10. numba_cuda/numba/cuda/cudadrv/nvrtc.py +9 -4
  11. numba_cuda/numba/cuda/device_init.py +3 -0
  12. numba_cuda/numba/cuda/dispatcher.py +48 -8
  13. numba_cuda/numba/cuda/intrinsics.py +6 -1
  14. numba_cuda/numba/cuda/runtime/nrt.cu +190 -0
  15. numba_cuda/numba/cuda/simulator/api.py +14 -0
  16. numba_cuda/numba/cuda/target.py +8 -2
  17. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +199 -0
  18. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +44 -4
  19. numba_cuda/numba/cuda/tests/cudapy/test_print.py +2 -2
  20. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +48 -0
  21. numba_cuda/numba/cuda/tests/nrt/__init__.py +8 -0
  22. numba_cuda/numba/cuda/tests/nrt/mock_numpy.py +42 -0
  23. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +110 -0
  24. numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +51 -0
  25. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +170 -0
  26. numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu +19 -0
  27. numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu +3 -0
  28. {numba_cuda-0.0.17.dist-info → numba_cuda-0.0.19.dist-info}/METADATA +1 -1
  29. {numba_cuda-0.0.17.dist-info → numba_cuda-0.0.19.dist-info}/RECORD +32 -20
  30. {numba_cuda-0.0.17.dist-info → numba_cuda-0.0.19.dist-info}/WHEEL +1 -1
  31. {numba_cuda-0.0.17.dist-info → numba_cuda-0.0.19.dist-info}/LICENSE +0 -0
  32. {numba_cuda-0.0.17.dist-info → numba_cuda-0.0.19.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/dispatcher.py

@@ -1,5 +1,6 @@
  import numpy as np
  import os
+ import re
  import sys
  import ctypes
  import functools
@@ -43,10 +44,25 @@ class _Kernel(serialize.ReduceMixin):
      object launches the kernel on the device.
      '''

+     NRT_functions = [
+         "NRT_Allocate",
+         "NRT_MemInfo_init",
+         "NRT_MemInfo_new",
+         "NRT_Free",
+         "NRT_dealloc",
+         "NRT_MemInfo_destroy",
+         "NRT_MemInfo_call_dtor",
+         "NRT_MemInfo_data_fast",
+         "NRT_MemInfo_alloc_aligned",
+         "NRT_Allocate_External",
+         "NRT_decref",
+         "NRT_incref"
+     ]
+
      @global_compiler_lock
      def __init__(self, py_func, argtypes, link=None, debug=False,
                   lineinfo=False, inline=False, fastmath=False, extensions=None,
-                  max_registers=None, opt=True, device=False):
+                  max_registers=None, lto=False, opt=True, device=False):

          if device:
              raise RuntimeError('Cannot compile a device function as a kernel')
@@ -94,7 +110,7 @@ class _Kernel(serialize.ReduceMixin):
          lib, kernel = tgt_ctx.prepare_cuda_kernel(cres.library, cres.fndesc,
                                                    debug, lineinfo, nvvm_options,
                                                    filename, linenum,
-                                                   max_registers)
+                                                   max_registers, lto)

          if not link:
              link = []
@@ -105,16 +121,20 @@
          if self.cooperative:
              lib.needs_cudadevrt = True

+         basedir = os.path.dirname(os.path.abspath(__file__))
+         asm = lib.get_asm_str()
+
          res = [fn for fn in cuda_fp16_math_funcs
-                if (f'__numba_wrapper_{fn}' in lib.get_asm_str())]
+                if (f'__numba_wrapper_{fn}' in asm)]

          if res:
              # Path to the source containing the foreign function
-             basedir = os.path.dirname(os.path.abspath(__file__))
              functions_cu_path = os.path.join(basedir,
                                               'cpp_function_wrappers.cu')
              link.append(functions_cu_path)

+         link = self.maybe_link_nrt(link, tgt_ctx, asm)
+
          for filepath in link:
              lib.add_linking_file(filepath)

@@ -136,6 +156,25 @@
          self.lifted = []
          self.reload_init = []

+     def maybe_link_nrt(self, link, tgt_ctx, asm):
+         if not tgt_ctx.enable_nrt:
+             return link
+
+         all_nrt = "|".join(self.NRT_functions)
+         pattern = (
+             r'\.extern\s+\.func\s+(?:\s*\(.+\)\s*)?('
+             + all_nrt + r')\s*\([^)]*\)\s*;'
+         )
+
+         nrt_in_asm = re.findall(pattern, asm)
+
+         basedir = os.path.dirname(os.path.abspath(__file__))
+         if nrt_in_asm:
+             nrt_path = os.path.join(basedir, 'runtime', 'nrt.cu')
+             link.append(nrt_path)
+
+         return link
+
      @property
      def library(self):
          return self._codelibrary
@@ -385,7 +424,6 @@ class _Kernel(serialize.ReduceMixin):

          if isinstance(ty, types.Array):
              devary = wrap_arg(val).to_device(retr, stream)
-
              c_intp = ctypes.c_ssize_t

              meminfo = ctypes.c_void_p(0)
@@ -519,7 +557,10 @@ class _LaunchConfiguration:
          self.stream = stream
          self.sharedmem = sharedmem

-         if config.CUDA_LOW_OCCUPANCY_WARNINGS:
+         if (
+             config.CUDA_LOW_OCCUPANCY_WARNINGS
+             and not config.DISABLE_PERFORMANCE_WARNINGS
+         ):
              # Warn when the grid has fewer than 128 blocks. This number is
              # chosen somewhat heuristically - ideally the minimum is 2 times
              # the number of SMs, but the number of SMs varies between devices -
@@ -708,8 +749,7 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin):
          *args*.
          '''
          cc = get_current_device().compute_capability
-         argtypes = tuple(
-             [self.typingctx.resolve_argument_type(a) for a in args])
+         argtypes = tuple(self.typeof_pyval(a) for a in args)
          if self.specialized:
              raise RuntimeError('Dispatcher already specialized')

numba_cuda/numba/cuda/intrinsics.py

@@ -4,7 +4,7 @@ from numba import cuda, types
  from numba.core import cgutils
  from numba.core.errors import RequireLiteralValue
  from numba.core.typing import signature
- from numba.core.extending import overload_attribute
+ from numba.core.extending import overload_attribute, overload_method
  from numba.cuda import nvvmutils
  from numba.cuda.extending import intrinsic

@@ -196,3 +196,8 @@ def syncthreads_or(typingctx, predicate):
      '''
      fname = 'llvm.nvvm.barrier0.or'
      return _syncthreads_predicate(typingctx, predicate, fname)
+
+
+ @overload_method(types.Integer, 'bit_count', target='cuda')
+ def integer_bit_count(i):
+     return lambda i: cuda.popc(i)
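
The new overload makes Python's int.bit_count() method usable inside device code by lowering it to cuda.popc. A minimal usage sketch, mirroring the new tests in test_intrinsics.py (assumes a CUDA-capable device and the overload registered above):

    from numba import cuda
    import numpy as np

    @cuda.jit("void(int32[:], uint32)")
    def count_bits(out, value):
        # On the CUDA target, bit_count() on an integer lowers to cuda.popc.
        out[0] = value.bit_count()

    out = np.zeros(1, dtype=np.int32)
    count_bits[1, 1](out, np.uint32(0xFFFFFFFF))
    assert out[0] == 32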
numba_cuda/numba/cuda/runtime/nrt.cu (new file)

@@ -0,0 +1,190 @@
+ #ifndef _NRT_H
+ #define _NRT_H
+
+ #include <cuda/atomic>
+
+ typedef void (*NRT_dtor_function)(void* ptr, size_t size, void* info);
+ typedef void (*NRT_dealloc_func)(void* ptr, void* dealloc_info);
+
+ typedef struct MemInfo NRT_MemInfo;
+
+ extern "C" {
+ struct MemInfo {
+     cuda::atomic<size_t, cuda::thread_scope_device> refct;
+     NRT_dtor_function dtor;
+     void* dtor_info;
+     void* data;
+     size_t size;
+ };
+ }
+
+ // Globally needed variables
+ struct NRT_MemSys {
+     struct {
+         bool enabled;
+         cuda::atomic<size_t, cuda::thread_scope_device> alloc;
+         cuda::atomic<size_t, cuda::thread_scope_device> free;
+         cuda::atomic<size_t, cuda::thread_scope_device> mi_alloc;
+         cuda::atomic<size_t, cuda::thread_scope_device> mi_free;
+     } stats;
+ };
+
+ static __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo **mi);
+ static __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out);
+ extern "C" __device__ void* NRT_Allocate_External(size_t size);
+
+ /* The Memory System object */
+ __device__ NRT_MemSys* TheMSys;
+
+ extern "C" __device__ void* NRT_Allocate(size_t size)
+ {
+     void* ptr = NULL;
+     ptr = malloc(size);
+     // if (TheMSys->stats.enabled) { TheMSys->stats.alloc++; }
+     return ptr;
+ }
+
+ extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
+                                             void* data,
+                                             size_t size,
+                                             NRT_dtor_function dtor,
+                                             void* dtor_info)
+                                             // NRT_MemSys* TheMSys)
+ {
+     mi->refct = 1; /* starts with 1 refct */
+     mi->dtor = dtor;
+     mi->dtor_info = dtor_info;
+     mi->data = data;
+     mi->size = size;
+     // if (TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; }
+ }
+
+ extern "C"
+ __device__ NRT_MemInfo* NRT_MemInfo_new(
+     void* data, size_t size, NRT_dtor_function dtor, void* dtor_info)
+ {
+     NRT_MemInfo* mi = (NRT_MemInfo*)NRT_Allocate(sizeof(NRT_MemInfo));
+     if (mi != NULL) { NRT_MemInfo_init(mi, data, size, dtor, dtor_info); }
+     return mi;
+ }
+
+ extern "C" __device__ void NRT_Free(void* ptr)
+ {
+     free(ptr);
+     //if (TheMSys->stats.enabled) { TheMSys->stats.free++; }
+ }
+
+ extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
+ {
+     NRT_Free(mi);
+ }
+
+ extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi)
+ {
+     NRT_dealloc(mi);
+     //if (TheMSys->stats.enabled) { TheMSys->stats.mi_free++; }
+ }
+ extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi)
+ {
+     if (mi->dtor) /* We have a destructor */
+         mi->dtor(mi->data, mi->size, NULL);
+     /* Clear and release MemInfo */
+     NRT_MemInfo_destroy(mi);
+ }
+
+ extern "C" __device__ void* NRT_MemInfo_data_fast(NRT_MemInfo *mi)
+ {
+     return mi->data;
+ }
+
+ extern "C" __device__ NRT_MemInfo *NRT_MemInfo_alloc_aligned(size_t size, unsigned align) {
+     NRT_MemInfo *mi = NULL;
+     void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi);
+     if (data == NULL) {
+         return NULL; /* return early as allocation failed */
+     }
+     //NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_aligned %p\n", data));
+     NRT_MemInfo_init(mi, data, size, NULL, NULL);
+     return mi;
+ }
+
+ static
+ __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align,
+                                                      NRT_MemInfo **mi)
+ {
+     size_t offset = 0, intptr = 0, remainder = 0;
+     //NRT_Debug(nrt_debug_print("nrt_allocate_meminfo_and_data_align %p\n", allocator));
+     char *base = (char *)nrt_allocate_meminfo_and_data(size + 2 * align, mi);
+     if (base == NULL) {
+         return NULL; /* return early as allocation failed */
+     }
+     intptr = (size_t) base;
+     /*
+      * See if the allocation is aligned already...
+      * Check if align is a power of 2, if so the modulo can be avoided.
+      */
+     if((align & (align - 1)) == 0)
+     {
+         remainder = intptr & (align - 1);
+     }
+     else
+     {
+         remainder = intptr % align;
+     }
+     if (remainder == 0){ /* Yes */
+         offset = 0;
+     } else { /* No, move forward `offset` bytes */
+         offset = align - remainder;
+     }
+     return (void*)((char *)base + offset);
+ }
+
+ static
+ __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out) {
+     NRT_MemInfo *mi = NULL;
+     //NRT_Debug(nrt_debug_print("nrt_allocate_meminfo_and_data %p\n", allocator));
+     char *base = (char *)NRT_Allocate_External(sizeof(NRT_MemInfo) + size);
+     if (base == NULL) {
+         *mi_out = NULL; /* set meminfo to NULL as allocation failed */
+         return NULL; /* return early as allocation failed */
+     }
+     mi = (NRT_MemInfo *) base;
+     *mi_out = mi;
+     return (void*)((char *)base + sizeof(NRT_MemInfo));
+ }
+
+ extern "C" __device__ void* NRT_Allocate_External(size_t size) {
+     void *ptr = NULL;
+     ptr = malloc(size);
+     //NRT_Debug(nrt_debug_print("NRT_Allocate_External bytes=%zu ptr=%p\n", size, ptr));
+
+     //if (TheMSys.stats.enabled)
+     //{
+     //    TheMSys.stats.alloc++;
+     //}
+     return ptr;
+ }
+
+
+ /*
+ c++ version of the NRT_decref function that usually is added to
+ the final kernel link in PTX form by numba. This version may be
+ used by c++ APIs that accept ownership of live objects and must
+ manage them going forward.
+ */
+ extern "C" __device__ void NRT_decref(NRT_MemInfo* mi)
+ {
+     if (mi != NULL) {
+         mi->refct--;
+         if (mi->refct == 0) { NRT_MemInfo_call_dtor(mi); }
+     }
+ }
+
+ #endif
+
+ extern "C" __device__ void NRT_incref(NRT_MemInfo* mi)
+ {
+     if (mi != NULL) {
+         mi->refct++;
+     }
+ }
numba_cuda/numba/cuda/simulator/api.py

@@ -35,6 +35,20 @@ class stream(object):
      pass


+ # Default stream APIs. Since execution from the perspective of the host is
+ # synchronous in the simulator, these can be the same as the stream class.
+ default_stream = stream
+ legacy_default_stream = stream
+ per_thread_default_stream = stream
+
+
+ # There is no way to use external streams with the simulator. Since the
+ # implementation is not really using streams, we can't meaningfully interact
+ # with external ones.
+ def external_stream(ptr):
+     raise RuntimeError("External streams are unsupported in the simulator")
+
+
  def synchronize():
      pass

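These additions let code written against the stream-creation API also run under the simulator. A short sketch of the calls involved (names follow the main numba.cuda namespace; under the simulator every call returns the same no-op stream object, and external_stream raises):

    from numba import cuda

    s_default = cuda.default_stream()
    s_legacy = cuda.legacy_default_stream()
    s_per_thread = cuda.per_thread_default_stream()

    # Wrapping a foreign stream handle is only meaningful on real hardware;
    # under the simulator this raises RuntimeError.
    try:
        cuda.external_stream(0)
    except RuntimeError:
        pass
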
numba_cuda/numba/cuda/target.py

@@ -74,6 +74,10 @@ class CUDATargetContext(BaseContext):
              datamodel.default_manager
          )

+     @property
+     def enable_nrt(self):
+         return getattr(config, 'CUDA_ENABLE_NRT', False)
+
      @property
      def DIBuilder(self):
          return debuginfo.DIBuilder
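
This property keeps NRT linking opt-in: _Kernel.maybe_link_nrt only attaches runtime/nrt.cu when it returns a truthy value. A hedged sketch of flipping the switch (config.CUDA_ENABLE_NRT is what the code above reads; the NUMBA_CUDA_ENABLE_NRT environment-variable spelling is an assumption based on Numba's usual NUMBA_* mapping, not something this diff confirms):

    import os

    # Assumed environment-variable name (Numba convention: NUMBA_<config attr>).
    os.environ["NUMBA_CUDA_ENABLE_NRT"] = "1"

    from numba import config, cuda

    # Equivalent programmatic switch; this attribute is what enable_nrt reads.
    config.CUDA_ENABLE_NRT = True

    # Kernels compiled from here on will link runtime/nrt.cu whenever their
    # PTX references any of the NRT_* functions listed in _Kernel.NRT_functions.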
@@ -148,7 +152,7 @@ class CUDATargetContext(BaseContext):

      def prepare_cuda_kernel(self, codelib, fndesc, debug, lineinfo,
                              nvvm_options, filename, linenum,
-                             max_registers=None):
+                             max_registers=None, lto=False):
          """
          Adapt a code library ``codelib`` with the numba compiled CUDA kernel
          with name ``fname`` and arguments ``argtypes`` for NVVM.
@@ -175,7 +179,9 @@ class CUDATargetContext(BaseContext):
          library = self.codegen().create_library(f'{codelib.name}_kernel_',
                                                  entry_name=kernel_name,
                                                  nvvm_options=nvvm_options,
-                                                 max_registers=max_registers)
+                                                 max_registers=max_registers,
+                                                 lto=lto
+                                                 )
          library.add_linking_library(codelib)
          wrapper = self.generate_kernel_wrapper(library, fndesc, kernel_name,
                                                 debug, lineinfo, filename,
numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py (new file)

@@ -0,0 +1,199 @@
+ from numba.cuda.testing import unittest
+ from numba.cuda.testing import skip_on_cudasim
+ from numba.cuda.testing import CUDATestCase
+ from numba.cuda.cudadrv.driver import PyNvJitLinker
+
+ import itertools
+ import os
+ from numba.cuda import get_current_device
+ from numba import cuda
+ from numba import config
+
+ TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
+ if TEST_BIN_DIR:
+     test_device_functions_a = os.path.join(
+         TEST_BIN_DIR, "test_device_functions.a"
+     )
+     test_device_functions_cubin = os.path.join(
+         TEST_BIN_DIR, "test_device_functions.cubin"
+     )
+     test_device_functions_cu = os.path.join(
+         TEST_BIN_DIR, "test_device_functions.cu"
+     )
+     test_device_functions_fatbin = os.path.join(
+         TEST_BIN_DIR, "test_device_functions.fatbin"
+     )
+     test_device_functions_o = os.path.join(
+         TEST_BIN_DIR, "test_device_functions.o"
+     )
+     test_device_functions_ptx = os.path.join(
+         TEST_BIN_DIR, "test_device_functions.ptx"
+     )
+     test_device_functions_ltoir = os.path.join(
+         TEST_BIN_DIR, "test_device_functions.ltoir"
+     )
+
+
+ @unittest.skipIf(
+     not config.CUDA_ENABLE_PYNVJITLINK or not TEST_BIN_DIR,
+     "pynvjitlink not enabled"
+ )
+ @skip_on_cudasim("Linking unsupported in the simulator")
+ class TestLinker(CUDATestCase):
+     _NUMBA_NVIDIA_BINDING_0_ENV = {"NUMBA_CUDA_USE_NVIDIA_BINDING": "0"}
+
+     def test_nvjitlink_create(self):
+         patched_linker = PyNvJitLinker(cc=(7, 5))
+         assert "-arch=sm_75" in patched_linker.options
+
+     def test_nvjitlink_create_no_cc_error(self):
+         # nvJitLink expects at least the architecture to be specified.
+         with self.assertRaisesRegex(
+             RuntimeError, "PyNvJitLinker requires CC to be specified"
+         ):
+             PyNvJitLinker()
+
+     def test_nvjitlink_invalid_arch_error(self):
+         from pynvjitlink.api import NvJitLinkError
+
+         # CC 0.0 is not a valid compute capability
+         with self.assertRaisesRegex(
+             NvJitLinkError, "NVJITLINK_ERROR_UNRECOGNIZED_OPTION error"
+         ):
+             PyNvJitLinker(cc=(0, 0))
+
+     def test_nvjitlink_invalid_cc_type_error(self):
+         with self.assertRaisesRegex(
+             TypeError, "`cc` must be a list or tuple of length 2"
+         ):
+             PyNvJitLinker(cc=0)
+
+     def test_nvjitlink_ptx_compile_options(self):
+
+         max_registers = (None, 32)
+         lineinfo = (False, True)
+         lto = (False, True)
+         additional_flags = (None, ("-g",), ("-g", "-time"))
+         for (
+             max_registers_i,
+             line_info_i,
+             lto_i,
+             additional_flags_i,
+         ) in itertools.product(max_registers, lineinfo, lto, additional_flags):
+             with self.subTest(
+                 max_registers=max_registers_i,
+                 lineinfo=line_info_i,
+                 lto=lto_i,
+                 additional_flags=additional_flags_i,
+             ):
+                 patched_linker = PyNvJitLinker(
+                     cc=(7, 5),
+                     max_registers=max_registers_i,
+                     lineinfo=line_info_i,
+                     lto=lto_i,
+                     additional_flags=additional_flags_i,
+                 )
+                 assert "-arch=sm_75" in patched_linker.options
+
+                 if max_registers_i:
+                     assert (
+                         f"-maxrregcount={max_registers_i}"
+                         in patched_linker.options
+                     )
+                 else:
+                     assert "-maxrregcount" not in patched_linker.options
+
+                 if line_info_i:
+                     assert "-lineinfo" in patched_linker.options
+                 else:
+                     assert "-lineinfo" not in patched_linker.options
+
+                 if lto_i:
+                     assert "-lto" in patched_linker.options
+                 else:
+                     assert "-lto" not in patched_linker.options
+
+                 if additional_flags_i:
+                     for flag in additional_flags_i:
+                         assert flag in patched_linker.options
+
+     def test_nvjitlink_add_file_guess_ext_linkable_code(self):
+         files = (
+             test_device_functions_a,
+             test_device_functions_cubin,
+             test_device_functions_cu,
+             test_device_functions_fatbin,
+             test_device_functions_o,
+             test_device_functions_ptx,
+         )
+         for file in files:
+             with self.subTest(file=file):
+                 patched_linker = PyNvJitLinker(
+                     cc=get_current_device().compute_capability
+                 )
+                 patched_linker.add_file_guess_ext(file)
+
+     def test_nvjitlink_test_add_file_guess_ext_invalid_input(self):
+         with open(test_device_functions_cubin, "rb") as f:
+             content = f.read()
+
+         patched_linker = PyNvJitLinker(
+             cc=get_current_device().compute_capability
+         )
+         with self.assertRaisesRegex(
+             TypeError, "Expected path to file or a LinkableCode"
+         ):
+             # Feeding raw data as bytes to add_file_guess_ext should raise,
+             # because there's no way to know what kind of file to treat it as
+             patched_linker.add_file_guess_ext(content)
+
+     def test_nvjitlink_jit_with_linkable_code(self):
+         files = (
+             test_device_functions_a,
+             test_device_functions_cubin,
+             test_device_functions_cu,
+             test_device_functions_fatbin,
+             test_device_functions_o,
+             test_device_functions_ptx,
+         )
+         for file in files:
+             with self.subTest(file=file):
+                 sig = "uint32(uint32, uint32)"
+                 add_from_numba = cuda.declare_device("add_from_numba", sig)
+
+                 @cuda.jit(link=[file])
+                 def kernel(result):
+                     result[0] = add_from_numba(1, 2)
+
+                 result = cuda.device_array(1)
+                 kernel[1, 1](result)
+                 assert result[0] == 3
+
+     def test_nvjitlink_jit_with_linkable_code_lto(self):
+         file = test_device_functions_ltoir
+
+         sig = "uint32(uint32, uint32)"
+         add_from_numba = cuda.declare_device("add_from_numba", sig)
+
+         @cuda.jit(link=[file], lto=True)
+         def kernel(result):
+             result[0] = add_from_numba(1, 2)
+
+         result = cuda.device_array(1)
+         kernel[1, 1](result)
+         assert result[0] == 3
+
+     def test_nvjitlink_jit_with_invalid_linkable_code(self):
+         with open(test_device_functions_cubin, "rb") as f:
+             content = f.read()
+         with self.assertRaisesRegex(
+             TypeError, "Expected path to file or a LinkableCode"
+         ):
+
+             @cuda.jit("void()", link=[content])
+             def kernel():
+                 pass
+
+
+ if __name__ == "__main__":
+     unittest.main()
numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py

@@ -68,6 +68,10 @@ def simple_popc(ary, c):
      ary[0] = cuda.popc(c)


+ def simple_bit_count(ary, c):
+     ary[0] = c.bit_count()
+
+
  def simple_fma(ary, a, b, c):
      ary[0] = cuda.fma(a, b, c)

@@ -550,17 +554,53 @@ class TestCudaIntrinsic(CUDATestCase):

          self.assertTrue(np.all(arr))

+     def test_popc_u1(self):
+         compiled = cuda.jit("void(int32[:], uint8)")(simple_popc)
+         ary = np.zeros(1, dtype=np.int8)
+         compiled[1, 1](ary, np.uint8(0xFF))
+         self.assertEqual(ary[0], 8)
+
+     def test_popc_u2(self):
+         compiled = cuda.jit("void(int32[:], uint16)")(simple_popc)
+         ary = np.zeros(1, dtype=np.int16)
+         compiled[1, 1](ary, np.uint16(0xFFFF))
+         self.assertEqual(ary[0], 16)
+
      def test_popc_u4(self):
          compiled = cuda.jit("void(int32[:], uint32)")(simple_popc)
          ary = np.zeros(1, dtype=np.int32)
-         compiled[1, 1](ary, 0xF0)
-         self.assertEqual(ary[0], 4)
+         compiled[1, 1](ary, np.uint32(0xFFFFFFFF))
+         self.assertEqual(ary[0], 32)

      def test_popc_u8(self):
          compiled = cuda.jit("void(int32[:], uint64)")(simple_popc)
          ary = np.zeros(1, dtype=np.int32)
-         compiled[1, 1](ary, 0xF00000000000)
-         self.assertEqual(ary[0], 4)
+         compiled[1, 1](ary, np.uint64(0xFFFFFFFFFFFFFFFF))
+         self.assertEqual(ary[0], 64)
+
+     def test_bit_count_u1(self):
+         compiled = cuda.jit("void(int32[:], uint8)")(simple_bit_count)
+         ary = np.zeros(1, dtype=np.int8)
+         compiled[1, 1](ary, np.uint8(0xFF))
+         self.assertEqual(ary[0], 8)
+
+     def test_bit_count_u2(self):
+         compiled = cuda.jit("void(int32[:], uint16)")(simple_bit_count)
+         ary = np.zeros(1, dtype=np.int16)
+         compiled[1, 1](ary, np.uint16(0xFFFF))
+         self.assertEqual(ary[0], 16)
+
+     def test_bit_count_u4(self):
+         compiled = cuda.jit("void(int32[:], uint32)")(simple_bit_count)
+         ary = np.zeros(1, dtype=np.int32)
+         compiled[1, 1](ary, np.uint32(0xFFFFFFFF))
+         self.assertEqual(ary[0], 32)
+
+     def test_bit_count_u8(self):
+         compiled = cuda.jit("void(int32[:], uint64)")(simple_bit_count)
+         ary = np.zeros(1, dtype=np.int32)
+         compiled[1, 1](ary, np.uint64(0xFFFFFFFFFFFFFFFF))
+         self.assertEqual(ary[0], 64)

      def test_fma_f4(self):
          compiled = cuda.jit("void(f4[:], f4, f4, f4)")(simple_fma)
numba_cuda/numba/cuda/tests/cudapy/test_print.py

@@ -126,8 +126,8 @@ class TestPrint(CUDATestCase):

      def test_bool(self):
          output, _ = self.run_code(printbool_usecase)
-         expected = "True\nFalse\nTrue\nTrue\nFalse\nFalse"
-         self.assertEqual(output.strip(), expected)
+         expected = "True\r?\nFalse\r?\nTrue\r?\nTrue\r?\nFalse\r?\nFalse"
+         self.assertRegex(output.strip(), expected)

      def test_printempty(self):
          output, _ = self.run_code(printempty_usecase)