numba-cuda 0.0.17__py3-none-any.whl → 0.0.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/codegen.py +15 -3
- numba_cuda/numba/cuda/cuda_paths.py +68 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +4 -1
- numba_cuda/numba/cuda/cudadrv/driver.py +209 -47
- numba_cuda/numba/cuda/cudadrv/enums.py +3 -0
- numba_cuda/numba/cuda/cudadrv/libs.py +38 -0
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +63 -0
- numba_cuda/numba/cuda/cudadrv/mappings.py +24 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +9 -4
- numba_cuda/numba/cuda/device_init.py +3 -0
- numba_cuda/numba/cuda/dispatcher.py +48 -8
- numba_cuda/numba/cuda/intrinsics.py +6 -1
- numba_cuda/numba/cuda/runtime/nrt.cu +190 -0
- numba_cuda/numba/cuda/simulator/api.py +14 -0
- numba_cuda/numba/cuda/target.py +8 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +199 -0
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +44 -4
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +2 -2
- numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +48 -0
- numba_cuda/numba/cuda/tests/nrt/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/nrt/mock_numpy.py +42 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +110 -0
- numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +51 -0
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +170 -0
- numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu +19 -0
- numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu +3 -0
- {numba_cuda-0.0.17.dist-info → numba_cuda-0.0.19.dist-info}/METADATA +1 -1
- {numba_cuda-0.0.17.dist-info → numba_cuda-0.0.19.dist-info}/RECORD +32 -20
- {numba_cuda-0.0.17.dist-info → numba_cuda-0.0.19.dist-info}/WHEEL +1 -1
- {numba_cuda-0.0.17.dist-info → numba_cuda-0.0.19.dist-info}/LICENSE +0 -0
- {numba_cuda-0.0.17.dist-info → numba_cuda-0.0.19.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/dispatcher.py
CHANGED
@@ -1,5 +1,6 @@
 import numpy as np
 import os
+import re
 import sys
 import ctypes
 import functools
@@ -43,10 +44,25 @@ class _Kernel(serialize.ReduceMixin):
     object launches the kernel on the device.
     '''

+    NRT_functions = [
+        "NRT_Allocate",
+        "NRT_MemInfo_init",
+        "NRT_MemInfo_new",
+        "NRT_Free",
+        "NRT_dealloc",
+        "NRT_MemInfo_destroy",
+        "NRT_MemInfo_call_dtor",
+        "NRT_MemInfo_data_fast",
+        "NRT_MemInfo_alloc_aligned",
+        "NRT_Allocate_External",
+        "NRT_decref",
+        "NRT_incref"
+    ]
+
     @global_compiler_lock
     def __init__(self, py_func, argtypes, link=None, debug=False,
                  lineinfo=False, inline=False, fastmath=False, extensions=None,
-                 max_registers=None, opt=True, device=False):
+                 max_registers=None, lto=False, opt=True, device=False):

         if device:
             raise RuntimeError('Cannot compile a device function as a kernel')
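The new `lto` flag on the kernel constructor is the internal end of a user-facing keyword on `cuda.jit` that requests link-time optimization via the pynvjitlink-backed linker; the nvjitlink tests further down exercise it. A minimal hedged sketch of the user-level usage (the `.ltoir` path and device function name are illustrative, mirroring those tests):

    from numba import cuda

    add = cuda.declare_device("add_from_numba", "uint32(uint32, uint32)")

    @cuda.jit(link=["device_functions.ltoir"], lto=True)  # illustrative path
    def kernel(out):
        out[0] = add(1, 2)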
@@ -94,7 +110,7 @@ class _Kernel(serialize.ReduceMixin):
         lib, kernel = tgt_ctx.prepare_cuda_kernel(cres.library, cres.fndesc,
                                                   debug, lineinfo, nvvm_options,
                                                   filename, linenum,
-                                                  max_registers)
+                                                  max_registers, lto)

         if not link:
             link = []
@@ -105,16 +121,20 @@ class _Kernel(serialize.ReduceMixin):
         if self.cooperative:
             lib.needs_cudadevrt = True

+        basedir = os.path.dirname(os.path.abspath(__file__))
+        asm = lib.get_asm_str()
+
         res = [fn for fn in cuda_fp16_math_funcs
-               if (f'__numba_wrapper_{fn}' in
+               if (f'__numba_wrapper_{fn}' in asm)]

         if res:
             # Path to the source containing the foreign function
-            basedir = os.path.dirname(os.path.abspath(__file__))
             functions_cu_path = os.path.join(basedir,
                                              'cpp_function_wrappers.cu')
             link.append(functions_cu_path)

+        link = self.maybe_link_nrt(link, tgt_ctx, asm)
+
         for filepath in link:
             lib.add_linking_file(filepath)

@@ -136,6 +156,25 @@ class _Kernel(serialize.ReduceMixin):
         self.lifted = []
         self.reload_init = []

+    def maybe_link_nrt(self, link, tgt_ctx, asm):
+        if not tgt_ctx.enable_nrt:
+            return link
+
+        all_nrt = "|".join(self.NRT_functions)
+        pattern = (
+            r'\.extern\s+\.func\s+(?:\s*\(.+\)\s*)?('
+            + all_nrt + r')\s*\([^)]*\)\s*;'
+        )
+
+        nrt_in_asm = re.findall(pattern, asm)
+
+        basedir = os.path.dirname(os.path.abspath(__file__))
+        if nrt_in_asm:
+            nrt_path = os.path.join(basedir, 'runtime', 'nrt.cu')
+            link.append(nrt_path)
+
+        return link
+
     @property
     def library(self):
         return self._codelibrary
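The `maybe_link_nrt` helper added above decides whether to link `runtime/nrt.cu` by scanning the generated PTX for `.extern .func` declarations of any NRT function; the file is only appended to the link list when at least one is declared. A minimal sketch of the matching behaviour, using a hand-written PTX declaration purely for illustration (the sample string is an assumption, not taken from the package):

    import re

    # Subset of _Kernel.NRT_functions, enough to show the idea
    nrt_functions = ["NRT_Allocate", "NRT_incref", "NRT_decref"]
    all_nrt = "|".join(nrt_functions)
    pattern = (
        r'\.extern\s+\.func\s+(?:\s*\(.+\)\s*)?('
        + all_nrt + r')\s*\([^)]*\)\s*;'
    )

    # Hypothetical extern declaration of the kind emitted for an unresolved
    # device-side call
    sample_asm = ".extern .func (.param .b64 r) NRT_Allocate (.param .b64 n);"

    print(re.findall(pattern, sample_asm))  # ['NRT_Allocate'] -> nrt.cu gets linked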
@@ -385,7 +424,6 @@ class _Kernel(serialize.ReduceMixin):

         if isinstance(ty, types.Array):
             devary = wrap_arg(val).to_device(retr, stream)
-
             c_intp = ctypes.c_ssize_t

             meminfo = ctypes.c_void_p(0)
@@ -519,7 +557,10 @@ class _LaunchConfiguration:
         self.stream = stream
         self.sharedmem = sharedmem

-        if
+        if (
+            config.CUDA_LOW_OCCUPANCY_WARNINGS
+            and not config.DISABLE_PERFORMANCE_WARNINGS
+        ):
             # Warn when the grid has fewer than 128 blocks. This number is
             # chosen somewhat heuristically - ideally the minimum is 2 times
             # the number of SMs, but the number of SMs varies between devices -
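With the rewritten condition, the low-occupancy warning for small grids is emitted only when CUDA_LOW_OCCUPANCY_WARNINGS is enabled and DISABLE_PERFORMANCE_WARNINGS is not set. A minimal sketch of silencing it from Python (both attributes appear in the condition above):

    from numba import config

    # Silence only the CUDA low-occupancy warning...
    config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
    # ...or silence all performance warnings at once
    config.DISABLE_PERFORMANCE_WARNINGS = 1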
@@ -708,8 +749,7 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin):
         *args*.
         '''
         cc = get_current_device().compute_capability
-        argtypes = tuple(
-            [self.typingctx.resolve_argument_type(a) for a in args])
+        argtypes = tuple(self.typeof_pyval(a) for a in args)
         if self.specialized:
             raise RuntimeError('Dispatcher already specialized')

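The `specialize()` change above types its arguments with `typeof_pyval`, the same path ordinary dispatch uses. A hedged usage sketch (kernel and array names are illustrative):

    import numpy as np
    from numba import cuda

    @cuda.jit
    def scale(out, a, x):
        i = cuda.grid(1)
        if i < out.size:
            out[i] = a * x[i]

    x = np.arange(16, dtype=np.float32)
    out = np.zeros_like(x)

    # Returns a new dispatcher bound to these concrete argument types
    specialized = scale.specialize(out, np.float32(2.0), x)
    specialized[1, 16](out, np.float32(2.0), x)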
numba_cuda/numba/cuda/intrinsics.py
CHANGED
@@ -4,7 +4,7 @@ from numba import cuda, types
 from numba.core import cgutils
 from numba.core.errors import RequireLiteralValue
 from numba.core.typing import signature
-from numba.core.extending import overload_attribute
+from numba.core.extending import overload_attribute, overload_method
 from numba.cuda import nvvmutils
 from numba.cuda.extending import intrinsic

@@ -196,3 +196,8 @@ def syncthreads_or(typingctx, predicate):
     '''
     fname = 'llvm.nvvm.barrier0.or'
     return _syncthreads_predicate(typingctx, predicate, fname)
+
+
+@overload_method(types.Integer, 'bit_count', target='cuda')
+def integer_bit_count(i):
+    return lambda i: cuda.popc(i)
numba_cuda/numba/cuda/runtime/nrt.cu
ADDED
@@ -0,0 +1,190 @@
+#ifndef _NRT_H
+#define _NRT_H
+
+#include <cuda/atomic>
+
+typedef void (*NRT_dtor_function)(void* ptr, size_t size, void* info);
+typedef void (*NRT_dealloc_func)(void* ptr, void* dealloc_info);
+
+typedef struct MemInfo NRT_MemInfo;
+
+extern "C" {
+struct MemInfo {
+  cuda::atomic<size_t, cuda::thread_scope_device> refct;
+  NRT_dtor_function dtor;
+  void* dtor_info;
+  void* data;
+  size_t size;
+};
+}
+
+// Globally needed variables
+struct NRT_MemSys {
+  struct {
+    bool enabled;
+    cuda::atomic<size_t, cuda::thread_scope_device> alloc;
+    cuda::atomic<size_t, cuda::thread_scope_device> free;
+    cuda::atomic<size_t, cuda::thread_scope_device> mi_alloc;
+    cuda::atomic<size_t, cuda::thread_scope_device> mi_free;
+  } stats;
+};
+
+static __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo **mi);
+static __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out);
+extern "C" __device__ void* NRT_Allocate_External(size_t size);
+
+/* The Memory System object */
+__device__ NRT_MemSys* TheMSys;
+
+extern "C" __device__ void* NRT_Allocate(size_t size)
+{
+  void* ptr = NULL;
+  ptr = malloc(size);
+  // if (TheMSys->stats.enabled) { TheMSys->stats.alloc++; }
+  return ptr;
+}
+
+extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
+                                            void* data,
+                                            size_t size,
+                                            NRT_dtor_function dtor,
+                                            void* dtor_info)
+                                            // NRT_MemSys* TheMSys)
+{
+  mi->refct = 1; /* starts with 1 refct */
+  mi->dtor = dtor;
+  mi->dtor_info = dtor_info;
+  mi->data = data;
+  mi->size = size;
+  // if (TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; }
+}
+
+extern "C"
+__device__ NRT_MemInfo* NRT_MemInfo_new(
+  void* data, size_t size, NRT_dtor_function dtor, void* dtor_info)
+{
+  NRT_MemInfo* mi = (NRT_MemInfo*)NRT_Allocate(sizeof(NRT_MemInfo));
+  if (mi != NULL) { NRT_MemInfo_init(mi, data, size, dtor, dtor_info); }
+  return mi;
+}
+
+extern "C" __device__ void NRT_Free(void* ptr)
+{
+  free(ptr);
+  //if (TheMSys->stats.enabled) { TheMSys->stats.free++; }
+}
+
+extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
+{
+  NRT_Free(mi);
+}
+
+extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi)
+{
+  NRT_dealloc(mi);
+  //if (TheMSys->stats.enabled) { TheMSys->stats.mi_free++; }
+}
+extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi)
+{
+  if (mi->dtor) /* We have a destructor */
+    mi->dtor(mi->data, mi->size, NULL);
+  /* Clear and release MemInfo */
+  NRT_MemInfo_destroy(mi);
+}
+
+extern "C" __device__ void* NRT_MemInfo_data_fast(NRT_MemInfo *mi)
+{
+  return mi->data;
+}
+
+extern "C" __device__ NRT_MemInfo *NRT_MemInfo_alloc_aligned(size_t size, unsigned align) {
+  NRT_MemInfo *mi = NULL;
+  void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi);
+  if (data == NULL) {
+    return NULL; /* return early as allocation failed */
+  }
+  //NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_aligned %p\n", data));
+  NRT_MemInfo_init(mi, data, size, NULL, NULL);
+  return mi;
+}
+
+static
+__device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align,
+                                                     NRT_MemInfo **mi)
+{
+  size_t offset = 0, intptr = 0, remainder = 0;
+  //NRT_Debug(nrt_debug_print("nrt_allocate_meminfo_and_data_align %p\n", allocator));
+  char *base = (char *)nrt_allocate_meminfo_and_data(size + 2 * align, mi);
+  if (base == NULL) {
+    return NULL; /* return early as allocation failed */
+  }
+  intptr = (size_t) base;
+  /*
+   * See if the allocation is aligned already...
+   * Check if align is a power of 2, if so the modulo can be avoided.
+   */
+  if((align & (align - 1)) == 0)
+  {
+    remainder = intptr & (align - 1);
+  }
+  else
+  {
+    remainder = intptr % align;
+  }
+  if (remainder == 0){ /* Yes */
+    offset = 0;
+  } else { /* No, move forward `offset` bytes */
+    offset = align - remainder;
+  }
+  return (void*)((char *)base + offset);
+}
+
+static
+__device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out) {
+  NRT_MemInfo *mi = NULL;
+  //NRT_Debug(nrt_debug_print("nrt_allocate_meminfo_and_data %p\n", allocator));
+  char *base = (char *)NRT_Allocate_External(sizeof(NRT_MemInfo) + size);
+  if (base == NULL) {
+    *mi_out = NULL; /* set meminfo to NULL as allocation failed */
+    return NULL; /* return early as allocation failed */
+  }
+  mi = (NRT_MemInfo *) base;
+  *mi_out = mi;
+  return (void*)((char *)base + sizeof(NRT_MemInfo));
+}
+
+extern "C" __device__ void* NRT_Allocate_External(size_t size) {
+  void *ptr = NULL;
+  ptr = malloc(size);
+  //NRT_Debug(nrt_debug_print("NRT_Allocate_External bytes=%zu ptr=%p\n", size, ptr));
+
+  //if (TheMSys.stats.enabled)
+  //{
+  //  TheMSys.stats.alloc++;
+  //}
+  return ptr;
+}
+
+
+/*
+  c++ version of the NRT_decref function that usually is added to
+  the final kernel link in PTX form by numba. This version may be
+  used by c++ APIs that accept ownership of live objects and must
+  manage them going forward.
+*/
+extern "C" __device__ void NRT_decref(NRT_MemInfo* mi)
+{
+  if (mi != NULL) {
+    mi->refct--;
+    if (mi->refct == 0) { NRT_MemInfo_call_dtor(mi); }
+  }
+}
+
+#endif
+
+extern "C" __device__ void NRT_incref(NRT_MemInfo* mi)
+{
+  if (mi != NULL) {
+    mi->refct++;
+  }
+}
numba_cuda/numba/cuda/simulator/api.py
CHANGED
@@ -35,6 +35,20 @@ class stream(object):
         pass


+# Default stream APIs. Since execution from the perspective of the host is
+# synchronous in the simulator, these can be the same as the stream class.
+default_stream = stream
+legacy_default_stream = stream
+per_thread_default_stream = stream
+
+
+# There is no way to use external streams with the simulator. Since the
+# implementation is not really using streams, we can't meaningfully interact
+# with external ones.
+def external_stream(ptr):
+    raise RuntimeError("External streams are unsupported in the simulator")
+
+
 def synchronize():
     pass

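These additions give the simulator the same default-stream entry points as the real target, so host code that asks for one can run unchanged under the simulator. A minimal sketch (assumes NUMBA_ENABLE_CUDASIM=1 is set before numba is imported):

    from numba import cuda

    s = cuda.default_stream()             # a no-op stream object on the simulator
    t = cuda.legacy_default_stream()
    u = cuda.per_thread_default_stream()
    cuda.synchronize()
    # cuda.external_stream(ptr) deliberately raises RuntimeError here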
numba_cuda/numba/cuda/target.py
CHANGED
@@ -74,6 +74,10 @@ class CUDATargetContext(BaseContext):
             datamodel.default_manager
         )

+    @property
+    def enable_nrt(self):
+        return getattr(config, 'CUDA_ENABLE_NRT', False)
+
     @property
     def DIBuilder(self):
         return debuginfo.DIBuilder
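The `enable_nrt` property just reads a config flag, so NRT support can be toggled from Python before kernels are compiled. A minimal sketch (the attribute name is exactly the one the property queries; it defaults to off):

    from numba import config

    # Kernels compiled after this point may have runtime/nrt.cu linked in
    # when their PTX references NRT functions.
    config.CUDA_ENABLE_NRT = True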
@@ -148,7 +152,7 @@ class CUDATargetContext(BaseContext):

     def prepare_cuda_kernel(self, codelib, fndesc, debug, lineinfo,
                             nvvm_options, filename, linenum,
-                            max_registers=None):
+                            max_registers=None, lto=False):
         """
         Adapt a code library ``codelib`` with the numba compiled CUDA kernel
         with name ``fname`` and arguments ``argtypes`` for NVVM.
@@ -175,7 +179,9 @@ class CUDATargetContext(BaseContext):
         library = self.codegen().create_library(f'{codelib.name}_kernel_',
                                                 entry_name=kernel_name,
                                                 nvvm_options=nvvm_options,
-                                                max_registers=max_registers
+                                                max_registers=max_registers,
+                                                lto=lto
+                                                )
         library.add_linking_library(codelib)
         wrapper = self.generate_kernel_wrapper(library, fndesc, kernel_name,
                                                debug, lineinfo, filename,
numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py
ADDED
@@ -0,0 +1,199 @@
+from numba.cuda.testing import unittest
+from numba.cuda.testing import skip_on_cudasim
+from numba.cuda.testing import CUDATestCase
+from numba.cuda.cudadrv.driver import PyNvJitLinker
+
+import itertools
+import os
+from numba.cuda import get_current_device
+from numba import cuda
+from numba import config
+
+TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
+if TEST_BIN_DIR:
+    test_device_functions_a = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.a"
+    )
+    test_device_functions_cubin = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.cubin"
+    )
+    test_device_functions_cu = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.cu"
+    )
+    test_device_functions_fatbin = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.fatbin"
+    )
+    test_device_functions_o = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.o"
+    )
+    test_device_functions_ptx = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.ptx"
+    )
+    test_device_functions_ltoir = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.ltoir"
+    )
+
+
+@unittest.skipIf(
+    not config.CUDA_ENABLE_PYNVJITLINK or not TEST_BIN_DIR,
+    "pynvjitlink not enabled"
+)
+@skip_on_cudasim("Linking unsupported in the simulator")
+class TestLinker(CUDATestCase):
+    _NUMBA_NVIDIA_BINDING_0_ENV = {"NUMBA_CUDA_USE_NVIDIA_BINDING": "0"}
+
+    def test_nvjitlink_create(self):
+        patched_linker = PyNvJitLinker(cc=(7, 5))
+        assert "-arch=sm_75" in patched_linker.options
+
+    def test_nvjitlink_create_no_cc_error(self):
+        # nvJitLink expects at least the architecture to be specified.
+        with self.assertRaisesRegex(
+            RuntimeError, "PyNvJitLinker requires CC to be specified"
+        ):
+            PyNvJitLinker()
+
+    def test_nvjitlink_invalid_arch_error(self):
+        from pynvjitlink.api import NvJitLinkError
+
+        # CC 0.0 is not a valid compute capability
+        with self.assertRaisesRegex(
+            NvJitLinkError, "NVJITLINK_ERROR_UNRECOGNIZED_OPTION error"
+        ):
+            PyNvJitLinker(cc=(0, 0))
+
+    def test_nvjitlink_invalid_cc_type_error(self):
+        with self.assertRaisesRegex(
+            TypeError, "`cc` must be a list or tuple of length 2"
+        ):
+            PyNvJitLinker(cc=0)
+
+    def test_nvjitlink_ptx_compile_options(self):
+
+        max_registers = (None, 32)
+        lineinfo = (False, True)
+        lto = (False, True)
+        additional_flags = (None, ("-g",), ("-g", "-time"))
+        for (
+            max_registers_i,
+            line_info_i,
+            lto_i,
+            additional_flags_i,
+        ) in itertools.product(max_registers, lineinfo, lto, additional_flags):
+            with self.subTest(
+                max_registers=max_registers_i,
+                lineinfo=line_info_i,
+                lto=lto_i,
+                additional_flags=additional_flags_i,
+            ):
+                patched_linker = PyNvJitLinker(
+                    cc=(7, 5),
+                    max_registers=max_registers_i,
+                    lineinfo=line_info_i,
+                    lto=lto_i,
+                    additional_flags=additional_flags_i,
+                )
+                assert "-arch=sm_75" in patched_linker.options
+
+                if max_registers_i:
+                    assert (
+                        f"-maxrregcount={max_registers_i}"
+                        in patched_linker.options
+                    )
+                else:
+                    assert "-maxrregcount" not in patched_linker.options
+
+                if line_info_i:
+                    assert "-lineinfo" in patched_linker.options
+                else:
+                    assert "-lineinfo" not in patched_linker.options
+
+                if lto_i:
+                    assert "-lto" in patched_linker.options
+                else:
+                    assert "-lto" not in patched_linker.options
+
+                if additional_flags_i:
+                    for flag in additional_flags_i:
+                        assert flag in patched_linker.options
+
+    def test_nvjitlink_add_file_guess_ext_linkable_code(self):
+        files = (
+            test_device_functions_a,
+            test_device_functions_cubin,
+            test_device_functions_cu,
+            test_device_functions_fatbin,
+            test_device_functions_o,
+            test_device_functions_ptx,
+        )
+        for file in files:
+            with self.subTest(file=file):
+                patched_linker = PyNvJitLinker(
+                    cc=get_current_device().compute_capability
+                )
+                patched_linker.add_file_guess_ext(file)
+
+    def test_nvjitlink_test_add_file_guess_ext_invalid_input(self):
+        with open(test_device_functions_cubin, "rb") as f:
+            content = f.read()
+
+        patched_linker = PyNvJitLinker(
+            cc=get_current_device().compute_capability
+        )
+        with self.assertRaisesRegex(
+            TypeError, "Expected path to file or a LinkableCode"
+        ):
+            # Feeding raw data as bytes to add_file_guess_ext should raise,
+            # because there's no way to know what kind of file to treat it as
+            patched_linker.add_file_guess_ext(content)
+
+    def test_nvjitlink_jit_with_linkable_code(self):
+        files = (
+            test_device_functions_a,
+            test_device_functions_cubin,
+            test_device_functions_cu,
+            test_device_functions_fatbin,
+            test_device_functions_o,
+            test_device_functions_ptx,
+        )
+        for file in files:
+            with self.subTest(file=file):
+                sig = "uint32(uint32, uint32)"
+                add_from_numba = cuda.declare_device("add_from_numba", sig)
+
+                @cuda.jit(link=[file])
+                def kernel(result):
+                    result[0] = add_from_numba(1, 2)
+
+                result = cuda.device_array(1)
+                kernel[1, 1](result)
+                assert result[0] == 3
+
+    def test_nvjitlink_jit_with_linkable_code_lto(self):
+        file = test_device_functions_ltoir
+
+        sig = "uint32(uint32, uint32)"
+        add_from_numba = cuda.declare_device("add_from_numba", sig)
+
+        @cuda.jit(link=[file], lto=True)
+        def kernel(result):
+            result[0] = add_from_numba(1, 2)
+
+        result = cuda.device_array(1)
+        kernel[1, 1](result)
+        assert result[0] == 3
+
+    def test_nvjitlink_jit_with_invalid_linkable_code(self):
+        with open(test_device_functions_cubin, "rb") as f:
+            content = f.read()
+        with self.assertRaisesRegex(
+            TypeError, "Expected path to file or a LinkableCode"
+        ):
+
+            @cuda.jit("void()", link=[content])
+            def kernel():
+                pass
+
+
+if __name__ == "__main__":
+    unittest.main()
numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py
CHANGED
@@ -68,6 +68,10 @@ def simple_popc(ary, c):
     ary[0] = cuda.popc(c)


+def simple_bit_count(ary, c):
+    ary[0] = c.bit_count()
+
+
 def simple_fma(ary, a, b, c):
     ary[0] = cuda.fma(a, b, c)

@@ -550,17 +554,53 @@ class TestCudaIntrinsic(CUDATestCase):

         self.assertTrue(np.all(arr))

+    def test_popc_u1(self):
+        compiled = cuda.jit("void(int32[:], uint8)")(simple_popc)
+        ary = np.zeros(1, dtype=np.int8)
+        compiled[1, 1](ary, np.uint8(0xFF))
+        self.assertEqual(ary[0], 8)
+
+    def test_popc_u2(self):
+        compiled = cuda.jit("void(int32[:], uint16)")(simple_popc)
+        ary = np.zeros(1, dtype=np.int16)
+        compiled[1, 1](ary, np.uint16(0xFFFF))
+        self.assertEqual(ary[0], 16)
+
     def test_popc_u4(self):
         compiled = cuda.jit("void(int32[:], uint32)")(simple_popc)
         ary = np.zeros(1, dtype=np.int32)
-        compiled[1, 1](ary,
-        self.assertEqual(ary[0],
+        compiled[1, 1](ary, np.uint32(0xFFFFFFFF))
+        self.assertEqual(ary[0], 32)

     def test_popc_u8(self):
         compiled = cuda.jit("void(int32[:], uint64)")(simple_popc)
         ary = np.zeros(1, dtype=np.int32)
-        compiled[1, 1](ary,
-        self.assertEqual(ary[0],
+        compiled[1, 1](ary, np.uint64(0xFFFFFFFFFFFFFFFF))
+        self.assertEqual(ary[0], 64)
+
+    def test_bit_count_u1(self):
+        compiled = cuda.jit("void(int32[:], uint8)")(simple_bit_count)
+        ary = np.zeros(1, dtype=np.int8)
+        compiled[1, 1](ary, np.uint8(0xFF))
+        self.assertEqual(ary[0], 8)
+
+    def test_bit_count_u2(self):
+        compiled = cuda.jit("void(int32[:], uint16)")(simple_bit_count)
+        ary = np.zeros(1, dtype=np.int16)
+        compiled[1, 1](ary, np.uint16(0xFFFF))
+        self.assertEqual(ary[0], 16)
+
+    def test_bit_count_u4(self):
+        compiled = cuda.jit("void(int32[:], uint32)")(simple_bit_count)
+        ary = np.zeros(1, dtype=np.int32)
+        compiled[1, 1](ary, np.uint32(0xFFFFFFFF))
+        self.assertEqual(ary[0], 32)
+
+    def test_bit_count_u8(self):
+        compiled = cuda.jit("void(int32[:], uint64)")(simple_bit_count)
+        ary = np.zeros(1, dtype=np.int32)
+        compiled[1, 1](ary, np.uint64(0xFFFFFFFFFFFFFFFF))
+        self.assertEqual(ary[0], 64)

     def test_fma_f4(self):
         compiled = cuda.jit("void(f4[:], f4, f4, f4)")(simple_fma)
numba_cuda/numba/cuda/tests/cudapy/test_print.py
CHANGED
@@ -126,8 +126,8 @@ class TestPrint(CUDATestCase):

     def test_bool(self):
         output, _ = self.run_code(printbool_usecase)
-        expected = "True\nFalse\nTrue\nTrue\nFalse\nFalse"
-        self.
+        expected = "True\r?\nFalse\r?\nTrue\r?\nTrue\r?\nFalse\r?\nFalse"
+        self.assertRegex(output.strip(), expected)

     def test_printempty(self):
         output, _ = self.run_code(printempty_usecase)