numba-cuda 0.21.1__cp313-cp313-win_amd64.whl → 0.24.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/__init__.py +4 -1
- numba_cuda/numba/cuda/_compat.py +47 -0
- numba_cuda/numba/cuda/api.py +4 -1
- numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_dispatcher.cpp +8 -40
- numba_cuda/numba/cuda/cext/_hashtable.cpp +5 -0
- numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_pymodule.h +1 -1
- numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_typeof.cpp +56 -119
- numba_cuda/numba/cuda/cext/mviewbuf.c +7 -1
- numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +4 -5
- numba_cuda/numba/cuda/codegen.py +46 -12
- numba_cuda/numba/cuda/compiler.py +15 -9
- numba_cuda/numba/cuda/core/analysis.py +29 -21
- numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +1 -1
- numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
- numba_cuda/numba/cuda/core/base.py +12 -11
- numba_cuda/numba/cuda/core/bytecode.py +21 -13
- numba_cuda/numba/cuda/core/byteflow.py +336 -90
- numba_cuda/numba/cuda/core/compiler.py +3 -4
- numba_cuda/numba/cuda/core/compiler_machinery.py +3 -3
- numba_cuda/numba/cuda/core/config.py +5 -7
- numba_cuda/numba/cuda/core/consts.py +1 -1
- numba_cuda/numba/cuda/core/controlflow.py +17 -9
- numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
- numba_cuda/numba/cuda/core/errors.py +4 -912
- numba_cuda/numba/cuda/core/inline_closurecall.py +82 -67
- numba_cuda/numba/cuda/core/interpreter.py +334 -160
- numba_cuda/numba/cuda/core/ir.py +191 -119
- numba_cuda/numba/cuda/core/ir_utils.py +149 -128
- numba_cuda/numba/cuda/core/postproc.py +8 -8
- numba_cuda/numba/cuda/core/pythonapi.py +3 -0
- numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
- numba_cuda/numba/cuda/core/rewrites/static_binop.py +1 -1
- numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
- numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
- numba_cuda/numba/cuda/core/ssa.py +5 -5
- numba_cuda/numba/cuda/core/transforms.py +29 -16
- numba_cuda/numba/cuda/core/typed_passes.py +10 -10
- numba_cuda/numba/cuda/core/typeinfer.py +42 -27
- numba_cuda/numba/cuda/core/untyped_passes.py +82 -65
- numba_cuda/numba/cuda/cpython/unicode.py +2 -2
- numba_cuda/numba/cuda/cpython/unicode_support.py +1 -3
- numba_cuda/numba/cuda/cudadecl.py +0 -13
- numba_cuda/numba/cuda/cudadrv/devicearray.py +10 -9
- numba_cuda/numba/cuda/cudadrv/driver.py +142 -519
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +87 -32
- numba_cuda/numba/cuda/cudaimpl.py +0 -12
- numba_cuda/numba/cuda/debuginfo.py +25 -0
- numba_cuda/numba/cuda/descriptor.py +1 -1
- numba_cuda/numba/cuda/device_init.py +4 -7
- numba_cuda/numba/cuda/deviceufunc.py +3 -6
- numba_cuda/numba/cuda/dispatcher.py +39 -49
- numba_cuda/numba/cuda/intrinsics.py +150 -1
- numba_cuda/numba/cuda/libdeviceimpl.py +1 -2
- numba_cuda/numba/cuda/lowering.py +36 -29
- numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
- numba_cuda/numba/cuda/np/arrayobj.py +61 -9
- numba_cuda/numba/cuda/np/numpy_support.py +32 -9
- numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +4 -3
- numba_cuda/numba/cuda/printimpl.py +20 -0
- numba_cuda/numba/cuda/serialize.py +10 -0
- numba_cuda/numba/cuda/stubs.py +0 -11
- numba_cuda/numba/cuda/testing.py +4 -8
- numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +195 -51
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +6 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +53 -23
- numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +61 -9
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +6 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +22 -1
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +13 -0
- numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +37 -35
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +117 -1
- numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
- numba_cuda/numba/cuda/tests/support.py +11 -0
- numba_cuda/numba/cuda/types/cuda_functions.py +1 -1
- numba_cuda/numba/cuda/typing/asnumbatype.py +37 -2
- numba_cuda/numba/cuda/typing/context.py +3 -1
- numba_cuda/numba/cuda/typing/typeof.py +51 -2
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/METADATA +4 -13
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/RECORD +106 -105
- numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
- numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
- numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/WHEEL +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE.numba +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/top_level.txt +0 -0
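
Most of the kernel-launch changes in the test diffs below replace the old positional launch arguments with the LaunchConfig/launch API that the tests now import from numba.cuda._compat. A minimal sketch of the new-style launch, assuming `function` is a module function obtained from `module.get_function(...)` and `ptr` is the device pointer for its single argument (both names taken from the hunks below):

    from numba.cuda._compat import LaunchConfig, Stream as ExperimentalStream, launch

    # Grid/block sizes, dynamic shared memory and cooperative launch are grouped
    # into a single config object instead of positional arguments
    config = LaunchConfig(
        grid=(1, 1, 1),
        block=(100, 1, 1),
        shmem_size=0,
        cooperative_launch=False,
    )
    # Wrap a raw stream handle (0 = default stream) and launch the kernel
    exp_stream = ExperimentalStream.from_handle(0)
    launch(exp_stream, config, function.kernel, ptr)

When launching on an existing Numba stream, the tests convert it with `_to_core_stream(stream)` from `numba.cuda.cudadrv.driver` before passing it to `launch`.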
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py

@@ -2,21 +2,25 @@
 # SPDX-License-Identifier: BSD-2-Clause
 
 from ctypes import c_int, sizeof
-
-
-
-
-
-
+import cffi
+import numpy as np
+
+from numba.cuda.cudadrv.driver import host_to_device, device_to_host, driver
+from numba.cuda._compat import (
+    LaunchConfig,
+    Device,
+    Stream as ExperimentalStream,
+    launch,
 )
 
 from numba import cuda
-from numba.cuda.cudadrv import devices,
-from numba.cuda.testing import unittest, CUDATestCase
+from numba.cuda.cudadrv import devices, nvrtc
+from numba.cuda.testing import unittest, CUDATestCase, skip_unless_cc_90
 from numba.cuda.testing import skip_on_cudasim
+from numba.cuda.tests.support import override_config
+from numba.core import types
 import contextlib
 
-from cuda.core.experimental import Device
 
 ptx1 = """
 .version 1.4
@@ -98,22 +102,15 @@ class TestCudaDriver(CUDATestCase):
         host_to_device(memory, array, sizeof(array))
 
         ptr = memory.device_ctypes_pointer
-
-
-
-
-
-
-
-
-
-            100,
-            1,
-            1, # bx, by, bz
-            0, # dynamic shared mem
-            stream, # stream
-            [ptr],
-        ) # arguments
+
+        config = LaunchConfig(
+            grid=(1, 1, 1),
+            block=(100, 1, 1),
+            shmem_size=0,
+            cooperative_launch=False,
+        )
+        exp_stream = ExperimentalStream.from_handle(0)
+        launch(exp_stream, config, function.kernel, ptr)
 
         device_to_host(array, memory, sizeof(array))
         for i, v in enumerate(array):
@@ -122,6 +119,8 @@ class TestCudaDriver(CUDATestCase):
         module.unload()
 
     def test_cuda_driver_stream_operations(self):
+        from numba.cuda.cudadrv.driver import _to_core_stream
+
         module = self.context.create_module_ptx(self.ptx)
         function = module.get_function("_Z10helloworldPi")
 
@@ -135,21 +134,14 @@ class TestCudaDriver(CUDATestCase):
 
         ptr = memory.device_ctypes_pointer
 
-
-
-
-
-
-
-
-
-            100,
-            1,
-            1, # bx, by, bz
-            0, # dynamic shared mem
-            stream_handle, # stream
-            [ptr],
-        ) # arguments
+        config = LaunchConfig(
+            grid=(1, 1, 1),
+            block=(100, 1, 1),
+            shmem_size=0,
+            cooperative_launch=False,
+        )
+        # Convert numba Stream to ExperimentalStream
+        launch(_to_core_stream(stream), config, function.kernel, ptr)
 
         device_to_host(array, memory, sizeof(array), stream=stream)
 
@@ -177,18 +169,13 @@ class TestCudaDriver(CUDATestCase):
 
         ptr = memory.device_ctypes_pointer
 
-
-
-            1,
-
-
-            100,
-            1,
-            1, # bx, by, bz
-            0, # dynamic shared mem
-            stream.handle, # stream
-            [ptr],
+        config = LaunchConfig(
+            grid=(1, 1, 1),
+            block=(100, 1, 1),
+            shmem_size=0,
+            cooperative_launch=False,
         )
+        launch(stream, config, function.kernel, ptr)
 
         device_to_host(array, memory, sizeof(array), stream=stream)
         for i, v in enumerate(array):
@@ -285,6 +272,105 @@ class TestCudaDriver(CUDATestCase):
         self.assertTrue(grid > 0)
         self.assertTrue(block > 0)
 
+    def test_cuda_cache_config(self):
+        from numba import types
+        import numpy as np
+
+        sig = (types.float32[::1], types.float32[::1])
+
+        @cuda.jit(sig)
+        def add_one(r, x):
+            i = cuda.grid(1)
+            if i < len(r):
+                r[i] = x[i] + 1
+
+        kernel = add_one.overloads[sig]
+        cufunc = kernel._codelibrary.get_cufunc()
+
+        configs_to_test = [
+            ("prefer_shared", dict(prefer_shared=True)),
+            ("prefer_cache", dict(prefer_cache=True)),
+            ("prefer_equal", dict(prefer_equal=True)),
+            ("default", dict()),
+        ]
+
+        for name, kwargs in configs_to_test:
+            with self.subTest(config=name):
+                try:
+                    cufunc.cache_config(**kwargs)
+                except Exception as e:
+                    self.fail(f"cache_config({name}) failed: {e}")
+
+        x = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
+        r = np.zeros_like(x)
+
+        d_x = cuda.to_device(x)
+        d_r = cuda.to_device(r)
+
+        cufunc.cache_config(prefer_shared=True)
+        add_one[1, 5](d_r, d_x)
+
+        result = d_r.copy_to_host()
+        expected = x + 1
+
+        np.testing.assert_array_almost_equal(
+            result,
+            expected,
+            err_msg="Kernel produced incorrect results after cache_config",
+        )
+
+    def test_cuda_set_shared_memory_carveout(self):
+        from numba import types
+        import numpy as np
+
+        sig = (types.float32[::1], types.float32[::1])
+
+        @cuda.jit(sig)
+        def add_one(r, x):
+            i = cuda.grid(1)
+            if i < len(r):
+                r[i] = x[i] + 1
+
+        kernel = add_one.overloads[sig]
+        cufunc = kernel._codelibrary.get_cufunc()
+
+        # valid carveout values
+        carveout_values = [-1, 0, 50, 100]
+        for value in carveout_values:
+            with self.subTest(carveout=value):
+                try:
+                    cufunc.set_shared_memory_carveout(value)
+                except Exception as e:
+                    self.fail(
+                        f"set_shared_memory_carveout({value}) failed: {e}"
+                    )
+
+        # invalid carveout values
+        invalid_values = [-2, 101, 150]
+        for value in invalid_values:
+            with self.subTest(invalid_carveout=value):
+                with self.assertRaises(ValueError):
+                    cufunc.set_shared_memory_carveout(value)
+
+        # test the kernel
+        x = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
+        r = np.zeros_like(x)
+
+        d_x = cuda.to_device(x)
+        d_r = cuda.to_device(r)
+
+        cufunc.set_shared_memory_carveout(75)
+        add_one[1, 5](d_r, d_x)
+
+        result = d_r.copy_to_host()
+        expected = x + 1
+
+        np.testing.assert_array_almost_equal(
+            result,
+            expected,
+            err_msg="Kernel produced incorrect results after set_shared_memory_carveout",
+        )
+
 
 class TestDevice(CUDATestCase):
     def test_device_get_uuid(self):
@@ -309,5 +395,63 @@ class TestDevice(CUDATestCase):
         self.assertRegex(dev.uuid, uuid_format)
 
 
+@skip_on_cudasim("CUDA asm unsupported in the simulator")
+class TestAcceleratedArchitecture(CUDATestCase):
+    @skip_unless_cc_90
+    def test_device_arch_specific(self):
+        set_desc = cuda.CUSource("""
+#include <cuda_fp16.h>
+
+extern "C" __device__
+int set_descriptor(int *out, int* smem) {
+    unsigned usmem = __cvta_generic_to_shared(smem);
+    asm volatile("tensormap.replace.tile.rank.shared::cta.b1024.b32 [%0], 2;" :: "r"(usmem));
+    return 0;
+}
+""")
+
+        set_descriptor = cuda.declare_device(
+            "set_descriptor",
+            types.int32(types.CPointer(types.int32)),
+            link=[set_desc],
+        )
+
+        ffi = cffi.FFI()
+
+        @cuda.jit
+        def kernel(a):
+            sm = cuda.shared.array(1, dtype=np.int32)
+            data_ptr = ffi.from_buffer(sm)
+            set_descriptor(data_ptr)
+
+            # just to prevent optimization:
+            sm[0] = 2
+            cuda.syncthreads()
+            a[0] = sm[0]
+
+        a = np.ones(1, dtype=np.int32)
+
+        kernel[1, 1](a)
+
+        assert a[0] == 2
+
+    def test_get_arch_option_force_cc(self):
+        with override_config("FORCE_CUDA_CC", (8, 0)):
+            arch = nvrtc.get_arch_option(9, 0, "a")
+        self.assertEqual("compute_80", arch)
+
+    def test_get_arch_option_force_cc_arch_specific(self):
+        with override_config("FORCE_CUDA_CC", (9, 0, "a")):
+            arch = nvrtc.get_arch_option(9, 0)
+        self.assertEqual("compute_90a", arch)
+
+    def test_get_arch_option_illegal_arch_specific(self):
+        # Using a fictitious very high compute capability (major 99) for this
+        # test to ensure future toolkits are unlikely to provide an exact match
+        msg = "Can't use arch-specific compute_990a with"
+        with self.assertRaisesRegex(ValueError, msg):
+            nvrtc.get_arch_option(99, 0, "a")
+
+
 if __name__ == "__main__":
     unittest.main()
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py

@@ -87,13 +87,17 @@ class TestCudaMemory(CUDATestCase):
             dtor_invoked[0] += 1
 
         # Ensure finalizer is called when pointer is deleted
-        ptr = driver.MemoryPointer(
+        ptr = driver.MemoryPointer(
+            context=self.context, pointer=fake_ptr, size=40, finalizer=dtor
+        )
         self.assertEqual(dtor_invoked[0], 0)
         del ptr
         self.assertEqual(dtor_invoked[0], 1)
 
         # Ensure removing derived pointer doesn't call finalizer
-        ptr = driver.MemoryPointer(
+        ptr = driver.MemoryPointer(
+            context=self.context, pointer=fake_ptr, size=40, finalizer=dtor
+        )
         owned = ptr.own()
         del owned
         self.assertEqual(dtor_invoked[0], 1)
numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py

@@ -3,6 +3,7 @@
 
 import ctypes
 import numpy as np
+import weakref
 
 from numba import cuda
 from numba.cuda.core import config
@@ -57,9 +58,10 @@ if not config.ENABLE_CUDASIM:
 
             # We use an AutoFreePointer so that the finalizer will be run when
             # the reference count drops to zero.
+            ctx = weakref.proxy(self.context)
             ptr = ctypes.c_void_p(alloc_count)
             return cuda.cudadrv.driver.AutoFreePointer(
-                ptr, size, finalizer=finalizer
+                ctx, ptr, size, finalizer=finalizer
             )
 
         def initialize(self):
numba_cuda/numba/cuda/tests/cudadrv/test_linker.py

@@ -10,11 +10,12 @@ from numba.cuda.testing import (
     skip_if_nvjitlink_missing,
 )
 from numba.cuda.testing import CUDATestCase, test_data_dir
-from numba.cuda.cudadrv.driver import
+from numba.cuda.cudadrv.driver import _Linker, LinkerError
 from numba.cuda import require_context
 from numba import cuda
 from numba.cuda import void, float64, int64, int32, float32
 from numba.cuda.typing.typeof import typeof
+from numba.cuda._compat import CUDAError
 
 CONST1D = np.arange(10, dtype=np.float64)
 
@@ -113,7 +114,7 @@ class TestLinker(CUDATestCase):
     @require_context
     def test_linker_basic(self):
         """Simply go through the constructor and destructor"""
-        linker = _Linker
+        linker = _Linker(max_registers=0, cc=(7, 5))
         del linker
 
     def _test_linking(self, eager):
@@ -195,7 +196,7 @@ class TestLinker(CUDATestCase):
 
         link = str(test_data_dir / "error.cu")
 
-        from cuda.
+        from numba.cuda._compat import NVRTCError
 
         errty = NVRTCError
         with self.assertRaises(errty) as e:
@@ -308,10 +309,8 @@ class TestLinker(CUDATestCase):
         max_threads = compiled.get_max_threads_per_block()
         nelem = max_threads + 1
         ary = np.empty(nelem, dtype=np.int32)
-
+        with self.assertRaisesRegex(CUDAError, "CUDA_ERROR_INVALID_VALUE"):
             compiled[1, nelem](ary)
-        except CudaAPIError as e:
-            self.assertIn("cuLaunchKernel", e.msg)
 
     def test_get_local_mem_per_thread(self):
         sig = void(int32[::1], int32[::1], typeof(np.int32))
@@ -333,7 +332,7 @@ class TestLinker(CUDATestCase):
 
     @skip_if_nvjitlink_missing("nvJitLink not installed or new enough (>12.3)")
     def test_link_for_different_cc(self):
-        linker = _Linker
+        linker = _Linker(max_registers=0, cc=(7, 5), lto=True)
         code = """
         __device__ int foo(int x) {
             return x + 1;
numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py

@@ -13,11 +13,10 @@ from numba.cuda.testing import (
     CUDATestCase,
     skip_on_cudasim,
 )
+from numba.cuda._compat import ObjectCode
 
 if not config.ENABLE_CUDASIM:
-    from cuda.bindings.driver import
-
-    from cuda.bindings.driver import CUmodule as cu_module_type
+    from cuda.bindings.driver import cuLibraryGetGlobal, cuMemcpyHtoD
 
 
 def wipe_all_modules_in_context():
@@ -31,8 +30,8 @@ def wipe_all_modules_in_context():
     ctx.reset()
 
 
-def get_hashable_handle_value(
-    return handle
+def get_hashable_handle_value(object_code):
+    return object_code.handle
 
 
 @skip_on_cudasim("Module loading not implemented in the simulator")
@@ -40,13 +39,13 @@ class TestModuleCallbacksBasic(CUDATestCase):
     def test_basic(self):
         counter = 0
 
-        def setup(
-            self.
+        def setup(object_code):
+            self.assertIsInstance(object_code, ObjectCode)
             nonlocal counter
             counter += 1
 
-        def teardown(
-            self.
+        def teardown(object_code):
+            self.assertIsInstance(object_code, ObjectCode)
             nonlocal counter
             counter -= 1
 
@@ -183,10 +182,10 @@ __device__ int get_num(int &retval) {
 }
 """
 
-        def set_forty_two(
+        def set_forty_two(object_code):
             # Initialize 42 to global variable `num`
-            res, dptr, size =
-                get_hashable_handle_value(
+            res, dptr, size = cuLibraryGetGlobal(
+                get_hashable_handle_value(object_code), b"num"
             )
 
             arr = np.array([42], np.int32)
numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py

@@ -43,6 +43,12 @@ if TEST_BIN_DIR:
         TEST_BIN_DIR, "test_device_functions.ltoir"
     )
 
+    require_cuobjdump = (
+        test_device_functions_fatbin_multi,
+        test_device_functions_fatbin,
+        test_device_functions_o,
+    )
+
 
 @unittest.skipIf(
     not TEST_BIN_DIR or not _have_nvjitlink(),
@@ -99,17 +105,50 @@ class TestLinker(CUDATestCase):
         kernel[1, 1](result)
         assert result[0] == 3
 
+    def test_nvjitlink_jit_with_invalid_linkable_code(self):
+        with open(test_device_functions_cubin, "rb") as f:
+            content = f.read()
+        with self.assertRaisesRegex(
+            TypeError, "Expected path to file or a LinkableCode"
+        ):
+
+            @cuda.jit("void()", link=[content])
+            def kernel():
+                pass
+
+
+@unittest.skipIf(
+    not TEST_BIN_DIR or not _have_nvjitlink(),
+    "nvJitLink not installed or new enough (>12.3)",
+)
+@skip_on_cudasim("Linking unsupported in the simulator")
+class TestLinkerDumpAssembly(CUDATestCase):
+    def setUp(self):
+        super().setUp()
+        self._prev_dump_assembly = config.DUMP_ASSEMBLY
+        config.DUMP_ASSEMBLY = True
+
+    def tearDown(self):
+        config.DUMP_ASSEMBLY = self._prev_dump_assembly
+        super().tearDown()
+
     def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly(self):
-        files =
+        files = (
             test_device_functions_cu,
             test_device_functions_ltoir,
             test_device_functions_fatbin_multi,
-
-
-        config.DUMP_ASSEMBLY = True
+        )
 
         for file in files:
             with self.subTest(file=file):
+                if (
+                    file in require_cuobjdump
+                    and os.getenv("NUMBA_CUDA_TEST_WHEEL_ONLY") is not None
+                ):
+                    self.skipTest(
+                        "wheel-only environments do not have cuobjdump"
+                    )
+
                 f = io.StringIO()
                 with contextlib.redirect_stdout(f):
                     sig = "uint32(uint32, uint32)"
@@ -125,21 +164,25 @@ class TestLinker(CUDATestCase):
 
         self.assertTrue("ASSEMBLY (AFTER LTO)" in f.getvalue())
 
-        config.DUMP_ASSEMBLY = False
-
     def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn(self):
-        files =
+        files = (
            test_device_functions_a,
            test_device_functions_cubin,
            test_device_functions_fatbin,
            test_device_functions_o,
            test_device_functions_ptx,
-
-
-        config.DUMP_ASSEMBLY = True
+        )
 
         for file in files:
             with self.subTest(file=file):
+                if (
+                    file in require_cuobjdump
+                    and os.getenv("NUMBA_CUDA_TEST_WHEEL_ONLY") is not None
+                ):
+                    self.skipTest(
+                        "wheel-only environments do not have cuobjdump"
+                    )
+
                 sig = "uint32(uint32, uint32)"
                 add_from_numba = cuda.declare_device("add_from_numba", sig)
 
@@ -156,19 +199,6 @@ class TestLinker(CUDATestCase):
             func(result)
             assert result[0] == 3
 
-        config.DUMP_ASSEMBLY = False
-
-    def test_nvjitlink_jit_with_invalid_linkable_code(self):
-        with open(test_device_functions_cubin, "rb") as f:
-            content = f.read()
-        with self.assertRaisesRegex(
-            TypeError, "Expected path to file or a LinkableCode"
-        ):
-
-            @cuda.jit("void()", link=[content])
-            def kernel():
-                pass
-
 
 if __name__ == "__main__":
     unittest.main()
numba_cuda/numba/cuda/tests/cudapy/test_analysis.py

@@ -854,13 +854,25 @@ class TestBranchPrunePredicates(TestBranchPruneBase):
             _CONST2 = "PLACEHOLDER2"
             return _CONST2 + 4
 
-
+        if PYVERSION in ((3, 14),):
+            # The order of the __code__.co_consts changes with 3.14
+            new = self._literal_const_sample_generator(impl, {0: 0, 2: 20})
+        elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
+            new = self._literal_const_sample_generator(impl, {1: 0, 3: 20})
+        else:
+            raise NotImplementedError(PYVERSION)
         iconst = impl.__code__.co_consts
         nconst = new.__code__.co_consts
-
-            iconst, (
-
-
+        if PYVERSION in ((3, 14),):
+            self.assertEqual(iconst, ("PLACEHOLDER1", 3.14159, "PLACEHOLDER2"))
+            self.assertEqual(nconst, (0, 3.14159, 20))
+        elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
+            self.assertEqual(
+                iconst, (None, "PLACEHOLDER1", 3.14159, "PLACEHOLDER2", 4)
+            )
+            self.assertEqual(nconst, (None, 0, 3.14159, 20, 4))
+        else:
+            raise NotImplementedError(PYVERSION)
         self.assertEqual(impl(None), 3.14159)
         self.assertEqual(new(None), 24)
 
@@ -872,7 +884,17 @@ class TestBranchPrunePredicates(TestBranchPruneBase):
 
         for c_inp, prune in (self._TRUTHY, False), (self._FALSEY, True):
             for const in c_inp:
-
+                if PYVERSION in ((3, 14),):
+                    # The order of the __code__.co_consts changes with 3.14
+                    func = self._literal_const_sample_generator(
+                        impl, {0: const}
+                    )
+                elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
+                    func = self._literal_const_sample_generator(
+                        impl, {1: const}
+                    )
+                else:
+                    raise NotImplementedError(PYVERSION)
                 self.assert_prune(
                     func, (types.NoneType("none"),), [prune], None
                 )
@@ -885,7 +907,17 @@ class TestBranchPrunePredicates(TestBranchPruneBase):
 
         for c_inp, prune in (self._TRUTHY, False), (self._FALSEY, True):
             for const in c_inp:
-
+                if PYVERSION in ((3, 14),):
+                    # The order of the __code__.co_consts changes with 3.14
+                    func = self._literal_const_sample_generator(
+                        impl, {0: const}
+                    )
+                elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
+                    func = self._literal_const_sample_generator(
+                        impl, {1: const}
+                    )
+                else:
+                    raise NotImplementedError(PYVERSION)
                 self.assert_prune(
                     func, (types.NoneType("none"),), [prune], None
                 )
@@ -900,7 +932,17 @@ class TestBranchPrunePredicates(TestBranchPruneBase):
 
         for c_inp, prune in (self._TRUTHY, False), (self._FALSEY, True):
             for const in c_inp:
-
+                if PYVERSION in ((3, 14),):
+                    # The order of the __code__.co_consts changes with 3.14
+                    func = self._literal_const_sample_generator(
+                        impl, {0: const}
+                    )
+                elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
+                    func = self._literal_const_sample_generator(
+                        impl, {1: const}
+                    )
+                else:
+                    raise NotImplementedError(PYVERSION)
                 self.assert_prune(
                     func, (types.NoneType("none"),), [prune], None
                 )
@@ -915,7 +957,17 @@ class TestBranchPrunePredicates(TestBranchPruneBase):
 
         for c_inp, prune in (self._TRUTHY, False), (self._FALSEY, True):
            for const in c_inp:
-
+                if PYVERSION in ((3, 14),):
+                    # The order of the __code__.co_consts changes with 3.14
+                    func = self._literal_const_sample_generator(
+                        impl, {0: const}
+                    )
+                elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
+                    func = self._literal_const_sample_generator(
+                        impl, {1: const}
+                    )
+                else:
+                    raise NotImplementedError(PYVERSION)
                 self.assert_prune(
                     func, (types.NoneType("none"),), [prune], None
                 )