numba_cuda-0.21.1-cp313-cp313-win_amd64.whl → numba_cuda-0.24.0-cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/__init__.py +4 -1
- numba_cuda/numba/cuda/_compat.py +47 -0
- numba_cuda/numba/cuda/api.py +4 -1
- numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_dispatcher.cpp +8 -40
- numba_cuda/numba/cuda/cext/_hashtable.cpp +5 -0
- numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_pymodule.h +1 -1
- numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_typeof.cpp +56 -119
- numba_cuda/numba/cuda/cext/mviewbuf.c +7 -1
- numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +4 -5
- numba_cuda/numba/cuda/codegen.py +46 -12
- numba_cuda/numba/cuda/compiler.py +15 -9
- numba_cuda/numba/cuda/core/analysis.py +29 -21
- numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +1 -1
- numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
- numba_cuda/numba/cuda/core/base.py +12 -11
- numba_cuda/numba/cuda/core/bytecode.py +21 -13
- numba_cuda/numba/cuda/core/byteflow.py +336 -90
- numba_cuda/numba/cuda/core/compiler.py +3 -4
- numba_cuda/numba/cuda/core/compiler_machinery.py +3 -3
- numba_cuda/numba/cuda/core/config.py +5 -7
- numba_cuda/numba/cuda/core/consts.py +1 -1
- numba_cuda/numba/cuda/core/controlflow.py +17 -9
- numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
- numba_cuda/numba/cuda/core/errors.py +4 -912
- numba_cuda/numba/cuda/core/inline_closurecall.py +82 -67
- numba_cuda/numba/cuda/core/interpreter.py +334 -160
- numba_cuda/numba/cuda/core/ir.py +191 -119
- numba_cuda/numba/cuda/core/ir_utils.py +149 -128
- numba_cuda/numba/cuda/core/postproc.py +8 -8
- numba_cuda/numba/cuda/core/pythonapi.py +3 -0
- numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
- numba_cuda/numba/cuda/core/rewrites/static_binop.py +1 -1
- numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
- numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
- numba_cuda/numba/cuda/core/ssa.py +5 -5
- numba_cuda/numba/cuda/core/transforms.py +29 -16
- numba_cuda/numba/cuda/core/typed_passes.py +10 -10
- numba_cuda/numba/cuda/core/typeinfer.py +42 -27
- numba_cuda/numba/cuda/core/untyped_passes.py +82 -65
- numba_cuda/numba/cuda/cpython/unicode.py +2 -2
- numba_cuda/numba/cuda/cpython/unicode_support.py +1 -3
- numba_cuda/numba/cuda/cudadecl.py +0 -13
- numba_cuda/numba/cuda/cudadrv/devicearray.py +10 -9
- numba_cuda/numba/cuda/cudadrv/driver.py +142 -519
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +87 -32
- numba_cuda/numba/cuda/cudaimpl.py +0 -12
- numba_cuda/numba/cuda/debuginfo.py +25 -0
- numba_cuda/numba/cuda/descriptor.py +1 -1
- numba_cuda/numba/cuda/device_init.py +4 -7
- numba_cuda/numba/cuda/deviceufunc.py +3 -6
- numba_cuda/numba/cuda/dispatcher.py +39 -49
- numba_cuda/numba/cuda/intrinsics.py +150 -1
- numba_cuda/numba/cuda/libdeviceimpl.py +1 -2
- numba_cuda/numba/cuda/lowering.py +36 -29
- numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
- numba_cuda/numba/cuda/np/arrayobj.py +61 -9
- numba_cuda/numba/cuda/np/numpy_support.py +32 -9
- numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +4 -3
- numba_cuda/numba/cuda/printimpl.py +20 -0
- numba_cuda/numba/cuda/serialize.py +10 -0
- numba_cuda/numba/cuda/stubs.py +0 -11
- numba_cuda/numba/cuda/testing.py +4 -8
- numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +195 -51
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +6 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +53 -23
- numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +61 -9
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +6 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +22 -1
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +13 -0
- numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +37 -35
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +117 -1
- numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
- numba_cuda/numba/cuda/tests/support.py +11 -0
- numba_cuda/numba/cuda/types/cuda_functions.py +1 -1
- numba_cuda/numba/cuda/typing/asnumbatype.py +37 -2
- numba_cuda/numba/cuda/typing/context.py +3 -1
- numba_cuda/numba/cuda/typing/typeof.py +51 -2
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/METADATA +4 -13
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/RECORD +106 -105
- numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
- numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
- numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/WHEEL +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE.numba +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py
@@ -187,41 +187,43 @@ def make_fancy_creation_kernel(vtype):
 
     f4_34 = v4(f4_1)  # 1 2 3 4
 
-    for v in (
-        f4_1,
-        f4_2,
-        f4_3,
-        f4_4,
-        f4_5,
-        f4_6,
-        f4_7,
-        f4_8,
-        f4_9,
-        f4_10,
-        f4_11,
-        f4_12,
-        f4_13,
-        f4_14,
-        f4_15,
-        f4_16,
-        f4_17,
-        f4_18,
-        f4_19,
-        f4_20,
-        f4_21,
-        f4_22,
-        f4_23,
-        f4_24,
-        f4_25,
-        f4_26,
-        f4_27,
-        f4_28,
-        f4_29,
-        f4_30,
-        f4_31,
-        f4_32,
-        f4_33,
-        f4_34,
+    for v in tuple(
+        (
+            f4_1,
+            f4_2,
+            f4_3,
+            f4_4,
+            f4_5,
+            f4_6,
+            f4_7,
+            f4_8,
+            f4_9,
+            f4_10,
+            f4_11,
+            f4_12,
+            f4_13,
+            f4_14,
+            f4_15,
+            f4_16,
+            f4_17,
+            f4_18,
+            f4_19,
+            f4_20,
+            f4_21,
+            f4_22,
+            f4_23,
+            f4_24,
+            f4_25,
+            f4_26,
+            f4_27,
+            f4_28,
+            f4_29,
+            f4_30,
+            f4_31,
+            f4_32,
+            f4_33,
+            f4_34,
+        )
     ):
         res[j] = v.x
         res[j + 1] = v.y
numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py
@@ -4,7 +4,7 @@
 import re
 
 import numpy as np
-from numba import cuda
+from numba import cuda, errors
 from numba.cuda import int32, int64, float32, float64
 from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
 from numba.cuda.compiler import compile_ptx
@@ -208,6 +208,122 @@ class TestCudaWarpOperations(CUDATestCase):
         compiled[1, nelem](ary, val)
         self.assertTrue(np.all(ary == val))
 
+    def test_vote_sync_const_mode_val(self):
+        nelem = 32
+        ary1 = np.ones(nelem, dtype=np.int32)
+        ary2 = np.empty(nelem, dtype=np.int32)
+
+        subtest = [
+            (use_vote_sync_all, "void(int32[:], int32[:])", (ary1, ary2)),
+            (use_vote_sync_any, "void(int32[:], int32[:])", (ary1, ary2)),
+            (use_vote_sync_eq, "void(int32[:], int32[:])", (ary1, ary2)),
+            (use_vote_sync_ballot, "void(uint32[:])", (ary2,)),
+        ]
+
+        args_re = r"\((.*)\)"
+        m = re.compile(args_re)
+
+        for func, sig, input in subtest:
+            with self.subTest(func=func.__name__):
+                compiled = cuda.jit(sig)(func)
+                compiled[1, nelem](*input)
+                irs = next(iter(compiled.inspect_llvm().values()))
+
+                for ir in irs.split("\n"):
+                    if "call" in ir and "llvm.nvvm.vote.sync" in ir:
+                        args = m.search(ir).group(0)
+                        arglist = args.split(",")
+                        mode_arg = arglist[1]
+                        self.assertNotIn("%", mode_arg)
+
+    def test_vote_sync_const_mode_val_sm100(self):
+        subtest = [
+            (use_vote_sync_all, "void(int32[:], int32[:])"),
+            (use_vote_sync_any, "void(int32[:], int32[:])"),
+            (use_vote_sync_eq, "void(int32[:], int32[:])"),
+            (use_vote_sync_ballot, "void(uint32[:])"),
+        ]
+
+        for func, sig in subtest:
+            with self.subTest(func=func.__name__):
+                compile_ptx(func, sig, cc=(10, 0))
+
+    def test_vote_sync_type_validation(self):
+        nelem = 32
+
+        def use_vote_sync_all_with_mask(mask, predicate, result):
+            i = cuda.grid(1)
+            if i < result.shape[0]:
+                result[i] = cuda.all_sync(mask[i], predicate[i])
+
+        invalid_cases = [
+            (
+                "void(float32[:], int32[:], int32[:])",
+                "Mask type must be an integer",
+            ),
+            (
+                "void(boolean[:], int32[:], int32[:])",
+                "Mask type must be an integer",
+            ),
+            (
+                "void(float64[:], int32[:], int32[:])",
+                "Mask type must be an integer",
+            ),
+            (
+                "void(int32[:], float32[:], int32[:])",
+                "Predicate must be an integer or boolean",
+            ),
+            (
+                "void(int32[:], float64[:], int32[:])",
+                "Predicate must be an integer or boolean",
+            ),
+        ]
+
+        for sig, expected_msg in invalid_cases:
+            with self.subTest(sig=sig):
+                with self.assertRaisesRegex(errors.TypingError, expected_msg):
+                    cuda.jit(sig)(use_vote_sync_all_with_mask)
+
+        valid_cases = [
+            # mask: unsigned/signed integer
+            # predicate: unsigned/signed integer, boolean
+            ("void(uint32[:], uint32[:], int32[:])", np.uint32, np.uint32),
+            ("void(int64[:], int64[:], int32[:])", np.int64, np.int64),
+            ("void(uint64[:], uint64[:], int32[:])", np.uint64, np.uint64),
+            ("void(int32[:], int32[:], int32[:])", np.int32, np.int32),
+            ("void(uint32[:], boolean[:], int32[:])", np.uint32, np.bool_),
+            ("void(uint64[:], boolean[:], int32[:])", np.uint64, np.bool_),
+        ]
+
+        for sig, mask_dtype, pred_dtype in valid_cases:
+            with self.subTest(sig=sig):
+                mask_val = (~np.array(0, dtype=mask_dtype)).item()
+                compiled = cuda.jit(sig)(use_vote_sync_all_with_mask)
+                ary_mask = np.full(nelem, mask_val, dtype=mask_dtype)
+                ary_pred = np.ones(nelem, dtype=pred_dtype)
+                ary_result = np.empty(nelem, dtype=np.int32)
+                compiled[1, nelem](ary_mask, ary_pred, ary_result)
+
+        # literals
+        @cuda.jit
+        def use_vote_sync_all_with_literal(result):
+            i = cuda.grid(1)
+            if i < result.shape[0]:
+                result[i] = cuda.all_sync(0xFFFFFFFF, 1)
+
+        ary_result = np.empty(nelem, dtype=np.int32)
+        use_vote_sync_all_with_literal[1, nelem](ary_result)
+
+        @cuda.jit
+        def use_vote_sync_all_with_predicate_literal(mask, result):
+            i = cuda.grid(1)
+            if i < mask.shape[0]:
+                result[i] = cuda.all_sync(mask[i], 1)
+
+        ary_mask = np.full(nelem, 0xFFFFFFFF, dtype=np.uint32)
+        ary_result = np.empty(nelem, dtype=np.int32)
+        use_vote_sync_all_with_predicate_literal[1, nelem](ary_mask, ary_result)
+
     def test_vote_sync_all(self):
         compiled = cuda.jit("void(int32[:], int32[:])")(use_vote_sync_all)
         nelem = 32
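For context, the warp-vote intrinsics exercised above all follow the same pattern: each lane supplies a mask of participating threads and a predicate, and the vote result is broadcast back to the participating lanes. A minimal sketch, not taken from the diff, using only the public `cuda.all_sync` and `cuda.ballot_sync` API (kernel and variable names are illustrative):

```python
# Hedged sketch of the warp-vote API the new tests cover: all_sync is
# nonzero only if every lane in the mask passes a true predicate, and
# ballot_sync packs each lane's predicate bit into a 32-bit mask.
import numpy as np
from numba import cuda

FULL_MASK = 0xFFFFFFFF  # all 32 lanes of the warp participate


@cuda.jit
def warp_votes(values, all_out, ballot_out):
    i = cuda.grid(1)
    if i < values.size:
        all_out[i] = cuda.all_sync(FULL_MASK, values[i] > 0)
        ballot_out[i] = cuda.ballot_sync(FULL_MASK, values[i] > 0)


values = cuda.to_device(np.arange(32, dtype=np.int32))
all_out = cuda.device_array(32, dtype=np.int32)
ballot_out = cuda.device_array(32, dtype=np.uint32)
warp_votes[1, 32](values, all_out, ballot_out)
# Lane 0 holds value 0, so all_sync yields 0 and bit 0 of the ballot is clear.
```

The constant-mode tests above then inspect the generated LLVM IR to check that the vote mode operand of `llvm.nvvm.vote.sync` is an immediate rather than a register (`%`-prefixed) value.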
numba_cuda/numba/cuda/tests/doc_examples/test_globals.py (new file)
@@ -0,0 +1,111 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+import unittest
+
+from numba.cuda.testing import CUDATestCase, skip_on_cudasim
+from numba.cuda.tests.support import captured_stdout
+
+
+@skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
+class TestGlobals(CUDATestCase):
+    """
+    Tests demonstrating how global variables are captured in CUDA kernels.
+    """
+
+    def setUp(self):
+        # Prevent output from this test showing
+        # up when running the test suite
+        self._captured_stdout = captured_stdout()
+        self._captured_stdout.__enter__()
+        super().setUp()
+
+    def tearDown(self):
+        # No exception type, value, or traceback
+        self._captured_stdout.__exit__(None, None, None)
+        super().tearDown()
+
+    def test_ex_globals_constant_capture(self):
+        """
+        Test demonstrating how global variables are captured as constants.
+        """
+        # magictoken.ex_globals_constant_capture.begin
+        import numpy as np
+        from numba import cuda
+
+        TAX_RATE = 0.08
+        PRICES = np.array([10.0, 25.0, 5.0, 15.0, 30.0], dtype=np.float64)
+
+        @cuda.jit
+        def compute_totals(quantities, totals):
+            i = cuda.grid(1)
+            if i < totals.size:
+                totals[i] = quantities[i] * PRICES[i] * (1 + TAX_RATE)
+
+        d_quantities = cuda.to_device(
+            np.array([1, 2, 3, 4, 5], dtype=np.float64)
+        )
+        d_totals = cuda.device_array(5, dtype=np.float64)
+
+        # First kernel call - compiles and captures values
+        compute_totals[1, 32](d_quantities, d_totals)
+        print("Value of d_totals:", d_totals.copy_to_host())
+
+        # These modifications have no effect on subsequent kernel calls
+        TAX_RATE = 0.10  # noqa: F841
+        PRICES[:] = [20.0, 50.0, 10.0, 30.0, 60.0]
+
+        # Second kernel call still uses the original values
+        compute_totals[1, 32](d_quantities, d_totals)
+        print("Value of d_totals:", d_totals.copy_to_host())
+        # magictoken.ex_globals_constant_capture.end
+
+        # Verify the values are the same (original values were captured)
+        expected = np.array([10.8, 54.0, 16.2, 64.8, 162.0])
+        np.testing.assert_allclose(d_totals.copy_to_host(), expected)
+
+    def test_ex_globals_device_array_capture(self):
+        """
+        Test demonstrating how global device arrays are captured by pointer.
+        """
+        # magictoken.ex_globals_device_array_capture.begin
+        import numpy as np
+        from numba import cuda
+
+        # Global device array - pointer is captured, not data
+        PRICES = cuda.to_device(
+            np.array([10.0, 25.0, 5.0, 15.0, 30.0], dtype=np.float32)
+        )
+
+        @cuda.jit
+        def compute_totals(quantities, totals):
+            i = cuda.grid(1)
+            if i < totals.size:
+                totals[i] = quantities[i] * PRICES[i]
+
+        d_quantities = cuda.to_device(
+            np.array([1.0, 1.0, 1.0, 1.0, 1.0], dtype=np.float32)
+        )
+        d_totals = cuda.device_array(5, dtype=np.float32)
+
+        # First kernel call
+        compute_totals[1, 32](d_quantities, d_totals)
+        print(d_totals.copy_to_host())  # [10. 25. 5. 15. 30.]
+
+        # Mutate the device array in-place
+        PRICES.copy_to_device(
+            np.array([20.0, 50.0, 10.0, 30.0, 60.0], dtype=np.float32)
+        )
+
+        # Second kernel call sees the updated values
+        compute_totals[1, 32](d_quantities, d_totals)
+        print(d_totals.copy_to_host())  # [20. 50. 10. 30. 60.]
+        # magictoken.ex_globals_device_array_capture.end
+
+        # Verify the second call sees updated values
+        expected = np.array([20.0, 50.0, 10.0, 30.0, 60.0], dtype=np.float32)
+        np.testing.assert_allclose(d_totals.copy_to_host(), expected)
+
+
+if __name__ == "__main__":
+    unittest.main()
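A hedged sketch, not part of the new test file: because module-level scalars and NumPy arrays are frozen into the kernel at compile time (as the first test demonstrates), passing them as kernel arguments is the usual way to make them updatable between launches.

```python
# Values passed as arguments are read at launch time, not baked in at
# compile time, so they can change between calls without recompilation.
import numpy as np
from numba import cuda


@cuda.jit
def compute_totals(quantities, prices, tax_rate, totals):
    i = cuda.grid(1)
    if i < totals.size:
        totals[i] = quantities[i] * prices[i] * (1 + tax_rate)


quantities = cuda.to_device(np.ones(5, dtype=np.float64))
prices = cuda.to_device(np.array([10.0, 25.0, 5.0, 15.0, 30.0]))
totals = cuda.device_array(5, dtype=np.float64)
compute_totals[1, 32](quantities, prices, 0.08, totals)  # 8% tax rate
compute_totals[1, 32](quantities, prices, 0.10, totals)  # now 10%, no rebuild
```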
numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py
@@ -387,5 +387,66 @@ class TestIterate(unittest.TestCase):
             x = val  # noqa: F841
 
 
+@skip_on_cudasim("Tests internals of the CUDA driver device array")
+class TestEmptyArrays(unittest.TestCase):
+    def test_empty_array_flags(self):
+        test_shapes = [
+            (0,),
+            (10, 0),
+            (0, 10),
+            (0, 0),
+            (5, 0, 3),
+            (0, 5, 3),
+            (5, 3, 0),
+            (0, 0, 0),
+        ]
+        for shape in test_shapes:
+            with self.subTest(shape=shape):
+                nparr = np.empty(shape)
+                arr = Array.from_desc(
+                    0, nparr.shape, nparr.strides, nparr.dtype.itemsize
+                )
+                # Empty arrays should be both C and F contiguous
+                self.assertEqual(
+                    arr.flags["C_CONTIGUOUS"],
+                    nparr.flags["C_CONTIGUOUS"],
+                    f"C_CONTIGUOUS mismatch for shape {shape}",
+                )
+                self.assertEqual(
+                    arr.flags["F_CONTIGUOUS"],
+                    nparr.flags["F_CONTIGUOUS"],
+                    f"F_CONTIGUOUS mismatch for shape {shape}",
+                )
+                self.assertTrue(arr.flags["C_CONTIGUOUS"])
+                self.assertTrue(arr.flags["F_CONTIGUOUS"])
+
+
+@skip_on_cudasim("Tests CUDA device array type inference")
+class TestEmptyArrayTypeInference(unittest.TestCase):
+    def test_empty_array_typeof(self):
+        from numba import cuda, typeof
+
+        test_cases = [
+            ((0,), np.int64),
+            ((10, 0), np.int64),
+            ((0, 10), np.int64),
+            ((0, 0), np.float32),
+            ((5, 0, 3), np.float32),
+            ((0, 5, 3), np.int32),
+            ((5, 3, 0), np.float64),
+        ]
+
+        for shape, dtype in test_cases:
+            with self.subTest(shape=shape, dtype=dtype):
+                h_values = np.empty(shape, dtype=dtype)
+                d_values = cuda.to_device(h_values)
+                self.assertEqual(
+                    typeof(h_values),
+                    typeof(d_values),
+                    f"Type mismatch for shape {shape}, dtype {dtype}: "
+                    f"host={typeof(h_values)}, device={typeof(d_values)}",
+                )
+
+
 if __name__ == "__main__":
     unittest.main()
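The invariant these tests encode mirrors NumPy's own convention, which can be checked directly:

```python
# NumPy reports zero-size arrays as both C- and F-contiguous; the new
# tests match the dummyarray Array.from_desc flags against this behavior.
import numpy as np

for shape in [(0,), (10, 0), (0, 10), (5, 0, 3)]:
    a = np.empty(shape)
    assert a.flags["C_CONTIGUOUS"] and a.flags["F_CONTIGUOUS"]
```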
numba_cuda/numba/cuda/tests/nrt/test_nrt.py
@@ -382,6 +382,37 @@ class TestNrtStatistics(CUDATestCase):
         self.assertEqual(stats.free, stats_free)
         self.assertEqual(stats.mi_free, stats_mi_free)
 
+    def test_nrt_toggle_enabled(self):
+        def array_reshape1d(arr, newshape, got):
+            y = arr.reshape(newshape)
+            for i in range(y.shape[0]):
+                got[i] = y[i]
+
+        def array_reshape(arr, newshape):
+            return arr.reshape(newshape)
+
+        with override_config("CUDA_ENABLE_NRT", True):
+            # compile a kernel that caches an NRT enabled reshape primitive
+            @cuda.jit
+            def kernel(out):
+                out = out.reshape(out.shape)
+                out[0] = 1
+
+            out = cuda.to_device(np.zeros(1, dtype=np.float64))
+            kernel[1, 1](out)
+
+        with override_config("CUDA_ENABLE_NRT", False):
+            # compile and launch a new kernel that gets a cache hit on the
+            # NRT enabled reshape, but tries to launch with NRT disabled
+            # globally
+            new_kernel = cuda.jit(array_reshape1d)
+            arr = np.arange(24)
+            expected = array_reshape(arr, (24,))
+            got = np.zeros(expected.shape, dtype=arr.dtype)
+            new_kernel[1, 1](arr, (24,), got)
+
+            self.assertTrue(np.array_equal(expected, got))
+
 
 if __name__ == "__main__":
     unittest.main()
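As background, NRT is the runtime that gives kernels device-side allocation, and `arr.reshape(...)` may depend on it when the result cannot be a simple view. A minimal sketch, under that assumption, of a kernel whose compilation is sensitive to the `CUDA_ENABLE_NRT` setting toggled above (names are illustrative):

```python
import numpy as np
from numba import cuda


@cuda.jit
def flatten(arr, out):
    # reshape may need to allocate a new device array, which requires NRT
    flat = arr.reshape(arr.size)
    for i in range(flat.shape[0]):
        out[i] = flat[i]


arr = np.arange(24).reshape(4, 6)
out = np.zeros(24, dtype=arr.dtype)
flatten[1, 1](arr, out)  # needs CUDA_ENABLE_NRT if a copy is required
```

The regression the new test guards against is the cached, NRT-enabled compilation of `reshape` being reused after NRT has been disabled globally.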
numba_cuda/numba/cuda/tests/support.py
@@ -38,6 +38,7 @@ from numba.cuda.datamodel.models import OpaqueModel
 from numba.cuda.np import numpy_support
 
 from numba.cuda import HAS_NUMBA
+from numba.cuda.utils import PYVERSION
 
 if HAS_NUMBA:
     from numba.core.extending import (
@@ -56,6 +57,16 @@ class EnableNRTStatsMixin(object):
         rtsys.memsys_disable_stats()
 
 
+skip_if_py314 = unittest.skipIf(PYVERSION == (3, 14), "Test unstable on 3.14")
+
+
+def expected_failure_py314(fn):
+    if PYVERSION == (3, 14):
+        return unittest.expectedFailure(fn)
+    else:
+        return fn
+
+
 skip_unless_cffi = unittest.skipUnless(cffi_utils.SUPPORTED, "requires cffi")
 
 _lnx_reason = "linux only test"
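A hedged usage sketch for the new Python 3.14 helpers: both are ordinary `unittest` decorators applied per test method (the class and test names here are hypothetical).

```python
import unittest

from numba.cuda.tests.support import expected_failure_py314, skip_if_py314


class TestExample(unittest.TestCase):
    @skip_if_py314
    def test_unstable_on_314(self):
        ...  # skipped entirely on 3.14

    @expected_failure_py314
    def test_known_bad_on_314(self):
        ...  # counted as an expected failure on 3.14, runs normally elsewhere
```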
numba_cuda/numba/cuda/types/cuda_functions.py
@@ -334,7 +334,7 @@ class BaseFunction(Callable):
                     k: _unlit_non_poison(v) for k, v in kws.items()
                 }
                 sig = temp.apply(nolitargs, nolitkws)
-            except Exception as e:
+            except Exception as e:  # noqa: PERF203
                 if not isinstance(e, errors.NumbaError):
                     raise e
                 sig = None
numba_cuda/numba/cuda/typing/asnumbatype.py
@@ -7,6 +7,7 @@ import typing as py_typing
 from numba.cuda.typing.typeof import typeof
 from numba.cuda.core import errors
 from numba.cuda import types
+from numba.cuda.utils import PYVERSION
 
 
 class AsNumbaTypeRegistry:
@@ -40,8 +41,42 @@ class AsNumbaTypeRegistry:
         return py_type
 
     def _builtin_infer(self, py_type):
-        if not isinstance(py_type, py_typing._GenericAlias):
-            return
+        if PYVERSION in ((3, 14),):
+            # As of 3.14 the typing module has been updated to return a
+            # different type when calling: `typing.Optional[X]`.
+            #
+            # On 3.14:
+            #
+            # >>> type(typing.Optional[float])
+            # <class 'typing.Union'>
+            #
+            #
+            # On 3.13 (and presumably below):
+            #
+            # >>> type(typing.Optional[float])
+            # <class 'typing._UnionGenericAlias'>
+            #
+            #
+            # The previous implementation of this predicate used
+            # `_GenericAlias`, which was possible because `_UnionGenericAlias`
+            # is a subclass of `_GenericAlias`...
+            #
+            # >>> issubclass(typing._UnionGenericAlias, typing._GenericAlias)
+            # True
+            #
+            # However, other types, such as `typing.List[float]`, remain
+            # `typing._GenericAlias`, so that check must be kept.
+            #
+            if not isinstance(
+                py_type, (py_typing.Union, py_typing._GenericAlias)
+            ):
+                return
+        elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
+            # Use of underscore type `_GenericAlias`.
+            if not isinstance(py_type, py_typing._GenericAlias):
+                return
+        else:
+            raise NotImplementedError(PYVERSION)
 
         if getattr(py_type, "__origin__", None) is py_typing.Union:
             if len(py_type.__args__) != 2:
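A hedged illustration of the CPython change this branch handles; the snippet only prints the observed types and is safe on any supported version:

```python
import typing

t = typing.Optional[float]
# Python <= 3.13: <class 'typing._UnionGenericAlias'> (a _GenericAlias)
# Python 3.14:    <class 'typing.Union'>
print(type(t))

# typing.List[float] stays a _GenericAlias on both, which is why the old
# _GenericAlias check is kept alongside the new typing.Union case.
print(type(typing.List[float]))
```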
numba_cuda/numba/cuda/typing/context.py
@@ -460,7 +460,9 @@ class BaseContext(object):
         def is_external(obj):
             """Check if obj is from outside numba.* namespace."""
             try:
-                return not obj.__module__.startswith("numba.")
+                is_numba_module = obj.__module__.startswith("numba.")
+                is_test_module = obj.__module__.startswith("numba.cuda.tests.")
+                return not is_numba_module or is_test_module
             except AttributeError:
                 return True
 
numba_cuda/numba/cuda/typing/typeof.py
@@ -5,6 +5,7 @@ from collections import namedtuple
 from functools import singledispatch
 import ctypes
 import enum
+import operator
 
 import numpy as np
 from numpy.random.bit_generator import BitGenerator
@@ -47,11 +48,20 @@ def typeof_impl(val, c):
     """
     Generic typeof() implementation.
     """
-    tp = _typeof_buffer(val, c)
+    tp = getattr(val, "_numba_type_", None)
     if tp is not None:
         return tp
 
-    tp = getattr(val, "_numba_type_", None)
+    # Check for __cuda_array_interface__ objects (third-party device arrays)
+
+    # Numba's own DeviceNDArray is handled above via _numba_type_.
+    cai = getattr(val, "__cuda_array_interface__", None)
+    if cai is not None:
+        tp = _typeof_cuda_array_interface(cai, c)
+        if tp is not None:
+            return tp
+
+    tp = _typeof_buffer(val, c)
     if tp is not None:
         return tp
 
@@ -299,3 +309,42 @@ def typeof_numpy_polynomial(val, c):
     domain = typeof(val.domain)
     window = typeof(val.window)
     return types.PolynomialType(coef, domain, window)
+
+
+def _typeof_cuda_array_interface(val, c):
+    """
+    Determine the type of a __cuda_array_interface__ object.
+
+    This handles third-party device arrays that implement the CUDA
+    Array Interface. These are typed as regular Array types, with lowering
+    handled in numba.cuda.np.arrayobj.
+    """
+    dtype = numpy_support.from_dtype(np.dtype(val["typestr"]))
+    shape = val["shape"]
+    ndim = len(shape)
+    strides = val.get("strides")
+
+    # Determine layout
+    if not ndim:
+        layout = "C"
+    elif strides is None:
+        layout = "C"
+    else:
+        itemsize = np.dtype(val["typestr"]).itemsize
+        # Quick rejection: C-contiguous has strides[-1] == itemsize,
+        # F-contiguous has strides[0] == itemsize. If neither, it's "A".
+        if strides[-1] == itemsize:
+            c_strides = numpy_support.strides_from_shape(
+                shape, itemsize, order="C"
+            )
+            layout = "C" if all(map(operator.eq, strides, c_strides)) else "A"
+        elif strides[0] == itemsize:
+            f_strides = numpy_support.strides_from_shape(
+                shape, itemsize, order="F"
+            )
+            layout = "F" if all(map(operator.eq, strides, f_strides)) else "A"
+        else:
+            layout = "A"
+
+    _, readonly = val["data"]
+    return types.Array(dtype, ndim, layout, readonly=readonly)
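A hedged walk-through of the layout classification above, using a hand-built interface dict for a 4x3 C-contiguous float32 array (the pointer value is a dummy, and the dict itself is illustrative, not taken from the diff):

```python
# itemsize is 4, strides are (12, 4): strides[-1] == itemsize passes the
# quick check, and comparing against the computed C strides yields "C".
import numpy as np

cai = {
    "shape": (4, 3),
    "typestr": "<f4",
    "data": (0x7F000000, False),  # (device pointer, readonly) - dummy ptr
    "strides": (12, 4),
    "version": 3,
}
itemsize = np.dtype(cai["typestr"]).itemsize
assert itemsize == 4 and cai["strides"][-1] == itemsize

# A transposed view of the same buffer would advertise strides (4, 12):
# there strides[0] == itemsize, so it is classified as "F" instead, and
# anything matching neither pattern falls through to layout "A".
```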
{numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: numba-cuda
-Version: 0.21.1
+Version: 0.24.0
 Summary: CUDA target for Numba
 Author: Anaconda Inc., NVIDIA Corporation
 License-Expression: BSD-2-Clause
@@ -16,24 +16,15 @@ License-File: LICENSE.numba
 Requires-Dist: numba>=0.60.0
 Requires-Dist: cuda-bindings<14.0.0,>=12.9.1
 Requires-Dist: cuda-core<1.0.0,>=0.3.2
+Requires-Dist: packaging
 Provides-Extra: cu12
 Requires-Dist: cuda-bindings<13.0.0,>=12.9.1; extra == "cu12"
 Requires-Dist: cuda-core<1.0.0,>=0.3.0; extra == "cu12"
-Requires-Dist: cuda-
-Requires-Dist: nvidia-cuda-nvcc-cu12; extra == "cu12"
-Requires-Dist: nvidia-cuda-runtime-cu12; extra == "cu12"
-Requires-Dist: nvidia-cuda-nvrtc-cu12; extra == "cu12"
-Requires-Dist: nvidia-nvjitlink-cu12; extra == "cu12"
-Requires-Dist: nvidia-cuda-cccl-cu12; extra == "cu12"
+Requires-Dist: cuda-toolkit[cccl,cudart,nvcc,nvjitlink,nvrtc]==12.*; extra == "cu12"
 Provides-Extra: cu13
 Requires-Dist: cuda-bindings==13.*; extra == "cu13"
 Requires-Dist: cuda-core<1.0.0,>=0.3.2; extra == "cu13"
-Requires-Dist: cuda-
-Requires-Dist: nvidia-nvvm==13.*; extra == "cu13"
-Requires-Dist: nvidia-cuda-runtime==13.*; extra == "cu13"
-Requires-Dist: nvidia-cuda-nvrtc==13.*; extra == "cu13"
-Requires-Dist: nvidia-nvjitlink==13.*; extra == "cu13"
-Requires-Dist: nvidia-cuda-cccl==13.*; extra == "cu13"
+Requires-Dist: cuda-toolkit[cccl,cudart,nvjitlink,nvrtc,nvvm]==13.*; extra == "cu13"
 Dynamic: license-file
 
 <div align="center"><img src="docs/source/_static/numba-green-icon-rgb.svg" width="200"/></div>