numba-cuda 0.11.0__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/{cuda_bf16.py → _internal/cuda_bf16.py} +1 -1
  3. numba_cuda/numba/cuda/api.py +13 -0
  4. numba_cuda/numba/cuda/bf16.py +112 -0
  5. numba_cuda/numba/cuda/cg.py +2 -0
  6. numba_cuda/numba/cuda/codegen.py +8 -0
  7. numba_cuda/numba/cuda/compiler.py +2 -1
  8. numba_cuda/numba/cuda/cudadecl.py +6 -1
  9. numba_cuda/numba/cuda/cudadrv/driver.py +4 -0
  10. numba_cuda/numba/cuda/cudadrv/nvrtc.py +23 -1
  11. numba_cuda/numba/cuda/debuginfo.py +27 -0
  12. numba_cuda/numba/cuda/decorators.py +5 -2
  13. numba_cuda/numba/cuda/dispatcher.py +2 -2
  14. numba_cuda/numba/cuda/target.py +10 -1
  15. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +0 -12
  16. numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +33 -0
  17. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +55 -0
  18. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +49 -23
  19. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +34 -51
  20. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +34 -0
  21. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +17 -0
  22. numba_cuda/numba/cuda/tests/data/cta_barrier.cu +23 -0
  23. numba_cuda/numba/cuda/tests/data/include/add.cuh +3 -0
  24. numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +3 -0
  25. numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +9 -0
  26. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +48 -1
  27. {numba_cuda-0.11.0.dist-info → numba_cuda-0.12.1.dist-info}/METADATA +1 -1
  28. {numba_cuda-0.11.0.dist-info → numba_cuda-0.12.1.dist-info}/RECORD +31 -24
  29. {numba_cuda-0.11.0.dist-info → numba_cuda-0.12.1.dist-info}/WHEEL +1 -1
  30. {numba_cuda-0.11.0.dist-info → numba_cuda-0.12.1.dist-info}/licenses/LICENSE +0 -0
  31. {numba_cuda-0.11.0.dist-info → numba_cuda-0.12.1.dist-info}/top_level.txt +0 -0
numba_cuda/VERSION CHANGED
@@ -1 +1 @@
- 0.11.0
+ 0.12.1
numba_cuda/numba/cuda/{cuda_bf16.py → _internal/cuda_bf16.py} RENAMED
@@ -2,7 +2,7 @@
  # Generator Information:
  # Ast_canopy version: 0.3.0
  # Numbast version: 0.3.0
- # Generation command: /home/wangm/numbast/numbast/src/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/
+ # Generation command: /home/wangm/numbast/numbast/src/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/_internal
  # Static binding generator parameters: {'cfg_path': 'configs/cuda_bf16.yml', 'output_dir': 'numba_cuda/numba/cuda/', 'entry_point': None, 'retain': None, 'types': None, 'datamodels': None, 'compute_capability': None, 'run_ruff_format': True}
  # Config file path (relative to the path of the generated binding): ../../../../configs/cuda_bf16.yml
  # Cudatoolkit version: (12, 8)
numba_cuda/numba/cuda/api.py CHANGED
@@ -10,6 +10,7 @@ import numpy as np
  from .cudadrv import devicearray, devices, driver
  from numba.core import config
  from numba.cuda.api_util import prepare_shape_strides_dtype
+ from numba.cuda.cudadrv.runtime import get_version

  # NDarray device helper

@@ -95,6 +96,18 @@ def is_float16_supported():
      return True


+ def is_bfloat16_supported():
+     """Whether bfloat16 are supported.
+
+     bfloat16 are only supported on devices with compute capability >= 8.0 and cuda version >= 12.0
+     """
+     cuda_version = get_version()
+     return current_context().device.supports_bfloat16 and cuda_version >= (
+         12,
+         0,
+     )
+
+
  @require_context
  def to_device(obj, stream=0, copy=True, to=None):
      """to_device(obj, stream=0, copy=True, to=None)
numba_cuda/numba/cuda/bf16.py ADDED
@@ -0,0 +1,112 @@
+ from numba.cuda._internal.cuda_bf16 import (
+     _type_class___nv_bfloat16,
+     nv_bfloat16 as bfloat16,
+     htrunc,
+     hceil,
+     hfloor,
+     hrint,
+     hsqrt,
+     hrsqrt,
+     hrcp,
+     hlog,
+     hlog2,
+     hlog10,
+     hcos,
+     hsin,
+     hexp,
+     hexp2,
+     hexp10,
+     htanh,
+     htanh_approx,
+ )
+ from numba.extending import overload
+
+ import math
+
+
+ def _make_unary(a, func):
+     if isinstance(a, _type_class___nv_bfloat16):
+         return lambda a: func(a)
+
+
+ # Bind low++ bindings to math APIs
+ @overload(math.trunc, target="cuda")
+ def trunc_ol(a):
+     return _make_unary(a, htrunc)
+
+
+ @overload(math.ceil, target="cuda")
+ def ceil_ol(a):
+     return _make_unary(a, hceil)
+
+
+ @overload(math.floor, target="cuda")
+ def floor_ol(a):
+     return _make_unary(a, hfloor)
+
+
+ @overload(math.sqrt, target="cuda")
+ def sqrt_ol(a):
+     return _make_unary(a, hsqrt)
+
+
+ @overload(math.log, target="cuda")
+ def log_ol(a):
+     return _make_unary(a, hlog)
+
+
+ @overload(math.log10, target="cuda")
+ def log10_ol(a):
+     return _make_unary(a, hlog10)
+
+
+ @overload(math.cos, target="cuda")
+ def cos_ol(a):
+     return _make_unary(a, hcos)
+
+
+ @overload(math.sin, target="cuda")
+ def sin_ol(a):
+     return _make_unary(a, hsin)
+
+
+ @overload(math.tanh, target="cuda")
+ def tanh_ol(a):
+     return _make_unary(a, htanh)
+
+
+ @overload(math.exp, target="cuda")
+ def exp_ol(a):
+     return _make_unary(a, hexp)
+
+
+ try:
+     from math import exp2
+
+     @overload(exp2, target="cuda")
+     def exp2_ol(a):
+         return _make_unary(a, hexp2)
+ except ImportError:
+     pass
+
+
+ __all__ = [
+     "bfloat16",
+     "htrunc",
+     "hceil",
+     "hfloor",
+     "hrint",
+     "hsqrt",
+     "hrsqrt",
+     "hrcp",
+     "hlog",
+     "hlog2",
+     "hlog10",
+     "hcos",
+     "hsin",
+     "htanh",
+     "htanh_approx",
+     "hexp",
+     "hexp2",
+     "hexp10",
+ ]
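The new numba.cuda.bf16 module re-exports the generated nv_bfloat16 type as bfloat16 and overloads several math functions for it. A minimal usage sketch, mirroring the test added later in this diff (requires compute capability 8.0+ and CUDA 12+):

import math
import numpy as np
from numba import cuda, float32
from numba.cuda.bf16 import bfloat16

@cuda.jit
def kernel(out):
    # math.sqrt dispatches to hsqrt for bfloat16 operands via the overloads above
    x = bfloat16(3.14)
    out[0] = float32(math.sqrt(x))

out = np.zeros(1, dtype=np.float32)
kernel[1, 1](out)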
numba_cuda/numba/cuda/cg.py CHANGED
@@ -23,6 +23,7 @@ def _this_grid(typingctx):
      sig = signature(grid_group)

      def codegen(context, builder, sig, args):
+         context.active_code_library.use_cooperative = True
          one = context.get_constant(types.int32, 1)
          mod = builder.module
          return builder.call(
@@ -45,6 +46,7 @@ def _grid_group_sync(typingctx, group):
      sig = signature(types.int32, group)

      def codegen(context, builder, sig, args):
+         context.active_code_library.use_cooperative = True
          flags = context.get_constant(types.int32, 0)
          mod = builder.module
          return builder.call(
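With this change, a kernel that uses cooperative groups marks its code library instead of relying on a PTX string match. A sketch of the usage pattern (the same shape as cg_usecase_kernel, which this diff moves into cg_cache_usecases.py):

from numba import cuda

@cuda.jit
def kernel(r, x):
    # this_grid()/sync() now set use_cooperative on the active code
    # library, so the dispatcher performs a cooperative launch.
    grid = cuda.cg.this_grid()
    grid.sync()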
numba_cuda/numba/cuda/codegen.py CHANGED
@@ -70,6 +70,8 @@ class ExternalCodeLibrary(CodeLibrary):
          self._setup_functions = []
          self._teardown_functions = []

+         self.use_cooperative = False
+
      @property
      def modules(self):
          # There are no LLVM IR modules in an ExternalCodeLibrary
@@ -181,6 +183,8 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
          self._nvvm_options = nvvm_options
          self._entry_name = entry_name

+         self.use_cooperative = False
+
      @property
      def llvm_strs(self):
          if self._llvm_strs is None:
@@ -352,6 +356,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
          self._linking_files.update(library._linking_files)
          self._setup_functions.extend(library._setup_functions)
          self._teardown_functions.extend(library._teardown_functions)
+         self.use_cooperative |= library.use_cooperative

      def add_linking_file(self, path_or_obj):
          if isinstance(path_or_obj, LinkableCode):
@@ -442,6 +447,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
              nvvm_options=self._nvvm_options,
              needs_cudadevrt=self.needs_cudadevrt,
              nrt=nrt,
+             use_cooperative=self.use_cooperative,
          )

      @classmethod
@@ -458,6 +464,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
          nvvm_options,
          needs_cudadevrt,
          nrt,
+         use_cooperative,
      ):
          """
          Rebuild an instance.
@@ -472,6 +479,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
          instance._max_registers = max_registers
          instance._nvvm_options = nvvm_options
          instance.needs_cudadevrt = needs_cudadevrt
+         instance.use_cooperative = use_cooperative

          instance._finalized = True
          if nrt:
numba_cuda/numba/cuda/compiler.py CHANGED
@@ -797,7 +797,7 @@ def compile_ptx_for_current_device(
      )


- def declare_device_function(name, restype, argtypes, link):
+ def declare_device_function(name, restype, argtypes, link, use_cooperative):
      from .descriptor import cuda_target

      typingctx = cuda_target.typing_context
@@ -816,6 +816,7 @@ def declare_device_function(name, restype, argtypes, link):
      lib = ExternalCodeLibrary(f"{name}_externals", targetctx.codegen())
      for file in link:
          lib.add_linking_file(file)
+     lib.use_cooperative = use_cooperative

      # ExternalFunctionDescriptor provides a lowering implementation for calling
      # external functions
numba_cuda/numba/cuda/cudadecl.py CHANGED
@@ -423,7 +423,11 @@ _genfp16_binary_operator(operator.itruediv)
  def _resolve_wrapped_unary(fname):
      link = tuple()
      decl = declare_device_function(
-         f"__numba_wrapper_{fname}", types.float16, (types.float16,), link
+         f"__numba_wrapper_{fname}",
+         types.float16,
+         (types.float16,),
+         link,
+         use_cooperative=False,
      )
      return types.Function(decl)

@@ -438,6 +442,7 @@ def _resolve_wrapped_binary(fname):
              types.float16,
          ),
          link,
+         use_cooperative=False,
      )
      return types.Function(decl)

numba_cuda/numba/cuda/cudadrv/driver.py CHANGED
@@ -714,6 +714,10 @@ class Device(object):
      def supports_float16(self):
          return self.compute_capability >= (5, 3)

+     @property
+     def supports_bfloat16(self):
+         return self.compute_capability >= (8, 0)
+

  def met_requirement_for_device(device):
      if device.compute_capability < MIN_REQUIRED_CC:
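The new Device property mirrors supports_float16 and is a plain compute-capability check; for example:

from numba import cuda

# Sketch: True from compute capability 8.0 (Ampere) upwards.
print(cuda.get_current_device().supports_bfloat16)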
numba_cuda/numba/cuda/cudadrv/nvrtc.py CHANGED
@@ -6,13 +6,21 @@ from numba.cuda.cudadrv.error import (
      NvrtcCompilationError,
      NvrtcSupportError,
  )
+ from numba import config
  from numba.cuda.cuda_paths import get_cuda_paths
+ from numba.cuda.utils import _readenv

  import functools
  import os
  import threading
  import warnings

+ NVRTC_EXTRA_SEARCH_PATHS = _readenv(
+     "NUMBA_CUDA_NVRTC_EXTRA_SEARCH_PATHS", str, ""
+ ) or getattr(config, "NUMBA_CUDA_NVRTC_EXTRA_SEARCH_PATHS", "")
+ if not hasattr(config, "NUMBA_CUDA_NVRTC_EXTRA_SEARCH_PATHS"):
+     config.CUDA_NVRTC_EXTRA_SEARCH_PATHS = NVRTC_EXTRA_SEARCH_PATHS
+
  # Opaque handle for compilation unit
  nvrtc_program = c_void_p

@@ -383,10 +391,24 @@ def compile(src, name, cc, ltoir=False):
      else:
          numba_include = f"-I{os.path.join(numba_cuda_path, 'include', '12')}"

+     if config.CUDA_NVRTC_EXTRA_SEARCH_PATHS:
+         extra_search_paths = config.CUDA_NVRTC_EXTRA_SEARCH_PATHS.split(":")
+         extra_includes = [f"-I{p}" for p in extra_search_paths]
+     else:
+         extra_includes = []
+
      nrt_path = os.path.join(numba_cuda_path, "runtime")
      nrt_include = f"-I{nrt_path}"

-     options = [arch, numba_include, *cuda_include, nrt_include, "-rdc", "true"]
+     options = [
+         arch,
+         numba_include,
+         *cuda_include,
+         nrt_include,
+         *extra_includes,
+         "-rdc",
+         "true",
+     ]

      if ltoir:
          options.append("-dlto")
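Extra NVRTC include directories can now be supplied as a colon-separated list, either through the NUMBA_CUDA_NVRTC_EXTRA_SEARCH_PATHS environment variable or the CUDA_NVRTC_EXTRA_SEARCH_PATHS config attribute consumed in compile() above. A sketch (the directory paths are placeholders):

from numba import config

# Each entry becomes an -I<path> option passed to NVRTC.
config.CUDA_NVRTC_EXTRA_SEARCH_PATHS = "/opt/my_headers:/opt/other_headers"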
numba_cuda/numba/cuda/debuginfo.py CHANGED
@@ -59,6 +59,33 @@ class CUDADIBuilder(DIBuilder):
          # For other cases, use upstream Numba implementation
          return super()._var_type(lltype, size, datamodel=datamodel)

+     def _di_subroutine_type(self, line, function, argmap):
+         # The function call conv needs encoding.
+         llfunc = function
+         md = []
+
+         # Create metadata type for return value
+         if len(llfunc.args) > 0:
+             lltype = llfunc.args[0].type
+             size = self.cgctx.get_abi_sizeof(lltype)
+             mdtype = self._var_type(lltype, size, datamodel=None)
+             md.append(mdtype)
+
+         # Create metadata type for arguments
+         for idx, (name, nbtype) in enumerate(argmap.items()):
+             datamodel = self.cgctx.data_model_manager[nbtype]
+             lltype = self.cgctx.get_value_type(nbtype)
+             size = self.cgctx.get_abi_sizeof(lltype)
+             mdtype = self._var_type(lltype, size, datamodel=datamodel)
+             md.append(mdtype)
+
+         return self.module.add_debug_info(
+             "DISubroutineType",
+             {
+                 "types": self.module.add_metadata(md),
+             },
+         )
+
      def mark_variable(
          self,
          builder,
numba_cuda/numba/cuda/decorators.py CHANGED
@@ -229,7 +229,7 @@ def jit(
      return disp


- def declare_device(name, sig, link=None):
+ def declare_device(name, sig, link=None, use_cooperative=False):
      """
      Declare the signature of a foreign function. Returns a descriptor that can
      be used to call the function from a Python kernel.
@@ -238,6 +238,7 @@ def declare_device(name, sig, link=None):
      :type name: str
      :param sig: The Numba signature of the function.
      :param link: External code to link when calling the function.
+     :param use_cooperative: External code requires cooperative launch.
      """
      if link is None:
          link = tuple()
@@ -250,6 +251,8 @@ def declare_device(name, sig, link=None):
          msg = "Return type must be provided for device declarations"
          raise TypeError(msg)

-     template = declare_device_function(name, restype, argtypes, link)
+     template = declare_device_function(
+         name, restype, argtypes, link, use_cooperative
+     )

      return template.key
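declare_device() now accepts use_cooperative, so externally linked code that requires a cooperative launch propagates that requirement to any kernel calling it. A sketch modelled on the test added later in this diff (the file name and function name are placeholders; the C side is assumed to follow the usual FFI convention of writing its result through the first pointer argument):

from numba import cuda

barrier = cuda.declare_device(
    "block_barrier", sig="int32()", link=["block_barrier.cu"], use_cooperative=True
)

@cuda.jit
def kernel():
    barrier()

# After compilation, kernel.overloads[()].cooperative is True and the
# kernel is launched cooperatively.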
numba_cuda/numba/cuda/dispatcher.py CHANGED
@@ -151,8 +151,8 @@ class _Kernel(serialize.ReduceMixin):

          asm = lib.get_asm_str()

-         # A kernel needs cooperative launch if grid_sync is being used.
-         self.cooperative = "cudaCGGetIntrinsicHandle" in asm
+         # The code library contains functions that require cooperative launch.
+         self.cooperative = lib.use_cooperative
          # We need to link against cudadevrt if grid sync is being used.
          if self.cooperative:
              lib.needs_cudadevrt = True
numba_cuda/numba/cuda/target.py CHANGED
@@ -290,7 +290,16 @@ class CUDATargetContext(BaseContext):


  class CUDACallConv(MinimalCallConv):
-     pass
+     def decorate_function(self, fn, args, fe_argtypes, noalias=False):
+         """
+         Set names and attributes of function arguments.
+         """
+         assert not noalias
+         arginfo = self._get_arg_packer(fe_argtypes)
+         # Do not prefix "arg." on argument name, so that nvvm compiler
+         # can track debug info of argument more accurately
+         arginfo.assign_names(self.get_arguments(fn), args)
+         fn.args[0].name = ".ret"


  class CUDACABICallConv(BaseCallConv):
numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py CHANGED
@@ -203,18 +203,6 @@ def simple_usecase_kernel(r, x):
  simple_usecase_caller = CUDAUseCase(simple_usecase_kernel)


- # Usecase with cooperative groups
-
-
- @cuda.jit(cache=True)
- def cg_usecase_kernel(r, x):
-     grid = cuda.cg.this_grid()
-     grid.sync()
-
-
- cg_usecase = CUDAUseCase(cg_usecase_kernel)
-
-
  class _TestModule(CUDATestCase):
      """
      Tests for functionality of this module's functions.
numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py ADDED
@@ -0,0 +1,33 @@
+ from numba import cuda
+ from numba.cuda.testing import CUDATestCase
+ import sys
+
+ from numba.cuda.tests.cudapy.cache_usecases import CUDAUseCase
+
+
+ # Usecase with cooperative groups
+
+
+ @cuda.jit(cache=True)
+ def cg_usecase_kernel(r, x):
+     grid = cuda.cg.this_grid()
+     grid.sync()
+
+
+ cg_usecase = CUDAUseCase(cg_usecase_kernel)
+
+
+ class _TestModule(CUDATestCase):
+     """
+     Tests for functionality of this module's functions.
+     Note this does not define any "test_*" method, instead check_module()
+     should be called by hand.
+     """
+
+     def check_module(self, mod):
+         mod.cg_usecase(0)
+
+
+ def self_test():
+     mod = sys.modules[__name__]
+     _TestModule().check_module(mod)
numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py ADDED
@@ -0,0 +1,55 @@
+ from numba import cuda, float32
+ from numba.cuda.bf16 import bfloat16
+ from numba.cuda.testing import CUDATestCase
+
+ import math
+
+
+ class TestBfloat16HighLevelBindings(CUDATestCase):
+     def skip_unsupported(self):
+         if not cuda.is_bfloat16_supported():
+             self.skipTest(
+                 "bfloat16 requires compute capability 8.0+ and CUDA version>= 12.0"
+             )
+
+     def test_use_type_in_kernel(self):
+         self.skip_unsupported()
+
+         @cuda.jit
+         def kernel():
+             bfloat16(3.14)
+
+         kernel[1, 1]()
+
+     def test_math_bindings(self):
+         self.skip_unsupported()
+         functions = [
+             math.trunc,
+             math.ceil,
+             math.floor,
+             math.sqrt,
+             math.log,
+             math.log10,
+             math.cos,
+             math.sin,
+             math.tanh,
+             math.exp,
+             math.exp2,
+         ]
+
+         for f in functions:
+             with self.subTest(func=f):
+
+                 @cuda.jit
+                 def kernel(arr):
+                     x = bfloat16(3.14)
+                     y = f(x)
+                     arr[0] = float32(y)
+
+                 arr = cuda.device_array((1,), dtype="float32")
+                 kernel[1, 1](arr)
+
+                 if f in (math.exp, math.exp2):
+                     self.assertAlmostEqual(arr[0], f(3.14), delta=1e-1)
+                 else:
+                     self.assertAlmostEqual(arr[0], f(3.14), delta=1e-2)
numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py CHANGED
@@ -5,7 +5,7 @@ import numpy as np
  from numba import int16, int32, int64, uint16, uint32, uint64, float32, float64
  from numba.types import float16

- from numba.cuda.cuda_bf16 import (
+ from numba.cuda._internal.cuda_bf16 import (
      nv_bfloat16,
      htrunc,
      hceil,
@@ -22,21 +22,23 @@ from numba.cuda.cuda_bf16 import (
      hexp,
      hexp2,
      hexp10,
+     htanh,
+     htanh_approx,
  )

- from numba.cuda.cudadrv.runtime import get_version
-
- cuda_version = get_version()
-
  dtypes = [int16, int32, int64, uint16, uint32, uint64, float32]


- @unittest.skipIf(
-     (cuda.get_current_device().compute_capability < (8, 0)),
-     "bfloat16 requires compute capability 8.0+",
- )
  class Bfloat16Test(CUDATestCase):
+     def skip_unsupported(self):
+         if not cuda.is_bfloat16_supported():
+             self.skipTest(
+                 "bfloat16 requires compute capability 8.0+ and CUDA version>= 12.0"
+             )
+
      def test_ctor(self):
+         self.skip_unsupported()
+
          @cuda.jit
          def simple_kernel():
              a = nv_bfloat16(float64(1.0))  # noqa: F841
@@ -47,18 +49,13 @@ class Bfloat16Test(CUDATestCase):
              f = nv_bfloat16(uint16(6))  # noqa: F841
              g = nv_bfloat16(uint32(7))  # noqa: F841
              h = nv_bfloat16(uint64(8))  # noqa: F841
+             i = nv_bfloat16(float16(9))  # noqa: F841

          simple_kernel[1, 1]()

-         if cuda_version >= (12, 0):
-
-             @cuda.jit
-             def simple_kernel_fp16():
-                 i = nv_bfloat16(float16(9))  # noqa: F841
-
-             simple_kernel_fp16[1, 1]()
-
      def test_casts(self):
+         self.skip_unsupported()
+
          @cuda.jit
          def simple_kernel(b, c, d, e, f, g, h):
              a = nv_bfloat16(3.14)
@@ -90,6 +87,7 @@ class Bfloat16Test(CUDATestCase):
          assert h[0] == 3

      def test_ctor_cast_loop(self):
+         self.skip_unsupported()
          for dtype in dtypes:
              with self.subTest(dtype=dtype):

@@ -106,6 +104,8 @@ class Bfloat16Test(CUDATestCase):
                  assert a[0] == 3

      def test_arithmetic(self):
+         self.skip_unsupported()
+
          @cuda.jit
          def simple_kernel(arith, logic):
              # Binary Arithmetic Operators
@@ -175,6 +175,8 @@ class Bfloat16Test(CUDATestCase):
          )

      def test_math_func(self):
+         self.skip_unsupported()
+
          @cuda.jit
          def simple_kernel(a):
              x = nv_bfloat16(3.14)
@@ -191,16 +193,18 @@ class Bfloat16Test(CUDATestCase):
              a[9] = float32(hlog10(x))
              a[10] = float32(hcos(x))
              a[11] = float32(hsin(x))
-             a[12] = float32(hexp(x))
-             a[13] = float32(hexp2(x))
-             a[14] = float32(hexp10(x))
+             a[12] = float32(htanh(x))
+             a[13] = float32(htanh_approx(x))
+             a[14] = float32(hexp(x))
+             a[15] = float32(hexp2(x))
+             a[16] = float32(hexp10(x))

-         a = np.zeros(15, dtype=np.float32)
+         a = np.zeros(17, dtype=np.float32)
          simple_kernel[1, 1](a)

          x = 3.14
          np.testing.assert_allclose(
-             a[:12],
+             a[:14],
              [
                  np.trunc(x),
                  np.ceil(x),
@@ -214,15 +218,19 @@ class Bfloat16Test(CUDATestCase):
                  np.log10(x),
                  np.cos(x),
                  np.sin(x),
+                 np.tanh(x),
+                 np.tanh(x),
              ],
              atol=1e-2,
          )

          np.testing.assert_allclose(
-             a[12:], [np.exp(x), np.exp2(x), np.power(10, x)], atol=1e2
+             a[14:], [np.exp(x), np.exp2(x), np.power(10, x)], atol=1e2
          )

      def test_check_bfloat16_type(self):
+         self.skip_unsupported()
+
          @cuda.jit
          def kernel(arr):
              x = nv_bfloat16(3.14)
@@ -237,6 +245,8 @@ class Bfloat16Test(CUDATestCase):
          np.testing.assert_allclose(arr, [3.14], atol=1e-2)

      def test_use_within_device_func(self):
+         self.skip_unsupported()
+
          @cuda.jit(device=True)
          def add_bf16(a, b):
              return a + b
@@ -252,6 +262,22 @@ class Bfloat16Test(CUDATestCase):

          np.testing.assert_allclose(arr, [8], atol=1e-2)

+     def test_use_binding_inside_dfunc(self):
+         @cuda.jit(device=True)
+         def f(arr):
+             pi = nv_bfloat16(3.14)
+             three = htrunc(pi)
+             arr[0] = float32(three)
+
+         @cuda.jit
+         def kernel(arr):
+             f(arr)
+
+         arr = np.zeros(1, np.float32)
+         kernel[1, 1](arr)
+
+         np.testing.assert_allclose(arr, [3], atol=1e-2)
+

  if __name__ == "__main__":
      unittest.main()
numba_cuda/numba/cuda/tests/cudapy/test_caching.py CHANGED
@@ -1,8 +1,6 @@
  import multiprocessing
  import os
  import shutil
- import subprocess
- import sys
  import unittest
  import warnings

@@ -163,55 +161,6 @@ class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest):
          f = mod.renamed_function2
          self.assertPreciseEqual(f(2), 8)

-     @skip_unless_cc_60
-     @skip_if_cudadevrt_missing
-     @skip_if_mvc_enabled("CG not supported with MVC")
-     def test_cache_cg(self):
-         # Functions using cooperative groups should be cacheable. See Issue
-         # #8888: https://github.com/numba/numba/issues/8888
-         self.check_pycache(0)
-         mod = self.import_module()
-         self.check_pycache(0)
-
-         mod.cg_usecase(0)
-         self.check_pycache(2)  # 1 index, 1 data
-
-         # Check the code runs ok from another process
-         self.run_in_separate_process()
-
-     @skip_unless_cc_60
-     @skip_if_cudadevrt_missing
-     @skip_if_mvc_enabled("CG not supported with MVC")
-     def test_cache_cg_clean_run(self):
-         # See Issue #9432: https://github.com/numba/numba/issues/9432
-         # If a cached function using CG sync was the first thing to compile,
-         # the compile would fail.
-         self.check_pycache(0)
-
-         # This logic is modelled on run_in_separate_process(), but executes the
-         # CG usecase directly in the subprocess.
-         code = """if 1:
-             import sys
-
-             sys.path.insert(0, %(tempdir)r)
-             mod = __import__(%(modname)r)
-             mod.cg_usecase(0)
-             """ % dict(tempdir=self.tempdir, modname=self.modname)
-
-         popen = subprocess.Popen(
-             [sys.executable, "-c", code],
-             stdout=subprocess.PIPE,
-             stderr=subprocess.PIPE,
-         )
-         out, err = popen.communicate(timeout=60)
-         if popen.returncode != 0:
-             raise AssertionError(
-                 "process failed with code %s: \n"
-                 "stdout follows\n%s\n"
-                 "stderr follows\n%s\n"
-                 % (popen.returncode, out.decode(), err.decode()),
-             )
-
      def _test_pycache_fallback(self):
          """
          With a disabled __pycache__, test there is a working fallback
@@ -275,6 +224,40 @@ class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest):
          pass


+ @skip_on_cudasim("Simulator does not implement caching")
+ class CUDACooperativeGroupTest(SerialMixin, DispatcherCacheUsecasesTest):
+     # See Issue #9432: https://github.com/numba/numba/issues/9432
+     # If a cached function using CG sync was the first thing to compile,
+     # the compile would fail.
+     here = os.path.dirname(__file__)
+     usecases_file = os.path.join(here, "cg_cache_usecases.py")
+     modname = "cuda_cooperative_caching_test_fodder"
+
+     def setUp(self):
+         DispatcherCacheUsecasesTest.setUp(self)
+         CUDATestCase.setUp(self)
+
+     def tearDown(self):
+         CUDATestCase.tearDown(self)
+         DispatcherCacheUsecasesTest.tearDown(self)
+
+     @skip_unless_cc_60
+     @skip_if_cudadevrt_missing
+     @skip_if_mvc_enabled("CG not supported with MVC")
+     def test_cache_cg(self):
+         # Functions using cooperative groups should be cacheable. See Issue
+         # #8888: https://github.com/numba/numba/issues/8888
+         self.check_pycache(0)
+         mod = self.import_module()
+         self.check_pycache(0)
+
+         mod.cg_usecase(0)
+         self.check_pycache(2)  # 1 index, 1 data
+
+         # Check the code runs ok from another process
+         self.run_in_separate_process()
+
+
  @skip_on_cudasim("Simulator does not implement caching")
  class CUDAAndCPUCachingTest(SerialMixin, DispatcherCacheUsecasesTest):
      here = os.path.dirname(__file__)
numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py CHANGED
@@ -1,8 +1,13 @@
  from __future__ import print_function

+ import os
+
+ import cffi
+
  import numpy as np

  from numba import config, cuda, int32
+ from numba.types import CPointer
  from numba.cuda.testing import (
      unittest,
      CUDATestCase,
@@ -11,6 +16,9 @@ from numba.cuda.testing import (
      skip_if_cudadevrt_missing,
      skip_if_mvc_enabled,
  )
+ from numba.core.typing import signature
+
+ ffi = cffi.FFI()


  @cuda.jit
@@ -149,6 +157,32 @@ class TestCudaCooperativeGroups(CUDATestCase):
          self.assertEqual(blocks1d, blocks2d)
          self.assertEqual(blocks1d, blocks3d)

+     @skip_unless_cc_60
+     def test_external_cooperative_func(self):
+         cudapy_test_path = os.path.dirname(__file__)
+         tests_path = os.path.dirname(cudapy_test_path)
+         data_path = os.path.join(tests_path, "data")
+         src = os.path.join(data_path, "cta_barrier.cu")
+
+         sig = signature(
+             CPointer(int32),
+         )
+         cta_barrier = cuda.declare_device(
+             "cta_barrier", sig=sig, link=[src], use_cooperative=True
+         )
+
+         @cuda.jit
+         def kernel():
+             cta_barrier()
+
+         block_size = 32
+         grid_size = 1024
+
+         kernel[grid_size, block_size]()
+
+         overload = kernel.overloads[()]
+         self.assertTrue(overload.cooperative)
+

  if __name__ == "__main__":
      unittest.main()
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py CHANGED
@@ -310,6 +310,23 @@ class TestCudaDebugInfo(CUDATestCase):
          with captured_stdout():
              self._test_kernel_args_types()

+     def test_kernel_args_names(self):
+         sig = (types.int32,)
+
+         @cuda.jit("void(int32)", debug=True, opt=False)
+         def f(x):
+             z = x  # noqa: F841
+
+         llvm_ir = f.inspect_llvm(sig)
+
+         # Verify argument name is not prefixed with "arg."
+         pat = r"define void @.*\(i32 %\"x\"\)"
+         match = re.compile(pat).search(llvm_ir)
+         self.assertIsNotNone(match, msg=llvm_ir)
+         pat = r"define void @.*\(i32 %\"arg\.x\"\)"
+         match = re.compile(pat).search(llvm_ir)
+         self.assertIsNone(match, msg=llvm_ir)
+
      def test_llvm_dbg_value(self):
          sig = (types.int32, types.int32)

numba_cuda/numba/cuda/tests/data/cta_barrier.cu ADDED
@@ -0,0 +1,23 @@
+ #include <cooperative_groups.h>
+ #include <cuda/barrier>
+
+ namespace cg = cooperative_groups;
+
+ __device__ void _wait_on_tile(cuda::barrier<cuda::thread_scope_block> &tile)
+ {
+     auto token = tile.arrive();
+     tile.wait(std::move(token));
+ }
+
+ extern "C"
+ __device__ int cta_barrier(int *ret) {
+     auto cta = cg::this_thread_block();
+     cg::thread_block_tile<32> tile = cg::tiled_partition<32>(cta);
+     __shared__ cuda::barrier<cuda::thread_scope_block> barrier;
+     if (threadIdx.x == 0) {
+         init(&barrier, blockDim.x);
+     }
+
+     _wait_on_tile(barrier);
+     return 0;
+ }
numba_cuda/numba/cuda/tests/data/include/add.cuh ADDED
@@ -0,0 +1,3 @@
+ // Templated addition function: myadd
+ template <typename T>
+ __device__ T myadd(T a, T b) { return a + b; }
numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh ADDED
@@ -0,0 +1,3 @@
+ // Templated multiplication function: mymul
+ template <typename T>
+ __device__ T mymul(T a, T b) { return a * b; }
numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu ADDED
@@ -0,0 +1,9 @@
+ #include <add.cuh> // In numba/cuda/tests/data/include
+ #include <mul.cuh> // In numba/cuda/tests/doc_examples/ffi/include
+
+ extern "C"
+ __device__ int saxpy(float *ret, float a, float x, float y)
+ {
+     *ret = myadd(mymul(a, x), y);
+     return 0;
+ }
numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py CHANGED
@@ -3,7 +3,7 @@

  import unittest
  from numba.cuda.testing import CUDATestCase, skip_on_cudasim
- from numba.tests.support import skip_unless_cffi
+ from numba.tests.support import skip_unless_cffi, override_config


  @skip_unless_cffi
@@ -85,6 +85,53 @@ class TestFFI(CUDATestCase):
          actual = r[()]
          np.testing.assert_allclose(expected, actual)

+     def test_ex_extra_includes(self):
+         import numpy as np
+         from numba import cuda, config
+         import os
+
+         basedir = os.path.dirname(os.path.abspath(__file__))
+         mul_dir = os.path.join(basedir, "ffi", "include")
+         saxpy_cu = os.path.join(basedir, "ffi", "saxpy.cu")
+
+         testdir = os.path.dirname(basedir)
+         add_dir = os.path.join(testdir, "data", "include")
+
+         includedir = ":".join([mul_dir, add_dir])
+         with override_config("CUDA_NVRTC_EXTRA_SEARCH_PATHS", includedir):
+             # magictoken.ex_extra_search_paths.begin
+             from numba import config
+
+             includedir = ":".join([mul_dir, add_dir])
+             config.CUDA_NVRTC_EXTRA_SEARCH_PATHS = includedir
+             # magictoken.ex_extra_search_paths.end
+
+             # magictoken.ex_extra_search_paths_kernel.begin
+             sig = "float32(float32, float32, float32)"
+             saxpy = cuda.declare_device("saxpy", sig=sig, link=saxpy_cu)
+
+             @cuda.jit
+             def vector_saxpy(a, x, y, res):
+                 i = cuda.grid(1)
+                 if i < len(res):
+                     res[i] = saxpy(a, x[i], y[i])
+
+             # magictoken.ex_extra_search_paths_kernel.end
+
+             size = 10_000
+             a = 3.0
+             X = np.ones((size,), dtype="float32")
+             Y = np.ones((size,), dtype="float32")
+             R = np.zeros((size,), dtype="float32")
+
+             block_size = 32
+             num_blocks = (size // block_size) + 1
+
+             vector_saxpy[num_blocks, block_size](a, X, Y, R)
+
+             expected = a * X + Y
+             np.testing.assert_equal(R, expected)
+

  if __name__ == "__main__":
      unittest.main()
{numba_cuda-0.11.0.dist-info → numba_cuda-0.12.1.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: numba-cuda
- Version: 0.11.0
+ Version: 0.12.1
  Summary: CUDA target for Numba
  Author: Anaconda Inc., NVIDIA Corporation
  License: BSD 2-clause
{numba_cuda-0.11.0.dist-info → numba_cuda-0.12.1.dist-info}/RECORD RENAMED
@@ -1,27 +1,27 @@
  _numba_cuda_redirector.pth,sha256=cmfMMmV0JPh3yEpl4bGeM9AuXiVVMSo6Z_b7RaQL3XE,30
  _numba_cuda_redirector.py,sha256=n_r8MYbu5-vcXMnLJW147k8DnFXXvgb7nPIXnlXwTyQ,2659
- numba_cuda/VERSION,sha256=eV1rx5V00q7AOtnP7EBLuVCDyd0hDmUh4NQZl3LSjUQ,7
+ numba_cuda/VERSION,sha256=9u5pvxxLJ6JCJmzLWutKqMgwY0W56-T_czW4yUBFK4E,7
  numba_cuda/__init__.py,sha256=atXeUvJKR3JHcAiCFbXCVOJQUHgB1TulmsqSL_9RT3Q,114
  numba_cuda/_version.py,sha256=nzrrJXi85d18m6SPdsPsetJNClDETkmF1MrEhGLYDBs,734
  numba_cuda/numba/cuda/__init__.py,sha256=3siqMXEKqa9ezQ8RxPC3KMdebUjgJt-EKxxV4CX9818,607
- numba_cuda/numba/cuda/api.py,sha256=XnyTZiAPdLhpFDKefpN59mK-RsM2uMVipQjHRNI0Z5s,17271
+ numba_cuda/numba/cuda/api.py,sha256=mkbZBcBfm819kCywQbH8jAvUex2m4pYTcFD-LE-tXsQ,17638
  numba_cuda/numba/cuda/api_util.py,sha256=jK8oUD3zf_D5IX7vbjc3uY_5kmOxwgEqO2m_lDHdWfM,861
  numba_cuda/numba/cuda/args.py,sha256=UlTHTJpwPeCtnW0Bb-Wetm5UO9TPR-PCgIt5ys8b8tQ,1894
- numba_cuda/numba/cuda/cg.py,sha256=azz1sIT_jXQfJEZfDjBeqboJc6Pu_NtrZxfE7D1eQLQ,1484
- numba_cuda/numba/cuda/codegen.py,sha256=N6zwdKah4Pb79TKPFVqTbJWX10MGu_7E2YR6K77OQwE,16451
- numba_cuda/numba/cuda/compiler.py,sha256=jOwiebq5K4eCn745MPNtaXMkLyyTBef65fpZ5sqKbEM,25548
+ numba_cuda/numba/cuda/bf16.py,sha256=PXuitxHhPMjnti3g9IOSoL90ofGgVRcDfqFg7AqCXpU,1778
+ numba_cuda/numba/cuda/cg.py,sha256=n-sBj05ut6U_GgFIq-PTCjPad4nXWAc0GVg_J9xD_Pc,1602
+ numba_cuda/numba/cuda/codegen.py,sha256=vZtLahHSLYzRpQ3GSbmMm5qYp4FS5mAlzGgSgJbaoz0,16709
+ numba_cuda/numba/cuda/compiler.py,sha256=aZwEVP8KXCIyccSw4vJyG6Qaai9oXsFuBAo_Ghwwai4,25607
  numba_cuda/numba/cuda/cpp_function_wrappers.cu,sha256=8lUPmU6FURxphzEqkPLZRPYBCEK_wmDtHq2voPkckfs,950
- numba_cuda/numba/cuda/cuda_bf16.py,sha256=RfnWMV2_zSAW9FLN4JqfW6GfmWR8ZVO16e9Bw3jZnto,152203
  numba_cuda/numba/cuda/cuda_paths.py,sha256=kMIJ_1yV2qtcKEM5rCgSDJ3Gz7bgxbfAWh54E5cDndg,15872
- numba_cuda/numba/cuda/cudadecl.py,sha256=0JTTkA0yZljsa0EFlebmsAibvkf5OhHaeOCsfaUwjU0,22822
+ numba_cuda/numba/cuda/cudadecl.py,sha256=_TXMu8SIT2hIhsPI0n05wuShtzp8NcPX88NH5y7xauU,22909
  numba_cuda/numba/cuda/cudaimpl.py,sha256=q6CPqD8ZtJvY8JlpMEN--d6003_FIHoHLBqNP2McNyM,39274
  numba_cuda/numba/cuda/cudamath.py,sha256=wbGjlyGVwcUAoQjgXIaAaasLdVuDSKHkf6KyID5IYBw,3979
- numba_cuda/numba/cuda/debuginfo.py,sha256=tWlRAC1-AsSQp0pG9kXQY9tlVdZPA-nDUJsrvru4eaM,4504
- numba_cuda/numba/cuda/decorators.py,sha256=T2nFq5nCPmeyJb-RyuVUUaV4qHYTeYm3Zj-o8cMewMc,9483
+ numba_cuda/numba/cuda/debuginfo.py,sha256=5tCw_IEeZfoD6CtFpA_yUGdrq25Q9mFjfxxrudH_VFg,5476
+ numba_cuda/numba/cuda/decorators.py,sha256=bR8yOAIC68lhm8mSMU-DUt1qFrEogbmSAtzAI4MoToc,9608
  numba_cuda/numba/cuda/descriptor.py,sha256=t1rSVJSCAlVACC5_Un3FQ7iubdTTBe-euqz88cvs2tI,985
  numba_cuda/numba/cuda/device_init.py,sha256=Rtwd6hQMHMLMkj6MXtndbWYFJfkIaRe0MwOIJF2nzhU,3449
  numba_cuda/numba/cuda/deviceufunc.py,sha256=zj9BbLiZD-dPttHew4olw8ANgR2nXnXEE9qjCeGLrQI,30731
- numba_cuda/numba/cuda/dispatcher.py,sha256=1QzWn5IO_v27-NZlSjDbCIT_M5vtPuBadlwjliY2y0E,43169
+ numba_cuda/numba/cuda/dispatcher.py,sha256=cLXD2pnsU7k-bN5clfjuWqifFCr7LfECKtK7YeeHwis,43162
  numba_cuda/numba/cuda/errors.py,sha256=WRso1Q_jCoWP5yrDBMhihRhhVtVo1-7KdN8QVE9j46o,1712
  numba_cuda/numba/cuda/extending.py,sha256=VwuU5F0AQFlJsqaiwoWk-6Itihew1FsjVT_BVjhY8Us,2278
  numba_cuda/numba/cuda/initialize.py,sha256=0SnpjccQEYiWITIyfAJx833H1yhYFFDY42EpnwYyMn8,487
@@ -41,17 +41,18 @@ numba_cuda/numba/cuda/random.py,sha256=V30KaFdkuDyjxoP14awz-KkY3lRIXqIZuuH27UotI
  numba_cuda/numba/cuda/reshape_funcs.cu,sha256=frw1uoeMSYlkPC38LiKE8Tz2P70X2e4UZGyLKkaPzho,4326
  numba_cuda/numba/cuda/simulator_init.py,sha256=Hvzty6NJp1SeKspyb-b887xpeNLMMI0x9aPmV--X77E,450
  numba_cuda/numba/cuda/stubs.py,sha256=JMs4Xg8IHlAq5L6SBYWcYNzXfJGM6v0lZCQaOb5x9CQ,23014
- numba_cuda/numba/cuda/target.py,sha256=mSMnS-bSsC8_4KqkAsa1Byi2mO8jPJdKW3m31qxsxUE,12520
+ numba_cuda/numba/cuda/target.py,sha256=ymYBdkt7iNK_PJCfyqupKpcSj7j-UQzkWIq3KjoLBD8,12963
  numba_cuda/numba/cuda/testing.py,sha256=OR37AuDdzg7vLG4G_4s2uRAkNTScZc-BzHmTMJYuxhQ,6827
  numba_cuda/numba/cuda/types.py,sha256=hC1MUvgUwy-SLgbzFzXwssJzPR8BxQwqUcjwGJFzVac,1317
  numba_cuda/numba/cuda/ufuncs.py,sha256=AJifQgapyv62fdJeMm939R1I5TvIRmaA8dJ83Jy8DCw,23559
  numba_cuda/numba/cuda/utils.py,sha256=VRphC0PLr8Klq3D1FMONu4aRdVO23HOCBg4bxnsqmfc,785
  numba_cuda/numba/cuda/vector_types.py,sha256=FlzOKufhvBnZ-VC-liA7y9is8BV-uj0fD-En_vP6zl0,6783
  numba_cuda/numba/cuda/vectorizers.py,sha256=nEfQxjSA4oCX8ZzvoqjDRygDfwzxFVDXtnjx-K1aPqA,8387
+ numba_cuda/numba/cuda/_internal/cuda_bf16.py,sha256=QYck6s_D85HBEsc__SAl_UZxf7SptqAk31mLv_1gzuE,152212
  numba_cuda/numba/cuda/cudadrv/__init__.py,sha256=inat2K8K1OVrgDe64FK7CyRmyFyNKcNO4p2_L79yRZ0,201
  numba_cuda/numba/cuda/cudadrv/devicearray.py,sha256=6tF2TYnmjMbKk2fho1ONoD_QsRD9QVTT2kHP7x1u1J0,31556
  numba_cuda/numba/cuda/cudadrv/devices.py,sha256=k87EDIRhj1ncM9PxJCjZGPFfEks99vzmHlTc55GK5X0,8062
- numba_cuda/numba/cuda/cudadrv/driver.py,sha256=dcrti-XDhjdfTiF5HrrGCYtIZkknN_6UugxSA2f-JoE,118994
+ numba_cuda/numba/cuda/cudadrv/driver.py,sha256=63NDga5RLrk6JEiHW1aJDubqCbbHA5uumK3mSYy7SEY,119091
  numba_cuda/numba/cuda/cudadrv/drvapi.py,sha256=OnjYWnmy8ZlSfYouhzyYIpW-AJ3x1YHj32YcBY2xet4,16790
  numba_cuda/numba/cuda/cudadrv/dummyarray.py,sha256=2jycZhniMy3ncoVWQG9D8dBehTEeocBZTW43gKHL5Tc,14291
  numba_cuda/numba/cuda/cudadrv/enums.py,sha256=raWKryxamWQZ5A8ivMpyYVhhwbSpaD9lu7l1_wl2W9M,23742
@@ -60,7 +61,7 @@ numba_cuda/numba/cuda/cudadrv/libs.py,sha256=qjknQxYXd2ucwDLQqzhWC_srNg6FnwvcVHI
  numba_cuda/numba/cuda/cudadrv/linkable_code.py,sha256=IZ13laEG_altDQyi9HkdMcwW-YYEIn2erqz6AnYsqHg,2808
  numba_cuda/numba/cuda/cudadrv/mappings.py,sha256=9uEs1KepeVGRbEpVhLjtxSsvZpZsbrHnPywmx--y88A,804
  numba_cuda/numba/cuda/cudadrv/ndarray.py,sha256=HtULWWFyDlgqvrH5459yyPTvU4UbUo2DSdtcNfvbH00,473
- numba_cuda/numba/cuda/cudadrv/nvrtc.py,sha256=6xtAR1af5BsBkDMJcQsTIUFFO02wwpfLClNIsh5L33Y,14324
+ numba_cuda/numba/cuda/cudadrv/nvrtc.py,sha256=pDc5YsxOMdMbLnUKm1st2FVmFPRU-Mhlpd9mau9KZ-0,14976
  numba_cuda/numba/cuda/cudadrv/nvvm.py,sha256=7tTy6-VEbMBpDUmuSMnUwqPFfBndTh3aPq_n7nxhEA0,26344
  numba_cuda/numba/cuda/cudadrv/rtapi.py,sha256=J6PRGGK07XSLRzgCw5xs8VU5xVoqavvhojk1mxiQsi4,226
  numba_cuda/numba/cuda/cudadrv/runtime.py,sha256=CFumwg4iblWap_E7l7GM_hMYz1PsbH81-N0tZwFFooA,4372
@@ -132,8 +133,9 @@ numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py,sha256=4CcxftJN4S3whgnngOgrZ
  numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py,sha256=saAWvGuAYJ4ToT9qQjvB254EeBfduVqy7VQVRqeVo0Y,987
  numba_cuda/numba/cuda/tests/cudadrv/test_streams.py,sha256=rrQEA8iawR6UyKnK2MdI5X9GnuCWPUNpoMOEVXEd_u0,4196
  numba_cuda/numba/cuda/tests/cudapy/__init__.py,sha256=43EXdiXXRBd6yIcVGMrU9F_EJCD9Uw3mzOP3SB53AEE,260
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py,sha256=FnvjeqTZ-YBmroHctPrHgMHxnJ-HiT9KI79aHTej5G8,5840
+ numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py,sha256=3mYDpLS1FUBt7rerACFGR7HxsCJtHSLh_AYqxFEqRd0,5658
  numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py,sha256=9CbjosLNPN5IzrD-15sD_4B0BMmjo02Y7faZiS82cyk,1143
+ numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py,sha256=w9c0OXN6Mxb0Un0GxF-ndcq39dn5nMC8xaGzESZB40I,682
  numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py,sha256=2i_xq4B1t1tctr6ZrWA29ZHkmQlD_vCSewhr-AT9tMc,1651
  numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx,sha256=PKVafUhDH1SKRWXkt4N3v8SDMh4RyDFiJM-CMksa5uc,519
  numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py,sha256=wrWx8AeRhBHM74iYPKKrZqiyWrYCtQU3J-g3Zv7JmoY,1782
@@ -143,10 +145,11 @@ numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py,sha256=JDKbbRieNE0C3w
  numba_cuda/numba/cuda/tests/cudapy/test_array_args.py,sha256=iiFrt5Yn7gfheAGOYG2VBeWeuW3JlBhRLXNfSz4cHAA,4982
  numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py,sha256=SWa1MvpwG07yBkrFIUeM9pm3BIwUbhttMNBdUW-CpSM,969
  numba_cuda/numba/cuda/tests/cudapy/test_atomics.py,sha256=agsfUN3WOoh6ICAECtuMuxZNcKq5ivK30Ew3h_m76m0,57689
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py,sha256=NYLa_e60NYc63X7japCAsjUS84lXn92k4_S_E6-sEX4,6779
+ numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py,sha256=DdP7WlHev8R5DdY6DEEgOF45ljh8LwKeqmkvGLjNC7E,1444
+ numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py,sha256=wNP0NNtqVgaekY9fXp_H4LpPNLX-rDu9gp-_-e965Lg,7420
  numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py,sha256=0_wr6MSeHh0QVzPeH8SB7j0Nv_RrPAK01hNoQ_dGT5I,4417
  numba_cuda/numba/cuda/tests/cudapy/test_boolean.py,sha256=j4mIOv4rJTLjJzpKk1O9UFLT41_iOQRtwsmteXdKZ-M,547
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py,sha256=qbNisdxvoErKlDkD5dw7IkdJhfcQUpIdfHX11UzGBOo,18990
+ numba_cuda/numba/cuda/tests/cudapy/test_caching.py,sha256=obUSTJSP2Lh-YNElq8PZpVnRJOeq-uqV_VyLHtsXwAw,18427
  numba_cuda/numba/cuda/tests/cudapy/test_casting.py,sha256=3LaN3ZsSuOZXAZXCV85wYyhh0ih7JqABnjGTa7Y2YBE,8748
  numba_cuda/numba/cuda/tests/cudapy/test_cffi.py,sha256=tC7ZCA4dkzehS33iz2l35rX6OxE3BTQd9ivV4r74YXs,926
  numba_cuda/numba/cuda/tests/cudapy/test_compiler.py,sha256=OkCavTZAAcdffdUBYGEmlP_BN7zAH-rWlhr-LqSUUs8,10997
@@ -154,12 +157,12 @@ numba_cuda/numba/cuda/tests/cudapy/test_complex.py,sha256=hmAcyZim46yueXZDqDSJYq
  numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py,sha256=KIuXQ0ihgQQXM-eH7s3xAxhKe35YL1qDTHCVTWA4ut8,497
  numba_cuda/numba/cuda/tests/cudapy/test_const_string.py,sha256=li1UsV5vc2M01cJ7k6_526VPtuAOAKr8e7kb1CDUXi4,4323
  numba_cuda/numba/cuda/tests/cudapy/test_constmem.py,sha256=ZWmyKvFokRMjqyXjVpZVOnR6LR694GWcbUn2jVEQV14,5170
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py,sha256=x2sOmq6ACN6r00LpPVjwOclOL_OsagJqP5l_9NsAl2U,4984
+ numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py,sha256=kkrK5Mo9E8nNH3PYfQAEel0hY7CXZNsn88BAo7heX9g,5818
  numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py,sha256=RXCNHAZM35sbUf3Gi-x2E8-a6BmhFb2rhQkBOeiS_fo,15757
  numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py,sha256=8prL2FTiaajW-UHSL9al-nBniygOfpdAOT_Dkej4PWI,2138
  numba_cuda/numba/cuda/tests/cudapy/test_datetime.py,sha256=MnOeDWMz-rL3-07FsswM06Laxmm0KjTmTwhrP3rmchQ,3526
  numba_cuda/numba/cuda/tests/cudapy/test_debug.py,sha256=1P369s02AvGu7fSIEe_YxSgh3c6S72Aw1gRgmepDbQY,3383
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py,sha256=796d8Oa1ZV2mZ9LTcwR3g6_j5sjSBk7kZEHYMOXPBfU,12606
+ numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py,sha256=AE8D4U4dAv4nYP9oatDwROW6knpJ0-iggP4BaHymo6g,13170
  numba_cuda/numba/cuda/tests/cudapy/test_device_func.py,sha256=LNGBZfqFGUtVVQeC6FcHo8T3DbG-j6AjeBwJmwp9HH4,13157
  numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py,sha256=Oc6CdI1j9Ad_wklHdIYSMytrzUpzK6oXD0BGe45sTwg,26636
  numba_cuda/numba/cuda/tests/cudapy/test_enums.py,sha256=Yxac6S5P6C8GN0kMwieL3dQb1uogOVZQEx969B0AMpM,4533
@@ -226,15 +229,17 @@ numba_cuda/numba/cuda/tests/cudasim/__init__.py,sha256=GdfSq6pRVSOQwmgNi7ZFQ5l0y
  numba_cuda/numba/cuda/tests/cudasim/support.py,sha256=JjRrfrrLKS0V5p6GX6ibs6QTuFb1NanKfBQSgbLeiHs,114
  numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py,sha256=-GJCl2c063Ig6EUB8w5L_0GcmXzTLatGe_ddEzdnbgc,3177
  numba_cuda/numba/cuda/tests/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ numba_cuda/numba/cuda/tests/data/cta_barrier.cu,sha256=jJ3lzhbGr6WOHb56_fPaFg8j851ZwCpz8V4du-eyWbA,576
  numba_cuda/numba/cuda/tests/data/cuda_include.cu,sha256=1wj5Of86-kP0hxK5Gr6AhapuyTiiWWJAoFbCuCpyKfA,294
  numba_cuda/numba/cuda/tests/data/error.cu,sha256=5m65RDHgh39d0bIW6Dvj0xh9ffhKH1iILeCCR4p2ReI,138
  numba_cuda/numba/cuda/tests/data/jitlink.cu,sha256=A41S_002h_s4hEghJusT368JXX6H3bSMp3mC_6DX9Us,539
  numba_cuda/numba/cuda/tests/data/jitlink.ptx,sha256=KJZkTuc1u5xUAC7j5BrmrHkgRWr_ncZwN3ayVKa69dw,894
  numba_cuda/numba/cuda/tests/data/warn.cu,sha256=6L-qsXJIxAr_n3hVMAz_EZ5j0skcJAfgzuJfDEISG_I,172
+ numba_cuda/numba/cuda/tests/data/include/add.cuh,sha256=yv61Ilqge_kjj-_BPO5YWAx3sqJD73gEh66gxYwE8wc,107
  numba_cuda/numba/cuda/tests/doc_examples/__init__.py,sha256=GdfSq6pRVSOQwmgNi7ZFQ5l0yg4-2gNar_0Rz0buUpM,157
  numba_cuda/numba/cuda/tests/doc_examples/test_cg.py,sha256=VLWd5_v744Z5QKa4i3JVDLUwA1sxJFQzV5cRG6EkyOI,2888
  numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py,sha256=I4hWDF4DzTTtt3-XmQsP5RzPAO_pWUGsKjVO0hhPOCM,2251
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py,sha256=FUMfeKhSwCjrmHsawmSzwkIoHjqmKYQFhI1efN0SpvE,2743
+ numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py,sha256=AtjAzFgZWm1nwOokQyO7D8NVMYGd1QDD3EaUT_RQruQ,4403
  numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py,sha256=4C_drWYNZq_qGIt-N0fJ9r8DZBaJdO_5h7mxRZ6RcO8,5133
  numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py,sha256=cLIN3ejI-3cbW0xxgWjm7EsSlmluGB8stDKOqZN8EUo,6138
  numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py,sha256=IoS2pbEby3YxLKpnS6_IGlHaPgvOEL8lJtKOf2eaGLM,3493
@@ -245,6 +250,8 @@ numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py,sha256=UXwXjL9ybg0OuYOFKn
  numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py,sha256=CvExzNABd2Qk5EJqDq1TjxMNz4zw_QIjynzh1O52HU0,2032
  numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu,sha256=mRZEyCfZbq4ACTN3sj1236XmTpj1d0IxZ4QTMbI3g_E,877
+ numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu,sha256=xJ6D3RkxlU75Txp1_xsJKBuspDnqvr7-1L8Pb_BdMcU,246
+ numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh,sha256=LfYU4QwoAlAXKysg_pV9k0DSHW8oVg21DTmGK8BuZO8,113
  numba_cuda/numba/cuda/tests/nocuda/__init__.py,sha256=43EXdiXXRBd6yIcVGMrU9F_EJCD9Uw3mzOP3SB53AEE,260
  numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py,sha256=4WbuBaowiv4_3hE8lRuxgAQwnR2r3WGVNWx85M3fRUI,13399
  numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py,sha256=bnv8HbWQR0f9x8z9XdBykDCu89KaFWP0LU4OohSwHv4,1496
@@ -259,8 +266,8 @@ numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py,sha256=
  numba_cuda/numba/cuda/tests/test_binary_generation/nrt_extern.cu,sha256=T9ubst3fFUK7EXyXXMi73wAban3VFFQ986cY5OcKfvI,157
  numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu,sha256=IB5t-dVhrKVoue3AbUx3yVMxPG0hBF_yZbzb4642sf0,538
  numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu,sha256=q3oxZziT8KDodeNcEBiWULH6vMrHCWucmJmtrg8C0d0,128
- numba_cuda-0.11.0.dist-info/licenses/LICENSE,sha256=eHeYE-XjASmwbxfsP5AImgfzRwZurZGqH1f6OFwJ4io,1326
- numba_cuda-0.11.0.dist-info/METADATA,sha256=5fGOJBTyB10OIPwAfyn2W7vYjFM5SuRNJWPacBD_rgA,1859
- numba_cuda-0.11.0.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
- numba_cuda-0.11.0.dist-info/top_level.txt,sha256=C50SsH-8tXDmt7I0Y3nlJYhS5s6pqWflCPdobe9vx2M,11
- numba_cuda-0.11.0.dist-info/RECORD,,
+ numba_cuda-0.12.1.dist-info/licenses/LICENSE,sha256=eHeYE-XjASmwbxfsP5AImgfzRwZurZGqH1f6OFwJ4io,1326
+ numba_cuda-0.12.1.dist-info/METADATA,sha256=H6JW6cSrhykHqICS50fIbGkrZ6SRgh_cTC3hTC2-XvQ,1859
+ numba_cuda-0.12.1.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
+ numba_cuda-0.12.1.dist-info/top_level.txt,sha256=C50SsH-8tXDmt7I0Y3nlJYhS5s6pqWflCPdobe9vx2M,11
+ numba_cuda-0.12.1.dist-info/RECORD,,
{numba_cuda-0.11.0.dist-info → numba_cuda-0.12.1.dist-info}/WHEEL RENAMED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (80.4.0)
+ Generator: setuptools (80.8.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
