numba-cuda 0.12.1__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/codegen.py +1 -1
  3. numba_cuda/numba/cuda/compiler.py +24 -1
  4. numba_cuda/numba/cuda/cudadrv/driver.py +15 -3
  5. numba_cuda/numba/cuda/cudadrv/nvrtc.py +1 -1
  6. numba_cuda/numba/cuda/cudadrv/nvvm.py +126 -25
  7. numba_cuda/numba/cuda/debuginfo.py +52 -1
  8. numba_cuda/numba/cuda/decorators.py +14 -0
  9. numba_cuda/numba/cuda/dispatcher.py +9 -2
  10. numba_cuda/numba/cuda/lowering.py +83 -4
  11. numba_cuda/numba/cuda/memory_management/__init__.py +1 -0
  12. numba_cuda/numba/cuda/simulator/__init__.py +10 -1
  13. numba_cuda/numba/cuda/simulator/_internal/__init__.py +1 -0
  14. numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +0 -0
  15. numba_cuda/numba/cuda/simulator/api.py +17 -0
  16. numba_cuda/numba/cuda/simulator/bf16.py +1 -0
  17. numba_cuda/numba/cuda/simulator/compiler.py +1 -0
  18. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +7 -0
  19. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +4 -0
  20. numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +57 -0
  21. numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +8 -0
  22. numba_cuda/numba/cuda/simulator/kernel.py +1 -1
  23. numba_cuda/numba/cuda/simulator/kernelapi.py +8 -2
  24. numba_cuda/numba/cuda/simulator/memory_management/__init__.py +1 -0
  25. numba_cuda/numba/cuda/simulator/memory_management/nrt.py +6 -0
  26. numba_cuda/numba/cuda/testing.py +10 -4
  27. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +2 -0
  28. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +15 -6
  29. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +3 -2
  30. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +3 -2
  31. numba_cuda/numba/cuda/tests/cudapy/test_array.py +0 -3
  32. numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +25 -1
  33. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +11 -4
  34. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +34 -21
  35. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +62 -2
  36. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +4 -2
  37. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +43 -4
  38. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +106 -2
  39. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +1 -0
  40. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +2 -0
  41. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +8 -21
  42. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +6 -6
  43. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +7 -7
  44. numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +64 -0
  45. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +60 -58
  46. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +3 -2
  47. numba_cuda/numba/cuda/tests/support.py +1 -1
  48. numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +1 -1
  49. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +1 -1
  50. {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/METADATA +22 -1
  51. {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/RECORD +59 -51
  52. {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/WHEEL +1 -1
  53. numba_cuda/numba/cuda/runtime/__init__.py +0 -1
  54. /numba_cuda/numba/cuda/{runtime → memory_management}/memsys.cu +0 -0
  55. /numba_cuda/numba/cuda/{runtime → memory_management}/memsys.cuh +0 -0
  56. /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.cu +0 -0
  57. /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.cuh +0 -0
  58. /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.py +0 -0
  59. {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/licenses/LICENSE +0 -0
  60. {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/top_level.txt +0 -0
numba_cuda/VERSION CHANGED
@@ -1 +1 @@
- 0.12.1
+ 0.14.0
numba_cuda/numba/cuda/codegen.py CHANGED
@@ -5,7 +5,7 @@ from numba.core.codegen import Codegen, CodeLibrary
  from .cudadrv import devices, driver, nvvm, runtime
  from numba.cuda.cudadrv.libs import get_cudalib
  from numba.cuda.cudadrv.linkable_code import LinkableCode
- from numba.cuda.runtime.nrt import NRT_LIBRARY
+ from numba.cuda.memory_management.nrt import NRT_LIBRARY

  import os
  import subprocess
numba_cuda/numba/cuda/compiler.py CHANGED
@@ -575,6 +575,7 @@ def compile(
  abi_info=None,
  output="ptx",
  forceinline=False,
+ launch_bounds=None,
  ):
  """Compile a Python function to PTX or LTO-IR for a given set of argument
  types.
@@ -620,6 +621,16 @@ def compile(
  ``alwaysinline`` function attribute to the function
  definition. This is only valid when the output is
  ``"ltoir"``.
+ :param launch_bounds: Kernel launch bounds, specified as a scalar or a tuple
+ of between one and three items. Tuple items provide:
+
+ - The maximum number of threads per block,
+ - The minimum number of blocks per SM,
+ - The maximum number of blocks per cluster.
+
+ If a scalar is provided, it is used as the maximum
+ number of threads per block.
+ :type launch_bounds: int | tuple[int]
  :return: (code, resty): The compiled code and inferred return type
  :rtype: tuple
  """
@@ -662,7 +673,12 @@ def compile(

  args, return_type = sigutils.normalize_signature(sig)

- cc = cc or config.CUDA_DEFAULT_PTX_CC
+ # If the user has used the config variable to specify a non-default that is
+ # greater than the lowest non-deprecated one, then we should default to
+ # their specified CC instead of the lowest non-deprecated one.
+ MIN_CC = max(config.CUDA_DEFAULT_PTX_CC, nvvm.LOWEST_CURRENT_CC)
+ cc = cc or MIN_CC
+
  cres = compile_cuda(
  pyfunc,
  return_type,
@@ -693,6 +709,7 @@ def compile(
  kernel = lib.get_function(cres.fndesc.llvm_func_name)
  lib._entry_name = cres.fndesc.llvm_func_name
  kernel_fixup(kernel, debug)
+ nvvm.set_launch_bounds(kernel, launch_bounds)

  if lto:
  code = lib.get_ltoir(cc=cc)
@@ -713,6 +730,7 @@ def compile_for_current_device(
  abi_info=None,
  output="ptx",
  forceinline=False,
+ launch_bounds=None,
  ):
  """Compile a Python function to PTX or LTO-IR for a given signature for the
  current device's compute capabilility. This calls :func:`compile` with an
@@ -731,6 +749,7 @@ def compile_for_current_device(
  abi_info=abi_info,
  output=output,
  forceinline=forceinline,
+ launch_bounds=launch_bounds,
  )


@@ -746,6 +765,7 @@ def compile_ptx(
  abi="numba",
  abi_info=None,
  forceinline=False,
+ launch_bounds=None,
  ):
  """Compile a Python function to PTX for a given signature. See
  :func:`compile`. The defaults for this function are to compile a kernel
@@ -764,6 +784,7 @@ def compile_ptx(
  abi_info=abi_info,
  output="ptx",
  forceinline=forceinline,
+ launch_bounds=launch_bounds,
  )


@@ -778,6 +799,7 @@ def compile_ptx_for_current_device(
  abi="numba",
  abi_info=None,
  forceinline=False,
+ launch_bounds=None,
  ):
  """Compile a Python function to PTX for a given signature for the current
  device's compute capabilility. See :func:`compile_ptx`."""
@@ -794,6 +816,7 @@ def compile_ptx_for_current_device(
  abi=abi,
  abi_info=abi_info,
  forceinline=forceinline,
+ launch_bounds=launch_bounds,
  )

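Editor's note: a minimal usage sketch of the new `launch_bounds` argument to `compile_ptx`, assuming numba-cuda 0.14.0 and an available CUDA toolkit. The kernel and names below are illustrative and are not taken from the package.

```python
# Minimal sketch: pass launch bounds when compiling to PTX. A scalar sets only
# the maximum threads per block; a 2- or 3-tuple may also set the minimum
# blocks per SM and the maximum blocks per cluster.
from numba import cuda, float32

def axpy(r, a, x, y):
    i = cuda.grid(1)
    if i < r.size:
        r[i] = a * x[i] + y[i]

sig = (float32[:], float32, float32[:], float32[:])
ptx, resty = cuda.compile_ptx(axpy, sig, launch_bounds=256)

# The bound is emitted as a PTX performance-tuning directive (".maxntid"),
# mirroring __launch_bounds__ in CUDA C/C++.
print([line for line in ptx.splitlines() if "maxntid" in line])
```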
numba_cuda/numba/cuda/cudadrv/driver.py CHANGED
@@ -82,9 +82,21 @@ _MVC_ERROR_MESSAGE = (
  "to be available"
  )

- ENABLE_PYNVJITLINK = _readenv(
- "NUMBA_CUDA_ENABLE_PYNVJITLINK", bool, False
- ) or getattr(config, "CUDA_ENABLE_PYNVJITLINK", False)
+ # Enable pynvjitlink if the environment variables NUMBA_CUDA_ENABLE_PYNVJITLINK
+ # or CUDA_ENABLE_PYNVJITLINK are set, or if the pynvjitlink module is found. If
+ # explicitly disabled, do not use pynvjitlink, even if present in the env.
+ _pynvjitlink_enabled_in_env = _readenv(
+ "NUMBA_CUDA_ENABLE_PYNVJITLINK", bool, None
+ )
+ _pynvjitlink_enabled_in_cfg = getattr(config, "CUDA_ENABLE_PYNVJITLINK", None)
+
+ if _pynvjitlink_enabled_in_env is not None:
+ ENABLE_PYNVJITLINK = _pynvjitlink_enabled_in_env
+ elif _pynvjitlink_enabled_in_cfg is not None:
+ ENABLE_PYNVJITLINK = _pynvjitlink_enabled_in_cfg
+ else:
+ ENABLE_PYNVJITLINK = importlib.util.find_spec("pynvjitlink") is not None
+
  if not hasattr(config, "CUDA_ENABLE_PYNVJITLINK"):
  config.CUDA_ENABLE_PYNVJITLINK = ENABLE_PYNVJITLINK

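Editor's note: the resolution order introduced here is environment variable first, then the config attribute, then auto-detection of an installed pynvjitlink. The standalone helper below only illustrates that precedence; it is not part of the package and its name and boolean parsing are assumptions.

```python
# Illustrative sketch of the precedence: an explicit environment setting wins,
# then an explicit config setting, and only then auto-detection by import.
import importlib.util
import os

def pynvjitlink_enabled(env=os.environ, config_value=None):
    env_value = env.get("NUMBA_CUDA_ENABLE_PYNVJITLINK")
    if env_value is not None:
        # An explicit setting (truthy or falsey) takes priority over detection.
        return env_value.strip().lower() in ("1", "true", "yes", "on")
    if config_value is not None:
        return bool(config_value)
    # Fall back to enabling pynvjitlink whenever the module is importable.
    return importlib.util.find_spec("pynvjitlink") is not None
```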
numba_cuda/numba/cuda/cudadrv/nvrtc.py CHANGED
@@ -397,7 +397,7 @@ def compile(src, name, cc, ltoir=False):
  else:
  extra_includes = []

- nrt_path = os.path.join(numba_cuda_path, "runtime")
+ nrt_path = os.path.join(numba_cuda_path, "memory_management")
  nrt_include = f"-I{nrt_path}"

  options = [
numba_cuda/numba/cuda/cudadrv/nvvm.py CHANGED
@@ -369,48 +369,101 @@ COMPUTE_CAPABILITIES = (
  (9, 0),
  (10, 0),
  (10, 1),
+ (10, 3),
  (12, 0),
+ (12, 1),
  )


- # Maps CTK version -> (min supported cc, max supported cc) inclusive
+ # Maps CTK version -> (min supported cc, max supported cc) ranges, bounds inclusive
  _CUDA_CC_MIN_MAX_SUPPORT = {
- (11, 1): ((3, 5), (8, 0)),
- (11, 2): ((3, 5), (8, 6)),
- (11, 3): ((3, 5), (8, 6)),
- (11, 4): ((3, 5), (8, 7)),
- (11, 5): ((3, 5), (8, 7)),
- (11, 6): ((3, 5), (8, 7)),
- (11, 7): ((3, 5), (8, 7)),
- (11, 8): ((3, 5), (9, 0)),
- (12, 0): ((5, 0), (9, 0)),
- (12, 1): ((5, 0), (9, 0)),
- (12, 2): ((5, 0), (9, 0)),
- (12, 3): ((5, 0), (9, 0)),
- (12, 4): ((5, 0), (9, 0)),
- (12, 5): ((5, 0), (9, 0)),
- (12, 6): ((5, 0), (9, 0)),
- (12, 8): ((5, 0), (12, 0)),
+ (11, 2): [
+ ((3, 5), (8, 6)),
+ ],
+ (11, 3): [
+ ((3, 5), (8, 6)),
+ ],
+ (11, 4): [
+ ((3, 5), (8, 7)),
+ ],
+ (11, 5): [
+ ((3, 5), (8, 7)),
+ ],
+ (11, 6): [
+ ((3, 5), (8, 7)),
+ ],
+ (11, 7): [
+ ((3, 5), (8, 7)),
+ ],
+ (11, 8): [
+ ((3, 5), (9, 0)),
+ ],
+ (12, 0): [
+ ((5, 0), (9, 0)),
+ ],
+ (12, 1): [
+ ((5, 0), (9, 0)),
+ ],
+ (12, 2): [
+ ((5, 0), (9, 0)),
+ ],
+ (12, 3): [
+ ((5, 0), (9, 0)),
+ ],
+ (12, 4): [
+ ((5, 0), (9, 0)),
+ ],
+ (12, 5): [
+ ((5, 0), (9, 0)),
+ ],
+ (12, 6): [
+ ((5, 0), (9, 0)),
+ ],
+ (12, 8): [
+ ((5, 0), (10, 1)),
+ ((12, 0), (12, 0)),
+ ],
+ (12, 9): [
+ ((5, 0), (12, 1)),
+ ],
  }

+ # From CUDA 12.9 Release notes, Section 1.5.4, "Deprecated Architectures"
+ # https://docs.nvidia.com/cuda/archive/12.9.0/cuda-toolkit-release-notes/index.html#deprecated-architectures
+ #
+ # "Maxwell, Pascal, and Volta architectures are now feature-complete with no
+ # further enhancements planned. While CUDA Toolkit 12.x series will continue
+ # to support building applications for these architectures, offline
+ # compilation and library support will be removed in the next major CUDA
+ # Toolkit version release. Users should plan migration to newer
+ # architectures, as future toolkits will be unable to target Maxwell, Pascal,
+ # and Volta GPUs."
+ #
+ # In order to maintain compatibility with future toolkits, we use Turing (7.5)
+ # as the default CC if it is not otherwise specified.
+ LOWEST_CURRENT_CC = (7, 5)
+

  def ccs_supported_by_ctk(ctk_version):
  try:
  # For supported versions, we look up the range of supported CCs
- min_cc, max_cc = _CUDA_CC_MIN_MAX_SUPPORT[ctk_version]
- return tuple(
- [cc for cc in COMPUTE_CAPABILITIES if min_cc <= cc <= max_cc]
- )
- except KeyError:
- # For unsupported CUDA toolkit versions, all we can do is assume all
- # non-deprecated versions we are aware of are supported.
  return tuple(
  [
  cc
+ for min_cc, max_cc in _CUDA_CC_MIN_MAX_SUPPORT[ctk_version]
  for cc in COMPUTE_CAPABILITIES
- if cc >= config.CUDA_DEFAULT_PTX_CC
+ if min_cc <= cc <= max_cc
  ]
  )
+ except KeyError:
+ # For unsupported CUDA toolkit versions, all we can do is assume all
+ # non-deprecated versions we are aware of are supported.
+ #
+ # If the user has specified a non-default CC that is greater than the
+ # lowest non-deprecated one, then we should assume that instead.
+ MIN_CC = max(config.CUDA_DEFAULT_PTX_CC, LOWEST_CURRENT_CC)
+
+ return tuple([cc for cc in COMPUTE_CAPABILITIES if cc >= MIN_CC])


  def get_supported_ccs():
@@ -857,6 +910,54 @@ def set_cuda_kernel(function):
  function.attributes.discard("noinline")


+ def set_launch_bounds(kernel, launch_bounds):
+ # Based on: CUDA C / C++ Programming Guide 12.9, Section 8.38:
+ # https://docs.nvidia.com/cuda/archive/12.9.0/cuda-c-programming-guide/index.html#launch-bounds
+ # PTX ISA Specification Version 8.7, Section 11.4:
+ # https://docs.nvidia.com/cuda/archive/12.8.1/parallel-thread-execution/index.html#performance-tuning-directives
+ # NVVM IR Specification 12.9, Section 13:
+ # https://docs.nvidia.com/cuda/archive/12.9.0/nvvm-ir-spec/index.html#global-property-annotation
+
+ if launch_bounds is None:
+ return
+
+ if isinstance(launch_bounds, int):
+ launch_bounds = (launch_bounds,)
+
+ if (n := len(launch_bounds)) > 3:
+ raise ValueError(
+ f"Got {n} launch bounds: {launch_bounds}. A maximum of three are supported: "
+ "(max_threads_per_block, min_blocks_per_sm, max_blocks_per_cluster)"
+ )
+
+ module = kernel.module
+ nvvm_annotations = cgutils.get_or_insert_named_metadata(
+ module, "nvvm.annotations"
+ )
+
+ # Note that only maxntidx is used even though NVVM IR and PTX allow
+ # maxntidy and maxntidz. This is because the thread block size limit
+ # pertains only to the total number of threads, and therefore bounds on
+ # individual dimensions may be exceeded anyway. To prevent an unsurprising
+ # interface, it is cleaner to only allow setting total size via maxntidx
+ # and assuming y and z to be 1 (as is the case in CUDA C/C++).
+
+ properties = (
+ # Max threads per block
+ "maxntidx",
+ # Min blocks per multiprocessor
+ "minctasm",
+ # Max blocks per cluster
+ "cluster_max_blocks",
+ )
+
+ for prop, bound in zip(properties, launch_bounds):
+ mdstr = ir.MetaDataString(module, prop)
+ mdvalue = ir.Constant(ir.IntType(32), bound)
+ md = module.add_metadata((kernel, mdstr, mdvalue))
+ nvvm_annotations.add(md)
+
+
  def add_ir_version(mod):
  """Add NVVM IR version to module"""
  # We specify the IR version to match the current NVVM's IR version
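Editor's note: as a rough illustration of what `set_launch_bounds` emits, the standalone llvmlite sketch below builds the same `nvvm.annotations` metadata shape for a hypothetical kernel `k`. It does not use numba-cuda internals and is not code from the package.

```python
# Builds metadata of the form !{kernel, !"maxntidx", i32 256} under the named
# metadata !nvvm.annotations, which is how NVVM receives launch bounds.
from llvmlite import ir

module = ir.Module(name="launch_bounds_sketch")
kernel = ir.Function(module, ir.FunctionType(ir.VoidType(), []), name="k")

nvvm_annotations = module.add_named_metadata("nvvm.annotations")
bounds = {"maxntidx": 256, "minctasm": 2}  # hypothetical bounds
for prop, bound in bounds.items():
    md = module.add_metadata(
        (kernel, ir.MetaDataString(module, prop), ir.Constant(ir.IntType(32), bound))
    )
    nvvm_annotations.add(md)

print(module)  # the printed IR contains the nvvm.annotations entries
```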
numba_cuda/numba/cuda/debuginfo.py CHANGED
@@ -2,6 +2,7 @@ from llvmlite import ir
  from numba.core import types, cgutils
  from numba.core.debuginfo import DIBuilder
  from numba.cuda.types import GridGroup
+ from numba.core.datamodel.models import UnionModel

  _BYTE_SIZE = 8

@@ -16,6 +17,7 @@ class CUDADIBuilder(DIBuilder):
  is_bool = False
  is_int_literal = False
  is_grid_group = False
+ m = self.module

  if isinstance(lltype, ir.IntType):
  if datamodel is None:
@@ -36,7 +38,6 @@ class CUDADIBuilder(DIBuilder):
  is_grid_group = True

  if is_bool or is_int_literal or is_grid_group:
- m = self.module
  bitsize = _BYTE_SIZE * size
  # Boolean type workaround until upstream Numba is fixed
  if is_bool:
@@ -56,6 +57,56 @@ class CUDADIBuilder(DIBuilder):
  },
  )

+ if isinstance(datamodel, UnionModel):
+ # UnionModel is handled here to represent polymorphic types
+ meta = []
+ maxwidth = 0
+ for field, model in zip(
+ datamodel._fields, datamodel.inner_models()
+ ):
+ # Ignore the "tag" field, focus on the "payload" field which
+ # contains the data types in memory
+ if field == "payload":
+ for mod in model.inner_models():
+ dtype = mod.get_value_type()
+ membersize = self.cgctx.get_abi_sizeof(dtype)
+ basetype = self._var_type(
+ dtype, membersize, datamodel=mod
+ )
+ if isinstance(mod.fe_type, types.Literal):
+ typename = str(mod.fe_type.literal_type)
+ else:
+ typename = str(mod.fe_type)
+ # Use a prefix "_" on type names as field names
+ membername = "_" + typename
+ memberwidth = _BYTE_SIZE * membersize
+ derived_type = m.add_debug_info(
+ "DIDerivedType",
+ {
+ "tag": ir.DIToken("DW_TAG_member"),
+ "name": membername,
+ "baseType": basetype,
+ # DW_TAG_member size is in bits
+ "size": memberwidth,
+ },
+ )
+ meta.append(derived_type)
+ if memberwidth > maxwidth:
+ maxwidth = memberwidth
+
+ fake_union_name = "dbg_poly_union"
+ return m.add_debug_info(
+ "DICompositeType",
+ {
+ "file": self.difile,
+ "tag": ir.DIToken("DW_TAG_union_type"),
+ "name": fake_union_name,
+ "identifier": str(lltype),
+ "elements": m.add_metadata(meta),
+ "size": maxwidth,
+ },
+ is_distinct=True,
+ )
  # For other cases, use upstream Numba implementation
  return super()._var_type(lltype, size, datamodel=datamodel)

numba_cuda/numba/cuda/decorators.py CHANGED
@@ -23,6 +23,7 @@ def jit(
  opt=None,
  lineinfo=False,
  cache=False,
+ launch_bounds=None,
  **kws,
  ):
  """
@@ -72,6 +73,16 @@ def jit(
  :type lineinfo: bool
  :param cache: If True, enables the file-based cache for this function.
  :type cache: bool
+ :param launch_bounds: Kernel launch bounds, specified as a scalar or a tuple
+ of between one and three items. Tuple items provide:
+
+ - The maximum number of threads per block,
+ - The minimum number of blocks per SM,
+ - The maximum number of blocks per cluster.
+
+ If a scalar is provided, it is used as the maximum
+ number of threads per block.
+ :type launch_bounds: int | tuple[int]
  """

  if link and config.ENABLE_CUDASIM:
@@ -153,6 +164,7 @@ def jit(
  targetoptions["inline"] = inline
  targetoptions["forceinline"] = forceinline
  targetoptions["extensions"] = extensions
+ targetoptions["launch_bounds"] = launch_bounds

  disp = CUDADispatcher(func, targetoptions=targetoptions)

@@ -200,6 +212,7 @@ def jit(
  lineinfo=lineinfo,
  link=link,
  cache=cache,
+ launch_bounds=launch_bounds,
  **kws,
  )

@@ -221,6 +234,7 @@ def jit(
  targetoptions["inline"] = inline
  targetoptions["forceinline"] = forceinline
  targetoptions["extensions"] = extensions
+ targetoptions["launch_bounds"] = launch_bounds
  disp = CUDADispatcher(func_or_sig, targetoptions=targetoptions)

  if cache:
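Editor's note: a hedged usage sketch of the new `launch_bounds` option on `@cuda.jit`, assuming numba-cuda 0.14.0 and a working GPU. The kernel and array names are illustrative.

```python
# A 2-tuple sets the maximum threads per block and the minimum blocks per SM;
# the launch below stays within the declared 128-thread bound.
import numpy as np
from numba import cuda

@cuda.jit(launch_bounds=(128, 2))
def scale(out, x, factor):
    i = cuda.grid(1)
    if i < out.size:
        out[i] = x[i] * factor

x = cuda.to_device(np.arange(1024, dtype=np.float32))
out = cuda.device_array_like(x)
scale[8, 128](out, x, np.float32(2.0))
```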
numba_cuda/numba/cuda/dispatcher.py CHANGED
@@ -18,7 +18,7 @@ from numba.cuda.compiler import (
  kernel_fixup,
  )
  import re
- from numba.cuda.cudadrv import driver
+ from numba.cuda.cudadrv import driver, nvvm
  from numba.cuda.cudadrv.linkable_code import LinkableCode
  from numba.cuda.cudadrv.devices import get_context
  from numba.cuda.descriptor import cuda_target
@@ -27,8 +27,8 @@ from numba.cuda.errors import (
  normalize_kernel_dimensions,
  )
  from numba.cuda import types as cuda_types
- from numba.cuda.runtime.nrt import rtsys, NRT_LIBRARY
  from numba.cuda.locks import module_init_lock
+ from numba.cuda.memory_management.nrt import rtsys, NRT_LIBRARY

  from numba import cuda
  from numba import _dispatcher
@@ -94,6 +94,7 @@ class _Kernel(serialize.ReduceMixin):
  lto=False,
  opt=True,
  device=False,
+ launch_bounds=None,
  ):
  if device:
  raise RuntimeError("Cannot compile a device function as a kernel")
@@ -120,6 +121,7 @@ class _Kernel(serialize.ReduceMixin):
  self.debug = debug
  self.lineinfo = lineinfo
  self.extensions = extensions or []
+ self.launch_bounds = launch_bounds

  nvvm_options = {"fastmath": fastmath, "opt": 3 if opt else 0}

@@ -145,6 +147,7 @@ class _Kernel(serialize.ReduceMixin):
  kernel = lib.get_function(cres.fndesc.llvm_func_name)
  lib._entry_name = cres.fndesc.llvm_func_name
  kernel_fixup(kernel, self.debug)
+ nvvm.set_launch_bounds(kernel, launch_bounds)

  if not link:
  link = []
@@ -547,6 +550,10 @@ class _Kernel(serialize.ReduceMixin):
  for ax in range(devary.ndim):
  kernelargs.append(c_intp(devary.strides[ax]))

+ elif isinstance(ty, types.CPointer):
+ # Pointer arguments should be a pointer-sized integer
+ kernelargs.append(ctypes.c_uint64(val))
+
  elif isinstance(ty, types.Integer):
  cval = getattr(ctypes, "c_%s" % ty)(val)
  kernelargs.append(cval)
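Editor's note: the new `types.CPointer` branch above marshals a raw device pointer passed as an integer. A rough usage sketch follows; the way the pointer is obtained (via `__cuda_array_interface__`) and the in-kernel indexing are assumptions suggested by the accompanying `test_cpointer.py` doc example, not code taken from this diff.

```python
# Hedged sketch: declare a kernel with a CPointer parameter and pass the
# device pointer as a plain integer at launch time.
import numpy as np
from numba import cuda, types

@cuda.jit(types.void(types.CPointer(types.float32), types.int32))
def fill_ones(ptr, n):
    i = cuda.grid(1)
    if i < n:
        ptr[i] = 1.0

arr = cuda.device_array(64, dtype=np.float32)
dev_ptr = arr.__cuda_array_interface__["data"][0]  # device address as an int
fill_ones[1, 64](dev_ptr, arr.size)
```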
numba_cuda/numba/cuda/lowering.py CHANGED
@@ -1,5 +1,7 @@
  from numba.core.lowering import Lower
  from llvmlite import ir
+ from numba.core import ir as numba_ir
+ from numba.core import types


  class CUDALower(Lower):
@@ -14,10 +16,7 @@ class CUDALower(Lower):
  if (
  self.context.enable_debuginfo
  # Conditions used to elide stores in parent method
- and (
- name not in self._singly_assigned_vars
- or self._disable_sroa_like_opt
- )
+ and self.store_var_needed(name)
  # No emission of debuginfo for internal names
  and not name.startswith("$")
  ):
@@ -27,6 +26,11 @@ class CUDALower(Lower):
  int_type = (ir.IntType,)
  real_type = ir.FloatType, ir.DoubleType
  if isinstance(lltype, int_type + real_type):
+ index = name.find(".")
+ src_name = name[:index] if index > 0 else name
+ if src_name in self.poly_var_typ_map:
+ # Do not emit debug value on polymorphic type var
+ return
  # Emit debug value for scalar variable
  sizeof = self.context.get_abi_sizeof(lltype)
  datamodel = self.context.data_model_manager[fetype]
@@ -41,3 +45,78 @@ class CUDALower(Lower):
  datamodel,
  argidx,
  )
+
+ def pre_lower(self):
+ """
+ Called before lowering all blocks.
+ """
+ super().pre_lower()
+
+ self.poly_var_typ_map = {}
+ self.poly_var_loc_map = {}
+
+ # When debug info is enabled, walk through function body and mark
+ # variables with polymorphic types.
+ if self.context.enable_debuginfo and self._disable_sroa_like_opt:
+ poly_map = {}
+ # pre-scan all blocks
+ for block in self.blocks.values():
+ for x in block.find_insts(numba_ir.Assign):
+ if x.target.name.startswith("$"):
+ continue
+ ssa_name = x.target.name
+ index = ssa_name.find(".")
+ src_name = ssa_name[:index] if index > 0 else ssa_name
+ # Check all the multi-versioned targets
+ if len(x.target.versioned_names) > 0:
+ fetype = self.typeof(ssa_name)
+ if src_name not in poly_map:
+ poly_map[src_name] = set()
+ # deduplicate polymorphic types
+ if isinstance(fetype, types.Literal):
+ fetype = fetype.literal_type
+ poly_map[src_name].add(fetype)
+ # Filter out multi-versioned but single typed variables
+ self.poly_var_typ_map = {
+ k: v for k, v in poly_map.items() if len(v) > 1
+ }
+
+ def _alloca_var(self, name, fetype):
+ """
+ Ensure the given variable has an allocated stack slot (if needed).
+ """
+ # If the name is not handled yet and a store is needed
+ if name not in self.varmap and self.store_var_needed(name):
+ index = name.find(".")
+ src_name = name[:index] if index > 0 else name
+ if src_name in self.poly_var_typ_map:
+ dtype = types.UnionType(self.poly_var_typ_map[src_name])
+ datamodel = self.context.data_model_manager[dtype]
+ if src_name not in self.poly_var_loc_map:
+ # UnionType has sorted set of types, max at last index
+ maxsizetype = dtype.types[-1]
+ # Create a single element aggregate type
+ aggr_type = types.UniTuple(maxsizetype, 1)
+ lltype = self.context.get_value_type(aggr_type)
+ ptr = self.alloca_lltype(src_name, lltype, datamodel)
+ # save the location of the union type for polymorphic var
+ self.poly_var_loc_map[src_name] = ptr
+ # Any member of this union type shoud type cast ptr to fetype
+ lltype = self.context.get_value_type(fetype)
+ castptr = self.builder.bitcast(
+ self.poly_var_loc_map[src_name], ir.PointerType(lltype)
+ )
+ # Remember the pointer
+ self.varmap[name] = castptr
+
+ super()._alloca_var(name, fetype)
+
+ def store_var_needed(self, name):
+ # Check the conditions used to elide stores in parent class,
+ # e.g. in method storevar() and _alloca_var()
+ return (
+ # used in multiple blocks
+ name not in self._singly_assigned_vars
+ # lowering with debuginfo
+ or self._disable_sroa_like_opt
+ )
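Editor's note: a hedged example of the "polymorphic variable" case these lowering and debuginfo changes target. With debug info enabled (and optimization off), a variable assigned values of different types gets a single union-typed stack slot and is described by a DWARF union in the debug info. The kernel below is illustrative only.

```python
# x is assigned an integer and then a float, so its SSA versions carry two
# distinct Numba types; compiling with debug=True, opt=False exercises the
# UnionModel / DW_TAG_union_type path added above.
from numba import cuda

@cuda.jit("void(float64[:])", debug=True, opt=False)
def poly_kernel(out):
    x = 1        # typed as an integer here
    x = 2.5      # ... and as a float here
    out[0] = x
```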
numba_cuda/numba/cuda/memory_management/__init__.py ADDED
@@ -0,0 +1 @@
+ from numba.cuda.memory_management.nrt import rtsys # noqa: F401
numba_cuda/numba/cuda/simulator/__init__.py CHANGED
@@ -38,11 +38,20 @@ if config.ENABLE_CUDASIM:
  sys.modules["numba.cuda.cudadrv.devicearray"] = cudadrv.devicearray
  sys.modules["numba.cuda.cudadrv.devices"] = cudadrv.devices
  sys.modules["numba.cuda.cudadrv.driver"] = cudadrv.driver
+ sys.modules["numba.cuda.cudadrv.linkable_code"] = cudadrv.linkable_code
  sys.modules["numba.cuda.cudadrv.runtime"] = cudadrv.runtime
  sys.modules["numba.cuda.cudadrv.drvapi"] = cudadrv.drvapi
  sys.modules["numba.cuda.cudadrv.error"] = cudadrv.error
  sys.modules["numba.cuda.cudadrv.nvvm"] = cudadrv.nvvm

- from . import compiler
+ from . import bf16, compiler, _internal

+ sys.modules["numba.cuda.bf16"] = bf16
  sys.modules["numba.cuda.compiler"] = compiler
+ sys.modules["numba.cuda._internal"] = _internal
+ sys.modules["numba.cuda._internal.cuda_bf16"] = _internal.cuda_bf16
+
+ from numba.cuda.simulator import memory_management
+
+ sys.modules["numba.cuda.memory_management"] = memory_management
+ sys.modules["numba.cuda.memory_management.nrt"] = memory_management.nrt
numba_cuda/numba/cuda/simulator/_internal/__init__.py ADDED
@@ -0,0 +1 @@
+ from numba.cuda.simulator._internal import cuda_bf16 # noqa: F401
numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py ADDED
File without changes
numba_cuda/numba/cuda/simulator/api.py CHANGED
@@ -7,6 +7,15 @@ Contains CUDA API functions
  from contextlib import contextmanager

  from .cudadrv.devices import require_context, reset, gpus # noqa: F401
+ from .cudadrv.linkable_code import (
+ PTXSource, # noqa: F401
+ CUSource, # noqa: F401
+ Cubin, # noqa: F401
+ Fatbin, # noqa: F401
+ Archive, # noqa: F401
+ Object, # noqa: F401
+ LTOIR, # noqa: F401
+ ) # noqa: F401
  from .kernel import FakeCUDAKernel
  from numba.core.sigutils import is_signature
  from numba.core import config
@@ -22,6 +31,10 @@ def is_float16_supported():
  return True


+ def is_bfloat16_supported():
+ return False
+
+
  class stream(object):
  """
  The stream API is supported in the simulator - however, all execution
@@ -72,6 +85,10 @@ def list_devices():
  return gpus


+ def get_current_device():
+ return gpus[0].device
+
+
  # Events

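Editor's note: since the simulator now reports bfloat16 as unsupported, code that exercises bfloat16 can guard on the feature-test function so it also runs under NUMBA_ENABLE_CUDASIM=1. This assumes the real target exposes the same `is_bfloat16_supported` name at the `numba.cuda` level, as the simulator mirror added above suggests.

```python
# Skip bfloat16-specific work when it is not available (always the case under
# the CUDA simulator, where is_bfloat16_supported() returns False).
from numba import cuda

if cuda.is_bfloat16_supported():
    print("bfloat16 kernels can be compiled on this target")
else:
    print("bfloat16 not supported here; skipping bf16 kernels")
```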