numba-cuda 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/codegen.py +69 -2
  3. numba_cuda/numba/cuda/compiler.py +41 -17
  4. numba_cuda/numba/cuda/cudadecl.py +15 -5
  5. numba_cuda/numba/cuda/cudadrv/driver.py +103 -20
  6. numba_cuda/numba/cuda/cudadrv/linkable_code.py +10 -2
  7. numba_cuda/numba/cuda/cudaimpl.py +103 -11
  8. numba_cuda/numba/cuda/decorators.py +18 -2
  9. numba_cuda/numba/cuda/dispatcher.py +27 -66
  10. numba_cuda/numba/cuda/runtime/nrt.cu +2 -17
  11. numba_cuda/numba/cuda/runtime/nrt.cuh +41 -0
  12. numba_cuda/numba/cuda/runtime/nrt.py +13 -1
  13. numba_cuda/numba/cuda/stubs.py +23 -11
  14. numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +236 -0
  15. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +140 -0
  16. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +98 -1
  17. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +122 -3
  18. numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +11 -0
  19. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +5 -2
  20. numba_cuda/numba/cuda/tests/test_binary_generation/nrt_extern.cu +7 -0
  21. numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu +4 -0
  22. numba_cuda/numba/cuda/utils.py +7 -0
  23. {numba_cuda-0.10.0.dist-info → numba_cuda-0.11.0.dist-info}/METADATA +1 -1
  24. {numba_cuda-0.10.0.dist-info → numba_cuda-0.11.0.dist-info}/RECORD +27 -24
  25. {numba_cuda-0.10.0.dist-info → numba_cuda-0.11.0.dist-info}/WHEEL +1 -1
  26. {numba_cuda-0.10.0.dist-info → numba_cuda-0.11.0.dist-info}/licenses/LICENSE +0 -0
  27. {numba_cuda-0.10.0.dist-info → numba_cuda-0.11.0.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/cudaimpl.py

@@ -1,6 +1,7 @@
  from functools import reduce
  import operator
  import math
+ import struct

  from llvmlite import ir
  import llvmlite.binding as ll
@@ -92,10 +93,61 @@ def _get_unique_smem_id(name):
      return "{0}_{1}".format(name, _unique_smem_id)


+ def _validate_alignment(alignment: int):
+     """
+     Ensures that *alignment*, if not None, is a) greater than zero, b) a power
+     of two, and c) a multiple of the size of a pointer. If any of these
+     conditions are not met, a ValueError is raised. Otherwise, this
+     function returns None, indicating that the alignment is valid.
+     """
+     if alignment is None:
+         return
+     if not isinstance(alignment, int):
+         raise ValueError("Alignment must be an integer")
+     if alignment <= 0:
+         raise ValueError("Alignment must be positive")
+     if (alignment & (alignment - 1)) != 0:
+         raise ValueError("Alignment must be a power of 2")
+     pointer_size = struct.calcsize("P")
+     if (alignment % pointer_size) != 0:
+         msg = f"Alignment must be a multiple of {pointer_size}"
+         raise ValueError(msg)
+
+
+ def _try_extract_and_validate_alignment(sig: types.Tuple):
+     """
+     Extracts and validates the alignment from the supplied signature.
+
+     Returns the alignment if it is present and is an integer literal;
+     otherwise, returns None.
+
+     N.B. Currently, this routine assumes the signature has exactly
+     three arguments, with the alignment (if present) as the third
+     argument, as is the case with the shared and local array
+     helper routines below.
+
+     If this routine is called from new places, you may need to
+     review this implicit assumption.
+     """
+     if len(sig.args) != 3:
+         return None
+
+     alignment_arg = sig.args[2]
+     if not isinstance(alignment_arg, types.IntegerLiteral):
+         return None
+
+     alignment_arg = alignment_arg.literal_value
+     _validate_alignment(alignment_arg)
+     return alignment_arg
+
+
  @lower(cuda.shared.array, types.IntegerLiteral, types.Any)
+ @lower(cuda.shared.array, types.IntegerLiteral, types.Any, types.IntegerLiteral)
+ @lower(cuda.shared.array, types.IntegerLiteral, types.Any, types.NoneType)
  def cuda_shared_array_integer(context, builder, sig, args):
      length = sig.args[0].literal_value
      dtype = parse_dtype(sig.args[1])
+     alignment = _try_extract_and_validate_alignment(sig)
      return _generic_array(
          context,
          builder,
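As a standalone illustration of the rules `_validate_alignment` enforces (the `is_valid_alignment` helper below is hypothetical, and the asserts assume a 64-bit build where a pointer is 8 bytes):

import struct

def is_valid_alignment(alignment) -> bool:
    # Mirrors _validate_alignment as a predicate: a valid alignment is a
    # positive int, a power of two, and a multiple of the pointer size.
    pointer_size = struct.calcsize("P")  # 8 on 64-bit platforms
    return (
        isinstance(alignment, int)
        and alignment > 0
        and (alignment & (alignment - 1)) == 0
        and alignment % pointer_size == 0
    )

assert is_valid_alignment(16)      # power of two, multiple of 8
assert not is_valid_alignment(12)  # multiple of 4 but not a power of two
assert not is_valid_alignment(4)   # power of two but smaller than a pointer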
@@ -104,14 +156,17 @@ def cuda_shared_array_integer(context, builder, sig, args):
          symbol_name=_get_unique_smem_id("_cudapy_smem"),
          addrspace=nvvm.ADDRSPACE_SHARED,
          can_dynsized=True,
+         alignment=alignment,
      )


- @lower(cuda.shared.array, types.Tuple, types.Any)
- @lower(cuda.shared.array, types.UniTuple, types.Any)
+ @lower(cuda.shared.array, types.BaseTuple, types.Any)
+ @lower(cuda.shared.array, types.BaseTuple, types.Any, types.IntegerLiteral)
+ @lower(cuda.shared.array, types.BaseTuple, types.Any, types.NoneType)
  def cuda_shared_array_tuple(context, builder, sig, args):
      shape = [s.literal_value for s in sig.args[0]]
      dtype = parse_dtype(sig.args[1])
+     alignment = _try_extract_and_validate_alignment(sig)
      return _generic_array(
          context,
          builder,
@@ -120,13 +175,17 @@ def cuda_shared_array_tuple(context, builder, sig, args):
          symbol_name=_get_unique_smem_id("_cudapy_smem"),
          addrspace=nvvm.ADDRSPACE_SHARED,
          can_dynsized=True,
+         alignment=alignment,
      )


  @lower(cuda.local.array, types.IntegerLiteral, types.Any)
+ @lower(cuda.local.array, types.IntegerLiteral, types.Any, types.IntegerLiteral)
+ @lower(cuda.local.array, types.IntegerLiteral, types.Any, types.NoneType)
  def cuda_local_array_integer(context, builder, sig, args):
      length = sig.args[0].literal_value
      dtype = parse_dtype(sig.args[1])
+     alignment = _try_extract_and_validate_alignment(sig)
      return _generic_array(
          context,
          builder,
@@ -135,14 +194,17 @@ def cuda_local_array_integer(context, builder, sig, args):
          symbol_name="_cudapy_lmem",
          addrspace=nvvm.ADDRSPACE_LOCAL,
          can_dynsized=False,
+         alignment=alignment,
      )


- @lower(cuda.local.array, types.Tuple, types.Any)
- @lower(cuda.local.array, types.UniTuple, types.Any)
- def ptx_lmem_alloc_array(context, builder, sig, args):
+ @lower(cuda.local.array, types.BaseTuple, types.Any)
+ @lower(cuda.local.array, types.BaseTuple, types.Any, types.IntegerLiteral)
+ @lower(cuda.local.array, types.BaseTuple, types.Any, types.NoneType)
+ def cuda_local_array_tuple(context, builder, sig, args):
      shape = [s.literal_value for s in sig.args[0]]
      dtype = parse_dtype(sig.args[1])
+     alignment = _try_extract_and_validate_alignment(sig)
      return _generic_array(
          context,
          builder,
@@ -151,6 +213,7 @@ def ptx_lmem_alloc_array(context, builder, sig, args):
          symbol_name="_cudapy_lmem",
          addrspace=nvvm.ADDRSPACE_LOCAL,
          can_dynsized=False,
+         alignment=alignment,
      )

@@ -966,7 +1029,14 @@ def ptx_nanosleep(context, builder, sig, args):


  def _generic_array(
-     context, builder, shape, dtype, symbol_name, addrspace, can_dynsized=False
+     context,
+     builder,
+     shape,
+     dtype,
+     symbol_name,
+     addrspace,
+     can_dynsized=False,
+     alignment=None,
  ):
      elemcount = reduce(operator.mul, shape, 1)

@@ -994,6 +1064,14 @@
          # NVVM is smart enough to only use local memory if no register is
          # available
          dataptr = cgutils.alloca_once(builder, laryty, name=symbol_name)
+
+         # If the caller has specified a custom alignment, just set the align
+         # attribute on the alloca IR directly. We don't do any additional
+         # hand-holding here like checking the underlying data type's alignment
+         # or rounding up to the next power of 2--those checks will have already
+         # been done by the time we see the alignment value.
+         if alignment is not None:
+             dataptr.align = alignment
      else:
          lmod = builder.module

@@ -1001,11 +1079,25 @@
          gvmem = cgutils.add_global_variable(
              lmod, laryty, symbol_name, addrspace
          )
-         # Specify alignment to avoid misalignment bug
-         align = context.get_abi_sizeof(lldtype)
-         # Alignment is required to be a power of 2 for shared memory. If it is
-         # not a power of 2 (e.g. for a Record array) then round up accordingly.
-         gvmem.align = 1 << (align - 1).bit_length()
+
+         # If the caller hasn't specified a custom alignment, obtain the
+         # underlying dtype alignment from the ABI and then round it up to
+         # a power of two. Otherwise, just use the caller's alignment.
+         #
+         # N.B. The caller *could* provide a valid-but-smaller-than-natural
+         # alignment here; we'll assume the caller knows what they're
+         # doing and let that through without error.
+
+         if alignment is None:
+             abi_alignment = context.get_abi_alignment(lldtype)
+             # Alignment is required to be a power of 2 for shared memory.
+             # If it is not a power of 2 (e.g. for a Record array) then round
+             # up accordingly.
+             actual_alignment = 1 << (abi_alignment - 1).bit_length()
+         else:
+             actual_alignment = alignment
+
+         gvmem.align = actual_alignment

          if dynamic_smem:
              gvmem.linkage = "external"
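The IR-level effect can be seen in a minimal standalone llvmlite sketch (the module, names, and float32 ABI alignment of 4 below are illustrative, not the package's own code path):

from llvmlite import ir

mod = ir.Module(name="align_demo")
fn = ir.Function(mod, ir.FunctionType(ir.VoidType(), []), name="demo")
builder = ir.IRBuilder(fn.append_basic_block("entry"))
laryty = ir.ArrayType(ir.FloatType(), 32)

# Local array path: a caller-specified alignment is set on the alloca as-is
dataptr = builder.alloca(laryty, name="_cudapy_lmem")
dataptr.align = 16

# Shared array path: with no caller alignment, derive it from the ABI and
# round up to the next power of two
gvmem = ir.GlobalVariable(mod, laryty, "_cudapy_smem_0")
abi_alignment = 4  # e.g. ABI alignment of float32
gvmem.align = 1 << (abi_alignment - 1).bit_length()

builder.ret_void()
print(mod)  # the printed IR shows "align 16" and "align 4"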
numba_cuda/numba/cuda/decorators.py

@@ -17,6 +17,7 @@ def jit(
      func_or_sig=None,
      device=False,
      inline="never",
+     forceinline=False,
      link=[],
      debug=None,
      opt=None,
@@ -39,6 +40,14 @@
      .. note:: A kernel cannot have any return value.
      :param device: Indicates whether this is a device function.
      :type device: bool
+     :param inline: Enables inlining at the Numba IR level when set to
+         ``"always"``. See `Notes on Inlining
+         <https://numba.readthedocs.io/en/stable/developer/inlining.html>`_.
+     :type inline: str
+     :param forceinline: Enables inlining at the NVVM IR level when set to
+         ``True``. This is accomplished by adding the ``alwaysinline`` function
+         attribute to the function definition.
+     :type forceinline: bool
      :param link: A list of files containing PTX or CUDA C/C++ source to link
          with the function
      :type link: list
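A short sketch of the new option in use (the device function and kernel below are illustrative): forceinline=True adds `alwaysinline` to the generated NVVM IR function, independently of Numba IR inlining via inline="always".

from numba import cuda

# Device function marked for inlining at the NVVM IR level
@cuda.jit("float32(float32, float32)", device=True, forceinline=True)
def mul(a, b):
    return a * b

@cuda.jit
def kernel(out, x, y):
    i = cuda.grid(1)
    if i < out.size:
        out[i] = mul(x[i], y[i])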
@@ -85,7 +94,9 @@
          DeprecationWarning(
              "Passing bool to inline argument is deprecated, please refer to "
              "Numba's documentation on inlining: "
-             "https://numba.readthedocs.io/en/stable/developer/inlining.html"
+             "https://numba.readthedocs.io/en/stable/developer/inlining.html. "
+             "You may have wanted the forceinline argument instead, to force "
+             "inlining at the NVVM IR level."
          )

          inline = "always" if inline else "never"
@@ -140,6 +151,7 @@
          targetoptions["fastmath"] = fastmath
          targetoptions["device"] = device
          targetoptions["inline"] = inline
+         targetoptions["forceinline"] = forceinline
          targetoptions["extensions"] = extensions

          disp = CUDADispatcher(func, targetoptions=targetoptions)
@@ -182,6 +194,7 @@
              func,
              device=device,
              inline=inline,
+             forceinline=forceinline,
              debug=debug,
              opt=opt,
              lineinfo=lineinfo,
@@ -206,6 +219,7 @@
          targetoptions["fastmath"] = fastmath
          targetoptions["device"] = device
          targetoptions["inline"] = inline
+         targetoptions["forceinline"] = forceinline
          targetoptions["extensions"] = extensions
          disp = CUDADispatcher(func_or_sig, targetoptions=targetoptions)

@@ -236,4 +250,6 @@ def declare_device(name, sig, link=None):
          msg = "Return type must be provided for device declarations"
          raise TypeError(msg)

-     return declare_device_function(name, restype, argtypes, link)
+     template = declare_device_function(name, restype, argtypes, link)
+
+     return template.key
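Caller-side usage of declare_device is unchanged by this; for reference, a sketch (the `mul` function and `mul.cu` file are hypothetical):

from numba import cuda

# Declare an external device function defined in CUDA C; the object
# returned by declare_device is called like a normal function in kernels.
mul = cuda.declare_device("mul", "float32(float32, float32)")

@cuda.jit(link=["mul.cu"])
def kernel(out, x, y):
    i = cuda.grid(1)
    if i < out.size:
        out[i] = mul(x[i], y[i])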
numba_cuda/numba/cuda/dispatcher.py

@@ -1,27 +1,25 @@
  import numpy as np
  import os
- import re
  import sys
  import ctypes
  import functools
- from collections import defaultdict

- from numba.core import config, ir, serialize, sigutils, types, typing, utils
+ from numba.core import config, serialize, sigutils, types, typing, utils
  from numba.core.caching import Cache, CacheImpl
  from numba.core.compiler_lock import global_compiler_lock
  from numba.core.dispatcher import Dispatcher
  from numba.core.errors import NumbaPerformanceWarning
  from numba.core.typing.typeof import Purpose, typeof
- from numba.core.types.functions import Function
  from numba.cuda.api import get_current_device
  from numba.cuda.args import wrap_arg
  from numba.cuda.compiler import (
      compile_cuda,
      CUDACompiler,
      kernel_fixup,
-     ExternFunction,
  )
+ import re
  from numba.cuda.cudadrv import driver
+ from numba.cuda.cudadrv.linkable_code import LinkableCode
  from numba.cuda.cudadrv.devices import get_context
  from numba.cuda.descriptor import cuda_target
  from numba.cuda.errors import (
@@ -29,7 +27,7 @@ from numba.cuda.errors import (
      normalize_kernel_dimensions,
  )
  from numba.cuda import types as cuda_types
- from numba.cuda.runtime.nrt import rtsys
+ from numba.cuda.runtime.nrt import rtsys, NRT_LIBRARY
  from numba.cuda.locks import module_init_lock

  from numba import cuda
@@ -59,54 +57,6 @@ cuda_fp16_math_funcs = [
  reshape_funcs = ["nocopy_empty_reshape", "numba_attempt_nocopy_reshape"]


- def get_cres_link_objects(cres):
-     """Given a compile result, return a set of all linkable code objects that
-     are required for it to be fully linked."""
-
-     link_objects = set()
-
-     # List of calls into declared device functions
-     device_func_calls = [
-         (name, v)
-         for name, v in cres.fndesc.typemap.items()
-         if (isinstance(v, cuda_types.CUDADispatcher))
-     ]
-
-     # List of tuples with SSA name of calls and corresponding signature
-     call_signatures = [
-         (call.func.name, sig)
-         for call, sig in cres.fndesc.calltypes.items()
-         if (isinstance(call, ir.Expr) and call.op == "call")
-     ]
-
-     # Map SSA names to all invoked signatures
-     call_signature_d = defaultdict(list)
-     for name, sig in call_signatures:
-         call_signature_d[name].append(sig)
-
-     # Add the link objects from the current function's callees
-     for name, v in device_func_calls:
-         for sig in call_signature_d.get(name, []):
-             called_cres = v.dispatcher.overloads[sig.args]
-             called_link_objects = get_cres_link_objects(called_cres)
-             link_objects.update(called_link_objects)
-
-     # From this point onwards, we are only interested in ExternFunction
-     # declarations - these are the calls made directly in this function to
-     # them.
-     for name, v in cres.fndesc.typemap.items():
-         if not isinstance(v, Function):
-             continue
-
-         if not isinstance(v.typing_key, ExternFunction):
-             continue
-
-         for obj in v.typing_key.link:
-             link_objects.add(obj)
-
-     return link_objects
-
-
  class _Kernel(serialize.ReduceMixin):
      """
      CUDA Kernel specialized for a given set of argument types. When called, this
@@ -137,6 +87,7 @@ class _Kernel(serialize.ReduceMixin):
          debug=False,
          lineinfo=False,
          inline=False,
+         forceinline=False,
          fastmath=False,
          extensions=None,
          max_registers=None,
@@ -182,7 +133,7 @@
              self.argtypes,
              debug=self.debug,
              lineinfo=lineinfo,
-             inline=inline,
+             forceinline=forceinline,
              fastmath=fastmath,
              nvvm_options=nvvm_options,
              cc=cc,
@@ -237,9 +188,6 @@

          self.maybe_link_nrt(link, tgt_ctx, asm)

-         for obj in get_cres_link_objects(cres):
-             lib.add_linking_file(obj)
-
          for filepath in link:
              lib.add_linking_file(filepath)

@@ -262,6 +210,13 @@
          self.reload_init = []

      def maybe_link_nrt(self, link, tgt_ctx, asm):
+         """
+         Add the NRT source code to the link if the necessary conditions
+         are met: NRT must be enabled for the CUDATargetContext, and either
+         NRT functions must be detected in the kernel asm or an NRT-enabled
+         LinkableCode object must be passed.
+         """
+
          if not tgt_ctx.enable_nrt:
              return
@@ -271,13 +226,19 @@
              + all_nrt
              + r")\s*\([^)]*\)\s*;"
          )
-
+         link_nrt = False
          nrt_in_asm = re.findall(pattern, asm)
-
-         basedir = os.path.dirname(os.path.abspath(__file__))
-         if nrt_in_asm:
-             nrt_path = os.path.join(basedir, "runtime", "nrt.cu")
-             link.append(nrt_path)
+         if len(nrt_in_asm) > 0:
+             link_nrt = True
+         if not link_nrt:
+             for file in link:
+                 if isinstance(file, LinkableCode):
+                     if file.nrt:
+                         link_nrt = True
+                         break
+
+         if link_nrt:
+             link.append(NRT_LIBRARY)

      @property
      def library(self):
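The decision reduces to "NRT functions referenced in the PTX, or an NRT-enabled linkable file supplied". A standalone sketch with illustrative stand-in inputs (FakeLinkableCode is hypothetical):

class FakeLinkableCode:
    """Stand-in for LinkableCode, which now carries an `nrt` flag."""
    nrt = True

nrt_in_asm = []              # regex matches for NRT_* declarations in the PTX
link = [FakeLinkableCode()]  # files passed via @cuda.jit(link=[...])

link_nrt = len(nrt_in_asm) > 0 or any(
    getattr(f, "nrt", False) for f in link
)
print(link_nrt)  # True: the NRT-enabled file alone forces the NRT link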
@@ -1073,7 +1034,7 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin):
          with self._compiling_counter:
              debug = self.targetoptions.get("debug")
              lineinfo = self.targetoptions.get("lineinfo")
-             inline = self.targetoptions.get("inline")
+             forceinline = self.targetoptions.get("forceinline")
              fastmath = self.targetoptions.get("fastmath")

              nvvm_options = {
@@ -1091,7 +1052,7 @@
                  args,
                  debug=debug,
                  lineinfo=lineinfo,
-                 inline=inline,
+                 forceinline=forceinline,
                  fastmath=fastmath,
                  nvvm_options=nvvm_options,
                  cc=cc,
numba_cuda/numba/cuda/runtime/nrt.cu

@@ -4,30 +4,14 @@
  #include <cuda/atomic>

  #include "memsys.cuh"
+ #include "nrt.cuh"

- typedef void (*NRT_dtor_function)(void* ptr, size_t size, void* info);
- typedef void (*NRT_dealloc_func)(void* ptr, void* dealloc_info);
-
- typedef struct MemInfo NRT_MemInfo;
-
- extern "C" {
- struct MemInfo {
-     cuda::atomic<size_t, cuda::thread_scope_device> refct;
-     NRT_dtor_function dtor;
-     void* dtor_info;
-     void* data;
-     size_t size;
- };
- }

  extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr)
  {
      TheMSys = memsys_ptr;
  }

- static __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo **mi);
- static __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out);
- extern "C" __device__ void* NRT_Allocate_External(size_t size);

  extern "C" __device__ void* NRT_Allocate(size_t size)
  {
@@ -177,6 +161,7 @@ extern "C" __device__ void NRT_decref(NRT_MemInfo* mi)
      }
  }

+
  #endif

  extern "C" __device__ void NRT_incref(NRT_MemInfo* mi)
numba_cuda/numba/cuda/runtime/nrt.cuh (new file)

@@ -0,0 +1,41 @@
+ #include <cuda/atomic>
+
+ typedef void (*NRT_dtor_function)(void* ptr, size_t size, void* info);
+ typedef void (*NRT_dealloc_func)(void* ptr, void* dealloc_info);
+
+ extern "C"
+ struct MemInfo {
+     cuda::atomic<size_t, cuda::thread_scope_device> refct;
+     NRT_dtor_function dtor;
+     void* dtor_info;
+     void* data;
+     size_t size;
+ };
+ typedef struct MemInfo NRT_MemInfo;
+
+ extern "C" __device__ void* NRT_Allocate(size_t size);
+ extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
+                                             void* data,
+                                             size_t size,
+                                             NRT_dtor_function dtor,
+                                             void* dtor_info);
+ static __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo **mi);
+ static __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out);
+ extern "C" __device__ void* NRT_Allocate_External(size_t size);
+ extern "C" __device__ void NRT_decref(NRT_MemInfo* mi);
+ extern "C" __device__ void NRT_incref(NRT_MemInfo* mi);
+ extern "C" __device__ void* NRT_Allocate_External(size_t size);
+ static __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out);
+ static __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo **mi);
+ extern "C" __device__ NRT_MemInfo *NRT_MemInfo_alloc_aligned(size_t size, unsigned align);
+ extern "C" __device__ void* NRT_MemInfo_data_fast(NRT_MemInfo *mi);
+ extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi);
+ extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi);
+ extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi);
+ extern "C" __device__ void NRT_Free(void* ptr);
+ extern "C" __device__ NRT_MemInfo* NRT_MemInfo_new(void* data, size_t size, NRT_dtor_function dtor, void* dtor_info);
+ extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
+                                             void* data,
+                                             size_t size,
+                                             NRT_dtor_function dtor,
+                                             void* dtor_info);
numba_cuda/numba/cuda/runtime/nrt.py

@@ -13,7 +13,8 @@ from numba.cuda.cudadrv.driver import (
  )
  from numba.cuda.cudadrv import devices
  from numba.cuda.api import get_current_device
- from numba.cuda.utils import _readenv
+ from numba.cuda.utils import _readenv, cached_file_read
+ from numba.cuda.cudadrv.linkable_code import CUSource


  # Check environment variable or config for NRT statistics enablement
@@ -32,6 +33,11 @@ if not hasattr(config, "NUMBA_CUDA_ENABLE_NRT"):
      config.CUDA_ENABLE_NRT = ENABLE_NRT


+ def get_include():
+     """Return the include path for the NRT header"""
+     return os.path.dirname(os.path.abspath(__file__))
+
+
  # Protect method to ensure NRT memory allocation and initialization
  def _alloc_init_guard(method):
      """
@@ -340,3 +346,9 @@

  # Create an instance of the runtime
  rtsys = _Runtime()
+
+
+ basedir = os.path.dirname(os.path.abspath(__file__))
+ nrt_path = os.path.join(basedir, "nrt.cu")
+ nrt_src = cached_file_read(nrt_path)
+ NRT_LIBRARY = CUSource(nrt_src, name="nrt.cu", nrt=True)
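A hypothetical use of the new pieces: a user-supplied CUDA C source that calls NRT functions can be constructed with nrt=True, so maybe_link_nrt links NRT_LIBRARY automatically; get_include() locates nrt.cuh. The source string below is illustrative, and its compilation must be able to find the header on the include path.

from numba.cuda.runtime.nrt import get_include
from numba.cuda.cudadrv.linkable_code import CUSource

print(get_include())  # directory containing nrt.cuh

user_src = r"""
#include "nrt.cuh"

extern "C" __device__ int retain(int* out, void* mi) {
    NRT_incref((NRT_MemInfo*)mi);
    *out = 0;
    return 0;
}
"""

# nrt=True marks this source as requiring the NRT library at link time
user_module = CUSource(user_src, name="user_nrt.cu", nrt=True)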
numba_cuda/numba/cuda/stubs.py

@@ -129,12 +129,16 @@ class shared(Stub):
      _description_ = "<shared>"

      @stub_function
-     def array(shape, dtype):
+     def array(shape, dtype, alignment=None):
          """
-         Allocate a shared array of the given *shape* and *type*. *shape* is
-         either an integer or a tuple of integers representing the array's
-         dimensions. *type* is a :ref:`Numba type <numba-types>` of the
-         elements needing to be stored in the array.
+         Allocate a shared array of the given *shape*, *type*, and, optionally,
+         *alignment*. *shape* is either an integer or a tuple of integers
+         representing the array's dimensions. *type* is a :ref:`Numba type
+         <numba-types>` of the elements needing to be stored in the array.
+         *alignment* is an optional integer specifying the byte alignment of
+         the array. When specified, it must be a power of two, and a multiple
+         of the size of a pointer (8 bytes). When not specified, the array is
+         allocated with an alignment appropriate for the supplied *dtype*.

          The returned array-like object can be read and written to like any
          normal device array (e.g. through indexing).
@@ -149,12 +153,20 @@ class local(Stub):
      _description_ = "<local>"

      @stub_function
-     def array(shape, dtype):
-         """
-         Allocate a local array of the given *shape* and *type*. The array is
-         private to the current thread, and resides in global memory. An
-         array-like object is returned which can be read and written to like any
-         standard array (e.g. through indexing).
+     def array(shape, dtype, alignment=None):
+         """
+         Allocate a local array of the given *shape*, *type*, and, optionally,
+         *alignment*. *shape* is either an integer or a tuple of integers
+         representing the array's dimensions. *type* is a :ref:`Numba type
+         <numba-types>` of the elements needing to be stored in the array.
+         *alignment* is an optional integer specifying the byte alignment of
+         the array. When specified, it must be a power of two, and a multiple
+         of the size of a pointer (8 bytes). When not specified, the array is
+         allocated with an alignment appropriate for the supplied *dtype*.
+
+         The array is private to the current thread, and resides in global
+         memory. An array-like object is returned which can be read and
+         written to like any standard array (e.g. through indexing).
          """
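
Putting the user-facing change together, a minimal kernel sketch; note that the alignment argument must be a compile-time integer literal, since the lowering overloads are registered against types.IntegerLiteral:

import numpy as np
from numba import cuda, float32

@cuda.jit
def kernel(out):
    # 16 and 32 are valid alignments: powers of two and multiples of 8
    tile = cuda.shared.array(32, float32, alignment=16)
    scratch = cuda.local.array(4, float32, alignment=32)
    i = cuda.threadIdx.x
    scratch[0] = i
    tile[i] = scratch[0]
    cuda.syncthreads()
    out[i] = tile[i]

out = np.zeros(32, dtype=np.float32)
kernel[1, 32](out)
print(out[:4])  # [0. 1. 2. 3.]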