numba-cuda 0.10.1__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
Files changed (45)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/{cuda_bf16.py → _internal/cuda_bf16.py} +1 -1
  3. numba_cuda/numba/cuda/api.py +13 -0
  4. numba_cuda/numba/cuda/bf16.py +112 -0
  5. numba_cuda/numba/cuda/cg.py +2 -0
  6. numba_cuda/numba/cuda/codegen.py +77 -2
  7. numba_cuda/numba/cuda/compiler.py +22 -16
  8. numba_cuda/numba/cuda/cudadecl.py +21 -6
  9. numba_cuda/numba/cuda/cudadrv/driver.py +107 -20
  10. numba_cuda/numba/cuda/cudadrv/linkable_code.py +10 -2
  11. numba_cuda/numba/cuda/cudadrv/nvrtc.py +23 -1
  12. numba_cuda/numba/cuda/cudaimpl.py +103 -11
  13. numba_cuda/numba/cuda/debuginfo.py +27 -0
  14. numba_cuda/numba/cuda/decorators.py +7 -2
  15. numba_cuda/numba/cuda/dispatcher.py +25 -65
  16. numba_cuda/numba/cuda/runtime/nrt.cu +2 -17
  17. numba_cuda/numba/cuda/runtime/nrt.cuh +41 -0
  18. numba_cuda/numba/cuda/runtime/nrt.py +13 -1
  19. numba_cuda/numba/cuda/stubs.py +23 -11
  20. numba_cuda/numba/cuda/target.py +10 -1
  21. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +0 -12
  22. numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +33 -0
  23. numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +236 -0
  24. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +55 -0
  25. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +49 -23
  26. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +34 -51
  27. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +34 -0
  28. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +17 -0
  29. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +140 -0
  30. numba_cuda/numba/cuda/tests/data/cta_barrier.cu +23 -0
  31. numba_cuda/numba/cuda/tests/data/include/add.cuh +3 -0
  32. numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +3 -0
  33. numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +9 -0
  34. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +48 -1
  35. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +122 -3
  36. numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +11 -0
  37. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +5 -2
  38. numba_cuda/numba/cuda/tests/test_binary_generation/nrt_extern.cu +7 -0
  39. numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu +4 -0
  40. numba_cuda/numba/cuda/utils.py +7 -0
  41. {numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/METADATA +1 -1
  42. {numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/RECORD +45 -35
  43. {numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/WHEEL +1 -1
  44. {numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/licenses/LICENSE +0 -0
  45. {numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/tests/cudapy/test_extending.py

@@ -1,7 +1,10 @@
  from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
+ from llvmlite import ir

  import numpy as np
+ import os
  from numba import config, cuda, njit, types
+ from numba.extending import overload


  class Interval:
@@ -160,5 +163,142 @@ class TestExtending(CUDATestCase):
          np.testing.assert_allclose(r, expected)


+ TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
+ if TEST_BIN_DIR:
+     test_device_functions_a = os.path.join(
+         TEST_BIN_DIR, "test_device_functions.a"
+     )
+     test_device_functions_cubin = os.path.join(
+         TEST_BIN_DIR, "test_device_functions.cubin"
+     )
+     test_device_functions_cu = os.path.join(
+         TEST_BIN_DIR, "test_device_functions.cu"
+     )
+     test_device_functions_fatbin = os.path.join(
+         TEST_BIN_DIR, "test_device_functions.fatbin"
+     )
+     test_device_functions_fatbin_multi = os.path.join(
+         TEST_BIN_DIR, "test_device_functions_multi.fatbin"
+     )
+     test_device_functions_o = os.path.join(
+         TEST_BIN_DIR, "test_device_functions.o"
+     )
+     test_device_functions_ptx = os.path.join(
+         TEST_BIN_DIR, "test_device_functions.ptx"
+     )
+     test_device_functions_ltoir = os.path.join(
+         TEST_BIN_DIR, "test_device_functions.ltoir"
+     )
+
+
+ class TestExtendingLinkage(CUDATestCase):
+     def test_extension_adds_linkable_code(self):
+         cuda_major_version = cuda.runtime.get_version()[0]
+
+         if cuda_major_version < 12:
+             self.skipTest("CUDA 12 required for linking in-memory data")
+
+         files = (
+             (test_device_functions_a, cuda.Archive),
+             (test_device_functions_cubin, cuda.Cubin),
+             (test_device_functions_cu, cuda.CUSource),
+             (test_device_functions_fatbin, cuda.Fatbin),
+             (test_device_functions_o, cuda.Object),
+             (test_device_functions_ptx, cuda.PTXSource),
+             (test_device_functions_ltoir, cuda.LTOIR),
+         )
+
+         lto = config.CUDA_ENABLE_PYNVJITLINK
+
+         for path, ctor in files:
+             if ctor == cuda.LTOIR and not lto:
+                 # Don't try to test with LTOIR if LTO is not enabled
+                 continue
+
+             with open(path, "rb") as f:
+                 code_object = ctor(f.read())
+
+             def external_add(x, y):
+                 return x + y
+
+             @type_callable(external_add)
+             def type_external_add(context):
+                 def typer(x, y):
+                     if x == types.uint32 and y == types.uint32:
+                         return types.uint32
+
+                 return typer
+
+             @lower_builtin(external_add, types.uint32, types.uint32)
+             def lower_external_add(context, builder, sig, args):
+                 context.active_code_library.add_linking_file(code_object)
+                 i32 = ir.IntType(32)
+                 fnty = ir.FunctionType(i32, [i32, i32])
+                 fn = cgutils.get_or_insert_function(
+                     builder.module, fnty, "add_cabi"
+                 )
+                 return builder.call(fn, args)
+
+             @cuda.jit(lto=lto)
+             def use_external_add(r, x, y):
+                 r[0] = external_add(x[0], y[0])
+
+             r = np.zeros(1, dtype=np.uint32)
+             x = np.ones(1, dtype=np.uint32)
+             y = np.ones(1, dtype=np.uint32) * 2
+
+             use_external_add[1, 1](r, x, y)
+
+             np.testing.assert_equal(r[0], 3)
+
+             @cuda.jit(lto=lto)
+             def use_external_add_device(x, y):
+                 return external_add(x, y)
+
+             @cuda.jit(lto=lto)
+             def use_external_add_kernel(r, x, y):
+                 r[0] = use_external_add_device(x[0], y[0])
+
+             r = np.zeros(1, dtype=np.uint32)
+             x = np.ones(1, dtype=np.uint32)
+             y = np.ones(1, dtype=np.uint32) * 2
+
+             use_external_add_kernel[1, 1](r, x, y)
+
+             np.testing.assert_equal(r[0], 3)
+
+     def test_linked_called_through_overload(self):
+         cu_code = cuda.CUSource("""
+ extern "C" __device__
+ int bar(int *out, int a)
+ {
+     *out = a * 2;
+     return 0;
+ }
+ """)
+
+         bar = cuda.declare_device("bar", "int32(int32)", link=cu_code)
+
+         def bar_call(val):
+             pass
+
+         @overload(bar_call, target="cuda")
+         def ol_bar_call(a):
+             return lambda a: bar(a)
+
+         @cuda.jit("void(int32[::1], int32[::1])")
+         def foo(r, x):
+             i = cuda.grid(1)
+             if i < len(r):
+                 r[i] = bar_call(x[i])
+
+         x = np.arange(10, dtype=np.int32)
+         r = np.empty_like(x)
+
+         foo[1, 32](r, x)
+
+         np.testing.assert_equal(r, x * 2)
+
+
  if __name__ == "__main__":
      unittest.main()
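The additions above exercise two routes for binding external CUDA code: hand-registered typing and lowering (type_callable / lower_builtin plus add_linking_file), and the higher-level declare_device + overload route. Distilled to its core, the second route looks like this (a condensed sketch reusing names from the diff; the kernel name double_kernel is illustrative):

import numpy as np
from numba import cuda
from numba.extending import overload

# In-memory CUDA source wrapped as a linkable code object.
cu_code = cuda.CUSource("""
extern "C" __device__
int bar(int *out, int a)
{
    *out = a * 2;
    return 0;
}
""")

# Declare the external device function; link= attaches its source.
bar = cuda.declare_device("bar", "int32(int32)", link=cu_code)

def bar_call(val):
    pass  # pure-Python stub; the overload supplies the CUDA implementation

@overload(bar_call, target="cuda")
def ol_bar_call(a):
    return lambda a: bar(a)

@cuda.jit
def double_kernel(r, x):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = bar_call(x[i])

x = np.arange(10, dtype=np.int32)
r = np.empty_like(x)
double_kernel[1, 32](r, x)  # r == x * 2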
numba_cuda/numba/cuda/tests/data/cta_barrier.cu

@@ -0,0 +1,23 @@
+ #include <cooperative_groups.h>
+ #include <cuda/barrier>
+
+ namespace cg = cooperative_groups;
+
+ __device__ void _wait_on_tile(cuda::barrier<cuda::thread_scope_block> &tile)
+ {
+     auto token = tile.arrive();
+     tile.wait(std::move(token));
+ }
+
+ extern "C"
+ __device__ int cta_barrier(int *ret) {
+     auto cta = cg::this_thread_block();
+     cg::thread_block_tile<32> tile = cg::tiled_partition<32>(cta);
+     __shared__ cuda::barrier<cuda::thread_scope_block> barrier;
+     if (threadIdx.x == 0) {
+         init(&barrier, blockDim.x);
+     }
+
+     _wait_on_tile(barrier);
+     return 0;
+ }
numba_cuda/numba/cuda/tests/data/include/add.cuh

@@ -0,0 +1,3 @@
+ // Templated addition function: myadd
+ template <typename T>
+ __device__ T myadd(T a, T b) { return a + b; }
numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh

@@ -0,0 +1,3 @@
+ // Templated multiplication function: mymul
+ template <typename T>
+ __device__ T mymul(T a, T b) { return a * b; }
numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu

@@ -0,0 +1,9 @@
+ #include <add.cuh> // In numba/cuda/tests/data/include
+ #include <mul.cuh> // In numba/cuda/tests/doc_examples/ffi/include
+
+ extern "C"
+ __device__ int saxpy(float *ret, float a, float x, float y)
+ {
+     *ret = myadd(mymul(a, x), y);
+     return 0;
+ }
numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py

@@ -3,7 +3,7 @@

  import unittest
  from numba.cuda.testing import CUDATestCase, skip_on_cudasim
- from numba.tests.support import skip_unless_cffi
+ from numba.tests.support import skip_unless_cffi, override_config


  @skip_unless_cffi
@@ -85,6 +85,53 @@ class TestFFI(CUDATestCase):
          actual = r[()]
          np.testing.assert_allclose(expected, actual)

+     def test_ex_extra_includes(self):
+         import numpy as np
+         from numba import cuda, config
+         import os
+
+         basedir = os.path.dirname(os.path.abspath(__file__))
+         mul_dir = os.path.join(basedir, "ffi", "include")
+         saxpy_cu = os.path.join(basedir, "ffi", "saxpy.cu")
+
+         testdir = os.path.dirname(basedir)
+         add_dir = os.path.join(testdir, "data", "include")
+
+         includedir = ":".join([mul_dir, add_dir])
+         with override_config("CUDA_NVRTC_EXTRA_SEARCH_PATHS", includedir):
+             # magictoken.ex_extra_search_paths.begin
+             from numba import config
+
+             includedir = ":".join([mul_dir, add_dir])
+             config.CUDA_NVRTC_EXTRA_SEARCH_PATHS = includedir
+             # magictoken.ex_extra_search_paths.end
+
+             # magictoken.ex_extra_search_paths_kernel.begin
+             sig = "float32(float32, float32, float32)"
+             saxpy = cuda.declare_device("saxpy", sig=sig, link=saxpy_cu)
+
+             @cuda.jit
+             def vector_saxpy(a, x, y, res):
+                 i = cuda.grid(1)
+                 if i < len(res):
+                     res[i] = saxpy(a, x[i], y[i])
+
+             # magictoken.ex_extra_search_paths_kernel.end
+
+             size = 10_000
+             a = 3.0
+             X = np.ones((size,), dtype="float32")
+             Y = np.ones((size,), dtype="float32")
+             R = np.zeros((size,), dtype="float32")
+
+             block_size = 32
+             num_blocks = (size // block_size) + 1
+
+             vector_saxpy[num_blocks, block_size](a, X, Y, R)
+
+             expected = a * X + Y
+             np.testing.assert_equal(R, expected)
+

  if __name__ == "__main__":
      unittest.main()
numba_cuda/numba/cuda/tests/nrt/test_nrt.py

@@ -4,11 +4,86 @@ import os
  import numpy as np
  import unittest
  from numba.cuda.testing import CUDATestCase
-
  from numba.tests.support import run_in_subprocess, override_config
-
+ from numba.cuda import get_current_device
+ from numba.cuda.cudadrv.nvrtc import compile
+ from numba import types
+ from numba.cuda.cudadecl import registry as cuda_decl_registry
+ from numba.core.typing import signature
+ from numba.cuda.cudaimpl import lower as cuda_lower
  from numba import cuda
- from numba.cuda.runtime.nrt import rtsys
+ from numba.cuda.runtime.nrt import rtsys, get_include
+ from numba.core.typing.templates import AbstractTemplate
+ from numba.cuda.cudadrv.linkable_code import (
+     CUSource,
+     PTXSource,
+     Fatbin,
+     Cubin,
+     Archive,
+     Object,
+ )
+
+
+ TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
+
+ if TEST_BIN_DIR:
+
+     def make_linkable_code(name, kind, mode):
+         path = os.path.join(TEST_BIN_DIR, name)
+         with open(path, mode) as f:
+             contents = f.read()
+         return kind(contents, nrt=True)
+
+     nrt_extern_a = make_linkable_code("nrt_extern.a", Archive, "rb")
+     nrt_extern_cubin = make_linkable_code("nrt_extern.cubin", Cubin, "rb")
+     nrt_extern_cu = make_linkable_code(
+         "nrt_extern.cu",
+         CUSource,
+         "rb",
+     )
+     nrt_extern_fatbin = make_linkable_code("nrt_extern.fatbin", Fatbin, "rb")
+     nrt_extern_fatbin_multi = make_linkable_code(
+         "nrt_extern_multi.fatbin", Fatbin, "rb"
+     )
+     nrt_extern_o = make_linkable_code("nrt_extern.o", Object, "rb")
+     nrt_extern_ptx = make_linkable_code("nrt_extern.ptx", PTXSource, "rb")
+
+
+ def allocate_deallocate_handle():
+     """
+     Handle to call NRT_Allocate and NRT_Free
+     """
+     pass
+
+
+ @cuda_decl_registry.register_global(allocate_deallocate_handle)
+ class AllocateShimImpl(AbstractTemplate):
+     def generic(self, args, kws):
+         return signature(types.void)
+
+
+ device_fun_shim = cuda.declare_device(
+     "device_allocate_deallocate", types.int32()
+ )
+
+
+ # wrapper to turn the above into a python callable
+ def call_device_fun_shim():
+     return device_fun_shim()
+
+
+ @cuda_lower(allocate_deallocate_handle)
+ def allocate_deallocate_impl(context, builder, sig, args):
+     sig_ = types.int32()
+     # call the external function, passing the pointer
+     result = context.compile_internal(
+         builder,
+         call_device_fun_shim,
+         sig_,
+         (),
+     )
+
+     return result


  class TestNrtBasic(CUDATestCase):
@@ -77,6 +152,50 @@ class TestNrtBasic(CUDATestCase):
          self.assertEqual(out_ary[0], 1)


+ class TestNrtLinking(CUDATestCase):
+     def run(self, result=None):
+         with override_config("CUDA_ENABLE_NRT", True):
+             super(TestNrtLinking, self).run(result)
+
+     def test_nrt_detect_linked_ptx_file(self):
+         src = f"#include <{get_include()}/nrt.cuh>"
+         src += """
+             extern "C" __device__ int device_allocate_deallocate(int* nb_retval){
+                 auto ptr = NRT_Allocate(1);
+                 NRT_Free(ptr);
+                 return 0;
+             }
+         """
+         cc = get_current_device().compute_capability
+         ptx, _ = compile(src, "external_nrt.cu", cc)
+
+         @cuda.jit(link=[PTXSource(ptx.encode(), nrt=True)])
+         def kernel():
+             allocate_deallocate_handle()
+
+         kernel[1, 1]()
+
+     @unittest.skipIf(not TEST_BIN_DIR, "necessary binaries not generated.")
+     def test_nrt_detect_linkable_code(self):
+         codes = (
+             nrt_extern_a,
+             nrt_extern_cubin,
+             nrt_extern_cu,
+             nrt_extern_fatbin,
+             nrt_extern_fatbin_multi,
+             nrt_extern_o,
+             nrt_extern_ptx,
+         )
+         for code in codes:
+             with self.subTest(code=code):
+
+                 @cuda.jit(link=[code])
+                 def kernel():
+                     allocate_deallocate_handle()
+
+                 kernel[1, 1]()
+
+
  class TestNrtStatistics(CUDATestCase):
      def setUp(self):
          self._stream = cuda.default_stream()
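The TestNrtLinking additions reduce to a short recipe: include the new nrt.cuh header in external CUDA source, compile it with NVRTC, and mark the resulting code object with nrt=True so numba-cuda links the NRT runtime into the kernel. A condensed sketch using the same APIs as the diff (the function name alloc_free is illustrative, and combining declare_device's link= with an nrt=True PTXSource is my assumption from the two tests above, not a pattern the diff shows verbatim):

from numba import cuda, config
from numba.cuda import get_current_device
from numba.cuda.cudadrv.nvrtc import compile
from numba.cuda.cudadrv.linkable_code import PTXSource
from numba.cuda.runtime.nrt import get_include

config.CUDA_ENABLE_NRT = True  # the tests enable this via override_config

# External CUDA source that calls into the NRT via the new header.
src = f"#include <{get_include()}/nrt.cuh>" + """
extern "C" __device__ int alloc_free(int* nb_retval) {
    auto ptr = NRT_Allocate(1);  // one-byte NRT allocation
    NRT_Free(ptr);               // released again immediately
    return 0;
}
"""

cc = get_current_device().compute_capability
ptx, _ = compile(src, "external_nrt.cu", cc)

# nrt=True flags this code object as needing the NRT linked in.
alloc_free = cuda.declare_device(
    "alloc_free", "int32()", link=PTXSource(ptx.encode(), nrt=True)
)

@cuda.jit
def kernel():
    alloc_free()

kernel[1, 1]()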
numba_cuda/numba/cuda/tests/test_binary_generation/Makefile

@@ -40,6 +40,8 @@ LTOIR_FLAGS := $(LTOIR_GENCODE) -dc

  OUTPUT_DIR := ./

+ NRT_INCLUDE_DIR := $(shell python -c "from numba.cuda.runtime.nrt import get_include; print(get_include())")
+
  all:
  	@echo "GPU CC: $(GPU_CC)"
  	@echo "Alternative CC: $(ALT_CC)"
@@ -52,7 +54,16 @@ all:
  	nvcc $(NVCC_FLAGS) $(OBJECT_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.o test_device_functions.cu
  	nvcc $(NVCC_FLAGS) $(LIBRARY_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.a test_device_functions.cu

+ 	nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.cubin nrt_extern.cu -I$(NRT_INCLUDE_DIR)
+ 	nvcc $(NVCC_FLAGS) $(FATBIN_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.fatbin nrt_extern.cu -I$(NRT_INCLUDE_DIR)
+ 	nvcc $(NVCC_FLAGS) $(MULTI_FATBIN_FLAGS) -o $(OUTPUT_DIR)/nrt_extern_multi.fatbin nrt_extern.cu -I$(NRT_INCLUDE_DIR)
+ 	nvcc $(NVCC_FLAGS) $(PTX_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.ptx nrt_extern.cu -I$(NRT_INCLUDE_DIR)
+ 	nvcc $(NVCC_FLAGS) $(OBJECT_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.o nrt_extern.cu -I$(NRT_INCLUDE_DIR)
+ 	nvcc $(NVCC_FLAGS) $(LIBRARY_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.a nrt_extern.cu -I$(NRT_INCLUDE_DIR)
+
  	# Generate LTO-IR wrapped in a fatbin
  	nvcc $(NVCC_FLAGS) $(LTOIR_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.ltoir.o test_device_functions.cu
+ 	nvcc $(NVCC_FLAGS) $(LTOIR_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.ltoir.o nrt_extern.cu -I$(NRT_INCLUDE_DIR)
  	# Generate LTO-IR in a "raw" LTO-IR container
  	python generate_raw_ltoir.py --arch sm_$(GPU_CC) -o $(OUTPUT_DIR)/test_device_functions.ltoir test_device_functions.cu
+ 	python generate_raw_ltoir.py --arch sm_$(GPU_CC) -o $(OUTPUT_DIR)/nrt_extern.ltoir nrt_extern.cu --nrt
numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py

@@ -7,6 +7,7 @@ import subprocess
  import sys

  from cuda import nvrtc
+ from numba.cuda.runtime.nrt import get_include

  # Magic number found at the start of an LTO-IR file
  LTOIR_MAGIC = 0x7F4E43ED
@@ -88,7 +89,9 @@ def get_ltoir(source, name, arch):
          nvrtc.nvrtcCreateProgram(source.encode(), name.encode(), 0, [], [])
      )

-     cuda_include_flags = determine_include_flags()
+     cuda_include_flags = determine_include_flags() + (
+         [f"-I{get_include()}"] if args.nrt else []
+     )
      if cuda_include_flags is None:
          print("Error determining CUDA include flags. Exiting.", file=sys.stderr)
          sys.exit(1)
@@ -160,7 +163,7 @@ if __name__ == "__main__":
          help="compute arch to target (e.g. sm_87). Defaults to sm_50.",
          default="sm_50",
      )
-
+     parser.add_argument("--nrt", action="store_true")
      args = parser.parse_args()
      outputpath = args.output

numba_cuda/numba/cuda/tests/test_binary_generation/nrt_extern.cu

@@ -0,0 +1,7 @@
+ #include <nrt.cuh>
+
+ extern "C" __device__ int device_allocate_deallocate(int* nb_retval){
+     auto ptr = NRT_Allocate(1);
+     NRT_Free(ptr);
+     return 0;
+ }
numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu

@@ -17,3 +17,7 @@ extern "C" __device__ int add_from_numba(uint32_t *result, uint32_t a,
      *result = a + b;
      return 0;
  }
+
+ extern "C" __device__ uint32_t add_cabi(uint32_t a, uint32_t b) {
+     return a + b;
+ }
numba_cuda/numba/cuda/utils.py

@@ -1,6 +1,7 @@
  import os
  import warnings
  import traceback
+ import functools


  def _readenv(name, ctor, default):
@@ -20,3 +21,9 @@ def _readenv(name, ctor, default):
              RuntimeWarning,
          )
          return default
+
+
+ @functools.lru_cache(maxsize=None)
+ def cached_file_read(filepath, how="r"):
+     with open(filepath, how) as f:
+         return f.read()
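The new cached_file_read helper memoizes whole-file reads via functools.lru_cache, keyed on (filepath, how). A hypothetical usage, assuming the import path matches the file's location in the wheel and with illustrative file names:

from numba.cuda.utils import cached_file_read

# First call reads from disk; repeated calls with the same arguments
# return the cached contents.
text = cached_file_read("saxpy.cu")               # default how="r" (text)
data = cached_file_read("saxpy.cubin", how="rb")  # binary read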
{numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: numba-cuda
- Version: 0.10.1
+ Version: 0.12.1
  Summary: CUDA target for Numba
  Author: Anaconda Inc., NVIDIA Corporation
  License: BSD 2-clause