numba-cuda 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/compiler.py +7 -6
  3. numba_cuda/numba/cuda/cudadecl.py +6 -2
  4. numba_cuda/numba/cuda/cudadrv/devicearray.py +4 -1
  5. numba_cuda/numba/cuda/cudadrv/driver.py +1 -20
  6. numba_cuda/numba/cuda/cudadrv/linkable_code.py +13 -9
  7. numba_cuda/numba/cuda/cudadrv/nvrtc.py +5 -1
  8. numba_cuda/numba/cuda/cudadrv/nvvm.py +6 -1
  9. numba_cuda/numba/cuda/decorators.py +9 -2
  10. numba_cuda/numba/cuda/dispatcher.py +22 -3
  11. numba_cuda/numba/cuda/runtime/__init__.py +1 -0
  12. numba_cuda/numba/cuda/runtime/memsys.cu +94 -0
  13. numba_cuda/numba/cuda/runtime/memsys.cuh +17 -0
  14. numba_cuda/numba/cuda/runtime/nrt.cu +19 -22
  15. numba_cuda/numba/cuda/runtime/nrt.py +318 -0
  16. numba_cuda/numba/cuda/testing.py +11 -1
  17. numba_cuda/numba/cuda/tests/__init__.py +1 -0
  18. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +31 -0
  19. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +145 -11
  20. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +10 -7
  21. numba_cuda/numba/cuda/tests/nrt/mock_numpy.py +105 -1
  22. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +162 -40
  23. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +114 -0
  24. numba_cuda/numba/cuda/tests/support.py +11 -0
  25. numba_cuda/numba/cuda/utils.py +22 -0
  26. {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/METADATA +21 -3
  27. {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/RECORD +30 -23
  28. {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/WHEEL +1 -1
  29. {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/LICENSE +0 -0
  30. {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/top_level.txt +0 -0
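
The headline changes in this release are the new NRT (Numba Runtime) memory subsystem under numba/cuda/runtime/ (items 11-15) and a link= parameter on cuda.declare_device (exercised by the test and doc-example changes below). As orientation, a minimal sketch of declaration-site linking based only on the API visible in this diff; the names double_cu and double_it are illustrative, not package contents:

from numba import cuda
import numpy as np

# Hypothetical device function source; cuda.CUSource and link= are the new
# API surface exercised by the tests later in this diff.
double_cu = cuda.CUSource("""
extern "C" __device__
int double_it(int *out, int a)
{
    *out = a * 2;
    return 0;
}
""")

# The link source now travels with the declaration instead of being passed
# to every kernel via @cuda.jit(link=[...])
double_it = cuda.declare_device('double_it', 'int32(int32)', link=double_cu)

@cuda.jit
def kernel(r, x):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = double_it(x[i])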
numba_cuda/numba/cuda/runtime/nrt.py (new file)
@@ -0,0 +1,318 @@
+import ctypes
+import os
+from functools import wraps
+import numpy as np
+
+from numba import cuda, config
+from numba.core.runtime.nrt import _nrt_mstats
+from numba.cuda.cudadrv.driver import Linker, driver, launch_kernel
+from numba.cuda.cudadrv import devices
+from numba.cuda.api import get_current_device
+from numba.cuda.utils import _readenv
+
+
+# Check environment variable or config for NRT statistics enablement
+NRT_STATS = (
+    _readenv("NUMBA_CUDA_NRT_STATS", bool, False) or
+    getattr(config, "NUMBA_CUDA_NRT_STATS", False)
+)
+if not hasattr(config, "NUMBA_CUDA_NRT_STATS"):
+    config.CUDA_NRT_STATS = NRT_STATS
+
+
+# Check environment variable or config for NRT enablement
+ENABLE_NRT = (
+    _readenv("NUMBA_CUDA_ENABLE_NRT", bool, False) or
+    getattr(config, "NUMBA_CUDA_ENABLE_NRT", False)
+)
+if not hasattr(config, "NUMBA_CUDA_ENABLE_NRT"):
+    config.CUDA_ENABLE_NRT = ENABLE_NRT
+
+
+# Protect method to ensure NRT memory allocation and initialization
+def _alloc_init_guard(method):
+    """
+    Ensure NRT memory allocation and initialization before running the method
+    """
+    @wraps(method)
+    def wrapper(self, *args, **kwargs):
+        self.ensure_allocated()
+        self.ensure_initialized()
+        return method(self, *args, **kwargs)
+    return wrapper
+
+
+class _Runtime:
+    """Singleton class for Numba CUDA runtime"""
+    _instance = None
+
+    def __new__(cls, *args, **kwargs):
+        if cls._instance is None:
+            cls._instance = super(_Runtime, cls).__new__(cls, *args, **kwargs)
+        return cls._instance
+
+    def __init__(self):
+        """Initialize memsys module and variable"""
+        self._memsys_module = None
+        self._memsys = None
+        self._initialized = False
+
+    def _compile_memsys_module(self):
+        """
+        Compile memsys.cu and create a module from it in the current context
+        """
+        # Define the path for memsys.cu
+        memsys_mod = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)),
+            "memsys.cu"
+        )
+        cc = get_current_device().compute_capability
+
+        # Create a new linker instance and add the cu file
+        linker = Linker.new(cc=cc)
+        linker.add_cu_file(memsys_mod)
+
+        # Complete the linker and create a module from it
+        cubin = linker.complete()
+        ctx = devices.get_context()
+        module = ctx.create_module_image(cubin)
+
+        # Set the memsys module
+        self._memsys_module = module
+
+    def ensure_allocated(self, stream=None):
+        """
+        If memsys is not allocated, allocate it; otherwise, perform a no-op
+        """
+        if self._memsys is not None:
+            return
+
+        # Allocate the memsys
+        self.allocate(stream)
+
+    def allocate(self, stream=None):
+        """
+        Allocate memsys on global memory
+        """
+        from numba.cuda import device_array
+
+        # Check if memsys module is defined
+        if self._memsys_module is None:
+            # Compile the memsys module if not defined
+            self._compile_memsys_module()
+
+        # Allocate space for NRT_MemSys
+        ptr, nbytes = self._memsys_module.get_global_symbol("memsys_size")
+        memsys_size = ctypes.c_uint64()
+        driver.cuMemcpyDtoH(ctypes.addressof(memsys_size),
+                            ptr.device_ctypes_pointer, nbytes)
+        self._memsys = device_array(
+            (memsys_size.value,), dtype="i1", stream=stream)
+        self.set_memsys_to_module(self._memsys_module, stream=stream)
+
+    def _single_thread_launch(self, module, stream, name, params=()):
+        """
+        Launch the specified kernel with only 1 thread
+        """
+        if stream is None:
+            stream = cuda.default_stream()
+
+        func = module.get_function(name)
+        launch_kernel(
+            func.handle,
+            1, 1, 1,
+            1, 1, 1,
+            0,
+            stream.handle,
+            params,
+            cooperative=False
+        )
+
+    def ensure_initialized(self, stream=None):
+        """
+        If memsys is not initialized, initialize memsys
+        """
+        if self._initialized:
+            return
+
+        # Initialize the memsys
+        self.initialize(stream)
+
+    def initialize(self, stream=None):
+        """
+        Launch memsys initialization kernel
+        """
+        self.ensure_allocated()
+
+        self._single_thread_launch(
+            self._memsys_module, stream, "NRT_MemSys_init")
+        self._initialized = True
+
+        if config.CUDA_NRT_STATS:
+            self.memsys_enable_stats()
+
+    @_alloc_init_guard
+    def memsys_enable_stats(self, stream=None):
+        """
+        Enable memsys statistics
+        """
+        self._single_thread_launch(
+            self._memsys_module, stream, "NRT_MemSys_enable_stats")
+
+    @_alloc_init_guard
+    def memsys_disable_stats(self, stream=None):
+        """
+        Disable memsys statistics
+        """
+        self._single_thread_launch(
+            self._memsys_module, stream, "NRT_MemSys_disable_stats")
+
+    @_alloc_init_guard
+    def memsys_stats_enabled(self, stream=None):
+        """
+        Return a boolean indicating whether memsys is enabled. Synchronizes
+        context
+        """
+        enabled_ar = cuda.managed_array(1, np.uint8)
+
+        self._single_thread_launch(
+            self._memsys_module,
+            stream,
+            "NRT_MemSys_stats_enabled",
+            (enabled_ar.device_ctypes_pointer,)
+        )
+
+        cuda.synchronize()
+        return bool(enabled_ar[0])
+
+    @_alloc_init_guard
+    def _copy_memsys_to_host(self, stream):
+        """
+        Copy all statistics of memsys to the host
+        """
+        dt = np.dtype([
+            ('alloc', np.uint64),
+            ('free', np.uint64),
+            ('mi_alloc', np.uint64),
+            ('mi_free', np.uint64)
+        ])
+
+        stats_for_read = cuda.managed_array(1, dt)
+
+        self._single_thread_launch(
+            self._memsys_module,
+            stream,
+            "NRT_MemSys_read",
+            [stats_for_read.device_ctypes_pointer]
+        )
+        cuda.synchronize()
+
+        return stats_for_read[0]
+
+    @_alloc_init_guard
+    def get_allocation_stats(self, stream=None):
+        """
+        Get the allocation statistics
+        """
+        enabled = self.memsys_stats_enabled(stream)
+        if not enabled:
+            raise RuntimeError("NRT stats are disabled.")
+        memsys = self._copy_memsys_to_host(stream)
+        return _nrt_mstats(
+            alloc=memsys["alloc"],
+            free=memsys["free"],
+            mi_alloc=memsys["mi_alloc"],
+            mi_free=memsys["mi_free"]
+        )
+
+    @_alloc_init_guard
+    def _get_single_stat(self, stat, stream=None):
+        """
+        Get a single stat from the memsys
+        """
+        got = cuda.managed_array(1, np.uint64)
+        self._single_thread_launch(
+            self._memsys_module,
+            stream,
+            f"NRT_MemSys_read_{stat}",
+            [got.device_ctypes_pointer]
+        )
+
+        cuda.synchronize()
+        return got[0]
+
+    @_alloc_init_guard
+    def memsys_get_stats_alloc(self, stream=None):
+        """
+        Get the allocation statistic
+        """
+        enabled = self.memsys_stats_enabled(stream)
+        if not enabled:
+            raise RuntimeError("NRT stats are disabled.")
+
+        return self._get_single_stat("alloc")
+
+    @_alloc_init_guard
+    def memsys_get_stats_free(self, stream=None):
+        """
+        Get the free statistic
+        """
+        enabled = self.memsys_stats_enabled(stream)
+        if not enabled:
+            raise RuntimeError("NRT stats are disabled.")
+
+        return self._get_single_stat("free")
+
+    @_alloc_init_guard
+    def memsys_get_stats_mi_alloc(self, stream=None):
+        """
+        Get the mi alloc statistic
+        """
+        enabled = self.memsys_stats_enabled(stream)
+        if not enabled:
+            raise RuntimeError("NRT stats are disabled.")
+
+        return self._get_single_stat("mi_alloc")
+
+    @_alloc_init_guard
+    def memsys_get_stats_mi_free(self, stream=None):
+        """
+        Get the mi free statistic
+        """
+        enabled = self.memsys_stats_enabled(stream)
+        if not enabled:
+            raise RuntimeError("NRT stats are disabled.")
+
+        return self._get_single_stat("mi_free")
+
+    def set_memsys_to_module(self, module, stream=None):
+        """
+        Set the memsys module. The module must contain `NRT_MemSys_set` kernel,
+        and declare a pointer to NRT_MemSys structure.
+        """
+        if self._memsys is None:
+            raise RuntimeError(
+                "Please allocate NRT Memsys first before setting to module.")
+
+        self._single_thread_launch(
+            module,
+            stream,
+            "NRT_MemSys_set",
+            [self._memsys.device_ctypes_pointer,]
+        )
+
+    @_alloc_init_guard
+    def print_memsys(self, stream=None):
+        """
+        Print the current statistics of memsys, for debugging purposes
+        """
+        cuda.synchronize()
+        self._single_thread_launch(
+            self._memsys_module,
+            stream,
+            "NRT_MemSys_print"
+        )
+
+
+# Create an instance of the runtime
+rtsys = _Runtime()
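
For context, a minimal usage sketch of the rtsys singleton defined above, based only on the API visible in this file. It assumes the module is importable as numba.cuda.runtime.nrt (the wheel overlays the numba.cuda tree) and that a CUDA context is available; the kernel work is elided:

from numba import config
from numba.cuda.runtime.nrt import rtsys

config.CUDA_NRT_STATS = True         # or NUMBA_CUDA_NRT_STATS=1 before import
rtsys.memsys_enable_stats()          # launches NRT_MemSys_enable_stats
assert rtsys.memsys_stats_enabled()  # synchronizes and reads the flag back

# ... launch kernels that allocate and free NRT-managed objects here ...

stats = rtsys.get_allocation_stats()  # an _nrt_mstats namedtuple
print(stats.alloc, stats.free, stats.mi_alloc, stats.mi_free)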
numba_cuda/numba/cuda/testing.py
@@ -115,12 +115,22 @@ def skip_on_arm(reason):
 def skip_if_cuda_includes_missing(fn):
     # Skip when cuda.h is not available - generally this should indicate
     # whether the CUDA includes are available or not
-    cuda_h = os.path.join(config.CUDA_INCLUDE_PATH, 'cuda.h')
+    cuda_include_path = libs.get_cuda_include_dir()
+    cuda_h = os.path.join(cuda_include_path, 'cuda.h')
     cuda_h_file = (os.path.exists(cuda_h) and os.path.isfile(cuda_h))
     reason = 'CUDA include dir not available on this system'
     return unittest.skipUnless(cuda_h_file, reason)(fn)


+def skip_if_curand_kernel_missing(fn):
+    cuda_include_path = libs.get_cuda_include_dir()
+    curand_kernel_h = os.path.join(cuda_include_path, 'curand_kernel.h')
+    curand_kernel_h_file = (os.path.exists(curand_kernel_h) and
+                            os.path.isfile(curand_kernel_h))
+    reason = 'curand_kernel.h not available on this system'
+    return unittest.skipUnless(curand_kernel_h_file, reason)(fn)
+
+
 def skip_if_mvc_enabled(reason):
     """Skip a test if Minor Version Compatibility is enabled"""
     return unittest.skipIf(config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY,
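
The new skip_if_curand_kernel_missing helper is applied like the existing skip decorators; a minimal sketch (the test class and body here are illustrative, and the pattern matches test_include_cuda_header later in this diff):

from numba.cuda.testing import CUDATestCase, skip_if_curand_kernel_missing

class TestCurandDependent(CUDATestCase):
    @skip_if_curand_kernel_missing
    def test_uses_curand_header(self):
        # Would link a CUSource that does #include <curand_kernel.h>
        ...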
numba_cuda/numba/cuda/tests/__init__.py
@@ -49,6 +49,7 @@ def load_tests(loader, tests, pattern):
     if gpus and gpus[0].compute_capability >= (2, 0):
         suite.addTests(load_testsuite(loader, join(this_dir, 'cudadrv')))
         suite.addTests(load_testsuite(loader, join(this_dir, 'cudapy')))
+        suite.addTests(load_testsuite(loader, join(this_dir, 'nrt')))
         suite.addTests(load_testsuite(loader, join(this_dir,
                                                    'doc_examples')))
     else:
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py
@@ -4,6 +4,7 @@ from numba.cuda.cudadrv import devicearray
 from numba import cuda
 from numba.cuda.testing import unittest, CUDATestCase
 from numba.cuda.testing import skip_on_cudasim
+from numba.tests.support import IS_NUMPY_2


 class TestCudaNDArray(CUDATestCase):
@@ -456,6 +457,36 @@ class TestCudaNDArray(CUDATestCase):
         dev_array_from_host.copy_to_device(dev_array)


+class TestArrayMethod(CUDATestCase):
+    """Tests of the __array__() method via np.array"""
+
+    def test_np_array(self):
+        dev_array = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
+        host_array = np.array(dev_array)
+        np.testing.assert_equal(dev_array.copy_to_host(), host_array)
+
+    def test_np_array_dtype(self):
+        dtype = np.int32
+        dev_array = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
+        host_array = np.array(dev_array, dtype=dtype)
+        np.testing.assert_equal(
+            host_array,
+            dev_array.copy_to_host().astype(dtype)
+        )
+
+    @unittest.skipUnless(IS_NUMPY_2, "NumPy 1.x does not pass copy kwarg")
+    def test_np_array_copy_false(self):
+        dev_array = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
+        with self.assertRaisesRegex(ValueError, "`copy=False` is not"):
+            np.array(dev_array, copy=False)
+
+    @unittest.skipUnless(IS_NUMPY_2, "NumPy 1.x does not pass copy kwarg")
+    def test_np_array_copy_true(self):
+        dev_array = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
+        host_array = np.array(dev_array)
+        np.testing.assert_equal(dev_array.copy_to_host(), host_array)
+
+
 class TestRecarray(CUDATestCase):
     def test_recarray(self):
         # From issue #4111
numba_cuda/numba/cuda/tests/cudapy/test_device_func.py
@@ -1,11 +1,14 @@
 import re
-import types
+import cffi

 import numpy as np

-from numba.cuda.testing import unittest, skip_on_cudasim, CUDATestCase
-from numba import cuda, jit, float32, int32
+from numba.cuda.testing import (skip_if_curand_kernel_missing, skip_on_cudasim,
+                                test_data_dir, unittest, CUDATestCase)
+from numba import cuda, jit, float32, int32, types
 from numba.core.errors import TypingError
+from numba.tests.support import skip_unless_cffi
+from types import ModuleType


 class TestDeviceFunc(CUDATestCase):
@@ -92,7 +95,7 @@ class TestDeviceFunc(CUDATestCase):
         def add(a, b):
             return a + b

-        mymod = types.ModuleType(name='mymod')
+        mymod = ModuleType(name='mymod')
         mymod.add = add
         del add

@@ -192,31 +195,162 @@ class TestDeviceFunc(CUDATestCase):

         self.assertEqual(0x04010203, x[0])

-    def _test_declare_device(self, decl):
+
+times2_cu = cuda.CUSource("""
+extern "C" __device__
+int times2(int *out, int a)
+{
+    *out = a * 2;
+    return 0;
+}
+""")
+
+
+times4_cu = cuda.CUSource("""
+extern "C" __device__
+int times2(int *out, int a);
+
+extern "C" __device__
+int times4(int *out, int a)
+{
+    int tmp;
+    times2(&tmp, a);
+    *out = tmp * 2;
+    return 0;
+}
+""")
+
+jitlink_user_cu = cuda.CUSource("""
+extern "C" __device__
+int array_mutator(void *out, int *a);
+
+extern "C" __device__
+int use_array_mutator(void *out, int *a) {
+    array_mutator(out, a);
+    return 0;
+}
+""")
+
+rng_cu = cuda.CUSource("""
+#include <curand_kernel.h>
+
+extern "C" __device__
+int random_number(unsigned int *out, unsigned long long seed)
+{
+    // Initialize state
+    curandStateXORWOW_t state;
+    unsigned long long sequence = 1;
+    unsigned long long offset = 0;
+    curand_init(seed, sequence, offset, &state);
+
+    // Generate one random number
+    *out = curand(&state);
+
+    // Report no exception
+    return 0;
+}""")
+
+
+@skip_on_cudasim('External functions unsupported in the simulator')
+class TestDeclareDevice(CUDATestCase):
+
+    def check_api(self, decl):
         self.assertEqual(decl.name, 'f1')
         self.assertEqual(decl.sig.args, (float32[:],))
         self.assertEqual(decl.sig.return_type, int32)

-    @skip_on_cudasim('cudasim does not check signatures')
     def test_declare_device_signature(self):
         f1 = cuda.declare_device('f1', int32(float32[:]))
-        self._test_declare_device(f1)
+        self.check_api(f1)

-    @skip_on_cudasim('cudasim does not check signatures')
     def test_declare_device_string(self):
         f1 = cuda.declare_device('f1', 'int32(float32[:])')
-        self._test_declare_device(f1)
+        self.check_api(f1)

-    @skip_on_cudasim('cudasim does not check signatures')
     def test_bad_declare_device_tuple(self):
         with self.assertRaisesRegex(TypeError, 'Return type'):
             cuda.declare_device('f1', (float32[:],))

-    @skip_on_cudasim('cudasim does not check signatures')
     def test_bad_declare_device_string(self):
         with self.assertRaisesRegex(TypeError, 'Return type'):
             cuda.declare_device('f1', '(float32[:],)')

+    def test_link_cu_source(self):
+        times2 = cuda.declare_device('times2', 'int32(int32)', link=times2_cu)
+
+        @cuda.jit
+        def kernel(r, x):
+            i = cuda.grid(1)
+            if i < len(r):
+                r[i] = times2(x[i])
+
+        x = np.arange(10, dtype=np.int32)
+        r = np.empty_like(x)
+
+        kernel[1, 32](r, x)
+
+        np.testing.assert_equal(r, x * 2)
+
+    def _test_link_multiple_sources(self, link_type):
+        link = link_type([times2_cu, times4_cu])
+        times4 = cuda.declare_device('times4', 'int32(int32)', link=link)
+
+        @cuda.jit
+        def kernel(r, x):
+            i = cuda.grid(1)
+            if i < len(r):
+                r[i] = times4(x[i])
+
+        x = np.arange(10, dtype=np.int32)
+        r = np.empty_like(x)
+
+        kernel[1, 32](r, x)
+
+        np.testing.assert_equal(r, x * 4)
+
+    def test_link_multiple_sources_set(self):
+        self._test_link_multiple_sources(set)
+
+    def test_link_multiple_sources_tuple(self):
+        self._test_link_multiple_sources(tuple)
+
+    def test_link_multiple_sources_list(self):
+        self._test_link_multiple_sources(list)
+
+    @skip_unless_cffi
+    def test_link_sources_in_memory_and_on_disk(self):
+        jitlink_cu = str(test_data_dir / "jitlink.cu")
+        link = [jitlink_cu, jitlink_user_cu]
+        sig = types.void(types.CPointer(types.int32))
+        ext_fn = cuda.declare_device("use_array_mutator", sig, link=link)
+
+        ffi = cffi.FFI()
+
+        @cuda.jit
+        def kernel(x):
+            ptr = ffi.from_buffer(x)
+            ext_fn(ptr)
+
+        x = np.arange(2, dtype=np.int32)
+        kernel[1, 1](x)
+
+        expected = np.ones(2, dtype=np.int32)
+        np.testing.assert_equal(x, expected)
+
+    @skip_if_curand_kernel_missing
+    def test_include_cuda_header(self):
+        sig = types.int32(types.uint64)
+        link = [rng_cu]
+        random_number = cuda.declare_device("random_number", sig, link=link)
+
+        @cuda.jit
+        def kernel(x, seed):
+            x[0] = random_number(seed)
+
+        x = np.zeros(1, dtype=np.uint32)
+        kernel[1, 1](x, 1)
+        np.testing.assert_equal(x[0], 323845807)
+

 if __name__ == '__main__':
     unittest.main()
numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py
@@ -15,16 +15,18 @@ class TestFFI(CUDATestCase):
         import numpy as np
         import os

-        # Declaration of the foreign function
-        mul = cuda.declare_device('mul_f32_f32', 'float32(float32, float32)')
-
         # Path to the source containing the foreign function
         # (here assumed to be in a subdirectory called "ffi")
         basedir = os.path.dirname(os.path.abspath(__file__))
         functions_cu = os.path.join(basedir, 'ffi', 'functions.cu')

-        # Kernel that links in functions.cu and calls mul
-        @cuda.jit(link=[functions_cu])
+        # Declaration of the foreign function
+        mul = cuda.declare_device('mul_f32_f32', 'float32(float32, float32)',
+                                  link=functions_cu)
+
+        # A kernel that calls mul; functions.cu is linked automatically due to
+        # the call to mul.
+        @cuda.jit
         def multiply_vectors(r, x, y):
             i = cuda.grid(1)

@@ -54,14 +56,15 @@ class TestFFI(CUDATestCase):

         # magictoken.ex_from_buffer_decl.begin
         signature = 'float32(CPointer(float32), int32)'
-        sum_reduce = cuda.declare_device('sum_reduce', signature)
+        sum_reduce = cuda.declare_device('sum_reduce', signature,
+                                         link=functions_cu)
         # magictoken.ex_from_buffer_decl.end

         # magictoken.ex_from_buffer_kernel.begin
         import cffi
         ffi = cffi.FFI()

-        @cuda.jit(link=[functions_cu])
+        @cuda.jit
         def reduction_caller(result, array):
             array_ptr = ffi.from_buffer(array)
             result[()] = sum_reduce(array_ptr, len(array))
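
Taken together, the doc-example edits above show the migration this release enables; a condensed before/after sketch using the same names as the example:

# 0.3.0 pattern: declare, then link at every kernel that calls the function
mul = cuda.declare_device('mul_f32_f32', 'float32(float32, float32)')

@cuda.jit(link=[functions_cu])
def multiply_vectors(r, x, y):
    ...

# 0.5.0 pattern: the link source is attached to the declaration, and kernels
# that call mul pick it up automatically
mul = cuda.declare_device('mul_f32_f32', 'float32(float32, float32)',
                          link=functions_cu)

@cuda.jit
def multiply_vectors(r, x, y):
    ...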