numba-cuda 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/compiler.py +7 -6
- numba_cuda/numba/cuda/cudadecl.py +6 -2
- numba_cuda/numba/cuda/cudadrv/devicearray.py +4 -1
- numba_cuda/numba/cuda/cudadrv/driver.py +1 -20
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +13 -9
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +5 -1
- numba_cuda/numba/cuda/cudadrv/nvvm.py +6 -1
- numba_cuda/numba/cuda/decorators.py +9 -2
- numba_cuda/numba/cuda/dispatcher.py +22 -3
- numba_cuda/numba/cuda/runtime/__init__.py +1 -0
- numba_cuda/numba/cuda/runtime/memsys.cu +94 -0
- numba_cuda/numba/cuda/runtime/memsys.cuh +17 -0
- numba_cuda/numba/cuda/runtime/nrt.cu +19 -22
- numba_cuda/numba/cuda/runtime/nrt.py +318 -0
- numba_cuda/numba/cuda/testing.py +11 -1
- numba_cuda/numba/cuda/tests/__init__.py +1 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +31 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +145 -11
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +10 -7
- numba_cuda/numba/cuda/tests/nrt/mock_numpy.py +105 -1
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +162 -40
- numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +114 -0
- numba_cuda/numba/cuda/tests/support.py +11 -0
- numba_cuda/numba/cuda/utils.py +22 -0
- {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/METADATA +21 -3
- {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/RECORD +30 -23
- {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/WHEEL +1 -1
- {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/LICENSE +0 -0
- {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/runtime/nrt.py ADDED
@@ -0,0 +1,318 @@
+import ctypes
+import os
+from functools import wraps
+import numpy as np
+
+from numba import cuda, config
+from numba.core.runtime.nrt import _nrt_mstats
+from numba.cuda.cudadrv.driver import Linker, driver, launch_kernel
+from numba.cuda.cudadrv import devices
+from numba.cuda.api import get_current_device
+from numba.cuda.utils import _readenv
+
+
+# Check environment variable or config for NRT statistics enablement
+NRT_STATS = (
+    _readenv("NUMBA_CUDA_NRT_STATS", bool, False) or
+    getattr(config, "NUMBA_CUDA_NRT_STATS", False)
+)
+if not hasattr(config, "NUMBA_CUDA_NRT_STATS"):
+    config.CUDA_NRT_STATS = NRT_STATS
+
+
+# Check environment variable or config for NRT enablement
+ENABLE_NRT = (
+    _readenv("NUMBA_CUDA_ENABLE_NRT", bool, False) or
+    getattr(config, "NUMBA_CUDA_ENABLE_NRT", False)
+)
+if not hasattr(config, "NUMBA_CUDA_ENABLE_NRT"):
+    config.CUDA_ENABLE_NRT = ENABLE_NRT
+
+
+# Protect method to ensure NRT memory allocation and initialization
+def _alloc_init_guard(method):
+    """
+    Ensure NRT memory allocation and initialization before running the method
+    """
+    @wraps(method)
+    def wrapper(self, *args, **kwargs):
+        self.ensure_allocated()
+        self.ensure_initialized()
+        return method(self, *args, **kwargs)
+    return wrapper
+
+
+class _Runtime:
+    """Singleton class for Numba CUDA runtime"""
+    _instance = None
+
+    def __new__(cls, *args, **kwargs):
+        if cls._instance is None:
+            cls._instance = super(_Runtime, cls).__new__(cls, *args, **kwargs)
+        return cls._instance
+
+    def __init__(self):
+        """Initialize memsys module and variable"""
+        self._memsys_module = None
+        self._memsys = None
+        self._initialized = False
+
+    def _compile_memsys_module(self):
+        """
+        Compile memsys.cu and create a module from it in the current context
+        """
+        # Define the path for memsys.cu
+        memsys_mod = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)),
+            "memsys.cu"
+        )
+        cc = get_current_device().compute_capability
+
+        # Create a new linker instance and add the cu file
+        linker = Linker.new(cc=cc)
+        linker.add_cu_file(memsys_mod)
+
+        # Complete the linker and create a module from it
+        cubin = linker.complete()
+        ctx = devices.get_context()
+        module = ctx.create_module_image(cubin)
+
+        # Set the memsys module
+        self._memsys_module = module
+
+    def ensure_allocated(self, stream=None):
+        """
+        If memsys is not allocated, allocate it; otherwise, perform a no-op
+        """
+        if self._memsys is not None:
+            return
+
+        # Allocate the memsys
+        self.allocate(stream)
+
+    def allocate(self, stream=None):
+        """
+        Allocate memsys on global memory
+        """
+        from numba.cuda import device_array
+
+        # Check if memsys module is defined
+        if self._memsys_module is None:
+            # Compile the memsys module if not defined
+            self._compile_memsys_module()
+
+        # Allocate space for NRT_MemSys
+        ptr, nbytes = self._memsys_module.get_global_symbol("memsys_size")
+        memsys_size = ctypes.c_uint64()
+        driver.cuMemcpyDtoH(ctypes.addressof(memsys_size),
+                            ptr.device_ctypes_pointer, nbytes)
+        self._memsys = device_array(
+            (memsys_size.value,), dtype="i1", stream=stream)
+        self.set_memsys_to_module(self._memsys_module, stream=stream)
+
+    def _single_thread_launch(self, module, stream, name, params=()):
+        """
+        Launch the specified kernel with only 1 thread
+        """
+        if stream is None:
+            stream = cuda.default_stream()
+
+        func = module.get_function(name)
+        launch_kernel(
+            func.handle,
+            1, 1, 1,
+            1, 1, 1,
+            0,
+            stream.handle,
+            params,
+            cooperative=False
+        )
+
+    def ensure_initialized(self, stream=None):
+        """
+        If memsys is not initialized, initialize memsys
+        """
+        if self._initialized:
+            return
+
+        # Initialize the memsys
+        self.initialize(stream)
+
+    def initialize(self, stream=None):
+        """
+        Launch memsys initialization kernel
+        """
+        self.ensure_allocated()
+
+        self._single_thread_launch(
+            self._memsys_module, stream, "NRT_MemSys_init")
+        self._initialized = True
+
+        if config.CUDA_NRT_STATS:
+            self.memsys_enable_stats()
+
+    @_alloc_init_guard
+    def memsys_enable_stats(self, stream=None):
+        """
+        Enable memsys statistics
+        """
+        self._single_thread_launch(
+            self._memsys_module, stream, "NRT_MemSys_enable_stats")
+
+    @_alloc_init_guard
+    def memsys_disable_stats(self, stream=None):
+        """
+        Disable memsys statistics
+        """
+        self._single_thread_launch(
+            self._memsys_module, stream, "NRT_MemSys_disable_stats")
+
+    @_alloc_init_guard
+    def memsys_stats_enabled(self, stream=None):
+        """
+        Return a boolean indicating whether memsys is enabled. Synchronizes
+        context
+        """
+        enabled_ar = cuda.managed_array(1, np.uint8)
+
+        self._single_thread_launch(
+            self._memsys_module,
+            stream,
+            "NRT_MemSys_stats_enabled",
+            (enabled_ar.device_ctypes_pointer,)
+        )
+
+        cuda.synchronize()
+        return bool(enabled_ar[0])
+
+    @_alloc_init_guard
+    def _copy_memsys_to_host(self, stream):
+        """
+        Copy all statistics of memsys to the host
+        """
+        dt = np.dtype([
+            ('alloc', np.uint64),
+            ('free', np.uint64),
+            ('mi_alloc', np.uint64),
+            ('mi_free', np.uint64)
+        ])
+
+        stats_for_read = cuda.managed_array(1, dt)
+
+        self._single_thread_launch(
+            self._memsys_module,
+            stream,
+            "NRT_MemSys_read",
+            [stats_for_read.device_ctypes_pointer]
+        )
+        cuda.synchronize()
+
+        return stats_for_read[0]
+
+    @_alloc_init_guard
+    def get_allocation_stats(self, stream=None):
+        """
+        Get the allocation statistics
+        """
+        enabled = self.memsys_stats_enabled(stream)
+        if not enabled:
+            raise RuntimeError("NRT stats are disabled.")
+        memsys = self._copy_memsys_to_host(stream)
+        return _nrt_mstats(
+            alloc=memsys["alloc"],
+            free=memsys["free"],
+            mi_alloc=memsys["mi_alloc"],
+            mi_free=memsys["mi_free"]
+        )
+
+    @_alloc_init_guard
+    def _get_single_stat(self, stat, stream=None):
+        """
+        Get a single stat from the memsys
+        """
+        got = cuda.managed_array(1, np.uint64)
+        self._single_thread_launch(
+            self._memsys_module,
+            stream,
+            f"NRT_MemSys_read_{stat}",
+            [got.device_ctypes_pointer]
+        )
+
+        cuda.synchronize()
+        return got[0]
+
+    @_alloc_init_guard
+    def memsys_get_stats_alloc(self, stream=None):
+        """
+        Get the allocation statistic
+        """
+        enabled = self.memsys_stats_enabled(stream)
+        if not enabled:
+            raise RuntimeError("NRT stats are disabled.")
+
+        return self._get_single_stat("alloc")
+
+    @_alloc_init_guard
+    def memsys_get_stats_free(self, stream=None):
+        """
+        Get the free statistic
+        """
+        enabled = self.memsys_stats_enabled(stream)
+        if not enabled:
+            raise RuntimeError("NRT stats are disabled.")
+
+        return self._get_single_stat("free")
+
+    @_alloc_init_guard
+    def memsys_get_stats_mi_alloc(self, stream=None):
+        """
+        Get the mi alloc statistic
+        """
+        enabled = self.memsys_stats_enabled(stream)
+        if not enabled:
+            raise RuntimeError("NRT stats are disabled.")
+
+        return self._get_single_stat("mi_alloc")
+
+    @_alloc_init_guard
+    def memsys_get_stats_mi_free(self, stream=None):
+        """
+        Get the mi free statistic
+        """
+        enabled = self.memsys_stats_enabled(stream)
+        if not enabled:
+            raise RuntimeError("NRT stats are disabled.")
+
+        return self._get_single_stat("mi_free")
+
+    def set_memsys_to_module(self, module, stream=None):
+        """
+        Set the memsys module. The module must contain `NRT_MemSys_set` kernel,
+        and declare a pointer to NRT_MemSys structure.
+        """
+        if self._memsys is None:
+            raise RuntimeError(
+                "Please allocate NRT Memsys first before setting to module.")
+
+        self._single_thread_launch(
+            module,
+            stream,
+            "NRT_MemSys_set",
+            [self._memsys.device_ctypes_pointer,]
+        )
+
+    @_alloc_init_guard
+    def print_memsys(self, stream=None):
+        """
+        Print the current statistics of memsys, for debugging purposes
+        """
+        cuda.synchronize()
+        self._single_thread_launch(
+            self._memsys_module,
+            stream,
+            "NRT_MemSys_print"
+        )
+
+
+# Create an instance of the runtime
+rtsys = _Runtime()
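
Note: the module above is the host-side handle to the device-side NRT memsys; `rtsys` lazily compiles memsys.cu, allocates the NRT_MemSys buffer, and drives it through single-thread kernel launches. A minimal sketch of how it could be exercised, using only the API shown above (the "run some kernels" step is an assumption, not part of the diff):

from numba.cuda.runtime.nrt import rtsys

# First use triggers _alloc_init_guard: memsys.cu is compiled, the
# NRT_MemSys buffer is allocated, and NRT_MemSys_init is launched.
rtsys.memsys_enable_stats()

# ... launch NRT-enabled kernels here ...

# Read the counters back; _nrt_mstats is a namedtuple with
# alloc/free/mi_alloc/mi_free fields.
stats = rtsys.get_allocation_stats()
print(stats.alloc, stats.free, stats.mi_alloc, stats.mi_free)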
numba_cuda/numba/cuda/testing.py CHANGED
@@ -115,12 +115,22 @@ def skip_on_arm(reason):
 def skip_if_cuda_includes_missing(fn):
     # Skip when cuda.h is not available - generally this should indicate
     # whether the CUDA includes are available or not
-
+    cuda_include_path = libs.get_cuda_include_dir()
+    cuda_h = os.path.join(cuda_include_path, 'cuda.h')
     cuda_h_file = (os.path.exists(cuda_h) and os.path.isfile(cuda_h))
     reason = 'CUDA include dir not available on this system'
     return unittest.skipUnless(cuda_h_file, reason)(fn)


+def skip_if_curand_kernel_missing(fn):
+    cuda_include_path = libs.get_cuda_include_dir()
+    curand_kernel_h = os.path.join(cuda_include_path, 'curand_kernel.h')
+    curand_kernel_h_file = (os.path.exists(curand_kernel_h) and
+                            os.path.isfile(curand_kernel_h))
+    reason = 'curand_kernel.h not available on this system'
+    return unittest.skipUnless(curand_kernel_h_file, reason)(fn)
+
+
 def skip_if_mvc_enabled(reason):
     """Skip a test if Minor Version Compatibility is enabled"""
     return unittest.skipIf(config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY,
numba_cuda/numba/cuda/tests/__init__.py CHANGED
@@ -49,6 +49,7 @@ def load_tests(loader, tests, pattern):
     if gpus and gpus[0].compute_capability >= (2, 0):
         suite.addTests(load_testsuite(loader, join(this_dir, 'cudadrv')))
         suite.addTests(load_testsuite(loader, join(this_dir, 'cudapy')))
+        suite.addTests(load_testsuite(loader, join(this_dir, 'nrt')))
         suite.addTests(load_testsuite(loader, join(this_dir,
                                                    'doc_examples')))
     else:
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py CHANGED
@@ -4,6 +4,7 @@ from numba.cuda.cudadrv import devicearray
 from numba import cuda
 from numba.cuda.testing import unittest, CUDATestCase
 from numba.cuda.testing import skip_on_cudasim
+from numba.tests.support import IS_NUMPY_2


 class TestCudaNDArray(CUDATestCase):
@@ -456,6 +457,36 @@ class TestCudaNDArray(CUDATestCase):
         dev_array_from_host.copy_to_device(dev_array)


+class TestArrayMethod(CUDATestCase):
+    """Tests of the __array__() method via np.array"""
+
+    def test_np_array(self):
+        dev_array = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
+        host_array = np.array(dev_array)
+        np.testing.assert_equal(dev_array.copy_to_host(), host_array)
+
+    def test_np_array_dtype(self):
+        dtype = np.int32
+        dev_array = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
+        host_array = np.array(dev_array, dtype=dtype)
+        np.testing.assert_equal(
+            host_array,
+            dev_array.copy_to_host().astype(dtype)
+        )
+
+    @unittest.skipUnless(IS_NUMPY_2, "NumPy 1.x does not pass copy kwarg")
+    def test_np_array_copy_false(self):
+        dev_array = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
+        with self.assertRaisesRegex(ValueError, "`copy=False` is not"):
+            np.array(dev_array, copy=False)
+
+    @unittest.skipUnless(IS_NUMPY_2, "NumPy 1.x does not pass copy kwarg")
+    def test_np_array_copy_true(self):
+        dev_array = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
+        host_array = np.array(dev_array)
+        np.testing.assert_equal(dev_array.copy_to_host(), host_array)
+
+
 class TestRecarray(CUDATestCase):
     def test_recarray(self):
         # From issue #4111
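
Note: the new `TestArrayMethod` cases pin down `np.array()` interoperability for device arrays. A short sketch of the contract they establish (array values are illustrative):

import numpy as np
from numba import cuda

dev = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
host = np.array(dev)    # __array__ performs a device-to-host copy
assert np.array_equal(host, dev.copy_to_host())
# Under NumPy 2 only: requesting a zero-copy view is rejected, because
# device memory cannot be viewed directly from the host.
# np.array(dev, copy=False)  ->  ValueError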
numba_cuda/numba/cuda/tests/cudapy/test_device_func.py CHANGED
@@ -1,11 +1,14 @@
 import re
-import
+import cffi

 import numpy as np

-from numba.cuda.testing import
-
+from numba.cuda.testing import (skip_if_curand_kernel_missing, skip_on_cudasim,
+                                test_data_dir, unittest, CUDATestCase)
+from numba import cuda, jit, float32, int32, types
 from numba.core.errors import TypingError
+from numba.tests.support import skip_unless_cffi
+from types import ModuleType


 class TestDeviceFunc(CUDATestCase):
@@ -92,7 +95,7 @@ class TestDeviceFunc(CUDATestCase):
         def add(a, b):
             return a + b

-        mymod =
+        mymod = ModuleType(name='mymod')
         mymod.add = add
         del add

@@ -192,31 +195,162 @@ class TestDeviceFunc(CUDATestCase):

         self.assertEqual(0x04010203, x[0])

-
+
+times2_cu = cuda.CUSource("""
+extern "C" __device__
+int times2(int *out, int a)
+{
+    *out = a * 2;
+    return 0;
+}
+""")
+
+
+times4_cu = cuda.CUSource("""
+extern "C" __device__
+int times2(int *out, int a);
+
+extern "C" __device__
+int times4(int *out, int a)
+{
+    int tmp;
+    times2(&tmp, a);
+    *out = tmp * 2;
+    return 0;
+}
+""")
+
+jitlink_user_cu = cuda.CUSource("""
+extern "C" __device__
+int array_mutator(void *out, int *a);
+
+extern "C" __device__
+int use_array_mutator(void *out, int *a) {
+    array_mutator(out, a);
+    return 0;
+}
+""")
+
+rng_cu = cuda.CUSource("""
+#include <curand_kernel.h>
+
+extern "C" __device__
+int random_number(unsigned int *out, unsigned long long seed)
+{
+    // Initialize state
+    curandStateXORWOW_t state;
+    unsigned long long sequence = 1;
+    unsigned long long offset = 0;
+    curand_init(seed, sequence, offset, &state);
+
+    // Generate one random number
+    *out = curand(&state);
+
+    // Report no exception
+    return 0;
+}""")
+
+
+@skip_on_cudasim('External functions unsupported in the simulator')
+class TestDeclareDevice(CUDATestCase):
+
+    def check_api(self, decl):
         self.assertEqual(decl.name, 'f1')
         self.assertEqual(decl.sig.args, (float32[:],))
         self.assertEqual(decl.sig.return_type, int32)

-    @skip_on_cudasim('cudasim does not check signatures')
     def test_declare_device_signature(self):
         f1 = cuda.declare_device('f1', int32(float32[:]))
-        self.
+        self.check_api(f1)

-    @skip_on_cudasim('cudasim does not check signatures')
     def test_declare_device_string(self):
         f1 = cuda.declare_device('f1', 'int32(float32[:])')
-        self.
+        self.check_api(f1)

-    @skip_on_cudasim('cudasim does not check signatures')
     def test_bad_declare_device_tuple(self):
         with self.assertRaisesRegex(TypeError, 'Return type'):
             cuda.declare_device('f1', (float32[:],))

-    @skip_on_cudasim('cudasim does not check signatures')
     def test_bad_declare_device_string(self):
         with self.assertRaisesRegex(TypeError, 'Return type'):
             cuda.declare_device('f1', '(float32[:],)')

+    def test_link_cu_source(self):
+        times2 = cuda.declare_device('times2', 'int32(int32)', link=times2_cu)
+
+        @cuda.jit
+        def kernel(r, x):
+            i = cuda.grid(1)
+            if i < len(r):
+                r[i] = times2(x[i])
+
+        x = np.arange(10, dtype=np.int32)
+        r = np.empty_like(x)
+
+        kernel[1, 32](r, x)
+
+        np.testing.assert_equal(r, x * 2)
+
+    def _test_link_multiple_sources(self, link_type):
+        link = link_type([times2_cu, times4_cu])
+        times4 = cuda.declare_device('times4', 'int32(int32)', link=link)
+
+        @cuda.jit
+        def kernel(r, x):
+            i = cuda.grid(1)
+            if i < len(r):
+                r[i] = times4(x[i])
+
+        x = np.arange(10, dtype=np.int32)
+        r = np.empty_like(x)
+
+        kernel[1, 32](r, x)
+
+        np.testing.assert_equal(r, x * 4)
+
+    def test_link_multiple_sources_set(self):
+        self._test_link_multiple_sources(set)
+
+    def test_link_multiple_sources_tuple(self):
+        self._test_link_multiple_sources(tuple)
+
+    def test_link_multiple_sources_list(self):
+        self._test_link_multiple_sources(list)
+
+    @skip_unless_cffi
+    def test_link_sources_in_memory_and_on_disk(self):
+        jitlink_cu = str(test_data_dir / "jitlink.cu")
+        link = [jitlink_cu, jitlink_user_cu]
+        sig = types.void(types.CPointer(types.int32))
+        ext_fn = cuda.declare_device("use_array_mutator", sig, link=link)
+
+        ffi = cffi.FFI()
+
+        @cuda.jit
+        def kernel(x):
+            ptr = ffi.from_buffer(x)
+            ext_fn(ptr)
+
+        x = np.arange(2, dtype=np.int32)
+        kernel[1, 1](x)
+
+        expected = np.ones(2, dtype=np.int32)
+        np.testing.assert_equal(x, expected)
+
+    @skip_if_curand_kernel_missing
+    def test_include_cuda_header(self):
+        sig = types.int32(types.uint64)
+        link = [rng_cu]
+        random_number = cuda.declare_device("random_number", sig, link=link)
+
+        @cuda.jit
+        def kernel(x, seed):
+            x[0] = random_number(seed)
+
+        x = np.zeros(1, dtype=np.uint32)
+        kernel[1, 1](x, 1)
+        np.testing.assert_equal(x[0], 323845807)
+

 if __name__ == '__main__':
     unittest.main()
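
Note: the new `TestDeclareDevice` cases establish that `cuda.declare_device` accepts a `link=` argument carrying the external definition: a single `cuda.CUSource`, or a list/tuple/set mixing in-memory sources with on-disk .cu paths. A condensed, self-contained sketch of the pattern (the kernel name is new; the source mirrors `times2_cu` from the tests above):

import numpy as np
from numba import cuda

# In-memory CUDA C source providing the external device function.
times2_cu = cuda.CUSource("""
extern "C" __device__
int times2(int *out, int a)
{
    *out = a * 2;
    return 0;
}
""")

# The declaration carries its own link dependency, so the calling kernel
# does not need @cuda.jit(link=...).
times2 = cuda.declare_device('times2', 'int32(int32)', link=times2_cu)

@cuda.jit
def double_all(r, x):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = times2(x[i])

x = np.arange(10, dtype=np.int32)
r = np.empty_like(x)
double_all[1, 32](r, x)   # afterwards r == x * 2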
numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py CHANGED
@@ -15,16 +15,18 @@ class TestFFI(CUDATestCase):
         import numpy as np
         import os

-        # Declaration of the foreign function
-        mul = cuda.declare_device('mul_f32_f32', 'float32(float32, float32)')
-
         # Path to the source containing the foreign function
         # (here assumed to be in a subdirectory called "ffi")
         basedir = os.path.dirname(os.path.abspath(__file__))
         functions_cu = os.path.join(basedir, 'ffi', 'functions.cu')

-        #
-
+        # Declaration of the foreign function
+        mul = cuda.declare_device('mul_f32_f32', 'float32(float32, float32)',
+                                  link=functions_cu)
+
+        # A kernel that calls mul; functions.cu is linked automatically due to
+        # the call to mul.
+        @cuda.jit
         def multiply_vectors(r, x, y):
             i = cuda.grid(1)

@@ -54,14 +56,15 @@ class TestFFI(CUDATestCase):

         # magictoken.ex_from_buffer_decl.begin
         signature = 'float32(CPointer(float32), int32)'
-        sum_reduce = cuda.declare_device('sum_reduce', signature
+        sum_reduce = cuda.declare_device('sum_reduce', signature,
+                                         link=functions_cu)
         # magictoken.ex_from_buffer_decl.end

         # magictoken.ex_from_buffer_kernel.begin
         import cffi
         ffi = cffi.FFI()

-        @cuda.jit
+        @cuda.jit
         def reduction_caller(result, array):
             array_ptr = ffi.from_buffer(array)
             result[()] = sum_reduce(array_ptr, len(array))