numba-cuda 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,318 @@
1
+ import ctypes
2
+ import os
3
+ from functools import wraps
4
+ import numpy as np
5
+
6
+ from numba import cuda, config
7
+ from numba.core.runtime.nrt import _nrt_mstats
8
+ from numba.cuda.cudadrv.driver import Linker, driver, launch_kernel
9
+ from numba.cuda.cudadrv import devices
10
+ from numba.cuda.api import get_current_device
11
+ from numba.cuda.utils import _readenv
12
+
13
+
14
# NRT statistics switch.
#
# The value may come from the NUMBA_CUDA_NRT_STATS environment variable, from
# a value already stored on numba's config under the attribute name that this
# module reads back (``CUDA_NRT_STATS``), or from the prefixed attribute name
# that this module has historically probed (``NUMBA_CUDA_NRT_STATS``).
NRT_STATS = (
    _readenv("NUMBA_CUDA_NRT_STATS", bool, False) or
    getattr(config, "CUDA_NRT_STATS", False) or
    getattr(config, "NUMBA_CUDA_NRT_STATS", False)
)
# Always publish the resolved value: ``_Runtime.initialize()`` reads
# ``config.CUDA_NRT_STATS`` unconditionally, so the attribute has to exist
# regardless of which of the sources above supplied the setting. Because any
# pre-existing truthy value is folded into NRT_STATS, this never turns an
# already-enabled setting off.
config.CUDA_NRT_STATS = NRT_STATS


# NRT enablement switch, resolved with the same precedence as above:
# environment variable, then an existing ``config.CUDA_ENABLE_NRT``, then the
# prefixed attribute name.
ENABLE_NRT = (
    _readenv("NUMBA_CUDA_ENABLE_NRT", bool, False) or
    getattr(config, "CUDA_ENABLE_NRT", False) or
    getattr(config, "NUMBA_CUDA_ENABLE_NRT", False)
)
# Published under the un-prefixed name, matching CUDA_NRT_STATS.
config.CUDA_ENABLE_NRT = ENABLE_NRT
30
+
31
+
32
+ # Protect method to ensure NRT memory allocation and initialization
33
+ def _alloc_init_guard(method):
34
+ """
35
+ Ensure NRT memory allocation and initialization before running the method
36
+ """
37
+ @wraps(method)
38
+ def wrapper(self, *args, **kwargs):
39
+ self.ensure_allocated()
40
+ self.ensure_initialized()
41
+ return method(self, *args, **kwargs)
42
+ return wrapper
43
+
44
+
45
+ class _Runtime:
46
+ """Singleton class for Numba CUDA runtime"""
47
+ _instance = None
48
+
49
+ def __new__(cls, *args, **kwargs):
50
+ if cls._instance is None:
51
+ cls._instance = super(_Runtime, cls).__new__(cls, *args, **kwargs)
52
+ return cls._instance
53
+
54
+ def __init__(self):
55
+ """Initialize memsys module and variable"""
56
+ self._memsys_module = None
57
+ self._memsys = None
58
+ self._initialized = False
59
+
60
+ def _compile_memsys_module(self):
61
+ """
62
+ Compile memsys.cu and create a module from it in the current context
63
+ """
64
+ # Define the path for memsys.cu
65
+ memsys_mod = os.path.join(
66
+ os.path.dirname(os.path.abspath(__file__)),
67
+ "memsys.cu"
68
+ )
69
+ cc = get_current_device().compute_capability
70
+
71
+ # Create a new linker instance and add the cu file
72
+ linker = Linker.new(cc=cc)
73
+ linker.add_cu_file(memsys_mod)
74
+
75
+ # Complete the linker and create a module from it
76
+ cubin = linker.complete()
77
+ ctx = devices.get_context()
78
+ module = ctx.create_module_image(cubin)
79
+
80
+ # Set the memsys module
81
+ self._memsys_module = module
82
+
83
+ def ensure_allocated(self, stream=None):
84
+ """
85
+ If memsys is not allocated, allocate it; otherwise, perform a no-op
86
+ """
87
+ if self._memsys is not None:
88
+ return
89
+
90
+ # Allocate the memsys
91
+ self.allocate(stream)
92
+
93
+ def allocate(self, stream=None):
94
+ """
95
+ Allocate memsys on global memory
96
+ """
97
+ from numba.cuda import device_array
98
+
99
+ # Check if memsys module is defined
100
+ if self._memsys_module is None:
101
+ # Compile the memsys module if not defined
102
+ self._compile_memsys_module()
103
+
104
+ # Allocate space for NRT_MemSys
105
+ ptr, nbytes = self._memsys_module.get_global_symbol("memsys_size")
106
+ memsys_size = ctypes.c_uint64()
107
+ driver.cuMemcpyDtoH(ctypes.addressof(memsys_size),
108
+ ptr.device_ctypes_pointer, nbytes)
109
+ self._memsys = device_array(
110
+ (memsys_size.value,), dtype="i1", stream=stream)
111
+ self.set_memsys_to_module(self._memsys_module, stream=stream)
112
+
113
+ def _single_thread_launch(self, module, stream, name, params=()):
114
+ """
115
+ Launch the specified kernel with only 1 thread
116
+ """
117
+ if stream is None:
118
+ stream = cuda.default_stream()
119
+
120
+ func = module.get_function(name)
121
+ launch_kernel(
122
+ func.handle,
123
+ 1, 1, 1,
124
+ 1, 1, 1,
125
+ 0,
126
+ stream.handle,
127
+ params,
128
+ cooperative=False
129
+ )
130
+
131
+ def ensure_initialized(self, stream=None):
132
+ """
133
+ If memsys is not initialized, initialize memsys
134
+ """
135
+ if self._initialized:
136
+ return
137
+
138
+ # Initialize the memsys
139
+ self.initialize(stream)
140
+
141
+ def initialize(self, stream=None):
142
+ """
143
+ Launch memsys initialization kernel
144
+ """
145
+ self.ensure_allocated()
146
+
147
+ self._single_thread_launch(
148
+ self._memsys_module, stream, "NRT_MemSys_init")
149
+ self._initialized = True
150
+
151
+ if config.CUDA_NRT_STATS:
152
+ self.memsys_enable_stats()
153
+
154
+ @_alloc_init_guard
155
+ def memsys_enable_stats(self, stream=None):
156
+ """
157
+ Enable memsys statistics
158
+ """
159
+ self._single_thread_launch(
160
+ self._memsys_module, stream, "NRT_MemSys_enable_stats")
161
+
162
+ @_alloc_init_guard
163
+ def memsys_disable_stats(self, stream=None):
164
+ """
165
+ Disable memsys statistics
166
+ """
167
+ self._single_thread_launch(
168
+ self._memsys_module, stream, "NRT_MemSys_disable_stats")
169
+
170
+ @_alloc_init_guard
171
+ def memsys_stats_enabled(self, stream=None):
172
+ """
173
+ Return a boolean indicating whether memsys is enabled. Synchronizes
174
+ context
175
+ """
176
+ enabled_ar = cuda.managed_array(1, np.uint8)
177
+
178
+ self._single_thread_launch(
179
+ self._memsys_module,
180
+ stream,
181
+ "NRT_MemSys_stats_enabled",
182
+ (enabled_ar.device_ctypes_pointer,)
183
+ )
184
+
185
+ cuda.synchronize()
186
+ return bool(enabled_ar[0])
187
+
188
+ @_alloc_init_guard
189
+ def _copy_memsys_to_host(self, stream):
190
+ """
191
+ Copy all statistics of memsys to the host
192
+ """
193
+ dt = np.dtype([
194
+ ('alloc', np.uint64),
195
+ ('free', np.uint64),
196
+ ('mi_alloc', np.uint64),
197
+ ('mi_free', np.uint64)
198
+ ])
199
+
200
+ stats_for_read = cuda.managed_array(1, dt)
201
+
202
+ self._single_thread_launch(
203
+ self._memsys_module,
204
+ stream,
205
+ "NRT_MemSys_read",
206
+ [stats_for_read.device_ctypes_pointer]
207
+ )
208
+ cuda.synchronize()
209
+
210
+ return stats_for_read[0]
211
+
212
+ @_alloc_init_guard
213
+ def get_allocation_stats(self, stream=None):
214
+ """
215
+ Get the allocation statistics
216
+ """
217
+ enabled = self.memsys_stats_enabled(stream)
218
+ if not enabled:
219
+ raise RuntimeError("NRT stats are disabled.")
220
+ memsys = self._copy_memsys_to_host(stream)
221
+ return _nrt_mstats(
222
+ alloc=memsys["alloc"],
223
+ free=memsys["free"],
224
+ mi_alloc=memsys["mi_alloc"],
225
+ mi_free=memsys["mi_free"]
226
+ )
227
+
228
+ @_alloc_init_guard
229
+ def _get_single_stat(self, stat, stream=None):
230
+ """
231
+ Get a single stat from the memsys
232
+ """
233
+ got = cuda.managed_array(1, np.uint64)
234
+ self._single_thread_launch(
235
+ self._memsys_module,
236
+ stream,
237
+ f"NRT_MemSys_read_{stat}",
238
+ [got.device_ctypes_pointer]
239
+ )
240
+
241
+ cuda.synchronize()
242
+ return got[0]
243
+
244
+ @_alloc_init_guard
245
+ def memsys_get_stats_alloc(self, stream=None):
246
+ """
247
+ Get the allocation statistic
248
+ """
249
+ enabled = self.memsys_stats_enabled(stream)
250
+ if not enabled:
251
+ raise RuntimeError("NRT stats are disabled.")
252
+
253
+ return self._get_single_stat("alloc")
254
+
255
+ @_alloc_init_guard
256
+ def memsys_get_stats_free(self, stream=None):
257
+ """
258
+ Get the free statistic
259
+ """
260
+ enabled = self.memsys_stats_enabled(stream)
261
+ if not enabled:
262
+ raise RuntimeError("NRT stats are disabled.")
263
+
264
+ return self._get_single_stat("free")
265
+
266
+ @_alloc_init_guard
267
+ def memsys_get_stats_mi_alloc(self, stream=None):
268
+ """
269
+ Get the mi alloc statistic
270
+ """
271
+ enabled = self.memsys_stats_enabled(stream)
272
+ if not enabled:
273
+ raise RuntimeError("NRT stats are disabled.")
274
+
275
+ return self._get_single_stat("mi_alloc")
276
+
277
+ @_alloc_init_guard
278
+ def memsys_get_stats_mi_free(self, stream=None):
279
+ """
280
+ Get the mi free statistic
281
+ """
282
+ enabled = self.memsys_stats_enabled(stream)
283
+ if not enabled:
284
+ raise RuntimeError("NRT stats are disabled.")
285
+
286
+ return self._get_single_stat("mi_free")
287
+
288
+ def set_memsys_to_module(self, module, stream=None):
289
+ """
290
+ Set the memsys module. The module must contain `NRT_MemSys_set` kernel,
291
+ and declare a pointer to NRT_MemSys structure.
292
+ """
293
+ if self._memsys is None:
294
+ raise RuntimeError(
295
+ "Please allocate NRT Memsys first before setting to module.")
296
+
297
+ self._single_thread_launch(
298
+ module,
299
+ stream,
300
+ "NRT_MemSys_set",
301
+ [self._memsys.device_ctypes_pointer,]
302
+ )
303
+
304
+ @_alloc_init_guard
305
+ def print_memsys(self, stream=None):
306
+ """
307
+ Print the current statistics of memsys, for debugging purposes
308
+ """
309
+ cuda.synchronize()
310
+ self._single_thread_launch(
311
+ self._memsys_module,
312
+ stream,
313
+ "NRT_MemSys_print"
314
+ )
315
+
316
+
317
+ # Create an instance of the runtime
318
+ rtsys = _Runtime()
@@ -49,6 +49,7 @@ def load_tests(loader, tests, pattern):
49
49
  if gpus and gpus[0].compute_capability >= (2, 0):
50
50
  suite.addTests(load_testsuite(loader, join(this_dir, 'cudadrv')))
51
51
  suite.addTests(load_testsuite(loader, join(this_dir, 'cudapy')))
52
+ suite.addTests(load_testsuite(loader, join(this_dir, 'nrt')))
52
53
  suite.addTests(load_testsuite(loader, join(this_dir,
53
54
  'doc_examples')))
54
55
  else:
@@ -4,6 +4,7 @@ from numba.cuda.cudadrv import devicearray
4
4
  from numba import cuda
5
5
  from numba.cuda.testing import unittest, CUDATestCase
6
6
  from numba.cuda.testing import skip_on_cudasim
7
+ from numba.tests.support import IS_NUMPY_2
7
8
 
8
9
 
9
10
  class TestCudaNDArray(CUDATestCase):
@@ -456,6 +457,36 @@ class TestCudaNDArray(CUDATestCase):
456
457
  dev_array_from_host.copy_to_device(dev_array)
457
458
 
458
459
 
460
class TestArrayMethod(CUDATestCase):
    """Exercise a device array's ``__array__()`` method through ``np.array``."""

    def test_np_array(self):
        # A plain conversion must match an explicit device-to-host copy.
        d_arr = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
        converted = np.array(d_arr)
        np.testing.assert_equal(d_arr.copy_to_host(), converted)

    def test_np_array_dtype(self):
        # Requesting a dtype must match casting the host copy afterwards.
        target = np.int32
        d_arr = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
        converted = np.array(d_arr, dtype=target)
        expected = d_arr.copy_to_host().astype(target)
        np.testing.assert_equal(converted, expected)

    @unittest.skipUnless(IS_NUMPY_2, "NumPy 1.x does not pass copy kwarg")
    def test_np_array_copy_false(self):
        # copy=False is expected to be rejected with a ValueError.
        d_arr = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
        with self.assertRaisesRegex(ValueError, "`copy=False` is not"):
            np.array(d_arr, copy=False)

    @unittest.skipUnless(IS_NUMPY_2, "NumPy 1.x does not pass copy kwarg")
    def test_np_array_copy_true(self):
        # copy=True is np.array's default; it is spelled out here only to
        # make the intent of the test visible.
        d_arr = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
        converted = np.array(d_arr, copy=True)
        np.testing.assert_equal(d_arr.copy_to_host(), converted)
+
489
+
459
490
  class TestRecarray(CUDATestCase):
460
491
  def test_recarray(self):
461
492
  # From issue #4111
@@ -12,6 +12,31 @@ else:
12
12
  cuda.pinned_array_like)
13
13
 
14
14
 
15
def array_reshape1d(arr, newshape, got):
    """
    Reshape ``arr`` to the 1-D ``newshape`` and copy every element into
    ``got``.
    """
    reshaped = arr.reshape(newshape)
    length = reshaped.shape[0]
    for pos in range(length):
        got[pos] = reshaped[pos]
19
+
20
+
21
def array_reshape2d(arr, newshape, got):
    """
    Reshape ``arr`` to the 2-D ``newshape`` and copy every element into
    ``got``.
    """
    reshaped = arr.reshape(newshape)
    n_rows = reshaped.shape[0]
    n_cols = reshaped.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            got[row, col] = reshaped[row, col]
26
+
27
+
28
def array_reshape3d(arr, newshape, got):
    """
    Reshape ``arr`` to the 3-D ``newshape`` and copy every element into
    ``got``.
    """
    reshaped = arr.reshape(newshape)
    dim0 = reshaped.shape[0]
    dim1 = reshaped.shape[1]
    dim2 = reshaped.shape[2]
    for a in range(dim0):
        for b in range(dim1):
            for c in range(dim2):
                got[a, b, c] = reshaped[a, b, c]
34
+
35
+
36
def array_reshape(arr, newshape):
    """Return ``arr`` reshaped to ``newshape`` (host-side reference)."""
    reshaped = arr.reshape(newshape)
    return reshaped
38
+
39
+
15
40
  class TestCudaArray(CUDATestCase):
16
41
  def test_gpu_array_zero_length(self):
17
42
  x = np.arange(0)
@@ -255,6 +280,54 @@ class TestCudaArray(CUDATestCase):
255
280
 
256
281
  self.assertEqual(1, len(func.overloads))
257
282
 
283
def test_array_reshape(self):
    """Compare ``reshape`` inside a kernel with ``reshape`` on the host."""

    def check(pyfunc, kernelfunc, arr, shape):
        # Full comparison of the kernel's output with the host result.
        jitted = cuda.jit(kernelfunc)
        expected = pyfunc(arr, shape)
        got = np.zeros(expected.shape, dtype=arr.dtype)
        jitted[1, 1](arr, shape, got)
        self.assertPreciseEqual(got, expected)

    def check_only_shape(kernelfunc, arr, shape, expected_shape):
        # Only the shape and size of the output buffer are compared.
        jitted = cuda.jit(kernelfunc)
        got = np.zeros(expected_shape, dtype=arr.dtype)
        jitted[1, 1](arr, shape, got)
        self.assertEqual(got.shape, expected_shape)
        self.assertEqual(got.size, arr.size)

    def check_each(arr, cases):
        # Run check() for every (kernel, target shape) pair, in order.
        for kernelfunc, shape in cases:
            check(array_reshape, kernelfunc, arr, shape)

    # 0-sized arrays
    def check_empty(arr):
        check_each(arr, [
            (array_reshape1d, 0),
            (array_reshape1d, (0,)),
            (array_reshape3d, (1, 0, 2)),
        ])
        shape_only_cases = [
            (array_reshape2d, (0, -1), (0, 0)),
            (array_reshape2d, (4, -1), (4, 0)),
            (array_reshape3d, (-1, 0, 4), (0, 0, 4)),
        ]
        for kernelfunc, shape, expected_shape in shape_only_cases:
            check_only_shape(kernelfunc, arr, shape, expected_shape)

    # C-contiguous
    contiguous_cases = [
        (array_reshape1d, (24,)),
        (array_reshape2d, (4, 6)),
        (array_reshape2d, (8, 3)),
        (array_reshape3d, (8, 1, 3)),
    ]
    check_each(np.arange(24), contiguous_cases)
    check_each(np.arange(24).reshape((1, 8, 1, 1, 3, 1)), contiguous_cases)

    # Test negative shape value
    negative_cases = [
        (array_reshape1d, -1),
        (array_reshape1d, (-1,)),
        (array_reshape2d, (-1, 5)),
        (array_reshape3d, (5, -1, 5)),
        (array_reshape3d, (5, 5, -1)),
    ]
    check_each(np.arange(25).reshape(5, 5), negative_cases)

    check_empty(np.array([]))
330
+
258
331
 
259
332
  if __name__ == '__main__':
260
333
  unittest.main()
@@ -1,8 +1,12 @@
1
+ import math
2
+
3
+ import numpy as np
1
4
 
2
5
  from numba.core import errors, types
3
6
  from numba.core.extending import overload
4
7
  from numba.np.arrayobj import (_check_const_str_dtype, is_nonelike,
5
- ty_parse_dtype, ty_parse_shape, numpy_empty_nd)
8
+ ty_parse_dtype, ty_parse_shape, numpy_empty_nd,
9
+ numpy_empty_like_nd)
6
10
 
7
11
 
8
12
  # Typical tests for allocation use array construction (e.g. np.zeros, np.empty,
@@ -20,6 +24,18 @@ def cuda_empty(shape, dtype):
20
24
  pass
21
25
 
22
26
 
27
def cuda_empty_like(arr):
    """
    Stub with no Python-level behaviour; the compiled implementation is
    supplied by the ``@overload`` registered for this function in this module.
    """
29
+
30
+
31
def cuda_arange(start):
    """
    Stub with no Python-level behaviour; the compiled implementation is
    supplied by the ``@overload`` registered for this function in this module.
    """
33
+
34
+
35
def cuda_ones(shape):
    """
    Stub with no Python-level behaviour; the compiled implementation is
    supplied by the ``@overload`` registered for this function in this module.
    """
37
+
38
+
23
39
  @overload(cuda_empty)
24
40
  def ol_cuda_empty(shape, dtype):
25
41
  _check_const_str_dtype("empty", dtype)
@@ -40,3 +56,91 @@ def ol_cuda_empty(shape, dtype):
40
56
  else:
41
57
  msg = f"Cannot parse input types to function np.empty({shape}, {dtype})"
42
58
  raise errors.TypingError(msg)
59
+
60
+
61
@overload(cuda_empty_like)
def ol_cuda_empty_like(arr):
    """
    Typing-time overload for ``cuda_empty_like``.

    Works out the return array type from the type of ``arr`` and returns an
    implementation that delegates to ``numpy_empty_like_nd``.
    """
    if isinstance(arr, types.Array):
        # Same dtype as the input; an 'A' layout is replaced by 'C', and the
        # result is never read-only.
        out_layout = 'C' if arr.layout == 'A' else arr.layout
        retty = arr.copy(dtype=arr.dtype, layout=out_layout, readonly=False)
    else:
        # Non-array input: its type is used as the dtype of a 0-d C array.
        retty = types.Array(arr, 0, 'C')

    def impl(arr):
        # No dtype override is offered by this simplified variant.
        unset_dtype = None
        return numpy_empty_like_nd(arr, unset_dtype, retty)

    return impl
79
+
80
+
81
def _arange_dtype(*args):
    """
    Pick the element type for an arange over the given argument types.

    ``NoneType`` arguments are ignored. Any complex argument gives
    ``complex128``; otherwise any float argument gives ``float64``; otherwise
    the result is the largest of the (unliteral'd) argument types and the
    integer type matching ``np.dtype(int)``.
    """
    present = [arg for arg in args if not isinstance(arg, types.NoneType)]

    def has_kind(kind):
        return any(isinstance(arg, kind) for arg in present)

    if has_kind(types.Complex):
        return types.complex128
    if has_kind(types.Float):
        return types.float64

    # `np.arange(10).dtype` is always `np.dtype(int)`, aka `np.int_`, which
    # in all released versions of numpy corresponds to the C `long` type.
    # Windows 64 is broken by default here because Numba (as of 0.47) does
    # not differentiate between Python and NumPy integers, so a `typeof(1)`
    # on w64 is `int64`, i.e. `intp`. This means an arange(<some int>) will
    # be typed as arange(int64) and the following will yield int64 as opposed
    # to int32. Example: without a load of analysis to work out whether the
    # args were wrapped in NumPy int*() calls it's not possible to detect the
    # difference between `np.arange(10)` and `np.arange(np.int64(10))`.
    native_bits = 8 * np.dtype(int).itemsize
    native_int = getattr(types, "int%s" % native_bits)

    # Literal types are unliteral'd so that `max` can compare them.
    candidates = [types.unliteral(arg) for arg in present]
    candidates.append(native_int)
    return max(candidates)
105
+
106
+
107
@overload(cuda_arange)
def ol_cuda_arange(start):
    """
    Simplified arange with just 1 argument.

    The single argument is used as the stop value of a range that starts at
    0 with step 1; the result is allocated with ``cuda_empty`` as
    ``np.int64``. No implementation is offered for non-number arguments.
    """
    if not isinstance(start, types.Number):
        return None

    # Present only when ``start`` is a literal type; None otherwise.
    literal_stop = getattr(start, "literal_value", None)

    def impl(start):
        # Allow for improved performance if given literal arguments.
        stop = literal_stop if literal_stop is not None else start

        step = 1
        first = 0

        span = (stop - first) / step
        rounded = int(math.ceil(span.real))

        # A non-positive span gives an empty result.
        count = max(rounded, 0)

        out = cuda_empty(count, np.int64)
        base = first
        for pos in range(count):
            out[pos] = base + (pos * step)
        return out

    return impl
135
+
136
+
137
@overload(cuda_ones)
def ol_cuda_ones(shape):
    """
    Overload for ``cuda_ones``: allocate a ``np.float64`` array of ``shape``
    with ``cuda_empty`` and set every element to 1.
    """
    def impl(shape):
        out = cuda_empty(shape, np.float64)
        flat_view = out.flat
        for pos in range(len(flat_view)):
            flat_view[pos] = 1
        return out

    return impl