numba-cuda 0.0.1__py3-none-any.whl → 0.0.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233)
  1. _numba_cuda_redirector.pth +1 -0
  2. _numba_cuda_redirector.py +74 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +5 -0
  5. numba_cuda/_version.py +19 -0
  6. numba_cuda/numba/cuda/__init__.py +22 -0
  7. numba_cuda/numba/cuda/api.py +526 -0
  8. numba_cuda/numba/cuda/api_util.py +30 -0
  9. numba_cuda/numba/cuda/args.py +77 -0
  10. numba_cuda/numba/cuda/cg.py +62 -0
  11. numba_cuda/numba/cuda/codegen.py +378 -0
  12. numba_cuda/numba/cuda/compiler.py +422 -0
  13. numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
  14. numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
  15. numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
  16. numba_cuda/numba/cuda/cuda_paths.py +258 -0
  17. numba_cuda/numba/cuda/cudadecl.py +806 -0
  18. numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
  19. numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
  20. numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
  21. numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
  22. numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
  23. numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
  24. numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
  25. numba_cuda/numba/cuda/cudadrv/error.py +36 -0
  26. numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
  27. numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
  28. numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
  29. numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
  30. numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
  31. numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
  32. numba_cuda/numba/cuda/cudaimpl.py +1055 -0
  33. numba_cuda/numba/cuda/cudamath.py +140 -0
  34. numba_cuda/numba/cuda/decorators.py +189 -0
  35. numba_cuda/numba/cuda/descriptor.py +33 -0
  36. numba_cuda/numba/cuda/device_init.py +89 -0
  37. numba_cuda/numba/cuda/deviceufunc.py +908 -0
  38. numba_cuda/numba/cuda/dispatcher.py +1057 -0
  39. numba_cuda/numba/cuda/errors.py +59 -0
  40. numba_cuda/numba/cuda/extending.py +7 -0
  41. numba_cuda/numba/cuda/initialize.py +13 -0
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
  43. numba_cuda/numba/cuda/intrinsics.py +198 -0
  44. numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  45. numba_cuda/numba/cuda/kernels/reduction.py +262 -0
  46. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  47. numba_cuda/numba/cuda/libdevice.py +3382 -0
  48. numba_cuda/numba/cuda/libdevicedecl.py +17 -0
  49. numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
  50. numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
  51. numba_cuda/numba/cuda/mathimpl.py +448 -0
  52. numba_cuda/numba/cuda/models.py +48 -0
  53. numba_cuda/numba/cuda/nvvmutils.py +235 -0
  54. numba_cuda/numba/cuda/printimpl.py +86 -0
  55. numba_cuda/numba/cuda/random.py +292 -0
  56. numba_cuda/numba/cuda/simulator/__init__.py +38 -0
  57. numba_cuda/numba/cuda/simulator/api.py +110 -0
  58. numba_cuda/numba/cuda/simulator/compiler.py +9 -0
  59. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
  60. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
  61. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
  62. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
  63. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
  64. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
  65. numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
  66. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
  67. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
  68. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
  69. numba_cuda/numba/cuda/simulator/kernel.py +308 -0
  70. numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
  71. numba_cuda/numba/cuda/simulator/reduction.py +15 -0
  72. numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
  73. numba_cuda/numba/cuda/simulator_init.py +17 -0
  74. numba_cuda/numba/cuda/stubs.py +902 -0
  75. numba_cuda/numba/cuda/target.py +440 -0
  76. numba_cuda/numba/cuda/testing.py +202 -0
  77. numba_cuda/numba/cuda/tests/__init__.py +58 -0
  78. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
  79. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
  80. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
  81. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
  88. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
  89. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
  90. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
  91. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
  92. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
  93. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
  94. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
  95. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
  98. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
  100. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
  101. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
  102. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
  103. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
  104. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
  105. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
  106. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
  107. numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
  108. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
  109. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
  110. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
  111. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
  112. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
  113. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
  115. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
  117. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
  118. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
  119. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
  120. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
  121. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
  123. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
  124. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
  126. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
  127. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
  129. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
  131. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
  132. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
  133. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
  134. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
  135. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
  136. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
  137. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
  138. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
  139. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
  140. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
  141. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
  142. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
  143. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
  144. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
  145. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
  148. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
  149. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
  150. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
  151. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
  152. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
  153. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
  154. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
  155. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
  156. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
  157. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
  158. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
  159. numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
  161. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
  162. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
  163. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
  164. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
  165. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
  166. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
  167. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
  168. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
  169. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
  170. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
  171. numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
  172. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
  173. numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
  174. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
  175. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
  176. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
  177. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  178. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
  179. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
  180. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
  182. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
  183. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
  184. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
  185. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
  186. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
  187. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
  188. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
  192. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
  193. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
  194. numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
  195. numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
  197. numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  198. numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
  199. numba_cuda/numba/cuda/tests/data/error.cu +7 -0
  200. numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
  201. numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
  202. numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
  203. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
  204. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
  206. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
  207. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
  208. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
  209. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
  210. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
  211. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
  212. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
  213. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
  214. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
  216. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
  217. numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
  218. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
  219. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
  220. numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
  221. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
  222. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
  223. numba_cuda/numba/cuda/types.py +37 -0
  224. numba_cuda/numba/cuda/ufuncs.py +662 -0
  225. numba_cuda/numba/cuda/vector_types.py +209 -0
  226. numba_cuda/numba/cuda/vectorizers.py +252 -0
  227. numba_cuda-0.0.13.dist-info/LICENSE +25 -0
  228. numba_cuda-0.0.13.dist-info/METADATA +69 -0
  229. numba_cuda-0.0.13.dist-info/RECORD +231 -0
  230. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.13.dist-info}/WHEEL +1 -1
  231. numba_cuda-0.0.1.dist-info/METADATA +0 -10
  232. numba_cuda-0.0.1.dist-info/RECORD +0 -5
  233. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.13.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/cudadrv/devicearray.py
@@ -0,0 +1,904 @@
+ """
+ A CUDA ND Array is recognized by checking the __cuda_memory__ attribute
+ on the object. If it exists and evaluates to True, it must define shape,
+ strides, dtype and size attributes similar to a NumPy ndarray.
+ """
+
+ import math
+ import functools
+ import operator
+ import copy
+ from ctypes import c_void_p
+
+ import numpy as np
+
+ import numba
+ from numba import _devicearray
+ from numba.cuda.cudadrv import devices, dummyarray
+ from numba.cuda.cudadrv import driver as _driver
+ from numba.core import types, config
+ from numba.np.unsafe.ndarray import to_fixed_tuple
+ from numba.np.numpy_support import numpy_version
+ from numba.np import numpy_support
+ from numba.cuda.api_util import prepare_shape_strides_dtype
+ from numba.core.errors import NumbaPerformanceWarning
+ from warnings import warn
+
+ try:
+     lru_cache = getattr(functools, 'lru_cache')(None)
+ except AttributeError:
+     # Python 3.1 or lower
+     def lru_cache(func):
+         return func
+
+
+ def is_cuda_ndarray(obj):
+     "Check if an object is a CUDA ndarray"
+     return getattr(obj, '__cuda_ndarray__', False)
+
+
+ def verify_cuda_ndarray_interface(obj):
+     "Verify the CUDA ndarray interface for an obj"
+     require_cuda_ndarray(obj)
+
+     def requires_attr(attr, typ):
+         if not hasattr(obj, attr):
+             raise AttributeError(attr)
+         if not isinstance(getattr(obj, attr), typ):
+             raise AttributeError('%s must be of type %s' % (attr, typ))
+
+     requires_attr('shape', tuple)
+     requires_attr('strides', tuple)
+     requires_attr('dtype', np.dtype)
+     requires_attr('size', int)
+
+
+ def require_cuda_ndarray(obj):
+     "Raises ValueError if is_cuda_ndarray(obj) evaluates False"
+     if not is_cuda_ndarray(obj):
+         raise ValueError('require an cuda ndarray object')
+
+
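# Editorial sketch (not part of the package diff): the checks above are
# duck-typed, so any object exposing the marker attributes passes.
# `FakeDeviceArray` is a hypothetical stand-in, not a numba class.
class FakeDeviceArray:
    __cuda_memory__ = True       # recognized as CUDA memory
    __cuda_ndarray__ = True      # recognized as a CUDA ndarray
    shape = (4,)
    strides = (8,)
    dtype = np.dtype(np.int64)
    size = 4

assert is_cuda_ndarray(FakeDeviceArray())
verify_cuda_ndarray_interface(FakeDeviceArray())  # all four attrs check out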
+ class DeviceNDArrayBase(_devicearray.DeviceArray):
+     """An on-GPU NDArray representation
+     """
+     __cuda_memory__ = True
+     __cuda_ndarray__ = True  # There must be gpu_data attribute
+
+     def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
+         """
+         Args
+         ----
+
+         shape
+             array shape.
+         strides
+             array strides.
+         dtype
+             data type as np.dtype coercible object.
+         stream
+             cuda stream.
+         gpu_data
+             user provided device memory for the ndarray data buffer
+         """
+         if isinstance(shape, int):
+             shape = (shape,)
+         if isinstance(strides, int):
+             strides = (strides,)
+         dtype = np.dtype(dtype)
+         self.ndim = len(shape)
+         if len(strides) != self.ndim:
+             raise ValueError('strides not match ndim')
+         self._dummy = dummyarray.Array.from_desc(0, shape, strides,
+                                                  dtype.itemsize)
+         self.shape = tuple(shape)
+         self.strides = tuple(strides)
+         self.dtype = dtype
+         self.size = int(functools.reduce(operator.mul, self.shape, 1))
+         # prepare gpu memory
+         if self.size > 0:
+             if gpu_data is None:
+                 self.alloc_size = _driver.memory_size_from_info(
+                     self.shape, self.strides, self.dtype.itemsize)
+                 gpu_data = devices.get_context().memalloc(self.alloc_size)
+             else:
+                 self.alloc_size = _driver.device_memory_size(gpu_data)
+         else:
+             # Make NULL pointer for empty allocation
+             if _driver.USE_NV_BINDING:
+                 null = _driver.binding.CUdeviceptr(0)
+             else:
+                 null = c_void_p(0)
+             gpu_data = _driver.MemoryPointer(context=devices.get_context(),
+                                              pointer=null, size=0)
+             self.alloc_size = 0
+
+         self.gpu_data = gpu_data
+         self.stream = stream
+
+     @property
+     def __cuda_array_interface__(self):
+         if _driver.USE_NV_BINDING:
+             if self.device_ctypes_pointer is not None:
+                 ptr = int(self.device_ctypes_pointer)
+             else:
+                 ptr = 0
+         else:
+             if self.device_ctypes_pointer.value is not None:
+                 ptr = self.device_ctypes_pointer.value
+             else:
+                 ptr = 0
+
+         return {
+             'shape': tuple(self.shape),
+             'strides': None if is_contiguous(self) else tuple(self.strides),
+             'data': (ptr, False),
+             'typestr': self.dtype.str,
+             'stream': int(self.stream) if self.stream != 0 else None,
+             'version': 3,
+         }
+
+     def bind(self, stream=0):
+         """Bind a CUDA stream to this object so that all subsequent
+         operations on this array default to the given stream.
+         """
+         clone = copy.copy(self)
+         clone.stream = stream
+         return clone
+
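# Editorial sketch (not part of the package diff): a bound stream becomes
# the default for later operations on the clone. Assumes a CUDA device.
from numba import cuda

stream = cuda.stream()
d_arr = cuda.to_device(np.arange(16))
d_on_s = d_arr.bind(stream)       # same gpu_data buffer, default stream set
host = d_on_s.copy_to_host()      # now enqueued asynchronously on `stream`
stream.synchronize()              # wait before reading `host`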
+     @property
+     def T(self):
+         return self.transpose()
+
+     def transpose(self, axes=None):
+         if axes and tuple(axes) == tuple(range(self.ndim)):
+             return self
+         elif self.ndim != 2:
+             msg = "transposing a non-2D DeviceNDArray isn't supported"
+             raise NotImplementedError(msg)
+         elif axes is not None and set(axes) != set(range(self.ndim)):
+             raise ValueError("invalid axes list %r" % (axes,))
+         else:
+             from numba.cuda.kernels.transpose import transpose
+             return transpose(self)
+
+     def _default_stream(self, stream):
+         return self.stream if not stream else stream
+
+     @property
+     def _numba_type_(self):
+         """
+         Magic attribute expected by Numba to get the numba type that
+         represents this object.
+         """
+         # Typing considerations:
+         #
+         # 1. The preference is to use 'C' or 'F' layout since this enables
+         #    hardcoding stride values into compiled kernels, which is more
+         #    efficient than storing a passed-in value in a register.
+         #
+         # 2. If an array is both C- and F-contiguous, prefer 'C' layout as
+         #    it's the more likely / common case.
+         #
+         # 3. If an array is broadcast then it must be typed as 'A' - using
+         #    'C' or 'F' does not apply for broadcast arrays, because the
+         #    strides, some of which will be 0, will not match those
+         #    hardcoded in for 'C' or 'F' layouts.
+
+         broadcast = 0 in self.strides
+         if self.flags['C_CONTIGUOUS'] and not broadcast:
+             layout = 'C'
+         elif self.flags['F_CONTIGUOUS'] and not broadcast:
+             layout = 'F'
+         else:
+             layout = 'A'
+
+         dtype = numpy_support.from_dtype(self.dtype)
+         return types.Array(dtype, self.ndim, layout)
+
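# Editorial sketch (not part of the package diff): the layout rules above,
# observed through _numba_type_. Assumes a CUDA device; the repr strings
# in the comments are indicative only.
from numba import cuda

d_c = cuda.to_device(np.zeros((3, 4)))                     # C-contiguous
d_f = cuda.to_device(np.asfortranarray(np.zeros((3, 4))))  # F-contiguous
d_c._numba_type_   # -> array(float64, 2d, C): strides can be hardcoded
d_f._numba_type_   # -> array(float64, 2d, F)
# An array with a 0 in `strides` (a broadcast view) is typed layout 'A'.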
+     @property
+     def device_ctypes_pointer(self):
+         """Returns the ctypes pointer to the GPU data buffer
+         """
+         if self.gpu_data is None:
+             if _driver.USE_NV_BINDING:
+                 return _driver.binding.CUdeviceptr(0)
+             else:
+                 return c_void_p(0)
+         else:
+             return self.gpu_data.device_ctypes_pointer
+
+     @devices.require_context
+     def copy_to_device(self, ary, stream=0):
+         """Copy `ary` to `self`.
+
+         If `ary` is CUDA memory, perform a device-to-device transfer.
+         Otherwise, perform a host-to-device transfer.
+         """
+         if ary.size == 0:
+             # Nothing to do
+             return
+
+         sentry_contiguous(self)
+         stream = self._default_stream(stream)
+
+         self_core, ary_core = array_core(self), array_core(ary)
+         if _driver.is_device_memory(ary):
+             sentry_contiguous(ary)
+             check_array_compatibility(self_core, ary_core)
+             _driver.device_to_device(self, ary, self.alloc_size, stream=stream)
+         else:
+             # Ensure same contiguity. Only makes a host-side copy if necessary
+             # (i.e., in order to materialize a writable strided view)
+             ary_core = np.array(
+                 ary_core,
+                 order='C' if self_core.flags['C_CONTIGUOUS'] else 'F',
+                 subok=True,
+                 copy=(not ary_core.flags['WRITEABLE'])
+                 if numpy_version < (2, 0) else None)
+             check_array_compatibility(self_core, ary_core)
+             _driver.host_to_device(self, ary_core, self.alloc_size,
+                                    stream=stream)
+
+     @devices.require_context
+     def copy_to_host(self, ary=None, stream=0):
+         """Copy ``self`` to ``ary`` or create a new Numpy ndarray
+         if ``ary`` is ``None``.
+
+         If a CUDA ``stream`` is given, then the transfer will be made
+         asynchronously as part of the given stream. Otherwise, the transfer
+         is synchronous: the function returns after the copy is finished.
+
+         Always returns the host array.
+
+         Example::
+
+             import numpy as np
+             from numba import cuda
+
+             arr = np.arange(1000)
+             d_arr = cuda.to_device(arr)
+
+             my_kernel[100, 100](d_arr)
+
+             result_array = d_arr.copy_to_host()
+         """
+         if any(s < 0 for s in self.strides):
+             msg = 'D->H copy not implemented for negative strides: {}'
+             raise NotImplementedError(msg.format(self.strides))
+         assert self.alloc_size >= 0, "Negative memory size"
+         stream = self._default_stream(stream)
+         if ary is None:
+             hostary = np.empty(shape=self.alloc_size, dtype=np.byte)
+         else:
+             check_array_compatibility(self, ary)
+             hostary = ary
+
+         if self.alloc_size != 0:
+             _driver.device_to_host(hostary, self, self.alloc_size,
+                                    stream=stream)
+
+         if ary is None:
+             if self.size == 0:
+                 hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
+                                      buffer=hostary)
+             else:
+                 hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
+                                      strides=self.strides, buffer=hostary)
+         return hostary
+
+     def split(self, section, stream=0):
+         """Split the array into equal partitions of the `section` size.
+         If the array cannot be equally divided, the last section will be
+         smaller.
+         """
+         stream = self._default_stream(stream)
+         if self.ndim != 1:
+             raise ValueError("only support 1d array")
+         if self.strides[0] != self.dtype.itemsize:
+             raise ValueError("only support unit stride")
+         nsect = int(math.ceil(float(self.size) / section))
+         strides = self.strides
+         itemsize = self.dtype.itemsize
+         for i in range(nsect):
+             begin = i * section
+             end = min(begin + section, self.size)
+             shape = (end - begin,)
+             gpu_data = self.gpu_data.view(begin * itemsize, end * itemsize)
+             yield DeviceNDArray(shape, strides, dtype=self.dtype,
+                                 stream=stream, gpu_data=gpu_data)
+
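# Editorial sketch (not part of the package diff): `split` yields zero-copy
# views over the same allocation; only the last chunk may be shorter.
from numba import cuda

d_arr = cuda.to_device(np.arange(10))
chunks = list(d_arr.split(4))
[c.shape for c in chunks]   # -> [(4,), (4,), (2,)], all sharing gpu_data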
+     def as_cuda_arg(self):
+         """Returns a device memory object that is used as the argument.
+         """
+         return self.gpu_data
+
+     def get_ipc_handle(self):
+         """
+         Returns an *IpcArrayHandle* object that is safe to serialize and
+         transfer to another process to share the local allocation.
+
+         Note: this feature is only available on Linux.
+         """
+         ipch = devices.get_context().get_ipc_handle(self.gpu_data)
+         desc = dict(shape=self.shape, strides=self.strides, dtype=self.dtype)
+         return IpcArrayHandle(ipc_handle=ipch, array_desc=desc)
+
+     def squeeze(self, axis=None, stream=0):
+         """
+         Remove axes of size one from the array shape.
+
+         Parameters
+         ----------
+         axis : None or int or tuple of ints, optional
+             Subset of dimensions to remove. A `ValueError` is raised if an
+             axis with size greater than one is selected. If `None`, all
+             axes with size one are removed.
+         stream : cuda stream or 0, optional
+             Default stream for the returned view of the array.
+
+         Returns
+         -------
+         DeviceNDArray
+             Squeezed view into the array.
+
+         """
+         new_dummy, _ = self._dummy.squeeze(axis=axis)
+         return DeviceNDArray(
+             shape=new_dummy.shape,
+             strides=new_dummy.strides,
+             dtype=self.dtype,
+             stream=self._default_stream(stream),
+             gpu_data=self.gpu_data,
+         )
+
+     def view(self, dtype):
+         """Returns a new object by reinterpreting the dtype without making
+         a copy of the data.
+         """
+         dtype = np.dtype(dtype)
+         shape = list(self.shape)
+         strides = list(self.strides)
+
+         if self.dtype.itemsize != dtype.itemsize:
+             if not self.is_c_contiguous():
+                 raise ValueError(
+                     "To change to a dtype of a different size,"
+                     " the array must be C-contiguous"
+                 )
+
+             shape[-1], rem = divmod(
+                 shape[-1] * self.dtype.itemsize,
+                 dtype.itemsize
+             )
+
+             if rem != 0:
+                 raise ValueError(
+                     "When changing to a larger dtype,"
+                     " its size must be a divisor of the total size in bytes"
+                     " of the last axis of the array."
+                 )
+
+             strides[-1] = dtype.itemsize
+
+         return DeviceNDArray(
+             shape=shape,
+             strides=strides,
+             dtype=dtype,
+             stream=self.stream,
+             gpu_data=self.gpu_data,
+         )
+
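# Editorial sketch (not part of the package diff): `view` reinterprets the
# buffer in place, rescaling the last axis by the itemsize ratio.
from numba import cuda

d_arr = cuda.to_device(np.zeros(4, dtype=np.float32))   # 4 * 4 = 16 bytes
d_arr.view(np.uint8).shape     # -> (16,): same bytes, smaller itemsize
d_arr.view(np.float64).shape   # -> (2,): 16 bytes / 8-byte items
# view(np.float64) on a 3-element float32 array would raise ValueError,
# since 12 bytes is not divisible by 8.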
+     @property
+     def nbytes(self):
+         # Note: not using `alloc_size`. `alloc_size` reports memory
+         # consumption of the allocation, not the size of the array
+         # https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.nbytes.html
+         return self.dtype.itemsize * self.size
+
+
+ class DeviceRecord(DeviceNDArrayBase):
+     '''
+     An on-GPU record type
+     '''
+     def __init__(self, dtype, stream=0, gpu_data=None):
+         shape = ()
+         strides = ()
+         super(DeviceRecord, self).__init__(shape, strides, dtype, stream,
+                                            gpu_data)
+
+     @property
+     def flags(self):
+         """
+         For `numpy.ndarray` compatibility. Ideally this would return a
+         `np.core.multiarray.flagsobj`, but that needs to be constructed
+         with an existing `numpy.ndarray` (as the C- and F- contiguous flags
+         aren't writeable).
+         """
+         return dict(self._dummy.flags)  # defensive copy
+
+     @property
+     def _numba_type_(self):
+         """
+         Magic attribute expected by Numba to get the numba type that
+         represents this object.
+         """
+         return numpy_support.from_dtype(self.dtype)
+
+     @devices.require_context
+     def __getitem__(self, item):
+         return self._do_getitem(item)
+
+     @devices.require_context
+     def getitem(self, item, stream=0):
+         """Do `__getitem__(item)` with CUDA stream
+         """
+         return self._do_getitem(item, stream)
+
+     def _do_getitem(self, item, stream=0):
+         stream = self._default_stream(stream)
+         typ, offset = self.dtype.fields[item]
+         newdata = self.gpu_data.view(offset)
+
+         if typ.shape == ():
+             if typ.names is not None:
+                 return DeviceRecord(dtype=typ, stream=stream,
+                                     gpu_data=newdata)
+             else:
+                 hostary = np.empty(1, dtype=typ)
+                 _driver.device_to_host(dst=hostary, src=newdata,
+                                        size=typ.itemsize,
+                                        stream=stream)
+             return hostary[0]
+         else:
+             shape, strides, dtype = \
+                 prepare_shape_strides_dtype(typ.shape,
+                                             None,
+                                             typ.subdtype[0], 'C')
+             return DeviceNDArray(shape=shape, strides=strides,
+                                  dtype=dtype, gpu_data=newdata,
+                                  stream=stream)
+
+     @devices.require_context
+     def __setitem__(self, key, value):
+         return self._do_setitem(key, value)
+
+     @devices.require_context
+     def setitem(self, key, value, stream=0):
+         """Do `__setitem__(key, value)` with CUDA stream
+         """
+         return self._do_setitem(key, value, stream=stream)
+
+     def _do_setitem(self, key, value, stream=0):
+
+         stream = self._default_stream(stream)
+
+         # If the record didn't have a default stream, and the user didn't
+         # provide a stream, then we will use the default stream for the
+         # assignment kernel and synchronize on it.
+         synchronous = not stream
+         if synchronous:
+             ctx = devices.get_context()
+             stream = ctx.get_default_stream()
+
+         # (1) prepare LHS
+
+         typ, offset = self.dtype.fields[key]
+         newdata = self.gpu_data.view(offset)
+
+         lhs = type(self)(dtype=typ, stream=stream, gpu_data=newdata)
+
+         # (2) prepare RHS
+
+         rhs, _ = auto_device(lhs.dtype.type(value), stream=stream)
+
+         # (3) do the copy
+
+         _driver.device_to_device(lhs, rhs, rhs.dtype.itemsize, stream)
+
+         if synchronous:
+             stream.synchronize()
+
+
+ @lru_cache
+ def _assign_kernel(ndim):
+     """
+     A separate method so we don't need to compile code every assignment (!).
+
+     :param ndim: We need to have static array sizes for cuda.local.array,
+         so bake in the number of dimensions into the kernel
+     """
+     from numba import cuda  # circular!
+
+     if ndim == 0:
+         # the (2, ndim) allocation below is not yet supported, so avoid it
+         @cuda.jit
+         def kernel(lhs, rhs):
+             lhs[()] = rhs[()]
+         return kernel
+
+     @cuda.jit
+     def kernel(lhs, rhs):
+         location = cuda.grid(1)
+
+         n_elements = 1
+         for i in range(lhs.ndim):
+             n_elements *= lhs.shape[i]
+         if location >= n_elements:
+             # bake n_elements into the kernel, better than passing it in
+             # as another argument.
+             return
+
+         # [0, :] is the to-index (into `lhs`)
+         # [1, :] is the from-index (into `rhs`)
+         idx = cuda.local.array(
+             shape=(2, ndim),
+             dtype=types.int64)
+
+         for i in range(ndim - 1, -1, -1):
+             idx[0, i] = location % lhs.shape[i]
+             idx[1, i] = (location % lhs.shape[i]) * (rhs.shape[i] > 1)
+             location //= lhs.shape[i]
+
+         lhs[to_fixed_tuple(idx[0], ndim)] = rhs[to_fixed_tuple(idx[1], ndim)]
+     return kernel
+
+
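# Editorial sketch (not part of the package diff): the unravelling loop in
# _assign_kernel, worked by hand for lhs.shape == (2, 3) and location == 4:
#     i=1: idx[:, 1] = 4 % 3 = 1;  location = 4 // 3 = 1
#     i=0: idx[:, 0] = 1 % 2 = 1;  location = 1 // 2 = 0
# so flat position 4 is element (1, 1) in row-major order. When rhs is
# broadcast along axis i (rhs.shape[i] == 1), the (rhs.shape[i] > 1) factor
# zeroes idx[1, i], so every lhs position reads the single rhs element on
# that axis.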
+ class DeviceNDArray(DeviceNDArrayBase):
+     '''
+     An on-GPU array type
+     '''
+     def is_f_contiguous(self):
+         '''
+         Return true if the array is Fortran-contiguous.
+         '''
+         return self._dummy.is_f_contig
+
+     @property
+     def flags(self):
+         """
+         For `numpy.ndarray` compatibility. Ideally this would return a
+         `np.core.multiarray.flagsobj`, but that needs to be constructed
+         with an existing `numpy.ndarray` (as the C- and F- contiguous flags
+         aren't writeable).
+         """
+         return dict(self._dummy.flags)  # defensive copy
+
+     def is_c_contiguous(self):
+         '''
+         Return true if the array is C-contiguous.
+         '''
+         return self._dummy.is_c_contig
+
+     def __array__(self, dtype=None):
+         """
+         :return: a `numpy.ndarray`, so this copies to the host.
+         """
+         if dtype:
+             return self.copy_to_host().__array__(dtype)
+         else:
+             return self.copy_to_host().__array__()
+
+     def __len__(self):
+         return self.shape[0]
+
+     def reshape(self, *newshape, **kws):
+         """
+         Reshape the array without changing its contents, similarly to
+         :meth:`numpy.ndarray.reshape`. Example::
+
+             d_arr = d_arr.reshape(20, 50, order='F')
+         """
+         if len(newshape) == 1 and isinstance(newshape[0], (tuple, list)):
+             newshape = newshape[0]
+
+         cls = type(self)
+         if newshape == self.shape:
+             # nothing to do
+             return cls(shape=self.shape, strides=self.strides,
+                        dtype=self.dtype, gpu_data=self.gpu_data)
+
+         newarr, extents = self._dummy.reshape(*newshape, **kws)
+
+         if extents == [self._dummy.extent]:
+             return cls(shape=newarr.shape, strides=newarr.strides,
+                        dtype=self.dtype, gpu_data=self.gpu_data)
+         else:
+             raise NotImplementedError("operation requires copying")
+
+     def ravel(self, order='C', stream=0):
+         '''
+         Flattens a contiguous array without changing its contents, similar
+         to :meth:`numpy.ndarray.ravel`. If the array is not contiguous,
+         raises an exception.
+         '''
+         stream = self._default_stream(stream)
+         cls = type(self)
+         newarr, extents = self._dummy.ravel(order=order)
+
+         if extents == [self._dummy.extent]:
+             return cls(shape=newarr.shape, strides=newarr.strides,
+                        dtype=self.dtype, gpu_data=self.gpu_data,
+                        stream=stream)
+
+         else:
+             raise NotImplementedError("operation requires copying")
+
+     @devices.require_context
+     def __getitem__(self, item):
+         return self._do_getitem(item)
+
+     @devices.require_context
+     def getitem(self, item, stream=0):
+         """Do `__getitem__(item)` with CUDA stream
+         """
+         return self._do_getitem(item, stream)
+
+     def _do_getitem(self, item, stream=0):
+         stream = self._default_stream(stream)
+
+         arr = self._dummy.__getitem__(item)
+         extents = list(arr.iter_contiguous_extent())
+         cls = type(self)
+         if len(extents) == 1:
+             newdata = self.gpu_data.view(*extents[0])
+
+             if not arr.is_array:
+                 # Check for structured array type (record)
+                 if self.dtype.names is not None:
+                     return DeviceRecord(dtype=self.dtype, stream=stream,
+                                         gpu_data=newdata)
+                 else:
+                     # Element indexing
+                     hostary = np.empty(1, dtype=self.dtype)
+                     _driver.device_to_host(dst=hostary, src=newdata,
+                                            size=self._dummy.itemsize,
+                                            stream=stream)
+                 return hostary[0]
+             else:
+                 return cls(shape=arr.shape, strides=arr.strides,
+                            dtype=self.dtype, gpu_data=newdata, stream=stream)
+         else:
+             newdata = self.gpu_data.view(*arr.extent)
+             return cls(shape=arr.shape, strides=arr.strides,
+                        dtype=self.dtype, gpu_data=newdata, stream=stream)
+
+     @devices.require_context
+     def __setitem__(self, key, value):
+         return self._do_setitem(key, value)
+
+     @devices.require_context
+     def setitem(self, key, value, stream=0):
+         """Do `__setitem__(key, value)` with CUDA stream
+         """
+         return self._do_setitem(key, value, stream=stream)
+
+     def _do_setitem(self, key, value, stream=0):
+
+         stream = self._default_stream(stream)
+
+         # If the array didn't have a default stream, and the user didn't
+         # provide a stream, then we will use the default stream for the
+         # assignment kernel and synchronize on it.
+         synchronous = not stream
+         if synchronous:
+             ctx = devices.get_context()
+             stream = ctx.get_default_stream()
+
+         # (1) prepare LHS
+
+         arr = self._dummy.__getitem__(key)
+         newdata = self.gpu_data.view(*arr.extent)
+
+         if isinstance(arr, dummyarray.Element):
+             # convert to a 0d array
+             shape = ()
+             strides = ()
+         else:
+             shape = arr.shape
+             strides = arr.strides
+
+         lhs = type(self)(
+             shape=shape,
+             strides=strides,
+             dtype=self.dtype,
+             gpu_data=newdata,
+             stream=stream)
+
+         # (2) prepare RHS
+
+         rhs, _ = auto_device(value, stream=stream, user_explicit=True)
+         if rhs.ndim > lhs.ndim:
+             raise ValueError("Can't assign %s-D array to %s-D self" % (
+                 rhs.ndim,
+                 lhs.ndim))
+         rhs_shape = np.ones(lhs.ndim, dtype=np.int64)
+         # negative indices would not work if rhs.ndim == 0
+         rhs_shape[lhs.ndim - rhs.ndim:] = rhs.shape
+         rhs = rhs.reshape(*rhs_shape)
+         for i, (l, r) in enumerate(zip(lhs.shape, rhs.shape)):
+             if r != 1 and l != r:
+                 raise ValueError("Can't copy sequence with size %d to array "
+                                  "axis %d with dimension %d" % (r, i, l))
+
+         # (3) do the copy
+
+         n_elements = functools.reduce(operator.mul, lhs.shape, 1)
+         _assign_kernel(lhs.ndim).forall(n_elements, stream=stream)(lhs, rhs)
+         if synchronous:
+             stream.synchronize()
+
+
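# Editorial sketch (not part of the package diff): device-side __setitem__
# launches _assign_kernel and broadcasts like NumPy. Assumes a CUDA device.
from numba import cuda

d_arr = cuda.to_device(np.zeros((4, 4)))
d_arr[2] = 1.0                 # scalar broadcast across row 2
d_arr[:, 0] = np.arange(4)     # rhs copied to device, then assigned
# With no bound stream, the call synchronizes on the default stream, so
# d_arr.copy_to_host() immediately reflects both assignments.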
+ class IpcArrayHandle(object):
+     """
+     An IPC array handle that can be serialized and transferred to another
+     process on the same machine to share a GPU allocation.
+
+     On the destination process, use the *.open()* method to create a new
+     *DeviceNDArray* object that shares the allocation from the original
+     process. To release the resources, call the *.close()* method. After
+     that, the destination can no longer use the shared array object.
+     (Note: the underlying weakref to the resource is now dead.)
+
+     This object implements the context-manager interface that calls the
+     *.open()* and *.close()* method automatically::
+
+         with the_ipc_array_handle as ipc_array:
+             # use ipc_array here as a normal gpu array object
+             some_code(ipc_array)
+         # ipc_array is dead at this point
+     """
+     def __init__(self, ipc_handle, array_desc):
+         self._array_desc = array_desc
+         self._ipc_handle = ipc_handle
+
+     def open(self):
+         """
+         Returns a new *DeviceNDArray* that shares the allocation from the
+         original process. Must not be used on the original process.
+         """
+         dptr = self._ipc_handle.open(devices.get_context())
+         return DeviceNDArray(gpu_data=dptr, **self._array_desc)
+
+     def close(self):
+         """
+         Closes the IPC handle to the array.
+         """
+         self._ipc_handle.close()
+
+     def __enter__(self):
+         return self.open()
+
+     def __exit__(self, type, value, traceback):
+         self.close()
+
+
+ class MappedNDArray(DeviceNDArrayBase, np.ndarray):
+     """
+     A host array that uses CUDA mapped memory.
+     """
+
+     def device_setup(self, gpu_data, stream=0):
+         self.gpu_data = gpu_data
+         self.stream = stream
+
+
+ class ManagedNDArray(DeviceNDArrayBase, np.ndarray):
+     """
+     A host array that uses CUDA managed memory.
+     """
+
+     def device_setup(self, gpu_data, stream=0):
+         self.gpu_data = gpu_data
+         self.stream = stream
+
+
+ def from_array_like(ary, stream=0, gpu_data=None):
+     "Create a DeviceNDArray object that is like ary."
+     return DeviceNDArray(ary.shape, ary.strides, ary.dtype, stream=stream,
+                          gpu_data=gpu_data)
+
+
+ def from_record_like(rec, stream=0, gpu_data=None):
+     "Create a DeviceRecord object that is like rec."
+     return DeviceRecord(rec.dtype, stream=stream, gpu_data=gpu_data)
+
+
+ def array_core(ary):
+     """
+     Extract the repeated core of a broadcast array.
+
+     Broadcast arrays are by definition non-contiguous due to repeated
+     dimensions, i.e., dimensions with stride 0. In order to ascertain memory
+     contiguity and copy the underlying data from such arrays, we must create
+     a view without the repeated dimensions.
+
+     """
+     if not ary.strides or not ary.size:
+         return ary
+     core_index = []
+     for stride in ary.strides:
+         core_index.append(0 if stride == 0 else slice(None))
+     return ary[tuple(core_index)]
+
+
+ def is_contiguous(ary):
+     """
+     Returns True iff `ary` is C-style contiguous while ignoring
+     broadcasted and 1-sized dimensions.
+     As opposed to array_core(), it does not call require_context(),
+     which can be quite expensive.
+     """
+     size = ary.dtype.itemsize
+     for shape, stride in zip(reversed(ary.shape), reversed(ary.strides)):
+         if shape > 1 and stride != 0:
+             if size != stride:
+                 return False
+             size *= shape
+     return True
+
+
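# Editorial sketch (not part of the package diff): is_contiguous skips
# broadcast (stride-0) and size-1 axes while walking from the inside out.
a = np.zeros((3, 4), dtype=np.float32)                       # strides (16, 4)
b = np.broadcast_to(np.zeros(4, dtype=np.float32), (3, 4))   # strides (0, 4)
is_contiguous(a)   # -> True: 4 == itemsize, then 16 == 4 * 4
is_contiguous(b)   # -> True: the stride-0 axis is ignored entirely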
+ errmsg_contiguous_buffer = ("Array contains non-contiguous buffer and cannot "
+                             "be transferred as a single memory region. Please "
+                             "ensure contiguous buffer with numpy "
+                             ".ascontiguousarray()")
+
+
+ def sentry_contiguous(ary):
+     core = array_core(ary)
+     if not core.flags['C_CONTIGUOUS'] and not core.flags['F_CONTIGUOUS']:
+         raise ValueError(errmsg_contiguous_buffer)
+
+
+ def auto_device(obj, stream=0, copy=True, user_explicit=False):
+     """
+     Create a DeviceRecord or DeviceArray like obj and optionally copy data
+     from host to device. If obj already represents device memory, it is
+     returned and no copy is made.
+     """
+     if _driver.is_device_memory(obj):
+         return obj, False
+     elif hasattr(obj, '__cuda_array_interface__'):
+         return numba.cuda.as_cuda_array(obj), False
+     else:
+         if isinstance(obj, np.void):
+             devobj = from_record_like(obj, stream=stream)
+         else:
+             # This allows you to pass non-array objects like constants and
+             # objects implementing the array interface
+             # https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.interface.html
+             # into this function (with no overhead -- copies -- for `obj`s
+             # that are already `ndarray`s).
+             obj = np.array(
+                 obj,
+                 copy=False if numpy_version < (2, 0) else None,
+                 subok=True)
+             sentry_contiguous(obj)
+             devobj = from_array_like(obj, stream=stream)
+         if copy:
+             if config.CUDA_WARN_ON_IMPLICIT_COPY:
+                 if (
+                     not user_explicit and
+                     (not isinstance(obj, DeviceNDArray)
+                      and isinstance(obj, np.ndarray))
+                 ):
+                     msg = ("Host array used in CUDA kernel will incur "
+                            "copy overhead to/from device.")
+                     warn(NumbaPerformanceWarning(msg))
+             devobj.copy_to_device(obj, stream=stream)
+         return devobj, True
+
+
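# Editorial sketch (not part of the package diff): auto_device is the funnel
# through which kernel arguments become device objects.
h_arr = np.arange(8)
d_arr, copied = auto_device(h_arr)    # host ndarray -> fresh device copy
assert copied                         # may warn if CUDA_WARN_ON_IMPLICIT_COPY
d_same, copied2 = auto_device(d_arr)  # already device memory
assert d_same is d_arr and not copied2   # returned as-is, no transfer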
+ def check_array_compatibility(ary1, ary2):
+     ary1sq, ary2sq = ary1.squeeze(), ary2.squeeze()
+     if ary1.dtype != ary2.dtype:
+         raise TypeError('incompatible dtype: %s vs. %s' %
+                         (ary1.dtype, ary2.dtype))
+     if ary1sq.shape != ary2sq.shape:
+         raise ValueError('incompatible shape: %s vs. %s' %
+                          (ary1.shape, ary2.shape))
+     # We check strides only if the size is nonzero, because strides are
+     # irrelevant (and can differ) for zero-length copies.
+     if ary1.size and ary1sq.strides != ary2sq.strides:
+         raise ValueError('incompatible strides: %s vs. %s' %
+                          (ary1.strides, ary2.strides))