numba-cuda 0.22.0__cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (487)
  1. _numba_cuda_redirector.pth +4 -0
  2. _numba_cuda_redirector.py +89 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +6 -0
  5. numba_cuda/_version.py +11 -0
  6. numba_cuda/numba/cuda/__init__.py +70 -0
  7. numba_cuda/numba/cuda/_internal/cuda_bf16.py +16394 -0
  8. numba_cuda/numba/cuda/_internal/cuda_fp16.py +8112 -0
  9. numba_cuda/numba/cuda/api.py +580 -0
  10. numba_cuda/numba/cuda/api_util.py +76 -0
  11. numba_cuda/numba/cuda/args.py +72 -0
  12. numba_cuda/numba/cuda/bf16.py +397 -0
  13. numba_cuda/numba/cuda/cache_hints.py +287 -0
  14. numba_cuda/numba/cuda/cext/__init__.py +2 -0
  15. numba_cuda/numba/cuda/cext/_devicearray.cpp +159 -0
  16. numba_cuda/numba/cuda/cext/_devicearray.cpython-313-aarch64-linux-gnu.so +0 -0
  17. numba_cuda/numba/cuda/cext/_devicearray.h +29 -0
  18. numba_cuda/numba/cuda/cext/_dispatcher.cpp +1098 -0
  19. numba_cuda/numba/cuda/cext/_dispatcher.cpython-313-aarch64-linux-gnu.so +0 -0
  20. numba_cuda/numba/cuda/cext/_hashtable.cpp +532 -0
  21. numba_cuda/numba/cuda/cext/_hashtable.h +135 -0
  22. numba_cuda/numba/cuda/cext/_helperlib.c +71 -0
  23. numba_cuda/numba/cuda/cext/_helperlib.cpython-313-aarch64-linux-gnu.so +0 -0
  24. numba_cuda/numba/cuda/cext/_helpermod.c +82 -0
  25. numba_cuda/numba/cuda/cext/_pymodule.h +38 -0
  26. numba_cuda/numba/cuda/cext/_typeconv.cpp +206 -0
  27. numba_cuda/numba/cuda/cext/_typeconv.cpython-313-aarch64-linux-gnu.so +0 -0
  28. numba_cuda/numba/cuda/cext/_typeof.cpp +1159 -0
  29. numba_cuda/numba/cuda/cext/_typeof.h +19 -0
  30. numba_cuda/numba/cuda/cext/capsulethunk.h +111 -0
  31. numba_cuda/numba/cuda/cext/mviewbuf.c +385 -0
  32. numba_cuda/numba/cuda/cext/mviewbuf.cpython-313-aarch64-linux-gnu.so +0 -0
  33. numba_cuda/numba/cuda/cext/typeconv.cpp +212 -0
  34. numba_cuda/numba/cuda/cext/typeconv.hpp +101 -0
  35. numba_cuda/numba/cuda/cg.py +67 -0
  36. numba_cuda/numba/cuda/cgutils.py +1294 -0
  37. numba_cuda/numba/cuda/cloudpickle/__init__.py +21 -0
  38. numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +1598 -0
  39. numba_cuda/numba/cuda/cloudpickle/cloudpickle_fast.py +17 -0
  40. numba_cuda/numba/cuda/codegen.py +541 -0
  41. numba_cuda/numba/cuda/compiler.py +1396 -0
  42. numba_cuda/numba/cuda/core/analysis.py +758 -0
  43. numba_cuda/numba/cuda/core/annotations/__init__.py +0 -0
  44. numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +288 -0
  45. numba_cuda/numba/cuda/core/annotations/type_annotations.py +305 -0
  46. numba_cuda/numba/cuda/core/base.py +1332 -0
  47. numba_cuda/numba/cuda/core/boxing.py +1411 -0
  48. numba_cuda/numba/cuda/core/bytecode.py +728 -0
  49. numba_cuda/numba/cuda/core/byteflow.py +2346 -0
  50. numba_cuda/numba/cuda/core/caching.py +744 -0
  51. numba_cuda/numba/cuda/core/callconv.py +392 -0
  52. numba_cuda/numba/cuda/core/codegen.py +171 -0
  53. numba_cuda/numba/cuda/core/compiler.py +199 -0
  54. numba_cuda/numba/cuda/core/compiler_lock.py +85 -0
  55. numba_cuda/numba/cuda/core/compiler_machinery.py +497 -0
  56. numba_cuda/numba/cuda/core/config.py +650 -0
  57. numba_cuda/numba/cuda/core/consts.py +124 -0
  58. numba_cuda/numba/cuda/core/controlflow.py +989 -0
  59. numba_cuda/numba/cuda/core/entrypoints.py +57 -0
  60. numba_cuda/numba/cuda/core/environment.py +66 -0
  61. numba_cuda/numba/cuda/core/errors.py +917 -0
  62. numba_cuda/numba/cuda/core/event.py +511 -0
  63. numba_cuda/numba/cuda/core/funcdesc.py +330 -0
  64. numba_cuda/numba/cuda/core/generators.py +387 -0
  65. numba_cuda/numba/cuda/core/imputils.py +509 -0
  66. numba_cuda/numba/cuda/core/inline_closurecall.py +1787 -0
  67. numba_cuda/numba/cuda/core/interpreter.py +3617 -0
  68. numba_cuda/numba/cuda/core/ir.py +1812 -0
  69. numba_cuda/numba/cuda/core/ir_utils.py +2638 -0
  70. numba_cuda/numba/cuda/core/optional.py +129 -0
  71. numba_cuda/numba/cuda/core/options.py +262 -0
  72. numba_cuda/numba/cuda/core/postproc.py +249 -0
  73. numba_cuda/numba/cuda/core/pythonapi.py +1859 -0
  74. numba_cuda/numba/cuda/core/registry.py +46 -0
  75. numba_cuda/numba/cuda/core/removerefctpass.py +123 -0
  76. numba_cuda/numba/cuda/core/rewrites/__init__.py +26 -0
  77. numba_cuda/numba/cuda/core/rewrites/ir_print.py +91 -0
  78. numba_cuda/numba/cuda/core/rewrites/registry.py +104 -0
  79. numba_cuda/numba/cuda/core/rewrites/static_binop.py +41 -0
  80. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +189 -0
  81. numba_cuda/numba/cuda/core/rewrites/static_raise.py +100 -0
  82. numba_cuda/numba/cuda/core/sigutils.py +68 -0
  83. numba_cuda/numba/cuda/core/ssa.py +498 -0
  84. numba_cuda/numba/cuda/core/targetconfig.py +330 -0
  85. numba_cuda/numba/cuda/core/tracing.py +231 -0
  86. numba_cuda/numba/cuda/core/transforms.py +956 -0
  87. numba_cuda/numba/cuda/core/typed_passes.py +867 -0
  88. numba_cuda/numba/cuda/core/typeinfer.py +1950 -0
  89. numba_cuda/numba/cuda/core/unsafe/__init__.py +0 -0
  90. numba_cuda/numba/cuda/core/unsafe/bytes.py +67 -0
  91. numba_cuda/numba/cuda/core/unsafe/eh.py +67 -0
  92. numba_cuda/numba/cuda/core/unsafe/refcount.py +98 -0
  93. numba_cuda/numba/cuda/core/untyped_passes.py +1979 -0
  94. numba_cuda/numba/cuda/cpython/builtins.py +1153 -0
  95. numba_cuda/numba/cuda/cpython/charseq.py +1218 -0
  96. numba_cuda/numba/cuda/cpython/cmathimpl.py +560 -0
  97. numba_cuda/numba/cuda/cpython/enumimpl.py +103 -0
  98. numba_cuda/numba/cuda/cpython/iterators.py +167 -0
  99. numba_cuda/numba/cuda/cpython/listobj.py +1326 -0
  100. numba_cuda/numba/cuda/cpython/mathimpl.py +499 -0
  101. numba_cuda/numba/cuda/cpython/numbers.py +1475 -0
  102. numba_cuda/numba/cuda/cpython/rangeobj.py +289 -0
  103. numba_cuda/numba/cuda/cpython/slicing.py +322 -0
  104. numba_cuda/numba/cuda/cpython/tupleobj.py +456 -0
  105. numba_cuda/numba/cuda/cpython/unicode.py +2865 -0
  106. numba_cuda/numba/cuda/cpython/unicode_support.py +1597 -0
  107. numba_cuda/numba/cuda/cpython/unsafe/__init__.py +0 -0
  108. numba_cuda/numba/cuda/cpython/unsafe/numbers.py +64 -0
  109. numba_cuda/numba/cuda/cpython/unsafe/tuple.py +92 -0
  110. numba_cuda/numba/cuda/cuda_paths.py +691 -0
  111. numba_cuda/numba/cuda/cudadecl.py +543 -0
  112. numba_cuda/numba/cuda/cudadrv/__init__.py +14 -0
  113. numba_cuda/numba/cuda/cudadrv/devicearray.py +954 -0
  114. numba_cuda/numba/cuda/cudadrv/devices.py +249 -0
  115. numba_cuda/numba/cuda/cudadrv/driver.py +3238 -0
  116. numba_cuda/numba/cuda/cudadrv/drvapi.py +435 -0
  117. numba_cuda/numba/cuda/cudadrv/dummyarray.py +562 -0
  118. numba_cuda/numba/cuda/cudadrv/enums.py +613 -0
  119. numba_cuda/numba/cuda/cudadrv/error.py +48 -0
  120. numba_cuda/numba/cuda/cudadrv/libs.py +220 -0
  121. numba_cuda/numba/cuda/cudadrv/linkable_code.py +184 -0
  122. numba_cuda/numba/cuda/cudadrv/mappings.py +14 -0
  123. numba_cuda/numba/cuda/cudadrv/ndarray.py +26 -0
  124. numba_cuda/numba/cuda/cudadrv/nvrtc.py +193 -0
  125. numba_cuda/numba/cuda/cudadrv/nvvm.py +756 -0
  126. numba_cuda/numba/cuda/cudadrv/rtapi.py +13 -0
  127. numba_cuda/numba/cuda/cudadrv/runtime.py +34 -0
  128. numba_cuda/numba/cuda/cudaimpl.py +983 -0
  129. numba_cuda/numba/cuda/cudamath.py +149 -0
  130. numba_cuda/numba/cuda/datamodel/__init__.py +7 -0
  131. numba_cuda/numba/cuda/datamodel/cuda_manager.py +66 -0
  132. numba_cuda/numba/cuda/datamodel/cuda_models.py +1446 -0
  133. numba_cuda/numba/cuda/datamodel/cuda_packer.py +224 -0
  134. numba_cuda/numba/cuda/datamodel/cuda_registry.py +22 -0
  135. numba_cuda/numba/cuda/datamodel/cuda_testing.py +153 -0
  136. numba_cuda/numba/cuda/datamodel/manager.py +11 -0
  137. numba_cuda/numba/cuda/datamodel/models.py +9 -0
  138. numba_cuda/numba/cuda/datamodel/packer.py +9 -0
  139. numba_cuda/numba/cuda/datamodel/registry.py +11 -0
  140. numba_cuda/numba/cuda/datamodel/testing.py +11 -0
  141. numba_cuda/numba/cuda/debuginfo.py +997 -0
  142. numba_cuda/numba/cuda/decorators.py +294 -0
  143. numba_cuda/numba/cuda/descriptor.py +35 -0
  144. numba_cuda/numba/cuda/device_init.py +155 -0
  145. numba_cuda/numba/cuda/deviceufunc.py +1021 -0
  146. numba_cuda/numba/cuda/dispatcher.py +2463 -0
  147. numba_cuda/numba/cuda/errors.py +72 -0
  148. numba_cuda/numba/cuda/extending.py +697 -0
  149. numba_cuda/numba/cuda/flags.py +178 -0
  150. numba_cuda/numba/cuda/fp16.py +357 -0
  151. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  152. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  153. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  154. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  155. numba_cuda/numba/cuda/include/13/cuda_bf16.h +5118 -0
  156. numba_cuda/numba/cuda/include/13/cuda_bf16.hpp +3865 -0
  157. numba_cuda/numba/cuda/include/13/cuda_fp16.h +5363 -0
  158. numba_cuda/numba/cuda/include/13/cuda_fp16.hpp +3483 -0
  159. numba_cuda/numba/cuda/initialize.py +24 -0
  160. numba_cuda/numba/cuda/intrinsics.py +531 -0
  161. numba_cuda/numba/cuda/itanium_mangler.py +214 -0
  162. numba_cuda/numba/cuda/kernels/__init__.py +2 -0
  163. numba_cuda/numba/cuda/kernels/reduction.py +265 -0
  164. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  165. numba_cuda/numba/cuda/libdevice.py +3386 -0
  166. numba_cuda/numba/cuda/libdevicedecl.py +20 -0
  167. numba_cuda/numba/cuda/libdevicefuncs.py +1060 -0
  168. numba_cuda/numba/cuda/libdeviceimpl.py +88 -0
  169. numba_cuda/numba/cuda/locks.py +19 -0
  170. numba_cuda/numba/cuda/lowering.py +1980 -0
  171. numba_cuda/numba/cuda/mathimpl.py +374 -0
  172. numba_cuda/numba/cuda/memory_management/__init__.py +4 -0
  173. numba_cuda/numba/cuda/memory_management/memsys.cu +99 -0
  174. numba_cuda/numba/cuda/memory_management/memsys.cuh +22 -0
  175. numba_cuda/numba/cuda/memory_management/nrt.cu +212 -0
  176. numba_cuda/numba/cuda/memory_management/nrt.cuh +48 -0
  177. numba_cuda/numba/cuda/memory_management/nrt.py +390 -0
  178. numba_cuda/numba/cuda/memory_management/nrt_context.py +438 -0
  179. numba_cuda/numba/cuda/misc/appdirs.py +594 -0
  180. numba_cuda/numba/cuda/misc/cffiimpl.py +24 -0
  181. numba_cuda/numba/cuda/misc/coverage_support.py +43 -0
  182. numba_cuda/numba/cuda/misc/dump_style.py +41 -0
  183. numba_cuda/numba/cuda/misc/findlib.py +75 -0
  184. numba_cuda/numba/cuda/misc/firstlinefinder.py +96 -0
  185. numba_cuda/numba/cuda/misc/gdb_hook.py +240 -0
  186. numba_cuda/numba/cuda/misc/literal.py +28 -0
  187. numba_cuda/numba/cuda/misc/llvm_pass_timings.py +412 -0
  188. numba_cuda/numba/cuda/misc/special.py +94 -0
  189. numba_cuda/numba/cuda/models.py +56 -0
  190. numba_cuda/numba/cuda/np/arraymath.py +5130 -0
  191. numba_cuda/numba/cuda/np/arrayobj.py +7635 -0
  192. numba_cuda/numba/cuda/np/extensions.py +11 -0
  193. numba_cuda/numba/cuda/np/linalg.py +3087 -0
  194. numba_cuda/numba/cuda/np/math/__init__.py +0 -0
  195. numba_cuda/numba/cuda/np/math/cmathimpl.py +558 -0
  196. numba_cuda/numba/cuda/np/math/mathimpl.py +487 -0
  197. numba_cuda/numba/cuda/np/math/numbers.py +1461 -0
  198. numba_cuda/numba/cuda/np/npdatetime.py +969 -0
  199. numba_cuda/numba/cuda/np/npdatetime_helpers.py +217 -0
  200. numba_cuda/numba/cuda/np/npyfuncs.py +1808 -0
  201. numba_cuda/numba/cuda/np/npyimpl.py +1027 -0
  202. numba_cuda/numba/cuda/np/numpy_support.py +798 -0
  203. numba_cuda/numba/cuda/np/polynomial/__init__.py +4 -0
  204. numba_cuda/numba/cuda/np/polynomial/polynomial_core.py +242 -0
  205. numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +380 -0
  206. numba_cuda/numba/cuda/np/ufunc/__init__.py +4 -0
  207. numba_cuda/numba/cuda/np/ufunc/decorators.py +203 -0
  208. numba_cuda/numba/cuda/np/ufunc/sigparse.py +68 -0
  209. numba_cuda/numba/cuda/np/ufunc/ufuncbuilder.py +65 -0
  210. numba_cuda/numba/cuda/np/ufunc_db.py +1282 -0
  211. numba_cuda/numba/cuda/np/unsafe/__init__.py +0 -0
  212. numba_cuda/numba/cuda/np/unsafe/ndarray.py +84 -0
  213. numba_cuda/numba/cuda/nvvmutils.py +254 -0
  214. numba_cuda/numba/cuda/printimpl.py +126 -0
  215. numba_cuda/numba/cuda/random.py +308 -0
  216. numba_cuda/numba/cuda/reshape_funcs.cu +156 -0
  217. numba_cuda/numba/cuda/serialize.py +267 -0
  218. numba_cuda/numba/cuda/simulator/__init__.py +63 -0
  219. numba_cuda/numba/cuda/simulator/_internal/__init__.py +4 -0
  220. numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +2 -0
  221. numba_cuda/numba/cuda/simulator/api.py +179 -0
  222. numba_cuda/numba/cuda/simulator/bf16.py +4 -0
  223. numba_cuda/numba/cuda/simulator/compiler.py +38 -0
  224. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +11 -0
  225. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +462 -0
  226. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +122 -0
  227. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +66 -0
  228. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +7 -0
  229. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +7 -0
  230. numba_cuda/numba/cuda/simulator/cudadrv/error.py +10 -0
  231. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +10 -0
  232. numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +61 -0
  233. numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +11 -0
  234. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +32 -0
  235. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +22 -0
  236. numba_cuda/numba/cuda/simulator/dispatcher.py +11 -0
  237. numba_cuda/numba/cuda/simulator/kernel.py +320 -0
  238. numba_cuda/numba/cuda/simulator/kernelapi.py +509 -0
  239. numba_cuda/numba/cuda/simulator/memory_management/__init__.py +4 -0
  240. numba_cuda/numba/cuda/simulator/memory_management/nrt.py +21 -0
  241. numba_cuda/numba/cuda/simulator/reduction.py +19 -0
  242. numba_cuda/numba/cuda/simulator/tests/support.py +4 -0
  243. numba_cuda/numba/cuda/simulator/vector_types.py +65 -0
  244. numba_cuda/numba/cuda/simulator_init.py +18 -0
  245. numba_cuda/numba/cuda/stubs.py +624 -0
  246. numba_cuda/numba/cuda/target.py +505 -0
  247. numba_cuda/numba/cuda/testing.py +347 -0
  248. numba_cuda/numba/cuda/tests/__init__.py +62 -0
  249. numba_cuda/numba/cuda/tests/benchmarks/__init__.py +0 -0
  250. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +119 -0
  251. numba_cuda/numba/cuda/tests/cloudpickle_main_class.py +9 -0
  252. numba_cuda/numba/cuda/tests/core/serialize_usecases.py +113 -0
  253. numba_cuda/numba/cuda/tests/core/test_itanium_mangler.py +83 -0
  254. numba_cuda/numba/cuda/tests/core/test_serialize.py +371 -0
  255. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +9 -0
  256. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +147 -0
  257. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +161 -0
  258. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +397 -0
  259. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +24 -0
  260. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +180 -0
  261. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +313 -0
  262. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +191 -0
  263. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +621 -0
  264. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +247 -0
  265. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +100 -0
  266. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +200 -0
  267. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +53 -0
  268. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +72 -0
  269. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +138 -0
  270. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +43 -0
  271. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +15 -0
  272. numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py +58 -0
  273. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +348 -0
  274. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +128 -0
  275. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +301 -0
  276. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +174 -0
  277. numba_cuda/numba/cuda/tests/cudadrv/test_nvrtc.py +28 -0
  278. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +185 -0
  279. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +39 -0
  280. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +23 -0
  281. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +38 -0
  282. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +48 -0
  283. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +44 -0
  284. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +127 -0
  285. numba_cuda/numba/cuda/tests/cudapy/__init__.py +9 -0
  286. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +231 -0
  287. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +50 -0
  288. numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +36 -0
  289. numba_cuda/numba/cuda/tests/cudapy/complex_usecases.py +116 -0
  290. numba_cuda/numba/cuda/tests/cudapy/enum_usecases.py +59 -0
  291. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +62 -0
  292. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +28 -0
  293. numba_cuda/numba/cuda/tests/cudapy/overload_usecases.py +33 -0
  294. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +104 -0
  295. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +47 -0
  296. numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +1122 -0
  297. numba_cuda/numba/cuda/tests/cudapy/test_array.py +344 -0
  298. numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +268 -0
  299. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +203 -0
  300. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +63 -0
  301. numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +360 -0
  302. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1815 -0
  303. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +599 -0
  304. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +377 -0
  305. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +160 -0
  306. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +27 -0
  307. numba_cuda/numba/cuda/tests/cudapy/test_byteflow.py +98 -0
  308. numba_cuda/numba/cuda/tests/cudapy/test_cache_hints.py +210 -0
  309. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +683 -0
  310. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +265 -0
  311. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +42 -0
  312. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +718 -0
  313. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +370 -0
  314. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +23 -0
  315. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +142 -0
  316. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +178 -0
  317. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +193 -0
  318. numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +131 -0
  319. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +438 -0
  320. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +94 -0
  321. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +101 -0
  322. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +105 -0
  323. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +978 -0
  324. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo_types.py +476 -0
  325. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +500 -0
  326. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +820 -0
  327. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +152 -0
  328. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +111 -0
  329. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +170 -0
  330. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1088 -0
  331. numba_cuda/numba/cuda/tests/cudapy/test_extending_types.py +71 -0
  332. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +265 -0
  333. numba_cuda/numba/cuda/tests/cudapy/test_flow_control.py +1433 -0
  334. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +57 -0
  335. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +34 -0
  336. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +69 -0
  337. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +62 -0
  338. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +474 -0
  339. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +167 -0
  340. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +92 -0
  341. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +39 -0
  342. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +170 -0
  343. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +255 -0
  344. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1219 -0
  345. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +263 -0
  346. numba_cuda/numba/cuda/tests/cudapy/test_ir.py +598 -0
  347. numba_cuda/numba/cuda/tests/cudapy/test_ir_utils.py +276 -0
  348. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +101 -0
  349. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +68 -0
  350. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +123 -0
  351. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +194 -0
  352. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +220 -0
  353. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +173 -0
  354. numba_cuda/numba/cuda/tests/cudapy/test_make_function_to_jit_function.py +364 -0
  355. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +47 -0
  356. numba_cuda/numba/cuda/tests/cudapy/test_math.py +842 -0
  357. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +76 -0
  358. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +78 -0
  359. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +25 -0
  360. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +145 -0
  361. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +39 -0
  362. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +82 -0
  363. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +53 -0
  364. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +504 -0
  365. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +93 -0
  366. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +402 -0
  367. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +128 -0
  368. numba_cuda/numba/cuda/tests/cudapy/test_print.py +193 -0
  369. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +37 -0
  370. numba_cuda/numba/cuda/tests/cudapy/test_random.py +117 -0
  371. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +614 -0
  372. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +130 -0
  373. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +94 -0
  374. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  375. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +86 -0
  376. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +40 -0
  377. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +457 -0
  378. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +233 -0
  379. numba_cuda/numba/cuda/tests/cudapy/test_ssa.py +454 -0
  380. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +56 -0
  381. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +277 -0
  382. numba_cuda/numba/cuda/tests/cudapy/test_tracing.py +200 -0
  383. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +90 -0
  384. numba_cuda/numba/cuda/tests/cudapy/test_typeconv.py +333 -0
  385. numba_cuda/numba/cuda/tests/cudapy/test_typeinfer.py +538 -0
  386. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +585 -0
  387. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +42 -0
  388. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +485 -0
  389. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +312 -0
  390. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +23 -0
  391. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +183 -0
  392. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +40 -0
  393. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +40 -0
  394. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +206 -0
  395. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +446 -0
  396. numba_cuda/numba/cuda/tests/cudasim/__init__.py +9 -0
  397. numba_cuda/numba/cuda/tests/cudasim/support.py +9 -0
  398. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +111 -0
  399. numba_cuda/numba/cuda/tests/data/__init__.py +2 -0
  400. numba_cuda/numba/cuda/tests/data/cta_barrier.cu +28 -0
  401. numba_cuda/numba/cuda/tests/data/cuda_include.cu +10 -0
  402. numba_cuda/numba/cuda/tests/data/error.cu +12 -0
  403. numba_cuda/numba/cuda/tests/data/include/add.cuh +8 -0
  404. numba_cuda/numba/cuda/tests/data/jitlink.cu +28 -0
  405. numba_cuda/numba/cuda/tests/data/jitlink.ptx +49 -0
  406. numba_cuda/numba/cuda/tests/data/warn.cu +12 -0
  407. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +9 -0
  408. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +2 -0
  409. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +54 -0
  410. numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +8 -0
  411. numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +14 -0
  412. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +86 -0
  413. numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +68 -0
  414. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +81 -0
  415. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +141 -0
  416. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +160 -0
  417. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +180 -0
  418. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +119 -0
  419. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +66 -0
  420. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +80 -0
  421. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +206 -0
  422. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +53 -0
  423. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +76 -0
  424. numba_cuda/numba/cuda/tests/nocuda/__init__.py +9 -0
  425. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +452 -0
  426. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +48 -0
  427. numba_cuda/numba/cuda/tests/nocuda/test_import.py +63 -0
  428. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +252 -0
  429. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +59 -0
  430. numba_cuda/numba/cuda/tests/nrt/__init__.py +9 -0
  431. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +387 -0
  432. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +124 -0
  433. numba_cuda/numba/cuda/tests/support.py +900 -0
  434. numba_cuda/numba/cuda/typeconv/__init__.py +4 -0
  435. numba_cuda/numba/cuda/typeconv/castgraph.py +137 -0
  436. numba_cuda/numba/cuda/typeconv/rules.py +63 -0
  437. numba_cuda/numba/cuda/typeconv/typeconv.py +121 -0
  438. numba_cuda/numba/cuda/types/__init__.py +233 -0
  439. numba_cuda/numba/cuda/types/__init__.pyi +167 -0
  440. numba_cuda/numba/cuda/types/abstract.py +9 -0
  441. numba_cuda/numba/cuda/types/common.py +9 -0
  442. numba_cuda/numba/cuda/types/containers.py +9 -0
  443. numba_cuda/numba/cuda/types/cuda_abstract.py +533 -0
  444. numba_cuda/numba/cuda/types/cuda_common.py +110 -0
  445. numba_cuda/numba/cuda/types/cuda_containers.py +971 -0
  446. numba_cuda/numba/cuda/types/cuda_function_type.py +230 -0
  447. numba_cuda/numba/cuda/types/cuda_functions.py +798 -0
  448. numba_cuda/numba/cuda/types/cuda_iterators.py +120 -0
  449. numba_cuda/numba/cuda/types/cuda_misc.py +569 -0
  450. numba_cuda/numba/cuda/types/cuda_npytypes.py +690 -0
  451. numba_cuda/numba/cuda/types/cuda_scalars.py +280 -0
  452. numba_cuda/numba/cuda/types/ext_types.py +101 -0
  453. numba_cuda/numba/cuda/types/function_type.py +11 -0
  454. numba_cuda/numba/cuda/types/functions.py +9 -0
  455. numba_cuda/numba/cuda/types/iterators.py +9 -0
  456. numba_cuda/numba/cuda/types/misc.py +9 -0
  457. numba_cuda/numba/cuda/types/npytypes.py +9 -0
  458. numba_cuda/numba/cuda/types/scalars.py +9 -0
  459. numba_cuda/numba/cuda/typing/__init__.py +19 -0
  460. numba_cuda/numba/cuda/typing/arraydecl.py +939 -0
  461. numba_cuda/numba/cuda/typing/asnumbatype.py +130 -0
  462. numba_cuda/numba/cuda/typing/bufproto.py +70 -0
  463. numba_cuda/numba/cuda/typing/builtins.py +1209 -0
  464. numba_cuda/numba/cuda/typing/cffi_utils.py +219 -0
  465. numba_cuda/numba/cuda/typing/cmathdecl.py +47 -0
  466. numba_cuda/numba/cuda/typing/collections.py +138 -0
  467. numba_cuda/numba/cuda/typing/context.py +782 -0
  468. numba_cuda/numba/cuda/typing/ctypes_utils.py +125 -0
  469. numba_cuda/numba/cuda/typing/dictdecl.py +63 -0
  470. numba_cuda/numba/cuda/typing/enumdecl.py +74 -0
  471. numba_cuda/numba/cuda/typing/listdecl.py +147 -0
  472. numba_cuda/numba/cuda/typing/mathdecl.py +158 -0
  473. numba_cuda/numba/cuda/typing/npdatetime.py +322 -0
  474. numba_cuda/numba/cuda/typing/npydecl.py +749 -0
  475. numba_cuda/numba/cuda/typing/setdecl.py +115 -0
  476. numba_cuda/numba/cuda/typing/templates.py +1446 -0
  477. numba_cuda/numba/cuda/typing/typeof.py +301 -0
  478. numba_cuda/numba/cuda/ufuncs.py +746 -0
  479. numba_cuda/numba/cuda/utils.py +724 -0
  480. numba_cuda/numba/cuda/vector_types.py +214 -0
  481. numba_cuda/numba/cuda/vectorizers.py +260 -0
  482. numba_cuda-0.22.0.dist-info/METADATA +109 -0
  483. numba_cuda-0.22.0.dist-info/RECORD +487 -0
  484. numba_cuda-0.22.0.dist-info/WHEEL +6 -0
  485. numba_cuda-0.22.0.dist-info/licenses/LICENSE +26 -0
  486. numba_cuda-0.22.0.dist-info/licenses/LICENSE.numba +24 -0
  487. numba_cuda-0.22.0.dist-info/top_level.txt +1 -0
numba_cuda/numba/cuda/cudadrv/devicearray.py
@@ -0,0 +1,954 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: BSD-2-Clause
+
+ """
+ A CUDA ND Array is recognized by checking the __cuda_memory__ attribute
+ on the object. If it exists and evaluates to True, it must define shape,
+ strides, dtype and size attributes similar to a NumPy ndarray.
+ """
+
+ import math
+ import functools
+ import operator
+ import copy
+ from ctypes import c_void_p
+
+ import numpy as np
+
+ from numba.cuda.cext import _devicearray
+ from numba.cuda.cudadrv import devices, dummyarray
+ from numba.cuda.cudadrv import driver as _driver
+ from numba.cuda import types
+ from numba.cuda.core import config
+ from numba.cuda.np.unsafe.ndarray import to_fixed_tuple
+ from numba.cuda.np.numpy_support import numpy_version
+ from numba.cuda.np import numpy_support
+ from numba.cuda.api_util import prepare_shape_strides_dtype
+ from numba.cuda.core.errors import NumbaPerformanceWarning
+ from warnings import warn
+
+
+ def is_cuda_ndarray(obj):
+     "Check if an object is a CUDA ndarray"
+     return getattr(obj, "__cuda_ndarray__", False)
+
+
+ def verify_cuda_ndarray_interface(obj):
+     "Verify the CUDA ndarray interface for an obj"
+     require_cuda_ndarray(obj)
+
+     def requires_attr(attr, typ):
+         if not hasattr(obj, attr):
+             raise AttributeError(attr)
+         if not isinstance(getattr(obj, attr), typ):
+             raise AttributeError("%s must be of type %s" % (attr, typ))
+
+     requires_attr("shape", tuple)
+     requires_attr("strides", tuple)
+     requires_attr("dtype", np.dtype)
+     requires_attr("size", int)
+
+
+ def require_cuda_ndarray(obj):
+     "Raises ValueError if is_cuda_ndarray(obj) evaluates to False"
+     if not is_cuda_ndarray(obj):
+         raise ValueError("require an cuda ndarray object")
+
+
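These helpers duck-type device arrays rather than testing for a concrete class: any object whose `__cuda_ndarray__` attribute is truthy and which carries the usual metadata passes. A minimal illustration (the `FakeDeviceArray` class is hypothetical, for illustration only):

    import numpy as np

    class FakeDeviceArray:
        # The protocol flags checked by is_cuda_ndarray()
        __cuda_memory__ = True
        __cuda_ndarray__ = True

        def __init__(self):
            self.shape = (4, 4)
            self.strides = (32, 8)
            self.dtype = np.dtype(np.float64)
            self.size = 16

    fake = FakeDeviceArray()
    assert is_cuda_ndarray(fake)
    verify_cuda_ndarray_interface(fake)  # passes: attributes exist with the right types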
+ class DeviceNDArrayBase(_devicearray.DeviceArray):
+     """An on-GPU NDArray representation"""
+
+     __cuda_memory__ = True
+     __cuda_ndarray__ = True  # There must be gpu_data attribute
+
+     def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
+         """
+         Args
+         ----
+
+         shape
+             array shape.
+         strides
+             array strides.
+         dtype
+             data type as np.dtype coercible object.
+         stream
+             cuda stream.
+         gpu_data
+             user provided device memory for the ndarray data buffer
+         """
+         if isinstance(shape, int):
+             shape = (shape,)
+         else:
+             shape = tuple(shape)
+         if isinstance(strides, int):
+             strides = (strides,)
+         else:
+             if strides:
+                 strides = tuple(strides)
+         dtype = np.dtype(dtype)
+         itemsize = dtype.itemsize
+         self.ndim = ndim = len(shape)
+         if len(strides) != ndim:
+             raise ValueError("strides not match ndim")
+         self._dummy = dummy = dummyarray.Array.from_desc(
+             0, shape, strides, itemsize
+         )
+         self.shape = shape = dummy.shape
+         self.strides = strides = dummy.strides
+         self.dtype = dtype
+         self.size = size = dummy.size
+         # prepare gpu memory
+         if size:
+             self.alloc_size = alloc_size = _driver.memory_size_from_info(
+                 shape, strides, itemsize
+             )
+             if gpu_data is None:
+                 gpu_data = devices.get_context().memalloc(alloc_size)
+         else:
+             # Make NULL pointer for empty allocation
+             null = _driver.binding.CUdeviceptr(0)
+             gpu_data = _driver.MemoryPointer(
+                 context=devices.get_context(), pointer=null, size=0
+             )
+             self.alloc_size = 0
+
+         self.gpu_data = gpu_data
+         self.stream = stream
+
+     @property
+     def __cuda_array_interface__(self):
+         if (value := self.device_ctypes_pointer.value) is not None:
+             ptr = value
+         else:
+             ptr = 0
+
+         return {
+             "shape": self.shape,
+             "strides": None if is_contiguous(self) else tuple(self.strides),
+             "data": (ptr, False),
+             "typestr": self.dtype.str,
+             "stream": int(stream) if (stream := self.stream) != 0 else None,
+             "version": 3,
+         }
+
+     def bind(self, stream=0):
+         """Bind a CUDA stream to this object so that all subsequent
+         operations on this array default to the given stream.
+         """
+         clone = copy.copy(self)
+         clone.stream = stream
+         return clone
+
+     @property
+     def T(self):
+         return self.transpose()
+
+     def transpose(self, axes=None):
+         if axes and tuple(axes) == tuple(range(self.ndim)):
+             return self
+         elif self.ndim != 2:
+             msg = "transposing a non-2D DeviceNDArray isn't supported"
+             raise NotImplementedError(msg)
+         elif axes is not None and set(axes) != set(range(self.ndim)):
+             raise ValueError("invalid axes list %r" % (axes,))
+         else:
+             from numba.cuda.kernels.transpose import transpose
+
+             return transpose(self)
+
+     def _default_stream(self, stream):
+         return self.stream if not stream else stream
+
+     @property
+     def _numba_type_(self):
+         """
+         Magic attribute expected by Numba to get the numba type that
+         represents this object.
+         """
+         # Typing considerations:
+         #
+         # 1. The preference is to use 'C' or 'F' layout since this enables
+         # hardcoding stride values into compiled kernels, which is more
+         # efficient than storing a passed-in value in a register.
+         #
+         # 2. If an array is both C- and F-contiguous, prefer 'C' layout as it's
+         # the more likely / common case.
+         #
+         # 3. If an array is broadcast then it must be typed as 'A' - using 'C'
+         # or 'F' does not apply for broadcast arrays, because the strides, some
+         # of which will be 0, will not match those hardcoded in for 'C' or 'F'
+         # layouts.
+
+         broadcast = 0 in self.strides and (self.size != 0)
+
+         if self.flags["C_CONTIGUOUS"] and not broadcast:
+             layout = "C"
+         elif self.flags["F_CONTIGUOUS"] and not broadcast:
+             layout = "F"
+         else:
+             layout = "A"
+
+         dtype = numpy_support.from_dtype(self.dtype)
+         return types.Array(dtype, self.ndim, layout)
+
+     @property
+     def device_ctypes_pointer(self):
+         """Returns the ctypes pointer to the GPU data buffer"""
+         try:
+             # apparently faster in the non-exceptional case
+             return self.gpu_data.device_ctypes_pointer
+         except AttributeError:
+             return c_void_p(0)
+
+     @devices.require_context
+     def copy_to_device(self, ary, stream=0):
+         """Copy `ary` to `self`.
+
+         If `ary` is CUDA memory, perform a device-to-device transfer.
+         Otherwise, perform a host-to-device transfer.
+         """
+         if ary.size == 0:
+             # Nothing to do
+             return
+
+         sentry_contiguous(self)
+         stream = self._default_stream(stream)
+
+         self_core, ary_core = array_core(self), array_core(ary)
+         if _driver.is_device_memory(ary):
+             sentry_contiguous(ary)
+             check_array_compatibility(self_core, ary_core)
+             _driver.device_to_device(self, ary, self.alloc_size, stream=stream)
+         else:
+             # Ensure same contiguity. Only makes a host-side copy if necessary
+             # (i.e., in order to materialize a writable strided view)
+             ary_core = np.array(
+                 ary_core,
+                 order="C" if self_core.flags["C_CONTIGUOUS"] else "F",
+                 subok=True,
+                 copy=(not ary_core.flags["WRITEABLE"])
+                 if numpy_version < (2, 0)
+                 else None,
+             )
+             check_array_compatibility(self_core, ary_core)
+             _driver.host_to_device(
+                 self, ary_core, self.alloc_size, stream=stream
+             )
+
+     @devices.require_context
+     def copy_to_host(self, ary=None, stream=0):
+         """Copy ``self`` to ``ary`` or create a new NumPy ndarray
+         if ``ary`` is ``None``.
+
+         If a CUDA ``stream`` is given, then the transfer will be made
+         asynchronously as part of the given stream. Otherwise, the transfer is
+         synchronous: the function returns after the copy is finished.
+
+         Always returns the host array.
+
+         Example::
+
+             import numpy as np
+             from numba import cuda
+
+             arr = np.arange(1000)
+             d_arr = cuda.to_device(arr)
+
+             my_kernel[100, 100](d_arr)
+
+             result_array = d_arr.copy_to_host()
+         """
+         if any(s < 0 for s in self.strides):
+             msg = "D->H copy not implemented for negative strides: {}"
+             raise NotImplementedError(msg.format(self.strides))
+         assert self.alloc_size >= 0, "Negative memory size"
+         stream = self._default_stream(stream)
+         if ary is None:
+             hostary = np.empty(shape=self.alloc_size, dtype=np.byte)
+         else:
+             check_array_compatibility(self, ary)
+             hostary = ary
+
+         if self.alloc_size != 0:
+             _driver.device_to_host(
+                 hostary, self, self.alloc_size, stream=stream
+             )
+
+         if ary is None:
+             if self.size == 0:
+                 hostary = np.ndarray(
+                     shape=self.shape, dtype=self.dtype, buffer=hostary
+                 )
+             else:
+                 hostary = np.ndarray(
+                     shape=self.shape,
+                     dtype=self.dtype,
+                     strides=self.strides,
+                     buffer=hostary,
+                 )
+         return hostary
+
+     def split(self, section, stream=0):
+         """Split the array into equal partitions of the `section` size.
+         If the array cannot be equally divided, the last section will be
+         smaller.
+         """
+         stream = self._default_stream(stream)
+         if self.ndim != 1:
+             raise ValueError("only support 1d array")
+         if self.strides[0] != self.dtype.itemsize:
+             raise ValueError("only support unit stride")
+         nsect = int(math.ceil(float(self.size) / section))
+         strides = self.strides
+         itemsize = self.dtype.itemsize
+         for i in range(nsect):
+             begin = i * section
+             end = min(begin + section, self.size)
+             shape = (end - begin,)
+             gpu_data = self.gpu_data.view(begin * itemsize, end * itemsize)
+             yield DeviceNDArray(
+                 shape,
+                 strides,
+                 dtype=self.dtype,
+                 stream=stream,
+                 gpu_data=gpu_data,
+             )
+
+     def as_cuda_arg(self):
+         """Returns a device memory object that is used as the argument."""
+         return self.gpu_data
+
+     def get_ipc_handle(self):
+         """
+         Returns an *IpcArrayHandle* object that is safe to serialize and
+         transfer to another process to share the local allocation.
+
+         Note: this feature is only available on Linux.
+         """
+         ipch = devices.get_context().get_ipc_handle(self.gpu_data)
+         desc = dict(shape=self.shape, strides=self.strides, dtype=self.dtype)
+         return IpcArrayHandle(ipc_handle=ipch, array_desc=desc)
+
+     def squeeze(self, axis=None, stream=0):
+         """
+         Remove axes of size one from the array shape.
+
+         Parameters
+         ----------
+         axis : None or int or tuple of ints, optional
+             Subset of dimensions to remove. A `ValueError` is raised if an axis
+             with size greater than one is selected. If `None`, all axes with
+             size one are removed.
+         stream : cuda stream or 0, optional
+             Default stream for the returned view of the array.
+
+         Returns
+         -------
+         DeviceNDArray
+             Squeezed view into the array.
+
+         """
+         new_dummy, _ = self._dummy.squeeze(axis=axis)
+         return DeviceNDArray(
+             shape=new_dummy.shape,
+             strides=new_dummy.strides,
+             dtype=self.dtype,
+             stream=self._default_stream(stream),
+             gpu_data=self.gpu_data,
+         )
+
+     def view(self, dtype):
+         """Returns a new object by reinterpreting the dtype without making a
+         copy of the data.
+         """
+         dtype = np.dtype(dtype)
+         shape = list(self.shape)
+         strides = list(self.strides)
+
+         if self.dtype.itemsize != dtype.itemsize:
+             if not self.is_c_contiguous():
+                 raise ValueError(
+                     "To change to a dtype of a different size,"
+                     " the array must be C-contiguous"
+                 )
+
+             shape[-1], rem = divmod(
+                 shape[-1] * self.dtype.itemsize, dtype.itemsize
+             )
+
+             if rem != 0:
+                 raise ValueError(
+                     "When changing to a larger dtype,"
+                     " its size must be a divisor of the total size in bytes"
+                     " of the last axis of the array."
+                 )
+
+             strides[-1] = dtype.itemsize
+
+         return DeviceNDArray(
+             shape=shape,
+             strides=strides,
+             dtype=dtype,
+             stream=self.stream,
+             gpu_data=self.gpu_data,
+         )
+
+     @property
+     def nbytes(self):
+         # Note: not using `alloc_size`. `alloc_size` reports memory
+         # consumption of the allocation, not the size of the array
+         # https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.nbytes.html
+         return self.dtype.itemsize * self.size
+
+
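A short usage sketch for the base-class API above (assumes a working CUDA context; arrays are normally constructed via `cuda.to_device` rather than by instantiating `DeviceNDArrayBase` directly):

    import numpy as np
    from numba import cuda

    stream = cuda.stream()
    d_arr = cuda.to_device(np.arange(10_000, dtype=np.float32))

    # bind() returns a shallow clone whose operations default to `stream`.
    d_async = d_arr.bind(stream)

    # split() yields unit-stride 1-D views over the same allocation; the
    # last partition is smaller when the size does not divide evenly.
    chunks = list(d_async.split(4096))  # lengths 4096, 4096, 1808

    # Anything consuming __cuda_array_interface__ can use the array; for a
    # contiguous array the interface reports strides as None.
    assert d_arr.__cuda_array_interface__["strides"] is None

    host = d_arr.copy_to_host()  # synchronous D->H copy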
+ class DeviceRecord(DeviceNDArrayBase):
+     """
+     An on-GPU record type
+     """
+
+     def __init__(self, dtype, stream=0, gpu_data=None):
+         shape = ()
+         strides = ()
+         super(DeviceRecord, self).__init__(
+             shape, strides, dtype, stream, gpu_data
+         )
+
+     @property
+     def flags(self):
+         """
+         For `numpy.ndarray` compatibility. Ideally this would return a
+         `np.core.multiarray.flagsobj`, but that needs to be constructed
+         with an existing `numpy.ndarray` (as the C- and F- contiguous flags
+         aren't writeable).
+         """
+         return dict(self._dummy.flags)  # defensive copy
+
+     @property
+     def _numba_type_(self):
+         """
+         Magic attribute expected by Numba to get the numba type that
+         represents this object.
+         """
+         return numpy_support.from_dtype(self.dtype)
+
+     @devices.require_context
+     def __getitem__(self, item):
+         return self._do_getitem(item)
+
+     @devices.require_context
+     def getitem(self, item, stream=0):
+         """Do `__getitem__(item)` with CUDA stream"""
+         return self._do_getitem(item, stream)
+
+     def _do_getitem(self, item, stream=0):
+         stream = self._default_stream(stream)
+         typ, offset = self.dtype.fields[item]
+         newdata = self.gpu_data.view(offset)
+
+         if typ.shape == ():
+             if typ.names is not None:
+                 return DeviceRecord(dtype=typ, stream=stream, gpu_data=newdata)
+             else:
+                 hostary = np.empty(1, dtype=typ)
+                 _driver.device_to_host(
+                     dst=hostary, src=newdata, size=typ.itemsize, stream=stream
+                 )
+                 return hostary[0]
+         else:
+             shape, strides, dtype = prepare_shape_strides_dtype(
+                 typ.shape, None, typ.subdtype[0], "C"
+             )
+             return DeviceNDArray(
+                 shape=shape,
+                 strides=strides,
+                 dtype=dtype,
+                 gpu_data=newdata,
+                 stream=stream,
+             )
+
+     @devices.require_context
+     def __setitem__(self, key, value):
+         return self._do_setitem(key, value)
+
+     @devices.require_context
+     def setitem(self, key, value, stream=0):
+         """Do `__setitem__(key, value)` with CUDA stream"""
+         return self._do_setitem(key, value, stream=stream)
+
+     def _do_setitem(self, key, value, stream=0):
+         stream = self._default_stream(stream)
+
+         # If the record didn't have a default stream, and the user didn't
+         # provide a stream, then we will use the default stream for the
+         # assignment kernel and synchronize on it.
+         synchronous = not stream
+         if synchronous:
+             ctx = devices.get_context()
+             stream = ctx.get_default_stream()
+
+         # (1) prepare LHS
+
+         typ, offset = self.dtype.fields[key]
+         newdata = self.gpu_data.view(offset)
+
+         lhs = type(self)(dtype=typ, stream=stream, gpu_data=newdata)
+
+         # (2) prepare RHS
+
+         rhs, _ = auto_device(lhs.dtype.type(value), stream=stream)
+
+         # (3) do the copy
+
+         _driver.device_to_device(lhs, rhs, rhs.dtype.itemsize, stream)
+
+         if synchronous:
+             stream.synchronize()
+
+
+ @functools.lru_cache
+ def _assign_kernel(ndim):
+     """
+     A separate method so we don't need to compile code every assignment (!).
+
+     :param ndim: We need to have static array sizes for cuda.local.array, so
+                  bake in the number of dimensions into the kernel
+     """
+     from numba import cuda  # circular!
+
+     if ndim == 0:
+         # the (2, ndim) allocation below is not yet supported, so avoid it
+         @cuda.jit
+         def kernel(lhs, rhs):
+             lhs[()] = rhs[()]
+
+         return kernel
+
+     @cuda.jit
+     def kernel(lhs, rhs):
+         location = cuda.grid(1)
+
+         n_elements = 1
+         for i in range(lhs.ndim):
+             n_elements *= lhs.shape[i]
+         if location >= n_elements:
+             # bake n_elements into the kernel, better than passing it in
+             # as another argument.
+             return
+
+         # [0, :] is the to-index (into `lhs`)
+         # [1, :] is the from-index (into `rhs`)
+         idx = cuda.local.array(shape=(2, ndim), dtype=types.int64)
+
+         for i in range(ndim - 1, -1, -1):
+             idx[0, i] = location % lhs.shape[i]
+             idx[1, i] = (location % lhs.shape[i]) * (rhs.shape[i] > 1)
+             location //= lhs.shape[i]
+
+         lhs[to_fixed_tuple(idx[0], ndim)] = rhs[to_fixed_tuple(idx[1], ndim)]
+
+     return kernel
+
+
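The per-thread index arithmetic in `_assign_kernel` decomposes a flat thread id into an N-D coordinate, forcing broadcast dimensions of `rhs` (extent 1) to index 0 via the `* (rhs.shape[i] > 1)` term. A host-side sketch of the same arithmetic, using a hypothetical (3, 4) destination and (1, 4) source:

    def decompose(location, lhs_shape, rhs_shape):
        # Mirrors the loop in _assign_kernel: a write index into lhs and a
        # broadcast-aware read index into rhs.
        ndim = len(lhs_shape)
        to_idx, from_idx = [0] * ndim, [0] * ndim
        for i in range(ndim - 1, -1, -1):
            to_idx[i] = location % lhs_shape[i]
            from_idx[i] = (location % lhs_shape[i]) * (rhs_shape[i] > 1)
            location //= lhs_shape[i]
        return tuple(to_idx), tuple(from_idx)

    # Thread 6 writes lhs[1, 2] and reads rhs[0, 2] (row dim broadcast):
    assert decompose(6, (3, 4), (1, 4)) == ((1, 2), (0, 2))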
+ class DeviceNDArray(DeviceNDArrayBase):
+     """
+     An on-GPU array type
+     """
+
+     def is_f_contiguous(self):
+         """
+         Return true if the array is Fortran-contiguous.
+         """
+         return self._dummy.is_f_contig
+
+     @property
+     def flags(self):
+         """
+         For `numpy.ndarray` compatibility. Ideally this would return a
+         `np.core.multiarray.flagsobj`, but that needs to be constructed
+         with an existing `numpy.ndarray` (as the C- and F- contiguous flags
+         aren't writeable).
+         """
+         return dict(self._dummy.flags)  # defensive copy
+
+     def is_c_contiguous(self):
+         """
+         Return true if the array is C-contiguous.
+         """
+         return self._dummy.is_c_contig
+
+     def __array__(self, dtype=None, copy=None):
+         """
+         :return: a `numpy.ndarray`, so copies to the host.
+         """
+         if copy is False:
+             msg = "`copy=False` is not supported. A copy is always created."
+             raise ValueError(msg)
+         if dtype:
+             return self.copy_to_host().__array__(dtype)
+         else:
+             return self.copy_to_host().__array__()
+
+     def __len__(self):
+         return self.shape[0]
+
+     def reshape(self, *newshape, **kws):
+         """
+         Reshape the array without changing its contents, similarly to
+         :meth:`numpy.ndarray.reshape`. Example::
+
+             d_arr = d_arr.reshape(20, 50, order="F")
+         """
+         if len(newshape) == 1 and isinstance(newshape[0], (tuple, list)):
+             newshape = newshape[0]
+
+         cls = type(self)
+         if newshape == self.shape:
+             # nothing to do
+             return cls(
+                 shape=self.shape,
+                 strides=self.strides,
+                 dtype=self.dtype,
+                 gpu_data=self.gpu_data,
+             )
+
+         newarr, extents = self._dummy.reshape(*newshape, **kws)
+
+         if extents == [self._dummy.extent]:
+             return cls(
+                 shape=newarr.shape,
+                 strides=newarr.strides,
+                 dtype=self.dtype,
+                 gpu_data=self.gpu_data,
+             )
+         else:
+             raise NotImplementedError("operation requires copying")
+
+     def ravel(self, order="C", stream=0):
+         """
+         Flattens a contiguous array without changing its contents, similar to
+         :meth:`numpy.ndarray.ravel`. If the array is not contiguous, raises an
+         exception.
+         """
+         stream = self._default_stream(stream)
+         cls = type(self)
+         newarr, extents = self._dummy.ravel(order=order)
+
+         if extents == [self._dummy.extent]:
+             return cls(
+                 shape=newarr.shape,
+                 strides=newarr.strides,
+                 dtype=self.dtype,
+                 gpu_data=self.gpu_data,
+                 stream=stream,
+             )
+
+         else:
+             raise NotImplementedError("operation requires copying")
+
+     @devices.require_context
+     def __getitem__(self, item):
+         return self._do_getitem(item)
+
+     @devices.require_context
+     def getitem(self, item, stream=0):
+         """Do `__getitem__(item)` with CUDA stream"""
+         return self._do_getitem(item, stream)
+
+     def _do_getitem(self, item, stream=0):
+         stream = self._default_stream(stream)
+
+         arr = self._dummy.__getitem__(item)
+         extents = list(arr.iter_contiguous_extent())
+         cls = type(self)
+         if len(extents) == 1:
+             newdata = self.gpu_data.view(*extents[0])
+
+             if not arr.is_array:
+                 # Check for structured array type (record)
+                 if self.dtype.names is not None:
+                     return DeviceRecord(
+                         dtype=self.dtype, stream=stream, gpu_data=newdata
+                     )
+                 else:
+                     # Element indexing
+                     hostary = np.empty(1, dtype=self.dtype)
+                     _driver.device_to_host(
+                         dst=hostary,
+                         src=newdata,
+                         size=self._dummy.itemsize,
+                         stream=stream,
+                     )
+                     return hostary[0]
+             else:
+                 return cls(
+                     shape=arr.shape,
+                     strides=arr.strides,
+                     dtype=self.dtype,
+                     gpu_data=newdata,
+                     stream=stream,
+                 )
+         else:
+             newdata = self.gpu_data.view(*arr.extent)
+             return cls(
+                 shape=arr.shape,
+                 strides=arr.strides,
+                 dtype=self.dtype,
+                 gpu_data=newdata,
+                 stream=stream,
+             )
+
+     @devices.require_context
+     def __setitem__(self, key, value):
+         return self._do_setitem(key, value)
+
+     @devices.require_context
+     def setitem(self, key, value, stream=0):
+         """Do `__setitem__(key, value)` with CUDA stream"""
+         return self._do_setitem(key, value, stream=stream)
+
+     def _do_setitem(self, key, value, stream=0):
+         stream = self._default_stream(stream)
+
+         # If the array didn't have a default stream, and the user didn't
+         # provide a stream, then we will use the default stream for the
+         # assignment kernel and synchronize on it.
+         synchronous = not stream
+         if synchronous:
+             ctx = devices.get_context()
+             stream = ctx.get_default_stream()
+
+         # (1) prepare LHS
+
+         arr = self._dummy.__getitem__(key)
+         newdata = self.gpu_data.view(*arr.extent)
+
+         if isinstance(arr, dummyarray.Element):
+             # convert to a 0d array
+             shape = ()
+             strides = ()
+         else:
+             shape = arr.shape
+             strides = arr.strides
+
+         lhs = type(self)(
+             shape=shape,
+             strides=strides,
+             dtype=self.dtype,
+             gpu_data=newdata,
+             stream=stream,
+         )
+
+         # (2) prepare RHS
+
+         rhs, _ = auto_device(value, stream=stream, user_explicit=True)
+         if rhs.ndim > lhs.ndim:
+             raise ValueError(
+                 "Can't assign %s-D array to %s-D self" % (rhs.ndim, lhs.ndim)
+             )
+         rhs_shape = np.ones(lhs.ndim, dtype=np.int64)
+         # negative indices would not work if rhs.ndim == 0
+         rhs_shape[lhs.ndim - rhs.ndim :] = rhs.shape
+         rhs = rhs.reshape(*rhs_shape)
+         for i, (l, r) in enumerate(zip(lhs.shape, rhs.shape)):
+             if r != 1 and l != r:
+                 raise ValueError(
+                     "Can't copy sequence with size %d to array "
+                     "axis %d with dimension %d" % (r, i, l)
+                 )
+
+         # (3) do the copy
+
+         n_elements = functools.reduce(operator.mul, lhs.shape, 1)
+         _assign_kernel(lhs.ndim).forall(n_elements, stream=stream)(lhs, rhs)
+         if synchronous:
+             stream.synchronize()
+
+
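A hedged sketch of the zero-copy views and kernel-backed assignment defined above (assumes a CUDA context; the comments describe expected behavior):

    import numpy as np
    from numba import cuda

    d = cuda.to_device(np.arange(12, dtype=np.int32))

    d2 = d.reshape(3, 4)       # zero-copy: same gpu_data, new extents
    col = d2[:, 1]             # strided device view, no transfer
    d2[0] = 7                  # broadcast assignment via _assign_kernel
    f = d.view(np.float32)     # reinterpret dtype without copying

    print(d2.copy_to_host())   # row 0 is now all 7s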
+ class IpcArrayHandle(object):
+     """
+     An IPC array handle that can be serialized and transferred to another
+     process on the same machine to share a GPU allocation.
+
+     On the destination process, use the *.open()* method to create a new
+     *DeviceNDArray* object that shares the allocation from the original
+     process. To release the resources, call the *.close()* method. After
+     that, the destination can no longer use the shared array object. (Note:
+     the underlying weakref to the resource is now dead.)
+
+     This object implements the context-manager interface that calls the
+     *.open()* and *.close()* methods automatically::
+
+         with the_ipc_array_handle as ipc_array:
+             # use ipc_array here as a normal gpu array object
+             some_code(ipc_array)
+         # ipc_array is dead at this point
+     """
+
+     def __init__(self, ipc_handle, array_desc):
+         self._array_desc = array_desc
+         self._ipc_handle = ipc_handle
+
+     def open(self):
+         """
+         Returns a new *DeviceNDArray* that shares the allocation from the
+         original process. Must not be used on the original process.
+         """
+         dptr = self._ipc_handle.open(devices.get_context())
+         return DeviceNDArray(gpu_data=dptr, **self._array_desc)
+
+     def close(self):
+         """
+         Closes the IPC handle to the array.
+         """
+         self._ipc_handle.close()
+
+     def __enter__(self):
+         return self.open()
+
+     def __exit__(self, type, value, traceback):
+         self.close()
+
+
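A minimal cross-process sketch (Linux only, as noted in `get_ipc_handle()`; the handle is picklable, so it can be sent through multiprocessing):

    import multiprocessing as mp
    import numpy as np
    from numba import cuda

    def consumer(handle):
        # open() maps the exported allocation into this process and wraps
        # it in a DeviceNDArray sharing the producer's memory.
        with handle as ipc_array:
            print(ipc_array.copy_to_host()[:4])
        # the handle is closed here; ipc_array must no longer be used

    if __name__ == "__main__":
        d_arr = cuda.to_device(np.arange(16, dtype=np.int64))
        handle = d_arr.get_ipc_handle()
        proc = mp.get_context("spawn").Process(target=consumer, args=(handle,))
        proc.start()
        proc.join()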
+ class MappedNDArray(DeviceNDArrayBase, np.ndarray):
+     """
+     A host array that uses CUDA mapped memory.
+     """
+
+     def device_setup(self, gpu_data, stream=0):
+         self.gpu_data = gpu_data
+         self.stream = stream
+
+
+ class ManagedNDArray(DeviceNDArrayBase, np.ndarray):
+     """
+     A host array that uses CUDA managed memory.
+     """
+
+     def device_setup(self, gpu_data, stream=0):
+         self.gpu_data = gpu_data
+         self.stream = stream
+
+
+ def from_array_like(ary, stream=0, gpu_data=None):
+     "Create a DeviceNDArray object that is like ary."
+     return DeviceNDArray(
+         ary.shape, ary.strides, ary.dtype, stream=stream, gpu_data=gpu_data
+     )
+
+
+ def from_record_like(rec, stream=0, gpu_data=None):
+     "Create a DeviceRecord object that is like rec."
+     return DeviceRecord(rec.dtype, stream=stream, gpu_data=gpu_data)
+
+
+ def array_core(ary):
+     """
+     Extract the repeated core of a broadcast array.
+
+     Broadcast arrays are by definition non-contiguous due to repeated
+     dimensions, i.e., dimensions with stride 0. In order to ascertain memory
+     contiguity and copy the underlying data from such arrays, we must create
+     a view without the repeated dimensions.
+
+     """
+     if not ary.strides or not ary.size:
+         return ary
+     core_index = []
+     for stride in ary.strides:
+         core_index.append(0 if stride == 0 else slice(None))
+     return ary[tuple(core_index)]
+
+
+ def is_contiguous(ary):
+     """
+     Returns True iff `ary` is C-style contiguous while ignoring
+     broadcasted and 1-sized dimensions.
+     As opposed to array_core(), it does not call require_context(),
+     which can be quite expensive.
+     """
+     size = ary.dtype.itemsize
+     for shape, stride in zip(reversed(ary.shape), reversed(ary.strides)):
+         if shape > 1 and stride != 0:
+             if size != stride:
+                 return False
+             size *= shape
+     return True
+
+
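`array_core` and `is_contiguous` together decide whether a broadcast array can still be treated as one contiguous region. Since both only inspect shape, strides, and itemsize, plain NumPy illustrates the intent:

    import numpy as np

    row = np.arange(4, dtype=np.float64)    # contiguous, 8-byte items
    bcast = np.broadcast_to(row, (3, 4))    # strides (0, 8)

    # is_contiguous()-style walk: dimensions with stride 0 (broadcast) or
    # extent 1 are ignored; the rest must pack tightly in C order.
    size = bcast.dtype.itemsize
    contiguous = True
    for extent, stride in zip(reversed(bcast.shape), reversed(bcast.strides)):
        if extent > 1 and stride != 0:
            if size != stride:
                contiguous = False
            size *= extent
    assert contiguous  # the stride-0 dimension is ignored

    # array_core() drops stride-0 dimensions, recovering the 1-D core:
    core = bcast[tuple(0 if s == 0 else slice(None) for s in bcast.strides)]
    assert core.shape == (4,)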
+ errmsg_contiguous_buffer = (
+     "Array contains non-contiguous buffer and cannot "
+     "be transferred as a single memory region. Please "
+     "ensure contiguous buffer with numpy "
+     ".ascontiguousarray()"
+ )
+
+
+ def sentry_contiguous(ary):
+     core = array_core(ary)
+     if not core.flags["C_CONTIGUOUS"] and not core.flags["F_CONTIGUOUS"]:
+         raise ValueError(errmsg_contiguous_buffer)
+
+
+ def auto_device(obj, stream=0, copy=True, user_explicit=False):
+     """
+     Create a DeviceRecord or DeviceArray like obj and optionally copy data
+     from host to device. If obj already represents device memory, it is
+     returned and no copy is made.
+     """
+     if _driver.is_device_memory(obj):
+         return obj, False
+     elif (
+         interface := getattr(obj, "__cuda_array_interface__", None)
+     ) is not None:
+         from numba.cuda.api import from_cuda_array_interface
+
+         return from_cuda_array_interface(interface, owner=obj), False
+     else:
+         if isinstance(obj, np.void):
+             devobj = from_record_like(obj, stream=stream)
+         else:
+             # This allows you to pass non-array objects like constants and
+             # objects implementing the array interface
+             # https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.interface.html
+             # into this function (with no overhead, i.e. no copies, for
+             # `obj`s that are already `ndarray`s).
+             obj = np.array(
+                 obj, copy=False if numpy_version < (2, 0) else None, subok=True
+             )
+             sentry_contiguous(obj)
+             devobj = from_array_like(obj, stream=stream)
+         if copy:
+             if (
+                 config.CUDA_WARN_ON_IMPLICIT_COPY
+                 and not config.DISABLE_PERFORMANCE_WARNINGS
+             ):
+                 if not user_explicit and (
+                     not isinstance(obj, DeviceNDArray)
+                     and isinstance(obj, np.ndarray)
+                 ):
+                     msg = (
+                         "Host array used in CUDA kernel will incur "
+                         "copy overhead to/from device."
+                     )
+                     warn(NumbaPerformanceWarning(msg))
+             devobj.copy_to_device(obj, stream=stream)
+         return devobj, True
+
+
+ def check_array_compatibility(ary1, ary2):
+     ary1sq, ary2sq = ary1.squeeze(), ary2.squeeze()
+     if ary1.dtype != ary2.dtype:
+         raise TypeError(
+             "incompatible dtype: %s vs. %s" % (ary1.dtype, ary2.dtype)
+         )
+     if ary1sq.shape != ary2sq.shape:
+         raise ValueError(
+             "incompatible shape: %s vs. %s" % (ary1.shape, ary2.shape)
+         )
+     # We check strides only if the size is nonzero, because strides are
+     # irrelevant (and can differ) for zero-length copies.
+     if ary1.size and ary1sq.strides != ary2sq.strides:
+         raise ValueError(
+             "incompatible strides: %s vs. %s" % (ary1.strides, ary2.strides)
+         )
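Finally, a sketch of how `auto_device` behaves at its entry points (device memory, CUDA Array Interface objects, and host data), assuming a CUDA context:

    import numpy as np
    from numba import cuda
    from numba.cuda.cudadrv import devicearray

    # Device memory passes straight through, with no copy made.
    d_arr = cuda.to_device(np.ones(4))
    same, converted = devicearray.auto_device(d_arr)
    assert same is d_arr and not converted

    # Host arrays are wrapped and copied; this is the path that emits
    # NumbaPerformanceWarning for implicit copies (unless user_explicit).
    dev, converted = devicearray.auto_device(np.arange(8))
    assert converted and isinstance(dev, devicearray.DeviceNDArray)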