numba-cuda 0.21.1__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (488) hide show
  1. _numba_cuda_redirector.pth +4 -0
  2. _numba_cuda_redirector.py +89 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +6 -0
  5. numba_cuda/_version.py +11 -0
  6. numba_cuda/numba/cuda/__init__.py +70 -0
  7. numba_cuda/numba/cuda/_internal/cuda_bf16.py +16394 -0
  8. numba_cuda/numba/cuda/_internal/cuda_fp16.py +8112 -0
  9. numba_cuda/numba/cuda/api.py +577 -0
  10. numba_cuda/numba/cuda/api_util.py +76 -0
  11. numba_cuda/numba/cuda/args.py +72 -0
  12. numba_cuda/numba/cuda/bf16.py +397 -0
  13. numba_cuda/numba/cuda/cache_hints.py +287 -0
  14. numba_cuda/numba/cuda/cext/__init__.py +2 -0
  15. numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
  16. numba_cuda/numba/cuda/cext/_devicearray.cpp +159 -0
  17. numba_cuda/numba/cuda/cext/_devicearray.h +29 -0
  18. numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
  19. numba_cuda/numba/cuda/cext/_dispatcher.cpp +1098 -0
  20. numba_cuda/numba/cuda/cext/_hashtable.cpp +532 -0
  21. numba_cuda/numba/cuda/cext/_hashtable.h +135 -0
  22. numba_cuda/numba/cuda/cext/_helperlib.c +71 -0
  23. numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
  24. numba_cuda/numba/cuda/cext/_helpermod.c +82 -0
  25. numba_cuda/numba/cuda/cext/_pymodule.h +38 -0
  26. numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
  27. numba_cuda/numba/cuda/cext/_typeconv.cpp +206 -0
  28. numba_cuda/numba/cuda/cext/_typeof.cpp +1159 -0
  29. numba_cuda/numba/cuda/cext/_typeof.h +19 -0
  30. numba_cuda/numba/cuda/cext/capsulethunk.h +111 -0
  31. numba_cuda/numba/cuda/cext/mviewbuf.c +385 -0
  32. numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
  33. numba_cuda/numba/cuda/cext/typeconv.cpp +212 -0
  34. numba_cuda/numba/cuda/cext/typeconv.hpp +101 -0
  35. numba_cuda/numba/cuda/cg.py +67 -0
  36. numba_cuda/numba/cuda/cgutils.py +1294 -0
  37. numba_cuda/numba/cuda/cloudpickle/__init__.py +21 -0
  38. numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +1598 -0
  39. numba_cuda/numba/cuda/cloudpickle/cloudpickle_fast.py +17 -0
  40. numba_cuda/numba/cuda/codegen.py +541 -0
  41. numba_cuda/numba/cuda/compiler.py +1396 -0
  42. numba_cuda/numba/cuda/core/analysis.py +758 -0
  43. numba_cuda/numba/cuda/core/annotations/__init__.py +0 -0
  44. numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +288 -0
  45. numba_cuda/numba/cuda/core/annotations/type_annotations.py +305 -0
  46. numba_cuda/numba/cuda/core/base.py +1332 -0
  47. numba_cuda/numba/cuda/core/boxing.py +1411 -0
  48. numba_cuda/numba/cuda/core/bytecode.py +728 -0
  49. numba_cuda/numba/cuda/core/byteflow.py +2346 -0
  50. numba_cuda/numba/cuda/core/caching.py +744 -0
  51. numba_cuda/numba/cuda/core/callconv.py +392 -0
  52. numba_cuda/numba/cuda/core/codegen.py +171 -0
  53. numba_cuda/numba/cuda/core/compiler.py +199 -0
  54. numba_cuda/numba/cuda/core/compiler_lock.py +85 -0
  55. numba_cuda/numba/cuda/core/compiler_machinery.py +497 -0
  56. numba_cuda/numba/cuda/core/config.py +650 -0
  57. numba_cuda/numba/cuda/core/consts.py +124 -0
  58. numba_cuda/numba/cuda/core/controlflow.py +989 -0
  59. numba_cuda/numba/cuda/core/entrypoints.py +57 -0
  60. numba_cuda/numba/cuda/core/environment.py +66 -0
  61. numba_cuda/numba/cuda/core/errors.py +917 -0
  62. numba_cuda/numba/cuda/core/event.py +511 -0
  63. numba_cuda/numba/cuda/core/funcdesc.py +330 -0
  64. numba_cuda/numba/cuda/core/generators.py +387 -0
  65. numba_cuda/numba/cuda/core/imputils.py +509 -0
  66. numba_cuda/numba/cuda/core/inline_closurecall.py +1787 -0
  67. numba_cuda/numba/cuda/core/interpreter.py +3617 -0
  68. numba_cuda/numba/cuda/core/ir.py +1812 -0
  69. numba_cuda/numba/cuda/core/ir_utils.py +2638 -0
  70. numba_cuda/numba/cuda/core/optional.py +129 -0
  71. numba_cuda/numba/cuda/core/options.py +262 -0
  72. numba_cuda/numba/cuda/core/postproc.py +249 -0
  73. numba_cuda/numba/cuda/core/pythonapi.py +1859 -0
  74. numba_cuda/numba/cuda/core/registry.py +46 -0
  75. numba_cuda/numba/cuda/core/removerefctpass.py +123 -0
  76. numba_cuda/numba/cuda/core/rewrites/__init__.py +26 -0
  77. numba_cuda/numba/cuda/core/rewrites/ir_print.py +91 -0
  78. numba_cuda/numba/cuda/core/rewrites/registry.py +104 -0
  79. numba_cuda/numba/cuda/core/rewrites/static_binop.py +41 -0
  80. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +189 -0
  81. numba_cuda/numba/cuda/core/rewrites/static_raise.py +100 -0
  82. numba_cuda/numba/cuda/core/sigutils.py +68 -0
  83. numba_cuda/numba/cuda/core/ssa.py +498 -0
  84. numba_cuda/numba/cuda/core/targetconfig.py +330 -0
  85. numba_cuda/numba/cuda/core/tracing.py +231 -0
  86. numba_cuda/numba/cuda/core/transforms.py +956 -0
  87. numba_cuda/numba/cuda/core/typed_passes.py +867 -0
  88. numba_cuda/numba/cuda/core/typeinfer.py +1950 -0
  89. numba_cuda/numba/cuda/core/unsafe/__init__.py +0 -0
  90. numba_cuda/numba/cuda/core/unsafe/bytes.py +67 -0
  91. numba_cuda/numba/cuda/core/unsafe/eh.py +67 -0
  92. numba_cuda/numba/cuda/core/unsafe/refcount.py +98 -0
  93. numba_cuda/numba/cuda/core/untyped_passes.py +1979 -0
  94. numba_cuda/numba/cuda/cpython/builtins.py +1153 -0
  95. numba_cuda/numba/cuda/cpython/charseq.py +1218 -0
  96. numba_cuda/numba/cuda/cpython/cmathimpl.py +560 -0
  97. numba_cuda/numba/cuda/cpython/enumimpl.py +103 -0
  98. numba_cuda/numba/cuda/cpython/iterators.py +167 -0
  99. numba_cuda/numba/cuda/cpython/listobj.py +1326 -0
  100. numba_cuda/numba/cuda/cpython/mathimpl.py +499 -0
  101. numba_cuda/numba/cuda/cpython/numbers.py +1475 -0
  102. numba_cuda/numba/cuda/cpython/rangeobj.py +289 -0
  103. numba_cuda/numba/cuda/cpython/slicing.py +322 -0
  104. numba_cuda/numba/cuda/cpython/tupleobj.py +456 -0
  105. numba_cuda/numba/cuda/cpython/unicode.py +2865 -0
  106. numba_cuda/numba/cuda/cpython/unicode_support.py +1597 -0
  107. numba_cuda/numba/cuda/cpython/unsafe/__init__.py +0 -0
  108. numba_cuda/numba/cuda/cpython/unsafe/numbers.py +64 -0
  109. numba_cuda/numba/cuda/cpython/unsafe/tuple.py +92 -0
  110. numba_cuda/numba/cuda/cuda_paths.py +691 -0
  111. numba_cuda/numba/cuda/cudadecl.py +556 -0
  112. numba_cuda/numba/cuda/cudadrv/__init__.py +14 -0
  113. numba_cuda/numba/cuda/cudadrv/devicearray.py +951 -0
  114. numba_cuda/numba/cuda/cudadrv/devices.py +249 -0
  115. numba_cuda/numba/cuda/cudadrv/driver.py +3222 -0
  116. numba_cuda/numba/cuda/cudadrv/drvapi.py +435 -0
  117. numba_cuda/numba/cuda/cudadrv/dummyarray.py +558 -0
  118. numba_cuda/numba/cuda/cudadrv/enums.py +613 -0
  119. numba_cuda/numba/cuda/cudadrv/error.py +48 -0
  120. numba_cuda/numba/cuda/cudadrv/libs.py +220 -0
  121. numba_cuda/numba/cuda/cudadrv/linkable_code.py +184 -0
  122. numba_cuda/numba/cuda/cudadrv/mappings.py +14 -0
  123. numba_cuda/numba/cuda/cudadrv/ndarray.py +26 -0
  124. numba_cuda/numba/cuda/cudadrv/nvrtc.py +193 -0
  125. numba_cuda/numba/cuda/cudadrv/nvvm.py +756 -0
  126. numba_cuda/numba/cuda/cudadrv/rtapi.py +13 -0
  127. numba_cuda/numba/cuda/cudadrv/runtime.py +34 -0
  128. numba_cuda/numba/cuda/cudaimpl.py +995 -0
  129. numba_cuda/numba/cuda/cudamath.py +149 -0
  130. numba_cuda/numba/cuda/datamodel/__init__.py +7 -0
  131. numba_cuda/numba/cuda/datamodel/cuda_manager.py +66 -0
  132. numba_cuda/numba/cuda/datamodel/cuda_models.py +1446 -0
  133. numba_cuda/numba/cuda/datamodel/cuda_packer.py +224 -0
  134. numba_cuda/numba/cuda/datamodel/cuda_registry.py +22 -0
  135. numba_cuda/numba/cuda/datamodel/cuda_testing.py +153 -0
  136. numba_cuda/numba/cuda/datamodel/manager.py +11 -0
  137. numba_cuda/numba/cuda/datamodel/models.py +9 -0
  138. numba_cuda/numba/cuda/datamodel/packer.py +9 -0
  139. numba_cuda/numba/cuda/datamodel/registry.py +11 -0
  140. numba_cuda/numba/cuda/datamodel/testing.py +11 -0
  141. numba_cuda/numba/cuda/debuginfo.py +903 -0
  142. numba_cuda/numba/cuda/decorators.py +294 -0
  143. numba_cuda/numba/cuda/descriptor.py +35 -0
  144. numba_cuda/numba/cuda/device_init.py +158 -0
  145. numba_cuda/numba/cuda/deviceufunc.py +1021 -0
  146. numba_cuda/numba/cuda/dispatcher.py +2463 -0
  147. numba_cuda/numba/cuda/errors.py +72 -0
  148. numba_cuda/numba/cuda/extending.py +697 -0
  149. numba_cuda/numba/cuda/flags.py +178 -0
  150. numba_cuda/numba/cuda/fp16.py +357 -0
  151. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  152. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  153. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  154. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  155. numba_cuda/numba/cuda/include/13/cuda_bf16.h +5118 -0
  156. numba_cuda/numba/cuda/include/13/cuda_bf16.hpp +3865 -0
  157. numba_cuda/numba/cuda/include/13/cuda_fp16.h +5363 -0
  158. numba_cuda/numba/cuda/include/13/cuda_fp16.hpp +3483 -0
  159. numba_cuda/numba/cuda/initialize.py +24 -0
  160. numba_cuda/numba/cuda/intrinsic_wrapper.py +41 -0
  161. numba_cuda/numba/cuda/intrinsics.py +382 -0
  162. numba_cuda/numba/cuda/itanium_mangler.py +214 -0
  163. numba_cuda/numba/cuda/kernels/__init__.py +2 -0
  164. numba_cuda/numba/cuda/kernels/reduction.py +265 -0
  165. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  166. numba_cuda/numba/cuda/libdevice.py +3386 -0
  167. numba_cuda/numba/cuda/libdevicedecl.py +20 -0
  168. numba_cuda/numba/cuda/libdevicefuncs.py +1060 -0
  169. numba_cuda/numba/cuda/libdeviceimpl.py +88 -0
  170. numba_cuda/numba/cuda/locks.py +19 -0
  171. numba_cuda/numba/cuda/lowering.py +1951 -0
  172. numba_cuda/numba/cuda/mathimpl.py +374 -0
  173. numba_cuda/numba/cuda/memory_management/__init__.py +4 -0
  174. numba_cuda/numba/cuda/memory_management/memsys.cu +99 -0
  175. numba_cuda/numba/cuda/memory_management/memsys.cuh +22 -0
  176. numba_cuda/numba/cuda/memory_management/nrt.cu +212 -0
  177. numba_cuda/numba/cuda/memory_management/nrt.cuh +48 -0
  178. numba_cuda/numba/cuda/memory_management/nrt.py +390 -0
  179. numba_cuda/numba/cuda/memory_management/nrt_context.py +438 -0
  180. numba_cuda/numba/cuda/misc/appdirs.py +594 -0
  181. numba_cuda/numba/cuda/misc/cffiimpl.py +24 -0
  182. numba_cuda/numba/cuda/misc/coverage_support.py +43 -0
  183. numba_cuda/numba/cuda/misc/dump_style.py +41 -0
  184. numba_cuda/numba/cuda/misc/findlib.py +75 -0
  185. numba_cuda/numba/cuda/misc/firstlinefinder.py +96 -0
  186. numba_cuda/numba/cuda/misc/gdb_hook.py +240 -0
  187. numba_cuda/numba/cuda/misc/literal.py +28 -0
  188. numba_cuda/numba/cuda/misc/llvm_pass_timings.py +412 -0
  189. numba_cuda/numba/cuda/misc/special.py +94 -0
  190. numba_cuda/numba/cuda/models.py +56 -0
  191. numba_cuda/numba/cuda/np/arraymath.py +5130 -0
  192. numba_cuda/numba/cuda/np/arrayobj.py +7635 -0
  193. numba_cuda/numba/cuda/np/extensions.py +11 -0
  194. numba_cuda/numba/cuda/np/linalg.py +3087 -0
  195. numba_cuda/numba/cuda/np/math/__init__.py +0 -0
  196. numba_cuda/numba/cuda/np/math/cmathimpl.py +558 -0
  197. numba_cuda/numba/cuda/np/math/mathimpl.py +487 -0
  198. numba_cuda/numba/cuda/np/math/numbers.py +1461 -0
  199. numba_cuda/numba/cuda/np/npdatetime.py +969 -0
  200. numba_cuda/numba/cuda/np/npdatetime_helpers.py +217 -0
  201. numba_cuda/numba/cuda/np/npyfuncs.py +1808 -0
  202. numba_cuda/numba/cuda/np/npyimpl.py +1027 -0
  203. numba_cuda/numba/cuda/np/numpy_support.py +798 -0
  204. numba_cuda/numba/cuda/np/polynomial/__init__.py +4 -0
  205. numba_cuda/numba/cuda/np/polynomial/polynomial_core.py +242 -0
  206. numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +380 -0
  207. numba_cuda/numba/cuda/np/ufunc/__init__.py +4 -0
  208. numba_cuda/numba/cuda/np/ufunc/decorators.py +203 -0
  209. numba_cuda/numba/cuda/np/ufunc/sigparse.py +68 -0
  210. numba_cuda/numba/cuda/np/ufunc/ufuncbuilder.py +65 -0
  211. numba_cuda/numba/cuda/np/ufunc_db.py +1282 -0
  212. numba_cuda/numba/cuda/np/unsafe/__init__.py +0 -0
  213. numba_cuda/numba/cuda/np/unsafe/ndarray.py +84 -0
  214. numba_cuda/numba/cuda/nvvmutils.py +254 -0
  215. numba_cuda/numba/cuda/printimpl.py +126 -0
  216. numba_cuda/numba/cuda/random.py +308 -0
  217. numba_cuda/numba/cuda/reshape_funcs.cu +156 -0
  218. numba_cuda/numba/cuda/serialize.py +267 -0
  219. numba_cuda/numba/cuda/simulator/__init__.py +63 -0
  220. numba_cuda/numba/cuda/simulator/_internal/__init__.py +4 -0
  221. numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +2 -0
  222. numba_cuda/numba/cuda/simulator/api.py +179 -0
  223. numba_cuda/numba/cuda/simulator/bf16.py +4 -0
  224. numba_cuda/numba/cuda/simulator/compiler.py +38 -0
  225. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +11 -0
  226. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +462 -0
  227. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +122 -0
  228. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +66 -0
  229. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +7 -0
  230. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +7 -0
  231. numba_cuda/numba/cuda/simulator/cudadrv/error.py +10 -0
  232. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +10 -0
  233. numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +61 -0
  234. numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +11 -0
  235. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +32 -0
  236. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +22 -0
  237. numba_cuda/numba/cuda/simulator/dispatcher.py +11 -0
  238. numba_cuda/numba/cuda/simulator/kernel.py +320 -0
  239. numba_cuda/numba/cuda/simulator/kernelapi.py +509 -0
  240. numba_cuda/numba/cuda/simulator/memory_management/__init__.py +4 -0
  241. numba_cuda/numba/cuda/simulator/memory_management/nrt.py +21 -0
  242. numba_cuda/numba/cuda/simulator/reduction.py +19 -0
  243. numba_cuda/numba/cuda/simulator/tests/support.py +4 -0
  244. numba_cuda/numba/cuda/simulator/vector_types.py +65 -0
  245. numba_cuda/numba/cuda/simulator_init.py +18 -0
  246. numba_cuda/numba/cuda/stubs.py +635 -0
  247. numba_cuda/numba/cuda/target.py +505 -0
  248. numba_cuda/numba/cuda/testing.py +347 -0
  249. numba_cuda/numba/cuda/tests/__init__.py +62 -0
  250. numba_cuda/numba/cuda/tests/benchmarks/__init__.py +0 -0
  251. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +119 -0
  252. numba_cuda/numba/cuda/tests/cloudpickle_main_class.py +9 -0
  253. numba_cuda/numba/cuda/tests/core/serialize_usecases.py +113 -0
  254. numba_cuda/numba/cuda/tests/core/test_itanium_mangler.py +83 -0
  255. numba_cuda/numba/cuda/tests/core/test_serialize.py +371 -0
  256. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +9 -0
  257. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +147 -0
  258. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +161 -0
  259. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +397 -0
  260. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +24 -0
  261. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +180 -0
  262. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +313 -0
  263. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +187 -0
  264. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +621 -0
  265. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +247 -0
  266. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +100 -0
  267. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +198 -0
  268. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +53 -0
  269. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +72 -0
  270. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +138 -0
  271. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +43 -0
  272. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +15 -0
  273. numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py +58 -0
  274. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +348 -0
  275. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +128 -0
  276. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +301 -0
  277. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +174 -0
  278. numba_cuda/numba/cuda/tests/cudadrv/test_nvrtc.py +28 -0
  279. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +185 -0
  280. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +39 -0
  281. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +23 -0
  282. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +38 -0
  283. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +48 -0
  284. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +44 -0
  285. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +127 -0
  286. numba_cuda/numba/cuda/tests/cudapy/__init__.py +9 -0
  287. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +231 -0
  288. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +50 -0
  289. numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +36 -0
  290. numba_cuda/numba/cuda/tests/cudapy/complex_usecases.py +116 -0
  291. numba_cuda/numba/cuda/tests/cudapy/enum_usecases.py +59 -0
  292. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +62 -0
  293. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +28 -0
  294. numba_cuda/numba/cuda/tests/cudapy/overload_usecases.py +33 -0
  295. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +104 -0
  296. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +47 -0
  297. numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +1122 -0
  298. numba_cuda/numba/cuda/tests/cudapy/test_array.py +344 -0
  299. numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +268 -0
  300. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +203 -0
  301. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +63 -0
  302. numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +360 -0
  303. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1815 -0
  304. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +599 -0
  305. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +377 -0
  306. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +160 -0
  307. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +27 -0
  308. numba_cuda/numba/cuda/tests/cudapy/test_byteflow.py +98 -0
  309. numba_cuda/numba/cuda/tests/cudapy/test_cache_hints.py +210 -0
  310. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +683 -0
  311. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +265 -0
  312. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +42 -0
  313. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +718 -0
  314. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +370 -0
  315. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +23 -0
  316. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +142 -0
  317. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +178 -0
  318. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +193 -0
  319. numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +131 -0
  320. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +438 -0
  321. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +94 -0
  322. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +101 -0
  323. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +105 -0
  324. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +889 -0
  325. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo_types.py +476 -0
  326. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +500 -0
  327. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +820 -0
  328. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +152 -0
  329. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +111 -0
  330. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +170 -0
  331. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1088 -0
  332. numba_cuda/numba/cuda/tests/cudapy/test_extending_types.py +71 -0
  333. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +265 -0
  334. numba_cuda/numba/cuda/tests/cudapy/test_flow_control.py +1433 -0
  335. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +57 -0
  336. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +34 -0
  337. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +69 -0
  338. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +62 -0
  339. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +474 -0
  340. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +167 -0
  341. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +92 -0
  342. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +39 -0
  343. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +170 -0
  344. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +255 -0
  345. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1219 -0
  346. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +263 -0
  347. numba_cuda/numba/cuda/tests/cudapy/test_ir.py +598 -0
  348. numba_cuda/numba/cuda/tests/cudapy/test_ir_utils.py +276 -0
  349. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +101 -0
  350. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +68 -0
  351. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +123 -0
  352. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +194 -0
  353. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +220 -0
  354. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +173 -0
  355. numba_cuda/numba/cuda/tests/cudapy/test_make_function_to_jit_function.py +364 -0
  356. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +47 -0
  357. numba_cuda/numba/cuda/tests/cudapy/test_math.py +842 -0
  358. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +76 -0
  359. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +78 -0
  360. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +25 -0
  361. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +145 -0
  362. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +39 -0
  363. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +82 -0
  364. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +53 -0
  365. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +504 -0
  366. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +93 -0
  367. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +402 -0
  368. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +128 -0
  369. numba_cuda/numba/cuda/tests/cudapy/test_print.py +193 -0
  370. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +37 -0
  371. numba_cuda/numba/cuda/tests/cudapy/test_random.py +117 -0
  372. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +614 -0
  373. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +130 -0
  374. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +94 -0
  375. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  376. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +86 -0
  377. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +40 -0
  378. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +457 -0
  379. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +233 -0
  380. numba_cuda/numba/cuda/tests/cudapy/test_ssa.py +454 -0
  381. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +56 -0
  382. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +277 -0
  383. numba_cuda/numba/cuda/tests/cudapy/test_tracing.py +200 -0
  384. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +90 -0
  385. numba_cuda/numba/cuda/tests/cudapy/test_typeconv.py +333 -0
  386. numba_cuda/numba/cuda/tests/cudapy/test_typeinfer.py +538 -0
  387. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +585 -0
  388. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +42 -0
  389. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +485 -0
  390. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +312 -0
  391. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +23 -0
  392. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +183 -0
  393. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +40 -0
  394. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +40 -0
  395. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +206 -0
  396. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +331 -0
  397. numba_cuda/numba/cuda/tests/cudasim/__init__.py +9 -0
  398. numba_cuda/numba/cuda/tests/cudasim/support.py +9 -0
  399. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +111 -0
  400. numba_cuda/numba/cuda/tests/data/__init__.py +2 -0
  401. numba_cuda/numba/cuda/tests/data/cta_barrier.cu +28 -0
  402. numba_cuda/numba/cuda/tests/data/cuda_include.cu +10 -0
  403. numba_cuda/numba/cuda/tests/data/error.cu +12 -0
  404. numba_cuda/numba/cuda/tests/data/include/add.cuh +8 -0
  405. numba_cuda/numba/cuda/tests/data/jitlink.cu +28 -0
  406. numba_cuda/numba/cuda/tests/data/jitlink.ptx +49 -0
  407. numba_cuda/numba/cuda/tests/data/warn.cu +12 -0
  408. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +9 -0
  409. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +2 -0
  410. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +54 -0
  411. numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +8 -0
  412. numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +14 -0
  413. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +86 -0
  414. numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +68 -0
  415. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +81 -0
  416. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +141 -0
  417. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +160 -0
  418. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +180 -0
  419. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +119 -0
  420. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +66 -0
  421. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +80 -0
  422. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +206 -0
  423. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +53 -0
  424. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +76 -0
  425. numba_cuda/numba/cuda/tests/nocuda/__init__.py +9 -0
  426. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +391 -0
  427. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +48 -0
  428. numba_cuda/numba/cuda/tests/nocuda/test_import.py +63 -0
  429. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +252 -0
  430. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +59 -0
  431. numba_cuda/numba/cuda/tests/nrt/__init__.py +9 -0
  432. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +387 -0
  433. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +124 -0
  434. numba_cuda/numba/cuda/tests/support.py +900 -0
  435. numba_cuda/numba/cuda/typeconv/__init__.py +4 -0
  436. numba_cuda/numba/cuda/typeconv/castgraph.py +137 -0
  437. numba_cuda/numba/cuda/typeconv/rules.py +63 -0
  438. numba_cuda/numba/cuda/typeconv/typeconv.py +121 -0
  439. numba_cuda/numba/cuda/types/__init__.py +233 -0
  440. numba_cuda/numba/cuda/types/__init__.pyi +167 -0
  441. numba_cuda/numba/cuda/types/abstract.py +9 -0
  442. numba_cuda/numba/cuda/types/common.py +9 -0
  443. numba_cuda/numba/cuda/types/containers.py +9 -0
  444. numba_cuda/numba/cuda/types/cuda_abstract.py +533 -0
  445. numba_cuda/numba/cuda/types/cuda_common.py +110 -0
  446. numba_cuda/numba/cuda/types/cuda_containers.py +971 -0
  447. numba_cuda/numba/cuda/types/cuda_function_type.py +230 -0
  448. numba_cuda/numba/cuda/types/cuda_functions.py +798 -0
  449. numba_cuda/numba/cuda/types/cuda_iterators.py +120 -0
  450. numba_cuda/numba/cuda/types/cuda_misc.py +569 -0
  451. numba_cuda/numba/cuda/types/cuda_npytypes.py +690 -0
  452. numba_cuda/numba/cuda/types/cuda_scalars.py +280 -0
  453. numba_cuda/numba/cuda/types/ext_types.py +101 -0
  454. numba_cuda/numba/cuda/types/function_type.py +11 -0
  455. numba_cuda/numba/cuda/types/functions.py +9 -0
  456. numba_cuda/numba/cuda/types/iterators.py +9 -0
  457. numba_cuda/numba/cuda/types/misc.py +9 -0
  458. numba_cuda/numba/cuda/types/npytypes.py +9 -0
  459. numba_cuda/numba/cuda/types/scalars.py +9 -0
  460. numba_cuda/numba/cuda/typing/__init__.py +19 -0
  461. numba_cuda/numba/cuda/typing/arraydecl.py +939 -0
  462. numba_cuda/numba/cuda/typing/asnumbatype.py +130 -0
  463. numba_cuda/numba/cuda/typing/bufproto.py +70 -0
  464. numba_cuda/numba/cuda/typing/builtins.py +1209 -0
  465. numba_cuda/numba/cuda/typing/cffi_utils.py +219 -0
  466. numba_cuda/numba/cuda/typing/cmathdecl.py +47 -0
  467. numba_cuda/numba/cuda/typing/collections.py +138 -0
  468. numba_cuda/numba/cuda/typing/context.py +782 -0
  469. numba_cuda/numba/cuda/typing/ctypes_utils.py +125 -0
  470. numba_cuda/numba/cuda/typing/dictdecl.py +63 -0
  471. numba_cuda/numba/cuda/typing/enumdecl.py +74 -0
  472. numba_cuda/numba/cuda/typing/listdecl.py +147 -0
  473. numba_cuda/numba/cuda/typing/mathdecl.py +158 -0
  474. numba_cuda/numba/cuda/typing/npdatetime.py +322 -0
  475. numba_cuda/numba/cuda/typing/npydecl.py +749 -0
  476. numba_cuda/numba/cuda/typing/setdecl.py +115 -0
  477. numba_cuda/numba/cuda/typing/templates.py +1446 -0
  478. numba_cuda/numba/cuda/typing/typeof.py +301 -0
  479. numba_cuda/numba/cuda/ufuncs.py +746 -0
  480. numba_cuda/numba/cuda/utils.py +724 -0
  481. numba_cuda/numba/cuda/vector_types.py +214 -0
  482. numba_cuda/numba/cuda/vectorizers.py +260 -0
  483. numba_cuda-0.21.1.dist-info/METADATA +109 -0
  484. numba_cuda-0.21.1.dist-info/RECORD +488 -0
  485. numba_cuda-0.21.1.dist-info/WHEEL +5 -0
  486. numba_cuda-0.21.1.dist-info/licenses/LICENSE +26 -0
  487. numba_cuda-0.21.1.dist-info/licenses/LICENSE.numba +24 -0
  488. numba_cuda-0.21.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,951 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+ """
5
+ A CUDA ND Array is recognized by checking the __cuda_memory__ attribute
6
+ on the object. If it exists and evaluates to True, it must define shape,
7
+ strides, dtype and size attributes similar to a NumPy ndarray.
8
+ """
9
+
10
+ import math
11
+ import functools
12
+ import operator
13
+ import copy
14
+ from ctypes import c_void_p
15
+
16
+ import numpy as np
17
+
18
+ from numba.cuda.cext import _devicearray
19
+ from numba.cuda.cudadrv import devices, dummyarray
20
+ from numba.cuda.cudadrv import driver as _driver
21
+ from numba.cuda import types
22
+ from numba.cuda.core import config
23
+ from numba.cuda.np.unsafe.ndarray import to_fixed_tuple
24
+ from numba.cuda.np.numpy_support import numpy_version
25
+ from numba.cuda.np import numpy_support
26
+ from numba.cuda.api_util import prepare_shape_strides_dtype
27
+ from numba.cuda.core.errors import NumbaPerformanceWarning
28
+ from warnings import warn
29
+
30
+
31
def is_cuda_ndarray(obj):
    """Check if an object is a CUDA ndarray."""
    # Forward the marker attribute unchanged; a missing attribute maps to
    # False, but a present attribute is returned as-is (not coerced to bool).
    marker = getattr(obj, "__cuda_ndarray__", False)
    return marker
34
+
35
+
36
def verify_cuda_ndarray_interface(obj):
    "Verify the CUDA ndarray interface for an obj"
    require_cuda_ndarray(obj)

    def check_attr(name, expected_type):
        # Both a missing attribute and a wrongly-typed attribute are
        # reported as AttributeError, matching the historical contract.
        if not hasattr(obj, name):
            raise AttributeError(name)
        if not isinstance(getattr(obj, name), expected_type):
            raise AttributeError("%s must be of type %s" % (name, expected_type))

    check_attr("shape", tuple)
    check_attr("strides", tuple)
    check_attr("dtype", np.dtype)
    check_attr("size", int)
50
+
51
+
52
def require_cuda_ndarray(obj):
    "Raises ValueError if is_cuda_ndarray(obj) evaluates False"
    # NOTE: the message wording is kept as-is for backward compatibility
    # with callers/tests matching on it.
    if not is_cuda_ndarray(obj):
        raise ValueError("require an cuda ndarray object")
56
+
57
+
58
class DeviceNDArrayBase(_devicearray.DeviceArray):
    """An on-GPU NDArray representation"""

    # Marker attributes recognized by is_cuda_ndarray() / is_device_memory().
    __cuda_memory__ = True
    __cuda_ndarray__ = True  # There must be gpu_data attribute

    def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
        """
        Args
        ----

        shape
            array shape.
        strides
            array strides.
        dtype
            data type as np.dtype coercible object.
        stream
            cuda stream.
        gpu_data
            user provided device memory for the ndarray data buffer
        """
        # Normalize scalar shape/strides to 1-tuples.
        if isinstance(shape, int):
            shape = (shape,)
        else:
            shape = tuple(shape)
        if isinstance(strides, int):
            strides = (strides,)
        else:
            if strides:
                strides = tuple(strides)
        dtype = np.dtype(dtype)
        itemsize = dtype.itemsize
        self.ndim = ndim = len(shape)
        if len(strides) != ndim:
            raise ValueError("strides not match ndim")
        # The dummy array tracks layout (shape/strides/extents) host-side.
        self._dummy = dummy = dummyarray.Array.from_desc(
            0, shape, strides, itemsize
        )
        self.shape = shape = dummy.shape
        self.strides = strides = dummy.strides
        self.dtype = dtype
        self.size = size = dummy.size
        # prepare gpu memory
        if size:
            self.alloc_size = alloc_size = _driver.memory_size_from_info(
                shape, strides, itemsize
            )
            if gpu_data is None:
                gpu_data = devices.get_context().memalloc(alloc_size)
        else:
            # Make NULL pointer for empty allocation
            null = _driver.binding.CUdeviceptr(0)
            gpu_data = _driver.MemoryPointer(pointer=null, size=0)
            self.alloc_size = 0

        self.gpu_data = gpu_data
        self.stream = stream

    @property
    def __cuda_array_interface__(self):
        # CUDA Array Interface (version 3) dictionary.
        if (value := self.device_ctypes_pointer.value) is not None:
            ptr = value
        else:
            ptr = 0

        return {
            "shape": self.shape,
            # Contiguous arrays advertise strides=None per the CAI spec.
            "strides": None if is_contiguous(self) else tuple(self.strides),
            "data": (ptr, False),
            "typestr": self.dtype.str,
            "stream": int(stream) if (stream := self.stream) != 0 else None,
            "version": 3,
        }

    def bind(self, stream=0):
        """Bind a CUDA stream to this object so that all subsequent operation
        on this array defaults to the given stream.
        """
        # Shallow copy: the clone shares gpu_data with self.
        clone = copy.copy(self)
        clone.stream = stream
        return clone

    @property
    def T(self):
        # NumPy-style transpose shortcut.
        return self.transpose()

    def transpose(self, axes=None):
        # Identity permutation: no work needed. Only 2-D transposition is
        # implemented (via a dedicated kernel).
        if axes and tuple(axes) == tuple(range(self.ndim)):
            return self
        elif self.ndim != 2:
            msg = "transposing a non-2D DeviceNDArray isn't supported"
            raise NotImplementedError(msg)
        elif axes is not None and set(axes) != set(range(self.ndim)):
            raise ValueError("invalid axes list %r" % (axes,))
        else:
            from numba.cuda.kernels.transpose import transpose

            return transpose(self)

    def _default_stream(self, stream):
        # An explicit stream argument overrides the bound stream.
        return self.stream if not stream else stream

    @property
    def _numba_type_(self):
        """
        Magic attribute expected by Numba to get the numba type that
        represents this object.
        """
        # Typing considerations:
        #
        # 1. The preference is to use 'C' or 'F' layout since this enables
        # hardcoding stride values into compiled kernels, which is more
        # efficient than storing a passed-in value in a register.
        #
        # 2. If an array is both C- and F-contiguous, prefer 'C' layout as it's
        # the more likely / common case.
        #
        # 3. If an array is broadcast then it must be typed as 'A' - using 'C'
        # or 'F' does not apply for broadcast arrays, because the strides, some
        # of which will be 0, will not match those hardcoded in for 'C' or 'F'
        # layouts.

        broadcast = 0 in self.strides
        if self.flags["C_CONTIGUOUS"] and not broadcast:
            layout = "C"
        elif self.flags["F_CONTIGUOUS"] and not broadcast:
            layout = "F"
        else:
            layout = "A"

        dtype = numpy_support.from_dtype(self.dtype)
        return types.Array(dtype, self.ndim, layout)

    @property
    def device_ctypes_pointer(self):
        """Returns the ctypes pointer to the GPU data buffer"""
        try:
            # apparently faster in the non-exceptional case
            return self.gpu_data.device_ctypes_pointer
        except AttributeError:
            return c_void_p(0)

    @devices.require_context
    def copy_to_device(self, ary, stream=0):
        """Copy `ary` to `self`.

        If `ary` is a CUDA memory, perform a device-to-device transfer.
        Otherwise, perform a host-to-device transfer.
        """
        if ary.size == 0:
            # Nothing to do
            return

        sentry_contiguous(self)
        stream = self._default_stream(stream)

        self_core, ary_core = array_core(self), array_core(ary)
        if _driver.is_device_memory(ary):
            sentry_contiguous(ary)
            check_array_compatibility(self_core, ary_core)
            _driver.device_to_device(self, ary, self.alloc_size, stream=stream)
        else:
            # Ensure same contiguity. Only makes a host-side copy if necessary
            # (i.e., in order to materialize a writable strided view)
            ary_core = np.array(
                ary_core,
                order="C" if self_core.flags["C_CONTIGUOUS"] else "F",
                subok=True,
                copy=(not ary_core.flags["WRITEABLE"])
                if numpy_version < (2, 0)
                else None,
            )
            check_array_compatibility(self_core, ary_core)
            _driver.host_to_device(
                self, ary_core, self.alloc_size, stream=stream
            )

    @devices.require_context
    def copy_to_host(self, ary=None, stream=0):
        """Copy ``self`` to ``ary`` or create a new Numpy ndarray
        if ``ary`` is ``None``.

        If a CUDA ``stream`` is given, then the transfer will be made
        asynchronously as part of the given stream. Otherwise, the transfer is
        synchronous: the function returns after the copy is finished.

        Always returns the host array.

        Example::

            import numpy as np
            from numba import cuda

            arr = np.arange(1000)
            d_arr = cuda.to_device(arr)

            my_kernel[100, 100](d_arr)

            result_array = d_arr.copy_to_host()
        """
        if any(s < 0 for s in self.strides):
            msg = "D->H copy not implemented for negative strides: {}"
            raise NotImplementedError(msg.format(self.strides))
        assert self.alloc_size >= 0, "Negative memory size"
        stream = self._default_stream(stream)
        if ary is None:
            # Raw byte buffer sized to the device allocation; wrapped with the
            # real shape/strides/dtype below.
            hostary = np.empty(shape=self.alloc_size, dtype=np.byte)
        else:
            check_array_compatibility(self, ary)
            hostary = ary

        if self.alloc_size != 0:
            _driver.device_to_host(
                hostary, self, self.alloc_size, stream=stream
            )

        if ary is None:
            if self.size == 0:
                hostary = np.ndarray(
                    shape=self.shape, dtype=self.dtype, buffer=hostary
                )
            else:
                hostary = np.ndarray(
                    shape=self.shape,
                    dtype=self.dtype,
                    strides=self.strides,
                    buffer=hostary,
                )
        return hostary

    def split(self, section, stream=0):
        """Split the array into equal partition of the `section` size.
        If the array cannot be equally divided, the last section will be
        smaller.

        Yields DeviceNDArray views (no data is copied).
        """
        stream = self._default_stream(stream)
        if self.ndim != 1:
            raise ValueError("only support 1d array")
        if self.strides[0] != self.dtype.itemsize:
            raise ValueError("only support unit stride")
        nsect = int(math.ceil(float(self.size) / section))
        strides = self.strides
        itemsize = self.dtype.itemsize
        for i in range(nsect):
            begin = i * section
            end = min(begin + section, self.size)
            shape = (end - begin,)
            gpu_data = self.gpu_data.view(begin * itemsize, end * itemsize)
            yield DeviceNDArray(
                shape,
                strides,
                dtype=self.dtype,
                stream=stream,
                gpu_data=gpu_data,
            )

    def as_cuda_arg(self):
        """Returns a device memory object that is used as the argument."""
        return self.gpu_data

    def get_ipc_handle(self):
        """
        Returns a *IpcArrayHandle* object that is safe to serialize and transfer
        to another process to share the local allocation.

        Note: this feature is only available on Linux.
        """
        ipch = devices.get_context().get_ipc_handle(self.gpu_data)
        desc = dict(shape=self.shape, strides=self.strides, dtype=self.dtype)
        return IpcArrayHandle(ipc_handle=ipch, array_desc=desc)

    def squeeze(self, axis=None, stream=0):
        """
        Remove axes of size one from the array shape.

        Parameters
        ----------
        axis : None or int or tuple of ints, optional
            Subset of dimensions to remove. A `ValueError` is raised if an axis
            with size greater than one is selected. If `None`, all axes with
            size one are removed.
        stream : cuda stream or 0, optional
            Default stream for the returned view of the array.

        Returns
        -------
        DeviceNDArray
            Squeezed view into the array.

        """
        new_dummy, _ = self._dummy.squeeze(axis=axis)
        return DeviceNDArray(
            shape=new_dummy.shape,
            strides=new_dummy.strides,
            dtype=self.dtype,
            stream=self._default_stream(stream),
            gpu_data=self.gpu_data,
        )

    def view(self, dtype):
        """Returns a new object by reinterpreting the dtype without making a
        copy of the data.
        """
        dtype = np.dtype(dtype)
        shape = list(self.shape)
        strides = list(self.strides)

        if self.dtype.itemsize != dtype.itemsize:
            if not self.is_c_contiguous():
                raise ValueError(
                    "To change to a dtype of a different size,"
                    " the array must be C-contiguous"
                )

            # Reinterpretation changes the extent of the last axis.
            shape[-1], rem = divmod(
                shape[-1] * self.dtype.itemsize, dtype.itemsize
            )

            if rem != 0:
                raise ValueError(
                    "When changing to a larger dtype,"
                    " its size must be a divisor of the total size in bytes"
                    " of the last axis of the array."
                )

            strides[-1] = dtype.itemsize

        return DeviceNDArray(
            shape=shape,
            strides=strides,
            dtype=dtype,
            stream=self.stream,
            gpu_data=self.gpu_data,
        )

    @property
    def nbytes(self):
        # Note: not using `alloc_size`. `alloc_size` reports memory
        # consumption of the allocation, not the size of the array
        # https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.nbytes.html
        return self.dtype.itemsize * self.size
400
+
401
+
402
class DeviceRecord(DeviceNDArrayBase):
    """
    An on-GPU record type
    """

    def __init__(self, dtype, stream=0, gpu_data=None):
        # A record is a 0-d array: empty shape and strides.
        shape = ()
        strides = ()
        super(DeviceRecord, self).__init__(
            shape, strides, dtype, stream, gpu_data
        )

    @property
    def flags(self):
        """
        For `numpy.ndarray` compatibility. Ideally this would return a
        `np.core.multiarray.flagsobj`, but that needs to be constructed
        with an existing `numpy.ndarray` (as the C- and F- contiguous flags
        aren't writeable).
        """
        return dict(self._dummy.flags)  # defensive copy

    @property
    def _numba_type_(self):
        """
        Magic attribute expected by Numba to get the numba type that
        represents this object.
        """
        return numpy_support.from_dtype(self.dtype)

    @devices.require_context
    def __getitem__(self, item):
        return self._do_getitem(item)

    @devices.require_context
    def getitem(self, item, stream=0):
        """Do `__getitem__(item)` with CUDA stream"""
        return self._do_getitem(item, stream)

    def _do_getitem(self, item, stream=0):
        # Field access: locate the field inside the record and view the
        # device buffer at that byte offset.
        stream = self._default_stream(stream)
        typ, offset = self.dtype.fields[item]
        newdata = self.gpu_data.view(offset)

        if typ.shape == ():
            if typ.names is not None:
                # Nested record field.
                return DeviceRecord(dtype=typ, stream=stream, gpu_data=newdata)
            else:
                # Scalar field: copy the single element back to the host.
                hostary = np.empty(1, dtype=typ)
                _driver.device_to_host(
                    dst=hostary, src=newdata, size=typ.itemsize, stream=stream
                )
                return hostary[0]
        else:
            # Sub-array field: expose it as a C-ordered device array view.
            shape, strides, dtype = prepare_shape_strides_dtype(
                typ.shape, None, typ.subdtype[0], "C"
            )
            return DeviceNDArray(
                shape=shape,
                strides=strides,
                dtype=dtype,
                gpu_data=newdata,
                stream=stream,
            )

    @devices.require_context
    def __setitem__(self, key, value):
        return self._do_setitem(key, value)

    @devices.require_context
    def setitem(self, key, value, stream=0):
        """Do `__setitem__(key, value)` with CUDA stream"""
        return self._do_setitem(key, value, stream=stream)

    def _do_setitem(self, key, value, stream=0):
        stream = self._default_stream(stream)

        # If the record didn't have a default stream, and the user didn't
        # provide a stream, then we will use the default stream for the
        # assignment kernel and synchronize on it.
        synchronous = not stream
        if synchronous:
            ctx = devices.get_context()
            stream = ctx.get_default_stream()

        # (1) prepare LHS

        typ, offset = self.dtype.fields[key]
        newdata = self.gpu_data.view(offset)

        lhs = type(self)(dtype=typ, stream=stream, gpu_data=newdata)

        # (2) prepare RHS

        rhs, _ = auto_device(lhs.dtype.type(value), stream=stream)

        # (3) do the copy

        _driver.device_to_device(lhs, rhs, rhs.dtype.itemsize, stream)

        if synchronous:
            stream.synchronize()
504
+
505
+
506
@functools.lru_cache
def _assign_kernel(ndim):
    """
    A separate method so we don't need to compile code every assignment (!).

    :param ndim: We need to have static array sizes for cuda.local.array, so
        bake in the number of dimensions into the kernel
    """
    from numba import cuda  # circular!

    if ndim == 0:
        # the (2, ndim) allocation below is not yet supported, so avoid it
        @cuda.jit
        def kernel(lhs, rhs):
            lhs[()] = rhs[()]

        return kernel

    @cuda.jit
    def kernel(lhs, rhs):
        # One thread per destination element; flat index over lhs.
        location = cuda.grid(1)

        n_elements = 1
        for i in range(lhs.ndim):
            n_elements *= lhs.shape[i]
        if location >= n_elements:
            # bake n_elements into the kernel, better than passing it in
            # as another argument.
            return

        # [0, :] is the to-index (into `lhs`)
        # [1, :] is the from-index (into `rhs`)
        idx = cuda.local.array(shape=(2, ndim), dtype=types.int64)

        # Unravel the flat index into per-dimension indices; the rhs index
        # is clamped to 0 on broadcast (size-1) dimensions.
        for i in range(ndim - 1, -1, -1):
            idx[0, i] = location % lhs.shape[i]
            idx[1, i] = (location % lhs.shape[i]) * (rhs.shape[i] > 1)
            location //= lhs.shape[i]

        lhs[to_fixed_tuple(idx[0], ndim)] = rhs[to_fixed_tuple(idx[1], ndim)]

    return kernel
548
+
549
+
550
class DeviceNDArray(DeviceNDArrayBase):
    """
    An on-GPU array type
    """

    def is_f_contiguous(self):
        """
        Return true if the array is Fortran-contiguous.
        """
        return self._dummy.is_f_contig

    @property
    def flags(self):
        """
        For `numpy.ndarray` compatibility. Ideally this would return a
        `np.core.multiarray.flagsobj`, but that needs to be constructed
        with an existing `numpy.ndarray` (as the C- and F- contiguous flags
        aren't writeable).
        """
        return dict(self._dummy.flags)  # defensive copy

    def is_c_contiguous(self):
        """
        Return true if the array is C-contiguous.
        """
        return self._dummy.is_c_contig

    def __array__(self, dtype=None, copy=None):
        """
        :return: an `numpy.ndarray`, so copies to the host.
        """
        if copy is False:
            msg = "`copy=False` is not supported. A copy is always created."
            raise ValueError(msg)
        if dtype:
            return self.copy_to_host().__array__(dtype)
        else:
            return self.copy_to_host().__array__()

    def __len__(self):
        return self.shape[0]

    def reshape(self, *newshape, **kws):
        """
        Reshape the array without changing its contents, similarly to
        :meth:`numpy.ndarray.reshape`. Example::

            d_arr = d_arr.reshape(20, 50, order="F")
        """
        # Accept both reshape(a, b) and reshape((a, b)).
        if len(newshape) == 1 and isinstance(newshape[0], (tuple, list)):
            newshape = newshape[0]

        cls = type(self)
        if newshape == self.shape:
            # nothing to do
            return cls(
                shape=self.shape,
                strides=self.strides,
                dtype=self.dtype,
                gpu_data=self.gpu_data,
            )

        newarr, extents = self._dummy.reshape(*newshape, **kws)

        # A single extent equal to the whole array means a zero-copy view
        # is possible; otherwise a data copy would be required.
        if extents == [self._dummy.extent]:
            return cls(
                shape=newarr.shape,
                strides=newarr.strides,
                dtype=self.dtype,
                gpu_data=self.gpu_data,
            )
        else:
            raise NotImplementedError("operation requires copying")

    def ravel(self, order="C", stream=0):
        """
        Flattens a contiguous array without changing its contents, similar to
        :meth:`numpy.ndarray.ravel`. If the array is not contiguous, raises an
        exception.
        """
        stream = self._default_stream(stream)
        cls = type(self)
        newarr, extents = self._dummy.ravel(order=order)

        if extents == [self._dummy.extent]:
            return cls(
                shape=newarr.shape,
                strides=newarr.strides,
                dtype=self.dtype,
                gpu_data=self.gpu_data,
                stream=stream,
            )

        else:
            raise NotImplementedError("operation requires copying")

    @devices.require_context
    def __getitem__(self, item):
        return self._do_getitem(item)

    @devices.require_context
    def getitem(self, item, stream=0):
        """Do `__getitem__(item)` with CUDA stream"""
        return self._do_getitem(item, stream)

    def _do_getitem(self, item, stream=0):
        stream = self._default_stream(stream)

        # Index the host-side layout descriptor to get the resulting view.
        arr = self._dummy.__getitem__(item)
        extents = list(arr.iter_contiguous_extent())
        cls = type(self)
        if len(extents) == 1:
            newdata = self.gpu_data.view(*extents[0])

            if not arr.is_array:
                # Check for structured array type (record)
                if self.dtype.names is not None:
                    return DeviceRecord(
                        dtype=self.dtype, stream=stream, gpu_data=newdata
                    )
                else:
                    # Element indexing
                    hostary = np.empty(1, dtype=self.dtype)
                    _driver.device_to_host(
                        dst=hostary,
                        src=newdata,
                        size=self._dummy.itemsize,
                        stream=stream,
                    )
                    return hostary[0]
            else:
                return cls(
                    shape=arr.shape,
                    strides=arr.strides,
                    dtype=self.dtype,
                    gpu_data=newdata,
                    stream=stream,
                )
        else:
            # Non-contiguous result: view the full extent of the slice.
            newdata = self.gpu_data.view(*arr.extent)
            return cls(
                shape=arr.shape,
                strides=arr.strides,
                dtype=self.dtype,
                gpu_data=newdata,
                stream=stream,
            )

    @devices.require_context
    def __setitem__(self, key, value):
        return self._do_setitem(key, value)

    @devices.require_context
    def setitem(self, key, value, stream=0):
        """Do `__setitem__(key, value)` with CUDA stream"""
        return self._do_setitem(key, value, stream=stream)

    def _do_setitem(self, key, value, stream=0):
        stream = self._default_stream(stream)

        # If the array didn't have a default stream, and the user didn't provide
        # a stream, then we will use the default stream for the assignment
        # kernel and synchronize on it.
        synchronous = not stream
        if synchronous:
            ctx = devices.get_context()
            stream = ctx.get_default_stream()

        # (1) prepare LHS

        arr = self._dummy.__getitem__(key)
        newdata = self.gpu_data.view(*arr.extent)

        if isinstance(arr, dummyarray.Element):
            # convert to a 0d array
            shape = ()
            strides = ()
        else:
            shape = arr.shape
            strides = arr.strides

        lhs = type(self)(
            shape=shape,
            strides=strides,
            dtype=self.dtype,
            gpu_data=newdata,
            stream=stream,
        )

        # (2) prepare RHS

        rhs, _ = auto_device(value, stream=stream, user_explicit=True)
        if rhs.ndim > lhs.ndim:
            raise ValueError(
                "Can't assign %s-D array to %s-D self" % (rhs.ndim, lhs.ndim)
            )
        # Left-pad the rhs shape with ones so it broadcasts against lhs.
        rhs_shape = np.ones(lhs.ndim, dtype=np.int64)
        # negative indices would not work if rhs.ndim == 0
        rhs_shape[lhs.ndim - rhs.ndim :] = rhs.shape
        rhs = rhs.reshape(*rhs_shape)
        for i, (l, r) in enumerate(zip(lhs.shape, rhs.shape)):
            if r != 1 and l != r:
                raise ValueError(
                    "Can't copy sequence with size %d to array "
                    "axis %d with dimension %d" % (r, i, l)
                )

        # (3) do the copy

        n_elements = functools.reduce(operator.mul, lhs.shape, 1)
        _assign_kernel(lhs.ndim).forall(n_elements, stream=stream)(lhs, rhs)
        if synchronous:
            stream.synchronize()
763
+
764
+
765
class IpcArrayHandle(object):
    """
    An IPC array handle that can be serialized and transferred to another
    process on the same machine to share a GPU allocation.

    On the destination process, use the *.open()* method to create a new
    *DeviceNDArray* object that shares the allocation from the original
    process. To release the resources, call the *.close()* method. After
    that, the destination can no longer use the shared array object. (Note:
    the underlying weakref to the resource is now dead.)

    This object implements the context-manager interface that calls the
    *.open()* and *.close()* method automatically::

        with the_ipc_array_handle as ipc_array:
            # use ipc_array here as a normal gpu array object
            some_code(ipc_array)
        # ipc_array is dead at this point
    """

    def __init__(self, ipc_handle, array_desc):
        # array_desc holds the shape/strides/dtype needed to rebuild the
        # DeviceNDArray on the receiving side.
        self._ipc_handle = ipc_handle
        self._array_desc = array_desc

    def open(self):
        """
        Returns a new *DeviceNDArray* that shares the allocation from the
        original process. Must not be used on the original process.
        """
        pointer = self._ipc_handle.open(devices.get_context())
        return DeviceNDArray(gpu_data=pointer, **self._array_desc)

    def close(self):
        """
        Closes the IPC handle to the array.
        """
        self._ipc_handle.close()

    def __enter__(self):
        return self.open()

    def __exit__(self, type, value, traceback):
        self.close()
808
+
809
+
810
class MappedNDArray(DeviceNDArrayBase, np.ndarray):
    """
    A host array that uses CUDA mapped memory.
    """

    def device_setup(self, gpu_data, stream=0):
        # Attach the device-side buffer and the default stream after the
        # ndarray machinery has constructed the host view.
        self.stream = stream
        self.gpu_data = gpu_data
818
+
819
+
820
class ManagedNDArray(DeviceNDArrayBase, np.ndarray):
    """
    A host array that uses CUDA managed memory.
    """

    def device_setup(self, gpu_data, stream=0):
        # Attach the device-side buffer and the default stream after the
        # ndarray machinery has constructed the host view.
        self.stream = stream
        self.gpu_data = gpu_data
828
+
829
+
830
def from_array_like(ary, stream=0, gpu_data=None):
    """Create a DeviceNDArray object that is like ary."""
    # Mirror the host array's layout; reuse gpu_data when supplied.
    return DeviceNDArray(
        ary.shape,
        ary.strides,
        ary.dtype,
        stream=stream,
        gpu_data=gpu_data,
    )
835
+
836
+
837
def from_record_like(rec, stream=0, gpu_data=None):
    """Create a DeviceRecord object that is like rec."""
    # Only the dtype matters for a 0-d record.
    return DeviceRecord(rec.dtype, stream=stream, gpu_data=gpu_data)
840
+
841
+
842
def array_core(ary):
    """
    Extract the repeated core of a broadcast array.

    Broadcast arrays are by definition non-contiguous due to repeated
    dimensions, i.e., dimensions with stride 0. In order to ascertain memory
    contiguity and copy the underlying data from such arrays, we must create
    a view without the repeated dimensions.

    """
    # 0-d arrays (empty strides tuple) and empty arrays pass through as-is.
    if not ary.strides or not ary.size:
        return ary
    # Collapse each stride-0 (broadcast) dimension to a single element;
    # keep the full extent of all other dimensions.
    selector = tuple(
        0 if stride == 0 else slice(None) for stride in ary.strides
    )
    return ary[selector]
858
+
859
+
860
def is_contiguous(ary):
    """
    Returns True iff `ary` is C-style contiguous while ignoring
    broadcasted and 1-sized dimensions.
    As opposed to array_core(), it does not call require_context(),
    which can be quite expensive.
    """
    expected = ary.dtype.itemsize
    # Walk the dimensions from the innermost outwards. Every dimension of
    # extent > 1 that is not broadcast (stride 0) must be packed exactly
    # against the dimensions inside it.
    for extent, step in zip(reversed(ary.shape), reversed(ary.strides)):
        if extent > 1 and step != 0:
            if step != expected:
                return False
            expected *= extent
    return True
874
+
875
+
876
# Shared error message raised by sentry_contiguous() when a transfer would
# require more than one memory region.
errmsg_contiguous_buffer = (
    "Array contains non-contiguous buffer and cannot "
    "be transferred as a single memory region. Please "
    "ensure contiguous buffer with numpy "
    ".ascontiguousarray()"
)
882
+
883
+
884
def sentry_contiguous(ary):
    """Raise ValueError unless `ary`'s broadcast core is contiguous."""
    flags = array_core(ary).flags
    if not (flags["C_CONTIGUOUS"] or flags["F_CONTIGUOUS"]):
        raise ValueError(errmsg_contiguous_buffer)
888
+
889
+
890
def auto_device(obj, stream=0, copy=True, user_explicit=False):
    """
    Create a DeviceRecord or DeviceArray like obj and optionally copy data from
    host to device. If obj already represents device memory, it is returned and
    no copy is made.

    Returns a ``(device_object, copied)`` tuple, where ``copied`` indicates
    whether a new device object was created here.
    """
    if _driver.is_device_memory(obj):
        # Already device memory: pass through untouched.
        return obj, False
    elif (
        interface := getattr(obj, "__cuda_array_interface__", None)
    ) is not None:
        # Foreign device array (CUDA Array Interface): wrap without copying.
        from numba.cuda.api import from_cuda_array_interface

        return from_cuda_array_interface(interface, owner=obj), False
    else:
        if isinstance(obj, np.void):
            devobj = from_record_like(obj, stream=stream)
        else:
            # This allows you to pass non-array objects like constants and
            # objects implementing the array interface
            # https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.interface.html
            # into this function (with no overhead -- copies -- for `obj`s
            # that are already `ndarray`s.
            obj = np.array(
                obj, copy=False if numpy_version < (2, 0) else None, subok=True
            )
            sentry_contiguous(obj)
            devobj = from_array_like(obj, stream=stream)
        if copy:
            if (
                config.CUDA_WARN_ON_IMPLICIT_COPY
                and not config.DISABLE_PERFORMANCE_WARNINGS
            ):
                # Only warn for implicit host->device copies of real host
                # ndarrays; explicit user requests are exempt.
                if not user_explicit and (
                    not isinstance(obj, DeviceNDArray)
                    and isinstance(obj, np.ndarray)
                ):
                    msg = (
                        "Host array used in CUDA kernel will incur "
                        "copy overhead to/from device."
                    )
                    warn(NumbaPerformanceWarning(msg))
            devobj.copy_to_device(obj, stream=stream)
        return devobj, True
934
+
935
+
936
def check_array_compatibility(ary1, ary2):
    """Raise TypeError/ValueError if the two arrays cannot take part in a
    direct memory copy (dtype, squeezed shape, and strides must agree)."""
    sq1, sq2 = ary1.squeeze(), ary2.squeeze()
    if ary1.dtype != ary2.dtype:
        raise TypeError(
            "incompatible dtype: %s vs. %s" % (ary1.dtype, ary2.dtype)
        )
    if sq1.shape != sq2.shape:
        raise ValueError(
            "incompatible shape: %s vs. %s" % (ary1.shape, ary2.shape)
        )
    # We check strides only if the size is nonzero, because strides are
    # irrelevant (and can differ) for zero-length copies.
    if ary1.size and sq1.strides != sq2.strides:
        raise ValueError(
            "incompatible strides: %s vs. %s" % (ary1.strides, ary2.strides)
        )