numba-cuda 0.22.0__cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (487) hide show
  1. _numba_cuda_redirector.pth +4 -0
  2. _numba_cuda_redirector.py +89 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +6 -0
  5. numba_cuda/_version.py +11 -0
  6. numba_cuda/numba/cuda/__init__.py +70 -0
  7. numba_cuda/numba/cuda/_internal/cuda_bf16.py +16394 -0
  8. numba_cuda/numba/cuda/_internal/cuda_fp16.py +8112 -0
  9. numba_cuda/numba/cuda/api.py +580 -0
  10. numba_cuda/numba/cuda/api_util.py +76 -0
  11. numba_cuda/numba/cuda/args.py +72 -0
  12. numba_cuda/numba/cuda/bf16.py +397 -0
  13. numba_cuda/numba/cuda/cache_hints.py +287 -0
  14. numba_cuda/numba/cuda/cext/__init__.py +2 -0
  15. numba_cuda/numba/cuda/cext/_devicearray.cpp +159 -0
  16. numba_cuda/numba/cuda/cext/_devicearray.cpython-313-aarch64-linux-gnu.so +0 -0
  17. numba_cuda/numba/cuda/cext/_devicearray.h +29 -0
  18. numba_cuda/numba/cuda/cext/_dispatcher.cpp +1098 -0
  19. numba_cuda/numba/cuda/cext/_dispatcher.cpython-313-aarch64-linux-gnu.so +0 -0
  20. numba_cuda/numba/cuda/cext/_hashtable.cpp +532 -0
  21. numba_cuda/numba/cuda/cext/_hashtable.h +135 -0
  22. numba_cuda/numba/cuda/cext/_helperlib.c +71 -0
  23. numba_cuda/numba/cuda/cext/_helperlib.cpython-313-aarch64-linux-gnu.so +0 -0
  24. numba_cuda/numba/cuda/cext/_helpermod.c +82 -0
  25. numba_cuda/numba/cuda/cext/_pymodule.h +38 -0
  26. numba_cuda/numba/cuda/cext/_typeconv.cpp +206 -0
  27. numba_cuda/numba/cuda/cext/_typeconv.cpython-313-aarch64-linux-gnu.so +0 -0
  28. numba_cuda/numba/cuda/cext/_typeof.cpp +1159 -0
  29. numba_cuda/numba/cuda/cext/_typeof.h +19 -0
  30. numba_cuda/numba/cuda/cext/capsulethunk.h +111 -0
  31. numba_cuda/numba/cuda/cext/mviewbuf.c +385 -0
  32. numba_cuda/numba/cuda/cext/mviewbuf.cpython-313-aarch64-linux-gnu.so +0 -0
  33. numba_cuda/numba/cuda/cext/typeconv.cpp +212 -0
  34. numba_cuda/numba/cuda/cext/typeconv.hpp +101 -0
  35. numba_cuda/numba/cuda/cg.py +67 -0
  36. numba_cuda/numba/cuda/cgutils.py +1294 -0
  37. numba_cuda/numba/cuda/cloudpickle/__init__.py +21 -0
  38. numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +1598 -0
  39. numba_cuda/numba/cuda/cloudpickle/cloudpickle_fast.py +17 -0
  40. numba_cuda/numba/cuda/codegen.py +541 -0
  41. numba_cuda/numba/cuda/compiler.py +1396 -0
  42. numba_cuda/numba/cuda/core/analysis.py +758 -0
  43. numba_cuda/numba/cuda/core/annotations/__init__.py +0 -0
  44. numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +288 -0
  45. numba_cuda/numba/cuda/core/annotations/type_annotations.py +305 -0
  46. numba_cuda/numba/cuda/core/base.py +1332 -0
  47. numba_cuda/numba/cuda/core/boxing.py +1411 -0
  48. numba_cuda/numba/cuda/core/bytecode.py +728 -0
  49. numba_cuda/numba/cuda/core/byteflow.py +2346 -0
  50. numba_cuda/numba/cuda/core/caching.py +744 -0
  51. numba_cuda/numba/cuda/core/callconv.py +392 -0
  52. numba_cuda/numba/cuda/core/codegen.py +171 -0
  53. numba_cuda/numba/cuda/core/compiler.py +199 -0
  54. numba_cuda/numba/cuda/core/compiler_lock.py +85 -0
  55. numba_cuda/numba/cuda/core/compiler_machinery.py +497 -0
  56. numba_cuda/numba/cuda/core/config.py +650 -0
  57. numba_cuda/numba/cuda/core/consts.py +124 -0
  58. numba_cuda/numba/cuda/core/controlflow.py +989 -0
  59. numba_cuda/numba/cuda/core/entrypoints.py +57 -0
  60. numba_cuda/numba/cuda/core/environment.py +66 -0
  61. numba_cuda/numba/cuda/core/errors.py +917 -0
  62. numba_cuda/numba/cuda/core/event.py +511 -0
  63. numba_cuda/numba/cuda/core/funcdesc.py +330 -0
  64. numba_cuda/numba/cuda/core/generators.py +387 -0
  65. numba_cuda/numba/cuda/core/imputils.py +509 -0
  66. numba_cuda/numba/cuda/core/inline_closurecall.py +1787 -0
  67. numba_cuda/numba/cuda/core/interpreter.py +3617 -0
  68. numba_cuda/numba/cuda/core/ir.py +1812 -0
  69. numba_cuda/numba/cuda/core/ir_utils.py +2638 -0
  70. numba_cuda/numba/cuda/core/optional.py +129 -0
  71. numba_cuda/numba/cuda/core/options.py +262 -0
  72. numba_cuda/numba/cuda/core/postproc.py +249 -0
  73. numba_cuda/numba/cuda/core/pythonapi.py +1859 -0
  74. numba_cuda/numba/cuda/core/registry.py +46 -0
  75. numba_cuda/numba/cuda/core/removerefctpass.py +123 -0
  76. numba_cuda/numba/cuda/core/rewrites/__init__.py +26 -0
  77. numba_cuda/numba/cuda/core/rewrites/ir_print.py +91 -0
  78. numba_cuda/numba/cuda/core/rewrites/registry.py +104 -0
  79. numba_cuda/numba/cuda/core/rewrites/static_binop.py +41 -0
  80. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +189 -0
  81. numba_cuda/numba/cuda/core/rewrites/static_raise.py +100 -0
  82. numba_cuda/numba/cuda/core/sigutils.py +68 -0
  83. numba_cuda/numba/cuda/core/ssa.py +498 -0
  84. numba_cuda/numba/cuda/core/targetconfig.py +330 -0
  85. numba_cuda/numba/cuda/core/tracing.py +231 -0
  86. numba_cuda/numba/cuda/core/transforms.py +956 -0
  87. numba_cuda/numba/cuda/core/typed_passes.py +867 -0
  88. numba_cuda/numba/cuda/core/typeinfer.py +1950 -0
  89. numba_cuda/numba/cuda/core/unsafe/__init__.py +0 -0
  90. numba_cuda/numba/cuda/core/unsafe/bytes.py +67 -0
  91. numba_cuda/numba/cuda/core/unsafe/eh.py +67 -0
  92. numba_cuda/numba/cuda/core/unsafe/refcount.py +98 -0
  93. numba_cuda/numba/cuda/core/untyped_passes.py +1979 -0
  94. numba_cuda/numba/cuda/cpython/builtins.py +1153 -0
  95. numba_cuda/numba/cuda/cpython/charseq.py +1218 -0
  96. numba_cuda/numba/cuda/cpython/cmathimpl.py +560 -0
  97. numba_cuda/numba/cuda/cpython/enumimpl.py +103 -0
  98. numba_cuda/numba/cuda/cpython/iterators.py +167 -0
  99. numba_cuda/numba/cuda/cpython/listobj.py +1326 -0
  100. numba_cuda/numba/cuda/cpython/mathimpl.py +499 -0
  101. numba_cuda/numba/cuda/cpython/numbers.py +1475 -0
  102. numba_cuda/numba/cuda/cpython/rangeobj.py +289 -0
  103. numba_cuda/numba/cuda/cpython/slicing.py +322 -0
  104. numba_cuda/numba/cuda/cpython/tupleobj.py +456 -0
  105. numba_cuda/numba/cuda/cpython/unicode.py +2865 -0
  106. numba_cuda/numba/cuda/cpython/unicode_support.py +1597 -0
  107. numba_cuda/numba/cuda/cpython/unsafe/__init__.py +0 -0
  108. numba_cuda/numba/cuda/cpython/unsafe/numbers.py +64 -0
  109. numba_cuda/numba/cuda/cpython/unsafe/tuple.py +92 -0
  110. numba_cuda/numba/cuda/cuda_paths.py +691 -0
  111. numba_cuda/numba/cuda/cudadecl.py +543 -0
  112. numba_cuda/numba/cuda/cudadrv/__init__.py +14 -0
  113. numba_cuda/numba/cuda/cudadrv/devicearray.py +954 -0
  114. numba_cuda/numba/cuda/cudadrv/devices.py +249 -0
  115. numba_cuda/numba/cuda/cudadrv/driver.py +3238 -0
  116. numba_cuda/numba/cuda/cudadrv/drvapi.py +435 -0
  117. numba_cuda/numba/cuda/cudadrv/dummyarray.py +562 -0
  118. numba_cuda/numba/cuda/cudadrv/enums.py +613 -0
  119. numba_cuda/numba/cuda/cudadrv/error.py +48 -0
  120. numba_cuda/numba/cuda/cudadrv/libs.py +220 -0
  121. numba_cuda/numba/cuda/cudadrv/linkable_code.py +184 -0
  122. numba_cuda/numba/cuda/cudadrv/mappings.py +14 -0
  123. numba_cuda/numba/cuda/cudadrv/ndarray.py +26 -0
  124. numba_cuda/numba/cuda/cudadrv/nvrtc.py +193 -0
  125. numba_cuda/numba/cuda/cudadrv/nvvm.py +756 -0
  126. numba_cuda/numba/cuda/cudadrv/rtapi.py +13 -0
  127. numba_cuda/numba/cuda/cudadrv/runtime.py +34 -0
  128. numba_cuda/numba/cuda/cudaimpl.py +983 -0
  129. numba_cuda/numba/cuda/cudamath.py +149 -0
  130. numba_cuda/numba/cuda/datamodel/__init__.py +7 -0
  131. numba_cuda/numba/cuda/datamodel/cuda_manager.py +66 -0
  132. numba_cuda/numba/cuda/datamodel/cuda_models.py +1446 -0
  133. numba_cuda/numba/cuda/datamodel/cuda_packer.py +224 -0
  134. numba_cuda/numba/cuda/datamodel/cuda_registry.py +22 -0
  135. numba_cuda/numba/cuda/datamodel/cuda_testing.py +153 -0
  136. numba_cuda/numba/cuda/datamodel/manager.py +11 -0
  137. numba_cuda/numba/cuda/datamodel/models.py +9 -0
  138. numba_cuda/numba/cuda/datamodel/packer.py +9 -0
  139. numba_cuda/numba/cuda/datamodel/registry.py +11 -0
  140. numba_cuda/numba/cuda/datamodel/testing.py +11 -0
  141. numba_cuda/numba/cuda/debuginfo.py +997 -0
  142. numba_cuda/numba/cuda/decorators.py +294 -0
  143. numba_cuda/numba/cuda/descriptor.py +35 -0
  144. numba_cuda/numba/cuda/device_init.py +155 -0
  145. numba_cuda/numba/cuda/deviceufunc.py +1021 -0
  146. numba_cuda/numba/cuda/dispatcher.py +2463 -0
  147. numba_cuda/numba/cuda/errors.py +72 -0
  148. numba_cuda/numba/cuda/extending.py +697 -0
  149. numba_cuda/numba/cuda/flags.py +178 -0
  150. numba_cuda/numba/cuda/fp16.py +357 -0
  151. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  152. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  153. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  154. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  155. numba_cuda/numba/cuda/include/13/cuda_bf16.h +5118 -0
  156. numba_cuda/numba/cuda/include/13/cuda_bf16.hpp +3865 -0
  157. numba_cuda/numba/cuda/include/13/cuda_fp16.h +5363 -0
  158. numba_cuda/numba/cuda/include/13/cuda_fp16.hpp +3483 -0
  159. numba_cuda/numba/cuda/initialize.py +24 -0
  160. numba_cuda/numba/cuda/intrinsics.py +531 -0
  161. numba_cuda/numba/cuda/itanium_mangler.py +214 -0
  162. numba_cuda/numba/cuda/kernels/__init__.py +2 -0
  163. numba_cuda/numba/cuda/kernels/reduction.py +265 -0
  164. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  165. numba_cuda/numba/cuda/libdevice.py +3386 -0
  166. numba_cuda/numba/cuda/libdevicedecl.py +20 -0
  167. numba_cuda/numba/cuda/libdevicefuncs.py +1060 -0
  168. numba_cuda/numba/cuda/libdeviceimpl.py +88 -0
  169. numba_cuda/numba/cuda/locks.py +19 -0
  170. numba_cuda/numba/cuda/lowering.py +1980 -0
  171. numba_cuda/numba/cuda/mathimpl.py +374 -0
  172. numba_cuda/numba/cuda/memory_management/__init__.py +4 -0
  173. numba_cuda/numba/cuda/memory_management/memsys.cu +99 -0
  174. numba_cuda/numba/cuda/memory_management/memsys.cuh +22 -0
  175. numba_cuda/numba/cuda/memory_management/nrt.cu +212 -0
  176. numba_cuda/numba/cuda/memory_management/nrt.cuh +48 -0
  177. numba_cuda/numba/cuda/memory_management/nrt.py +390 -0
  178. numba_cuda/numba/cuda/memory_management/nrt_context.py +438 -0
  179. numba_cuda/numba/cuda/misc/appdirs.py +594 -0
  180. numba_cuda/numba/cuda/misc/cffiimpl.py +24 -0
  181. numba_cuda/numba/cuda/misc/coverage_support.py +43 -0
  182. numba_cuda/numba/cuda/misc/dump_style.py +41 -0
  183. numba_cuda/numba/cuda/misc/findlib.py +75 -0
  184. numba_cuda/numba/cuda/misc/firstlinefinder.py +96 -0
  185. numba_cuda/numba/cuda/misc/gdb_hook.py +240 -0
  186. numba_cuda/numba/cuda/misc/literal.py +28 -0
  187. numba_cuda/numba/cuda/misc/llvm_pass_timings.py +412 -0
  188. numba_cuda/numba/cuda/misc/special.py +94 -0
  189. numba_cuda/numba/cuda/models.py +56 -0
  190. numba_cuda/numba/cuda/np/arraymath.py +5130 -0
  191. numba_cuda/numba/cuda/np/arrayobj.py +7635 -0
  192. numba_cuda/numba/cuda/np/extensions.py +11 -0
  193. numba_cuda/numba/cuda/np/linalg.py +3087 -0
  194. numba_cuda/numba/cuda/np/math/__init__.py +0 -0
  195. numba_cuda/numba/cuda/np/math/cmathimpl.py +558 -0
  196. numba_cuda/numba/cuda/np/math/mathimpl.py +487 -0
  197. numba_cuda/numba/cuda/np/math/numbers.py +1461 -0
  198. numba_cuda/numba/cuda/np/npdatetime.py +969 -0
  199. numba_cuda/numba/cuda/np/npdatetime_helpers.py +217 -0
  200. numba_cuda/numba/cuda/np/npyfuncs.py +1808 -0
  201. numba_cuda/numba/cuda/np/npyimpl.py +1027 -0
  202. numba_cuda/numba/cuda/np/numpy_support.py +798 -0
  203. numba_cuda/numba/cuda/np/polynomial/__init__.py +4 -0
  204. numba_cuda/numba/cuda/np/polynomial/polynomial_core.py +242 -0
  205. numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +380 -0
  206. numba_cuda/numba/cuda/np/ufunc/__init__.py +4 -0
  207. numba_cuda/numba/cuda/np/ufunc/decorators.py +203 -0
  208. numba_cuda/numba/cuda/np/ufunc/sigparse.py +68 -0
  209. numba_cuda/numba/cuda/np/ufunc/ufuncbuilder.py +65 -0
  210. numba_cuda/numba/cuda/np/ufunc_db.py +1282 -0
  211. numba_cuda/numba/cuda/np/unsafe/__init__.py +0 -0
  212. numba_cuda/numba/cuda/np/unsafe/ndarray.py +84 -0
  213. numba_cuda/numba/cuda/nvvmutils.py +254 -0
  214. numba_cuda/numba/cuda/printimpl.py +126 -0
  215. numba_cuda/numba/cuda/random.py +308 -0
  216. numba_cuda/numba/cuda/reshape_funcs.cu +156 -0
  217. numba_cuda/numba/cuda/serialize.py +267 -0
  218. numba_cuda/numba/cuda/simulator/__init__.py +63 -0
  219. numba_cuda/numba/cuda/simulator/_internal/__init__.py +4 -0
  220. numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +2 -0
  221. numba_cuda/numba/cuda/simulator/api.py +179 -0
  222. numba_cuda/numba/cuda/simulator/bf16.py +4 -0
  223. numba_cuda/numba/cuda/simulator/compiler.py +38 -0
  224. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +11 -0
  225. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +462 -0
  226. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +122 -0
  227. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +66 -0
  228. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +7 -0
  229. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +7 -0
  230. numba_cuda/numba/cuda/simulator/cudadrv/error.py +10 -0
  231. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +10 -0
  232. numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +61 -0
  233. numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +11 -0
  234. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +32 -0
  235. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +22 -0
  236. numba_cuda/numba/cuda/simulator/dispatcher.py +11 -0
  237. numba_cuda/numba/cuda/simulator/kernel.py +320 -0
  238. numba_cuda/numba/cuda/simulator/kernelapi.py +509 -0
  239. numba_cuda/numba/cuda/simulator/memory_management/__init__.py +4 -0
  240. numba_cuda/numba/cuda/simulator/memory_management/nrt.py +21 -0
  241. numba_cuda/numba/cuda/simulator/reduction.py +19 -0
  242. numba_cuda/numba/cuda/simulator/tests/support.py +4 -0
  243. numba_cuda/numba/cuda/simulator/vector_types.py +65 -0
  244. numba_cuda/numba/cuda/simulator_init.py +18 -0
  245. numba_cuda/numba/cuda/stubs.py +624 -0
  246. numba_cuda/numba/cuda/target.py +505 -0
  247. numba_cuda/numba/cuda/testing.py +347 -0
  248. numba_cuda/numba/cuda/tests/__init__.py +62 -0
  249. numba_cuda/numba/cuda/tests/benchmarks/__init__.py +0 -0
  250. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +119 -0
  251. numba_cuda/numba/cuda/tests/cloudpickle_main_class.py +9 -0
  252. numba_cuda/numba/cuda/tests/core/serialize_usecases.py +113 -0
  253. numba_cuda/numba/cuda/tests/core/test_itanium_mangler.py +83 -0
  254. numba_cuda/numba/cuda/tests/core/test_serialize.py +371 -0
  255. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +9 -0
  256. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +147 -0
  257. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +161 -0
  258. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +397 -0
  259. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +24 -0
  260. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +180 -0
  261. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +313 -0
  262. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +191 -0
  263. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +621 -0
  264. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +247 -0
  265. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +100 -0
  266. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +200 -0
  267. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +53 -0
  268. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +72 -0
  269. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +138 -0
  270. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +43 -0
  271. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +15 -0
  272. numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py +58 -0
  273. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +348 -0
  274. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +128 -0
  275. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +301 -0
  276. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +174 -0
  277. numba_cuda/numba/cuda/tests/cudadrv/test_nvrtc.py +28 -0
  278. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +185 -0
  279. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +39 -0
  280. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +23 -0
  281. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +38 -0
  282. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +48 -0
  283. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +44 -0
  284. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +127 -0
  285. numba_cuda/numba/cuda/tests/cudapy/__init__.py +9 -0
  286. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +231 -0
  287. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +50 -0
  288. numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +36 -0
  289. numba_cuda/numba/cuda/tests/cudapy/complex_usecases.py +116 -0
  290. numba_cuda/numba/cuda/tests/cudapy/enum_usecases.py +59 -0
  291. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +62 -0
  292. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +28 -0
  293. numba_cuda/numba/cuda/tests/cudapy/overload_usecases.py +33 -0
  294. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +104 -0
  295. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +47 -0
  296. numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +1122 -0
  297. numba_cuda/numba/cuda/tests/cudapy/test_array.py +344 -0
  298. numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +268 -0
  299. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +203 -0
  300. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +63 -0
  301. numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +360 -0
  302. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1815 -0
  303. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +599 -0
  304. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +377 -0
  305. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +160 -0
  306. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +27 -0
  307. numba_cuda/numba/cuda/tests/cudapy/test_byteflow.py +98 -0
  308. numba_cuda/numba/cuda/tests/cudapy/test_cache_hints.py +210 -0
  309. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +683 -0
  310. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +265 -0
  311. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +42 -0
  312. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +718 -0
  313. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +370 -0
  314. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +23 -0
  315. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +142 -0
  316. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +178 -0
  317. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +193 -0
  318. numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +131 -0
  319. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +438 -0
  320. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +94 -0
  321. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +101 -0
  322. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +105 -0
  323. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +978 -0
  324. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo_types.py +476 -0
  325. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +500 -0
  326. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +820 -0
  327. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +152 -0
  328. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +111 -0
  329. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +170 -0
  330. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1088 -0
  331. numba_cuda/numba/cuda/tests/cudapy/test_extending_types.py +71 -0
  332. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +265 -0
  333. numba_cuda/numba/cuda/tests/cudapy/test_flow_control.py +1433 -0
  334. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +57 -0
  335. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +34 -0
  336. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +69 -0
  337. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +62 -0
  338. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +474 -0
  339. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +167 -0
  340. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +92 -0
  341. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +39 -0
  342. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +170 -0
  343. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +255 -0
  344. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1219 -0
  345. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +263 -0
  346. numba_cuda/numba/cuda/tests/cudapy/test_ir.py +598 -0
  347. numba_cuda/numba/cuda/tests/cudapy/test_ir_utils.py +276 -0
  348. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +101 -0
  349. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +68 -0
  350. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +123 -0
  351. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +194 -0
  352. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +220 -0
  353. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +173 -0
  354. numba_cuda/numba/cuda/tests/cudapy/test_make_function_to_jit_function.py +364 -0
  355. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +47 -0
  356. numba_cuda/numba/cuda/tests/cudapy/test_math.py +842 -0
  357. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +76 -0
  358. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +78 -0
  359. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +25 -0
  360. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +145 -0
  361. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +39 -0
  362. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +82 -0
  363. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +53 -0
  364. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +504 -0
  365. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +93 -0
  366. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +402 -0
  367. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +128 -0
  368. numba_cuda/numba/cuda/tests/cudapy/test_print.py +193 -0
  369. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +37 -0
  370. numba_cuda/numba/cuda/tests/cudapy/test_random.py +117 -0
  371. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +614 -0
  372. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +130 -0
  373. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +94 -0
  374. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  375. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +86 -0
  376. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +40 -0
  377. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +457 -0
  378. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +233 -0
  379. numba_cuda/numba/cuda/tests/cudapy/test_ssa.py +454 -0
  380. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +56 -0
  381. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +277 -0
  382. numba_cuda/numba/cuda/tests/cudapy/test_tracing.py +200 -0
  383. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +90 -0
  384. numba_cuda/numba/cuda/tests/cudapy/test_typeconv.py +333 -0
  385. numba_cuda/numba/cuda/tests/cudapy/test_typeinfer.py +538 -0
  386. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +585 -0
  387. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +42 -0
  388. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +485 -0
  389. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +312 -0
  390. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +23 -0
  391. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +183 -0
  392. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +40 -0
  393. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +40 -0
  394. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +206 -0
  395. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +446 -0
  396. numba_cuda/numba/cuda/tests/cudasim/__init__.py +9 -0
  397. numba_cuda/numba/cuda/tests/cudasim/support.py +9 -0
  398. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +111 -0
  399. numba_cuda/numba/cuda/tests/data/__init__.py +2 -0
  400. numba_cuda/numba/cuda/tests/data/cta_barrier.cu +28 -0
  401. numba_cuda/numba/cuda/tests/data/cuda_include.cu +10 -0
  402. numba_cuda/numba/cuda/tests/data/error.cu +12 -0
  403. numba_cuda/numba/cuda/tests/data/include/add.cuh +8 -0
  404. numba_cuda/numba/cuda/tests/data/jitlink.cu +28 -0
  405. numba_cuda/numba/cuda/tests/data/jitlink.ptx +49 -0
  406. numba_cuda/numba/cuda/tests/data/warn.cu +12 -0
  407. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +9 -0
  408. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +2 -0
  409. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +54 -0
  410. numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +8 -0
  411. numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +14 -0
  412. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +86 -0
  413. numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +68 -0
  414. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +81 -0
  415. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +141 -0
  416. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +160 -0
  417. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +180 -0
  418. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +119 -0
  419. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +66 -0
  420. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +80 -0
  421. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +206 -0
  422. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +53 -0
  423. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +76 -0
  424. numba_cuda/numba/cuda/tests/nocuda/__init__.py +9 -0
  425. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +452 -0
  426. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +48 -0
  427. numba_cuda/numba/cuda/tests/nocuda/test_import.py +63 -0
  428. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +252 -0
  429. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +59 -0
  430. numba_cuda/numba/cuda/tests/nrt/__init__.py +9 -0
  431. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +387 -0
  432. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +124 -0
  433. numba_cuda/numba/cuda/tests/support.py +900 -0
  434. numba_cuda/numba/cuda/typeconv/__init__.py +4 -0
  435. numba_cuda/numba/cuda/typeconv/castgraph.py +137 -0
  436. numba_cuda/numba/cuda/typeconv/rules.py +63 -0
  437. numba_cuda/numba/cuda/typeconv/typeconv.py +121 -0
  438. numba_cuda/numba/cuda/types/__init__.py +233 -0
  439. numba_cuda/numba/cuda/types/__init__.pyi +167 -0
  440. numba_cuda/numba/cuda/types/abstract.py +9 -0
  441. numba_cuda/numba/cuda/types/common.py +9 -0
  442. numba_cuda/numba/cuda/types/containers.py +9 -0
  443. numba_cuda/numba/cuda/types/cuda_abstract.py +533 -0
  444. numba_cuda/numba/cuda/types/cuda_common.py +110 -0
  445. numba_cuda/numba/cuda/types/cuda_containers.py +971 -0
  446. numba_cuda/numba/cuda/types/cuda_function_type.py +230 -0
  447. numba_cuda/numba/cuda/types/cuda_functions.py +798 -0
  448. numba_cuda/numba/cuda/types/cuda_iterators.py +120 -0
  449. numba_cuda/numba/cuda/types/cuda_misc.py +569 -0
  450. numba_cuda/numba/cuda/types/cuda_npytypes.py +690 -0
  451. numba_cuda/numba/cuda/types/cuda_scalars.py +280 -0
  452. numba_cuda/numba/cuda/types/ext_types.py +101 -0
  453. numba_cuda/numba/cuda/types/function_type.py +11 -0
  454. numba_cuda/numba/cuda/types/functions.py +9 -0
  455. numba_cuda/numba/cuda/types/iterators.py +9 -0
  456. numba_cuda/numba/cuda/types/misc.py +9 -0
  457. numba_cuda/numba/cuda/types/npytypes.py +9 -0
  458. numba_cuda/numba/cuda/types/scalars.py +9 -0
  459. numba_cuda/numba/cuda/typing/__init__.py +19 -0
  460. numba_cuda/numba/cuda/typing/arraydecl.py +939 -0
  461. numba_cuda/numba/cuda/typing/asnumbatype.py +130 -0
  462. numba_cuda/numba/cuda/typing/bufproto.py +70 -0
  463. numba_cuda/numba/cuda/typing/builtins.py +1209 -0
  464. numba_cuda/numba/cuda/typing/cffi_utils.py +219 -0
  465. numba_cuda/numba/cuda/typing/cmathdecl.py +47 -0
  466. numba_cuda/numba/cuda/typing/collections.py +138 -0
  467. numba_cuda/numba/cuda/typing/context.py +782 -0
  468. numba_cuda/numba/cuda/typing/ctypes_utils.py +125 -0
  469. numba_cuda/numba/cuda/typing/dictdecl.py +63 -0
  470. numba_cuda/numba/cuda/typing/enumdecl.py +74 -0
  471. numba_cuda/numba/cuda/typing/listdecl.py +147 -0
  472. numba_cuda/numba/cuda/typing/mathdecl.py +158 -0
  473. numba_cuda/numba/cuda/typing/npdatetime.py +322 -0
  474. numba_cuda/numba/cuda/typing/npydecl.py +749 -0
  475. numba_cuda/numba/cuda/typing/setdecl.py +115 -0
  476. numba_cuda/numba/cuda/typing/templates.py +1446 -0
  477. numba_cuda/numba/cuda/typing/typeof.py +301 -0
  478. numba_cuda/numba/cuda/ufuncs.py +746 -0
  479. numba_cuda/numba/cuda/utils.py +724 -0
  480. numba_cuda/numba/cuda/vector_types.py +214 -0
  481. numba_cuda/numba/cuda/vectorizers.py +260 -0
  482. numba_cuda-0.22.0.dist-info/METADATA +109 -0
  483. numba_cuda-0.22.0.dist-info/RECORD +487 -0
  484. numba_cuda-0.22.0.dist-info/WHEEL +6 -0
  485. numba_cuda-0.22.0.dist-info/licenses/LICENSE +26 -0
  486. numba_cuda-0.22.0.dist-info/licenses/LICENSE.numba +24 -0
  487. numba_cuda-0.22.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,141 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+ # Contents in this file are referenced from the sphinx-generated docs.
5
+ # "magictoken" is used for markers as beginning and ending of example text.
6
+
7
+ import unittest
8
+ from numba.cuda.testing import CUDATestCase, skip_on_cudasim
9
+ from numba.cuda.tests.support import skip_unless_cffi, override_config
10
+
11
+
12
@skip_unless_cffi
@skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
class TestFFI(CUDATestCase):
    """
    Documentation examples that call foreign device functions from kernels.

    Each test declares a device function with ``cuda.declare_device`` and
    links a ``.cu`` source file located relative to this test file. The
    regions delimited by ``magictoken`` comments are referenced from the
    sphinx-generated docs, so their text is part of the documentation.
    """

    def test_ex_linking_cu(self):
        """Declare ``mul_f32_f32`` from ``ffi/functions.cu`` and call it elementwise."""
        # magictoken.ex_linking_cu.begin
        from numba import cuda
        import numpy as np
        import os

        # Path to the source containing the foreign function
        # (here assumed to be in a subdirectory called "ffi")
        basedir = os.path.dirname(os.path.abspath(__file__))
        functions_cu = os.path.join(basedir, "ffi", "functions.cu")

        # Declaration of the foreign function
        mul = cuda.declare_device(
            "mul_f32_f32", "float32(float32, float32)", link=functions_cu
        )

        # A kernel that calls mul; functions.cu is linked automatically due to
        # the call to mul.
        @cuda.jit
        def multiply_vectors(r, x, y):
            i = cuda.grid(1)

            if i < len(r):
                r[i] = mul(x[i], y[i])

        # Generate random data
        N = 32
        np.random.seed(1)
        x = np.random.rand(N).astype(np.float32)
        y = np.random.rand(N).astype(np.float32)
        r = np.zeros_like(x)

        # Run the kernel
        multiply_vectors[1, 32](r, x, y)

        # Sanity check - ensure the results match those expected
        np.testing.assert_array_equal(r, x * y)
        # magictoken.ex_linking_cu.end

    def test_ex_from_buffer(self):
        """
        Pass an array to the foreign ``sum_reduce`` function as a pointer.

        The pointer is obtained inside the kernel with cffi's
        ``ffi.from_buffer``; the result is compared against ``np.sum``.
        """
        from numba import cuda
        import os

        basedir = os.path.dirname(os.path.abspath(__file__))
        functions_cu = os.path.join(basedir, "ffi", "functions.cu")

        # magictoken.ex_from_buffer_decl.begin
        signature = "float32(CPointer(float32), int32)"
        sum_reduce = cuda.declare_device(
            "sum_reduce", signature, link=functions_cu
        )
        # magictoken.ex_from_buffer_decl.end

        # magictoken.ex_from_buffer_kernel.begin
        import cffi

        ffi = cffi.FFI()

        @cuda.jit
        def reduction_caller(result, array):
            array_ptr = ffi.from_buffer(array)
            result[()] = sum_reduce(array_ptr, len(array))

        # magictoken.ex_from_buffer_kernel.end

        import numpy as np

        x = np.arange(10).astype(np.float32)
        # Zero-dimensional output; the kernel writes it via result[()]
        r = np.ndarray((), dtype=np.float32)

        reduction_caller[1, 1](r, x)

        expected = np.sum(x)
        actual = r[()]
        np.testing.assert_allclose(expected, actual)

    def test_ex_extra_includes(self):
        """
        Link ``ffi/saxpy.cu`` with two extra include directories configured.

        The directories are joined with ":" and installed as
        ``CUDA_NVRTC_EXTRA_SEARCH_PATHS`` for the duration of the test.
        """
        import numpy as np
        from numba import cuda
        from numba.cuda import config
        import os

        basedir = os.path.dirname(os.path.abspath(__file__))
        mul_dir = os.path.join(basedir, "ffi", "include")
        saxpy_cu = os.path.join(basedir, "ffi", "saxpy.cu")

        testdir = os.path.dirname(basedir)
        add_dir = os.path.join(testdir, "data", "include")

        includedir = ":".join([mul_dir, add_dir])
        # override_config is used so that the assignment made by the doc
        # snippet below is scoped to this test.
        # NOTE(review): presumably the previous value is restored on exit -
        # confirm against numba.cuda.tests.support.override_config.
        with override_config("CUDA_NVRTC_EXTRA_SEARCH_PATHS", includedir):
            # magictoken.ex_extra_search_paths.begin
            from numba.cuda import config

            includedir = ":".join([mul_dir, add_dir])
            config.CUDA_NVRTC_EXTRA_SEARCH_PATHS = includedir
            # magictoken.ex_extra_search_paths.end

            # magictoken.ex_extra_search_paths_kernel.begin
            sig = "float32(float32, float32, float32)"
            saxpy = cuda.declare_device("saxpy", sig=sig, link=saxpy_cu)

            @cuda.jit
            def vector_saxpy(a, x, y, res):
                i = cuda.grid(1)
                if i < len(res):
                    res[i] = saxpy(a, x[i], y[i])

            # magictoken.ex_extra_search_paths_kernel.end

            size = 10_000
            a = 3.0
            X = np.ones((size,), dtype="float32")
            Y = np.ones((size,), dtype="float32")
            R = np.zeros((size,), dtype="float32")

            block_size = 32
            # One extra block so that every element is covered; the kernel
            # bounds-checks its index.
            num_blocks = (size // block_size) + 1

            vector_saxpy[num_blocks, block_size](a, X, Y, R)

            expected = a * X + Y
            np.testing.assert_equal(R, expected)
138
+
139
+
140
+ if __name__ == "__main__":
141
+ unittest.main()
@@ -0,0 +1,160 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+ import unittest
5
+
6
+ from numba.cuda.testing import (
7
+ CUDATestCase,
8
+ skip_if_cudadevrt_missing,
9
+ skip_on_cudasim,
10
+ skip_unless_cc_60,
11
+ )
12
+ from numba.cuda.tests.support import captured_stdout
13
+
14
+
15
+ @skip_if_cudadevrt_missing
16
+ @skip_unless_cc_60
17
+ @skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
18
+ class TestLaplace(CUDATestCase):
19
+ """
20
+ Test simple vector addition
21
+ """
22
+
23
+ def setUp(self):
24
+ # Prevent output from this test showing up when running the test suite
25
+ self._captured_stdout = captured_stdout()
26
+ self._captured_stdout.__enter__()
27
+ super().setUp()
28
+
29
+ def tearDown(self):
30
+ # No exception type, value, or traceback
31
+ self._captured_stdout.__exit__(None, None, None)
32
+ super().tearDown()
33
+
34
+ def test_ex_laplace(self):
35
+ # set True to regenerate the figures that
36
+ # accompany this example
37
+ plot = False
38
+
39
+ # ex_laplace.import.begin
40
+ import numpy as np
41
+ from numba import cuda
42
+ # ex_laplace.import.end
43
+
44
+ # ex_laplace.allocate.begin
45
+ # Use an odd problem size.
46
+ # This is so there can be an element truly in the "middle" for symmetry.
47
+ size = 1001
48
+ data = np.zeros(size)
49
+
50
+ # Middle element is made very hot
51
+ data[500] = 10000
52
+ buf_0 = cuda.to_device(data)
53
+
54
+ # This extra array is used for synchronization purposes
55
+ buf_1 = cuda.device_array_like(buf_0)
56
+
57
+ niter = 10000
58
+ # ex_laplace.allocate.end
59
+
60
+ if plot:
61
+ import matplotlib.pyplot as plt
62
+
63
+ fig, ax = plt.subplots(figsize=(16 * 0.66, 9 * 0.66))
64
+ plt.plot(
65
+ np.arange(len(buf_0)),
66
+ buf_0.copy_to_host(),
67
+ lw=3,
68
+ marker="*",
69
+ color="black",
70
+ )
71
+
72
+ plt.title("Initial State", fontsize=24)
73
+ plt.xlabel("Position", fontsize=24)
74
+ plt.ylabel("Temperature", fontsize=24)
75
+
76
+ ax.set_xticks(ax.get_xticks(), fontsize=16)
77
+ ax.set_yticks(ax.get_yticks(), fontsize=16)
78
+ plt.xlim(0, len(data))
79
+ plt.ylim(0, 10001)
80
+ plt.savefig("laplace_initial.svg")
81
+
82
+ # ex_laplace.kernel.begin
83
+ @cuda.jit
84
+ def solve_heat_equation(buf_0, buf_1, timesteps, k):
85
+ i = cuda.grid(1)
86
+
87
+ # Don't continue if our index is outside the domain
88
+ if i >= len(buf_0):
89
+ return
90
+
91
+ # Prepare to do a grid-wide synchronization later
92
+ grid = cuda.cg.this_grid()
93
+
94
+ for step in range(timesteps):
95
+ # Select the buffer from the previous timestep
96
+ if (step % 2) == 0:
97
+ data = buf_0
98
+ next_data = buf_1
99
+ else:
100
+ data = buf_1
101
+ next_data = buf_0
102
+
103
+ # Get the current temperature associated with this point
104
+ curr_temp = data[i]
105
+
106
+ # Apply formula from finite difference equation
107
+ if i == 0:
108
+ # Left wall is held at T = 0
109
+ next_temp = curr_temp + k * (data[i + 1] - (2 * curr_temp))
110
+ elif i == len(data) - 1:
111
+ # Right wall is held at T = 0
112
+ next_temp = curr_temp + k * (data[i - 1] - (2 * curr_temp))
113
+ else:
114
+ # Interior points are a weighted average of their neighbors
115
+ next_temp = curr_temp + k * (
116
+ data[i - 1] - (2 * curr_temp) + data[i + 1]
117
+ )
118
+
119
+ # Write new value to the next buffer
120
+ next_data[i] = next_temp
121
+
122
+ # Wait for every thread to write before moving on
123
+ grid.sync()
124
+
125
+ # ex_laplace.kernel.end
126
+
127
+ # ex_laplace.launch.begin
128
+ solve_heat_equation.forall(len(data))(buf_0, buf_1, niter, 0.25)
129
+ # ex_laplace.launch.end
130
+
131
+ results = buf_1.copy_to_host()
132
+ if plot:
133
+ fig, ax = plt.subplots(figsize=(16 * 0.66, 9 * 0.66))
134
+ plt.plot(
135
+ np.arange(len(results)),
136
+ results,
137
+ lw=3,
138
+ marker="*",
139
+ color="black",
140
+ )
141
+ plt.title(f"T = {niter}", fontsize=24)
142
+ plt.xlabel("Position", fontsize=24)
143
+ plt.ylabel("Temperature", fontsize=24)
144
+
145
+ ax.set_xticks(ax.get_xticks(), fontsize=16)
146
+ ax.set_yticks(ax.get_yticks(), fontsize=16)
147
+
148
+ plt.ylim(0, max(results))
149
+ plt.xlim(0, len(results))
150
+ plt.savefig("laplace_final.svg")
151
+
152
+ # Integral over the domain should be equal to its initial value.
153
+ # Note that this should match the initial value of data[500] above, but
154
+ # we don't assign it to a variable because that would make the example
155
+ # code look a bit oddly verbose.
156
+ np.testing.assert_allclose(results.sum(), 10000)
157
+
158
+
159
+ if __name__ == "__main__":
160
+ unittest.main()
@@ -0,0 +1,180 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+ """
5
+ Matrix multiplication example via `cuda.jit`.
6
+
7
+ Reference: https://stackoverflow.com/a/64198479/13697228 by @RobertCrovella
8
+
9
+ Contents in this file are referenced from the sphinx-generated docs.
10
+ "magictoken" is used for markers as beginning and ending of example text.
11
+ """
12
+
13
+ import unittest
14
+ from numba.cuda.testing import CUDATestCase, skip_on_cudasim
15
+ from numba.cuda.tests.support import captured_stdout
16
+
17
+
18
@skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
class TestMatMul(CUDATestCase):
    """
    Test matrix multiplication using simple, shared memory/square, and shared
    memory/nonsquare cases.
    """

    def setUp(self):
        # Prevent output from this test showing up when running the test suite
        self._captured_stdout = captured_stdout()
        self._captured_stdout.__enter__()
        super().setUp()

    def tearDown(self):
        # No exception type, value, or traceback
        self._captured_stdout.__exit__(None, None, None)
        super().tearDown()

    def test_ex_matmul(self):
        """
        Test of matrix multiplication on various cases.

        Runs the naive kernel and the shared-memory kernel (square and
        non-square inputs) and compares each device result with the host
        result of ``x_h @ y_h``. Regions delimited by ``magictoken``
        comments are referenced from the documentation.
        """
        # magictoken.ex_import.begin
        from numba import cuda
        from numba.cuda import float32
        import numpy as np
        import math
        # magictoken.ex_import.end

        # magictoken.ex_matmul.begin
        @cuda.jit
        def matmul(A, B, C):
            """Perform square matrix multiplication of C = A * B."""
            i, j = cuda.grid(2)
            if i < C.shape[0] and j < C.shape[1]:
                tmp = 0.0
                for k in range(A.shape[1]):
                    tmp += A[i, k] * B[k, j]
                C[i, j] = tmp

        # magictoken.ex_matmul.end

        # magictoken.ex_run_matmul.begin
        x_h = np.arange(16).reshape([4, 4])
        y_h = np.ones([4, 4])
        z_h = np.zeros([4, 4])

        x_d = cuda.to_device(x_h)
        y_d = cuda.to_device(y_h)
        z_d = cuda.to_device(z_h)

        threadsperblock = (16, 16)
        blockspergrid_x = math.ceil(z_h.shape[0] / threadsperblock[0])
        blockspergrid_y = math.ceil(z_h.shape[1] / threadsperblock[1])
        blockspergrid = (blockspergrid_x, blockspergrid_y)

        matmul[blockspergrid, threadsperblock](x_d, y_d, z_d)
        z_h = z_d.copy_to_host()
        print(z_h)
        print(x_h @ y_h)
        # magictoken.ex_run_matmul.end

        # matmul test(s)
        # The naive kernel's output was previously only printed; check it
        # here, before z_h is reused by the shared-memory runs below.
        msg = "matmul incorrect for simple case."
        self.assertTrue(np.all(z_h == x_h @ y_h), msg=msg)

        # magictoken.ex_fast_matmul.begin
        # Controls threads per block and shared memory usage.
        # The computation will be done on blocks of TPBxTPB elements.
        # TPB should not be larger than 32 in this example
        TPB = 16

        @cuda.jit
        def fast_matmul(A, B, C):
            """
            Perform matrix multiplication of C = A * B using CUDA shared memory.

            Reference: https://stackoverflow.com/a/64198479/13697228 by @RobertCrovella
            """
            # Define an array in the shared memory
            # The size and type of the arrays must be known at compile time
            sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
            sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)

            x, y = cuda.grid(2)

            tx = cuda.threadIdx.x
            ty = cuda.threadIdx.y
            bpg = cuda.gridDim.x  # blocks per grid

            # Each thread computes one element in the result matrix.
            # The dot product is chunked into dot products of TPB-long vectors.
            tmp = float32(0.0)
            for i in range(bpg):
                # Preload data into shared memory
                sA[ty, tx] = 0
                sB[ty, tx] = 0
                if y < A.shape[0] and (tx + i * TPB) < A.shape[1]:
                    sA[ty, tx] = A[y, tx + i * TPB]
                if x < B.shape[1] and (ty + i * TPB) < B.shape[0]:
                    sB[ty, tx] = B[ty + i * TPB, x]

                # Wait until all threads finish preloading
                cuda.syncthreads()

                # Computes partial product on the shared memory
                for j in range(TPB):
                    tmp += sA[ty, j] * sB[j, tx]

                # Wait until all threads finish computing
                cuda.syncthreads()
            if y < C.shape[0] and x < C.shape[1]:
                C[y, x] = tmp

        # magictoken.ex_fast_matmul.end

        # magictoken.ex_run_fast_matmul.begin
        x_h = np.arange(16).reshape([4, 4])
        y_h = np.ones([4, 4])
        z_h = np.zeros([4, 4])

        x_d = cuda.to_device(x_h)
        y_d = cuda.to_device(y_h)
        z_d = cuda.to_device(z_h)

        threadsperblock = (TPB, TPB)
        blockspergrid_x = math.ceil(z_h.shape[0] / threadsperblock[0])
        blockspergrid_y = math.ceil(z_h.shape[1] / threadsperblock[1])
        blockspergrid = (blockspergrid_x, blockspergrid_y)

        fast_matmul[blockspergrid, threadsperblock](x_d, y_d, z_d)
        z_h = z_d.copy_to_host()
        print(z_h)
        print(x_h @ y_h)
        # magictoken.ex_run_fast_matmul.end

        # fast_matmul test(s)
        msg = "fast_matmul incorrect for shared memory, square case."
        self.assertTrue(np.all(z_h == x_h @ y_h), msg=msg)

        # magictoken.ex_run_nonsquare.begin
        x_h = np.arange(115).reshape([5, 23])
        y_h = np.ones([23, 7])
        z_h = np.zeros([5, 7])

        x_d = cuda.to_device(x_h)
        y_d = cuda.to_device(y_h)
        z_d = cuda.to_device(z_h)

        threadsperblock = (TPB, TPB)
        grid_y_max = max(x_h.shape[0], y_h.shape[0])
        grid_x_max = max(x_h.shape[1], y_h.shape[1])
        blockspergrid_x = math.ceil(grid_x_max / threadsperblock[0])
        blockspergrid_y = math.ceil(grid_y_max / threadsperblock[1])
        blockspergrid = (blockspergrid_x, blockspergrid_y)

        fast_matmul[blockspergrid, threadsperblock](x_d, y_d, z_d)
        z_h = z_d.copy_to_host()
        print(z_h)
        print(x_h @ y_h)
        # magictoken.ex_run_nonsquare.end

        # nonsquare fast_matmul test(s)
        msg = "fast_matmul incorrect for shared memory, non-square case."
        self.assertTrue(np.all(z_h == x_h @ y_h), msg=msg)
177
+
178
+
179
+ if __name__ == "__main__":
180
+ unittest.main()
@@ -0,0 +1,119 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+ import unittest
5
+
6
+ from numba.cuda.testing import (
7
+ CUDATestCase,
8
+ skip_on_cudasim,
9
+ skip_on_standalone_numba_cuda,
10
+ )
11
+ from numba.cuda.tests.support import captured_stdout
12
+
13
+
14
+ @skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
15
+ class TestMonteCarlo(CUDATestCase):
16
+ """
17
+ Test monte-carlo integration
18
+ """
19
+
20
+ def setUp(self):
21
+ # Prevent output from this test showing up when running the test suite
22
+ self._captured_stdout = captured_stdout()
23
+ self._captured_stdout.__enter__()
24
+ super().setUp()
25
+
26
+ def tearDown(self):
27
+ # No exception type, value, or traceback
28
+ self._captured_stdout.__exit__(None, None, None)
29
+ super().tearDown()
30
+
31
+ @skip_on_standalone_numba_cuda
32
+ def test_ex_montecarlo(self):
33
+ # ex_montecarlo.import.begin
34
+ import numba
35
+ import numpy as np
36
+ from numba import cuda
37
+ from numba.cuda.random import (
38
+ create_xoroshiro128p_states,
39
+ xoroshiro128p_uniform_float32,
40
+ )
41
+ # ex_montecarlo.import.end
42
+
43
+ # ex_montecarlo.define.begin
44
+ # number of samples, higher will lead to a more accurate answer
45
+ nsamps = 1000000
46
+ # ex_montecarlo.define.end
47
+
48
+ # ex_montecarlo.kernel.begin
49
+ @cuda.jit
50
+ def mc_integrator_kernel(out, rng_states, lower_lim, upper_lim):
51
+ """
52
+ kernel to draw random samples and evaluate the function to
53
+ be integrated at those sample values
54
+ """
55
+ size = len(out)
56
+
57
+ gid = cuda.grid(1)
58
+ if gid < size:
59
+ # draw a sample between 0 and 1 on this thread
60
+ samp = xoroshiro128p_uniform_float32(rng_states, gid)
61
+
62
+ # normalize this sample to the limit range
63
+ samp = samp * (upper_lim - lower_lim) + lower_lim
64
+
65
+ # evaluate the function to be
66
+ # integrated at the normalized
67
+ # value of the sample
68
+ y = func(samp)
69
+ out[gid] = y
70
+
71
+ # ex_montecarlo.kernel.end
72
+
73
+ # ex_montecarlo.callfunc.begin
74
+ @cuda.reduce
75
+ def sum_reduce(a, b):
76
+ return a + b
77
+
78
+ def mc_integrate(lower_lim, upper_lim, nsamps):
79
+ """
80
+ approximate the definite integral of `func` from
81
+ `lower_lim` to `upper_lim`
82
+ """
83
+ out = cuda.to_device(np.zeros(nsamps, dtype="float32"))
84
+ rng_states = create_xoroshiro128p_states(nsamps, seed=42)
85
+
86
+ # jit the function for use in CUDA kernels
87
+
88
+ mc_integrator_kernel.forall(nsamps)(
89
+ out, rng_states, lower_lim, upper_lim
90
+ )
91
+ # normalization factor to convert
92
+ # to the average: (b - a)/(N - 1)
93
+ factor = (upper_lim - lower_lim) / (nsamps - 1)
94
+
95
+ return sum_reduce(out) * factor
96
+
97
+ # ex_montecarlo.callfunc.end
98
+
99
+ # ex_montecarlo.launch.begin
100
+ # define a function to integrate
101
+ @numba.jit
102
+ def func(x):
103
+ return 1.0 / x
104
+
105
+ mc_integrate(1, 2, nsamps) # array(0.6929643, dtype=float32)
106
+ mc_integrate(2, 3, nsamps) # array(0.4054021, dtype=float32)
107
+ # ex_montecarlo.launch.end
108
+
109
+ # values computed independently using maple
110
+ np.testing.assert_allclose(
111
+ mc_integrate(1, 2, nsamps), 0.69315, atol=0.001
112
+ )
113
+ np.testing.assert_allclose(
114
+ mc_integrate(2, 3, nsamps), 0.4055, atol=0.001
115
+ )
116
+
117
+
118
+ if __name__ == "__main__":
119
+ unittest.main()
@@ -0,0 +1,66 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+ # Contents in this file are referenced from the sphinx-generated docs.
5
+ # "magictoken" is used for markers as beginning and ending of example text.
6
+
7
+ import unittest
8
+ from numba.cuda.testing import CUDATestCase, skip_on_cudasim
9
+
10
+
11
@skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
class TestRandom(CUDATestCase):
    """
    Test the documentation example that fills a 3D array with random numbers.

    The region delimited by ``magictoken`` comments is referenced from the
    sphinx-generated docs.
    """

    def test_ex_3d_grid(self):
        """
        Fill a 701 x 900 x 719 float32 device array from a 3D launch.

        The kernel is launched on a (16, 16, 16) grid of (8, 8, 8) blocks
        with one RNG state per thread. The host copy is then checked to
        have a mean between 0.49 and 0.51 and all values within [0, 1].
        """
        # magictoken.ex_3d_grid.begin
        from numba import cuda
        from numba.cuda.random import (
            create_xoroshiro128p_states,
            xoroshiro128p_uniform_float32,
        )
        import numpy as np

        @cuda.jit
        def random_3d(arr, rng_states):
            # Per-dimension thread indices and strides
            startx, starty, startz = cuda.grid(3)
            stridex, stridey, stridez = cuda.gridsize(3)

            # Linearized thread index
            tid = (startz * stridey * stridex) + (starty * stridex) + startx

            # Use strided loops over the array to assign a random value to each entry
            for i in range(startz, arr.shape[0], stridez):
                for j in range(starty, arr.shape[1], stridey):
                    for k in range(startx, arr.shape[2], stridex):
                        arr[i, j, k] = xoroshiro128p_uniform_float32(
                            rng_states, tid
                        )

        # Array dimensions
        X, Y, Z = 701, 900, 719

        # Block and grid dimensions
        bx, by, bz = 8, 8, 8
        gx, gy, gz = 16, 16, 16

        # Total number of threads
        nthreads = bx * by * bz * gx * gy * gz

        # Initialize a state for each thread
        rng_states = create_xoroshiro128p_states(nthreads, seed=1)

        # Generate random numbers
        arr = cuda.device_array((X, Y, Z), dtype=np.float32)
        random_3d[(gx, gy, gz), (bx, by, bz)](arr, rng_states)
        # magictoken.ex_3d_grid.end

        # Some basic tests of the randomly-generated numbers
        host_arr = arr.copy_to_host()
        self.assertGreater(np.mean(host_arr), 0.49)
        self.assertLess(np.mean(host_arr), 0.51)
        self.assertTrue(np.all(host_arr <= 1.0))
        self.assertTrue(np.all(host_arr >= 0.0))
63
+
64
+
65
+ if __name__ == "__main__":
66
+ unittest.main()