numba-cuda 0.22.0__cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of numba-cuda might be problematic. Click here for more details.

Files changed (487)
  1. _numba_cuda_redirector.pth +4 -0
  2. _numba_cuda_redirector.py +89 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +6 -0
  5. numba_cuda/_version.py +11 -0
  6. numba_cuda/numba/cuda/__init__.py +70 -0
  7. numba_cuda/numba/cuda/_internal/cuda_bf16.py +16394 -0
  8. numba_cuda/numba/cuda/_internal/cuda_fp16.py +8112 -0
  9. numba_cuda/numba/cuda/api.py +580 -0
  10. numba_cuda/numba/cuda/api_util.py +76 -0
  11. numba_cuda/numba/cuda/args.py +72 -0
  12. numba_cuda/numba/cuda/bf16.py +397 -0
  13. numba_cuda/numba/cuda/cache_hints.py +287 -0
  14. numba_cuda/numba/cuda/cext/__init__.py +2 -0
  15. numba_cuda/numba/cuda/cext/_devicearray.cpp +159 -0
  16. numba_cuda/numba/cuda/cext/_devicearray.cpython-312-aarch64-linux-gnu.so +0 -0
  17. numba_cuda/numba/cuda/cext/_devicearray.h +29 -0
  18. numba_cuda/numba/cuda/cext/_dispatcher.cpp +1098 -0
  19. numba_cuda/numba/cuda/cext/_dispatcher.cpython-312-aarch64-linux-gnu.so +0 -0
  20. numba_cuda/numba/cuda/cext/_hashtable.cpp +532 -0
  21. numba_cuda/numba/cuda/cext/_hashtable.h +135 -0
  22. numba_cuda/numba/cuda/cext/_helperlib.c +71 -0
  23. numba_cuda/numba/cuda/cext/_helperlib.cpython-312-aarch64-linux-gnu.so +0 -0
  24. numba_cuda/numba/cuda/cext/_helpermod.c +82 -0
  25. numba_cuda/numba/cuda/cext/_pymodule.h +38 -0
  26. numba_cuda/numba/cuda/cext/_typeconv.cpp +206 -0
  27. numba_cuda/numba/cuda/cext/_typeconv.cpython-312-aarch64-linux-gnu.so +0 -0
  28. numba_cuda/numba/cuda/cext/_typeof.cpp +1159 -0
  29. numba_cuda/numba/cuda/cext/_typeof.h +19 -0
  30. numba_cuda/numba/cuda/cext/capsulethunk.h +111 -0
  31. numba_cuda/numba/cuda/cext/mviewbuf.c +385 -0
  32. numba_cuda/numba/cuda/cext/mviewbuf.cpython-312-aarch64-linux-gnu.so +0 -0
  33. numba_cuda/numba/cuda/cext/typeconv.cpp +212 -0
  34. numba_cuda/numba/cuda/cext/typeconv.hpp +101 -0
  35. numba_cuda/numba/cuda/cg.py +67 -0
  36. numba_cuda/numba/cuda/cgutils.py +1294 -0
  37. numba_cuda/numba/cuda/cloudpickle/__init__.py +21 -0
  38. numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +1598 -0
  39. numba_cuda/numba/cuda/cloudpickle/cloudpickle_fast.py +17 -0
  40. numba_cuda/numba/cuda/codegen.py +541 -0
  41. numba_cuda/numba/cuda/compiler.py +1396 -0
  42. numba_cuda/numba/cuda/core/analysis.py +758 -0
  43. numba_cuda/numba/cuda/core/annotations/__init__.py +0 -0
  44. numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +288 -0
  45. numba_cuda/numba/cuda/core/annotations/type_annotations.py +305 -0
  46. numba_cuda/numba/cuda/core/base.py +1332 -0
  47. numba_cuda/numba/cuda/core/boxing.py +1411 -0
  48. numba_cuda/numba/cuda/core/bytecode.py +728 -0
  49. numba_cuda/numba/cuda/core/byteflow.py +2346 -0
  50. numba_cuda/numba/cuda/core/caching.py +744 -0
  51. numba_cuda/numba/cuda/core/callconv.py +392 -0
  52. numba_cuda/numba/cuda/core/codegen.py +171 -0
  53. numba_cuda/numba/cuda/core/compiler.py +199 -0
  54. numba_cuda/numba/cuda/core/compiler_lock.py +85 -0
  55. numba_cuda/numba/cuda/core/compiler_machinery.py +497 -0
  56. numba_cuda/numba/cuda/core/config.py +650 -0
  57. numba_cuda/numba/cuda/core/consts.py +124 -0
  58. numba_cuda/numba/cuda/core/controlflow.py +989 -0
  59. numba_cuda/numba/cuda/core/entrypoints.py +57 -0
  60. numba_cuda/numba/cuda/core/environment.py +66 -0
  61. numba_cuda/numba/cuda/core/errors.py +917 -0
  62. numba_cuda/numba/cuda/core/event.py +511 -0
  63. numba_cuda/numba/cuda/core/funcdesc.py +330 -0
  64. numba_cuda/numba/cuda/core/generators.py +387 -0
  65. numba_cuda/numba/cuda/core/imputils.py +509 -0
  66. numba_cuda/numba/cuda/core/inline_closurecall.py +1787 -0
  67. numba_cuda/numba/cuda/core/interpreter.py +3617 -0
  68. numba_cuda/numba/cuda/core/ir.py +1812 -0
  69. numba_cuda/numba/cuda/core/ir_utils.py +2638 -0
  70. numba_cuda/numba/cuda/core/optional.py +129 -0
  71. numba_cuda/numba/cuda/core/options.py +262 -0
  72. numba_cuda/numba/cuda/core/postproc.py +249 -0
  73. numba_cuda/numba/cuda/core/pythonapi.py +1859 -0
  74. numba_cuda/numba/cuda/core/registry.py +46 -0
  75. numba_cuda/numba/cuda/core/removerefctpass.py +123 -0
  76. numba_cuda/numba/cuda/core/rewrites/__init__.py +26 -0
  77. numba_cuda/numba/cuda/core/rewrites/ir_print.py +91 -0
  78. numba_cuda/numba/cuda/core/rewrites/registry.py +104 -0
  79. numba_cuda/numba/cuda/core/rewrites/static_binop.py +41 -0
  80. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +189 -0
  81. numba_cuda/numba/cuda/core/rewrites/static_raise.py +100 -0
  82. numba_cuda/numba/cuda/core/sigutils.py +68 -0
  83. numba_cuda/numba/cuda/core/ssa.py +498 -0
  84. numba_cuda/numba/cuda/core/targetconfig.py +330 -0
  85. numba_cuda/numba/cuda/core/tracing.py +231 -0
  86. numba_cuda/numba/cuda/core/transforms.py +956 -0
  87. numba_cuda/numba/cuda/core/typed_passes.py +867 -0
  88. numba_cuda/numba/cuda/core/typeinfer.py +1950 -0
  89. numba_cuda/numba/cuda/core/unsafe/__init__.py +0 -0
  90. numba_cuda/numba/cuda/core/unsafe/bytes.py +67 -0
  91. numba_cuda/numba/cuda/core/unsafe/eh.py +67 -0
  92. numba_cuda/numba/cuda/core/unsafe/refcount.py +98 -0
  93. numba_cuda/numba/cuda/core/untyped_passes.py +1979 -0
  94. numba_cuda/numba/cuda/cpython/builtins.py +1153 -0
  95. numba_cuda/numba/cuda/cpython/charseq.py +1218 -0
  96. numba_cuda/numba/cuda/cpython/cmathimpl.py +560 -0
  97. numba_cuda/numba/cuda/cpython/enumimpl.py +103 -0
  98. numba_cuda/numba/cuda/cpython/iterators.py +167 -0
  99. numba_cuda/numba/cuda/cpython/listobj.py +1326 -0
  100. numba_cuda/numba/cuda/cpython/mathimpl.py +499 -0
  101. numba_cuda/numba/cuda/cpython/numbers.py +1475 -0
  102. numba_cuda/numba/cuda/cpython/rangeobj.py +289 -0
  103. numba_cuda/numba/cuda/cpython/slicing.py +322 -0
  104. numba_cuda/numba/cuda/cpython/tupleobj.py +456 -0
  105. numba_cuda/numba/cuda/cpython/unicode.py +2865 -0
  106. numba_cuda/numba/cuda/cpython/unicode_support.py +1597 -0
  107. numba_cuda/numba/cuda/cpython/unsafe/__init__.py +0 -0
  108. numba_cuda/numba/cuda/cpython/unsafe/numbers.py +64 -0
  109. numba_cuda/numba/cuda/cpython/unsafe/tuple.py +92 -0
  110. numba_cuda/numba/cuda/cuda_paths.py +691 -0
  111. numba_cuda/numba/cuda/cudadecl.py +543 -0
  112. numba_cuda/numba/cuda/cudadrv/__init__.py +14 -0
  113. numba_cuda/numba/cuda/cudadrv/devicearray.py +954 -0
  114. numba_cuda/numba/cuda/cudadrv/devices.py +249 -0
  115. numba_cuda/numba/cuda/cudadrv/driver.py +3238 -0
  116. numba_cuda/numba/cuda/cudadrv/drvapi.py +435 -0
  117. numba_cuda/numba/cuda/cudadrv/dummyarray.py +562 -0
  118. numba_cuda/numba/cuda/cudadrv/enums.py +613 -0
  119. numba_cuda/numba/cuda/cudadrv/error.py +48 -0
  120. numba_cuda/numba/cuda/cudadrv/libs.py +220 -0
  121. numba_cuda/numba/cuda/cudadrv/linkable_code.py +184 -0
  122. numba_cuda/numba/cuda/cudadrv/mappings.py +14 -0
  123. numba_cuda/numba/cuda/cudadrv/ndarray.py +26 -0
  124. numba_cuda/numba/cuda/cudadrv/nvrtc.py +193 -0
  125. numba_cuda/numba/cuda/cudadrv/nvvm.py +756 -0
  126. numba_cuda/numba/cuda/cudadrv/rtapi.py +13 -0
  127. numba_cuda/numba/cuda/cudadrv/runtime.py +34 -0
  128. numba_cuda/numba/cuda/cudaimpl.py +983 -0
  129. numba_cuda/numba/cuda/cudamath.py +149 -0
  130. numba_cuda/numba/cuda/datamodel/__init__.py +7 -0
  131. numba_cuda/numba/cuda/datamodel/cuda_manager.py +66 -0
  132. numba_cuda/numba/cuda/datamodel/cuda_models.py +1446 -0
  133. numba_cuda/numba/cuda/datamodel/cuda_packer.py +224 -0
  134. numba_cuda/numba/cuda/datamodel/cuda_registry.py +22 -0
  135. numba_cuda/numba/cuda/datamodel/cuda_testing.py +153 -0
  136. numba_cuda/numba/cuda/datamodel/manager.py +11 -0
  137. numba_cuda/numba/cuda/datamodel/models.py +9 -0
  138. numba_cuda/numba/cuda/datamodel/packer.py +9 -0
  139. numba_cuda/numba/cuda/datamodel/registry.py +11 -0
  140. numba_cuda/numba/cuda/datamodel/testing.py +11 -0
  141. numba_cuda/numba/cuda/debuginfo.py +997 -0
  142. numba_cuda/numba/cuda/decorators.py +294 -0
  143. numba_cuda/numba/cuda/descriptor.py +35 -0
  144. numba_cuda/numba/cuda/device_init.py +155 -0
  145. numba_cuda/numba/cuda/deviceufunc.py +1021 -0
  146. numba_cuda/numba/cuda/dispatcher.py +2463 -0
  147. numba_cuda/numba/cuda/errors.py +72 -0
  148. numba_cuda/numba/cuda/extending.py +697 -0
  149. numba_cuda/numba/cuda/flags.py +178 -0
  150. numba_cuda/numba/cuda/fp16.py +357 -0
  151. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  152. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  153. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  154. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  155. numba_cuda/numba/cuda/include/13/cuda_bf16.h +5118 -0
  156. numba_cuda/numba/cuda/include/13/cuda_bf16.hpp +3865 -0
  157. numba_cuda/numba/cuda/include/13/cuda_fp16.h +5363 -0
  158. numba_cuda/numba/cuda/include/13/cuda_fp16.hpp +3483 -0
  159. numba_cuda/numba/cuda/initialize.py +24 -0
  160. numba_cuda/numba/cuda/intrinsics.py +531 -0
  161. numba_cuda/numba/cuda/itanium_mangler.py +214 -0
  162. numba_cuda/numba/cuda/kernels/__init__.py +2 -0
  163. numba_cuda/numba/cuda/kernels/reduction.py +265 -0
  164. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  165. numba_cuda/numba/cuda/libdevice.py +3386 -0
  166. numba_cuda/numba/cuda/libdevicedecl.py +20 -0
  167. numba_cuda/numba/cuda/libdevicefuncs.py +1060 -0
  168. numba_cuda/numba/cuda/libdeviceimpl.py +88 -0
  169. numba_cuda/numba/cuda/locks.py +19 -0
  170. numba_cuda/numba/cuda/lowering.py +1980 -0
  171. numba_cuda/numba/cuda/mathimpl.py +374 -0
  172. numba_cuda/numba/cuda/memory_management/__init__.py +4 -0
  173. numba_cuda/numba/cuda/memory_management/memsys.cu +99 -0
  174. numba_cuda/numba/cuda/memory_management/memsys.cuh +22 -0
  175. numba_cuda/numba/cuda/memory_management/nrt.cu +212 -0
  176. numba_cuda/numba/cuda/memory_management/nrt.cuh +48 -0
  177. numba_cuda/numba/cuda/memory_management/nrt.py +390 -0
  178. numba_cuda/numba/cuda/memory_management/nrt_context.py +438 -0
  179. numba_cuda/numba/cuda/misc/appdirs.py +594 -0
  180. numba_cuda/numba/cuda/misc/cffiimpl.py +24 -0
  181. numba_cuda/numba/cuda/misc/coverage_support.py +43 -0
  182. numba_cuda/numba/cuda/misc/dump_style.py +41 -0
  183. numba_cuda/numba/cuda/misc/findlib.py +75 -0
  184. numba_cuda/numba/cuda/misc/firstlinefinder.py +96 -0
  185. numba_cuda/numba/cuda/misc/gdb_hook.py +240 -0
  186. numba_cuda/numba/cuda/misc/literal.py +28 -0
  187. numba_cuda/numba/cuda/misc/llvm_pass_timings.py +412 -0
  188. numba_cuda/numba/cuda/misc/special.py +94 -0
  189. numba_cuda/numba/cuda/models.py +56 -0
  190. numba_cuda/numba/cuda/np/arraymath.py +5130 -0
  191. numba_cuda/numba/cuda/np/arrayobj.py +7635 -0
  192. numba_cuda/numba/cuda/np/extensions.py +11 -0
  193. numba_cuda/numba/cuda/np/linalg.py +3087 -0
  194. numba_cuda/numba/cuda/np/math/__init__.py +0 -0
  195. numba_cuda/numba/cuda/np/math/cmathimpl.py +558 -0
  196. numba_cuda/numba/cuda/np/math/mathimpl.py +487 -0
  197. numba_cuda/numba/cuda/np/math/numbers.py +1461 -0
  198. numba_cuda/numba/cuda/np/npdatetime.py +969 -0
  199. numba_cuda/numba/cuda/np/npdatetime_helpers.py +217 -0
  200. numba_cuda/numba/cuda/np/npyfuncs.py +1808 -0
  201. numba_cuda/numba/cuda/np/npyimpl.py +1027 -0
  202. numba_cuda/numba/cuda/np/numpy_support.py +798 -0
  203. numba_cuda/numba/cuda/np/polynomial/__init__.py +4 -0
  204. numba_cuda/numba/cuda/np/polynomial/polynomial_core.py +242 -0
  205. numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +380 -0
  206. numba_cuda/numba/cuda/np/ufunc/__init__.py +4 -0
  207. numba_cuda/numba/cuda/np/ufunc/decorators.py +203 -0
  208. numba_cuda/numba/cuda/np/ufunc/sigparse.py +68 -0
  209. numba_cuda/numba/cuda/np/ufunc/ufuncbuilder.py +65 -0
  210. numba_cuda/numba/cuda/np/ufunc_db.py +1282 -0
  211. numba_cuda/numba/cuda/np/unsafe/__init__.py +0 -0
  212. numba_cuda/numba/cuda/np/unsafe/ndarray.py +84 -0
  213. numba_cuda/numba/cuda/nvvmutils.py +254 -0
  214. numba_cuda/numba/cuda/printimpl.py +126 -0
  215. numba_cuda/numba/cuda/random.py +308 -0
  216. numba_cuda/numba/cuda/reshape_funcs.cu +156 -0
  217. numba_cuda/numba/cuda/serialize.py +267 -0
  218. numba_cuda/numba/cuda/simulator/__init__.py +63 -0
  219. numba_cuda/numba/cuda/simulator/_internal/__init__.py +4 -0
  220. numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +2 -0
  221. numba_cuda/numba/cuda/simulator/api.py +179 -0
  222. numba_cuda/numba/cuda/simulator/bf16.py +4 -0
  223. numba_cuda/numba/cuda/simulator/compiler.py +38 -0
  224. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +11 -0
  225. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +462 -0
  226. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +122 -0
  227. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +66 -0
  228. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +7 -0
  229. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +7 -0
  230. numba_cuda/numba/cuda/simulator/cudadrv/error.py +10 -0
  231. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +10 -0
  232. numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +61 -0
  233. numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +11 -0
  234. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +32 -0
  235. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +22 -0
  236. numba_cuda/numba/cuda/simulator/dispatcher.py +11 -0
  237. numba_cuda/numba/cuda/simulator/kernel.py +320 -0
  238. numba_cuda/numba/cuda/simulator/kernelapi.py +509 -0
  239. numba_cuda/numba/cuda/simulator/memory_management/__init__.py +4 -0
  240. numba_cuda/numba/cuda/simulator/memory_management/nrt.py +21 -0
  241. numba_cuda/numba/cuda/simulator/reduction.py +19 -0
  242. numba_cuda/numba/cuda/simulator/tests/support.py +4 -0
  243. numba_cuda/numba/cuda/simulator/vector_types.py +65 -0
  244. numba_cuda/numba/cuda/simulator_init.py +18 -0
  245. numba_cuda/numba/cuda/stubs.py +624 -0
  246. numba_cuda/numba/cuda/target.py +505 -0
  247. numba_cuda/numba/cuda/testing.py +347 -0
  248. numba_cuda/numba/cuda/tests/__init__.py +62 -0
  249. numba_cuda/numba/cuda/tests/benchmarks/__init__.py +0 -0
  250. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +119 -0
  251. numba_cuda/numba/cuda/tests/cloudpickle_main_class.py +9 -0
  252. numba_cuda/numba/cuda/tests/core/serialize_usecases.py +113 -0
  253. numba_cuda/numba/cuda/tests/core/test_itanium_mangler.py +83 -0
  254. numba_cuda/numba/cuda/tests/core/test_serialize.py +371 -0
  255. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +9 -0
  256. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +147 -0
  257. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +161 -0
  258. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +397 -0
  259. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +24 -0
  260. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +180 -0
  261. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +313 -0
  262. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +191 -0
  263. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +621 -0
  264. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +247 -0
  265. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +100 -0
  266. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +200 -0
  267. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +53 -0
  268. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +72 -0
  269. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +138 -0
  270. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +43 -0
  271. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +15 -0
  272. numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py +58 -0
  273. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +348 -0
  274. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +128 -0
  275. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +301 -0
  276. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +174 -0
  277. numba_cuda/numba/cuda/tests/cudadrv/test_nvrtc.py +28 -0
  278. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +185 -0
  279. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +39 -0
  280. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +23 -0
  281. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +38 -0
  282. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +48 -0
  283. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +44 -0
  284. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +127 -0
  285. numba_cuda/numba/cuda/tests/cudapy/__init__.py +9 -0
  286. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +231 -0
  287. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +50 -0
  288. numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +36 -0
  289. numba_cuda/numba/cuda/tests/cudapy/complex_usecases.py +116 -0
  290. numba_cuda/numba/cuda/tests/cudapy/enum_usecases.py +59 -0
  291. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +62 -0
  292. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +28 -0
  293. numba_cuda/numba/cuda/tests/cudapy/overload_usecases.py +33 -0
  294. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +104 -0
  295. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +47 -0
  296. numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +1122 -0
  297. numba_cuda/numba/cuda/tests/cudapy/test_array.py +344 -0
  298. numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +268 -0
  299. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +203 -0
  300. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +63 -0
  301. numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +360 -0
  302. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1815 -0
  303. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +599 -0
  304. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +377 -0
  305. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +160 -0
  306. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +27 -0
  307. numba_cuda/numba/cuda/tests/cudapy/test_byteflow.py +98 -0
  308. numba_cuda/numba/cuda/tests/cudapy/test_cache_hints.py +210 -0
  309. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +683 -0
  310. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +265 -0
  311. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +42 -0
  312. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +718 -0
  313. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +370 -0
  314. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +23 -0
  315. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +142 -0
  316. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +178 -0
  317. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +193 -0
  318. numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +131 -0
  319. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +438 -0
  320. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +94 -0
  321. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +101 -0
  322. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +105 -0
  323. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +978 -0
  324. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo_types.py +476 -0
  325. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +500 -0
  326. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +820 -0
  327. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +152 -0
  328. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +111 -0
  329. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +170 -0
  330. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1088 -0
  331. numba_cuda/numba/cuda/tests/cudapy/test_extending_types.py +71 -0
  332. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +265 -0
  333. numba_cuda/numba/cuda/tests/cudapy/test_flow_control.py +1433 -0
  334. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +57 -0
  335. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +34 -0
  336. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +69 -0
  337. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +62 -0
  338. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +474 -0
  339. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +167 -0
  340. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +92 -0
  341. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +39 -0
  342. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +170 -0
  343. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +255 -0
  344. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1219 -0
  345. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +263 -0
  346. numba_cuda/numba/cuda/tests/cudapy/test_ir.py +598 -0
  347. numba_cuda/numba/cuda/tests/cudapy/test_ir_utils.py +276 -0
  348. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +101 -0
  349. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +68 -0
  350. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +123 -0
  351. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +194 -0
  352. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +220 -0
  353. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +173 -0
  354. numba_cuda/numba/cuda/tests/cudapy/test_make_function_to_jit_function.py +364 -0
  355. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +47 -0
  356. numba_cuda/numba/cuda/tests/cudapy/test_math.py +842 -0
  357. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +76 -0
  358. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +78 -0
  359. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +25 -0
  360. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +145 -0
  361. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +39 -0
  362. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +82 -0
  363. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +53 -0
  364. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +504 -0
  365. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +93 -0
  366. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +402 -0
  367. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +128 -0
  368. numba_cuda/numba/cuda/tests/cudapy/test_print.py +193 -0
  369. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +37 -0
  370. numba_cuda/numba/cuda/tests/cudapy/test_random.py +117 -0
  371. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +614 -0
  372. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +130 -0
  373. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +94 -0
  374. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  375. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +86 -0
  376. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +40 -0
  377. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +457 -0
  378. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +233 -0
  379. numba_cuda/numba/cuda/tests/cudapy/test_ssa.py +454 -0
  380. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +56 -0
  381. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +277 -0
  382. numba_cuda/numba/cuda/tests/cudapy/test_tracing.py +200 -0
  383. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +90 -0
  384. numba_cuda/numba/cuda/tests/cudapy/test_typeconv.py +333 -0
  385. numba_cuda/numba/cuda/tests/cudapy/test_typeinfer.py +538 -0
  386. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +585 -0
  387. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +42 -0
  388. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +485 -0
  389. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +312 -0
  390. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +23 -0
  391. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +183 -0
  392. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +40 -0
  393. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +40 -0
  394. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +206 -0
  395. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +446 -0
  396. numba_cuda/numba/cuda/tests/cudasim/__init__.py +9 -0
  397. numba_cuda/numba/cuda/tests/cudasim/support.py +9 -0
  398. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +111 -0
  399. numba_cuda/numba/cuda/tests/data/__init__.py +2 -0
  400. numba_cuda/numba/cuda/tests/data/cta_barrier.cu +28 -0
  401. numba_cuda/numba/cuda/tests/data/cuda_include.cu +10 -0
  402. numba_cuda/numba/cuda/tests/data/error.cu +12 -0
  403. numba_cuda/numba/cuda/tests/data/include/add.cuh +8 -0
  404. numba_cuda/numba/cuda/tests/data/jitlink.cu +28 -0
  405. numba_cuda/numba/cuda/tests/data/jitlink.ptx +49 -0
  406. numba_cuda/numba/cuda/tests/data/warn.cu +12 -0
  407. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +9 -0
  408. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +2 -0
  409. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +54 -0
  410. numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +8 -0
  411. numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +14 -0
  412. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +86 -0
  413. numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +68 -0
  414. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +81 -0
  415. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +141 -0
  416. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +160 -0
  417. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +180 -0
  418. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +119 -0
  419. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +66 -0
  420. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +80 -0
  421. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +206 -0
  422. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +53 -0
  423. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +76 -0
  424. numba_cuda/numba/cuda/tests/nocuda/__init__.py +9 -0
  425. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +452 -0
  426. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +48 -0
  427. numba_cuda/numba/cuda/tests/nocuda/test_import.py +63 -0
  428. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +252 -0
  429. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +59 -0
  430. numba_cuda/numba/cuda/tests/nrt/__init__.py +9 -0
  431. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +387 -0
  432. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +124 -0
  433. numba_cuda/numba/cuda/tests/support.py +900 -0
  434. numba_cuda/numba/cuda/typeconv/__init__.py +4 -0
  435. numba_cuda/numba/cuda/typeconv/castgraph.py +137 -0
  436. numba_cuda/numba/cuda/typeconv/rules.py +63 -0
  437. numba_cuda/numba/cuda/typeconv/typeconv.py +121 -0
  438. numba_cuda/numba/cuda/types/__init__.py +233 -0
  439. numba_cuda/numba/cuda/types/__init__.pyi +167 -0
  440. numba_cuda/numba/cuda/types/abstract.py +9 -0
  441. numba_cuda/numba/cuda/types/common.py +9 -0
  442. numba_cuda/numba/cuda/types/containers.py +9 -0
  443. numba_cuda/numba/cuda/types/cuda_abstract.py +533 -0
  444. numba_cuda/numba/cuda/types/cuda_common.py +110 -0
  445. numba_cuda/numba/cuda/types/cuda_containers.py +971 -0
  446. numba_cuda/numba/cuda/types/cuda_function_type.py +230 -0
  447. numba_cuda/numba/cuda/types/cuda_functions.py +798 -0
  448. numba_cuda/numba/cuda/types/cuda_iterators.py +120 -0
  449. numba_cuda/numba/cuda/types/cuda_misc.py +569 -0
  450. numba_cuda/numba/cuda/types/cuda_npytypes.py +690 -0
  451. numba_cuda/numba/cuda/types/cuda_scalars.py +280 -0
  452. numba_cuda/numba/cuda/types/ext_types.py +101 -0
  453. numba_cuda/numba/cuda/types/function_type.py +11 -0
  454. numba_cuda/numba/cuda/types/functions.py +9 -0
  455. numba_cuda/numba/cuda/types/iterators.py +9 -0
  456. numba_cuda/numba/cuda/types/misc.py +9 -0
  457. numba_cuda/numba/cuda/types/npytypes.py +9 -0
  458. numba_cuda/numba/cuda/types/scalars.py +9 -0
  459. numba_cuda/numba/cuda/typing/__init__.py +19 -0
  460. numba_cuda/numba/cuda/typing/arraydecl.py +939 -0
  461. numba_cuda/numba/cuda/typing/asnumbatype.py +130 -0
  462. numba_cuda/numba/cuda/typing/bufproto.py +70 -0
  463. numba_cuda/numba/cuda/typing/builtins.py +1209 -0
  464. numba_cuda/numba/cuda/typing/cffi_utils.py +219 -0
  465. numba_cuda/numba/cuda/typing/cmathdecl.py +47 -0
  466. numba_cuda/numba/cuda/typing/collections.py +138 -0
  467. numba_cuda/numba/cuda/typing/context.py +782 -0
  468. numba_cuda/numba/cuda/typing/ctypes_utils.py +125 -0
  469. numba_cuda/numba/cuda/typing/dictdecl.py +63 -0
  470. numba_cuda/numba/cuda/typing/enumdecl.py +74 -0
  471. numba_cuda/numba/cuda/typing/listdecl.py +147 -0
  472. numba_cuda/numba/cuda/typing/mathdecl.py +158 -0
  473. numba_cuda/numba/cuda/typing/npdatetime.py +322 -0
  474. numba_cuda/numba/cuda/typing/npydecl.py +749 -0
  475. numba_cuda/numba/cuda/typing/setdecl.py +115 -0
  476. numba_cuda/numba/cuda/typing/templates.py +1446 -0
  477. numba_cuda/numba/cuda/typing/typeof.py +301 -0
  478. numba_cuda/numba/cuda/ufuncs.py +746 -0
  479. numba_cuda/numba/cuda/utils.py +724 -0
  480. numba_cuda/numba/cuda/vector_types.py +214 -0
  481. numba_cuda/numba/cuda/vectorizers.py +260 -0
  482. numba_cuda-0.22.0.dist-info/METADATA +109 -0
  483. numba_cuda-0.22.0.dist-info/RECORD +487 -0
  484. numba_cuda-0.22.0.dist-info/WHEEL +6 -0
  485. numba_cuda-0.22.0.dist-info/licenses/LICENSE +26 -0
  486. numba_cuda-0.22.0.dist-info/licenses/LICENSE.numba +24 -0
  487. numba_cuda-0.22.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,24 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+
5
def initialize_all():
    """Register the CUDA data models and, if Numba is available, the CUDA target.

    The models module is always imported. When ``HAS_NUMBA`` is false the
    function stops there; otherwise the CUDA ``jit`` decorator and
    ``CUDADispatcher`` are stored in Numba's ``jit_registry`` and
    ``dispatcher_registry`` under the target looked up as ``"cuda"``.
    """
    # Imported only for its side effect: the models register themselves
    # with the data model manager.
    import numba.cuda.models  # noqa: F401

    from numba.cuda import HAS_NUMBA

    if HAS_NUMBA:
        # These imports are deferred until Numba is known to be present.
        from numba.cuda.decorators import jit as cuda_jit
        from numba.cuda.dispatcher import CUDADispatcher
        from numba.core import target_extension as target_ext

        target = target_ext.target_registry["cuda"]
        target_ext.jit_registry[target] = cuda_jit
        target_ext.dispatcher_registry[target] = CUDADispatcher
@@ -0,0 +1,531 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+ from llvmlite import ir
5
+
6
+ from numba import cuda
7
+ from numba.cuda import types
8
+ from numba.cuda import cgutils
9
+ from numba.cuda.core.errors import (
10
+ RequireLiteralValue,
11
+ TypingError,
12
+ NumbaTypeError,
13
+ )
14
+ from numba.cuda.typing import signature
15
+ from numba.cuda.extending import overload_attribute, overload_method
16
+ from numba.cuda import nvvmutils
17
+ from numba.cuda.extending import intrinsic
18
+
19
+
20
+ # -------------------------------------------------------------------------------
21
+ # Grid functions
22
+
23
+
24
def _type_grid_function(ndim):
    """Return the typing signature shared by ``grid`` and ``gridsize``.

    *ndim* is an integer literal type. A literal value of 1 yields an
    ``int64`` return type; 2 or 3 yields a ``UniTuple`` of that many
    ``int64`` values. Any other value raises ``ValueError``.
    """
    dims = ndim.literal_value
    if dims not in (1, 2, 3):
        raise ValueError("argument can only be 1, 2, 3")

    result = types.int64 if dims == 1 else types.UniTuple(types.int64, dims)
    return signature(result, types.int32)
34
+
35
+
36
@intrinsic
def grid(typingctx, ndim):
    """grid(ndim)

    Return the absolute position of the current thread in the entire grid of
    blocks. *ndim* should correspond to the number of dimensions declared when
    instantiating the kernel. A single integer is returned when *ndim* is 1;
    a tuple of *ndim* integers is returned when *ndim* is 2 or 3.

    The first integer is computed as::

        cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x

    The remaining indices are computed the same way from the ``y`` and ``z``
    attributes.
    """
    # The return type depends on the value of ndim, so it has to be a literal.
    if not isinstance(ndim, types.IntegerLiteral):
        raise RequireLiteralValue(ndim)

    def codegen(context, builder, sig, args):
        ret = sig.return_type
        if ret == types.int64:
            return nvvmutils.get_global_id(builder, dim=1)
        if isinstance(ret, types.UniTuple):
            return cgutils.pack_array(
                builder, nvvmutils.get_global_id(builder, dim=ret.count)
            )

    return _type_grid_function(ndim), codegen
67
+
68
+
69
@intrinsic
def gridsize(typingctx, ndim):
    """gridsize(ndim)

    Return the absolute size (or shape) in threads of the entire grid of
    blocks. *ndim* should correspond to the number of dimensions declared when
    instantiating the kernel. A single integer is returned when *ndim* is 1;
    a tuple of *ndim* integers is returned when *ndim* is 2 or 3.

    The first integer is computed as::

        cuda.blockDim.x * cuda.gridDim.x

    The remaining sizes are computed the same way from the ``y`` and ``z``
    attributes.
    """
    # The return type depends on the value of ndim, so it has to be a literal.
    if not isinstance(ndim, types.IntegerLiteral):
        raise RequireLiteralValue(ndim)

    def _nthreads_for_dim(builder, dim):
        # Widen both special registers to 64 bits before multiplying.
        wide = ir.IntType(64)
        block_extent = nvvmutils.call_sreg(builder, f"ntid.{dim}")
        grid_extent = nvvmutils.call_sreg(builder, f"nctaid.{dim}")
        block_extent_wide = builder.sext(block_extent, wide)
        grid_extent_wide = builder.sext(grid_extent, wide)
        return builder.mul(block_extent_wide, grid_extent_wide)

    def codegen(context, builder, sig, args):
        ret = sig.return_type
        size_x = _nthreads_for_dim(builder, "x")
        if ret == types.int64:
            return size_x
        if not isinstance(ret, types.UniTuple):
            return None

        size_y = _nthreads_for_dim(builder, "y")
        if ret.count == 2:
            return cgutils.pack_array(builder, (size_x, size_y))
        if ret.count == 3:
            size_z = _nthreads_for_dim(builder, "z")
            return cgutils.pack_array(builder, (size_x, size_y, size_z))

    return _type_grid_function(ndim), codegen
113
+
114
+
115
@intrinsic
def _warpsize(typingctx):
    """Intrinsic that reads the ``warpsize`` special register as an int32."""

    def codegen(context, builder, sig, args):
        return nvvmutils.call_sreg(builder, "warpsize")

    return signature(types.int32), codegen
123
+
124
+
125
@overload_attribute(types.Module(cuda), "warpsize", target="cuda")
def cuda_warpsize(mod):
    """
    The size of a warp. All architectures implemented to date have a warp size
    of 32.
    """
    # The attribute read is implemented by calling the _warpsize intrinsic.
    return lambda mod: _warpsize()
136
+
137
+
138
+ # -------------------------------------------------------------------------------
139
+ # syncthreads
140
+
141
+
142
@intrinsic
def syncthreads(typingctx):
    """
    Synchronize all threads in the same thread block. This works like a
    barrier in traditional multi-threaded programming: the call waits until
    every thread in the block has reached it, and then returns control to all
    of its callers.
    """

    def codegen(context, builder, sig, args):
        # The barrier intrinsic takes no arguments and returns nothing.
        barrier_ty = ir.FunctionType(ir.VoidType(), ())
        barrier = cgutils.get_or_insert_function(
            builder.module, barrier_ty, "llvm.nvvm.barrier0"
        )
        builder.call(barrier, ())
        return context.get_dummy_value()

    return signature(types.none), codegen
161
+
162
+
163
def _syncthreads_predicate(typingctx, predicate, fname):
    """Shared typing and codegen for the predicated ``syncthreads_*`` variants.

    *fname* is the name of the intrinsic function to call. It is declared as
    taking one i32 and returning an i32. Returns ``None`` when *predicate* is
    not an integer type, otherwise a ``(signature, codegen)`` pair.
    """
    if not isinstance(predicate, types.Integer):
        return None

    def codegen(context, builder, sig, args):
        i32 = ir.IntType(32)
        barrier = cgutils.get_or_insert_function(
            builder.module, ir.FunctionType(i32, (i32,)), fname
        )
        return builder.call(barrier, args)

    return signature(types.i4, types.i4), codegen
175
+
176
+
177
@intrinsic
def syncthreads_count(typingctx, predicate):
    """
    syncthreads_count(predicate)

    An extension to numba.cuda.syncthreads that returns the number of threads
    for which predicate is true.
    """
    return _syncthreads_predicate(
        typingctx, predicate, "llvm.nvvm.barrier0.popc"
    )
187
+
188
+
189
@intrinsic
def syncthreads_and(typingctx, predicate):
    """
    syncthreads_and(predicate)

    An extension to numba.cuda.syncthreads that returns 1 if predicate is true
    for all threads, or 0 otherwise.
    """
    return _syncthreads_predicate(
        typingctx, predicate, "llvm.nvvm.barrier0.and"
    )
199
+
200
+
201
@intrinsic
def syncthreads_or(typingctx, predicate):
    """
    syncthreads_or(predicate)

    An extension to numba.cuda.syncthreads that returns 1 if predicate is true
    for any thread, or 0 otherwise.
    """
    return _syncthreads_predicate(
        typingctx, predicate, "llvm.nvvm.barrier0.or"
    )
211
+
212
+
213
@overload_method(types.Integer, "bit_count", target="cuda")
def integer_bit_count(i):
    """Implement ``int.bit_count()`` on the CUDA target via ``cuda.popc``."""

    def impl(i):
        return cuda.popc(i)

    return impl
216
+
217
+
218
+ # -------------------------------------------------------------------------------
219
+ # Warp shuffle functions
220
+ #
221
+ # References:
222
+ #
223
+ # - https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-shuffle-functions
224
+ # - https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#data-movement
225
+ #
226
+ # Notes:
227
+ #
228
+ # - The public CUDA C/C++ and Numba Python APIs for these intrinsics use
229
+ # different names for parameters to the NVVM IR specification. So that we
230
+ # can correlate the implementation with the documentation, the @intrinsic
231
+ # API functions map the public API arguments to the NVVM intrinsic
232
+ # arguments.
233
+ # - The NVVM IR specification requires some of the parameters (e.g. mode) to be
234
+ # constants. It's therefore essential that we pass in some values to the
235
+ # shfl_sync_intrinsic function (e.g. the mode and c values).
236
+ # - Normally parameters for intrinsic functions in Numba would be given the
237
+ # same name as used in the API, and would contain a type. However, because we
238
+ # have to pass in some values and some types (and there is divergence between
239
+ # the names in the intrinsic documentation and the public APIs) we instead
240
+ # follow the convention of naming shfl_sync_intrinsic parameters with a
241
+ # suffix of _type or _value depending on whether they contain a type or a
242
+ # value.
243
+
244
+
245
@intrinsic
def shfl_sync(typingctx, mask, value, src_lane):
    """
    Shuffles ``value`` across the masked warp and returns the value from
    ``src_lane``. If this is outside the warp, then the given value is
    returned.
    """
    # Public API names map onto the NVVM intrinsic's parameter names; mode 0
    # with c = 0x1F is passed through as constants.
    return shfl_sync_intrinsic(
        typingctx,
        membermask_type=mask,
        mode_value=0,
        a_type=value,
        b_type=src_lane,
        c_value=0x1F,
    )
260
+
261
+
262
@intrinsic
def shfl_up_sync(typingctx, mask, value, delta):
    """
    Shuffles ``value`` across the masked warp and returns the value from
    ``(laneid - delta)``. If this is outside the warp, then the given value is
    returned.
    """
    # Public API names map onto the NVVM intrinsic's parameter names; mode 1
    # with c = 0 is passed through as constants.
    return shfl_sync_intrinsic(
        typingctx,
        membermask_type=mask,
        mode_value=1,
        a_type=value,
        b_type=delta,
        c_value=0,
    )
277
+
278
+
279
@intrinsic
def shfl_down_sync(typingctx, mask, value, delta):
    """
    Shuffles ``value`` across the masked warp and returns the value from
    ``(laneid + delta)``. If this is outside the warp, then the given value is
    returned.
    """
    # Public API names map onto the NVVM intrinsic's parameter names; mode 2
    # with c = 0x1F is passed through as constants.
    return shfl_sync_intrinsic(
        typingctx,
        membermask_type=mask,
        mode_value=2,
        a_type=value,
        b_type=delta,
        c_value=0x1F,
    )
294
+
295
+
296
@intrinsic
def shfl_xor_sync(typingctx, mask, value, lane_mask):
    """
    Shuffles ``value`` across the masked warp and returns the value from
    ``(laneid ^ lane_mask)``.
    """
    # Public API names map onto the NVVM intrinsic's parameter names; mode 3
    # with c = 0x1F is passed through as constants.
    return shfl_sync_intrinsic(
        typingctx,
        membermask_type=mask,
        mode_value=3,
        a_type=value,
        b_type=lane_mask,
        c_value=0x1F,
    )
310
+
311
+
312
def shfl_sync_intrinsic(
    typingctx,
    membermask_type,
    mode_value,
    a_type,
    b_type,
    c_value,
):
    """Build the signature and code generator for a warp shuffle.

    ``membermask_type``, ``a_type`` and ``b_type`` are Numba types, whereas
    ``mode_value`` and ``c_value`` are Python integers that are emitted as
    i32 constants in the generated call.

    Raises ``TypingError`` unless ``a_type`` is one of i4, i8, f4 or f8.
    Returns a ``(signature, codegen)`` pair whose return type is ``a_type``.
    """
    supported = (types.i4, types.i8, types.f4, types.f8)
    if a_type not in supported:
        raise TypingError(
            "shfl_sync only supports 32- and 64-bit ints and floats"
        )

    def codegen(context, builder, sig, args):
        """
        The NVVM shfl_sync intrinsic only operates on i32, but the CUDA C/C++
        intrinsic accepts 32- and 64-bit ints and floats. For feature parity
        i32, i64, f32 and f64 are all handled here: floats are bitcast to an
        int, shuffled, and bitcast back."""
        mask, value, lane = args

        value_ty = sig.args[1]
        result_ty = context.get_value_type(sig.return_type)
        word = ir.IntType(32)
        dword = ir.IntType(64)

        # Floats travel through the shuffle as same-width integers.
        if value_ty in types.real_domain:
            value = builder.bitcast(value, ir.IntType(value_ty.bitwidth))

        # Declaration of the NVVM intrinsic: five i32 in, {i32, i1} out.
        intrinsic_ty = ir.FunctionType(
            ir.LiteralStructType((word, ir.IntType(1))),
            (word, word, word, word, word),
        )
        shuffle = cgutils.get_or_insert_function(
            builder.module, intrinsic_ty, "llvm.nvvm.shfl.sync.i32"
        )

        # Arguments handed to the intrinsic
        mode = ir.Constant(word, mode_value)
        c_const = ir.Constant(word, c_value)
        mask = builder.trunc(mask, word)
        lane = builder.trunc(lane, word)

        if value_ty.bitwidth != 32:
            # A 64-bit value is shuffled as two 32-bit halves which are then
            # reassembled into a single 64-bit result.
            low = builder.trunc(value, word)
            shifted = builder.lshr(value, ir.Constant(dword, 32))
            high = builder.trunc(shifted, word)

            low_ret = builder.call(shuffle, (mask, mode, low, lane, c_const))
            high_ret = builder.call(shuffle, (mask, mode, high, lane, c_const))

            low_out = builder.extract_value(low_ret, 0)
            high_out = builder.extract_value(high_ret, 0)
            low_wide = builder.zext(low_out, dword)
            high_wide = builder.zext(high_out, dword)
            high_placed = builder.shl(high_wide, ir.Constant(dword, 32))
            shuffled = builder.or_(high_placed, low_wide)
        else:
            value = builder.trunc(value, word)
            call_ret = builder.call(
                shuffle, (mask, mode, value, lane, c_const)
            )
            shuffled = builder.extract_value(call_ret, 0)

        return builder.bitcast(shuffled, result_ty)

    return signature(a_type, membermask_type, a_type, b_type), codegen
387
+
388
+
389
+ # -------------------------------------------------------------------------------
390
+ # Warp vote functions
391
+ #
392
+ # References:
393
+ #
394
+ # - https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-vote-functions
395
+ # - https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html?highlight=data%2520movement#vote
396
+ #
397
+ # Notes:
398
+ #
399
+ # - The NVVM IR specification requires the mode parameter to be a
400
+ # constant. It's therefore essential that we pass in mode values to the
401
+ # vote_sync_intrinsic.
402
+
403
+
404
@intrinsic
def all_sync(typingctx, mask_type, predicate_type):
    """
    If for all threads in the masked warp the predicate is true, then
    a non-zero value is returned, otherwise 0 is returned.
    """
    vote_sig, vote_codegen = vote_sync_intrinsic(
        typingctx, mask_type, 0, predicate_type
    )

    def codegen(context, builder, call_sig, args):
        # The vote result is an (i32, i1) pair; index 1 is the boolean part.
        vote = vote_codegen(context, builder, vote_sig, args)
        return builder.extract_value(vote, 1)

    return signature(types.b1, mask_type, predicate_type), codegen
422
+
423
+
424
@intrinsic
def any_sync(typingctx, mask_type, predicate_type):
    """
    If for any thread in the masked warp the predicate is true, then
    a non-zero value is returned, otherwise 0 is returned.
    """
    vote_sig, vote_codegen = vote_sync_intrinsic(
        typingctx, mask_type, 1, predicate_type
    )

    def codegen(context, builder, call_sig, args):
        # The vote result is an (i32, i1) pair; index 1 is the boolean part.
        vote = vote_codegen(context, builder, vote_sig, args)
        return builder.extract_value(vote, 1)

    return signature(types.b1, mask_type, predicate_type), codegen
441
+
442
+
443
@intrinsic
def eq_sync(typingctx, mask_type, predicate_type):
    """
    If for all threads in the masked warp the boolean predicate is the same,
    then a non-zero value is returned, otherwise 0 is returned.
    """
    vote_sig, vote_codegen = vote_sync_intrinsic(
        typingctx, mask_type, 2, predicate_type
    )

    def codegen(context, builder, call_sig, args):
        # The vote result is an (i32, i1) pair; index 1 is the boolean part.
        vote = vote_codegen(context, builder, vote_sig, args)
        return builder.extract_value(vote, 1)

    return signature(types.b1, mask_type, predicate_type), codegen
460
+
461
+
462
@intrinsic
def ballot_sync(typingctx, mask_type, predicate_type):
    """
    Returns a mask of all threads in the warp whose predicate is true,
    and are within the given mask.
    """
    vote_sig, vote_codegen = vote_sync_intrinsic(
        typingctx, mask_type, 3, predicate_type
    )

    def codegen(context, builder, call_sig, args):
        # The vote result is an (i32, i1) pair; index 0 is the ballot mask.
        vote = vote_codegen(context, builder, vote_sig, args)
        return builder.extract_value(vote, 0)

    return signature(types.i4, mask_type, predicate_type), codegen
481
+
482
+
483
def vote_sync_intrinsic(typingctx, mask_type, mode_value, predicate_type):
    """Build the signature and code generator for a warp vote.

    ``mask_type`` and ``predicate_type`` are Numba types; ``mode_value`` is a
    Python integer emitted as an i32 constant in the generated call.

    Raises ``ValueError`` for a mode outside 0-3, and ``NumbaTypeError`` when
    the mask is not an integer or the predicate is neither an integer nor a
    boolean. Returns a ``(signature, codegen)`` pair whose return type is an
    ``(i4, b1)`` tuple.
    """
    if mode_value not in (0, 1, 2, 3):
        raise ValueError("Mode must be 0 (all), 1 (any), 2 (eq), or 3 (ballot)")

    if types.unliteral(mask_type) not in types.integer_domain:
        raise NumbaTypeError(f"Mask type must be an integer. Got {mask_type}")

    allowed_predicates = types.integer_domain | {types.boolean}
    if types.unliteral(predicate_type) not in allowed_predicates:
        raise NumbaTypeError(
            f"Predicate must be an integer or boolean. Got {predicate_type}"
        )

    def codegen(context, builder, sig, args):
        mask, predicate = args

        bit = ir.IntType(1)
        word = ir.IntType(32)

        # Declaration of the NVVM intrinsic: (i32, i32, i1) -> {i32, i1}
        vote_ty = ir.FunctionType(
            ir.LiteralStructType((word, bit)), (word, word, bit)
        )
        vote = cgutils.get_or_insert_function(
            builder.module, vote_ty, "llvm.nvvm.vote.sync"
        )

        # Arguments handed to the intrinsic
        mode = ir.Constant(word, mode_value)
        mask_word = builder.trunc(mask, word)

        # The intrinsic wants an i1 predicate: wider values are compared
        # against zero, an i1 is passed through untouched.
        if predicate.type == bit:
            flag = predicate
        else:
            flag = builder.icmp_signed(
                "!=", predicate, ir.Constant(predicate.type, 0)
            )

        return builder.call(vote, [mask_word, mode, flag])

    result_ty = types.Tuple((types.i4, types.b1))
    return signature(result_ty, mask_type, predicate_type), codegen