numba-cuda 0.21.1__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (488) hide show
  1. _numba_cuda_redirector.pth +4 -0
  2. _numba_cuda_redirector.py +89 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +6 -0
  5. numba_cuda/_version.py +11 -0
  6. numba_cuda/numba/cuda/__init__.py +70 -0
  7. numba_cuda/numba/cuda/_internal/cuda_bf16.py +16394 -0
  8. numba_cuda/numba/cuda/_internal/cuda_fp16.py +8112 -0
  9. numba_cuda/numba/cuda/api.py +577 -0
  10. numba_cuda/numba/cuda/api_util.py +76 -0
  11. numba_cuda/numba/cuda/args.py +72 -0
  12. numba_cuda/numba/cuda/bf16.py +397 -0
  13. numba_cuda/numba/cuda/cache_hints.py +287 -0
  14. numba_cuda/numba/cuda/cext/__init__.py +2 -0
  15. numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
  16. numba_cuda/numba/cuda/cext/_devicearray.cpp +159 -0
  17. numba_cuda/numba/cuda/cext/_devicearray.h +29 -0
  18. numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
  19. numba_cuda/numba/cuda/cext/_dispatcher.cpp +1098 -0
  20. numba_cuda/numba/cuda/cext/_hashtable.cpp +532 -0
  21. numba_cuda/numba/cuda/cext/_hashtable.h +135 -0
  22. numba_cuda/numba/cuda/cext/_helperlib.c +71 -0
  23. numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
  24. numba_cuda/numba/cuda/cext/_helpermod.c +82 -0
  25. numba_cuda/numba/cuda/cext/_pymodule.h +38 -0
  26. numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
  27. numba_cuda/numba/cuda/cext/_typeconv.cpp +206 -0
  28. numba_cuda/numba/cuda/cext/_typeof.cpp +1159 -0
  29. numba_cuda/numba/cuda/cext/_typeof.h +19 -0
  30. numba_cuda/numba/cuda/cext/capsulethunk.h +111 -0
  31. numba_cuda/numba/cuda/cext/mviewbuf.c +385 -0
  32. numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
  33. numba_cuda/numba/cuda/cext/typeconv.cpp +212 -0
  34. numba_cuda/numba/cuda/cext/typeconv.hpp +101 -0
  35. numba_cuda/numba/cuda/cg.py +67 -0
  36. numba_cuda/numba/cuda/cgutils.py +1294 -0
  37. numba_cuda/numba/cuda/cloudpickle/__init__.py +21 -0
  38. numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +1598 -0
  39. numba_cuda/numba/cuda/cloudpickle/cloudpickle_fast.py +17 -0
  40. numba_cuda/numba/cuda/codegen.py +541 -0
  41. numba_cuda/numba/cuda/compiler.py +1396 -0
  42. numba_cuda/numba/cuda/core/analysis.py +758 -0
  43. numba_cuda/numba/cuda/core/annotations/__init__.py +0 -0
  44. numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +288 -0
  45. numba_cuda/numba/cuda/core/annotations/type_annotations.py +305 -0
  46. numba_cuda/numba/cuda/core/base.py +1332 -0
  47. numba_cuda/numba/cuda/core/boxing.py +1411 -0
  48. numba_cuda/numba/cuda/core/bytecode.py +728 -0
  49. numba_cuda/numba/cuda/core/byteflow.py +2346 -0
  50. numba_cuda/numba/cuda/core/caching.py +744 -0
  51. numba_cuda/numba/cuda/core/callconv.py +392 -0
  52. numba_cuda/numba/cuda/core/codegen.py +171 -0
  53. numba_cuda/numba/cuda/core/compiler.py +199 -0
  54. numba_cuda/numba/cuda/core/compiler_lock.py +85 -0
  55. numba_cuda/numba/cuda/core/compiler_machinery.py +497 -0
  56. numba_cuda/numba/cuda/core/config.py +650 -0
  57. numba_cuda/numba/cuda/core/consts.py +124 -0
  58. numba_cuda/numba/cuda/core/controlflow.py +989 -0
  59. numba_cuda/numba/cuda/core/entrypoints.py +57 -0
  60. numba_cuda/numba/cuda/core/environment.py +66 -0
  61. numba_cuda/numba/cuda/core/errors.py +917 -0
  62. numba_cuda/numba/cuda/core/event.py +511 -0
  63. numba_cuda/numba/cuda/core/funcdesc.py +330 -0
  64. numba_cuda/numba/cuda/core/generators.py +387 -0
  65. numba_cuda/numba/cuda/core/imputils.py +509 -0
  66. numba_cuda/numba/cuda/core/inline_closurecall.py +1787 -0
  67. numba_cuda/numba/cuda/core/interpreter.py +3617 -0
  68. numba_cuda/numba/cuda/core/ir.py +1812 -0
  69. numba_cuda/numba/cuda/core/ir_utils.py +2638 -0
  70. numba_cuda/numba/cuda/core/optional.py +129 -0
  71. numba_cuda/numba/cuda/core/options.py +262 -0
  72. numba_cuda/numba/cuda/core/postproc.py +249 -0
  73. numba_cuda/numba/cuda/core/pythonapi.py +1859 -0
  74. numba_cuda/numba/cuda/core/registry.py +46 -0
  75. numba_cuda/numba/cuda/core/removerefctpass.py +123 -0
  76. numba_cuda/numba/cuda/core/rewrites/__init__.py +26 -0
  77. numba_cuda/numba/cuda/core/rewrites/ir_print.py +91 -0
  78. numba_cuda/numba/cuda/core/rewrites/registry.py +104 -0
  79. numba_cuda/numba/cuda/core/rewrites/static_binop.py +41 -0
  80. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +189 -0
  81. numba_cuda/numba/cuda/core/rewrites/static_raise.py +100 -0
  82. numba_cuda/numba/cuda/core/sigutils.py +68 -0
  83. numba_cuda/numba/cuda/core/ssa.py +498 -0
  84. numba_cuda/numba/cuda/core/targetconfig.py +330 -0
  85. numba_cuda/numba/cuda/core/tracing.py +231 -0
  86. numba_cuda/numba/cuda/core/transforms.py +956 -0
  87. numba_cuda/numba/cuda/core/typed_passes.py +867 -0
  88. numba_cuda/numba/cuda/core/typeinfer.py +1950 -0
  89. numba_cuda/numba/cuda/core/unsafe/__init__.py +0 -0
  90. numba_cuda/numba/cuda/core/unsafe/bytes.py +67 -0
  91. numba_cuda/numba/cuda/core/unsafe/eh.py +67 -0
  92. numba_cuda/numba/cuda/core/unsafe/refcount.py +98 -0
  93. numba_cuda/numba/cuda/core/untyped_passes.py +1979 -0
  94. numba_cuda/numba/cuda/cpython/builtins.py +1153 -0
  95. numba_cuda/numba/cuda/cpython/charseq.py +1218 -0
  96. numba_cuda/numba/cuda/cpython/cmathimpl.py +560 -0
  97. numba_cuda/numba/cuda/cpython/enumimpl.py +103 -0
  98. numba_cuda/numba/cuda/cpython/iterators.py +167 -0
  99. numba_cuda/numba/cuda/cpython/listobj.py +1326 -0
  100. numba_cuda/numba/cuda/cpython/mathimpl.py +499 -0
  101. numba_cuda/numba/cuda/cpython/numbers.py +1475 -0
  102. numba_cuda/numba/cuda/cpython/rangeobj.py +289 -0
  103. numba_cuda/numba/cuda/cpython/slicing.py +322 -0
  104. numba_cuda/numba/cuda/cpython/tupleobj.py +456 -0
  105. numba_cuda/numba/cuda/cpython/unicode.py +2865 -0
  106. numba_cuda/numba/cuda/cpython/unicode_support.py +1597 -0
  107. numba_cuda/numba/cuda/cpython/unsafe/__init__.py +0 -0
  108. numba_cuda/numba/cuda/cpython/unsafe/numbers.py +64 -0
  109. numba_cuda/numba/cuda/cpython/unsafe/tuple.py +92 -0
  110. numba_cuda/numba/cuda/cuda_paths.py +691 -0
  111. numba_cuda/numba/cuda/cudadecl.py +556 -0
  112. numba_cuda/numba/cuda/cudadrv/__init__.py +14 -0
  113. numba_cuda/numba/cuda/cudadrv/devicearray.py +951 -0
  114. numba_cuda/numba/cuda/cudadrv/devices.py +249 -0
  115. numba_cuda/numba/cuda/cudadrv/driver.py +3222 -0
  116. numba_cuda/numba/cuda/cudadrv/drvapi.py +435 -0
  117. numba_cuda/numba/cuda/cudadrv/dummyarray.py +558 -0
  118. numba_cuda/numba/cuda/cudadrv/enums.py +613 -0
  119. numba_cuda/numba/cuda/cudadrv/error.py +48 -0
  120. numba_cuda/numba/cuda/cudadrv/libs.py +220 -0
  121. numba_cuda/numba/cuda/cudadrv/linkable_code.py +184 -0
  122. numba_cuda/numba/cuda/cudadrv/mappings.py +14 -0
  123. numba_cuda/numba/cuda/cudadrv/ndarray.py +26 -0
  124. numba_cuda/numba/cuda/cudadrv/nvrtc.py +193 -0
  125. numba_cuda/numba/cuda/cudadrv/nvvm.py +756 -0
  126. numba_cuda/numba/cuda/cudadrv/rtapi.py +13 -0
  127. numba_cuda/numba/cuda/cudadrv/runtime.py +34 -0
  128. numba_cuda/numba/cuda/cudaimpl.py +995 -0
  129. numba_cuda/numba/cuda/cudamath.py +149 -0
  130. numba_cuda/numba/cuda/datamodel/__init__.py +7 -0
  131. numba_cuda/numba/cuda/datamodel/cuda_manager.py +66 -0
  132. numba_cuda/numba/cuda/datamodel/cuda_models.py +1446 -0
  133. numba_cuda/numba/cuda/datamodel/cuda_packer.py +224 -0
  134. numba_cuda/numba/cuda/datamodel/cuda_registry.py +22 -0
  135. numba_cuda/numba/cuda/datamodel/cuda_testing.py +153 -0
  136. numba_cuda/numba/cuda/datamodel/manager.py +11 -0
  137. numba_cuda/numba/cuda/datamodel/models.py +9 -0
  138. numba_cuda/numba/cuda/datamodel/packer.py +9 -0
  139. numba_cuda/numba/cuda/datamodel/registry.py +11 -0
  140. numba_cuda/numba/cuda/datamodel/testing.py +11 -0
  141. numba_cuda/numba/cuda/debuginfo.py +903 -0
  142. numba_cuda/numba/cuda/decorators.py +294 -0
  143. numba_cuda/numba/cuda/descriptor.py +35 -0
  144. numba_cuda/numba/cuda/device_init.py +158 -0
  145. numba_cuda/numba/cuda/deviceufunc.py +1021 -0
  146. numba_cuda/numba/cuda/dispatcher.py +2463 -0
  147. numba_cuda/numba/cuda/errors.py +72 -0
  148. numba_cuda/numba/cuda/extending.py +697 -0
  149. numba_cuda/numba/cuda/flags.py +178 -0
  150. numba_cuda/numba/cuda/fp16.py +357 -0
  151. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  152. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  153. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  154. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  155. numba_cuda/numba/cuda/include/13/cuda_bf16.h +5118 -0
  156. numba_cuda/numba/cuda/include/13/cuda_bf16.hpp +3865 -0
  157. numba_cuda/numba/cuda/include/13/cuda_fp16.h +5363 -0
  158. numba_cuda/numba/cuda/include/13/cuda_fp16.hpp +3483 -0
  159. numba_cuda/numba/cuda/initialize.py +24 -0
  160. numba_cuda/numba/cuda/intrinsic_wrapper.py +41 -0
  161. numba_cuda/numba/cuda/intrinsics.py +382 -0
  162. numba_cuda/numba/cuda/itanium_mangler.py +214 -0
  163. numba_cuda/numba/cuda/kernels/__init__.py +2 -0
  164. numba_cuda/numba/cuda/kernels/reduction.py +265 -0
  165. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  166. numba_cuda/numba/cuda/libdevice.py +3386 -0
  167. numba_cuda/numba/cuda/libdevicedecl.py +20 -0
  168. numba_cuda/numba/cuda/libdevicefuncs.py +1060 -0
  169. numba_cuda/numba/cuda/libdeviceimpl.py +88 -0
  170. numba_cuda/numba/cuda/locks.py +19 -0
  171. numba_cuda/numba/cuda/lowering.py +1951 -0
  172. numba_cuda/numba/cuda/mathimpl.py +374 -0
  173. numba_cuda/numba/cuda/memory_management/__init__.py +4 -0
  174. numba_cuda/numba/cuda/memory_management/memsys.cu +99 -0
  175. numba_cuda/numba/cuda/memory_management/memsys.cuh +22 -0
  176. numba_cuda/numba/cuda/memory_management/nrt.cu +212 -0
  177. numba_cuda/numba/cuda/memory_management/nrt.cuh +48 -0
  178. numba_cuda/numba/cuda/memory_management/nrt.py +390 -0
  179. numba_cuda/numba/cuda/memory_management/nrt_context.py +438 -0
  180. numba_cuda/numba/cuda/misc/appdirs.py +594 -0
  181. numba_cuda/numba/cuda/misc/cffiimpl.py +24 -0
  182. numba_cuda/numba/cuda/misc/coverage_support.py +43 -0
  183. numba_cuda/numba/cuda/misc/dump_style.py +41 -0
  184. numba_cuda/numba/cuda/misc/findlib.py +75 -0
  185. numba_cuda/numba/cuda/misc/firstlinefinder.py +96 -0
  186. numba_cuda/numba/cuda/misc/gdb_hook.py +240 -0
  187. numba_cuda/numba/cuda/misc/literal.py +28 -0
  188. numba_cuda/numba/cuda/misc/llvm_pass_timings.py +412 -0
  189. numba_cuda/numba/cuda/misc/special.py +94 -0
  190. numba_cuda/numba/cuda/models.py +56 -0
  191. numba_cuda/numba/cuda/np/arraymath.py +5130 -0
  192. numba_cuda/numba/cuda/np/arrayobj.py +7635 -0
  193. numba_cuda/numba/cuda/np/extensions.py +11 -0
  194. numba_cuda/numba/cuda/np/linalg.py +3087 -0
  195. numba_cuda/numba/cuda/np/math/__init__.py +0 -0
  196. numba_cuda/numba/cuda/np/math/cmathimpl.py +558 -0
  197. numba_cuda/numba/cuda/np/math/mathimpl.py +487 -0
  198. numba_cuda/numba/cuda/np/math/numbers.py +1461 -0
  199. numba_cuda/numba/cuda/np/npdatetime.py +969 -0
  200. numba_cuda/numba/cuda/np/npdatetime_helpers.py +217 -0
  201. numba_cuda/numba/cuda/np/npyfuncs.py +1808 -0
  202. numba_cuda/numba/cuda/np/npyimpl.py +1027 -0
  203. numba_cuda/numba/cuda/np/numpy_support.py +798 -0
  204. numba_cuda/numba/cuda/np/polynomial/__init__.py +4 -0
  205. numba_cuda/numba/cuda/np/polynomial/polynomial_core.py +242 -0
  206. numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +380 -0
  207. numba_cuda/numba/cuda/np/ufunc/__init__.py +4 -0
  208. numba_cuda/numba/cuda/np/ufunc/decorators.py +203 -0
  209. numba_cuda/numba/cuda/np/ufunc/sigparse.py +68 -0
  210. numba_cuda/numba/cuda/np/ufunc/ufuncbuilder.py +65 -0
  211. numba_cuda/numba/cuda/np/ufunc_db.py +1282 -0
  212. numba_cuda/numba/cuda/np/unsafe/__init__.py +0 -0
  213. numba_cuda/numba/cuda/np/unsafe/ndarray.py +84 -0
  214. numba_cuda/numba/cuda/nvvmutils.py +254 -0
  215. numba_cuda/numba/cuda/printimpl.py +126 -0
  216. numba_cuda/numba/cuda/random.py +308 -0
  217. numba_cuda/numba/cuda/reshape_funcs.cu +156 -0
  218. numba_cuda/numba/cuda/serialize.py +267 -0
  219. numba_cuda/numba/cuda/simulator/__init__.py +63 -0
  220. numba_cuda/numba/cuda/simulator/_internal/__init__.py +4 -0
  221. numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +2 -0
  222. numba_cuda/numba/cuda/simulator/api.py +179 -0
  223. numba_cuda/numba/cuda/simulator/bf16.py +4 -0
  224. numba_cuda/numba/cuda/simulator/compiler.py +38 -0
  225. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +11 -0
  226. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +462 -0
  227. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +122 -0
  228. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +66 -0
  229. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +7 -0
  230. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +7 -0
  231. numba_cuda/numba/cuda/simulator/cudadrv/error.py +10 -0
  232. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +10 -0
  233. numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +61 -0
  234. numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +11 -0
  235. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +32 -0
  236. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +22 -0
  237. numba_cuda/numba/cuda/simulator/dispatcher.py +11 -0
  238. numba_cuda/numba/cuda/simulator/kernel.py +320 -0
  239. numba_cuda/numba/cuda/simulator/kernelapi.py +509 -0
  240. numba_cuda/numba/cuda/simulator/memory_management/__init__.py +4 -0
  241. numba_cuda/numba/cuda/simulator/memory_management/nrt.py +21 -0
  242. numba_cuda/numba/cuda/simulator/reduction.py +19 -0
  243. numba_cuda/numba/cuda/simulator/tests/support.py +4 -0
  244. numba_cuda/numba/cuda/simulator/vector_types.py +65 -0
  245. numba_cuda/numba/cuda/simulator_init.py +18 -0
  246. numba_cuda/numba/cuda/stubs.py +635 -0
  247. numba_cuda/numba/cuda/target.py +505 -0
  248. numba_cuda/numba/cuda/testing.py +347 -0
  249. numba_cuda/numba/cuda/tests/__init__.py +62 -0
  250. numba_cuda/numba/cuda/tests/benchmarks/__init__.py +0 -0
  251. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +119 -0
  252. numba_cuda/numba/cuda/tests/cloudpickle_main_class.py +9 -0
  253. numba_cuda/numba/cuda/tests/core/serialize_usecases.py +113 -0
  254. numba_cuda/numba/cuda/tests/core/test_itanium_mangler.py +83 -0
  255. numba_cuda/numba/cuda/tests/core/test_serialize.py +371 -0
  256. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +9 -0
  257. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +147 -0
  258. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +161 -0
  259. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +397 -0
  260. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +24 -0
  261. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +180 -0
  262. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +313 -0
  263. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +187 -0
  264. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +621 -0
  265. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +247 -0
  266. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +100 -0
  267. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +198 -0
  268. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +53 -0
  269. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +72 -0
  270. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +138 -0
  271. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +43 -0
  272. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +15 -0
  273. numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py +58 -0
  274. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +348 -0
  275. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +128 -0
  276. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +301 -0
  277. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +174 -0
  278. numba_cuda/numba/cuda/tests/cudadrv/test_nvrtc.py +28 -0
  279. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +185 -0
  280. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +39 -0
  281. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +23 -0
  282. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +38 -0
  283. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +48 -0
  284. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +44 -0
  285. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +127 -0
  286. numba_cuda/numba/cuda/tests/cudapy/__init__.py +9 -0
  287. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +231 -0
  288. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +50 -0
  289. numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +36 -0
  290. numba_cuda/numba/cuda/tests/cudapy/complex_usecases.py +116 -0
  291. numba_cuda/numba/cuda/tests/cudapy/enum_usecases.py +59 -0
  292. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +62 -0
  293. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +28 -0
  294. numba_cuda/numba/cuda/tests/cudapy/overload_usecases.py +33 -0
  295. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +104 -0
  296. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +47 -0
  297. numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +1122 -0
  298. numba_cuda/numba/cuda/tests/cudapy/test_array.py +344 -0
  299. numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +268 -0
  300. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +203 -0
  301. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +63 -0
  302. numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +360 -0
  303. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1815 -0
  304. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +599 -0
  305. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +377 -0
  306. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +160 -0
  307. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +27 -0
  308. numba_cuda/numba/cuda/tests/cudapy/test_byteflow.py +98 -0
  309. numba_cuda/numba/cuda/tests/cudapy/test_cache_hints.py +210 -0
  310. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +683 -0
  311. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +265 -0
  312. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +42 -0
  313. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +718 -0
  314. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +370 -0
  315. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +23 -0
  316. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +142 -0
  317. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +178 -0
  318. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +193 -0
  319. numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +131 -0
  320. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +438 -0
  321. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +94 -0
  322. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +101 -0
  323. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +105 -0
  324. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +889 -0
  325. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo_types.py +476 -0
  326. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +500 -0
  327. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +820 -0
  328. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +152 -0
  329. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +111 -0
  330. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +170 -0
  331. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1088 -0
  332. numba_cuda/numba/cuda/tests/cudapy/test_extending_types.py +71 -0
  333. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +265 -0
  334. numba_cuda/numba/cuda/tests/cudapy/test_flow_control.py +1433 -0
  335. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +57 -0
  336. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +34 -0
  337. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +69 -0
  338. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +62 -0
  339. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +474 -0
  340. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +167 -0
  341. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +92 -0
  342. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +39 -0
  343. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +170 -0
  344. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +255 -0
  345. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1219 -0
  346. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +263 -0
  347. numba_cuda/numba/cuda/tests/cudapy/test_ir.py +598 -0
  348. numba_cuda/numba/cuda/tests/cudapy/test_ir_utils.py +276 -0
  349. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +101 -0
  350. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +68 -0
  351. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +123 -0
  352. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +194 -0
  353. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +220 -0
  354. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +173 -0
  355. numba_cuda/numba/cuda/tests/cudapy/test_make_function_to_jit_function.py +364 -0
  356. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +47 -0
  357. numba_cuda/numba/cuda/tests/cudapy/test_math.py +842 -0
  358. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +76 -0
  359. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +78 -0
  360. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +25 -0
  361. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +145 -0
  362. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +39 -0
  363. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +82 -0
  364. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +53 -0
  365. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +504 -0
  366. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +93 -0
  367. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +402 -0
  368. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +128 -0
  369. numba_cuda/numba/cuda/tests/cudapy/test_print.py +193 -0
  370. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +37 -0
  371. numba_cuda/numba/cuda/tests/cudapy/test_random.py +117 -0
  372. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +614 -0
  373. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +130 -0
  374. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +94 -0
  375. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  376. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +86 -0
  377. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +40 -0
  378. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +457 -0
  379. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +233 -0
  380. numba_cuda/numba/cuda/tests/cudapy/test_ssa.py +454 -0
  381. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +56 -0
  382. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +277 -0
  383. numba_cuda/numba/cuda/tests/cudapy/test_tracing.py +200 -0
  384. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +90 -0
  385. numba_cuda/numba/cuda/tests/cudapy/test_typeconv.py +333 -0
  386. numba_cuda/numba/cuda/tests/cudapy/test_typeinfer.py +538 -0
  387. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +585 -0
  388. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +42 -0
  389. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +485 -0
  390. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +312 -0
  391. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +23 -0
  392. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +183 -0
  393. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +40 -0
  394. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +40 -0
  395. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +206 -0
  396. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +331 -0
  397. numba_cuda/numba/cuda/tests/cudasim/__init__.py +9 -0
  398. numba_cuda/numba/cuda/tests/cudasim/support.py +9 -0
  399. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +111 -0
  400. numba_cuda/numba/cuda/tests/data/__init__.py +2 -0
  401. numba_cuda/numba/cuda/tests/data/cta_barrier.cu +28 -0
  402. numba_cuda/numba/cuda/tests/data/cuda_include.cu +10 -0
  403. numba_cuda/numba/cuda/tests/data/error.cu +12 -0
  404. numba_cuda/numba/cuda/tests/data/include/add.cuh +8 -0
  405. numba_cuda/numba/cuda/tests/data/jitlink.cu +28 -0
  406. numba_cuda/numba/cuda/tests/data/jitlink.ptx +49 -0
  407. numba_cuda/numba/cuda/tests/data/warn.cu +12 -0
  408. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +9 -0
  409. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +2 -0
  410. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +54 -0
  411. numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +8 -0
  412. numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +14 -0
  413. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +86 -0
  414. numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +68 -0
  415. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +81 -0
  416. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +141 -0
  417. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +160 -0
  418. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +180 -0
  419. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +119 -0
  420. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +66 -0
  421. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +80 -0
  422. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +206 -0
  423. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +53 -0
  424. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +76 -0
  425. numba_cuda/numba/cuda/tests/nocuda/__init__.py +9 -0
  426. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +391 -0
  427. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +48 -0
  428. numba_cuda/numba/cuda/tests/nocuda/test_import.py +63 -0
  429. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +252 -0
  430. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +59 -0
  431. numba_cuda/numba/cuda/tests/nrt/__init__.py +9 -0
  432. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +387 -0
  433. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +124 -0
  434. numba_cuda/numba/cuda/tests/support.py +900 -0
  435. numba_cuda/numba/cuda/typeconv/__init__.py +4 -0
  436. numba_cuda/numba/cuda/typeconv/castgraph.py +137 -0
  437. numba_cuda/numba/cuda/typeconv/rules.py +63 -0
  438. numba_cuda/numba/cuda/typeconv/typeconv.py +121 -0
  439. numba_cuda/numba/cuda/types/__init__.py +233 -0
  440. numba_cuda/numba/cuda/types/__init__.pyi +167 -0
  441. numba_cuda/numba/cuda/types/abstract.py +9 -0
  442. numba_cuda/numba/cuda/types/common.py +9 -0
  443. numba_cuda/numba/cuda/types/containers.py +9 -0
  444. numba_cuda/numba/cuda/types/cuda_abstract.py +533 -0
  445. numba_cuda/numba/cuda/types/cuda_common.py +110 -0
  446. numba_cuda/numba/cuda/types/cuda_containers.py +971 -0
  447. numba_cuda/numba/cuda/types/cuda_function_type.py +230 -0
  448. numba_cuda/numba/cuda/types/cuda_functions.py +798 -0
  449. numba_cuda/numba/cuda/types/cuda_iterators.py +120 -0
  450. numba_cuda/numba/cuda/types/cuda_misc.py +569 -0
  451. numba_cuda/numba/cuda/types/cuda_npytypes.py +690 -0
  452. numba_cuda/numba/cuda/types/cuda_scalars.py +280 -0
  453. numba_cuda/numba/cuda/types/ext_types.py +101 -0
  454. numba_cuda/numba/cuda/types/function_type.py +11 -0
  455. numba_cuda/numba/cuda/types/functions.py +9 -0
  456. numba_cuda/numba/cuda/types/iterators.py +9 -0
  457. numba_cuda/numba/cuda/types/misc.py +9 -0
  458. numba_cuda/numba/cuda/types/npytypes.py +9 -0
  459. numba_cuda/numba/cuda/types/scalars.py +9 -0
  460. numba_cuda/numba/cuda/typing/__init__.py +19 -0
  461. numba_cuda/numba/cuda/typing/arraydecl.py +939 -0
  462. numba_cuda/numba/cuda/typing/asnumbatype.py +130 -0
  463. numba_cuda/numba/cuda/typing/bufproto.py +70 -0
  464. numba_cuda/numba/cuda/typing/builtins.py +1209 -0
  465. numba_cuda/numba/cuda/typing/cffi_utils.py +219 -0
  466. numba_cuda/numba/cuda/typing/cmathdecl.py +47 -0
  467. numba_cuda/numba/cuda/typing/collections.py +138 -0
  468. numba_cuda/numba/cuda/typing/context.py +782 -0
  469. numba_cuda/numba/cuda/typing/ctypes_utils.py +125 -0
  470. numba_cuda/numba/cuda/typing/dictdecl.py +63 -0
  471. numba_cuda/numba/cuda/typing/enumdecl.py +74 -0
  472. numba_cuda/numba/cuda/typing/listdecl.py +147 -0
  473. numba_cuda/numba/cuda/typing/mathdecl.py +158 -0
  474. numba_cuda/numba/cuda/typing/npdatetime.py +322 -0
  475. numba_cuda/numba/cuda/typing/npydecl.py +749 -0
  476. numba_cuda/numba/cuda/typing/setdecl.py +115 -0
  477. numba_cuda/numba/cuda/typing/templates.py +1446 -0
  478. numba_cuda/numba/cuda/typing/typeof.py +301 -0
  479. numba_cuda/numba/cuda/ufuncs.py +746 -0
  480. numba_cuda/numba/cuda/utils.py +724 -0
  481. numba_cuda/numba/cuda/vector_types.py +214 -0
  482. numba_cuda/numba/cuda/vectorizers.py +260 -0
  483. numba_cuda-0.21.1.dist-info/METADATA +109 -0
  484. numba_cuda-0.21.1.dist-info/RECORD +488 -0
  485. numba_cuda-0.21.1.dist-info/WHEEL +5 -0
  486. numba_cuda-0.21.1.dist-info/licenses/LICENSE +26 -0
  487. numba_cuda-0.21.1.dist-info/licenses/LICENSE.numba +24 -0
  488. numba_cuda-0.21.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,24 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+
5
def initialize_all():
    """Register the CUDA target's pieces with their registries.

    The CUDA models module is always imported. When ``HAS_NUMBA`` (as
    exported by ``numba.cuda``) is true, the CUDA ``jit`` decorator and
    ``CUDADispatcher`` are additionally stored in Numba's target-extension
    registries under the ``"cuda"`` target; otherwise nothing further is done.
    """
    # Imported purely for its side effect: loading the module registers the
    # models with the data model manager.
    import numba.cuda.models  # noqa: F401

    from numba.cuda import HAS_NUMBA

    if HAS_NUMBA:
        # These imports are deferred until we know Numba is present.
        from numba.cuda.decorators import jit
        from numba.cuda.dispatcher import CUDADispatcher
        from numba.core.target_extension import (
            target_registry,
            dispatcher_registry,
            jit_registry,
        )

        target = target_registry["cuda"]
        jit_registry[target] = jit
        dispatcher_registry[target] = CUDADispatcher
@@ -0,0 +1,41 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+ from .decorators import jit
5
+ import numba
6
+
7
+
8
@jit(device=True)
def all_sync(mask, predicate):
    """
    Return a non-zero value when the predicate is true for every thread
    in the masked warp, and 0 otherwise.
    """
    # Mode 0 is passed to the vote intrinsic; element 1 of its result is
    # the value handed back to the caller.
    vote = numba.cuda.vote_sync_intrinsic(mask, 0, predicate)
    return vote[1]
15
+
16
+
17
@jit(device=True)
def any_sync(mask, predicate):
    """
    Return a non-zero value when the predicate is true for at least one
    thread in the masked warp, and 0 otherwise.
    """
    # Mode 1 is passed to the vote intrinsic; element 1 of its result is
    # the value handed back to the caller.
    vote = numba.cuda.vote_sync_intrinsic(mask, 1, predicate)
    return vote[1]
24
+
25
+
26
@jit(device=True)
def eq_sync(mask, predicate):
    """
    Return a non-zero value when the boolean predicate has the same value
    for every thread in the masked warp, and 0 otherwise.
    """
    # Mode 2 is passed to the vote intrinsic; element 1 of its result is
    # the value handed back to the caller.
    vote = numba.cuda.vote_sync_intrinsic(mask, 2, predicate)
    return vote[1]
33
+
34
+
35
@jit(device=True)
def ballot_sync(mask, predicate):
    """
    Return a mask of the threads in the warp that are within the given
    mask and whose predicate is true.
    """
    # Mode 3 is passed to the vote intrinsic; unlike the other vote helpers,
    # element 0 of its result is the value handed back to the caller.
    vote = numba.cuda.vote_sync_intrinsic(mask, 3, predicate)
    return vote[0]
@@ -0,0 +1,382 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+ from llvmlite import ir
5
+
6
+ from numba import cuda
7
+ from numba.cuda import types
8
+ from numba.cuda import cgutils
9
+ from numba.cuda.core.errors import RequireLiteralValue, TypingError
10
+ from numba.cuda.typing import signature
11
+ from numba.cuda.extending import overload_attribute, overload_method
12
+ from numba.cuda import nvvmutils
13
+ from numba.cuda.extending import intrinsic
14
+
15
+
16
+ # -------------------------------------------------------------------------------
17
+ # Grid functions
18
+
19
+
20
def _type_grid_function(ndim):
    """Build the signature shared by ``grid`` and ``gridsize``.

    *ndim* is a literal type whose ``literal_value`` selects the return type:
    ``int64`` for 1, or a ``UniTuple`` of that many ``int64`` for 2 or 3.
    The single argument is typed as ``int32``.

    Raises ``ValueError`` for any other literal value.
    """
    dims = ndim.literal_value
    if dims not in (1, 2, 3):
        raise ValueError("argument can only be 1, 2, 3")

    result_ty = (
        types.int64 if dims == 1 else types.UniTuple(types.int64, dims)
    )
    return signature(result_ty, types.int32)
30
+
31
+
32
@intrinsic
def grid(typingctx, ndim):
    """grid(ndim)

    Give the absolute position of the current thread within the entire grid
    of blocks. *ndim* should match the number of dimensions declared when the
    kernel was instantiated. A single integer is returned when *ndim* is 1; a
    tuple with that many integers is returned when *ndim* is 2 or 3.

    The first integer is computed as::

        cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x

    The remaining indices are computed the same way from the ``y`` and ``z``
    attributes.
    """
    # The return type depends on the value of ndim, so it must be a literal.
    if not isinstance(ndim, types.IntegerLiteral):
        raise RequireLiteralValue(ndim)

    sig = _type_grid_function(ndim)

    def codegen(context, builder, sig, args):
        result_ty = sig.return_type
        if result_ty == types.int64:
            return nvvmutils.get_global_id(builder, dim=1)
        if isinstance(result_ty, types.UniTuple):
            indices = nvvmutils.get_global_id(builder, dim=result_ty.count)
            return cgutils.pack_array(builder, indices)
        return None

    return sig, codegen
63
+
64
+
65
@intrinsic
def gridsize(typingctx, ndim):
    """gridsize(ndim)

    Give the absolute size (or shape), in threads, of the entire grid of
    blocks. *ndim* should match the number of dimensions declared when the
    kernel was instantiated. A single integer is returned when *ndim* is 1; a
    tuple with that many integers is returned when *ndim* is 2 or 3.

    The first integer is computed as::

        cuda.blockDim.x * cuda.gridDim.x

    The remaining sizes are computed the same way from the ``y`` and ``z``
    attributes.
    """
    # The return type depends on the value of ndim, so it must be a literal.
    if not isinstance(ndim, types.IntegerLiteral):
        raise RequireLiteralValue(ndim)

    sig = _type_grid_function(ndim)

    def _extent(builder, axis):
        # Product of the ntid and nctaid special registers for this axis,
        # with both operands sign-extended to 64 bits before multiplying.
        wide = ir.IntType(64)
        threads = nvvmutils.call_sreg(builder, f"ntid.{axis}")
        blocks = nvvmutils.call_sreg(builder, f"nctaid.{axis}")
        wide_threads = builder.sext(threads, wide)
        wide_blocks = builder.sext(blocks, wide)
        return builder.mul(wide_threads, wide_blocks)

    def codegen(context, builder, sig, args):
        result_ty = sig.return_type
        extent_x = _extent(builder, "x")
        if result_ty == types.int64:
            return extent_x
        if not isinstance(result_ty, types.UniTuple):
            return None

        extent_y = _extent(builder, "y")
        if result_ty.count == 2:
            return cgutils.pack_array(builder, (extent_x, extent_y))
        if result_ty.count == 3:
            extent_z = _extent(builder, "z")
            return cgutils.pack_array(
                builder, (extent_x, extent_y, extent_z)
            )
        return None

    return sig, codegen
109
+
110
+
111
@intrinsic
def _warpsize(typingctx):
    """Intrinsic that reads the ``warpsize`` special register as an int32."""

    def codegen(context, builder, sig, args):
        return nvvmutils.call_sreg(builder, "warpsize")

    return signature(types.int32), codegen
119
+
120
+
121
@overload_attribute(types.Module(cuda), "warpsize", target="cuda")
def cuda_warpsize(mod):
    """
    The size of a warp. Every architecture implemented to date has a warp
    size of 32.
    """
    # The attribute is implemented by delegating to the _warpsize intrinsic.
    return lambda mod: _warpsize()
132
+
133
+
134
+ # -------------------------------------------------------------------------------
135
+ # syncthreads
136
+
137
+
138
@intrinsic
def syncthreads(typingctx):
    """
    Synchronize all threads in the same thread block. This is the barrier
    pattern of traditional multi-threaded programming: the call waits until
    every thread in the block has called it, and then returns control to all
    of its callers.
    """

    def codegen(context, builder, sig, args):
        # Declare (or reuse) the void() barrier function and call it.
        barrier_ty = ir.FunctionType(ir.VoidType(), ())
        barrier = cgutils.get_or_insert_function(
            builder.module, barrier_ty, "llvm.nvvm.barrier0"
        )
        builder.call(barrier, ())
        return context.get_dummy_value()

    return signature(types.none), codegen
157
+
158
+
159
def _syncthreads_predicate(typingctx, predicate, fname):
    """Typing and lowering shared by the predicate-taking barriers.

    Returns ``None`` when *predicate* is not an integer type. Otherwise
    returns ``(sig, codegen)`` where the signature is ``i4(i4)`` and the
    code generator calls the function named *fname*, declared as taking and
    returning a 32-bit integer.
    """
    if not isinstance(predicate, types.Integer):
        return None

    def codegen(context, builder, sig, args):
        i32 = ir.IntType(32)
        barrier_ty = ir.FunctionType(i32, (i32,))
        barrier = cgutils.get_or_insert_function(
            builder.module, barrier_ty, fname
        )
        return builder.call(barrier, args)

    return signature(types.i4, types.i4), codegen
171
+
172
+
173
@intrinsic
def syncthreads_count(typingctx, predicate):
    """
    syncthreads_count(predicate)

    A variant of numba.cuda.syncthreads whose return value is the number of
    threads for which predicate is true.
    """
    return _syncthreads_predicate(
        typingctx, predicate, "llvm.nvvm.barrier0.popc"
    )
183
+
184
+
185
@intrinsic
def syncthreads_and(typingctx, predicate):
    """
    syncthreads_and(predicate)

    A variant of numba.cuda.syncthreads that returns 1 when predicate is true
    for all threads, and 0 otherwise.
    """
    return _syncthreads_predicate(
        typingctx, predicate, "llvm.nvvm.barrier0.and"
    )
195
+
196
+
197
@intrinsic
def syncthreads_or(typingctx, predicate):
    """
    syncthreads_or(predicate)

    A variant of numba.cuda.syncthreads that returns 1 when predicate is true
    for any thread, and 0 otherwise.
    """
    return _syncthreads_predicate(
        typingctx, predicate, "llvm.nvvm.barrier0.or"
    )
207
+
208
+
209
@overload_method(types.Integer, "bit_count", target="cuda")
def integer_bit_count(i):
    """Implement ``bit_count`` on integers by delegating to ``cuda.popc``."""

    def impl(i):
        return cuda.popc(i)

    return impl
212
+
213
+
214
+ # -------------------------------------------------------------------------------
215
+ # Warp shuffle functions
216
+ #
217
+ # References:
218
+ #
219
+ # - https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-shuffle-functions
220
+ # - https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#data-movement
221
+ #
222
+ # Notes:
223
+ #
224
+ # - The public CUDA C/C++ and Numba Python APIs for these intrinsics use
225
+ # different names for parameters to the NVVM IR specification. So that we
226
+ # can correlate the implementation with the documentation, the @intrinsic
227
+ # API functions map the public API arguments to the NVVM intrinsic
228
+ # arguments.
229
+ # - The NVVM IR specification requires some of the parameters (e.g. mode) to be
230
+ # constants. It's therefore essential that we pass in some values to the
231
+ # shfl_sync_intrinsic function (e.g. the mode and c values).
232
+ # - Normally parameters for intrinsic functions in Numba would be given the
233
+ # same name as used in the API, and would contain a type. However, because we
234
+ # have to pass in some values and some types (and there is divergence between
235
+ # the names in the intrinsic documentation and the public APIs) we instead
236
+ # follow the convention of naming shfl_sync_intrinsic parameters with a
237
+ # suffix of _type or _value depending on whether they contain a type or a
238
+ # value.
239
+
240
+
241
@intrinsic
def shfl_sync(typingctx, mask, value, src_lane):
    """
    Shuffle ``value`` across the masked warp and return the value held by
    ``src_lane``. When that lane is outside the warp, the given value is
    returned instead.
    """
    # Public arguments are mapped onto the NVVM operand names; the mode and c
    # operands are passed as values.
    return shfl_sync_intrinsic(
        typingctx,
        membermask_type=mask,
        mode_value=0,
        a_type=value,
        b_type=src_lane,
        c_value=0x1F,
    )
256
+
257
+
258
@intrinsic
def shfl_up_sync(typingctx, mask, value, delta):
    """
    Shuffle ``value`` across the masked warp and return the value held by
    lane ``(laneid - delta)``. When that lane is outside the warp, the given
    value is returned instead.
    """
    # Public arguments are mapped onto the NVVM operand names; the mode and c
    # operands are passed as values. Note that c is 0 for this mode.
    return shfl_sync_intrinsic(
        typingctx,
        membermask_type=mask,
        mode_value=1,
        a_type=value,
        b_type=delta,
        c_value=0,
    )
273
+
274
+
275
@intrinsic
def shfl_down_sync(typingctx, mask, value, delta):
    """
    Shuffle ``value`` across the masked warp and return the value held by
    lane ``(laneid + delta)``. When that lane is outside the warp, the given
    value is returned instead.
    """
    # Public arguments are mapped onto the NVVM operand names; the mode and c
    # operands are passed as values.
    return shfl_sync_intrinsic(
        typingctx,
        membermask_type=mask,
        mode_value=2,
        a_type=value,
        b_type=delta,
        c_value=0x1F,
    )
290
+
291
+
292
@intrinsic
def shfl_xor_sync(typingctx, mask, value, lane_mask):
    """
    Shuffle ``value`` across the masked warp and return the value held by
    lane ``(laneid ^ lane_mask)``.
    """
    # Public arguments are mapped onto the NVVM operand names; the mode and c
    # operands are passed as values.
    return shfl_sync_intrinsic(
        typingctx,
        membermask_type=mask,
        mode_value=3,
        a_type=value,
        b_type=lane_mask,
        c_value=0x1F,
    )
306
+
307
+
308
def shfl_sync_intrinsic(
    typingctx,
    membermask_type,
    mode_value,
    a_type,
    b_type,
    c_value,
):
    """Typing and lowering shared by the ``shfl_*_sync`` intrinsics.

    Parameters ending in ``_type`` carry Numba types; parameters ending in
    ``_value`` carry plain Python values that are baked into the generated
    code as i32 constants.

    Returns ``(sig, codegen)`` where the signature is
    ``a_type(membermask_type, a_type, b_type)``.

    Raises ``TypingError`` when ``a_type`` is not one of i4, i8, f4 or f8.
    """
    if a_type not in (types.i4, types.i8, types.f4, types.f8):
        raise TypingError(
            "shfl_sync only supports 32- and 64-bit ints and floats"
        )

    def codegen(context, builder, sig, args):
        """
        The NVVM shfl_sync intrinsic only supports i32, but the CUDA C/C++
        intrinsic supports both 32- and 64-bit ints and floats, so for feature
        parity, i32, i64, f32, and f64 are implemented. Floats by way of
        bitcasting the float to an int, then shuffling, then bitcasting
        back."""
        membermask, a, b = args

        # Types
        a_type = sig.args[1]
        return_type = context.get_value_type(sig.return_type)
        i32 = ir.IntType(32)
        i64 = ir.IntType(64)

        # Reinterpret a float operand as an integer of the same bit width so
        # that it can be shuffled through the integer intrinsic.
        if a_type in types.real_domain:
            a = builder.bitcast(a, ir.IntType(a_type.bitwidth))

        # NVVM intrinsic definition: five i32 operands, returning an
        # {i32, i1} struct.
        arg_types = (i32, i32, i32, i32, i32)
        shfl_return_type = ir.LiteralStructType((i32, ir.IntType(1)))
        fnty = ir.FunctionType(shfl_return_type, arg_types)

        fname = "llvm.nvvm.shfl.sync.i32"
        shfl_sync = cgutils.get_or_insert_function(builder.module, fnty, fname)

        # Intrinsic arguments: mode and c are emitted as constants taken from
        # the enclosing call; membermask and b are narrowed to i32.
        mode = ir.Constant(i32, mode_value)
        c = ir.Constant(i32, c_value)
        membermask = builder.trunc(membermask, i32)
        b = builder.trunc(b, i32)

        if a_type.bitwidth == 32:
            a = builder.trunc(a, i32)
            ret = builder.call(shfl_sync, (membermask, mode, a, b, c))
            # Only element 0 of the returned struct is used.
            d = builder.extract_value(ret, 0)
        else:
            # Handle 64-bit values by shuffling as two 32-bit values and
            # packing the result into 64 bits.

            # Extract high and low parts
            lo = builder.trunc(a, i32)
            a_lshr = builder.lshr(a, ir.Constant(i64, 32))
            hi = builder.trunc(a_lshr, i32)

            # Shuffle individual parts
            ret_lo = builder.call(shfl_sync, (membermask, mode, lo, b, c))
            ret_hi = builder.call(shfl_sync, (membermask, mode, hi, b, c))

            # Combine individual result parts into a 64-bit result
            d_lo = builder.extract_value(ret_lo, 0)
            d_hi = builder.extract_value(ret_hi, 0)
            # Zero-extend both halves so the low half cannot set bits in the
            # upper 32 bits when the two are OR-ed together.
            d_lo_64 = builder.zext(d_lo, i64)
            d_hi_64 = builder.zext(d_hi, i64)
            d_shl = builder.shl(d_hi_64, ir.Constant(i64, 32))
            d = builder.or_(d_shl, d_lo_64)

        # Convert the integer result back to the LLVM type of the return
        # value (undoing the float-to-int bitcast above for float operands).
        return builder.bitcast(d, return_type)

    sig = signature(a_type, membermask_type, a_type, b_type)

    return sig, codegen
@@ -0,0 +1,214 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+ """
5
+ Itanium CXX ABI Mangler
6
+
7
+ Reference: https://itanium-cxx-abi.github.io/cxx-abi/abi.html
8
+
9
+ The basics of the mangling scheme.
10
+
11
+ We are hijacking the CXX mangling scheme for our use. We map Python modules
12
+ into CXX namespace. A `module1.submodule2.foo` is mapped to
13
+ `module1::submodule2::foo`. For parameterized numba types, we treat them as
14
+ templated types; for example, `array(int64, 1d, C)` becomes an
15
+ `array<int64, 1, C>`.
16
+
17
+ All mangled names are prefixed with "_Z". It is followed by the name of the
18
+ entity. A name contains one or more identifiers. Each identifier is encoded
19
+ as "<num of char><name>". If the name is namespaced and, therefore,
20
+ has multiple identifiers, the entire name is encoded as "N<name>E".
21
+
22
+ For functions, arguments types follow. There are condensed encodings for basic
23
+ built-in types; e.g. "i" for int, "f" for float. For other types, the
24
+ previously mentioned name encoding should be used.
25
+
26
+ For templated types, the template parameters are encoded immediately after the
27
+ name. If it is namespaced, it should be within the 'N' 'E' marker. Template
28
+ parameters are encoded in "I<params>E", where each parameter is encoded using
29
+ the mentioned name encoding scheme. Template parameters can contain literal
30
+ values like the '1' in the array type shown earlier. There is special encoding
31
+ scheme for them to avoid leading digits.
32
+ """
33
+
34
+ import re
35
+
36
+ from numba.cuda import types
37
+
38
+
39
+ # According the scheme, valid characters for mangled names are [a-zA-Z0-9_].
40
+ # We borrow the '_' as the escape character to encode invalid char into
41
+ # '_xx' where 'xx' is the hex codepoint.
42
+ _re_invalid_char = re.compile(r"[^a-z0-9_]", re.I)
43
+
44
# Marker that starts every mangled name.
PREFIX = "_Z"

# Numba types to mangled type code. These correspond with the codes listed in
# https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling-builtin
# Types that are not listed here are mangled as (templated) identifiers.
N2CODE = {
    types.void: "v",
    types.boolean: "b",
    types.uint8: "h",
    types.int8: "a",
    types.uint16: "t",
    types.int16: "s",
    types.uint32: "j",
    types.int32: "i",
    types.uint64: "y",
    types.int64: "x",
    types.float16: "Dh",
    types.float32: "f",
    types.float64: "d",
}
63
+
64
+
65
def _escape_string(text):
    """Escape *text* so that it is safe to embed in a mangled name.

    Every character matched by ``_re_invalid_char`` is replaced by one
    ``"_xx"`` sequence per byte of its UTF-8 encoding, where ``xx`` is the
    two-digit lowercase hex value of that byte. Multibyte characters
    therefore expand to several ``"_xx"`` sequences.

    The underscore itself is a valid character and is passed through
    unchanged.
    """

    def repl(m):
        return "".join(("_%02x" % ch) for ch in m.group(0).encode("utf8"))

    # Substituting into a str always yields a str, so no further conversion
    # of the result is needed.
    return _re_invalid_char.sub(repl, text)
84
+
85
+
86
+ def _fix_lead_digit(text):
87
+ """
88
+ Fix text with leading digit
89
+ """
90
+ if text and text[0].isdigit():
91
+ return "_" + text
92
+ else:
93
+ return text
94
+
95
+
96
def _len_encoded(string):
    """
    Return *string* prefixed with its length in decimal.

    A string that starts with a digit first gets an underscore prepended, and
    that underscore is counted in the length.
    """
    name = _fix_lead_digit(string)
    return f"{len(name)}{name}"
103
+
104
+
105
def mangle_abi_tag(abi_tag: str) -> str:
    """Encode *abi_tag* as ``"B"`` followed by its escaped, length-prefixed name."""
    escaped = _escape_string(abi_tag)
    return "B" + _len_encoded(escaped)
107
+
108
+
109
def mangle_identifier(ident, template_params="", *, abi_tags=(), uid=None):
    """
    Mangle *ident*, appending the optional template parameters and abi-tags.

    When *uid* is given, an extra abi-tag ``v<uid>`` is placed in front of
    *abi_tags*.

    Note:

        '.' in *ident* is treated like '::' in C++: each dotted component is
        encoded separately, and a name with more than one component is
        wrapped in 'N' ... 'E'.
    """
    tags = abi_tags if uid is None else (f"v{uid}", *abi_tags)

    components = [
        _len_encoded(_escape_string(name)) for name in ident.split(".")
    ]
    suffix = template_params + "".join(mangle_abi_tag(tag) for tag in tags)

    encoded = "".join(components) + suffix
    # Only namespaced (multi-component) names get the nested-name markers.
    return f"N{encoded}E" if len(components) > 1 else encoded
127
+
128
+
129
def mangle_type_or_value(typ):
    """
    Mangle a type parameter or an arbitrary value.

    - A Numba type listed in ``N2CODE`` maps to its built-in code; any other
      Numba type is mangled as a templated identifier built from its
      ``mangling_args``.
    - An ``int`` is encoded as the literal ``Li<digits>E``. A negative value
      is written with an ``n`` prefix (``-5`` becomes ``Lin5E``) so that the
      result stays within the valid character set.
    - A ``str`` is mangled as an identifier.
    - Anything else is converted with ``str()``, escaped and length-prefixed.
    """
    # Handle numba types
    if isinstance(typ, types.Type):
        if typ in N2CODE:
            return N2CODE[typ]
        return mangle_templated_ident(*typ.mangling_args)

    # Handle integer literal
    if isinstance(typ, int):
        # The Itanium ABI spells negative literals with a leading "n"; a raw
        # "-" is not a valid character in a mangled name.
        digits = ("n%d" % -typ) if typ < 0 else ("%d" % typ)
        return "Li%sE" % digits

    # Handle str as identifier
    if isinstance(typ, str):
        return mangle_identifier(typ)

    # Otherwise
    enc = _escape_string(str(typ))
    return _len_encoded(enc)
149
+
150
+
151
# Aliases: both names refer to the same function as mangle_type_or_value.
mangle_type = mangle_type_or_value
mangle_value = mangle_type_or_value
154
+
155
+
156
def mangle_templated_ident(identifier, parameters):
    """
    Mangle *identifier* together with its template *parameters*.

    Non-empty parameters are each mangled and wrapped in 'I' ... 'E'; empty
    parameters add nothing to the identifier.
    """
    if parameters:
        encoded = "".join(mangle_type_or_value(p) for p in parameters)
        template_params = f"I{encoded}E"
    else:
        template_params = ""
    return mangle_identifier(identifier, template_params)
166
+
167
+
168
def mangle_args(argtys):
    """
    Mangle each Numba type object or arbitrary value in *argtys* and
    concatenate the results in order.
    """
    return "".join(map(mangle_type_or_value, argtys))
173
+
174
+
175
def mangle(ident, argtys, *, abi_tags=(), uid=None):
    """
    Produce the full mangled name for *ident*: the prefix, the mangled
    identifier (with abi-tags and optional *uid*), then the mangled
    argument types.
    """
    name = mangle_identifier(ident, abi_tags=abi_tags, uid=uid)
    arguments = mangle_args(argtys)
    return PREFIX + name + arguments
186
+
187
+
188
def prepend_namespace(mangled, ns):
    """
    Return *mangled* with the namespace *ns* inserted in front of its name.

    An already nested name gets *ns* placed directly after the opening 'N'.
    A non-nested name is turned into a nested one, with the closing 'E'
    inserted after its first identifier.

    Raises ``ValueError`` when *mangled* does not start with the prefix.
    """
    if not mangled.startswith(PREFIX):
        raise ValueError("input is not a mangled name")

    nested_start = PREFIX + "N"
    if mangled.startswith(nested_start):
        # nested: keep everything after the existing "_ZN"
        return nested_start + mangle_identifier(ns) + mangled[3:]

    # non-nested: split off the leading identifier and wrap it in N ... E
    head, tail = _split_mangled_ident(mangled[2:])
    return nested_start + mangle_identifier(ns) + head + "E" + tail
204
+
205
+
206
+ def _split_mangled_ident(mangled):
207
+ """
208
+ Returns `(head, tail)` where `head` is the `<len> + <name>` encoded
209
+ identifier and `tail` is the remaining.
210
+ """
211
+ ct = int(mangled)
212
+ ctlen = len(str(ct))
213
+ at = ctlen + ct
214
+ return mangled[:at], mangled[at:]
@@ -0,0 +1,2 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause