numba-cuda 0.22.0__cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (487) hide show
  1. _numba_cuda_redirector.pth +4 -0
  2. _numba_cuda_redirector.py +89 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +6 -0
  5. numba_cuda/_version.py +11 -0
  6. numba_cuda/numba/cuda/__init__.py +70 -0
  7. numba_cuda/numba/cuda/_internal/cuda_bf16.py +16394 -0
  8. numba_cuda/numba/cuda/_internal/cuda_fp16.py +8112 -0
  9. numba_cuda/numba/cuda/api.py +580 -0
  10. numba_cuda/numba/cuda/api_util.py +76 -0
  11. numba_cuda/numba/cuda/args.py +72 -0
  12. numba_cuda/numba/cuda/bf16.py +397 -0
  13. numba_cuda/numba/cuda/cache_hints.py +287 -0
  14. numba_cuda/numba/cuda/cext/__init__.py +2 -0
  15. numba_cuda/numba/cuda/cext/_devicearray.cpp +159 -0
  16. numba_cuda/numba/cuda/cext/_devicearray.cpython-313-aarch64-linux-gnu.so +0 -0
  17. numba_cuda/numba/cuda/cext/_devicearray.h +29 -0
  18. numba_cuda/numba/cuda/cext/_dispatcher.cpp +1098 -0
  19. numba_cuda/numba/cuda/cext/_dispatcher.cpython-313-aarch64-linux-gnu.so +0 -0
  20. numba_cuda/numba/cuda/cext/_hashtable.cpp +532 -0
  21. numba_cuda/numba/cuda/cext/_hashtable.h +135 -0
  22. numba_cuda/numba/cuda/cext/_helperlib.c +71 -0
  23. numba_cuda/numba/cuda/cext/_helperlib.cpython-313-aarch64-linux-gnu.so +0 -0
  24. numba_cuda/numba/cuda/cext/_helpermod.c +82 -0
  25. numba_cuda/numba/cuda/cext/_pymodule.h +38 -0
  26. numba_cuda/numba/cuda/cext/_typeconv.cpp +206 -0
  27. numba_cuda/numba/cuda/cext/_typeconv.cpython-313-aarch64-linux-gnu.so +0 -0
  28. numba_cuda/numba/cuda/cext/_typeof.cpp +1159 -0
  29. numba_cuda/numba/cuda/cext/_typeof.h +19 -0
  30. numba_cuda/numba/cuda/cext/capsulethunk.h +111 -0
  31. numba_cuda/numba/cuda/cext/mviewbuf.c +385 -0
  32. numba_cuda/numba/cuda/cext/mviewbuf.cpython-313-aarch64-linux-gnu.so +0 -0
  33. numba_cuda/numba/cuda/cext/typeconv.cpp +212 -0
  34. numba_cuda/numba/cuda/cext/typeconv.hpp +101 -0
  35. numba_cuda/numba/cuda/cg.py +67 -0
  36. numba_cuda/numba/cuda/cgutils.py +1294 -0
  37. numba_cuda/numba/cuda/cloudpickle/__init__.py +21 -0
  38. numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +1598 -0
  39. numba_cuda/numba/cuda/cloudpickle/cloudpickle_fast.py +17 -0
  40. numba_cuda/numba/cuda/codegen.py +541 -0
  41. numba_cuda/numba/cuda/compiler.py +1396 -0
  42. numba_cuda/numba/cuda/core/analysis.py +758 -0
  43. numba_cuda/numba/cuda/core/annotations/__init__.py +0 -0
  44. numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +288 -0
  45. numba_cuda/numba/cuda/core/annotations/type_annotations.py +305 -0
  46. numba_cuda/numba/cuda/core/base.py +1332 -0
  47. numba_cuda/numba/cuda/core/boxing.py +1411 -0
  48. numba_cuda/numba/cuda/core/bytecode.py +728 -0
  49. numba_cuda/numba/cuda/core/byteflow.py +2346 -0
  50. numba_cuda/numba/cuda/core/caching.py +744 -0
  51. numba_cuda/numba/cuda/core/callconv.py +392 -0
  52. numba_cuda/numba/cuda/core/codegen.py +171 -0
  53. numba_cuda/numba/cuda/core/compiler.py +199 -0
  54. numba_cuda/numba/cuda/core/compiler_lock.py +85 -0
  55. numba_cuda/numba/cuda/core/compiler_machinery.py +497 -0
  56. numba_cuda/numba/cuda/core/config.py +650 -0
  57. numba_cuda/numba/cuda/core/consts.py +124 -0
  58. numba_cuda/numba/cuda/core/controlflow.py +989 -0
  59. numba_cuda/numba/cuda/core/entrypoints.py +57 -0
  60. numba_cuda/numba/cuda/core/environment.py +66 -0
  61. numba_cuda/numba/cuda/core/errors.py +917 -0
  62. numba_cuda/numba/cuda/core/event.py +511 -0
  63. numba_cuda/numba/cuda/core/funcdesc.py +330 -0
  64. numba_cuda/numba/cuda/core/generators.py +387 -0
  65. numba_cuda/numba/cuda/core/imputils.py +509 -0
  66. numba_cuda/numba/cuda/core/inline_closurecall.py +1787 -0
  67. numba_cuda/numba/cuda/core/interpreter.py +3617 -0
  68. numba_cuda/numba/cuda/core/ir.py +1812 -0
  69. numba_cuda/numba/cuda/core/ir_utils.py +2638 -0
  70. numba_cuda/numba/cuda/core/optional.py +129 -0
  71. numba_cuda/numba/cuda/core/options.py +262 -0
  72. numba_cuda/numba/cuda/core/postproc.py +249 -0
  73. numba_cuda/numba/cuda/core/pythonapi.py +1859 -0
  74. numba_cuda/numba/cuda/core/registry.py +46 -0
  75. numba_cuda/numba/cuda/core/removerefctpass.py +123 -0
  76. numba_cuda/numba/cuda/core/rewrites/__init__.py +26 -0
  77. numba_cuda/numba/cuda/core/rewrites/ir_print.py +91 -0
  78. numba_cuda/numba/cuda/core/rewrites/registry.py +104 -0
  79. numba_cuda/numba/cuda/core/rewrites/static_binop.py +41 -0
  80. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +189 -0
  81. numba_cuda/numba/cuda/core/rewrites/static_raise.py +100 -0
  82. numba_cuda/numba/cuda/core/sigutils.py +68 -0
  83. numba_cuda/numba/cuda/core/ssa.py +498 -0
  84. numba_cuda/numba/cuda/core/targetconfig.py +330 -0
  85. numba_cuda/numba/cuda/core/tracing.py +231 -0
  86. numba_cuda/numba/cuda/core/transforms.py +956 -0
  87. numba_cuda/numba/cuda/core/typed_passes.py +867 -0
  88. numba_cuda/numba/cuda/core/typeinfer.py +1950 -0
  89. numba_cuda/numba/cuda/core/unsafe/__init__.py +0 -0
  90. numba_cuda/numba/cuda/core/unsafe/bytes.py +67 -0
  91. numba_cuda/numba/cuda/core/unsafe/eh.py +67 -0
  92. numba_cuda/numba/cuda/core/unsafe/refcount.py +98 -0
  93. numba_cuda/numba/cuda/core/untyped_passes.py +1979 -0
  94. numba_cuda/numba/cuda/cpython/builtins.py +1153 -0
  95. numba_cuda/numba/cuda/cpython/charseq.py +1218 -0
  96. numba_cuda/numba/cuda/cpython/cmathimpl.py +560 -0
  97. numba_cuda/numba/cuda/cpython/enumimpl.py +103 -0
  98. numba_cuda/numba/cuda/cpython/iterators.py +167 -0
  99. numba_cuda/numba/cuda/cpython/listobj.py +1326 -0
  100. numba_cuda/numba/cuda/cpython/mathimpl.py +499 -0
  101. numba_cuda/numba/cuda/cpython/numbers.py +1475 -0
  102. numba_cuda/numba/cuda/cpython/rangeobj.py +289 -0
  103. numba_cuda/numba/cuda/cpython/slicing.py +322 -0
  104. numba_cuda/numba/cuda/cpython/tupleobj.py +456 -0
  105. numba_cuda/numba/cuda/cpython/unicode.py +2865 -0
  106. numba_cuda/numba/cuda/cpython/unicode_support.py +1597 -0
  107. numba_cuda/numba/cuda/cpython/unsafe/__init__.py +0 -0
  108. numba_cuda/numba/cuda/cpython/unsafe/numbers.py +64 -0
  109. numba_cuda/numba/cuda/cpython/unsafe/tuple.py +92 -0
  110. numba_cuda/numba/cuda/cuda_paths.py +691 -0
  111. numba_cuda/numba/cuda/cudadecl.py +543 -0
  112. numba_cuda/numba/cuda/cudadrv/__init__.py +14 -0
  113. numba_cuda/numba/cuda/cudadrv/devicearray.py +954 -0
  114. numba_cuda/numba/cuda/cudadrv/devices.py +249 -0
  115. numba_cuda/numba/cuda/cudadrv/driver.py +3238 -0
  116. numba_cuda/numba/cuda/cudadrv/drvapi.py +435 -0
  117. numba_cuda/numba/cuda/cudadrv/dummyarray.py +562 -0
  118. numba_cuda/numba/cuda/cudadrv/enums.py +613 -0
  119. numba_cuda/numba/cuda/cudadrv/error.py +48 -0
  120. numba_cuda/numba/cuda/cudadrv/libs.py +220 -0
  121. numba_cuda/numba/cuda/cudadrv/linkable_code.py +184 -0
  122. numba_cuda/numba/cuda/cudadrv/mappings.py +14 -0
  123. numba_cuda/numba/cuda/cudadrv/ndarray.py +26 -0
  124. numba_cuda/numba/cuda/cudadrv/nvrtc.py +193 -0
  125. numba_cuda/numba/cuda/cudadrv/nvvm.py +756 -0
  126. numba_cuda/numba/cuda/cudadrv/rtapi.py +13 -0
  127. numba_cuda/numba/cuda/cudadrv/runtime.py +34 -0
  128. numba_cuda/numba/cuda/cudaimpl.py +983 -0
  129. numba_cuda/numba/cuda/cudamath.py +149 -0
  130. numba_cuda/numba/cuda/datamodel/__init__.py +7 -0
  131. numba_cuda/numba/cuda/datamodel/cuda_manager.py +66 -0
  132. numba_cuda/numba/cuda/datamodel/cuda_models.py +1446 -0
  133. numba_cuda/numba/cuda/datamodel/cuda_packer.py +224 -0
  134. numba_cuda/numba/cuda/datamodel/cuda_registry.py +22 -0
  135. numba_cuda/numba/cuda/datamodel/cuda_testing.py +153 -0
  136. numba_cuda/numba/cuda/datamodel/manager.py +11 -0
  137. numba_cuda/numba/cuda/datamodel/models.py +9 -0
  138. numba_cuda/numba/cuda/datamodel/packer.py +9 -0
  139. numba_cuda/numba/cuda/datamodel/registry.py +11 -0
  140. numba_cuda/numba/cuda/datamodel/testing.py +11 -0
  141. numba_cuda/numba/cuda/debuginfo.py +997 -0
  142. numba_cuda/numba/cuda/decorators.py +294 -0
  143. numba_cuda/numba/cuda/descriptor.py +35 -0
  144. numba_cuda/numba/cuda/device_init.py +155 -0
  145. numba_cuda/numba/cuda/deviceufunc.py +1021 -0
  146. numba_cuda/numba/cuda/dispatcher.py +2463 -0
  147. numba_cuda/numba/cuda/errors.py +72 -0
  148. numba_cuda/numba/cuda/extending.py +697 -0
  149. numba_cuda/numba/cuda/flags.py +178 -0
  150. numba_cuda/numba/cuda/fp16.py +357 -0
  151. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  152. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  153. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  154. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  155. numba_cuda/numba/cuda/include/13/cuda_bf16.h +5118 -0
  156. numba_cuda/numba/cuda/include/13/cuda_bf16.hpp +3865 -0
  157. numba_cuda/numba/cuda/include/13/cuda_fp16.h +5363 -0
  158. numba_cuda/numba/cuda/include/13/cuda_fp16.hpp +3483 -0
  159. numba_cuda/numba/cuda/initialize.py +24 -0
  160. numba_cuda/numba/cuda/intrinsics.py +531 -0
  161. numba_cuda/numba/cuda/itanium_mangler.py +214 -0
  162. numba_cuda/numba/cuda/kernels/__init__.py +2 -0
  163. numba_cuda/numba/cuda/kernels/reduction.py +265 -0
  164. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  165. numba_cuda/numba/cuda/libdevice.py +3386 -0
  166. numba_cuda/numba/cuda/libdevicedecl.py +20 -0
  167. numba_cuda/numba/cuda/libdevicefuncs.py +1060 -0
  168. numba_cuda/numba/cuda/libdeviceimpl.py +88 -0
  169. numba_cuda/numba/cuda/locks.py +19 -0
  170. numba_cuda/numba/cuda/lowering.py +1980 -0
  171. numba_cuda/numba/cuda/mathimpl.py +374 -0
  172. numba_cuda/numba/cuda/memory_management/__init__.py +4 -0
  173. numba_cuda/numba/cuda/memory_management/memsys.cu +99 -0
  174. numba_cuda/numba/cuda/memory_management/memsys.cuh +22 -0
  175. numba_cuda/numba/cuda/memory_management/nrt.cu +212 -0
  176. numba_cuda/numba/cuda/memory_management/nrt.cuh +48 -0
  177. numba_cuda/numba/cuda/memory_management/nrt.py +390 -0
  178. numba_cuda/numba/cuda/memory_management/nrt_context.py +438 -0
  179. numba_cuda/numba/cuda/misc/appdirs.py +594 -0
  180. numba_cuda/numba/cuda/misc/cffiimpl.py +24 -0
  181. numba_cuda/numba/cuda/misc/coverage_support.py +43 -0
  182. numba_cuda/numba/cuda/misc/dump_style.py +41 -0
  183. numba_cuda/numba/cuda/misc/findlib.py +75 -0
  184. numba_cuda/numba/cuda/misc/firstlinefinder.py +96 -0
  185. numba_cuda/numba/cuda/misc/gdb_hook.py +240 -0
  186. numba_cuda/numba/cuda/misc/literal.py +28 -0
  187. numba_cuda/numba/cuda/misc/llvm_pass_timings.py +412 -0
  188. numba_cuda/numba/cuda/misc/special.py +94 -0
  189. numba_cuda/numba/cuda/models.py +56 -0
  190. numba_cuda/numba/cuda/np/arraymath.py +5130 -0
  191. numba_cuda/numba/cuda/np/arrayobj.py +7635 -0
  192. numba_cuda/numba/cuda/np/extensions.py +11 -0
  193. numba_cuda/numba/cuda/np/linalg.py +3087 -0
  194. numba_cuda/numba/cuda/np/math/__init__.py +0 -0
  195. numba_cuda/numba/cuda/np/math/cmathimpl.py +558 -0
  196. numba_cuda/numba/cuda/np/math/mathimpl.py +487 -0
  197. numba_cuda/numba/cuda/np/math/numbers.py +1461 -0
  198. numba_cuda/numba/cuda/np/npdatetime.py +969 -0
  199. numba_cuda/numba/cuda/np/npdatetime_helpers.py +217 -0
  200. numba_cuda/numba/cuda/np/npyfuncs.py +1808 -0
  201. numba_cuda/numba/cuda/np/npyimpl.py +1027 -0
  202. numba_cuda/numba/cuda/np/numpy_support.py +798 -0
  203. numba_cuda/numba/cuda/np/polynomial/__init__.py +4 -0
  204. numba_cuda/numba/cuda/np/polynomial/polynomial_core.py +242 -0
  205. numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +380 -0
  206. numba_cuda/numba/cuda/np/ufunc/__init__.py +4 -0
  207. numba_cuda/numba/cuda/np/ufunc/decorators.py +203 -0
  208. numba_cuda/numba/cuda/np/ufunc/sigparse.py +68 -0
  209. numba_cuda/numba/cuda/np/ufunc/ufuncbuilder.py +65 -0
  210. numba_cuda/numba/cuda/np/ufunc_db.py +1282 -0
  211. numba_cuda/numba/cuda/np/unsafe/__init__.py +0 -0
  212. numba_cuda/numba/cuda/np/unsafe/ndarray.py +84 -0
  213. numba_cuda/numba/cuda/nvvmutils.py +254 -0
  214. numba_cuda/numba/cuda/printimpl.py +126 -0
  215. numba_cuda/numba/cuda/random.py +308 -0
  216. numba_cuda/numba/cuda/reshape_funcs.cu +156 -0
  217. numba_cuda/numba/cuda/serialize.py +267 -0
  218. numba_cuda/numba/cuda/simulator/__init__.py +63 -0
  219. numba_cuda/numba/cuda/simulator/_internal/__init__.py +4 -0
  220. numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +2 -0
  221. numba_cuda/numba/cuda/simulator/api.py +179 -0
  222. numba_cuda/numba/cuda/simulator/bf16.py +4 -0
  223. numba_cuda/numba/cuda/simulator/compiler.py +38 -0
  224. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +11 -0
  225. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +462 -0
  226. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +122 -0
  227. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +66 -0
  228. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +7 -0
  229. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +7 -0
  230. numba_cuda/numba/cuda/simulator/cudadrv/error.py +10 -0
  231. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +10 -0
  232. numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +61 -0
  233. numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +11 -0
  234. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +32 -0
  235. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +22 -0
  236. numba_cuda/numba/cuda/simulator/dispatcher.py +11 -0
  237. numba_cuda/numba/cuda/simulator/kernel.py +320 -0
  238. numba_cuda/numba/cuda/simulator/kernelapi.py +509 -0
  239. numba_cuda/numba/cuda/simulator/memory_management/__init__.py +4 -0
  240. numba_cuda/numba/cuda/simulator/memory_management/nrt.py +21 -0
  241. numba_cuda/numba/cuda/simulator/reduction.py +19 -0
  242. numba_cuda/numba/cuda/simulator/tests/support.py +4 -0
  243. numba_cuda/numba/cuda/simulator/vector_types.py +65 -0
  244. numba_cuda/numba/cuda/simulator_init.py +18 -0
  245. numba_cuda/numba/cuda/stubs.py +624 -0
  246. numba_cuda/numba/cuda/target.py +505 -0
  247. numba_cuda/numba/cuda/testing.py +347 -0
  248. numba_cuda/numba/cuda/tests/__init__.py +62 -0
  249. numba_cuda/numba/cuda/tests/benchmarks/__init__.py +0 -0
  250. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +119 -0
  251. numba_cuda/numba/cuda/tests/cloudpickle_main_class.py +9 -0
  252. numba_cuda/numba/cuda/tests/core/serialize_usecases.py +113 -0
  253. numba_cuda/numba/cuda/tests/core/test_itanium_mangler.py +83 -0
  254. numba_cuda/numba/cuda/tests/core/test_serialize.py +371 -0
  255. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +9 -0
  256. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +147 -0
  257. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +161 -0
  258. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +397 -0
  259. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +24 -0
  260. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +180 -0
  261. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +313 -0
  262. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +191 -0
  263. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +621 -0
  264. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +247 -0
  265. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +100 -0
  266. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +200 -0
  267. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +53 -0
  268. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +72 -0
  269. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +138 -0
  270. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +43 -0
  271. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +15 -0
  272. numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py +58 -0
  273. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +348 -0
  274. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +128 -0
  275. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +301 -0
  276. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +174 -0
  277. numba_cuda/numba/cuda/tests/cudadrv/test_nvrtc.py +28 -0
  278. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +185 -0
  279. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +39 -0
  280. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +23 -0
  281. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +38 -0
  282. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +48 -0
  283. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +44 -0
  284. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +127 -0
  285. numba_cuda/numba/cuda/tests/cudapy/__init__.py +9 -0
  286. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +231 -0
  287. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +50 -0
  288. numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +36 -0
  289. numba_cuda/numba/cuda/tests/cudapy/complex_usecases.py +116 -0
  290. numba_cuda/numba/cuda/tests/cudapy/enum_usecases.py +59 -0
  291. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +62 -0
  292. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +28 -0
  293. numba_cuda/numba/cuda/tests/cudapy/overload_usecases.py +33 -0
  294. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +104 -0
  295. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +47 -0
  296. numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +1122 -0
  297. numba_cuda/numba/cuda/tests/cudapy/test_array.py +344 -0
  298. numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +268 -0
  299. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +203 -0
  300. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +63 -0
  301. numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +360 -0
  302. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1815 -0
  303. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +599 -0
  304. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +377 -0
  305. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +160 -0
  306. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +27 -0
  307. numba_cuda/numba/cuda/tests/cudapy/test_byteflow.py +98 -0
  308. numba_cuda/numba/cuda/tests/cudapy/test_cache_hints.py +210 -0
  309. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +683 -0
  310. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +265 -0
  311. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +42 -0
  312. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +718 -0
  313. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +370 -0
  314. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +23 -0
  315. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +142 -0
  316. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +178 -0
  317. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +193 -0
  318. numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +131 -0
  319. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +438 -0
  320. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +94 -0
  321. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +101 -0
  322. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +105 -0
  323. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +978 -0
  324. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo_types.py +476 -0
  325. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +500 -0
  326. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +820 -0
  327. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +152 -0
  328. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +111 -0
  329. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +170 -0
  330. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1088 -0
  331. numba_cuda/numba/cuda/tests/cudapy/test_extending_types.py +71 -0
  332. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +265 -0
  333. numba_cuda/numba/cuda/tests/cudapy/test_flow_control.py +1433 -0
  334. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +57 -0
  335. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +34 -0
  336. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +69 -0
  337. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +62 -0
  338. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +474 -0
  339. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +167 -0
  340. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +92 -0
  341. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +39 -0
  342. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +170 -0
  343. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +255 -0
  344. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1219 -0
  345. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +263 -0
  346. numba_cuda/numba/cuda/tests/cudapy/test_ir.py +598 -0
  347. numba_cuda/numba/cuda/tests/cudapy/test_ir_utils.py +276 -0
  348. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +101 -0
  349. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +68 -0
  350. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +123 -0
  351. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +194 -0
  352. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +220 -0
  353. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +173 -0
  354. numba_cuda/numba/cuda/tests/cudapy/test_make_function_to_jit_function.py +364 -0
  355. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +47 -0
  356. numba_cuda/numba/cuda/tests/cudapy/test_math.py +842 -0
  357. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +76 -0
  358. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +78 -0
  359. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +25 -0
  360. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +145 -0
  361. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +39 -0
  362. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +82 -0
  363. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +53 -0
  364. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +504 -0
  365. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +93 -0
  366. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +402 -0
  367. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +128 -0
  368. numba_cuda/numba/cuda/tests/cudapy/test_print.py +193 -0
  369. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +37 -0
  370. numba_cuda/numba/cuda/tests/cudapy/test_random.py +117 -0
  371. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +614 -0
  372. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +130 -0
  373. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +94 -0
  374. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  375. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +86 -0
  376. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +40 -0
  377. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +457 -0
  378. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +233 -0
  379. numba_cuda/numba/cuda/tests/cudapy/test_ssa.py +454 -0
  380. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +56 -0
  381. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +277 -0
  382. numba_cuda/numba/cuda/tests/cudapy/test_tracing.py +200 -0
  383. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +90 -0
  384. numba_cuda/numba/cuda/tests/cudapy/test_typeconv.py +333 -0
  385. numba_cuda/numba/cuda/tests/cudapy/test_typeinfer.py +538 -0
  386. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +585 -0
  387. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +42 -0
  388. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +485 -0
  389. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +312 -0
  390. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +23 -0
  391. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +183 -0
  392. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +40 -0
  393. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +40 -0
  394. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +206 -0
  395. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +446 -0
  396. numba_cuda/numba/cuda/tests/cudasim/__init__.py +9 -0
  397. numba_cuda/numba/cuda/tests/cudasim/support.py +9 -0
  398. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +111 -0
  399. numba_cuda/numba/cuda/tests/data/__init__.py +2 -0
  400. numba_cuda/numba/cuda/tests/data/cta_barrier.cu +28 -0
  401. numba_cuda/numba/cuda/tests/data/cuda_include.cu +10 -0
  402. numba_cuda/numba/cuda/tests/data/error.cu +12 -0
  403. numba_cuda/numba/cuda/tests/data/include/add.cuh +8 -0
  404. numba_cuda/numba/cuda/tests/data/jitlink.cu +28 -0
  405. numba_cuda/numba/cuda/tests/data/jitlink.ptx +49 -0
  406. numba_cuda/numba/cuda/tests/data/warn.cu +12 -0
  407. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +9 -0
  408. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +2 -0
  409. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +54 -0
  410. numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +8 -0
  411. numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +14 -0
  412. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +86 -0
  413. numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +68 -0
  414. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +81 -0
  415. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +141 -0
  416. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +160 -0
  417. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +180 -0
  418. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +119 -0
  419. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +66 -0
  420. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +80 -0
  421. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +206 -0
  422. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +53 -0
  423. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +76 -0
  424. numba_cuda/numba/cuda/tests/nocuda/__init__.py +9 -0
  425. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +452 -0
  426. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +48 -0
  427. numba_cuda/numba/cuda/tests/nocuda/test_import.py +63 -0
  428. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +252 -0
  429. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +59 -0
  430. numba_cuda/numba/cuda/tests/nrt/__init__.py +9 -0
  431. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +387 -0
  432. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +124 -0
  433. numba_cuda/numba/cuda/tests/support.py +900 -0
  434. numba_cuda/numba/cuda/typeconv/__init__.py +4 -0
  435. numba_cuda/numba/cuda/typeconv/castgraph.py +137 -0
  436. numba_cuda/numba/cuda/typeconv/rules.py +63 -0
  437. numba_cuda/numba/cuda/typeconv/typeconv.py +121 -0
  438. numba_cuda/numba/cuda/types/__init__.py +233 -0
  439. numba_cuda/numba/cuda/types/__init__.pyi +167 -0
  440. numba_cuda/numba/cuda/types/abstract.py +9 -0
  441. numba_cuda/numba/cuda/types/common.py +9 -0
  442. numba_cuda/numba/cuda/types/containers.py +9 -0
  443. numba_cuda/numba/cuda/types/cuda_abstract.py +533 -0
  444. numba_cuda/numba/cuda/types/cuda_common.py +110 -0
  445. numba_cuda/numba/cuda/types/cuda_containers.py +971 -0
  446. numba_cuda/numba/cuda/types/cuda_function_type.py +230 -0
  447. numba_cuda/numba/cuda/types/cuda_functions.py +798 -0
  448. numba_cuda/numba/cuda/types/cuda_iterators.py +120 -0
  449. numba_cuda/numba/cuda/types/cuda_misc.py +569 -0
  450. numba_cuda/numba/cuda/types/cuda_npytypes.py +690 -0
  451. numba_cuda/numba/cuda/types/cuda_scalars.py +280 -0
  452. numba_cuda/numba/cuda/types/ext_types.py +101 -0
  453. numba_cuda/numba/cuda/types/function_type.py +11 -0
  454. numba_cuda/numba/cuda/types/functions.py +9 -0
  455. numba_cuda/numba/cuda/types/iterators.py +9 -0
  456. numba_cuda/numba/cuda/types/misc.py +9 -0
  457. numba_cuda/numba/cuda/types/npytypes.py +9 -0
  458. numba_cuda/numba/cuda/types/scalars.py +9 -0
  459. numba_cuda/numba/cuda/typing/__init__.py +19 -0
  460. numba_cuda/numba/cuda/typing/arraydecl.py +939 -0
  461. numba_cuda/numba/cuda/typing/asnumbatype.py +130 -0
  462. numba_cuda/numba/cuda/typing/bufproto.py +70 -0
  463. numba_cuda/numba/cuda/typing/builtins.py +1209 -0
  464. numba_cuda/numba/cuda/typing/cffi_utils.py +219 -0
  465. numba_cuda/numba/cuda/typing/cmathdecl.py +47 -0
  466. numba_cuda/numba/cuda/typing/collections.py +138 -0
  467. numba_cuda/numba/cuda/typing/context.py +782 -0
  468. numba_cuda/numba/cuda/typing/ctypes_utils.py +125 -0
  469. numba_cuda/numba/cuda/typing/dictdecl.py +63 -0
  470. numba_cuda/numba/cuda/typing/enumdecl.py +74 -0
  471. numba_cuda/numba/cuda/typing/listdecl.py +147 -0
  472. numba_cuda/numba/cuda/typing/mathdecl.py +158 -0
  473. numba_cuda/numba/cuda/typing/npdatetime.py +322 -0
  474. numba_cuda/numba/cuda/typing/npydecl.py +749 -0
  475. numba_cuda/numba/cuda/typing/setdecl.py +115 -0
  476. numba_cuda/numba/cuda/typing/templates.py +1446 -0
  477. numba_cuda/numba/cuda/typing/typeof.py +301 -0
  478. numba_cuda/numba/cuda/ufuncs.py +746 -0
  479. numba_cuda/numba/cuda/utils.py +724 -0
  480. numba_cuda/numba/cuda/vector_types.py +214 -0
  481. numba_cuda/numba/cuda/vectorizers.py +260 -0
  482. numba_cuda-0.22.0.dist-info/METADATA +109 -0
  483. numba_cuda-0.22.0.dist-info/RECORD +487 -0
  484. numba_cuda-0.22.0.dist-info/WHEEL +6 -0
  485. numba_cuda-0.22.0.dist-info/licenses/LICENSE +26 -0
  486. numba_cuda-0.22.0.dist-info/licenses/LICENSE.numba +24 -0
  487. numba_cuda-0.22.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,214 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+ """
5
+ Itanium CXX ABI Mangler
6
+
7
+ Reference: https://itanium-cxx-abi.github.io/cxx-abi/abi.html
8
+
9
+ The basics of the mangling scheme.
10
+
11
+ We are hijacking the CXX mangling scheme for our use. We map Python modules
12
+ into CXX namespace. A `module1.submodule2.foo` is mapped to
13
+ `module1::submodule2::foo`. For parameterized numba types, we treat them as
14
+ templated types; for example, `array(int64, 1d, C)` becomes an
15
+ `array<int64, 1, C>`.
16
+
17
+ All mangled names are prefixed with "_Z". It is followed by the name of the
18
+ entity. A name contains one or more identifiers. Each identifier is encoded
19
+ as "<num of char><name>". If the name is namespaced and, therefore,
20
+ has multiple identifiers, the entire name is encoded as "N<name>E".
21
+
22
+ For functions, arguments types follow. There are condensed encodings for basic
23
+ built-in types; e.g. "i" for int, "f" for float. For other types, the
24
+ previously mentioned name encoding should be used.
25
+
26
+ For templated types, the template parameters are encoded immediately after the
27
+ name. If it is namespaced, it should be within the 'N' 'E' marker. Template
28
+ parameters are encoded in "I<params>E", where each parameter is encoded using
29
+ the mentioned name encoding scheme. Template parameters can contain literal
30
+ values like the '1' in the array type shown earlier. There is a special encoding
31
+ scheme for them to avoid leading digits.
32
+ """
33
+
34
+ import re
35
+
36
+ from numba.cuda import types
37
+
38
+
39
+ # According to the scheme, valid characters for mangled names are [a-zA-Z0-9_].
40
+ # We borrow the '_' as the escape character to encode invalid char into
41
+ # '_xx' where 'xx' is the hex codepoint.
42
+ _re_invalid_char = re.compile(r"[^a-z0-9_]", re.I)
43
+
44
+ PREFIX = "_Z"
45
+
46
# Numba types to mangled type code. These correspond with the codes listed in
# https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling-builtin
# A type found in this table is emitted as its short builtin code by
# mangle_type_or_value(); every other type is mangled as a (templated) name.
N2CODE = {
    types.void: "v",  # void
    types.boolean: "b",  # bool
    types.uint8: "h",  # unsigned char
    types.int8: "a",  # signed char
    types.uint16: "t",  # unsigned short
    types.int16: "s",  # short
    types.uint32: "j",  # unsigned int
    types.int32: "i",  # int
    types.uint64: "y",  # unsigned long long
    types.int64: "x",  # long long
    types.float16: "Dh",  # IEEE half-precision float
    types.float32: "f",  # float
    types.float64: "d",  # double
}
63
+
64
+
65
def _escape_string(text):
    """Escape the given string so that it only contains ASCII characters
    of [a-zA-Z0-9_].

    Every character outside that set is replaced by the sequence "_xx",
    where "xx" is the two-digit lowercase hex value of the byte.

    Multibyte characters are encoded into utf8 first and every resulting
    byte is escaped separately in the above hex format.

    A literal underscore in the input is passed through unchanged, so the
    escaping is not reversible: "a.b" and "a_2eb" both yield "a_2eb".

    :param text: the ``str`` to escape.
    :return: the escaped ``str``.
    """

    def repl(m):
        # One "_xx" group per utf8 byte of the offending character.
        return "".join(("_%02x" % ch) for ch in m.group(0).encode("utf8"))

    # Substituting into a str always yields a str, so no re-encoding of the
    # result is required.
    return _re_invalid_char.sub(repl, text)
84
+
85
+
86
+ def _fix_lead_digit(text):
87
+ """
88
+ Fix text with leading digit
89
+ """
90
+ if text and text[0].isdigit():
91
+ return "_" + text
92
+ else:
93
+ return text
94
+
95
+
96
def _len_encoded(string):
    """
    Encode *string* as "<length><string>".

    A string that begins with a digit first gets an underscore prepended, so
    that the length prefix cannot run into the name; the emitted length
    counts that extra underscore.
    """
    safe = _fix_lead_digit(string)
    return f"{len(safe)}{safe}"
103
+
104
+
105
def mangle_abi_tag(abi_tag: str) -> str:
    """Return the mangled form of *abi_tag*: "B" followed by the escaped,
    length-prefixed tag text.
    """
    escaped = _escape_string(abi_tag)
    return "B" + _len_encoded(escaped)
107
+
108
+
109
def mangle_identifier(ident, template_params="", *, abi_tags=(), uid=None):
    """
    Mangle a dotted identifier, with optional template parameters and
    abi-tags appended.

    Each '.'-separated component is escaped and length-prefixed ('.' plays
    the role of '::' in C++). When *uid* is given, an extra abi-tag
    "v<uid>" is placed in front of *abi_tags*. A name with more than one
    component is wrapped as "N...E".
    """
    if uid is not None:
        # The uid travels as the leading abi-tag.
        abi_tags = (f"v{uid}", *abi_tags)

    encoded_parts = [
        _len_encoded(_escape_string(component))
        for component in ident.split(".")
    ]
    suffix = template_params + "".join(
        [mangle_abi_tag(tag) for tag in abi_tags]
    )

    if len(encoded_parts) <= 1:
        return encoded_parts[0] + suffix
    return "N" + "".join(encoded_parts) + suffix + "E"
127
+
128
+
129
def mangle_type_or_value(typ):
    """
    Mangle type parameter and arbitrary value.

    :param typ: one of
        * a Numba type: encoded with its builtin code from ``N2CODE`` when
          it has one, otherwise as a templated identifier built from
          ``typ.mangling_args``;
        * an ``int``: encoded as an integer literal "Li<digits>E", with an
          "n" in front of the digits for negative values;
        * a ``str``: encoded as an identifier;
        * anything else: ``str(typ)`` escaped and length-prefixed.
    :return: the mangled ``str``.
    """
    # Handle numba types
    if isinstance(typ, types.Type):
        if typ in N2CODE:
            return N2CODE[typ]
        return mangle_templated_ident(*typ.mangling_args)
    # Handle integer literal
    if isinstance(typ, int):
        if typ < 0:
            # '-' is not a valid character in a mangled name; the Itanium
            # ABI marks a negative number with a leading 'n' instead.
            return "Lin%dE" % -typ
        return "Li%dE" % typ
    # Handle str as identifier
    if isinstance(typ, str):
        return mangle_identifier(typ)
    # Otherwise fall back to the escaped textual representation
    return _len_encoded(_escape_string(str(typ)))
149
+
150
+
151
# Aliases: types and literal values are mangled by the same routine, so both
# names are bound to mangle_type_or_value.
mangle_type = mangle_type_or_value
mangle_value = mangle_type_or_value
154
+
155
+
156
def mangle_templated_ident(identifier, parameters):
    """
    Mangle *identifier* together with its template *parameters*.

    A non-empty *parameters* sequence is encoded as "I<params>E", each
    parameter going through ``mangle_type_or_value``; an empty one adds
    nothing.
    """
    if parameters:
        encoded = "".join([mangle_type_or_value(p) for p in parameters])
        template_params = "I" + encoded + "E"
    else:
        template_params = ""
    return mangle_identifier(identifier, template_params)
166
+
167
+
168
def mangle_args(argtys):
    """
    Concatenate the mangled form of every entry of *argtys* (Numba type
    objects or arbitrary values), in order.
    """
    return "".join(map(mangle_type_or_value, argtys))
173
+
174
+
175
def mangle(ident, argtys, *, abi_tags=(), uid=None):
    """
    Build a full mangled name: the "_Z" prefix, then the mangled identifier
    (carrying *abi_tags* and the optional *uid*), then the mangled argument
    types.
    """
    mangled_name = mangle_identifier(ident, abi_tags=abi_tags, uid=uid)
    mangled_arguments = mangle_args(argtys)
    return PREFIX + mangled_name + mangled_arguments
186
+
187
+
188
def prepend_namespace(mangled, ns):
    """
    Prepend namespace to mangled name.

    :param mangled: a mangled name; it must start with the "_Z" prefix.
    :param ns: the namespace to put in front; '.' separates nested
        namespaces.
    :return: the mangled name nested inside *ns*.
    :raises ValueError: if *mangled* does not start with the prefix.
    """
    if not mangled.startswith(PREFIX):
        raise ValueError("input is not a mangled name")

    # Encode every namespace component on its own. Going through
    # mangle_identifier() would wrap a dotted namespace in its own "N...E",
    # which must not appear inside the nested name assembled below.
    ns_encoded = "".join(
        _len_encoded(_escape_string(part)) for part in ns.split(".")
    )

    remaining = mangled[len(PREFIX):]
    if remaining.startswith("N"):
        # nested: splice the namespace in right after the opening 'N'
        return PREFIX + "N" + ns_encoded + remaining[1:]

    # non-nested: wrap the leading identifier, with the namespace, in "N...E"
    head, tail = _split_mangled_ident(remaining)
    return PREFIX + "N" + ns_encoded + head + "E" + tail
204
+
205
+
206
+ def _split_mangled_ident(mangled):
207
+ """
208
+ Returns `(head, tail)` where `head` is the `<len> + <name>` encoded
209
+ identifier and `tail` is the remaining.
210
+ """
211
+ ct = int(mangled)
212
+ ctlen = len(str(ct))
213
+ at = ctlen + ct
214
+ return mangled[:at], mangled[at:]
@@ -0,0 +1,2 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
@@ -0,0 +1,265 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+ """
5
+ A library written in CUDA Python for generating reduction kernels
6
+ """
7
+
8
+ from numba.cuda.np.numpy_support import from_dtype
9
+
10
+
11
# Number of threads treated as one warp by the reduction kernels.
_WARPSIZE = 32
# Number of warps in a full thread block; the largest supported block size is
# _NUMWARPS * _WARPSIZE threads.
_NUMWARPS = 4
13
+
14
+
15
def _gpu_reduce_factory(fn, nbtype):
    """
    Build the CUDA reduction kernel for the binary function *fn*.

    :param fn: a Python function of two arguments; it is compiled here as a
               CUDA device function and used as the reduction operator.
    :param nbtype: the Numba type used as element type of the shared-memory
                   working array.
    :return: the ``cuda.jit``-compiled ``gpu_reduce_block_strided`` kernel.
    """
    from numba import cuda

    reduce_op = cuda.jit(device=True)(fn)
    inner_sm_size = _WARPSIZE + 1  # plus one to avoid SM collision
    max_blocksize = _NUMWARPS * _WARPSIZE

    @cuda.jit(device=True)
    def inner_warp_reduction(sm_partials, init):
        """
        Compute reduction within a single warp

        Every thread stores *init* into its lane of its warp's row of
        *sm_partials*; the row is then folded in halves until the warp's
        result sits in slot 0 of that row.
        """
        tid = cuda.threadIdx.x
        warpid = tid // _WARPSIZE
        laneid = tid % _WARPSIZE

        sm_this = sm_partials[warpid, :]
        sm_this[laneid] = init
        cuda.syncwarp()

        # Halve the active width each round: lane i combines with lane
        # i + width.
        width = _WARPSIZE // 2
        while width:
            if laneid < width:
                old = sm_this[laneid]
                sm_this[laneid] = reduce_op(old, sm_this[laneid + width])
            # NOTE(review): barrier assumed to sit at loop-body level (run by
            # every lane each round) - confirm its nesting against upstream.
            cuda.syncwarp()
            width //= 2

    @cuda.jit(device=True)
    def device_reduce_full_block(arr, partials, sm_partials):
        """
        Partially reduce `arr` into `partials` using `sm_partials` as working
        space.  The algorithm goes like:

            array chunks of 128:  |   0 | 128 | 256 | 384 | 512 |
                        block-0:  |   x |     |     |   x |     |
                        block-1:  |     |   x |     |     |   x |
                        block-2:  |     |     |   x |     |     |

        The array is divided into chunks of 128 (size of a threadblock).
        The threadblocks consume the chunks in round-robin scheduling.
        First, a threadblock loads a chunk into temp memory.  Then, all
        subsequent chunks are combined into the temp memory.

        Once all chunks are processed, inner-block reduction is performed
        on the temp memory, so that there is just one scalar result
        per block.  The result from each block is stored to `partials` at
        the dedicated slot.
        """
        tid = cuda.threadIdx.x
        blkid = cuda.blockIdx.x
        blksz = cuda.blockDim.x
        gridsz = cuda.gridDim.x

        # block strided loop to compute the reduction
        start = tid + blksz * blkid
        stop = arr.size
        step = blksz * gridsz

        # load first value
        tmp = arr[start]
        # loop over all values in block-stride
        for i in range(start + step, stop, step):
            tmp = reduce_op(tmp, arr[i])

        cuda.syncthreads()
        # inner-warp reduction
        inner_warp_reduction(sm_partials, tmp)

        cuda.syncthreads()
        # at this point, only the first slot for each warp in sm_partials
        # is valid.

        # finish up block reduction
        # warning: this is assuming 4 warps.
        # assert numwarps == 4
        if tid < 2:
            # rows 0 and 1 absorb the results of rows 2 and 3
            sm_partials[tid, 0] = reduce_op(
                sm_partials[tid, 0], sm_partials[tid + 2, 0]
            )
            # NOTE(review): barrier assumed to sit inside the `tid < 2`
            # branch - confirm its nesting against upstream.
            cuda.syncwarp()
        if tid == 0:
            partials[blkid] = reduce_op(sm_partials[0, 0], sm_partials[1, 0])

    @cuda.jit(device=True)
    def device_reduce_partial_block(arr, partials, sm_partials):
        """
        This computes reduction on `arr`.
        This device function must be used by 1 threadblock only.
        The blocksize must match `arr.size` and must not be greater than 128.
        """
        tid = cuda.threadIdx.x
        blkid = cuda.blockIdx.x
        blksz = cuda.blockDim.x
        warpid = tid // _WARPSIZE
        laneid = tid % _WARPSIZE

        size = arr.size
        # load first value
        # NOTE: `tid` already holds this value; the re-read is redundant.
        tid = cuda.threadIdx.x
        value = arr[tid]
        sm_partials[warpid, laneid] = value

        cuda.syncthreads()

        if (warpid + 1) * _WARPSIZE < size:
            # fully populated warps
            inner_warp_reduction(sm_partials, value)
        else:
            # partially populated warps
            # NOTE: this uses a very inefficient sequential algorithm
            if laneid == 0:
                sm_this = sm_partials[warpid, :]
                base = warpid * _WARPSIZE
                # fold the populated lanes of this warp into slot 0
                for i in range(1, size - base):
                    sm_this[0] = reduce_op(sm_this[0], sm_this[i])

        cuda.syncthreads()
        # finish up
        if tid == 0:
            # warps that hold at least one thread of this block
            num_active_warps = (blksz + _WARPSIZE - 1) // _WARPSIZE

            result = sm_partials[0, 0]
            for i in range(1, num_active_warps):
                result = reduce_op(result, sm_partials[i, 0])

            partials[blkid] = result

    def gpu_reduce_block_strided(arr, partials, init, use_init):
        """
        Perform reductions on *arr* and writing out partial reduction result
        into *partials*.  The length of *partials* is determined by the
        number of threadblocks. The initial value is set with *init*, which
        is only folded in when *use_init* is true.

        Launch config:

        Blocksize must be multiple of warpsize and it is limited to 4 warps.
        """
        tid = cuda.threadIdx.x

        sm_partials = cuda.shared.array(
            (_NUMWARPS, inner_sm_size), dtype=nbtype
        )
        # A block of the maximum size takes the block-strided path; any
        # other block size takes the single-block path.
        if cuda.blockDim.x == max_blocksize:
            device_reduce_full_block(arr, partials, sm_partials)
        else:
            device_reduce_partial_block(arr, partials, sm_partials)
        # deal with the initializer
        if use_init and tid == 0 and cuda.blockIdx.x == 0:
            partials[0] = reduce_op(partials[0], init)

    return cuda.jit(gpu_reduce_block_strided)
167
+
168
+
169
class Reduce(object):
    """Reduction object built around a binary function.

    The binary function is compiled once per array dtype and the resulting
    kernel is cached inside this class, so keeping the object alive avoids
    re-compilation.
    """

    # Compiled kernels keyed by (functor, dtype); shared by all instances.
    _cache = {}

    def __init__(self, functor):
        """
        :param functor: A function implementing a binary operation for
                        reduction. It will be compiled as a CUDA device
                        function using ``cuda.jit(device=True)``.
        """
        self._functor = functor

    def _compile(self, dtype):
        """Return the reduction kernel for *dtype*, building and caching it
        on first use.
        """
        cache_key = (self._functor, dtype)
        try:
            return self._cache[cache_key]
        except KeyError:
            pass
        compiled = _gpu_reduce_factory(self._functor, from_dtype(dtype))
        self._cache[cache_key] = compiled
        return compiled

    def __call__(self, arr, size=None, res=None, init=0, stream=0):
        """Performs a full reduction.

        :param arr: A host or device array.
        :param size: Optional integer specifying the number of elements in
                    ``arr`` to reduce. If this parameter is not specified, the
                    entire array is reduced.
        :param res: Optional device array into which to write the reduction
                    result to. The result is written into the first element of
                    this array. If this parameter is specified, then no
                    communication of the reduction output takes place from the
                    device to the host.
        :param init: Optional initial value for the reduction, the type of which
                    must match ``arr.dtype``.
        :param stream: Optional CUDA stream in which to perform the reduction.
                    If no stream is specified, the default stream of 0 is
                    used.
        :return: If ``res`` is specified, ``None`` is returned. Otherwise, the
                result of the reduction is returned. For an empty input the
                converted ``init`` is returned directly.
        """
        from numba import cuda

        # Only one-dimensional input is supported.
        if arr.ndim != 1:
            raise TypeError("only support 1D array")

        # Restrict the reduction to the leading `size` elements on request.
        if size is not None:
            arr = arr[:size]

        # Coerce the initial value to the element type of the array.
        init = arr.dtype.type(init)

        # Nothing to reduce: the answer is the initial value itself.
        if arr.size < 1:
            return init

        kernel = self._compile(arr.dtype)

        # Split the input into whole blocks plus a shorter tail.
        threads_per_block = _NUMWARPS * _WARPSIZE
        whole_chunks, tail_len = divmod(arr.size, threads_per_block)
        whole_len = whole_chunks * threads_per_block
        whole_blocks = min(whole_chunks, _WARPSIZE * 2)

        # One partial result per full block, plus one for the tail if any.
        partials_len = whole_blocks + 1 if tail_len else whole_blocks
        partials = cuda.device_array(shape=partials_len, dtype=arr.dtype)

        if whole_len:
            # Fully populated thread blocks.
            kernel[whole_blocks, threads_per_block, stream](
                arr[:whole_len], partials[:whole_blocks], init, True
            )

        if tail_len:
            # Partially populated thread block; it folds in `init` only when
            # no full block has done so.
            kernel[1, tail_len, stream](
                arr[whole_len:], partials[whole_blocks:], init, not whole_blocks
            )

        if partials.size > 1:
            # Collapse the per-block partial results into partials[0].
            kernel[1, partials_len, stream](partials, partials, init, False)

        # Hand the result back to the host, or copy it into `res`.
        if res is None:
            return partials[0]
        res[:1].copy_to_device(partials[:1], stream=stream)
        return None
@@ -0,0 +1,65 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+ from numba import cuda
5
+ from numba.cuda.cudadrv.driver import driver
6
+ import math
7
+ from numba.cuda.np import numpy_support as nps
8
+
9
+
10
def transpose(a, b=None):
    """Compute the transpose of 'a' and store it into 'b', if given,
    and return it. If 'b' is not given, allocate a new array
    and return that.

    This implements the algorithm documented in
    http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

    :param a: an `np.ndarray` or a `DeviceNDArrayBase` subclass. If already on
        the device its stream will be used to perform the transpose (and to copy
        `b` to the device if necessary).
    :param b: optional output array; when omitted (``None``) a new device
        array with the transposed shape and the dtype of `a` is allocated.
    :return: `b`, or the newly allocated device array.
    """

    # prefer `a`'s stream if it has one, otherwise use the default stream 0
    stream = getattr(a, "stream", 0)

    # Compare against None explicitly: the truth value of an array is either
    # ambiguous (raises for more than one element) or reflects its contents.
    if b is None:
        cols, rows = a.shape
        strides = a.dtype.itemsize * cols, a.dtype.itemsize
        b = cuda.cudadrv.devicearray.DeviceNDArray(
            (rows, cols), strides, dtype=a.dtype, stream=stream
        )

    dt = nps.from_dtype(a.dtype)

    tpb = driver.get_device().MAX_THREADS_PER_BLOCK
    # we need to factor available threads into x and y axis
    tile_width = int(math.pow(2, math.log(tpb, 2) / 2))
    tile_height = int(tpb / tile_width)

    # one extra column of padding in the shared-memory tile
    tile_shape = (tile_height, tile_width + 1)

    @cuda.jit
    def kernel(input, output):
        tile = cuda.shared.array(shape=tile_shape, dtype=dt)

        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x * cuda.blockDim.x
        by = cuda.blockIdx.y * cuda.blockDim.y
        # output coordinates use the block offsets swapped
        x = by + tx
        y = bx + ty

        if by + ty < input.shape[0] and bx + tx < input.shape[1]:
            tile[ty, tx] = input[by + ty, bx + tx]
        cuda.syncthreads()
        if y < output.shape[0] and x < output.shape[1]:
            output[y, x] = tile[tx, ty]

    # one block per tile, plus one for remainders
    blocks = int(b.shape[0] / tile_height + 1), int(b.shape[1] / tile_width + 1)
    # one thread per tile element
    threads = tile_height, tile_width
    kernel[blocks, threads, stream](a, b)

    return b