numba-cuda 0.21.1__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (488)
  1. _numba_cuda_redirector.pth +4 -0
  2. _numba_cuda_redirector.py +89 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +6 -0
  5. numba_cuda/_version.py +11 -0
  6. numba_cuda/numba/cuda/__init__.py +70 -0
  7. numba_cuda/numba/cuda/_internal/cuda_bf16.py +16394 -0
  8. numba_cuda/numba/cuda/_internal/cuda_fp16.py +8112 -0
  9. numba_cuda/numba/cuda/api.py +577 -0
  10. numba_cuda/numba/cuda/api_util.py +76 -0
  11. numba_cuda/numba/cuda/args.py +72 -0
  12. numba_cuda/numba/cuda/bf16.py +397 -0
  13. numba_cuda/numba/cuda/cache_hints.py +287 -0
  14. numba_cuda/numba/cuda/cext/__init__.py +2 -0
  15. numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
  16. numba_cuda/numba/cuda/cext/_devicearray.cpp +159 -0
  17. numba_cuda/numba/cuda/cext/_devicearray.h +29 -0
  18. numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
  19. numba_cuda/numba/cuda/cext/_dispatcher.cpp +1098 -0
  20. numba_cuda/numba/cuda/cext/_hashtable.cpp +532 -0
  21. numba_cuda/numba/cuda/cext/_hashtable.h +135 -0
  22. numba_cuda/numba/cuda/cext/_helperlib.c +71 -0
  23. numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
  24. numba_cuda/numba/cuda/cext/_helpermod.c +82 -0
  25. numba_cuda/numba/cuda/cext/_pymodule.h +38 -0
  26. numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
  27. numba_cuda/numba/cuda/cext/_typeconv.cpp +206 -0
  28. numba_cuda/numba/cuda/cext/_typeof.cpp +1159 -0
  29. numba_cuda/numba/cuda/cext/_typeof.h +19 -0
  30. numba_cuda/numba/cuda/cext/capsulethunk.h +111 -0
  31. numba_cuda/numba/cuda/cext/mviewbuf.c +385 -0
  32. numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
  33. numba_cuda/numba/cuda/cext/typeconv.cpp +212 -0
  34. numba_cuda/numba/cuda/cext/typeconv.hpp +101 -0
  35. numba_cuda/numba/cuda/cg.py +67 -0
  36. numba_cuda/numba/cuda/cgutils.py +1294 -0
  37. numba_cuda/numba/cuda/cloudpickle/__init__.py +21 -0
  38. numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +1598 -0
  39. numba_cuda/numba/cuda/cloudpickle/cloudpickle_fast.py +17 -0
  40. numba_cuda/numba/cuda/codegen.py +541 -0
  41. numba_cuda/numba/cuda/compiler.py +1396 -0
  42. numba_cuda/numba/cuda/core/analysis.py +758 -0
  43. numba_cuda/numba/cuda/core/annotations/__init__.py +0 -0
  44. numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +288 -0
  45. numba_cuda/numba/cuda/core/annotations/type_annotations.py +305 -0
  46. numba_cuda/numba/cuda/core/base.py +1332 -0
  47. numba_cuda/numba/cuda/core/boxing.py +1411 -0
  48. numba_cuda/numba/cuda/core/bytecode.py +728 -0
  49. numba_cuda/numba/cuda/core/byteflow.py +2346 -0
  50. numba_cuda/numba/cuda/core/caching.py +744 -0
  51. numba_cuda/numba/cuda/core/callconv.py +392 -0
  52. numba_cuda/numba/cuda/core/codegen.py +171 -0
  53. numba_cuda/numba/cuda/core/compiler.py +199 -0
  54. numba_cuda/numba/cuda/core/compiler_lock.py +85 -0
  55. numba_cuda/numba/cuda/core/compiler_machinery.py +497 -0
  56. numba_cuda/numba/cuda/core/config.py +650 -0
  57. numba_cuda/numba/cuda/core/consts.py +124 -0
  58. numba_cuda/numba/cuda/core/controlflow.py +989 -0
  59. numba_cuda/numba/cuda/core/entrypoints.py +57 -0
  60. numba_cuda/numba/cuda/core/environment.py +66 -0
  61. numba_cuda/numba/cuda/core/errors.py +917 -0
  62. numba_cuda/numba/cuda/core/event.py +511 -0
  63. numba_cuda/numba/cuda/core/funcdesc.py +330 -0
  64. numba_cuda/numba/cuda/core/generators.py +387 -0
  65. numba_cuda/numba/cuda/core/imputils.py +509 -0
  66. numba_cuda/numba/cuda/core/inline_closurecall.py +1787 -0
  67. numba_cuda/numba/cuda/core/interpreter.py +3617 -0
  68. numba_cuda/numba/cuda/core/ir.py +1812 -0
  69. numba_cuda/numba/cuda/core/ir_utils.py +2638 -0
  70. numba_cuda/numba/cuda/core/optional.py +129 -0
  71. numba_cuda/numba/cuda/core/options.py +262 -0
  72. numba_cuda/numba/cuda/core/postproc.py +249 -0
  73. numba_cuda/numba/cuda/core/pythonapi.py +1859 -0
  74. numba_cuda/numba/cuda/core/registry.py +46 -0
  75. numba_cuda/numba/cuda/core/removerefctpass.py +123 -0
  76. numba_cuda/numba/cuda/core/rewrites/__init__.py +26 -0
  77. numba_cuda/numba/cuda/core/rewrites/ir_print.py +91 -0
  78. numba_cuda/numba/cuda/core/rewrites/registry.py +104 -0
  79. numba_cuda/numba/cuda/core/rewrites/static_binop.py +41 -0
  80. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +189 -0
  81. numba_cuda/numba/cuda/core/rewrites/static_raise.py +100 -0
  82. numba_cuda/numba/cuda/core/sigutils.py +68 -0
  83. numba_cuda/numba/cuda/core/ssa.py +498 -0
  84. numba_cuda/numba/cuda/core/targetconfig.py +330 -0
  85. numba_cuda/numba/cuda/core/tracing.py +231 -0
  86. numba_cuda/numba/cuda/core/transforms.py +956 -0
  87. numba_cuda/numba/cuda/core/typed_passes.py +867 -0
  88. numba_cuda/numba/cuda/core/typeinfer.py +1950 -0
  89. numba_cuda/numba/cuda/core/unsafe/__init__.py +0 -0
  90. numba_cuda/numba/cuda/core/unsafe/bytes.py +67 -0
  91. numba_cuda/numba/cuda/core/unsafe/eh.py +67 -0
  92. numba_cuda/numba/cuda/core/unsafe/refcount.py +98 -0
  93. numba_cuda/numba/cuda/core/untyped_passes.py +1979 -0
  94. numba_cuda/numba/cuda/cpython/builtins.py +1153 -0
  95. numba_cuda/numba/cuda/cpython/charseq.py +1218 -0
  96. numba_cuda/numba/cuda/cpython/cmathimpl.py +560 -0
  97. numba_cuda/numba/cuda/cpython/enumimpl.py +103 -0
  98. numba_cuda/numba/cuda/cpython/iterators.py +167 -0
  99. numba_cuda/numba/cuda/cpython/listobj.py +1326 -0
  100. numba_cuda/numba/cuda/cpython/mathimpl.py +499 -0
  101. numba_cuda/numba/cuda/cpython/numbers.py +1475 -0
  102. numba_cuda/numba/cuda/cpython/rangeobj.py +289 -0
  103. numba_cuda/numba/cuda/cpython/slicing.py +322 -0
  104. numba_cuda/numba/cuda/cpython/tupleobj.py +456 -0
  105. numba_cuda/numba/cuda/cpython/unicode.py +2865 -0
  106. numba_cuda/numba/cuda/cpython/unicode_support.py +1597 -0
  107. numba_cuda/numba/cuda/cpython/unsafe/__init__.py +0 -0
  108. numba_cuda/numba/cuda/cpython/unsafe/numbers.py +64 -0
  109. numba_cuda/numba/cuda/cpython/unsafe/tuple.py +92 -0
  110. numba_cuda/numba/cuda/cuda_paths.py +691 -0
  111. numba_cuda/numba/cuda/cudadecl.py +556 -0
  112. numba_cuda/numba/cuda/cudadrv/__init__.py +14 -0
  113. numba_cuda/numba/cuda/cudadrv/devicearray.py +951 -0
  114. numba_cuda/numba/cuda/cudadrv/devices.py +249 -0
  115. numba_cuda/numba/cuda/cudadrv/driver.py +3222 -0
  116. numba_cuda/numba/cuda/cudadrv/drvapi.py +435 -0
  117. numba_cuda/numba/cuda/cudadrv/dummyarray.py +558 -0
  118. numba_cuda/numba/cuda/cudadrv/enums.py +613 -0
  119. numba_cuda/numba/cuda/cudadrv/error.py +48 -0
  120. numba_cuda/numba/cuda/cudadrv/libs.py +220 -0
  121. numba_cuda/numba/cuda/cudadrv/linkable_code.py +184 -0
  122. numba_cuda/numba/cuda/cudadrv/mappings.py +14 -0
  123. numba_cuda/numba/cuda/cudadrv/ndarray.py +26 -0
  124. numba_cuda/numba/cuda/cudadrv/nvrtc.py +193 -0
  125. numba_cuda/numba/cuda/cudadrv/nvvm.py +756 -0
  126. numba_cuda/numba/cuda/cudadrv/rtapi.py +13 -0
  127. numba_cuda/numba/cuda/cudadrv/runtime.py +34 -0
  128. numba_cuda/numba/cuda/cudaimpl.py +995 -0
  129. numba_cuda/numba/cuda/cudamath.py +149 -0
  130. numba_cuda/numba/cuda/datamodel/__init__.py +7 -0
  131. numba_cuda/numba/cuda/datamodel/cuda_manager.py +66 -0
  132. numba_cuda/numba/cuda/datamodel/cuda_models.py +1446 -0
  133. numba_cuda/numba/cuda/datamodel/cuda_packer.py +224 -0
  134. numba_cuda/numba/cuda/datamodel/cuda_registry.py +22 -0
  135. numba_cuda/numba/cuda/datamodel/cuda_testing.py +153 -0
  136. numba_cuda/numba/cuda/datamodel/manager.py +11 -0
  137. numba_cuda/numba/cuda/datamodel/models.py +9 -0
  138. numba_cuda/numba/cuda/datamodel/packer.py +9 -0
  139. numba_cuda/numba/cuda/datamodel/registry.py +11 -0
  140. numba_cuda/numba/cuda/datamodel/testing.py +11 -0
  141. numba_cuda/numba/cuda/debuginfo.py +903 -0
  142. numba_cuda/numba/cuda/decorators.py +294 -0
  143. numba_cuda/numba/cuda/descriptor.py +35 -0
  144. numba_cuda/numba/cuda/device_init.py +158 -0
  145. numba_cuda/numba/cuda/deviceufunc.py +1021 -0
  146. numba_cuda/numba/cuda/dispatcher.py +2463 -0
  147. numba_cuda/numba/cuda/errors.py +72 -0
  148. numba_cuda/numba/cuda/extending.py +697 -0
  149. numba_cuda/numba/cuda/flags.py +178 -0
  150. numba_cuda/numba/cuda/fp16.py +357 -0
  151. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  152. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  153. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  154. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  155. numba_cuda/numba/cuda/include/13/cuda_bf16.h +5118 -0
  156. numba_cuda/numba/cuda/include/13/cuda_bf16.hpp +3865 -0
  157. numba_cuda/numba/cuda/include/13/cuda_fp16.h +5363 -0
  158. numba_cuda/numba/cuda/include/13/cuda_fp16.hpp +3483 -0
  159. numba_cuda/numba/cuda/initialize.py +24 -0
  160. numba_cuda/numba/cuda/intrinsic_wrapper.py +41 -0
  161. numba_cuda/numba/cuda/intrinsics.py +382 -0
  162. numba_cuda/numba/cuda/itanium_mangler.py +214 -0
  163. numba_cuda/numba/cuda/kernels/__init__.py +2 -0
  164. numba_cuda/numba/cuda/kernels/reduction.py +265 -0
  165. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  166. numba_cuda/numba/cuda/libdevice.py +3386 -0
  167. numba_cuda/numba/cuda/libdevicedecl.py +20 -0
  168. numba_cuda/numba/cuda/libdevicefuncs.py +1060 -0
  169. numba_cuda/numba/cuda/libdeviceimpl.py +88 -0
  170. numba_cuda/numba/cuda/locks.py +19 -0
  171. numba_cuda/numba/cuda/lowering.py +1951 -0
  172. numba_cuda/numba/cuda/mathimpl.py +374 -0
  173. numba_cuda/numba/cuda/memory_management/__init__.py +4 -0
  174. numba_cuda/numba/cuda/memory_management/memsys.cu +99 -0
  175. numba_cuda/numba/cuda/memory_management/memsys.cuh +22 -0
  176. numba_cuda/numba/cuda/memory_management/nrt.cu +212 -0
  177. numba_cuda/numba/cuda/memory_management/nrt.cuh +48 -0
  178. numba_cuda/numba/cuda/memory_management/nrt.py +390 -0
  179. numba_cuda/numba/cuda/memory_management/nrt_context.py +438 -0
  180. numba_cuda/numba/cuda/misc/appdirs.py +594 -0
  181. numba_cuda/numba/cuda/misc/cffiimpl.py +24 -0
  182. numba_cuda/numba/cuda/misc/coverage_support.py +43 -0
  183. numba_cuda/numba/cuda/misc/dump_style.py +41 -0
  184. numba_cuda/numba/cuda/misc/findlib.py +75 -0
  185. numba_cuda/numba/cuda/misc/firstlinefinder.py +96 -0
  186. numba_cuda/numba/cuda/misc/gdb_hook.py +240 -0
  187. numba_cuda/numba/cuda/misc/literal.py +28 -0
  188. numba_cuda/numba/cuda/misc/llvm_pass_timings.py +412 -0
  189. numba_cuda/numba/cuda/misc/special.py +94 -0
  190. numba_cuda/numba/cuda/models.py +56 -0
  191. numba_cuda/numba/cuda/np/arraymath.py +5130 -0
  192. numba_cuda/numba/cuda/np/arrayobj.py +7635 -0
  193. numba_cuda/numba/cuda/np/extensions.py +11 -0
  194. numba_cuda/numba/cuda/np/linalg.py +3087 -0
  195. numba_cuda/numba/cuda/np/math/__init__.py +0 -0
  196. numba_cuda/numba/cuda/np/math/cmathimpl.py +558 -0
  197. numba_cuda/numba/cuda/np/math/mathimpl.py +487 -0
  198. numba_cuda/numba/cuda/np/math/numbers.py +1461 -0
  199. numba_cuda/numba/cuda/np/npdatetime.py +969 -0
  200. numba_cuda/numba/cuda/np/npdatetime_helpers.py +217 -0
  201. numba_cuda/numba/cuda/np/npyfuncs.py +1808 -0
  202. numba_cuda/numba/cuda/np/npyimpl.py +1027 -0
  203. numba_cuda/numba/cuda/np/numpy_support.py +798 -0
  204. numba_cuda/numba/cuda/np/polynomial/__init__.py +4 -0
  205. numba_cuda/numba/cuda/np/polynomial/polynomial_core.py +242 -0
  206. numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +380 -0
  207. numba_cuda/numba/cuda/np/ufunc/__init__.py +4 -0
  208. numba_cuda/numba/cuda/np/ufunc/decorators.py +203 -0
  209. numba_cuda/numba/cuda/np/ufunc/sigparse.py +68 -0
  210. numba_cuda/numba/cuda/np/ufunc/ufuncbuilder.py +65 -0
  211. numba_cuda/numba/cuda/np/ufunc_db.py +1282 -0
  212. numba_cuda/numba/cuda/np/unsafe/__init__.py +0 -0
  213. numba_cuda/numba/cuda/np/unsafe/ndarray.py +84 -0
  214. numba_cuda/numba/cuda/nvvmutils.py +254 -0
  215. numba_cuda/numba/cuda/printimpl.py +126 -0
  216. numba_cuda/numba/cuda/random.py +308 -0
  217. numba_cuda/numba/cuda/reshape_funcs.cu +156 -0
  218. numba_cuda/numba/cuda/serialize.py +267 -0
  219. numba_cuda/numba/cuda/simulator/__init__.py +63 -0
  220. numba_cuda/numba/cuda/simulator/_internal/__init__.py +4 -0
  221. numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +2 -0
  222. numba_cuda/numba/cuda/simulator/api.py +179 -0
  223. numba_cuda/numba/cuda/simulator/bf16.py +4 -0
  224. numba_cuda/numba/cuda/simulator/compiler.py +38 -0
  225. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +11 -0
  226. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +462 -0
  227. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +122 -0
  228. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +66 -0
  229. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +7 -0
  230. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +7 -0
  231. numba_cuda/numba/cuda/simulator/cudadrv/error.py +10 -0
  232. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +10 -0
  233. numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +61 -0
  234. numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +11 -0
  235. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +32 -0
  236. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +22 -0
  237. numba_cuda/numba/cuda/simulator/dispatcher.py +11 -0
  238. numba_cuda/numba/cuda/simulator/kernel.py +320 -0
  239. numba_cuda/numba/cuda/simulator/kernelapi.py +509 -0
  240. numba_cuda/numba/cuda/simulator/memory_management/__init__.py +4 -0
  241. numba_cuda/numba/cuda/simulator/memory_management/nrt.py +21 -0
  242. numba_cuda/numba/cuda/simulator/reduction.py +19 -0
  243. numba_cuda/numba/cuda/simulator/tests/support.py +4 -0
  244. numba_cuda/numba/cuda/simulator/vector_types.py +65 -0
  245. numba_cuda/numba/cuda/simulator_init.py +18 -0
  246. numba_cuda/numba/cuda/stubs.py +635 -0
  247. numba_cuda/numba/cuda/target.py +505 -0
  248. numba_cuda/numba/cuda/testing.py +347 -0
  249. numba_cuda/numba/cuda/tests/__init__.py +62 -0
  250. numba_cuda/numba/cuda/tests/benchmarks/__init__.py +0 -0
  251. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +119 -0
  252. numba_cuda/numba/cuda/tests/cloudpickle_main_class.py +9 -0
  253. numba_cuda/numba/cuda/tests/core/serialize_usecases.py +113 -0
  254. numba_cuda/numba/cuda/tests/core/test_itanium_mangler.py +83 -0
  255. numba_cuda/numba/cuda/tests/core/test_serialize.py +371 -0
  256. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +9 -0
  257. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +147 -0
  258. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +161 -0
  259. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +397 -0
  260. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +24 -0
  261. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +180 -0
  262. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +313 -0
  263. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +187 -0
  264. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +621 -0
  265. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +247 -0
  266. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +100 -0
  267. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +198 -0
  268. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +53 -0
  269. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +72 -0
  270. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +138 -0
  271. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +43 -0
  272. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +15 -0
  273. numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py +58 -0
  274. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +348 -0
  275. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +128 -0
  276. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +301 -0
  277. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +174 -0
  278. numba_cuda/numba/cuda/tests/cudadrv/test_nvrtc.py +28 -0
  279. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +185 -0
  280. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +39 -0
  281. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +23 -0
  282. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +38 -0
  283. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +48 -0
  284. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +44 -0
  285. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +127 -0
  286. numba_cuda/numba/cuda/tests/cudapy/__init__.py +9 -0
  287. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +231 -0
  288. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +50 -0
  289. numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +36 -0
  290. numba_cuda/numba/cuda/tests/cudapy/complex_usecases.py +116 -0
  291. numba_cuda/numba/cuda/tests/cudapy/enum_usecases.py +59 -0
  292. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +62 -0
  293. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +28 -0
  294. numba_cuda/numba/cuda/tests/cudapy/overload_usecases.py +33 -0
  295. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +104 -0
  296. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +47 -0
  297. numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +1122 -0
  298. numba_cuda/numba/cuda/tests/cudapy/test_array.py +344 -0
  299. numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +268 -0
  300. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +203 -0
  301. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +63 -0
  302. numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +360 -0
  303. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1815 -0
  304. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +599 -0
  305. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +377 -0
  306. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +160 -0
  307. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +27 -0
  308. numba_cuda/numba/cuda/tests/cudapy/test_byteflow.py +98 -0
  309. numba_cuda/numba/cuda/tests/cudapy/test_cache_hints.py +210 -0
  310. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +683 -0
  311. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +265 -0
  312. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +42 -0
  313. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +718 -0
  314. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +370 -0
  315. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +23 -0
  316. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +142 -0
  317. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +178 -0
  318. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +193 -0
  319. numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +131 -0
  320. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +438 -0
  321. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +94 -0
  322. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +101 -0
  323. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +105 -0
  324. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +889 -0
  325. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo_types.py +476 -0
  326. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +500 -0
  327. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +820 -0
  328. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +152 -0
  329. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +111 -0
  330. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +170 -0
  331. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1088 -0
  332. numba_cuda/numba/cuda/tests/cudapy/test_extending_types.py +71 -0
  333. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +265 -0
  334. numba_cuda/numba/cuda/tests/cudapy/test_flow_control.py +1433 -0
  335. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +57 -0
  336. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +34 -0
  337. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +69 -0
  338. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +62 -0
  339. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +474 -0
  340. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +167 -0
  341. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +92 -0
  342. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +39 -0
  343. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +170 -0
  344. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +255 -0
  345. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1219 -0
  346. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +263 -0
  347. numba_cuda/numba/cuda/tests/cudapy/test_ir.py +598 -0
  348. numba_cuda/numba/cuda/tests/cudapy/test_ir_utils.py +276 -0
  349. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +101 -0
  350. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +68 -0
  351. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +123 -0
  352. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +194 -0
  353. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +220 -0
  354. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +173 -0
  355. numba_cuda/numba/cuda/tests/cudapy/test_make_function_to_jit_function.py +364 -0
  356. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +47 -0
  357. numba_cuda/numba/cuda/tests/cudapy/test_math.py +842 -0
  358. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +76 -0
  359. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +78 -0
  360. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +25 -0
  361. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +145 -0
  362. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +39 -0
  363. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +82 -0
  364. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +53 -0
  365. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +504 -0
  366. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +93 -0
  367. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +402 -0
  368. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +128 -0
  369. numba_cuda/numba/cuda/tests/cudapy/test_print.py +193 -0
  370. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +37 -0
  371. numba_cuda/numba/cuda/tests/cudapy/test_random.py +117 -0
  372. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +614 -0
  373. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +130 -0
  374. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +94 -0
  375. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  376. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +86 -0
  377. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +40 -0
  378. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +457 -0
  379. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +233 -0
  380. numba_cuda/numba/cuda/tests/cudapy/test_ssa.py +454 -0
  381. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +56 -0
  382. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +277 -0
  383. numba_cuda/numba/cuda/tests/cudapy/test_tracing.py +200 -0
  384. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +90 -0
  385. numba_cuda/numba/cuda/tests/cudapy/test_typeconv.py +333 -0
  386. numba_cuda/numba/cuda/tests/cudapy/test_typeinfer.py +538 -0
  387. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +585 -0
  388. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +42 -0
  389. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +485 -0
  390. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +312 -0
  391. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +23 -0
  392. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +183 -0
  393. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +40 -0
  394. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +40 -0
  395. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +206 -0
  396. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +331 -0
  397. numba_cuda/numba/cuda/tests/cudasim/__init__.py +9 -0
  398. numba_cuda/numba/cuda/tests/cudasim/support.py +9 -0
  399. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +111 -0
  400. numba_cuda/numba/cuda/tests/data/__init__.py +2 -0
  401. numba_cuda/numba/cuda/tests/data/cta_barrier.cu +28 -0
  402. numba_cuda/numba/cuda/tests/data/cuda_include.cu +10 -0
  403. numba_cuda/numba/cuda/tests/data/error.cu +12 -0
  404. numba_cuda/numba/cuda/tests/data/include/add.cuh +8 -0
  405. numba_cuda/numba/cuda/tests/data/jitlink.cu +28 -0
  406. numba_cuda/numba/cuda/tests/data/jitlink.ptx +49 -0
  407. numba_cuda/numba/cuda/tests/data/warn.cu +12 -0
  408. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +9 -0
  409. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +2 -0
  410. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +54 -0
  411. numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +8 -0
  412. numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +14 -0
  413. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +86 -0
  414. numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +68 -0
  415. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +81 -0
  416. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +141 -0
  417. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +160 -0
  418. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +180 -0
  419. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +119 -0
  420. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +66 -0
  421. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +80 -0
  422. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +206 -0
  423. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +53 -0
  424. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +76 -0
  425. numba_cuda/numba/cuda/tests/nocuda/__init__.py +9 -0
  426. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +391 -0
  427. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +48 -0
  428. numba_cuda/numba/cuda/tests/nocuda/test_import.py +63 -0
  429. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +252 -0
  430. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +59 -0
  431. numba_cuda/numba/cuda/tests/nrt/__init__.py +9 -0
  432. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +387 -0
  433. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +124 -0
  434. numba_cuda/numba/cuda/tests/support.py +900 -0
  435. numba_cuda/numba/cuda/typeconv/__init__.py +4 -0
  436. numba_cuda/numba/cuda/typeconv/castgraph.py +137 -0
  437. numba_cuda/numba/cuda/typeconv/rules.py +63 -0
  438. numba_cuda/numba/cuda/typeconv/typeconv.py +121 -0
  439. numba_cuda/numba/cuda/types/__init__.py +233 -0
  440. numba_cuda/numba/cuda/types/__init__.pyi +167 -0
  441. numba_cuda/numba/cuda/types/abstract.py +9 -0
  442. numba_cuda/numba/cuda/types/common.py +9 -0
  443. numba_cuda/numba/cuda/types/containers.py +9 -0
  444. numba_cuda/numba/cuda/types/cuda_abstract.py +533 -0
  445. numba_cuda/numba/cuda/types/cuda_common.py +110 -0
  446. numba_cuda/numba/cuda/types/cuda_containers.py +971 -0
  447. numba_cuda/numba/cuda/types/cuda_function_type.py +230 -0
  448. numba_cuda/numba/cuda/types/cuda_functions.py +798 -0
  449. numba_cuda/numba/cuda/types/cuda_iterators.py +120 -0
  450. numba_cuda/numba/cuda/types/cuda_misc.py +569 -0
  451. numba_cuda/numba/cuda/types/cuda_npytypes.py +690 -0
  452. numba_cuda/numba/cuda/types/cuda_scalars.py +280 -0
  453. numba_cuda/numba/cuda/types/ext_types.py +101 -0
  454. numba_cuda/numba/cuda/types/function_type.py +11 -0
  455. numba_cuda/numba/cuda/types/functions.py +9 -0
  456. numba_cuda/numba/cuda/types/iterators.py +9 -0
  457. numba_cuda/numba/cuda/types/misc.py +9 -0
  458. numba_cuda/numba/cuda/types/npytypes.py +9 -0
  459. numba_cuda/numba/cuda/types/scalars.py +9 -0
  460. numba_cuda/numba/cuda/typing/__init__.py +19 -0
  461. numba_cuda/numba/cuda/typing/arraydecl.py +939 -0
  462. numba_cuda/numba/cuda/typing/asnumbatype.py +130 -0
  463. numba_cuda/numba/cuda/typing/bufproto.py +70 -0
  464. numba_cuda/numba/cuda/typing/builtins.py +1209 -0
  465. numba_cuda/numba/cuda/typing/cffi_utils.py +219 -0
  466. numba_cuda/numba/cuda/typing/cmathdecl.py +47 -0
  467. numba_cuda/numba/cuda/typing/collections.py +138 -0
  468. numba_cuda/numba/cuda/typing/context.py +782 -0
  469. numba_cuda/numba/cuda/typing/ctypes_utils.py +125 -0
  470. numba_cuda/numba/cuda/typing/dictdecl.py +63 -0
  471. numba_cuda/numba/cuda/typing/enumdecl.py +74 -0
  472. numba_cuda/numba/cuda/typing/listdecl.py +147 -0
  473. numba_cuda/numba/cuda/typing/mathdecl.py +158 -0
  474. numba_cuda/numba/cuda/typing/npdatetime.py +322 -0
  475. numba_cuda/numba/cuda/typing/npydecl.py +749 -0
  476. numba_cuda/numba/cuda/typing/setdecl.py +115 -0
  477. numba_cuda/numba/cuda/typing/templates.py +1446 -0
  478. numba_cuda/numba/cuda/typing/typeof.py +301 -0
  479. numba_cuda/numba/cuda/ufuncs.py +746 -0
  480. numba_cuda/numba/cuda/utils.py +724 -0
  481. numba_cuda/numba/cuda/vector_types.py +214 -0
  482. numba_cuda/numba/cuda/vectorizers.py +260 -0
  483. numba_cuda-0.21.1.dist-info/METADATA +109 -0
  484. numba_cuda-0.21.1.dist-info/RECORD +488 -0
  485. numba_cuda-0.21.1.dist-info/WHEEL +5 -0
  486. numba_cuda-0.21.1.dist-info/licenses/LICENSE +26 -0
  487. numba_cuda-0.21.1.dist-info/licenses/LICENSE.numba +24 -0
  488. numba_cuda-0.21.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,374 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+ import sys
5
+ import math
6
+ import operator
7
+ from llvmlite import ir
8
+ from numba.cuda import types, typing
9
+ from numba.cuda import cgutils
10
+ from numba.cuda.core.imputils import Registry
11
+ from numba.cuda.types import float32, float64, int64, uint64
12
+ from numba.cuda import libdevice
13
+ from numba.cuda.core import targetconfig
14
+
15
+ registry = Registry("mathimpl")
16
+ lower = registry.lower
17
+
18
+
19
# Lookup tables used to register the math lowerings below.
# Every table entry is a triple:
#   (float64 libdevice function name, float32 libdevice function name, math function)

# Predicates: the libdevice functions are looked up with an int32 return type.
booleans = [
    ("isnand", "isnanf", math.isnan),
    ("isinfd", "isinff", math.isinf),
    ("isfinited", "finitef", math.isfinite),
]

# One-argument functions.
unarys = [
    ("ceil", "ceilf", math.ceil),
    ("floor", "floorf", math.floor),
    ("fabs", "fabsf", math.fabs),
    ("exp", "expf", math.exp),
]
if sys.version_info >= (3, 11):
    # math.exp2 is only referenced on Python 3.11 and later.
    unarys.append(("exp2", "exp2f", math.exp2))
unarys.extend(
    [
        ("expm1", "expm1f", math.expm1),
        ("erf", "erff", math.erf),
        ("erfc", "erfcf", math.erfc),
        ("tgamma", "tgammaf", math.gamma),
        ("lgamma", "lgammaf", math.lgamma),
        ("sqrt", "sqrtf", math.sqrt),
        ("log", "logf", math.log),
        ("log2", "log2f", math.log2),
        ("log10", "log10f", math.log10),
        ("log1p", "log1pf", math.log1p),
        ("acosh", "acoshf", math.acosh),
        ("acos", "acosf", math.acos),
        ("cos", "cosf", math.cos),
        ("cosh", "coshf", math.cosh),
        ("asinh", "asinhf", math.asinh),
        ("asin", "asinf", math.asin),
        ("sin", "sinf", math.sin),
        ("sinh", "sinhf", math.sinh),
        ("atan", "atanf", math.atan),
        ("atanh", "atanhf", math.atanh),
        ("tan", "tanf", math.tan),
        ("trunc", "truncf", math.trunc),
    ]
)

# float32 libdevice name -> name of the replacement used when fastmath is on.
unarys_fastmath = {
    "cosf": "fast_cosf",
    "sinf": "fast_sinf",
    "tanf": "fast_tanf",
    "expf": "fast_expf",
    "log2f": "fast_log2f",
    "log10f": "fast_log10f",
    "logf": "fast_logf",
}

# Two-argument functions.
binarys = [
    ("copysign", "copysignf", math.copysign),
    ("atan2", "atan2f", math.atan2),
    ("pow", "powf", math.pow),
    ("fmod", "fmodf", math.fmod),
    ("hypot", "hypotf", math.hypot),
    ("remainder", "remainderf", math.remainder),
    ("nextafter", "nextafterf", math.nextafter),
]

# float32 libdevice name -> name of the replacement used when fastmath is on.
binarys_fastmath = {"powf": "fast_powf"}
74
+
75
+
76
@lower(math.isinf, types.Integer)
@lower(math.isnan, types.Integer)
def math_isinf_isnan_int(context, builder, sig, args):
    """Lower ``math.isinf`` / ``math.isnan`` for integer arguments.

    The argument is never inspected: the lowering always produces the
    boolean constant 0.
    """
    always_false = context.get_constant(types.boolean, 0)
    return always_false
80
+
81
+
82
@lower(operator.truediv, types.float32, types.float32)
def maybe_fast_truediv(context, builder, sig, args):
    """Lower float32 true division.

    With fastmath enabled the division is delegated to
    ``libdevice.fast_fdividef``. Otherwise a zero-divisor check that reports
    through the context's error model is emitted before a plain ``fdiv``.
    """
    if not context.fastmath:
        dividend, divisor = args
        with cgutils.if_zero(builder, divisor):
            context.error_model.fp_zero_division(builder, ("division by zero",))
        return builder.fdiv(dividend, divisor)

    fast_sig = typing.signature(float32, float32, float32)
    fast_div = context.get_function(libdevice.fast_fdividef, fast_sig)
    return fast_div(builder, args)
93
+
94
+
95
@lower(math.isfinite, types.Integer)
def math_isfinite_int(context, builder, sig, args):
    """Lower ``math.isfinite`` for integer arguments.

    The argument is never inspected: the lowering always produces the
    boolean constant 1.
    """
    always_true = context.get_constant(types.boolean, 1)
    return always_true
98
+
99
+
100
def impl_boolean(key, ty, libfunc):
    """Register a lowering of predicate ``key`` for argument type ``ty``.

    ``libfunc`` is looked up with the signature ``int32(ty)`` and its result
    is cast from int32 to boolean.
    """
    predicate_sig = typing.signature(types.int32, ty)

    def lower_boolean_impl(context, builder, sig, args):
        predicate = context.get_function(libfunc, predicate_sig)
        as_int32 = predicate(builder, args)
        return context.cast(builder, as_int32, types.int32, types.boolean)

    lower(key, ty)(lower_boolean_impl)
109
+
110
+
111
def get_lower_unary_impl(key, ty, libfunc):
    """Build (but do not register) a lowering that calls ``libfunc`` as ``ty(ty)``.

    For float32 with fastmath enabled, ``libfunc`` is swapped for the
    libdevice function named in ``unarys_fastmath`` when an entry exists for
    ``libfunc.__name__``. ``key`` is not used by the returned function.
    """

    def lower_unary_impl(context, builder, sig, args):
        chosen = libfunc
        if ty == float32 and context.fastmath:
            fast_name = unarys_fastmath.get(libfunc.__name__)
            if fast_name is not None:
                chosen = getattr(libdevice, fast_name)

        call = context.get_function(chosen, typing.signature(ty, ty))
        return call(builder, args)

    return lower_unary_impl
127
+
128
+
129
def get_unary_impl_for_fn_and_ty(fn, ty):
    """Return the unary lowering for math function ``fn`` at type ``ty``.

    ``fn`` is matched against the ``unarys`` table plus ``math.tanh``; ``ty``
    must be ``float32`` or ``float64``.

    Raises:
        RuntimeError: if ``fn`` is not in the table, or ``ty`` is neither
            ``float32`` nor ``float64``. (Previously an unsupported ``ty``
            escaped as an ``UnboundLocalError`` because ``impl`` was never
            assigned.)
    """
    # tanh is a special case - because it is not registered like the other
    # unary implementations, it does not appear in the unarys list. However,
    # its implementation can be looked up by key like the other
    # implementations, so we add it to the list we search here.
    tanh_impls = ("tanh", "tanhf", math.tanh)
    for fname64, fname32, key in unarys + [tanh_impls]:
        if fn != key:
            continue

        if ty == float32:
            fname = fname32
        elif ty == float64:
            fname = fname64
        else:
            # Known function, unsupported type: report it like a missing entry.
            break

        return get_lower_unary_impl(key, ty, getattr(libdevice, fname))

    raise RuntimeError(f"Implementation of {fn} for {ty} not found")
145
+
146
+
147
def impl_unary(key, ty, libfunc):
    """Register the unary lowering of ``key`` for argument type ``ty``."""
    lower(key, ty)(get_lower_unary_impl(key, ty, libfunc))
150
+
151
+
152
def impl_unary_int(key, ty, libfunc):
    """Register a lowering of ``key`` for a 64-bit integer argument type ``ty``.

    The integer argument is converted to double (signed conversion for
    int64, unsigned for uint64) and ``libfunc`` is called as
    ``float64(float64)``. Any other argument type raises ``TypeError`` at
    lowering time.
    """

    def lower_unary_int_impl(context, builder, sig, args):
        arg_ty = sig.args[0]
        if arg_ty == int64:
            to_double = builder.sitofp
        elif arg_ty == uint64:
            to_double = builder.uitofp
        else:
            raise TypeError(
                "Only 64-bit integers are supported for generic unary int ops"
            )

        as_double = to_double(args[0], ir.DoubleType())
        double_sig = typing.signature(float64, float64)
        call = context.get_function(libfunc, double_sig)
        return call(builder, [as_double])

    lower(key, ty)(lower_unary_int_impl)
168
+
169
+
170
def get_lower_binary_impl(key, ty, libfunc):
    """Build (but do not register) a lowering that calls ``libfunc`` as ``ty(ty, ty)``.

    For float32 with fastmath enabled, ``libfunc`` is swapped for the
    libdevice function named in ``binarys_fastmath`` when an entry exists for
    ``libfunc.__name__``. ``key`` is not used by the returned function.
    """

    def lower_binary_impl(context, builder, sig, args):
        chosen = libfunc
        if ty == float32 and context.fastmath:
            fast_name = binarys_fastmath.get(libfunc.__name__)
            if fast_name is not None:
                chosen = getattr(libdevice, fast_name)

        call = context.get_function(chosen, typing.signature(ty, ty, ty))
        return call(builder, args)

    return lower_binary_impl
186
+
187
+
188
def get_binary_impl_for_fn_and_ty(fn, ty):
    """Return the binary lowering for math function ``fn`` at type ``ty``.

    ``fn`` is matched against the ``binarys`` table; ``ty`` must be
    ``float32`` or ``float64``.

    Raises:
        RuntimeError: if ``fn`` is not in the table, or ``ty`` is neither
            ``float32`` nor ``float64``. (Previously an unsupported ``ty``
            escaped as an ``UnboundLocalError`` because ``impl`` was never
            assigned.)
    """
    for fname64, fname32, key in binarys:
        if fn != key:
            continue

        if ty == float32:
            fname = fname32
        elif ty == float64:
            fname = fname64
        else:
            # Known function, unsupported type: report it like a missing entry.
            break

        return get_lower_binary_impl(key, ty, getattr(libdevice, fname))

    raise RuntimeError(f"Implementation of {fn} for {ty} not found")
199
+
200
+
201
def impl_binary(key, ty, libfunc):
    """Register the binary lowering of ``key`` for argument types ``(ty, ty)``."""
    lower(key, ty, ty)(get_lower_binary_impl(key, ty, libfunc))
204
+
205
+
206
def impl_binary_int(key, ty, libfunc):
    """Register a lowering of ``key`` for 64-bit integer arguments ``(ty, ty)``.

    Both arguments are converted to double with the conversion selected by
    the first argument's type (signed for int64, unsigned for uint64), then
    ``libfunc`` is called as ``float64(float64, float64)``. Any other first
    argument type raises ``TypeError`` at lowering time.
    """

    def lower_binary_int_impl(context, builder, sig, args):
        first_ty = sig.args[0]
        if first_ty == int64:
            to_double = builder.sitofp
        elif first_ty == uint64:
            to_double = builder.uitofp
        else:
            raise TypeError(
                "Only 64-bit integers are supported for generic binary int ops"
            )

        doubles = [to_double(value, ir.DoubleType()) for value in args]
        double_sig = typing.signature(float64, float64, float64)
        call = context.get_function(libfunc, double_sig)
        return call(builder, doubles)

    lower(key, ty, ty)(lower_binary_int_impl)
222
+
223
+
224
# Register the table-driven lowerings. For each entry both libdevice
# functions are resolved first, then registered for float32 and float64.
for fname64, fname32, key in booleans:
    impl32, impl64 = getattr(libdevice, fname32), getattr(libdevice, fname64)
    impl_boolean(key, float32, impl32)
    impl_boolean(key, float64, impl64)


# Unary functions additionally get int64/uint64 lowerings that go through
# the float64 libdevice function.
for fname64, fname32, key in unarys:
    impl32, impl64 = getattr(libdevice, fname32), getattr(libdevice, fname64)
    impl_unary(key, float32, impl32)
    impl_unary(key, float64, impl64)
    impl_unary_int(key, int64, impl64)
    impl_unary_int(key, uint64, impl64)


# Binary functions follow the same scheme as the unary ones.
for fname64, fname32, key in binarys:
    impl32, impl64 = getattr(libdevice, fname32), getattr(libdevice, fname64)
    impl_binary(key, float32, impl32)
    impl_binary(key, float64, impl64)
    impl_binary_int(key, int64, impl64)
    impl_binary_int(key, uint64, impl64)
247
+
248
+
249
def impl_pow_int(ty, libfunc):
    """Register ``math.pow(ty, int32)`` as a call to ``libfunc`` typed ``ty(ty, int32)``."""
    powi_sig = typing.signature(ty, ty, types.int32)

    def lower_pow_impl_int(context, builder, sig, args):
        call = context.get_function(libfunc, powi_sig)
        return call(builder, args)

    lower(math.pow, ty, types.int32)(lower_pow_impl_int)
256
+
257
+
258
+ impl_pow_int(types.float32, libdevice.powif)
259
+ impl_pow_int(types.float64, libdevice.powi)
260
+
261
+
262
def impl_modf(ty, libfunc):
    """Register ``math.modf(ty)`` as a call to ``libfunc``.

    ``libfunc`` is looked up with a return type of a homogeneous 2-tuple of
    ``ty``.
    """
    modf_sig = typing.signature(types.UniTuple(ty, 2), ty)

    def lower_modf_impl(context, builder, sig, args):
        call = context.get_function(libfunc, modf_sig)
        return call(builder, args)

    lower(math.modf, ty)(lower_modf_impl)
271
+
272
+
273
+ impl_modf(types.float32, libdevice.modff)
274
+ impl_modf(types.float64, libdevice.modf)
275
+
276
+
277
def impl_frexp(ty, libfunc):
    """Register ``math.frexp(ty)`` as a call to ``libfunc``.

    ``libfunc`` is looked up with a return type of the tuple ``(ty, int32)``.
    """
    frexp_sig = typing.signature(types.Tuple((ty, types.int32)), ty)

    def lower_frexp_impl(context, builder, sig, args):
        call = context.get_function(libfunc, frexp_sig)
        return call(builder, args)

    lower(math.frexp, ty)(lower_frexp_impl)
286
+
287
+
288
+ impl_frexp(types.float32, libdevice.frexpf)
289
+ impl_frexp(types.float64, libdevice.frexp)
290
+
291
+
292
def impl_ldexp(ty, libfunc):
    """Register ``math.ldexp(ty, int32)`` as a call to ``libfunc`` typed ``ty(ty, int32)``."""
    ldexp_sig = typing.signature(ty, ty, types.int32)

    def lower_ldexp_impl(context, builder, sig, args):
        call = context.get_function(libfunc, ldexp_sig)
        return call(builder, args)

    lower(math.ldexp, ty, types.int32)(lower_ldexp_impl)
299
+
300
+
301
+ impl_ldexp(types.float32, libdevice.ldexpf)
302
+ impl_ldexp(types.float64, libdevice.ldexp)
303
+
304
+
305
def impl_tanh(ty, libfunc):
    """Register the lowering of ``math.tanh`` for argument type ``ty``.

    The default path calls ``libfunc`` as ``ty(ty)``. For float32 with
    fastmath enabled, and a compute capability of at least (7, 5) on the
    active target configuration, the ``tanh.approx.f32`` instruction is
    emitted as inline assembly instead.
    """

    def lower_tanh_impl(context, builder, sig, args):
        if ty == float32 and context.fastmath:
            # The compute capability is read from the flags on top of the
            # target configuration stack.
            cc = targetconfig.ConfigStack().top().compute_capability
            if cc >= (7, 5):
                fnty = ir.FunctionType(ir.FloatType(), [ir.FloatType()])
                asm = ir.InlineAsm(fnty, "tanh.approx.f32 $0, $1;", "=f,f")
                return builder.call(asm, args)

        call = context.get_function(libfunc, typing.signature(ty, ty))
        return call(builder, args)

    lower(math.tanh, ty)(lower_tanh_impl)
329
+
330
+
331
+ impl_tanh(types.float32, libdevice.tanhf)
332
+ impl_tanh(types.float64, libdevice.tanh)
333
+
334
+ impl_unary_int(math.tanh, int64, libdevice.tanh)
335
+ impl_unary_int(math.tanh, uint64, libdevice.tanh)
336
+
337
+
338
+ # Complex power implementations - translations of _Py_c_pow from CPython
339
+ # https://github.com/python/cpython/blob/a755410e054e1e2390de5830befc08fe80706c66/Objects/complexobject.c#L123-L151
340
+ #
341
+ # The complex64 variant casts all constants and some variables to ensure that
342
+ # as much computation is done in single precision as possible. A small number
343
+ # of operations are still done in 64-bit, but these come from libdevice code.
344
+
345
+
346
def cpow_implement(fty, cty):
    """Register complex power for ``cty`` operands (components of type ``fty``).

    The lowering is registered for ``operator.pow``, ``operator.ipow`` and
    the builtin ``pow``. The computation follows CPython's ``_Py_c_pow``:

    * ``b == 0``  -> ``1 + 0j``
    * ``a == 0``  -> ``0 + 0j``
    * otherwise the polar form ``|a|**b.real * exp(-arg(a) * b.imag)`` with
      phase ``arg(a) * b.real + b.imag * log(|a|)``.

    The zero-base test checks both components of ``a``. The earlier version
    compared ``a.real`` with ``b.real``, which returned 0 for a purely
    imaginary base raised to a purely imaginary exponent, and let a true zero
    base with a complex exponent fall through to ``log(0)``.
    """

    def core(context, builder, sig, args):
        def cpow_internal(a, b):
            if b.real == fty(0.0) and b.imag == fty(0.0):
                return cty(1.0) + cty(0.0j)
            elif a.real == fty(0.0) and a.imag == fty(0.0):
                return cty(0.0) + cty(0.0j)

            vabs = math.hypot(a.real, a.imag)
            # Named ``magnitude`` rather than ``len`` to avoid shadowing the
            # builtin.
            magnitude = math.pow(vabs, b.real)
            at = math.atan2(a.imag, a.real)
            phase = at * b.real
            if b.imag != fty(0.0):
                magnitude /= math.exp(at * b.imag)
                phase += b.imag * math.log(vabs)

            return magnitude * (
                cty(math.cos(phase)) + cty(math.sin(phase) * cty(1.0j))
            )

        return context.compile_internal(builder, cpow_internal, sig, args)

    lower(operator.pow, cty, cty)(core)
    lower(operator.ipow, cty, cty)(core)
    lower(pow, cty, cty)(core)
371
+
372
+
373
+ cpow_implement(types.float32, types.complex64)
374
+ cpow_implement(types.float64, types.complex128)
@@ -0,0 +1,4 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+ from numba.cuda.memory_management.nrt import rtsys # noqa: F401
@@ -0,0 +1,99 @@
1
+ /*
2
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-2-Clause
4
+ */
5
+
6
+ #include "memsys.cuh"
7
+
8
+ __device__ size_t memsys_size = sizeof(NRT_MemSys);
9
+
10
namespace detail
{
  // Guard used by the kernels in this file: trips a device-side assertion
  // when the global TheMSys pointer is still null.
  void __device__ check_memsys()
  {
    const bool memsys_is_set = (TheMSys != nullptr);
    if (!memsys_is_set)
    {
      assert(false && "TheMSys pointer is null. Please use NRT_MemSys_set to set pointer first.");
    }
  }
}
20
+
21
// Kernel: stores memsys_ptr in the device global TheMSys.
// The pointer is stored as given; it is not validated here.
extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr)
{
  NRT_MemSys *const incoming = memsys_ptr;
  TheMSys = incoming;
}
25
+
26
// Kernel: copies the four statistics counters into managed_memsys, in the
// order alloc, free, mi_alloc, mi_free (indices 0..3).
// No thread indexing is used, so every launched thread performs the same
// writes.
extern "C" __global__ void NRT_MemSys_read(uint64_t *managed_memsys)
{
  detail::check_memsys();
  auto &stats = TheMSys->stats;
  managed_memsys[0] = stats.alloc.load();
  managed_memsys[1] = stats.free.load();
  managed_memsys[2] = stats.mi_alloc.load();
  managed_memsys[3] = stats.mi_free.load();
}
34
+
35
// Kernel: writes the current value of the alloc counter to managed_result[0].
extern "C" __global__ void NRT_MemSys_read_alloc(uint64_t *managed_result)
{
  detail::check_memsys();
  *managed_result = TheMSys->stats.alloc.load();
}
40
+
41
// Kernel: writes the current value of the free counter to managed_result[0].
extern "C" __global__ void NRT_MemSys_read_free(uint64_t *managed_result)
{
  detail::check_memsys();
  *managed_result = TheMSys->stats.free.load();
}
46
+
47
// Kernel: writes the current value of the mi_alloc counter to
// managed_result[0].
extern "C" __global__ void NRT_MemSys_read_mi_alloc(uint64_t *managed_result)
{
  detail::check_memsys();
  *managed_result = TheMSys->stats.mi_alloc.load();
}
52
+
53
// Kernel: writes the current value of the mi_free counter to
// managed_result[0].
extern "C" __global__ void NRT_MemSys_read_mi_free(uint64_t *managed_result)
{
  detail::check_memsys();
  *managed_result = TheMSys->stats.mi_free.load();
}
58
+
59
// Kernel: resets the memory-system statistics. Statistics collection is
// turned off and all four counters are set to zero.
extern "C" __global__ void NRT_MemSys_init(void)
{
  detail::check_memsys();
  auto &stats = TheMSys->stats;
  stats.enabled = false;
  stats.alloc.store(0);
  stats.free.store(0);
  stats.mi_alloc.store(0);
  stats.mi_free.store(0);
}
68
+
69
// Kernel: turns statistics collection on by setting stats.enabled.
extern "C" __global__ void NRT_MemSys_enable_stats(void)
{
  detail::check_memsys();
  auto &stats = TheMSys->stats;
  stats.enabled = true;
}
74
+
75
// Kernel: turns statistics collection off by clearing stats.enabled.
extern "C" __global__ void NRT_MemSys_disable_stats(void)
{
  detail::check_memsys();
  auto &stats = TheMSys->stats;
  stats.enabled = false;
}
80
+
81
// Kernel: reports whether statistics collection is on by writing the
// stats.enabled flag, converted to uint8_t, to enabled[0].
extern "C" __global__ void NRT_MemSys_stats_enabled(uint8_t *enabled)
{
  detail::check_memsys();
  const bool is_enabled = TheMSys->stats.enabled;
  enabled[0] = static_cast<uint8_t>(is_enabled);
}
86
+
87
// Kernel: prints the statistics flag and the four counters with device-side
// printf, or a notice when TheMSys has not been set. Unlike the other
// kernels in this file it does not assert on a null TheMSys.
extern "C" __global__ void NRT_MemSys_print(void)
{
  if (TheMSys == nullptr)
  {
    printf("TheMsys is null.\n");
    return;
  }

  printf("TheMSys->stats.enabled %d\n", TheMSys->stats.enabled);
  printf("TheMSys->stats.alloc %zu\n", TheMSys->stats.alloc.load());
  printf("TheMSys->stats.free %zu\n", TheMSys->stats.free.load());
  printf("TheMSys->stats.mi_alloc %zu\n", TheMSys->stats.mi_alloc.load());
  printf("TheMSys->stats.mi_free %zu\n", TheMSys->stats.mi_free.load());
}
@@ -0,0 +1,22 @@
1
+ /*
2
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-2-Clause
4
+ */
5
+
6
+ #include <cuda/atomic>
7
+
8
+ // Globally needed variables
9
// Memory-system state: a statistics-enabled flag followed by four
// device-scope atomic counters. Member order and types are unchanged, so
// sizeof(NRT_MemSys) is unaffected by the alias below.
struct NRT_MemSys {
  // Counter type shared by every statistics field.
  using counter_t = cuda::atomic<size_t, cuda::thread_scope_device>;

  struct {
    bool enabled;        // counters are only updated while this is true
    counter_t alloc;     // allocations
    counter_t free;      // frees
    counter_t mi_alloc;  // MemInfo initialisations
    counter_t mi_free;   // MemInfo destructions
  } stats;
};
18
+
19
+ /* The Memory System object */
20
+ __device__ NRT_MemSys* TheMSys;
21
+
22
+ extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr);
@@ -0,0 +1,212 @@
1
+ /*
2
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-2-Clause
4
+ */
5
+
6
+ #ifndef _NRT_H
7
+ #define _NRT_H
8
+
9
+ #include <cuda/atomic>
10
+
11
+ #include "memsys.cuh"
12
+ #include "nrt.cuh"
13
+
14
+
15
// Kernel: stores memsys_ptr in the device global TheMSys.
// The pointer is stored as given; it is not validated here.
extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr)
{
  NRT_MemSys *const incoming = memsys_ptr;
  TheMSys = incoming;
}
19
+
20
+
21
// Allocates `size` bytes with device-side malloc and returns the pointer
// (NULL when malloc fails).
// When statistics are enabled the alloc counter is incremented after the
// malloc call, whether or not the allocation succeeded.
extern "C" __device__ void* NRT_Allocate(size_t size)
{
  void* const block = malloc(size);
  const bool track = TheMSys && TheMSys->stats.enabled;
  if (track) {
    TheMSys->stats.alloc.fetch_add(1, cuda::memory_order_relaxed);
  }
  return block;
}
29
+
30
// Fills in an already-allocated MemInfo: the reference count starts at 1 and
// the data pointer, size, destructor and destructor info are stored as
// given. When statistics are enabled the mi_alloc counter is incremented.
// `mi` is dereferenced unconditionally, so it must not be NULL.
extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
                                            void* data,
                                            size_t size,
                                            NRT_dtor_function dtor,
                                            void* dtor_info)
{
  mi->refct     = 1;  /* starts with 1 refct */
  mi->dtor      = dtor;
  mi->dtor_info = dtor_info;
  mi->data      = data;
  mi->size      = size;

  const bool track = TheMSys && TheMSys->stats.enabled;
  if (track) {
    TheMSys->stats.mi_alloc.fetch_add(1, cuda::memory_order_relaxed);
  }
}
44
+
45
// Allocates a MemInfo with NRT_Allocate and initialises it to describe
// `data`. Returns NULL, without touching `data`, when the allocation fails.
extern "C"
__device__ NRT_MemInfo* NRT_MemInfo_new(
  void* data, size_t size, NRT_dtor_function dtor, void* dtor_info)
{
  NRT_MemInfo* const fresh = (NRT_MemInfo*)NRT_Allocate(sizeof(NRT_MemInfo));
  if (fresh == NULL) {
    return fresh;
  }
  NRT_MemInfo_init(fresh, data, size, dtor, dtor_info);
  return fresh;
}
53
+
54
// Releases `ptr` with device-side free. When statistics are enabled the
// free counter is incremented afterwards.
extern "C" __device__ void NRT_Free(void* ptr)
{
  free(ptr);

  const bool track = TheMSys && TheMSys->stats.enabled;
  if (track) {
    TheMSys->stats.free.fetch_add(1, cuda::memory_order_relaxed);
  }
}
60
+
61
// Frees the MemInfo object itself by handing it to NRT_Free.
extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
{
  void* const raw = mi;
  NRT_Free(raw);
}
65
+
66
// Releases a MemInfo through NRT_dealloc. When statistics are enabled the
// mi_free counter is incremented afterwards.
extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi)
{
  NRT_dealloc(mi);

  const bool track = TheMSys && TheMSys->stats.enabled;
  if (track) {
    TheMSys->stats.mi_free.fetch_add(1, cuda::memory_order_relaxed);
  }
}
72
+
73
// Runs the MemInfo's destructor (if any) and then releases the MemInfo.
//
// The destructor receives the data pointer, the size and the dtor_info value
// stored in the MemInfo. Passing mi->dtor_info (rather than NULL) is what
// lets nrt_varsize_dtor see the element destructor installed by
// NRT_MemInfo_new_varsize_dtor.
// `mi` is dereferenced unconditionally, so it must not be NULL.
extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi)
{
  if (mi->dtor) { /* We have a destructor */
    mi->dtor(mi->data, mi->size, mi->dtor_info);
  }
  /* Clear and release MemInfo */
  NRT_MemInfo_destroy(mi);
}
80
+
81
// Destructor used for variable-size allocations. When `info` is non-null it
// is treated as a pointer to a one-argument element destructor and called
// with `ptr`; the buffer is then released with NRT_Free. `size` is unused.
static void __device__
nrt_varsize_dtor(void *ptr, size_t size, void *info) {
  typedef void dtor_fn_t(void *ptr);

  if (info != NULL) {
    /* call element dtor */
    dtor_fn_t *element_dtor = (dtor_fn_t *)info;
    element_dtor(ptr);
  }
  NRT_Free(ptr);
}
91
+
92
// Allocates a `size`-byte data buffer plus a separate MemInfo that owns it,
// with nrt_varsize_dtor as destructor and no destructor info.
//
// Returns NULL when either allocation fails. If the buffer was obtained but
// the MemInfo could not be allocated, the buffer is released before
// returning so that it does not leak.
extern "C"
__device__ NRT_MemInfo* NRT_MemInfo_new_varsize(size_t size)
{
  void *data = NRT_Allocate(size);
  if (data == NULL) {
    return NULL; /* return early as allocation failed */
  }

  NRT_MemInfo *mi = NRT_MemInfo_new(data, size, nrt_varsize_dtor, NULL);
  if (mi == NULL) {
    /* no MemInfo took ownership of the buffer: give it back */
    NRT_Free(data);
  }

  return mi;
}
104
+
105
// Same as NRT_MemInfo_new_varsize, but additionally stores `dtor` in the
// MemInfo's dtor_info slot. Returns NULL when the allocation fails.
extern "C"
__device__ NRT_MemInfo* NRT_MemInfo_new_varsize_dtor(size_t size, NRT_dtor_function dtor) {
  NRT_MemInfo *const mi = NRT_MemInfo_new_varsize(size);
  if (mi == NULL) {
    return mi;
  }
  mi->dtor_info = (void*)dtor;
  return mi;
}
113
+
114
// Returns the data pointer stored in the MemInfo. `mi` is dereferenced
// without a null check.
extern "C" __device__ void* NRT_MemInfo_data_fast(NRT_MemInfo *mi)
{
  void* const payload = mi->data;
  return payload;
}
118
+
119
// Allocates a MemInfo together with a data region of `size` bytes aligned to
// `align`, using nrt_allocate_meminfo_and_data_align. The MemInfo is
// initialised with no destructor and no destructor info.
// Returns NULL when the allocation fails.
extern "C" __device__ NRT_MemInfo *NRT_MemInfo_alloc_aligned(size_t size, unsigned align) {
  NRT_MemInfo *header = NULL;
  void *const payload = nrt_allocate_meminfo_and_data_align(size, align, &header);
  if (payload != NULL) {
    NRT_MemInfo_init(header, payload, size, NULL, NULL);
    return header;
  }
  return NULL; /* allocation failed */
}
129
+
130
// Allocates a MemInfo followed by a data region and returns a pointer into
// that region whose address is a multiple of `align`.
//
// The request is over-allocated by 2 * align bytes so that the returned
// pointer can be moved forward to the next aligned address. *mi receives the
// MemInfo pointer (set by nrt_allocate_meminfo_and_data). Returns NULL when
// the allocation fails.
//
// An alignment of 0 or 1 imposes no constraint and returns the start of the
// data region unchanged. Previously align == 0 went through the power-of-two
// path and produced a wrapped offset, i.e. a pointer outside the allocation.
static
__device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align,
                                                     NRT_MemInfo **mi)
{
  /* Widen once so that 2 * align and the mask arithmetic are done in size_t. */
  const size_t alignment = align;

  char *base = (char *)nrt_allocate_meminfo_and_data(size + 2 * alignment, mi);
  if (base == NULL) {
    return NULL; /* return early as allocation failed */
  }
  if (alignment <= 1) {
    return (void*)base; /* nothing to align */
  }

  const size_t address = (size_t) base;
  /* For a power-of-two alignment the modulo can be replaced by a mask. */
  const bool is_pow2 = (alignment & (alignment - 1)) == 0;
  const size_t remainder = is_pow2 ? (address & (alignment - 1))
                                   : (address % alignment);
  /* Already aligned: no shift. Otherwise move forward to the next multiple. */
  const size_t offset = (remainder == 0) ? 0 : (alignment - remainder);

  return (void*)(base + offset);
}
160
+
161
// Performs one NRT_Allocate_External call large enough for a MemInfo
// followed by `size` bytes of data. On success *mi_out points at the start
// of the allocation and the returned pointer addresses the bytes right after
// the MemInfo. On failure *mi_out is set to NULL and NULL is returned.
static
__device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out) {
  char *const block = (char *)NRT_Allocate_External(sizeof(NRT_MemInfo) + size);
  if (block != NULL) {
    *mi_out = (NRT_MemInfo *) block;
    return (void*)(block + sizeof(NRT_MemInfo));
  }
  *mi_out = NULL; /* set meminfo to NULL as allocation failed */
  return NULL;
}
174
+
175
// Allocates `size` bytes with device-side malloc and returns the pointer
// (NULL when malloc fails).
// When statistics are enabled the alloc counter is incremented after the
// malloc call, whether or not the allocation succeeded.
extern "C" __device__ void* NRT_Allocate_External(size_t size) {
  void *const block = malloc(size);

  const bool track = TheMSys && TheMSys->stats.enabled;
  if (track)
  {
    TheMSys->stats.alloc.fetch_add(1, cuda::memory_order_relaxed);
  }
  return block;
}
186
+
187
+
188
/*
  c++ version of the NRT_decref function that usually is added to
  the final kernel link in PTX form by numba. This version may be
  used by c++ APIs that accept ownership of live objects and must
  manage them going forward.

  Drops one reference from `mi` (a NULL `mi` is ignored) and, when that was
  the last reference, runs the destructor and releases the MemInfo.

  The zero test uses the value produced by the decrement itself instead of
  re-reading mi->refct afterwards. A second read could observe a count that
  another thread has changed in the meantime, or touch a MemInfo that another
  thread has already destroyed.
  NOTE(review): this is only race-free if refct is an atomic type; its
  declaration is not visible here - confirm against nrt.cuh.
*/
extern "C" __device__ void NRT_decref(NRT_MemInfo* mi)
{
  if (mi == NULL) {
    return;
  }
  if (--mi->refct == 0) {
    NRT_MemInfo_call_dtor(mi);
  }
}
201
+
202
+
203
+
204
+
205
// Adds one reference to `mi`. A NULL `mi` is ignored.
extern "C" __device__ void NRT_incref(NRT_MemInfo* mi)
{
  if (mi == NULL) {
    return;
  }
  mi->refct++;
}
211
+
212
+ #endif