numba-cuda 0.21.1__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (488) hide show
  1. _numba_cuda_redirector.pth +4 -0
  2. _numba_cuda_redirector.py +89 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +6 -0
  5. numba_cuda/_version.py +11 -0
  6. numba_cuda/numba/cuda/__init__.py +70 -0
  7. numba_cuda/numba/cuda/_internal/cuda_bf16.py +16394 -0
  8. numba_cuda/numba/cuda/_internal/cuda_fp16.py +8112 -0
  9. numba_cuda/numba/cuda/api.py +577 -0
  10. numba_cuda/numba/cuda/api_util.py +76 -0
  11. numba_cuda/numba/cuda/args.py +72 -0
  12. numba_cuda/numba/cuda/bf16.py +397 -0
  13. numba_cuda/numba/cuda/cache_hints.py +287 -0
  14. numba_cuda/numba/cuda/cext/__init__.py +2 -0
  15. numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
  16. numba_cuda/numba/cuda/cext/_devicearray.cpp +159 -0
  17. numba_cuda/numba/cuda/cext/_devicearray.h +29 -0
  18. numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
  19. numba_cuda/numba/cuda/cext/_dispatcher.cpp +1098 -0
  20. numba_cuda/numba/cuda/cext/_hashtable.cpp +532 -0
  21. numba_cuda/numba/cuda/cext/_hashtable.h +135 -0
  22. numba_cuda/numba/cuda/cext/_helperlib.c +71 -0
  23. numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
  24. numba_cuda/numba/cuda/cext/_helpermod.c +82 -0
  25. numba_cuda/numba/cuda/cext/_pymodule.h +38 -0
  26. numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
  27. numba_cuda/numba/cuda/cext/_typeconv.cpp +206 -0
  28. numba_cuda/numba/cuda/cext/_typeof.cpp +1159 -0
  29. numba_cuda/numba/cuda/cext/_typeof.h +19 -0
  30. numba_cuda/numba/cuda/cext/capsulethunk.h +111 -0
  31. numba_cuda/numba/cuda/cext/mviewbuf.c +385 -0
  32. numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
  33. numba_cuda/numba/cuda/cext/typeconv.cpp +212 -0
  34. numba_cuda/numba/cuda/cext/typeconv.hpp +101 -0
  35. numba_cuda/numba/cuda/cg.py +67 -0
  36. numba_cuda/numba/cuda/cgutils.py +1294 -0
  37. numba_cuda/numba/cuda/cloudpickle/__init__.py +21 -0
  38. numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +1598 -0
  39. numba_cuda/numba/cuda/cloudpickle/cloudpickle_fast.py +17 -0
  40. numba_cuda/numba/cuda/codegen.py +541 -0
  41. numba_cuda/numba/cuda/compiler.py +1396 -0
  42. numba_cuda/numba/cuda/core/analysis.py +758 -0
  43. numba_cuda/numba/cuda/core/annotations/__init__.py +0 -0
  44. numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +288 -0
  45. numba_cuda/numba/cuda/core/annotations/type_annotations.py +305 -0
  46. numba_cuda/numba/cuda/core/base.py +1332 -0
  47. numba_cuda/numba/cuda/core/boxing.py +1411 -0
  48. numba_cuda/numba/cuda/core/bytecode.py +728 -0
  49. numba_cuda/numba/cuda/core/byteflow.py +2346 -0
  50. numba_cuda/numba/cuda/core/caching.py +744 -0
  51. numba_cuda/numba/cuda/core/callconv.py +392 -0
  52. numba_cuda/numba/cuda/core/codegen.py +171 -0
  53. numba_cuda/numba/cuda/core/compiler.py +199 -0
  54. numba_cuda/numba/cuda/core/compiler_lock.py +85 -0
  55. numba_cuda/numba/cuda/core/compiler_machinery.py +497 -0
  56. numba_cuda/numba/cuda/core/config.py +650 -0
  57. numba_cuda/numba/cuda/core/consts.py +124 -0
  58. numba_cuda/numba/cuda/core/controlflow.py +989 -0
  59. numba_cuda/numba/cuda/core/entrypoints.py +57 -0
  60. numba_cuda/numba/cuda/core/environment.py +66 -0
  61. numba_cuda/numba/cuda/core/errors.py +917 -0
  62. numba_cuda/numba/cuda/core/event.py +511 -0
  63. numba_cuda/numba/cuda/core/funcdesc.py +330 -0
  64. numba_cuda/numba/cuda/core/generators.py +387 -0
  65. numba_cuda/numba/cuda/core/imputils.py +509 -0
  66. numba_cuda/numba/cuda/core/inline_closurecall.py +1787 -0
  67. numba_cuda/numba/cuda/core/interpreter.py +3617 -0
  68. numba_cuda/numba/cuda/core/ir.py +1812 -0
  69. numba_cuda/numba/cuda/core/ir_utils.py +2638 -0
  70. numba_cuda/numba/cuda/core/optional.py +129 -0
  71. numba_cuda/numba/cuda/core/options.py +262 -0
  72. numba_cuda/numba/cuda/core/postproc.py +249 -0
  73. numba_cuda/numba/cuda/core/pythonapi.py +1859 -0
  74. numba_cuda/numba/cuda/core/registry.py +46 -0
  75. numba_cuda/numba/cuda/core/removerefctpass.py +123 -0
  76. numba_cuda/numba/cuda/core/rewrites/__init__.py +26 -0
  77. numba_cuda/numba/cuda/core/rewrites/ir_print.py +91 -0
  78. numba_cuda/numba/cuda/core/rewrites/registry.py +104 -0
  79. numba_cuda/numba/cuda/core/rewrites/static_binop.py +41 -0
  80. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +189 -0
  81. numba_cuda/numba/cuda/core/rewrites/static_raise.py +100 -0
  82. numba_cuda/numba/cuda/core/sigutils.py +68 -0
  83. numba_cuda/numba/cuda/core/ssa.py +498 -0
  84. numba_cuda/numba/cuda/core/targetconfig.py +330 -0
  85. numba_cuda/numba/cuda/core/tracing.py +231 -0
  86. numba_cuda/numba/cuda/core/transforms.py +956 -0
  87. numba_cuda/numba/cuda/core/typed_passes.py +867 -0
  88. numba_cuda/numba/cuda/core/typeinfer.py +1950 -0
  89. numba_cuda/numba/cuda/core/unsafe/__init__.py +0 -0
  90. numba_cuda/numba/cuda/core/unsafe/bytes.py +67 -0
  91. numba_cuda/numba/cuda/core/unsafe/eh.py +67 -0
  92. numba_cuda/numba/cuda/core/unsafe/refcount.py +98 -0
  93. numba_cuda/numba/cuda/core/untyped_passes.py +1979 -0
  94. numba_cuda/numba/cuda/cpython/builtins.py +1153 -0
  95. numba_cuda/numba/cuda/cpython/charseq.py +1218 -0
  96. numba_cuda/numba/cuda/cpython/cmathimpl.py +560 -0
  97. numba_cuda/numba/cuda/cpython/enumimpl.py +103 -0
  98. numba_cuda/numba/cuda/cpython/iterators.py +167 -0
  99. numba_cuda/numba/cuda/cpython/listobj.py +1326 -0
  100. numba_cuda/numba/cuda/cpython/mathimpl.py +499 -0
  101. numba_cuda/numba/cuda/cpython/numbers.py +1475 -0
  102. numba_cuda/numba/cuda/cpython/rangeobj.py +289 -0
  103. numba_cuda/numba/cuda/cpython/slicing.py +322 -0
  104. numba_cuda/numba/cuda/cpython/tupleobj.py +456 -0
  105. numba_cuda/numba/cuda/cpython/unicode.py +2865 -0
  106. numba_cuda/numba/cuda/cpython/unicode_support.py +1597 -0
  107. numba_cuda/numba/cuda/cpython/unsafe/__init__.py +0 -0
  108. numba_cuda/numba/cuda/cpython/unsafe/numbers.py +64 -0
  109. numba_cuda/numba/cuda/cpython/unsafe/tuple.py +92 -0
  110. numba_cuda/numba/cuda/cuda_paths.py +691 -0
  111. numba_cuda/numba/cuda/cudadecl.py +556 -0
  112. numba_cuda/numba/cuda/cudadrv/__init__.py +14 -0
  113. numba_cuda/numba/cuda/cudadrv/devicearray.py +951 -0
  114. numba_cuda/numba/cuda/cudadrv/devices.py +249 -0
  115. numba_cuda/numba/cuda/cudadrv/driver.py +3222 -0
  116. numba_cuda/numba/cuda/cudadrv/drvapi.py +435 -0
  117. numba_cuda/numba/cuda/cudadrv/dummyarray.py +558 -0
  118. numba_cuda/numba/cuda/cudadrv/enums.py +613 -0
  119. numba_cuda/numba/cuda/cudadrv/error.py +48 -0
  120. numba_cuda/numba/cuda/cudadrv/libs.py +220 -0
  121. numba_cuda/numba/cuda/cudadrv/linkable_code.py +184 -0
  122. numba_cuda/numba/cuda/cudadrv/mappings.py +14 -0
  123. numba_cuda/numba/cuda/cudadrv/ndarray.py +26 -0
  124. numba_cuda/numba/cuda/cudadrv/nvrtc.py +193 -0
  125. numba_cuda/numba/cuda/cudadrv/nvvm.py +756 -0
  126. numba_cuda/numba/cuda/cudadrv/rtapi.py +13 -0
  127. numba_cuda/numba/cuda/cudadrv/runtime.py +34 -0
  128. numba_cuda/numba/cuda/cudaimpl.py +995 -0
  129. numba_cuda/numba/cuda/cudamath.py +149 -0
  130. numba_cuda/numba/cuda/datamodel/__init__.py +7 -0
  131. numba_cuda/numba/cuda/datamodel/cuda_manager.py +66 -0
  132. numba_cuda/numba/cuda/datamodel/cuda_models.py +1446 -0
  133. numba_cuda/numba/cuda/datamodel/cuda_packer.py +224 -0
  134. numba_cuda/numba/cuda/datamodel/cuda_registry.py +22 -0
  135. numba_cuda/numba/cuda/datamodel/cuda_testing.py +153 -0
  136. numba_cuda/numba/cuda/datamodel/manager.py +11 -0
  137. numba_cuda/numba/cuda/datamodel/models.py +9 -0
  138. numba_cuda/numba/cuda/datamodel/packer.py +9 -0
  139. numba_cuda/numba/cuda/datamodel/registry.py +11 -0
  140. numba_cuda/numba/cuda/datamodel/testing.py +11 -0
  141. numba_cuda/numba/cuda/debuginfo.py +903 -0
  142. numba_cuda/numba/cuda/decorators.py +294 -0
  143. numba_cuda/numba/cuda/descriptor.py +35 -0
  144. numba_cuda/numba/cuda/device_init.py +158 -0
  145. numba_cuda/numba/cuda/deviceufunc.py +1021 -0
  146. numba_cuda/numba/cuda/dispatcher.py +2463 -0
  147. numba_cuda/numba/cuda/errors.py +72 -0
  148. numba_cuda/numba/cuda/extending.py +697 -0
  149. numba_cuda/numba/cuda/flags.py +178 -0
  150. numba_cuda/numba/cuda/fp16.py +357 -0
  151. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  152. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  153. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  154. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  155. numba_cuda/numba/cuda/include/13/cuda_bf16.h +5118 -0
  156. numba_cuda/numba/cuda/include/13/cuda_bf16.hpp +3865 -0
  157. numba_cuda/numba/cuda/include/13/cuda_fp16.h +5363 -0
  158. numba_cuda/numba/cuda/include/13/cuda_fp16.hpp +3483 -0
  159. numba_cuda/numba/cuda/initialize.py +24 -0
  160. numba_cuda/numba/cuda/intrinsic_wrapper.py +41 -0
  161. numba_cuda/numba/cuda/intrinsics.py +382 -0
  162. numba_cuda/numba/cuda/itanium_mangler.py +214 -0
  163. numba_cuda/numba/cuda/kernels/__init__.py +2 -0
  164. numba_cuda/numba/cuda/kernels/reduction.py +265 -0
  165. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  166. numba_cuda/numba/cuda/libdevice.py +3386 -0
  167. numba_cuda/numba/cuda/libdevicedecl.py +20 -0
  168. numba_cuda/numba/cuda/libdevicefuncs.py +1060 -0
  169. numba_cuda/numba/cuda/libdeviceimpl.py +88 -0
  170. numba_cuda/numba/cuda/locks.py +19 -0
  171. numba_cuda/numba/cuda/lowering.py +1951 -0
  172. numba_cuda/numba/cuda/mathimpl.py +374 -0
  173. numba_cuda/numba/cuda/memory_management/__init__.py +4 -0
  174. numba_cuda/numba/cuda/memory_management/memsys.cu +99 -0
  175. numba_cuda/numba/cuda/memory_management/memsys.cuh +22 -0
  176. numba_cuda/numba/cuda/memory_management/nrt.cu +212 -0
  177. numba_cuda/numba/cuda/memory_management/nrt.cuh +48 -0
  178. numba_cuda/numba/cuda/memory_management/nrt.py +390 -0
  179. numba_cuda/numba/cuda/memory_management/nrt_context.py +438 -0
  180. numba_cuda/numba/cuda/misc/appdirs.py +594 -0
  181. numba_cuda/numba/cuda/misc/cffiimpl.py +24 -0
  182. numba_cuda/numba/cuda/misc/coverage_support.py +43 -0
  183. numba_cuda/numba/cuda/misc/dump_style.py +41 -0
  184. numba_cuda/numba/cuda/misc/findlib.py +75 -0
  185. numba_cuda/numba/cuda/misc/firstlinefinder.py +96 -0
  186. numba_cuda/numba/cuda/misc/gdb_hook.py +240 -0
  187. numba_cuda/numba/cuda/misc/literal.py +28 -0
  188. numba_cuda/numba/cuda/misc/llvm_pass_timings.py +412 -0
  189. numba_cuda/numba/cuda/misc/special.py +94 -0
  190. numba_cuda/numba/cuda/models.py +56 -0
  191. numba_cuda/numba/cuda/np/arraymath.py +5130 -0
  192. numba_cuda/numba/cuda/np/arrayobj.py +7635 -0
  193. numba_cuda/numba/cuda/np/extensions.py +11 -0
  194. numba_cuda/numba/cuda/np/linalg.py +3087 -0
  195. numba_cuda/numba/cuda/np/math/__init__.py +0 -0
  196. numba_cuda/numba/cuda/np/math/cmathimpl.py +558 -0
  197. numba_cuda/numba/cuda/np/math/mathimpl.py +487 -0
  198. numba_cuda/numba/cuda/np/math/numbers.py +1461 -0
  199. numba_cuda/numba/cuda/np/npdatetime.py +969 -0
  200. numba_cuda/numba/cuda/np/npdatetime_helpers.py +217 -0
  201. numba_cuda/numba/cuda/np/npyfuncs.py +1808 -0
  202. numba_cuda/numba/cuda/np/npyimpl.py +1027 -0
  203. numba_cuda/numba/cuda/np/numpy_support.py +798 -0
  204. numba_cuda/numba/cuda/np/polynomial/__init__.py +4 -0
  205. numba_cuda/numba/cuda/np/polynomial/polynomial_core.py +242 -0
  206. numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +380 -0
  207. numba_cuda/numba/cuda/np/ufunc/__init__.py +4 -0
  208. numba_cuda/numba/cuda/np/ufunc/decorators.py +203 -0
  209. numba_cuda/numba/cuda/np/ufunc/sigparse.py +68 -0
  210. numba_cuda/numba/cuda/np/ufunc/ufuncbuilder.py +65 -0
  211. numba_cuda/numba/cuda/np/ufunc_db.py +1282 -0
  212. numba_cuda/numba/cuda/np/unsafe/__init__.py +0 -0
  213. numba_cuda/numba/cuda/np/unsafe/ndarray.py +84 -0
  214. numba_cuda/numba/cuda/nvvmutils.py +254 -0
  215. numba_cuda/numba/cuda/printimpl.py +126 -0
  216. numba_cuda/numba/cuda/random.py +308 -0
  217. numba_cuda/numba/cuda/reshape_funcs.cu +156 -0
  218. numba_cuda/numba/cuda/serialize.py +267 -0
  219. numba_cuda/numba/cuda/simulator/__init__.py +63 -0
  220. numba_cuda/numba/cuda/simulator/_internal/__init__.py +4 -0
  221. numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +2 -0
  222. numba_cuda/numba/cuda/simulator/api.py +179 -0
  223. numba_cuda/numba/cuda/simulator/bf16.py +4 -0
  224. numba_cuda/numba/cuda/simulator/compiler.py +38 -0
  225. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +11 -0
  226. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +462 -0
  227. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +122 -0
  228. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +66 -0
  229. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +7 -0
  230. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +7 -0
  231. numba_cuda/numba/cuda/simulator/cudadrv/error.py +10 -0
  232. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +10 -0
  233. numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +61 -0
  234. numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +11 -0
  235. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +32 -0
  236. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +22 -0
  237. numba_cuda/numba/cuda/simulator/dispatcher.py +11 -0
  238. numba_cuda/numba/cuda/simulator/kernel.py +320 -0
  239. numba_cuda/numba/cuda/simulator/kernelapi.py +509 -0
  240. numba_cuda/numba/cuda/simulator/memory_management/__init__.py +4 -0
  241. numba_cuda/numba/cuda/simulator/memory_management/nrt.py +21 -0
  242. numba_cuda/numba/cuda/simulator/reduction.py +19 -0
  243. numba_cuda/numba/cuda/simulator/tests/support.py +4 -0
  244. numba_cuda/numba/cuda/simulator/vector_types.py +65 -0
  245. numba_cuda/numba/cuda/simulator_init.py +18 -0
  246. numba_cuda/numba/cuda/stubs.py +635 -0
  247. numba_cuda/numba/cuda/target.py +505 -0
  248. numba_cuda/numba/cuda/testing.py +347 -0
  249. numba_cuda/numba/cuda/tests/__init__.py +62 -0
  250. numba_cuda/numba/cuda/tests/benchmarks/__init__.py +0 -0
  251. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +119 -0
  252. numba_cuda/numba/cuda/tests/cloudpickle_main_class.py +9 -0
  253. numba_cuda/numba/cuda/tests/core/serialize_usecases.py +113 -0
  254. numba_cuda/numba/cuda/tests/core/test_itanium_mangler.py +83 -0
  255. numba_cuda/numba/cuda/tests/core/test_serialize.py +371 -0
  256. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +9 -0
  257. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +147 -0
  258. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +161 -0
  259. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +397 -0
  260. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +24 -0
  261. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +180 -0
  262. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +313 -0
  263. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +187 -0
  264. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +621 -0
  265. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +247 -0
  266. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +100 -0
  267. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +198 -0
  268. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +53 -0
  269. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +72 -0
  270. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +138 -0
  271. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +43 -0
  272. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +15 -0
  273. numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py +58 -0
  274. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +348 -0
  275. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +128 -0
  276. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +301 -0
  277. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +174 -0
  278. numba_cuda/numba/cuda/tests/cudadrv/test_nvrtc.py +28 -0
  279. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +185 -0
  280. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +39 -0
  281. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +23 -0
  282. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +38 -0
  283. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +48 -0
  284. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +44 -0
  285. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +127 -0
  286. numba_cuda/numba/cuda/tests/cudapy/__init__.py +9 -0
  287. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +231 -0
  288. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +50 -0
  289. numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +36 -0
  290. numba_cuda/numba/cuda/tests/cudapy/complex_usecases.py +116 -0
  291. numba_cuda/numba/cuda/tests/cudapy/enum_usecases.py +59 -0
  292. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +62 -0
  293. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +28 -0
  294. numba_cuda/numba/cuda/tests/cudapy/overload_usecases.py +33 -0
  295. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +104 -0
  296. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +47 -0
  297. numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +1122 -0
  298. numba_cuda/numba/cuda/tests/cudapy/test_array.py +344 -0
  299. numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +268 -0
  300. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +203 -0
  301. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +63 -0
  302. numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +360 -0
  303. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1815 -0
  304. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +599 -0
  305. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +377 -0
  306. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +160 -0
  307. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +27 -0
  308. numba_cuda/numba/cuda/tests/cudapy/test_byteflow.py +98 -0
  309. numba_cuda/numba/cuda/tests/cudapy/test_cache_hints.py +210 -0
  310. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +683 -0
  311. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +265 -0
  312. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +42 -0
  313. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +718 -0
  314. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +370 -0
  315. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +23 -0
  316. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +142 -0
  317. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +178 -0
  318. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +193 -0
  319. numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +131 -0
  320. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +438 -0
  321. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +94 -0
  322. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +101 -0
  323. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +105 -0
  324. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +889 -0
  325. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo_types.py +476 -0
  326. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +500 -0
  327. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +820 -0
  328. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +152 -0
  329. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +111 -0
  330. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +170 -0
  331. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1088 -0
  332. numba_cuda/numba/cuda/tests/cudapy/test_extending_types.py +71 -0
  333. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +265 -0
  334. numba_cuda/numba/cuda/tests/cudapy/test_flow_control.py +1433 -0
  335. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +57 -0
  336. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +34 -0
  337. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +69 -0
  338. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +62 -0
  339. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +474 -0
  340. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +167 -0
  341. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +92 -0
  342. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +39 -0
  343. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +170 -0
  344. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +255 -0
  345. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1219 -0
  346. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +263 -0
  347. numba_cuda/numba/cuda/tests/cudapy/test_ir.py +598 -0
  348. numba_cuda/numba/cuda/tests/cudapy/test_ir_utils.py +276 -0
  349. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +101 -0
  350. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +68 -0
  351. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +123 -0
  352. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +194 -0
  353. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +220 -0
  354. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +173 -0
  355. numba_cuda/numba/cuda/tests/cudapy/test_make_function_to_jit_function.py +364 -0
  356. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +47 -0
  357. numba_cuda/numba/cuda/tests/cudapy/test_math.py +842 -0
  358. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +76 -0
  359. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +78 -0
  360. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +25 -0
  361. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +145 -0
  362. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +39 -0
  363. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +82 -0
  364. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +53 -0
  365. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +504 -0
  366. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +93 -0
  367. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +402 -0
  368. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +128 -0
  369. numba_cuda/numba/cuda/tests/cudapy/test_print.py +193 -0
  370. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +37 -0
  371. numba_cuda/numba/cuda/tests/cudapy/test_random.py +117 -0
  372. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +614 -0
  373. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +130 -0
  374. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +94 -0
  375. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  376. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +86 -0
  377. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +40 -0
  378. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +457 -0
  379. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +233 -0
  380. numba_cuda/numba/cuda/tests/cudapy/test_ssa.py +454 -0
  381. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +56 -0
  382. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +277 -0
  383. numba_cuda/numba/cuda/tests/cudapy/test_tracing.py +200 -0
  384. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +90 -0
  385. numba_cuda/numba/cuda/tests/cudapy/test_typeconv.py +333 -0
  386. numba_cuda/numba/cuda/tests/cudapy/test_typeinfer.py +538 -0
  387. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +585 -0
  388. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +42 -0
  389. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +485 -0
  390. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +312 -0
  391. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +23 -0
  392. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +183 -0
  393. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +40 -0
  394. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +40 -0
  395. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +206 -0
  396. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +331 -0
  397. numba_cuda/numba/cuda/tests/cudasim/__init__.py +9 -0
  398. numba_cuda/numba/cuda/tests/cudasim/support.py +9 -0
  399. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +111 -0
  400. numba_cuda/numba/cuda/tests/data/__init__.py +2 -0
  401. numba_cuda/numba/cuda/tests/data/cta_barrier.cu +28 -0
  402. numba_cuda/numba/cuda/tests/data/cuda_include.cu +10 -0
  403. numba_cuda/numba/cuda/tests/data/error.cu +12 -0
  404. numba_cuda/numba/cuda/tests/data/include/add.cuh +8 -0
  405. numba_cuda/numba/cuda/tests/data/jitlink.cu +28 -0
  406. numba_cuda/numba/cuda/tests/data/jitlink.ptx +49 -0
  407. numba_cuda/numba/cuda/tests/data/warn.cu +12 -0
  408. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +9 -0
  409. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +2 -0
  410. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +54 -0
  411. numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +8 -0
  412. numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +14 -0
  413. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +86 -0
  414. numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +68 -0
  415. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +81 -0
  416. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +141 -0
  417. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +160 -0
  418. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +180 -0
  419. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +119 -0
  420. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +66 -0
  421. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +80 -0
  422. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +206 -0
  423. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +53 -0
  424. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +76 -0
  425. numba_cuda/numba/cuda/tests/nocuda/__init__.py +9 -0
  426. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +391 -0
  427. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +48 -0
  428. numba_cuda/numba/cuda/tests/nocuda/test_import.py +63 -0
  429. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +252 -0
  430. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +59 -0
  431. numba_cuda/numba/cuda/tests/nrt/__init__.py +9 -0
  432. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +387 -0
  433. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +124 -0
  434. numba_cuda/numba/cuda/tests/support.py +900 -0
  435. numba_cuda/numba/cuda/typeconv/__init__.py +4 -0
  436. numba_cuda/numba/cuda/typeconv/castgraph.py +137 -0
  437. numba_cuda/numba/cuda/typeconv/rules.py +63 -0
  438. numba_cuda/numba/cuda/typeconv/typeconv.py +121 -0
  439. numba_cuda/numba/cuda/types/__init__.py +233 -0
  440. numba_cuda/numba/cuda/types/__init__.pyi +167 -0
  441. numba_cuda/numba/cuda/types/abstract.py +9 -0
  442. numba_cuda/numba/cuda/types/common.py +9 -0
  443. numba_cuda/numba/cuda/types/containers.py +9 -0
  444. numba_cuda/numba/cuda/types/cuda_abstract.py +533 -0
  445. numba_cuda/numba/cuda/types/cuda_common.py +110 -0
  446. numba_cuda/numba/cuda/types/cuda_containers.py +971 -0
  447. numba_cuda/numba/cuda/types/cuda_function_type.py +230 -0
  448. numba_cuda/numba/cuda/types/cuda_functions.py +798 -0
  449. numba_cuda/numba/cuda/types/cuda_iterators.py +120 -0
  450. numba_cuda/numba/cuda/types/cuda_misc.py +569 -0
  451. numba_cuda/numba/cuda/types/cuda_npytypes.py +690 -0
  452. numba_cuda/numba/cuda/types/cuda_scalars.py +280 -0
  453. numba_cuda/numba/cuda/types/ext_types.py +101 -0
  454. numba_cuda/numba/cuda/types/function_type.py +11 -0
  455. numba_cuda/numba/cuda/types/functions.py +9 -0
  456. numba_cuda/numba/cuda/types/iterators.py +9 -0
  457. numba_cuda/numba/cuda/types/misc.py +9 -0
  458. numba_cuda/numba/cuda/types/npytypes.py +9 -0
  459. numba_cuda/numba/cuda/types/scalars.py +9 -0
  460. numba_cuda/numba/cuda/typing/__init__.py +19 -0
  461. numba_cuda/numba/cuda/typing/arraydecl.py +939 -0
  462. numba_cuda/numba/cuda/typing/asnumbatype.py +130 -0
  463. numba_cuda/numba/cuda/typing/bufproto.py +70 -0
  464. numba_cuda/numba/cuda/typing/builtins.py +1209 -0
  465. numba_cuda/numba/cuda/typing/cffi_utils.py +219 -0
  466. numba_cuda/numba/cuda/typing/cmathdecl.py +47 -0
  467. numba_cuda/numba/cuda/typing/collections.py +138 -0
  468. numba_cuda/numba/cuda/typing/context.py +782 -0
  469. numba_cuda/numba/cuda/typing/ctypes_utils.py +125 -0
  470. numba_cuda/numba/cuda/typing/dictdecl.py +63 -0
  471. numba_cuda/numba/cuda/typing/enumdecl.py +74 -0
  472. numba_cuda/numba/cuda/typing/listdecl.py +147 -0
  473. numba_cuda/numba/cuda/typing/mathdecl.py +158 -0
  474. numba_cuda/numba/cuda/typing/npdatetime.py +322 -0
  475. numba_cuda/numba/cuda/typing/npydecl.py +749 -0
  476. numba_cuda/numba/cuda/typing/setdecl.py +115 -0
  477. numba_cuda/numba/cuda/typing/templates.py +1446 -0
  478. numba_cuda/numba/cuda/typing/typeof.py +301 -0
  479. numba_cuda/numba/cuda/ufuncs.py +746 -0
  480. numba_cuda/numba/cuda/utils.py +724 -0
  481. numba_cuda/numba/cuda/vector_types.py +214 -0
  482. numba_cuda/numba/cuda/vectorizers.py +260 -0
  483. numba_cuda-0.21.1.dist-info/METADATA +109 -0
  484. numba_cuda-0.21.1.dist-info/RECORD +488 -0
  485. numba_cuda-0.21.1.dist-info/WHEEL +5 -0
  486. numba_cuda-0.21.1.dist-info/licenses/LICENSE +26 -0
  487. numba_cuda-0.21.1.dist-info/licenses/LICENSE.numba +24 -0
  488. numba_cuda-0.21.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,17 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+ """Compatibility module.
5
+
6
+ It can be necessary to load files generated by previous versions of cloudpickle
7
+ that rely on symbols being defined under the `cloudpickle.cloudpickle_fast`
8
+ namespace.
9
+
10
+ See: tests/test_backward_compat.py
11
+ """
12
+
13
+ from . import cloudpickle
14
+
15
+
16
def __getattr__(name):
    """Look up ``name`` on the ``cloudpickle`` module and return it.

    This is the module-level attribute hook: attributes that are not
    defined in this compatibility module are resolved against
    ``cloudpickle`` instead. Any exception raised by that lookup
    propagates unchanged.
    """
    resolved = getattr(cloudpickle, name)
    return resolved
@@ -0,0 +1,541 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+ from llvmlite import ir
5
+
6
+ from numba.cuda.core import config
7
+ from numba.cuda import serialize
8
+ from .cudadrv import devices, driver, nvvm, runtime, nvrtc
9
+ from numba.cuda.core.codegen import Codegen, CodeLibrary
10
+ from numba.cuda.cudadrv.libs import get_cudalib
11
+ from numba.cuda.cudadrv.linkable_code import LinkableCode
12
+ from numba.cuda.memory_management.nrt import NRT_LIBRARY
13
+
14
+ import os
15
+ import subprocess
16
+ import tempfile
17
+
18
+ CUDA_TRIPLE = "nvptx64-nvidia-cuda"
19
+
20
+
21
def run_nvdisasm(cubin, flags):
    """Disassemble a cubin with the ``nvdisasm`` command-line tool.

    cubin:
        Object whose ``code`` attribute holds the cubin bytes.
    flags:
        Iterable of command-line flags placed before the input file name.

    Returns the tool's standard output decoded as UTF-8.

    Raises ``RuntimeError`` if the ``nvdisasm`` executable cannot be found,
    and ``subprocess.CalledProcessError`` if it exits with a non-zero status.
    """
    # nvdisasm only accepts input from a file, so we need to write out to a
    # temp file and clean up afterwards.
    fd, fname = tempfile.mkstemp()
    try:
        # Write through the descriptor mkstemp already opened rather than
        # opening the file a second time by name. The ``with`` block closes
        # the descriptor before nvdisasm runs, so no handle of ours is held
        # on the file while the external tool reads it.
        with os.fdopen(fd, "wb") as f:
            f.write(cubin.code)

        try:
            cp = subprocess.run(
                ["nvdisasm", *flags, fname],
                check=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
        except FileNotFoundError as e:
            msg = (
                "nvdisasm has not been found. You may need "
                "to install the CUDA toolkit and ensure that "
                "it is available on your PATH.\n"
            )
            raise RuntimeError(msg) from e
        return cp.stdout.decode("utf-8")
    finally:
        # The descriptor is closed by this point on every path, so the file
        # can be removed unconditionally.
        os.unlink(fname)
51
+
52
+
53
def disassemble_cubin(cubin):
    """Return the nvdisasm output for ``cubin``, requesting line info."""
    # "-gi" asks nvdisasm to include lineinfo in the disassembly.
    return run_nvdisasm(cubin, ["-gi"])
57
+
58
+
59
def disassemble_cubin_for_cfg(cubin):
    """Return the nvdisasm output for ``cubin`` as a control flow graph."""
    # "-cfg" asks nvdisasm for the control flow graph.
    return run_nvdisasm(cubin, ["-cfg"])
63
+
64
+
65
class ExternalCodeLibrary(CodeLibrary):
    """Holds code produced externally, for linking with generated code."""

    def __init__(self, codegen, name):
        """Create an empty library with no files and no callbacks."""
        super().__init__(codegen, name)
        # The set of files that will be handed to the linker.
        self._linking_files = set()
        # Module setup / teardown callbacks, kept in the order in which they
        # were registered with this code library.
        self._setup_functions = []
        self._teardown_functions = []

        self.use_cooperative = False

    @property
    def modules(self):
        """An empty set: external code carries no LLVM IR modules."""
        return set()

    def add_linking_file(self, path_or_obj):
        """Register a file (or ``LinkableCode`` object) to be linked.

        For ``LinkableCode`` objects, any setup and teardown callbacks are
        recorded as well. Raises if the library is already finalized.
        """
        # Once finalized, the file list may already have been copied into
        # another code library; anything added afterwards would silently be
        # left out of that library's link step, so refuse it.
        self._raise_if_finalized()

        if isinstance(path_or_obj, LinkableCode):
            setup = path_or_obj.setup_callback
            if setup:
                self._setup_functions.append(setup)
            teardown = path_or_obj.teardown_callback
            if teardown:
                self._teardown_functions.append(teardown)

        self._linking_files.add(path_or_obj)

    def add_ir_module(self, module):
        """Unsupported: always raises ``NotImplementedError``."""
        raise NotImplementedError("Cannot add LLVM IR to external code")

    def add_linking_library(self, library):
        """Unsupported: always raises ``NotImplementedError``."""
        raise NotImplementedError("Cannot add libraries to external code")

    def finalize(self):
        """Mark the library finalized; raises if that already happened."""
        self._raise_if_finalized()
        self._finalized = True

    def get_asm_str(self):
        """Unsupported: always raises ``NotImplementedError``."""
        raise NotImplementedError("No assembly for external code")

    def get_llvm_str(self):
        """Unsupported: always raises ``NotImplementedError``."""
        raise NotImplementedError("No LLVM IR for external code")

    def get_function(self, name):
        """Unsupported: always raises ``NotImplementedError``."""
        raise NotImplementedError("Cannot get function from external code")
116
+
117
+
118
class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
    """
    The CUDACodeLibrary generates PTX, SASS, cubins for multiple different
    compute capabilities. It also loads cubins to multiple devices (via
    get_cufunc), which may be of different compute capabilities.

    PTX, LTO-IR, cubins and linker info are cached per compute capability;
    loaded functions are cached per device ID.
    """

    def __init__(
        self,
        codegen,
        name,
        entry_name=None,
        max_registers=None,
        lto=False,
        nvvm_options=None,
    ):
        """
        codegen:
            Codegen object.
        name:
            Name of the function in the source.
        entry_name:
            Name of the kernel function in the binary, if this is a global
            kernel and not a device function.
        max_registers:
            The maximum register usage to aim for when linking.
        lto:
            Whether to enable link-time optimization.
        nvvm_options:
            Dict of options to pass to NVVM.
        """
        super().__init__(codegen, name)

        # The llvmlite module for this library.
        self._module = None
        # CodeLibrary objects that will be "linked" into this library. The
        # modules within them are compiled from NVVM IR to PTX along with the
        # IR from this module - in that sense they are "linked" by NVVM at PTX
        # generation time, rather than at link time.
        self._linking_libraries = set()
        # Files to link with the generated PTX. These are linked using the
        # Driver API at link time.
        self._linking_files = set()
        # List of setup functions to the loaded module
        # the order is determined by the order they are added to the codelib.
        self._setup_functions = []
        # List of teardown functions to the loaded module
        # the order is determined by the order they are added to the codelib.
        self._teardown_functions = []
        # Should we link libcudadevrt?
        self.needs_cudadevrt = False

        # Cache the LLVM IR string
        self._llvm_strs = None
        # Maps CC -> PTX string
        self._ptx_cache = {}
        # Maps CC -> LTO-IR
        self._ltoir_cache = {}
        # Maps CC -> cubin
        self._cubin_cache = {}
        # Maps CC -> linker info output for cubin
        self._linkerinfo_cache = {}
        # Maps Device numeric ID -> cufunc
        self._cufunc_cache = {}

        self._max_registers = max_registers
        self._lto = lto
        if nvvm_options is None:
            nvvm_options = {}
        self._nvvm_options = nvvm_options
        self._entry_name = entry_name

        self.use_cooperative = False

    @property
    def llvm_strs(self):
        """
        List with the LLVM IR text of every module in ``self.modules``.

        Computed on first access and cached afterwards.
        """
        if self._llvm_strs is None:
            self._llvm_strs = [str(mod) for mod in self.modules]
        return self._llvm_strs

    def get_llvm_str(self):
        """Return the LLVM IR of all modules, separated by blank lines."""
        return "\n\n".join(self.llvm_strs)

    def _ensure_cc(self, cc):
        """
        Return ``cc`` unchanged unless it is None, in which case return the
        compute capability of the device in the current context.
        """
        if cc is not None:
            return cc

        device = devices.get_context().device
        return device.compute_capability

    def get_asm_str(self, cc=None):
        """
        Return the PTX generated by NVVM from this library's LLVM IR.

        cc:
            Compute capability to compile for. Defaults to that of the device
            in the current context.

        The result is cached per compute capability.
        """
        cc = self._ensure_cc(cc)

        # Explicit None test, matching the cache check in get_ltoir.
        ptx = self._ptx_cache.get(cc)
        if ptx is not None:
            return ptx

        arch = nvrtc.get_arch_option(*cc)
        options = self._nvvm_options.copy()
        options["arch"] = arch

        irs = self.llvm_strs

        ptx = nvvm.compile_ir(irs, **options)

        # Sometimes the result from NVVM contains trailing whitespace and
        # nulls, which we strip so that the assembly dump looks a little
        # tidier.
        ptx = ptx.decode().strip("\x00").strip()

        if config.DUMP_ASSEMBLY:
            print(f"ASSEMBLY {self._name}".center(80, "-"))
            print(ptx)
            print("=" * 80)

        self._ptx_cache[cc] = ptx

        return ptx

    def get_lto_ptx(self, cc=None):
        """
        Get the PTX code after LTO.

        Raises RuntimeError if LTO is not enabled for this library or if
        nvJitLink is unavailable. The result is not cached.
        """

        if not self._lto:
            raise RuntimeError("LTO is not enabled")

        if not driver._have_nvjitlink():
            raise RuntimeError("Link time optimization requires nvJitLink.")

        cc = self._ensure_cc(cc)

        linker = driver._Linker.new(
            max_registers=self._max_registers,
            cc=cc,
            additional_flags=["-ptx"],
            lto=self._lto,
        )

        self._link_all(linker, cc, ignore_nonlto=True)

        ptx = linker.get_linked_ptx()
        ptx = ptx.decode("utf-8")

        return ptx

    def get_ltoir(self, cc=None):
        """
        Return the LTO-IR generated by NVVM from this library's LLVM IR.

        The result is cached per compute capability.
        """
        cc = self._ensure_cc(cc)

        ltoir = self._ltoir_cache.get(cc, None)
        if ltoir is not None:
            return ltoir

        arch = nvrtc.get_arch_option(*cc)
        options = self._nvvm_options.copy()
        options["arch"] = arch
        options["gen-lto"] = None

        irs = self.llvm_strs
        ltoir = nvvm.compile_ir(irs, **options)
        self._ltoir_cache[cc] = ltoir

        return ltoir

    def _link_all(self, linker, cc, ignore_nonlto=False):
        """
        Feed this library's code and all of its linking files to ``linker``.

        LTO-IR is added when the linker has LTO enabled, PTX otherwise.
        ``ignore_nonlto`` is passed through to ``add_file_guess_ext`` for
        every file.
        """
        if linker.lto:
            ltoir = self.get_ltoir(cc=cc)
            linker.add_ltoir(ltoir)
        else:
            ptx = self.get_asm_str(cc=cc)
            linker.add_ptx(ptx.encode())

        for path in self._linking_files:
            linker.add_file_guess_ext(path, ignore_nonlto)
        if self.needs_cudadevrt:
            linker.add_file_guess_ext(
                get_cudalib("cudadevrt", static=True), ignore_nonlto
            )

    def get_cubin(self, cc=None):
        """
        Return the linked cubin for compute capability ``cc``.

        The cubin and the linker's info log are cached per compute
        capability. When LTO is enabled and ``config.DUMP_ASSEMBLY`` is set,
        the post-LTO PTX is printed before linking.
        """
        cc = self._ensure_cc(cc)

        cubin = self._cubin_cache.get(cc)
        if cubin is not None:
            return cubin

        if self._lto and config.DUMP_ASSEMBLY:
            ptx = self.get_lto_ptx(cc=cc)

            print(f"ASSEMBLY (AFTER LTO) {self._name}".center(80, "-"))
            print(ptx)
            print("=" * 80)

        linker = driver._Linker.new(
            max_registers=self._max_registers, cc=cc, lto=self._lto
        )
        self._link_all(linker, cc, ignore_nonlto=False)
        cubin = linker.complete()

        self._cubin_cache[cc] = cubin
        self._linkerinfo_cache[cc] = linker.info_log

        return cubin

    def get_cufunc(self):
        """
        Return the loaded kernel function for the current context's device.

        Raises RuntimeError when the library has no ``entry_name``. The
        function is cached per device ID.
        """
        if self._entry_name is None:
            msg = (
                "Missing entry_name - are you trying to get the cufunc "
                "for a device function?"
            )
            raise RuntimeError(msg)

        ctx = devices.get_context()
        device = ctx.device

        cufunc = self._cufunc_cache.get(device.id)
        if cufunc is not None:
            return cufunc
        cubin = self.get_cubin(cc=device.compute_capability)
        module = ctx.create_module_image(
            cubin, self._setup_functions, self._teardown_functions
        )

        # Load
        cufunc = module.get_function(self._entry_name)

        # Populate caches
        self._cufunc_cache[device.id] = cufunc

        return cufunc

    def get_linkerinfo(self, cc):
        """
        Return the linker info log recorded when the cubin for ``cc`` was
        built. Raises KeyError if no cubin has been linked for ``cc``.
        """
        try:
            return self._linkerinfo_cache[cc]
        except KeyError:
            # The original lookup failure adds nothing to the message below,
            # so suppress the implicit exception context.
            raise KeyError(f"No linkerinfo for CC {cc}") from None

    def get_sass(self, cc=None):
        """Return the SASS disassembly (with line info) of the cubin."""
        return disassemble_cubin(self.get_cubin(cc=cc))

    def get_sass_cfg(self, cc=None):
        """Return the control flow graph disassembly of the cubin."""
        return disassemble_cubin_for_cfg(self.get_cubin(cc=cc))

    def add_ir_module(self, mod):
        """
        Set the single LLVM IR module of this library.

        Raises if the library is finalized, or RuntimeError if a module has
        already been added.
        """
        self._raise_if_finalized()
        if self._module is not None:
            raise RuntimeError("CUDACodeLibrary only supports one module")
        self._module = mod

    def add_linking_library(self, library):
        """
        Link another (finalized) code library into this one, inheriting its
        linking files, setup/teardown functions and cooperative-launch flag.
        """
        library._ensure_finalized()

        # We don't want to allow linking more libraries in after finalization
        # because our linked libraries are modified by the finalization, and we
        # won't be able to finalize again after adding new ones
        self._raise_if_finalized()

        self._linking_libraries.add(library)
        self._linking_files.update(library._linking_files)
        self._setup_functions.extend(library._setup_functions)
        self._teardown_functions.extend(library._teardown_functions)
        self.use_cooperative |= library.use_cooperative

    def add_linking_file(self, path_or_obj):
        """
        Register a file (or ``LinkableCode`` object) to link at link time.

        For ``LinkableCode`` objects, any setup and teardown callbacks are
        recorded too. No finalization check is made here.
        """
        if isinstance(path_or_obj, LinkableCode):
            if path_or_obj.setup_callback:
                self._setup_functions.append(path_or_obj.setup_callback)
            if path_or_obj.teardown_callback:
                self._teardown_functions.append(path_or_obj.teardown_callback)

        self._linking_files.add(path_or_obj)

    def get_function(self, name):
        """
        Return the function called ``name`` from this library's own module.

        Raises KeyError if there is no such function. Linked libraries are
        not searched.
        """
        for fn in self._module.functions:
            if fn.name == name:
                return fn
        raise KeyError(f"Function {name} not found")

    @property
    def modules(self):
        """
        This library's module followed by the modules of every directly
        linked library.
        """
        return [self._module] + [
            mod for lib in self._linking_libraries for mod in lib.modules
        ]

    @property
    def linking_libraries(self):
        """All libraries linked into this one, directly or transitively."""
        # Libraries we link to may link to other libraries, so we recursively
        # traverse the linking libraries property to build up a list of all
        # linked libraries.
        libs = []
        for lib in self._linking_libraries:
            libs.extend(lib.linking_libraries)
            libs.append(lib)
        return libs

    def finalize(self):
        """
        Finalize the library, setting defined functions in linked libraries
        to ``linkonce_odr`` linkage. Raises if already finalized.
        """
        # Unlike the CPUCodeLibrary, we don't invoke the binding layer here -
        # we only adjust the linkage of functions. Global kernels (with
        # external linkage) have their linkage untouched. Device functions are
        # set linkonce_odr to prevent them appearing in the PTX.

        self._raise_if_finalized()

        # Note in-place modification of the linkage of functions in linked
        # libraries. This presently causes no issues as only device functions
        # are shared across code libraries, so they would always need their
        # linkage set to linkonce_odr. If in a future scenario some code
        # libraries require linkonce_odr linkage of functions in linked
        # modules, and another code library requires another linkage, each code
        # library will need to take its own private copy of its linked modules.
        #
        # See also discussion on PR #890:
        # https://github.com/numba/numba/pull/890
        for library in self._linking_libraries:
            for mod in library.modules:
                for fn in mod.functions:
                    if not fn.is_declaration:
                        fn.linkage = "linkonce_odr"

        self._finalized = True

    def _reduce_states(self):
        """
        Reduce the instance for serialization. We retain the PTX and cubins,
        but loaded functions are discarded. They are recreated when needed
        after deserialization.

        Raises RuntimeError if the library is not finalized, or if it has
        linking files other than exactly ``NRT_LIBRARY``.
        """
        nrt = False
        if self._linking_files:
            if (
                len(self._linking_files) == 1
                and NRT_LIBRARY in self._linking_files
            ):
                nrt = True
            else:
                msg = "Cannot pickle CUDACodeLibrary with linking files"
                raise RuntimeError(msg)

        if not self._finalized:
            raise RuntimeError("Cannot pickle unfinalized CUDACodeLibrary")
        return dict(
            codegen=None,
            name=self.name,
            entry_name=self._entry_name,
            llvm_strs=self.llvm_strs,
            ptx_cache=self._ptx_cache,
            cubin_cache=self._cubin_cache,
            linkerinfo_cache=self._linkerinfo_cache,
            max_registers=self._max_registers,
            nvvm_options=self._nvvm_options,
            needs_cudadevrt=self.needs_cudadevrt,
            nrt=nrt,
            use_cooperative=self.use_cooperative,
            lto=self._lto,
        )

    @classmethod
    def _rebuild(
        cls,
        codegen,
        name,
        entry_name,
        llvm_strs,
        ptx_cache,
        cubin_cache,
        linkerinfo_cache,
        max_registers,
        nvvm_options,
        needs_cudadevrt,
        nrt,
        use_cooperative,
        lto,
    ):
        """
        Rebuild an instance from the state produced by ``_reduce_states``.

        The rebuilt instance is marked finalized and has no LLVM module; its
        IR is available only through the restored ``llvm_strs`` cache.
        """
        instance = cls(codegen, name, entry_name=entry_name)

        instance._llvm_strs = llvm_strs
        instance._ptx_cache = ptx_cache
        instance._cubin_cache = cubin_cache
        instance._linkerinfo_cache = linkerinfo_cache

        instance._max_registers = max_registers
        instance._nvvm_options = nvvm_options
        instance.needs_cudadevrt = needs_cudadevrt
        instance.use_cooperative = use_cooperative

        instance._finalized = True
        if nrt:
            instance._linking_files = {NRT_LIBRARY}

        instance._lto = lto
        return instance
512
+
513
+
514
class JITCUDACodegen(Codegen):
    """
    CUDA codegen implementation that only produces optimized LLVM IR.
    PTX generation happens as a separate step (see numba.cuda.compiler).
    """

    _library_class = CUDACodeLibrary

    def __init__(self, module_name):
        """Do nothing; ``module_name`` is accepted but not used."""

    def _create_empty_module(self, name):
        """Return a new, empty llvmlite module set up for the CUDA target."""
        module = ir.Module(name)
        module.triple = CUDA_TRIPLE
        module.data_layout = nvvm.NVVM().data_layout
        nvvm.add_ir_version(module)
        return module

    def _add_module(self, module):
        """Do nothing; ``module`` is ignored."""

    def magic_tuple(self):
        """
        Return a tuple unambiguously describing the codegen behaviour.
        """
        # Pair the runtime version with the compute capability of the device
        # in the current context.
        device = devices.get_context().device
        cc = device.compute_capability
        return (runtime.runtime.get_version(), cc)