numba-cuda 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. _numba_cuda_redirector.py +17 -13
  2. numba_cuda/VERSION +1 -1
  3. numba_cuda/_version.py +4 -1
  4. numba_cuda/numba/cuda/__init__.py +6 -2
  5. numba_cuda/numba/cuda/api.py +129 -86
  6. numba_cuda/numba/cuda/api_util.py +3 -3
  7. numba_cuda/numba/cuda/args.py +12 -16
  8. numba_cuda/numba/cuda/cg.py +6 -6
  9. numba_cuda/numba/cuda/codegen.py +74 -43
  10. numba_cuda/numba/cuda/compiler.py +246 -114
  11. numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
  12. numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
  13. numba_cuda/numba/cuda/cuda_paths.py +293 -99
  14. numba_cuda/numba/cuda/cudadecl.py +93 -79
  15. numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
  16. numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
  17. numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
  18. numba_cuda/numba/cuda/cudadrv/driver.py +460 -297
  19. numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
  20. numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
  21. numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
  22. numba_cuda/numba/cuda/cudadrv/error.py +6 -2
  23. numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
  24. numba_cuda/numba/cuda/cudadrv/linkable_code.py +27 -3
  25. numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
  26. numba_cuda/numba/cuda/cudadrv/nvrtc.py +146 -30
  27. numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
  28. numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
  29. numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
  30. numba_cuda/numba/cuda/cudaimpl.py +296 -275
  31. numba_cuda/numba/cuda/cudamath.py +1 -1
  32. numba_cuda/numba/cuda/debuginfo.py +99 -7
  33. numba_cuda/numba/cuda/decorators.py +87 -45
  34. numba_cuda/numba/cuda/descriptor.py +1 -1
  35. numba_cuda/numba/cuda/device_init.py +68 -18
  36. numba_cuda/numba/cuda/deviceufunc.py +143 -98
  37. numba_cuda/numba/cuda/dispatcher.py +300 -213
  38. numba_cuda/numba/cuda/errors.py +13 -10
  39. numba_cuda/numba/cuda/extending.py +55 -1
  40. numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
  41. numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
  42. numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +1090 -927
  43. numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +468 -319
  44. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  45. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  46. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  47. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  48. numba_cuda/numba/cuda/initialize.py +5 -3
  49. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
  50. numba_cuda/numba/cuda/intrinsics.py +203 -28
  51. numba_cuda/numba/cuda/kernels/reduction.py +13 -13
  52. numba_cuda/numba/cuda/kernels/transpose.py +3 -6
  53. numba_cuda/numba/cuda/libdevice.py +317 -317
  54. numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
  55. numba_cuda/numba/cuda/locks.py +16 -0
  56. numba_cuda/numba/cuda/lowering.py +43 -0
  57. numba_cuda/numba/cuda/mathimpl.py +62 -57
  58. numba_cuda/numba/cuda/models.py +1 -5
  59. numba_cuda/numba/cuda/nvvmutils.py +103 -88
  60. numba_cuda/numba/cuda/printimpl.py +9 -5
  61. numba_cuda/numba/cuda/random.py +46 -36
  62. numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
  63. numba_cuda/numba/cuda/runtime/__init__.py +1 -1
  64. numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
  65. numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
  66. numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
  67. numba_cuda/numba/cuda/runtime/nrt.py +48 -43
  68. numba_cuda/numba/cuda/simulator/__init__.py +22 -12
  69. numba_cuda/numba/cuda/simulator/api.py +38 -22
  70. numba_cuda/numba/cuda/simulator/compiler.py +2 -2
  71. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
  72. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
  73. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
  74. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
  75. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
  76. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
  77. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
  78. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
  79. numba_cuda/numba/cuda/simulator/kernel.py +43 -34
  80. numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
  81. numba_cuda/numba/cuda/simulator/reduction.py +1 -0
  82. numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
  83. numba_cuda/numba/cuda/simulator_init.py +2 -4
  84. numba_cuda/numba/cuda/stubs.py +134 -108
  85. numba_cuda/numba/cuda/target.py +92 -47
  86. numba_cuda/numba/cuda/testing.py +24 -19
  87. numba_cuda/numba/cuda/tests/__init__.py +14 -12
  88. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
  89. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
  90. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
  91. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
  92. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
  93. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
  94. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
  95. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
  96. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
  97. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
  98. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
  99. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
  100. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
  101. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
  102. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
  103. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
  104. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
  105. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
  106. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
  107. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
  108. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
  109. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
  110. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
  111. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
  112. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
  113. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
  114. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
  115. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
  116. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
  117. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
  118. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
  119. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +10 -7
  120. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
  121. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
  123. numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
  124. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
  125. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
  126. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
  127. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +59 -23
  129. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
  130. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
  131. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
  132. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
  133. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
  134. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
  135. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
  136. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
  137. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
  138. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
  139. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
  140. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
  141. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
  142. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
  143. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +77 -28
  144. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
  145. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
  146. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +24 -7
  147. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
  148. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
  149. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +21 -12
  150. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
  151. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
  152. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
  153. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
  154. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
  155. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
  156. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
  157. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
  158. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
  159. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +59 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
  161. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
  162. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
  163. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
  164. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
  165. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +7 -7
  166. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
  167. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
  168. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
  169. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
  170. numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
  171. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
  172. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
  173. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
  174. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
  175. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
  176. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
  177. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
  178. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
  179. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
  180. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
  181. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
  182. numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
  183. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
  184. numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
  185. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
  186. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
  187. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
  188. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
  189. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
  190. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
  191. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
  192. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
  193. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
  194. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
  195. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
  196. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
  197. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
  198. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
  199. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
  200. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
  201. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
  202. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
  203. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
  204. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
  205. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +81 -30
  206. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
  207. numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
  208. numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
  209. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
  210. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
  211. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
  212. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
  213. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
  214. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
  216. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
  217. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
  218. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
  219. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
  220. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
  221. numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
  222. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
  223. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
  224. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
  225. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
  226. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
  227. numba_cuda/numba/cuda/types.py +5 -2
  228. numba_cuda/numba/cuda/ufuncs.py +382 -362
  229. numba_cuda/numba/cuda/utils.py +2 -2
  230. numba_cuda/numba/cuda/vector_types.py +5 -3
  231. numba_cuda/numba/cuda/vectorizers.py +38 -33
  232. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/METADATA +1 -1
  233. numba_cuda-0.10.0.dist-info/RECORD +263 -0
  234. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/WHEEL +1 -1
  235. numba_cuda-0.8.1.dist-info/RECORD +0 -251
  236. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/licenses/LICENSE +0 -0
  237. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,16 @@
1
1
  import math
2
2
 
3
- from numba import (config, cuda, float32, float64, uint32, int64, uint64,
4
- from_dtype, jit)
3
+ from numba import (
4
+ config,
5
+ cuda,
6
+ float32,
7
+ float64,
8
+ uint32,
9
+ int64,
10
+ uint64,
11
+ from_dtype,
12
+ jit,
13
+ )
5
14
 
6
15
  import numpy as np
7
16
 
@@ -29,8 +38,9 @@ import numpy as np
29
38
  # using the CPU @jit decorator everywhere to create functions that work as
30
39
  # both CPU and CUDA device functions.
31
40
 
32
- xoroshiro128p_dtype = np.dtype([('s0', np.uint64), ('s1', np.uint64)],
33
- align=True)
41
+ xoroshiro128p_dtype = np.dtype(
42
+ [("s0", np.uint64), ("s1", np.uint64)], align=True
43
+ )
34
44
  xoroshiro128p_type = from_dtype(xoroshiro128p_dtype)
35
45
 
36
46
  # When cudasim is enabled, Fake CUDA arrays are passed to some of the
@@ -45,7 +55,7 @@ _nopython = not config.ENABLE_CUDASIM
45
55
 
46
56
  @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
47
57
  def init_xoroshiro128p_state(states, index, seed):
48
- '''Use SplitMix64 to generate an xoroshiro128p state from 64-bit seed.
58
+ """Use SplitMix64 to generate an xoroshiro128p state from 64-bit seed.
49
59
 
50
60
  This ensures that manually set small seeds don't result in a predictable
51
61
  initial sequence from the random number generator.
@@ -56,7 +66,7 @@ def init_xoroshiro128p_state(states, index, seed):
56
66
  :param index: offset in states to update
57
67
  :type seed: int64
58
68
  :param seed: seed value to use when initializing state
59
- '''
69
+ """
60
70
  index = int64(index)
61
71
  seed = uint64(seed)
62
72
 
@@ -65,13 +75,13 @@ def init_xoroshiro128p_state(states, index, seed):
65
75
  z = (z ^ (z >> uint32(27))) * uint64(0x94D049BB133111EB)
66
76
  z = z ^ (z >> uint32(31))
67
77
 
68
- states[index]['s0'] = z
69
- states[index]['s1'] = z
78
+ states[index]["s0"] = z
79
+ states[index]["s1"] = z
70
80
 
71
81
 
72
82
  @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
73
83
  def rotl(x, k):
74
- '''Left rotate x by k bits.'''
84
+ """Left rotate x by k bits."""
75
85
  x = uint64(x)
76
86
  k = uint32(k)
77
87
  return (x << k) | (x >> uint32(64 - k))
@@ -79,38 +89,38 @@ def rotl(x, k):
79
89
 
80
90
  @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
81
91
  def xoroshiro128p_next(states, index):
82
- '''Return the next random uint64 and advance the RNG in states[index].
92
+ """Return the next random uint64 and advance the RNG in states[index].
83
93
 
84
94
  :type states: 1D array, dtype=xoroshiro128p_dtype
85
95
  :param states: array of RNG states
86
96
  :type index: int64
87
97
  :param index: offset in states to update
88
98
  :rtype: uint64
89
- '''
99
+ """
90
100
  index = int64(index)
91
- s0 = states[index]['s0']
92
- s1 = states[index]['s1']
101
+ s0 = states[index]["s0"]
102
+ s1 = states[index]["s1"]
93
103
  result = s0 + s1
94
104
 
95
105
  s1 ^= s0
96
- states[index]['s0'] = uint64(rotl(s0, uint32(55))) ^ s1 ^ (s1 << uint32(14))
97
- states[index]['s1'] = uint64(rotl(s1, uint32(36)))
106
+ states[index]["s0"] = uint64(rotl(s0, uint32(55))) ^ s1 ^ (s1 << uint32(14))
107
+ states[index]["s1"] = uint64(rotl(s1, uint32(36)))
98
108
 
99
109
  return result
100
110
 
101
111
 
102
112
  @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
103
113
  def xoroshiro128p_jump(states, index):
104
- '''Advance the RNG in ``states[index]`` by 2**64 steps.
114
+ """Advance the RNG in ``states[index]`` by 2**64 steps.
105
115
 
106
116
  :type states: 1D array, dtype=xoroshiro128p_dtype
107
117
  :param states: array of RNG states
108
118
  :type index: int64
109
119
  :param index: offset in states to update
110
- '''
120
+ """
111
121
  index = int64(index)
112
122
 
113
- jump = (uint64(0xbeac0467eba5facb), uint64(0xd86b048b86aa9922))
123
+ jump = (uint64(0xBEAC0467EBA5FACB), uint64(0xD86B048B86AA9922))
114
124
 
115
125
  s0 = uint64(0)
116
126
  s1 = uint64(0)
@@ -118,52 +128,52 @@ def xoroshiro128p_jump(states, index):
118
128
  for i in range(2):
119
129
  for b in range(64):
120
130
  if jump[i] & (uint64(1) << uint32(b)):
121
- s0 ^= states[index]['s0']
122
- s1 ^= states[index]['s1']
131
+ s0 ^= states[index]["s0"]
132
+ s1 ^= states[index]["s1"]
123
133
  xoroshiro128p_next(states, index)
124
134
 
125
- states[index]['s0'] = s0
126
- states[index]['s1'] = s1
135
+ states[index]["s0"] = s0
136
+ states[index]["s1"] = s1
127
137
 
128
138
 
129
139
  @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
130
140
  def uint64_to_unit_float64(x):
131
- '''Convert uint64 to float64 value in the range [0.0, 1.0)'''
141
+ """Convert uint64 to float64 value in the range [0.0, 1.0)"""
132
142
  x = uint64(x)
133
143
  return (x >> uint32(11)) * (float64(1) / (uint64(1) << uint32(53)))
134
144
 
135
145
 
136
146
  @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
137
147
  def uint64_to_unit_float32(x):
138
- '''Convert uint64 to float32 value in the range [0.0, 1.0)'''
148
+ """Convert uint64 to float32 value in the range [0.0, 1.0)"""
139
149
  x = uint64(x)
140
150
  return float32(uint64_to_unit_float64(x))
141
151
 
142
152
 
143
153
  @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
144
154
  def xoroshiro128p_uniform_float32(states, index):
145
- '''Return a float32 in range [0.0, 1.0) and advance ``states[index]``.
155
+ """Return a float32 in range [0.0, 1.0) and advance ``states[index]``.
146
156
 
147
157
  :type states: 1D array, dtype=xoroshiro128p_dtype
148
158
  :param states: array of RNG states
149
159
  :type index: int64
150
160
  :param index: offset in states to update
151
161
  :rtype: float32
152
- '''
162
+ """
153
163
  index = int64(index)
154
164
  return uint64_to_unit_float32(xoroshiro128p_next(states, index))
155
165
 
156
166
 
157
167
  @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
158
168
  def xoroshiro128p_uniform_float64(states, index):
159
- '''Return a float64 in range [0.0, 1.0) and advance ``states[index]``.
169
+ """Return a float64 in range [0.0, 1.0) and advance ``states[index]``.
160
170
 
161
171
  :type states: 1D array, dtype=xoroshiro128p_dtype
162
172
  :param states: array of RNG states
163
173
  :type index: int64
164
174
  :param index: offset in states to update
165
175
  :rtype: float64
166
- '''
176
+ """
167
177
  index = int64(index)
168
178
  return uint64_to_unit_float64(xoroshiro128p_next(states, index))
169
179
 
@@ -174,7 +184,7 @@ TWO_PI_FLOAT64 = np.float64(2 * math.pi)
174
184
 
175
185
  @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
176
186
  def xoroshiro128p_normal_float32(states, index):
177
- '''Return a normally distributed float32 and advance ``states[index]``.
187
+ """Return a normally distributed float32 and advance ``states[index]``.
178
188
 
179
189
  The return value is drawn from a Gaussian of mean=0 and sigma=1 using the
180
190
  Box-Muller transform. This advances the RNG sequence by two steps.
@@ -184,7 +194,7 @@ def xoroshiro128p_normal_float32(states, index):
184
194
  :type index: int64
185
195
  :param index: offset in states to update
186
196
  :rtype: float32
187
- '''
197
+ """
188
198
  index = int64(index)
189
199
 
190
200
  u1 = xoroshiro128p_uniform_float32(states, index)
@@ -199,7 +209,7 @@ def xoroshiro128p_normal_float32(states, index):
199
209
 
200
210
  @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
201
211
  def xoroshiro128p_normal_float64(states, index):
202
- '''Return a normally distributed float32 and advance ``states[index]``.
212
+ """Return a normally distributed float32 and advance ``states[index]``.
203
213
 
204
214
  The return value is drawn from a Gaussian of mean=0 and sigma=1 using the
205
215
  Box-Muller transform. This advances the RNG sequence by two steps.
@@ -209,7 +219,7 @@ def xoroshiro128p_normal_float64(states, index):
209
219
  :type index: int64
210
220
  :param index: offset in states to update
211
221
  :rtype: float64
212
- '''
222
+ """
213
223
  index = int64(index)
214
224
 
215
225
  u1 = xoroshiro128p_uniform_float32(states, index)
@@ -242,7 +252,7 @@ def init_xoroshiro128p_states_cpu(states, seed, subsequence_start):
242
252
 
243
253
 
244
254
  def init_xoroshiro128p_states(states, seed, subsequence_start=0, stream=0):
245
- '''Initialize RNG states on the GPU for parallel generators.
255
+ """Initialize RNG states on the GPU for parallel generators.
246
256
 
247
257
  This initializes the RNG states so that each state in the array corresponds
248
258
  subsequences in the separated by 2**64 steps from each other in the main
@@ -257,7 +267,7 @@ def init_xoroshiro128p_states(states, seed, subsequence_start=0, stream=0):
257
267
  :param states: array of RNG states
258
268
  :type seed: uint64
259
269
  :param seed: starting seed for list of generators
260
- '''
270
+ """
261
271
 
262
272
  # Initialization on CPU is much faster than the GPU
263
273
  states_cpu = np.empty(shape=states.shape, dtype=xoroshiro128p_dtype)
@@ -267,7 +277,7 @@ def init_xoroshiro128p_states(states, seed, subsequence_start=0, stream=0):
267
277
 
268
278
 
269
279
  def create_xoroshiro128p_states(n, seed, subsequence_start=0, stream=0):
270
- '''Returns a new device array initialized for n random number generators.
280
+ """Returns a new device array initialized for n random number generators.
271
281
 
272
282
  This initializes the RNG states so that each state in the array corresponds
273
283
  subsequences in the separated by 2**64 steps from each other in the main
@@ -286,7 +296,7 @@ def create_xoroshiro128p_states(n, seed, subsequence_start=0, stream=0):
286
296
  :param subsequence_start:
287
297
  :type stream: CUDA stream
288
298
  :param stream: stream to run initialization kernel on
289
- '''
299
+ """
290
300
  states = cuda.device_array(n, dtype=xoroshiro128p_dtype, stream=stream)
291
301
  init_xoroshiro128p_states(states, seed, subsequence_start, stream)
292
302
  return states
@@ -148,4 +148,4 @@ numba_attempt_nocopy_reshape(npy_intp nd, const npy_intp *dims, const npy_intp *
148
148
  }
149
149
 
150
150
  return 1;
151
- }
151
+ }
@@ -1 +1 @@
1
- from numba.cuda.runtime.nrt import rtsys # noqa: F401
1
+ from numba.cuda.runtime.nrt import rtsys # noqa: F401
@@ -91,4 +91,4 @@ extern "C" __global__ void NRT_MemSys_print(void)
91
91
  } else {
92
92
  printf("TheMsys is null.\n");
93
93
  }
94
- }
94
+ }
@@ -14,4 +14,4 @@ struct NRT_MemSys {
14
14
  /* The Memory System object */
15
15
  __device__ NRT_MemSys* TheMSys;
16
16
 
17
- extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr);
17
+ extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr);
@@ -33,7 +33,7 @@ extern "C" __device__ void* NRT_Allocate(size_t size)
33
33
  {
34
34
  void* ptr = NULL;
35
35
  ptr = malloc(size);
36
- if (TheMSys && TheMSys->stats.enabled) {
36
+ if (TheMSys && TheMSys->stats.enabled) {
37
37
  TheMSys->stats.alloc.fetch_add(1, cuda::memory_order_relaxed); }
38
38
  return ptr;
39
39
  }
@@ -49,7 +49,7 @@ extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
49
49
  mi->dtor_info = dtor_info;
50
50
  mi->data = data;
51
51
  mi->size = size;
52
- if (TheMSys && TheMSys->stats.enabled) {
52
+ if (TheMSys && TheMSys->stats.enabled) {
53
53
  TheMSys->stats.mi_alloc.fetch_add(1, cuda::memory_order_relaxed); }
54
54
  }
55
55
 
@@ -77,7 +77,7 @@ extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
77
77
  extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi)
78
78
  {
79
79
  NRT_dealloc(mi);
80
- if (TheMSys && TheMSys->stats.enabled) {
80
+ if (TheMSys && TheMSys->stats.enabled) {
81
81
  TheMSys->stats.mi_free.fetch_add(1, cuda::memory_order_relaxed); }
82
82
  }
83
83
 
@@ -5,26 +5,28 @@ import numpy as np
5
5
 
6
6
  from numba import cuda, config
7
7
  from numba.core.runtime.nrt import _nrt_mstats
8
- from numba.cuda.cudadrv.driver import (Linker, driver, launch_kernel,
9
- USE_NV_BINDING)
8
+ from numba.cuda.cudadrv.driver import (
9
+ Linker,
10
+ driver,
11
+ launch_kernel,
12
+ USE_NV_BINDING,
13
+ )
10
14
  from numba.cuda.cudadrv import devices
11
15
  from numba.cuda.api import get_current_device
12
16
  from numba.cuda.utils import _readenv
13
17
 
14
18
 
15
19
  # Check environment variable or config for NRT statistics enablement
16
- NRT_STATS = (
17
- _readenv("NUMBA_CUDA_NRT_STATS", bool, False) or
18
- getattr(config, "NUMBA_CUDA_NRT_STATS", False)
20
+ NRT_STATS = _readenv("NUMBA_CUDA_NRT_STATS", bool, False) or getattr(
21
+ config, "NUMBA_CUDA_NRT_STATS", False
19
22
  )
20
23
  if not hasattr(config, "NUMBA_CUDA_NRT_STATS"):
21
24
  config.CUDA_NRT_STATS = NRT_STATS
22
25
 
23
26
 
24
27
  # Check environment variable or config for NRT enablement
25
- ENABLE_NRT = (
26
- _readenv("NUMBA_CUDA_ENABLE_NRT", bool, False) or
27
- getattr(config, "NUMBA_CUDA_ENABLE_NRT", False)
28
+ ENABLE_NRT = _readenv("NUMBA_CUDA_ENABLE_NRT", bool, False) or getattr(
29
+ config, "NUMBA_CUDA_ENABLE_NRT", False
28
30
  )
29
31
  if not hasattr(config, "NUMBA_CUDA_ENABLE_NRT"):
30
32
  config.CUDA_ENABLE_NRT = ENABLE_NRT
@@ -35,16 +37,19 @@ def _alloc_init_guard(method):
35
37
  """
36
38
  Ensure NRT memory allocation and initialization before running the method
37
39
  """
40
+
38
41
  @wraps(method)
39
42
  def wrapper(self, *args, **kwargs):
40
43
  self.ensure_allocated()
41
44
  self.ensure_initialized()
42
45
  return method(self, *args, **kwargs)
46
+
43
47
  return wrapper
44
48
 
45
49
 
46
50
  class _Runtime:
47
51
  """Singleton class for Numba CUDA runtime"""
52
+
48
53
  _instance = None
49
54
 
50
55
  def __new__(cls, *args, **kwargs):
@@ -64,8 +69,7 @@ class _Runtime:
64
69
  """
65
70
  # Define the path for memsys.cu
66
71
  memsys_mod = os.path.join(
67
- os.path.dirname(os.path.abspath(__file__)),
68
- "memsys.cu"
72
+ os.path.dirname(os.path.abspath(__file__)), "memsys.cu"
69
73
  )
70
74
  cc = get_current_device().compute_capability
71
75
 
@@ -105,10 +109,12 @@ class _Runtime:
105
109
  # Allocate space for NRT_MemSys
106
110
  ptr, nbytes = self._memsys_module.get_global_symbol("memsys_size")
107
111
  memsys_size = ctypes.c_uint64()
108
- driver.cuMemcpyDtoH(ctypes.addressof(memsys_size),
109
- ptr.device_ctypes_pointer, nbytes)
112
+ driver.cuMemcpyDtoH(
113
+ ctypes.addressof(memsys_size), ptr.device_ctypes_pointer, nbytes
114
+ )
110
115
  self._memsys = device_array(
111
- (memsys_size.value,), dtype="i1", stream=stream)
116
+ (memsys_size.value,), dtype="i1", stream=stream
117
+ )
112
118
  self.set_memsys_to_module(self._memsys_module, stream=stream)
113
119
 
114
120
  def _single_thread_launch(self, module, stream, name, params=()):
@@ -121,12 +127,16 @@ class _Runtime:
121
127
  func = module.get_function(name)
122
128
  launch_kernel(
123
129
  func.handle,
124
- 1, 1, 1,
125
- 1, 1, 1,
130
+ 1,
131
+ 1,
132
+ 1,
133
+ 1,
134
+ 1,
135
+ 1,
126
136
  0,
127
137
  stream.handle,
128
138
  params,
129
- cooperative=False
139
+ cooperative=False,
130
140
  )
131
141
 
132
142
  def _ctypes_pointer(self, array):
@@ -158,7 +168,8 @@ class _Runtime:
158
168
  self.ensure_allocated()
159
169
 
160
170
  self._single_thread_launch(
161
- self._memsys_module, stream, "NRT_MemSys_init")
171
+ self._memsys_module, stream, "NRT_MemSys_init"
172
+ )
162
173
  self._initialized = True
163
174
 
164
175
  if config.CUDA_NRT_STATS:
@@ -170,7 +181,8 @@ class _Runtime:
170
181
  Enable memsys statistics
171
182
  """
172
183
  self._single_thread_launch(
173
- self._memsys_module, stream, "NRT_MemSys_enable_stats")
184
+ self._memsys_module, stream, "NRT_MemSys_enable_stats"
185
+ )
174
186
 
175
187
  @_alloc_init_guard
176
188
  def memsys_disable_stats(self, stream=None):
@@ -178,7 +190,8 @@ class _Runtime:
178
190
  Disable memsys statistics
179
191
  """
180
192
  self._single_thread_launch(
181
- self._memsys_module, stream, "NRT_MemSys_disable_stats")
193
+ self._memsys_module, stream, "NRT_MemSys_disable_stats"
194
+ )
182
195
 
183
196
  @_alloc_init_guard
184
197
  def memsys_stats_enabled(self, stream=None):
@@ -193,7 +206,7 @@ class _Runtime:
193
206
  self._memsys_module,
194
207
  stream,
195
208
  "NRT_MemSys_stats_enabled",
196
- (enabled_ptr,)
209
+ (enabled_ptr,),
197
210
  )
198
211
 
199
212
  cuda.synchronize()
@@ -204,21 +217,20 @@ class _Runtime:
204
217
  """
205
218
  Copy all statistics of memsys to the host
206
219
  """
207
- dt = np.dtype([
208
- ('alloc', np.uint64),
209
- ('free', np.uint64),
210
- ('mi_alloc', np.uint64),
211
- ('mi_free', np.uint64)
212
- ])
220
+ dt = np.dtype(
221
+ [
222
+ ("alloc", np.uint64),
223
+ ("free", np.uint64),
224
+ ("mi_alloc", np.uint64),
225
+ ("mi_free", np.uint64),
226
+ ]
227
+ )
213
228
 
214
229
  stats_for_read = cuda.managed_array(1, dt)
215
230
  stats_ptr = self._ctypes_pointer(stats_for_read)
216
231
 
217
232
  self._single_thread_launch(
218
- self._memsys_module,
219
- stream,
220
- "NRT_MemSys_read",
221
- [stats_ptr]
233
+ self._memsys_module, stream, "NRT_MemSys_read", [stats_ptr]
222
234
  )
223
235
  cuda.synchronize()
224
236
 
@@ -237,7 +249,7 @@ class _Runtime:
237
249
  alloc=memsys["alloc"],
238
250
  free=memsys["free"],
239
251
  mi_alloc=memsys["mi_alloc"],
240
- mi_free=memsys["mi_free"]
252
+ mi_free=memsys["mi_free"],
241
253
  )
242
254
 
243
255
  @_alloc_init_guard
@@ -249,10 +261,7 @@ class _Runtime:
249
261
  got_ptr = self._ctypes_pointer(got)
250
262
 
251
263
  self._single_thread_launch(
252
- self._memsys_module,
253
- stream,
254
- f"NRT_MemSys_read_{stat}",
255
- [got_ptr]
264
+ self._memsys_module, stream, f"NRT_MemSys_read_{stat}", [got_ptr]
256
265
  )
257
266
 
258
267
  cuda.synchronize()
@@ -309,15 +318,13 @@ class _Runtime:
309
318
  """
310
319
  if self._memsys is None:
311
320
  raise RuntimeError(
312
- "Please allocate NRT Memsys first before setting to module.")
321
+ "Please allocate NRT Memsys first before setting to module."
322
+ )
313
323
 
314
324
  memsys_ptr = self._ctypes_pointer(self._memsys)
315
325
 
316
326
  self._single_thread_launch(
317
- module,
318
- stream,
319
- "NRT_MemSys_set",
320
- [memsys_ptr]
327
+ module, stream, "NRT_MemSys_set", [memsys_ptr]
321
328
  )
322
329
 
323
330
  @_alloc_init_guard
@@ -327,9 +334,7 @@ class _Runtime:
327
334
  """
328
335
  cuda.synchronize()
329
336
  self._single_thread_launch(
330
- self._memsys_module,
331
- stream,
332
- "NRT_MemSys_print"
337
+ self._memsys_module, stream, "NRT_MemSys_print"
333
338
  )
334
339
 
335
340
 
@@ -3,14 +3,22 @@ import sys
3
3
  from .api import *
4
4
  from .vector_types import vector_types
5
5
  from .reduction import Reduce
6
- from .cudadrv.devicearray import (device_array, device_array_like, pinned,
7
- pinned_array, pinned_array_like,
8
- mapped_array, to_device, auto_device)
6
+ from .cudadrv.devicearray import (
7
+ device_array,
8
+ device_array_like,
9
+ pinned,
10
+ pinned_array,
11
+ pinned_array_like,
12
+ mapped_array,
13
+ to_device,
14
+ auto_device,
15
+ )
9
16
  from .cudadrv import devicearray
10
17
  from .cudadrv.devices import require_context, gpus
11
18
  from .cudadrv.devices import get_context as current_context
12
19
  from .cudadrv.runtime import runtime
13
20
  from numba.core import config
21
+
14
22
  reduce = Reduce
15
23
 
16
24
  # Register simulated vector types as module level variables
@@ -25,14 +33,16 @@ del vector_types, name, svty, alias
25
33
  if config.ENABLE_CUDASIM:
26
34
  import sys
27
35
  from numba.cuda.simulator import cudadrv
28
- sys.modules['numba.cuda.cudadrv'] = cudadrv
29
- sys.modules['numba.cuda.cudadrv.devicearray'] = cudadrv.devicearray
30
- sys.modules['numba.cuda.cudadrv.devices'] = cudadrv.devices
31
- sys.modules['numba.cuda.cudadrv.driver'] = cudadrv.driver
32
- sys.modules['numba.cuda.cudadrv.runtime'] = cudadrv.runtime
33
- sys.modules['numba.cuda.cudadrv.drvapi'] = cudadrv.drvapi
34
- sys.modules['numba.cuda.cudadrv.error'] = cudadrv.error
35
- sys.modules['numba.cuda.cudadrv.nvvm'] = cudadrv.nvvm
36
+
37
+ sys.modules["numba.cuda.cudadrv"] = cudadrv
38
+ sys.modules["numba.cuda.cudadrv.devicearray"] = cudadrv.devicearray
39
+ sys.modules["numba.cuda.cudadrv.devices"] = cudadrv.devices
40
+ sys.modules["numba.cuda.cudadrv.driver"] = cudadrv.driver
41
+ sys.modules["numba.cuda.cudadrv.runtime"] = cudadrv.runtime
42
+ sys.modules["numba.cuda.cudadrv.drvapi"] = cudadrv.drvapi
43
+ sys.modules["numba.cuda.cudadrv.error"] = cudadrv.error
44
+ sys.modules["numba.cuda.cudadrv.nvvm"] = cudadrv.nvvm
36
45
 
37
46
  from . import compiler
38
- sys.modules['numba.cuda.compiler'] = compiler
47
+
48
+ sys.modules["numba.cuda.compiler"] = compiler