numba-cuda 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.py +17 -13
- numba_cuda/VERSION +1 -1
- numba_cuda/_version.py +4 -1
- numba_cuda/numba/cuda/__init__.py +6 -2
- numba_cuda/numba/cuda/api.py +129 -86
- numba_cuda/numba/cuda/api_util.py +3 -3
- numba_cuda/numba/cuda/args.py +12 -16
- numba_cuda/numba/cuda/cg.py +6 -6
- numba_cuda/numba/cuda/codegen.py +74 -43
- numba_cuda/numba/cuda/compiler.py +246 -114
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
- numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
- numba_cuda/numba/cuda/cuda_paths.py +293 -99
- numba_cuda/numba/cuda/cudadecl.py +93 -79
- numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
- numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
- numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
- numba_cuda/numba/cuda/cudadrv/driver.py +460 -297
- numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
- numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
- numba_cuda/numba/cuda/cudadrv/error.py +6 -2
- numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +27 -3
- numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +146 -30
- numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
- numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
- numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
- numba_cuda/numba/cuda/cudaimpl.py +296 -275
- numba_cuda/numba/cuda/cudamath.py +1 -1
- numba_cuda/numba/cuda/debuginfo.py +99 -7
- numba_cuda/numba/cuda/decorators.py +87 -45
- numba_cuda/numba/cuda/descriptor.py +1 -1
- numba_cuda/numba/cuda/device_init.py +68 -18
- numba_cuda/numba/cuda/deviceufunc.py +143 -98
- numba_cuda/numba/cuda/dispatcher.py +300 -213
- numba_cuda/numba/cuda/errors.py +13 -10
- numba_cuda/numba/cuda/extending.py +55 -1
- numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
- numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
- numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +1090 -927
- numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +468 -319
- numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
- numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
- numba_cuda/numba/cuda/initialize.py +5 -3
- numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
- numba_cuda/numba/cuda/intrinsics.py +203 -28
- numba_cuda/numba/cuda/kernels/reduction.py +13 -13
- numba_cuda/numba/cuda/kernels/transpose.py +3 -6
- numba_cuda/numba/cuda/libdevice.py +317 -317
- numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
- numba_cuda/numba/cuda/locks.py +16 -0
- numba_cuda/numba/cuda/lowering.py +43 -0
- numba_cuda/numba/cuda/mathimpl.py +62 -57
- numba_cuda/numba/cuda/models.py +1 -5
- numba_cuda/numba/cuda/nvvmutils.py +103 -88
- numba_cuda/numba/cuda/printimpl.py +9 -5
- numba_cuda/numba/cuda/random.py +46 -36
- numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
- numba_cuda/numba/cuda/runtime/__init__.py +1 -1
- numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
- numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
- numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
- numba_cuda/numba/cuda/runtime/nrt.py +48 -43
- numba_cuda/numba/cuda/simulator/__init__.py +22 -12
- numba_cuda/numba/cuda/simulator/api.py +38 -22
- numba_cuda/numba/cuda/simulator/compiler.py +2 -2
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
- numba_cuda/numba/cuda/simulator/kernel.py +43 -34
- numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
- numba_cuda/numba/cuda/simulator/reduction.py +1 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
- numba_cuda/numba/cuda/simulator_init.py +2 -4
- numba_cuda/numba/cuda/stubs.py +134 -108
- numba_cuda/numba/cuda/target.py +92 -47
- numba_cuda/numba/cuda/testing.py +24 -19
- numba_cuda/numba/cuda/tests/__init__.py +14 -12
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +10 -7
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +59 -23
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +77 -28
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +24 -7
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +21 -12
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
- numba_cuda/numba/cuda/tests/cudapy/test_inline.py +59 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
- numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +81 -30
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
- numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
- numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
- numba_cuda/numba/cuda/types.py +5 -2
- numba_cuda/numba/cuda/ufuncs.py +382 -362
- numba_cuda/numba/cuda/utils.py +2 -2
- numba_cuda/numba/cuda/vector_types.py +5 -3
- numba_cuda/numba/cuda/vectorizers.py +38 -33
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/METADATA +1 -1
- numba_cuda-0.10.0.dist-info/RECORD +263 -0
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/WHEEL +1 -1
- numba_cuda-0.8.1.dist-info/RECORD +0 -251
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/random.py
CHANGED
@@ -1,7 +1,16 @@
|
|
1
1
|
import math
|
2
2
|
|
3
|
-
from numba import (
|
4
|
-
|
3
|
+
from numba import (
|
4
|
+
config,
|
5
|
+
cuda,
|
6
|
+
float32,
|
7
|
+
float64,
|
8
|
+
uint32,
|
9
|
+
int64,
|
10
|
+
uint64,
|
11
|
+
from_dtype,
|
12
|
+
jit,
|
13
|
+
)
|
5
14
|
|
6
15
|
import numpy as np
|
7
16
|
|
@@ -29,8 +38,9 @@ import numpy as np
|
|
29
38
|
# using the CPU @jit decorator everywhere to create functions that work as
|
30
39
|
# both CPU and CUDA device functions.
|
31
40
|
|
32
|
-
xoroshiro128p_dtype = np.dtype(
|
33
|
-
|
41
|
+
xoroshiro128p_dtype = np.dtype(
|
42
|
+
[("s0", np.uint64), ("s1", np.uint64)], align=True
|
43
|
+
)
|
34
44
|
xoroshiro128p_type = from_dtype(xoroshiro128p_dtype)
|
35
45
|
|
36
46
|
# When cudasim is enabled, Fake CUDA arrays are passed to some of the
|
@@ -45,7 +55,7 @@ _nopython = not config.ENABLE_CUDASIM
|
|
45
55
|
|
46
56
|
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
|
47
57
|
def init_xoroshiro128p_state(states, index, seed):
|
48
|
-
|
58
|
+
"""Use SplitMix64 to generate an xoroshiro128p state from 64-bit seed.
|
49
59
|
|
50
60
|
This ensures that manually set small seeds don't result in a predictable
|
51
61
|
initial sequence from the random number generator.
|
@@ -56,7 +66,7 @@ def init_xoroshiro128p_state(states, index, seed):
|
|
56
66
|
:param index: offset in states to update
|
57
67
|
:type seed: int64
|
58
68
|
:param seed: seed value to use when initializing state
|
59
|
-
|
69
|
+
"""
|
60
70
|
index = int64(index)
|
61
71
|
seed = uint64(seed)
|
62
72
|
|
@@ -65,13 +75,13 @@ def init_xoroshiro128p_state(states, index, seed):
|
|
65
75
|
z = (z ^ (z >> uint32(27))) * uint64(0x94D049BB133111EB)
|
66
76
|
z = z ^ (z >> uint32(31))
|
67
77
|
|
68
|
-
states[index][
|
69
|
-
states[index][
|
78
|
+
states[index]["s0"] = z
|
79
|
+
states[index]["s1"] = z
|
70
80
|
|
71
81
|
|
72
82
|
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
|
73
83
|
def rotl(x, k):
|
74
|
-
|
84
|
+
"""Left rotate x by k bits."""
|
75
85
|
x = uint64(x)
|
76
86
|
k = uint32(k)
|
77
87
|
return (x << k) | (x >> uint32(64 - k))
|
@@ -79,38 +89,38 @@ def rotl(x, k):
|
|
79
89
|
|
80
90
|
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
|
81
91
|
def xoroshiro128p_next(states, index):
|
82
|
-
|
92
|
+
"""Return the next random uint64 and advance the RNG in states[index].
|
83
93
|
|
84
94
|
:type states: 1D array, dtype=xoroshiro128p_dtype
|
85
95
|
:param states: array of RNG states
|
86
96
|
:type index: int64
|
87
97
|
:param index: offset in states to update
|
88
98
|
:rtype: uint64
|
89
|
-
|
99
|
+
"""
|
90
100
|
index = int64(index)
|
91
|
-
s0 = states[index][
|
92
|
-
s1 = states[index][
|
101
|
+
s0 = states[index]["s0"]
|
102
|
+
s1 = states[index]["s1"]
|
93
103
|
result = s0 + s1
|
94
104
|
|
95
105
|
s1 ^= s0
|
96
|
-
states[index][
|
97
|
-
states[index][
|
106
|
+
states[index]["s0"] = uint64(rotl(s0, uint32(55))) ^ s1 ^ (s1 << uint32(14))
|
107
|
+
states[index]["s1"] = uint64(rotl(s1, uint32(36)))
|
98
108
|
|
99
109
|
return result
|
100
110
|
|
101
111
|
|
102
112
|
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
|
103
113
|
def xoroshiro128p_jump(states, index):
|
104
|
-
|
114
|
+
"""Advance the RNG in ``states[index]`` by 2**64 steps.
|
105
115
|
|
106
116
|
:type states: 1D array, dtype=xoroshiro128p_dtype
|
107
117
|
:param states: array of RNG states
|
108
118
|
:type index: int64
|
109
119
|
:param index: offset in states to update
|
110
|
-
|
120
|
+
"""
|
111
121
|
index = int64(index)
|
112
122
|
|
113
|
-
jump = (uint64(
|
123
|
+
jump = (uint64(0xBEAC0467EBA5FACB), uint64(0xD86B048B86AA9922))
|
114
124
|
|
115
125
|
s0 = uint64(0)
|
116
126
|
s1 = uint64(0)
|
@@ -118,52 +128,52 @@ def xoroshiro128p_jump(states, index):
|
|
118
128
|
for i in range(2):
|
119
129
|
for b in range(64):
|
120
130
|
if jump[i] & (uint64(1) << uint32(b)):
|
121
|
-
s0 ^= states[index][
|
122
|
-
s1 ^= states[index][
|
131
|
+
s0 ^= states[index]["s0"]
|
132
|
+
s1 ^= states[index]["s1"]
|
123
133
|
xoroshiro128p_next(states, index)
|
124
134
|
|
125
|
-
states[index][
|
126
|
-
states[index][
|
135
|
+
states[index]["s0"] = s0
|
136
|
+
states[index]["s1"] = s1
|
127
137
|
|
128
138
|
|
129
139
|
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
|
130
140
|
def uint64_to_unit_float64(x):
|
131
|
-
|
141
|
+
"""Convert uint64 to float64 value in the range [0.0, 1.0)"""
|
132
142
|
x = uint64(x)
|
133
143
|
return (x >> uint32(11)) * (float64(1) / (uint64(1) << uint32(53)))
|
134
144
|
|
135
145
|
|
136
146
|
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
|
137
147
|
def uint64_to_unit_float32(x):
|
138
|
-
|
148
|
+
"""Convert uint64 to float32 value in the range [0.0, 1.0)"""
|
139
149
|
x = uint64(x)
|
140
150
|
return float32(uint64_to_unit_float64(x))
|
141
151
|
|
142
152
|
|
143
153
|
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
|
144
154
|
def xoroshiro128p_uniform_float32(states, index):
|
145
|
-
|
155
|
+
"""Return a float32 in range [0.0, 1.0) and advance ``states[index]``.
|
146
156
|
|
147
157
|
:type states: 1D array, dtype=xoroshiro128p_dtype
|
148
158
|
:param states: array of RNG states
|
149
159
|
:type index: int64
|
150
160
|
:param index: offset in states to update
|
151
161
|
:rtype: float32
|
152
|
-
|
162
|
+
"""
|
153
163
|
index = int64(index)
|
154
164
|
return uint64_to_unit_float32(xoroshiro128p_next(states, index))
|
155
165
|
|
156
166
|
|
157
167
|
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
|
158
168
|
def xoroshiro128p_uniform_float64(states, index):
|
159
|
-
|
169
|
+
"""Return a float64 in range [0.0, 1.0) and advance ``states[index]``.
|
160
170
|
|
161
171
|
:type states: 1D array, dtype=xoroshiro128p_dtype
|
162
172
|
:param states: array of RNG states
|
163
173
|
:type index: int64
|
164
174
|
:param index: offset in states to update
|
165
175
|
:rtype: float64
|
166
|
-
|
176
|
+
"""
|
167
177
|
index = int64(index)
|
168
178
|
return uint64_to_unit_float64(xoroshiro128p_next(states, index))
|
169
179
|
|
@@ -174,7 +184,7 @@ TWO_PI_FLOAT64 = np.float64(2 * math.pi)
|
|
174
184
|
|
175
185
|
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
|
176
186
|
def xoroshiro128p_normal_float32(states, index):
|
177
|
-
|
187
|
+
"""Return a normally distributed float32 and advance ``states[index]``.
|
178
188
|
|
179
189
|
The return value is drawn from a Gaussian of mean=0 and sigma=1 using the
|
180
190
|
Box-Muller transform. This advances the RNG sequence by two steps.
|
@@ -184,7 +194,7 @@ def xoroshiro128p_normal_float32(states, index):
|
|
184
194
|
:type index: int64
|
185
195
|
:param index: offset in states to update
|
186
196
|
:rtype: float32
|
187
|
-
|
197
|
+
"""
|
188
198
|
index = int64(index)
|
189
199
|
|
190
200
|
u1 = xoroshiro128p_uniform_float32(states, index)
|
@@ -199,7 +209,7 @@ def xoroshiro128p_normal_float32(states, index):
|
|
199
209
|
|
200
210
|
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
|
201
211
|
def xoroshiro128p_normal_float64(states, index):
|
202
|
-
|
212
|
+
"""Return a normally distributed float64 and advance ``states[index]``.
|
203
213
|
|
204
214
|
The return value is drawn from a Gaussian of mean=0 and sigma=1 using the
|
205
215
|
Box-Muller transform. This advances the RNG sequence by two steps.
|
@@ -209,7 +219,7 @@ def xoroshiro128p_normal_float64(states, index):
|
|
209
219
|
:type index: int64
|
210
220
|
:param index: offset in states to update
|
211
221
|
:rtype: float64
|
212
|
-
|
222
|
+
"""
|
213
223
|
index = int64(index)
|
214
224
|
|
215
225
|
u1 = xoroshiro128p_uniform_float32(states, index)
|
@@ -242,7 +252,7 @@ def init_xoroshiro128p_states_cpu(states, seed, subsequence_start):
|
|
242
252
|
|
243
253
|
|
244
254
|
def init_xoroshiro128p_states(states, seed, subsequence_start=0, stream=0):
|
245
|
-
|
255
|
+
"""Initialize RNG states on the GPU for parallel generators.
|
246
256
|
|
247
257
|
This initializes the RNG states so that each state in the array corresponds
|
248
258
|
to subsequences separated by 2**64 steps from each other in the main
|
@@ -257,7 +267,7 @@ def init_xoroshiro128p_states(states, seed, subsequence_start=0, stream=0):
|
|
257
267
|
:param states: array of RNG states
|
258
268
|
:type seed: uint64
|
259
269
|
:param seed: starting seed for list of generators
|
260
|
-
|
270
|
+
"""
|
261
271
|
|
262
272
|
# Initialization on CPU is much faster than the GPU
|
263
273
|
states_cpu = np.empty(shape=states.shape, dtype=xoroshiro128p_dtype)
|
@@ -267,7 +277,7 @@ def init_xoroshiro128p_states(states, seed, subsequence_start=0, stream=0):
|
|
267
277
|
|
268
278
|
|
269
279
|
def create_xoroshiro128p_states(n, seed, subsequence_start=0, stream=0):
|
270
|
-
|
280
|
+
"""Returns a new device array initialized for n random number generators.
|
271
281
|
|
272
282
|
This initializes the RNG states so that each state in the array corresponds
|
273
283
|
to subsequences separated by 2**64 steps from each other in the main
|
@@ -286,7 +296,7 @@ def create_xoroshiro128p_states(n, seed, subsequence_start=0, stream=0):
|
|
286
296
|
:param subsequence_start:
|
287
297
|
:type stream: CUDA stream
|
288
298
|
:param stream: stream to run initialization kernel on
|
289
|
-
|
299
|
+
"""
|
290
300
|
states = cuda.device_array(n, dtype=xoroshiro128p_dtype, stream=stream)
|
291
301
|
init_xoroshiro128p_states(states, seed, subsequence_start, stream)
|
292
302
|
return states
|
@@ -1 +1 @@
|
|
1
|
-
from numba.cuda.runtime.nrt import rtsys
|
1
|
+
from numba.cuda.runtime.nrt import rtsys # noqa: F401
|
@@ -33,7 +33,7 @@ extern "C" __device__ void* NRT_Allocate(size_t size)
|
|
33
33
|
{
|
34
34
|
void* ptr = NULL;
|
35
35
|
ptr = malloc(size);
|
36
|
-
if (TheMSys && TheMSys->stats.enabled) {
|
36
|
+
if (TheMSys && TheMSys->stats.enabled) {
|
37
37
|
TheMSys->stats.alloc.fetch_add(1, cuda::memory_order_relaxed); }
|
38
38
|
return ptr;
|
39
39
|
}
|
@@ -49,7 +49,7 @@ extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
|
|
49
49
|
mi->dtor_info = dtor_info;
|
50
50
|
mi->data = data;
|
51
51
|
mi->size = size;
|
52
|
-
if (TheMSys && TheMSys->stats.enabled) {
|
52
|
+
if (TheMSys && TheMSys->stats.enabled) {
|
53
53
|
TheMSys->stats.mi_alloc.fetch_add(1, cuda::memory_order_relaxed); }
|
54
54
|
}
|
55
55
|
|
@@ -77,7 +77,7 @@ extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
|
|
77
77
|
extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi)
|
78
78
|
{
|
79
79
|
NRT_dealloc(mi);
|
80
|
-
if (TheMSys && TheMSys->stats.enabled) {
|
80
|
+
if (TheMSys && TheMSys->stats.enabled) {
|
81
81
|
TheMSys->stats.mi_free.fetch_add(1, cuda::memory_order_relaxed); }
|
82
82
|
}
|
83
83
|
|
@@ -5,26 +5,28 @@ import numpy as np
|
|
5
5
|
|
6
6
|
from numba import cuda, config
|
7
7
|
from numba.core.runtime.nrt import _nrt_mstats
|
8
|
-
from numba.cuda.cudadrv.driver import (
|
9
|
-
|
8
|
+
from numba.cuda.cudadrv.driver import (
|
9
|
+
Linker,
|
10
|
+
driver,
|
11
|
+
launch_kernel,
|
12
|
+
USE_NV_BINDING,
|
13
|
+
)
|
10
14
|
from numba.cuda.cudadrv import devices
|
11
15
|
from numba.cuda.api import get_current_device
|
12
16
|
from numba.cuda.utils import _readenv
|
13
17
|
|
14
18
|
|
15
19
|
# Check environment variable or config for NRT statistics enablement
|
16
|
-
NRT_STATS = (
|
17
|
-
|
18
|
-
getattr(config, "NUMBA_CUDA_NRT_STATS", False)
|
20
|
+
NRT_STATS = _readenv("NUMBA_CUDA_NRT_STATS", bool, False) or getattr(
|
21
|
+
config, "NUMBA_CUDA_NRT_STATS", False
|
19
22
|
)
|
20
23
|
if not hasattr(config, "NUMBA_CUDA_NRT_STATS"):
|
21
24
|
config.CUDA_NRT_STATS = NRT_STATS
|
22
25
|
|
23
26
|
|
24
27
|
# Check environment variable or config for NRT enablement
|
25
|
-
ENABLE_NRT = (
|
26
|
-
|
27
|
-
getattr(config, "NUMBA_CUDA_ENABLE_NRT", False)
|
28
|
+
ENABLE_NRT = _readenv("NUMBA_CUDA_ENABLE_NRT", bool, False) or getattr(
|
29
|
+
config, "NUMBA_CUDA_ENABLE_NRT", False
|
28
30
|
)
|
29
31
|
if not hasattr(config, "NUMBA_CUDA_ENABLE_NRT"):
|
30
32
|
config.CUDA_ENABLE_NRT = ENABLE_NRT
|
@@ -35,16 +37,19 @@ def _alloc_init_guard(method):
|
|
35
37
|
"""
|
36
38
|
Ensure NRT memory allocation and initialization before running the method
|
37
39
|
"""
|
40
|
+
|
38
41
|
@wraps(method)
|
39
42
|
def wrapper(self, *args, **kwargs):
|
40
43
|
self.ensure_allocated()
|
41
44
|
self.ensure_initialized()
|
42
45
|
return method(self, *args, **kwargs)
|
46
|
+
|
43
47
|
return wrapper
|
44
48
|
|
45
49
|
|
46
50
|
class _Runtime:
|
47
51
|
"""Singleton class for Numba CUDA runtime"""
|
52
|
+
|
48
53
|
_instance = None
|
49
54
|
|
50
55
|
def __new__(cls, *args, **kwargs):
|
@@ -64,8 +69,7 @@ class _Runtime:
|
|
64
69
|
"""
|
65
70
|
# Define the path for memsys.cu
|
66
71
|
memsys_mod = os.path.join(
|
67
|
-
os.path.dirname(os.path.abspath(__file__)),
|
68
|
-
"memsys.cu"
|
72
|
+
os.path.dirname(os.path.abspath(__file__)), "memsys.cu"
|
69
73
|
)
|
70
74
|
cc = get_current_device().compute_capability
|
71
75
|
|
@@ -105,10 +109,12 @@ class _Runtime:
|
|
105
109
|
# Allocate space for NRT_MemSys
|
106
110
|
ptr, nbytes = self._memsys_module.get_global_symbol("memsys_size")
|
107
111
|
memsys_size = ctypes.c_uint64()
|
108
|
-
driver.cuMemcpyDtoH(
|
109
|
-
|
112
|
+
driver.cuMemcpyDtoH(
|
113
|
+
ctypes.addressof(memsys_size), ptr.device_ctypes_pointer, nbytes
|
114
|
+
)
|
110
115
|
self._memsys = device_array(
|
111
|
-
(memsys_size.value,), dtype="i1", stream=stream
|
116
|
+
(memsys_size.value,), dtype="i1", stream=stream
|
117
|
+
)
|
112
118
|
self.set_memsys_to_module(self._memsys_module, stream=stream)
|
113
119
|
|
114
120
|
def _single_thread_launch(self, module, stream, name, params=()):
|
@@ -121,12 +127,16 @@ class _Runtime:
|
|
121
127
|
func = module.get_function(name)
|
122
128
|
launch_kernel(
|
123
129
|
func.handle,
|
124
|
-
1,
|
125
|
-
1,
|
130
|
+
1,
|
131
|
+
1,
|
132
|
+
1,
|
133
|
+
1,
|
134
|
+
1,
|
135
|
+
1,
|
126
136
|
0,
|
127
137
|
stream.handle,
|
128
138
|
params,
|
129
|
-
cooperative=False
|
139
|
+
cooperative=False,
|
130
140
|
)
|
131
141
|
|
132
142
|
def _ctypes_pointer(self, array):
|
@@ -158,7 +168,8 @@ class _Runtime:
|
|
158
168
|
self.ensure_allocated()
|
159
169
|
|
160
170
|
self._single_thread_launch(
|
161
|
-
self._memsys_module, stream, "NRT_MemSys_init"
|
171
|
+
self._memsys_module, stream, "NRT_MemSys_init"
|
172
|
+
)
|
162
173
|
self._initialized = True
|
163
174
|
|
164
175
|
if config.CUDA_NRT_STATS:
|
@@ -170,7 +181,8 @@ class _Runtime:
|
|
170
181
|
Enable memsys statistics
|
171
182
|
"""
|
172
183
|
self._single_thread_launch(
|
173
|
-
self._memsys_module, stream, "NRT_MemSys_enable_stats"
|
184
|
+
self._memsys_module, stream, "NRT_MemSys_enable_stats"
|
185
|
+
)
|
174
186
|
|
175
187
|
@_alloc_init_guard
|
176
188
|
def memsys_disable_stats(self, stream=None):
|
@@ -178,7 +190,8 @@ class _Runtime:
|
|
178
190
|
Disable memsys statistics
|
179
191
|
"""
|
180
192
|
self._single_thread_launch(
|
181
|
-
self._memsys_module, stream, "NRT_MemSys_disable_stats"
|
193
|
+
self._memsys_module, stream, "NRT_MemSys_disable_stats"
|
194
|
+
)
|
182
195
|
|
183
196
|
@_alloc_init_guard
|
184
197
|
def memsys_stats_enabled(self, stream=None):
|
@@ -193,7 +206,7 @@ class _Runtime:
|
|
193
206
|
self._memsys_module,
|
194
207
|
stream,
|
195
208
|
"NRT_MemSys_stats_enabled",
|
196
|
-
(enabled_ptr,)
|
209
|
+
(enabled_ptr,),
|
197
210
|
)
|
198
211
|
|
199
212
|
cuda.synchronize()
|
@@ -204,21 +217,20 @@ class _Runtime:
|
|
204
217
|
"""
|
205
218
|
Copy all statistics of memsys to the host
|
206
219
|
"""
|
207
|
-
dt = np.dtype(
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
220
|
+
dt = np.dtype(
|
221
|
+
[
|
222
|
+
("alloc", np.uint64),
|
223
|
+
("free", np.uint64),
|
224
|
+
("mi_alloc", np.uint64),
|
225
|
+
("mi_free", np.uint64),
|
226
|
+
]
|
227
|
+
)
|
213
228
|
|
214
229
|
stats_for_read = cuda.managed_array(1, dt)
|
215
230
|
stats_ptr = self._ctypes_pointer(stats_for_read)
|
216
231
|
|
217
232
|
self._single_thread_launch(
|
218
|
-
self._memsys_module,
|
219
|
-
stream,
|
220
|
-
"NRT_MemSys_read",
|
221
|
-
[stats_ptr]
|
233
|
+
self._memsys_module, stream, "NRT_MemSys_read", [stats_ptr]
|
222
234
|
)
|
223
235
|
cuda.synchronize()
|
224
236
|
|
@@ -237,7 +249,7 @@ class _Runtime:
|
|
237
249
|
alloc=memsys["alloc"],
|
238
250
|
free=memsys["free"],
|
239
251
|
mi_alloc=memsys["mi_alloc"],
|
240
|
-
mi_free=memsys["mi_free"]
|
252
|
+
mi_free=memsys["mi_free"],
|
241
253
|
)
|
242
254
|
|
243
255
|
@_alloc_init_guard
|
@@ -249,10 +261,7 @@ class _Runtime:
|
|
249
261
|
got_ptr = self._ctypes_pointer(got)
|
250
262
|
|
251
263
|
self._single_thread_launch(
|
252
|
-
self._memsys_module,
|
253
|
-
stream,
|
254
|
-
f"NRT_MemSys_read_{stat}",
|
255
|
-
[got_ptr]
|
264
|
+
self._memsys_module, stream, f"NRT_MemSys_read_{stat}", [got_ptr]
|
256
265
|
)
|
257
266
|
|
258
267
|
cuda.synchronize()
|
@@ -309,15 +318,13 @@ class _Runtime:
|
|
309
318
|
"""
|
310
319
|
if self._memsys is None:
|
311
320
|
raise RuntimeError(
|
312
|
-
"Please allocate NRT Memsys first before setting to module."
|
321
|
+
"Please allocate NRT Memsys first before setting to module."
|
322
|
+
)
|
313
323
|
|
314
324
|
memsys_ptr = self._ctypes_pointer(self._memsys)
|
315
325
|
|
316
326
|
self._single_thread_launch(
|
317
|
-
module,
|
318
|
-
stream,
|
319
|
-
"NRT_MemSys_set",
|
320
|
-
[memsys_ptr]
|
327
|
+
module, stream, "NRT_MemSys_set", [memsys_ptr]
|
321
328
|
)
|
322
329
|
|
323
330
|
@_alloc_init_guard
|
@@ -327,9 +334,7 @@ class _Runtime:
|
|
327
334
|
"""
|
328
335
|
cuda.synchronize()
|
329
336
|
self._single_thread_launch(
|
330
|
-
self._memsys_module,
|
331
|
-
stream,
|
332
|
-
"NRT_MemSys_print"
|
337
|
+
self._memsys_module, stream, "NRT_MemSys_print"
|
333
338
|
)
|
334
339
|
|
335
340
|
|
@@ -3,14 +3,22 @@ import sys
|
|
3
3
|
from .api import *
|
4
4
|
from .vector_types import vector_types
|
5
5
|
from .reduction import Reduce
|
6
|
-
from .cudadrv.devicearray import (
|
7
|
-
|
8
|
-
|
6
|
+
from .cudadrv.devicearray import (
|
7
|
+
device_array,
|
8
|
+
device_array_like,
|
9
|
+
pinned,
|
10
|
+
pinned_array,
|
11
|
+
pinned_array_like,
|
12
|
+
mapped_array,
|
13
|
+
to_device,
|
14
|
+
auto_device,
|
15
|
+
)
|
9
16
|
from .cudadrv import devicearray
|
10
17
|
from .cudadrv.devices import require_context, gpus
|
11
18
|
from .cudadrv.devices import get_context as current_context
|
12
19
|
from .cudadrv.runtime import runtime
|
13
20
|
from numba.core import config
|
21
|
+
|
14
22
|
reduce = Reduce
|
15
23
|
|
16
24
|
# Register simulated vector types as module level variables
|
@@ -25,14 +33,16 @@ del vector_types, name, svty, alias
|
|
25
33
|
if config.ENABLE_CUDASIM:
|
26
34
|
import sys
|
27
35
|
from numba.cuda.simulator import cudadrv
|
28
|
-
|
29
|
-
sys.modules[
|
30
|
-
sys.modules[
|
31
|
-
sys.modules[
|
32
|
-
sys.modules[
|
33
|
-
sys.modules[
|
34
|
-
sys.modules[
|
35
|
-
sys.modules[
|
36
|
+
|
37
|
+
sys.modules["numba.cuda.cudadrv"] = cudadrv
|
38
|
+
sys.modules["numba.cuda.cudadrv.devicearray"] = cudadrv.devicearray
|
39
|
+
sys.modules["numba.cuda.cudadrv.devices"] = cudadrv.devices
|
40
|
+
sys.modules["numba.cuda.cudadrv.driver"] = cudadrv.driver
|
41
|
+
sys.modules["numba.cuda.cudadrv.runtime"] = cudadrv.runtime
|
42
|
+
sys.modules["numba.cuda.cudadrv.drvapi"] = cudadrv.drvapi
|
43
|
+
sys.modules["numba.cuda.cudadrv.error"] = cudadrv.error
|
44
|
+
sys.modules["numba.cuda.cudadrv.nvvm"] = cudadrv.nvvm
|
36
45
|
|
37
46
|
from . import compiler
|
38
|
-
|
47
|
+
|
48
|
+
sys.modules["numba.cuda.compiler"] = compiler
|