numba-cuda 0.0.0__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.pth +1 -0
- _numba_cuda_redirector.py +74 -0
- numba_cuda/VERSION +1 -0
- numba_cuda/__init__.py +5 -0
- numba_cuda/_version.py +19 -0
- numba_cuda/numba/cuda/__init__.py +22 -0
- numba_cuda/numba/cuda/api.py +526 -0
- numba_cuda/numba/cuda/api_util.py +30 -0
- numba_cuda/numba/cuda/args.py +77 -0
- numba_cuda/numba/cuda/cg.py +62 -0
- numba_cuda/numba/cuda/codegen.py +378 -0
- numba_cuda/numba/cuda/compiler.py +422 -0
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
- numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
- numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
- numba_cuda/numba/cuda/cuda_paths.py +258 -0
- numba_cuda/numba/cuda/cudadecl.py +806 -0
- numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
- numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
- numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
- numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
- numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
- numba_cuda/numba/cuda/cudadrv/error.py +36 -0
- numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
- numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
- numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
- numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
- numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
- numba_cuda/numba/cuda/cudaimpl.py +1055 -0
- numba_cuda/numba/cuda/cudamath.py +140 -0
- numba_cuda/numba/cuda/decorators.py +189 -0
- numba_cuda/numba/cuda/descriptor.py +33 -0
- numba_cuda/numba/cuda/device_init.py +89 -0
- numba_cuda/numba/cuda/deviceufunc.py +908 -0
- numba_cuda/numba/cuda/dispatcher.py +1057 -0
- numba_cuda/numba/cuda/errors.py +59 -0
- numba_cuda/numba/cuda/extending.py +7 -0
- numba_cuda/numba/cuda/initialize.py +13 -0
- numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
- numba_cuda/numba/cuda/intrinsics.py +198 -0
- numba_cuda/numba/cuda/kernels/__init__.py +0 -0
- numba_cuda/numba/cuda/kernels/reduction.py +262 -0
- numba_cuda/numba/cuda/kernels/transpose.py +65 -0
- numba_cuda/numba/cuda/libdevice.py +3382 -0
- numba_cuda/numba/cuda/libdevicedecl.py +17 -0
- numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
- numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
- numba_cuda/numba/cuda/mathimpl.py +448 -0
- numba_cuda/numba/cuda/models.py +48 -0
- numba_cuda/numba/cuda/nvvmutils.py +235 -0
- numba_cuda/numba/cuda/printimpl.py +86 -0
- numba_cuda/numba/cuda/random.py +292 -0
- numba_cuda/numba/cuda/simulator/__init__.py +38 -0
- numba_cuda/numba/cuda/simulator/api.py +110 -0
- numba_cuda/numba/cuda/simulator/compiler.py +9 -0
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
- numba_cuda/numba/cuda/simulator/kernel.py +308 -0
- numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
- numba_cuda/numba/cuda/simulator/reduction.py +15 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
- numba_cuda/numba/cuda/simulator_init.py +17 -0
- numba_cuda/numba/cuda/stubs.py +902 -0
- numba_cuda/numba/cuda/target.py +440 -0
- numba_cuda/numba/cuda/testing.py +202 -0
- numba_cuda/numba/cuda/tests/__init__.py +58 -0
- numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
- numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
- numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
- numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
- numba_cuda/numba/cuda/tests/data/error.cu +7 -0
- numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
- numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
- numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
- numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
- numba_cuda/numba/cuda/types.py +37 -0
- numba_cuda/numba/cuda/ufuncs.py +662 -0
- numba_cuda/numba/cuda/vector_types.py +209 -0
- numba_cuda/numba/cuda/vectorizers.py +252 -0
- numba_cuda-0.0.12.dist-info/LICENSE +25 -0
- numba_cuda-0.0.12.dist-info/METADATA +68 -0
- numba_cuda-0.0.12.dist-info/RECORD +231 -0
- {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
- numba_cuda-0.0.0.dist-info/METADATA +0 -6
- numba_cuda-0.0.0.dist-info/RECORD +0 -5
- {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,235 @@
|
|
1
|
+
import itertools
|
2
|
+
from llvmlite import ir
|
3
|
+
from numba.core import cgutils, targetconfig
|
4
|
+
from .cudadrv import nvvm
|
5
|
+
|
6
|
+
|
7
|
+
def declare_atomic_cas_int(lmod, isize):
    """Return the ``___numba_atomic_i<isize>_cas_hack`` function for ``lmod``.

    The function type is ``i<isize> (i<isize>*, i<isize>, i<isize>)`` and the
    function is obtained through ``cgutils.get_or_insert_function``.
    """
    int_ty = ir.IntType(isize)
    signature = ir.FunctionType(int_ty,
                                (ir.PointerType(int_ty), int_ty, int_ty))
    symbol = '___numba_atomic_i{}_cas_hack'.format(isize)
    return cgutils.get_or_insert_function(lmod, signature, symbol)
|
14
|
+
|
15
|
+
|
16
|
+
def atomic_cmpxchg(builder, lmod, isize, ptr, cmp, val):
    """Emit a ``cmpxchg`` on ``ptr`` and return element 0 of its result.

    Both orderings passed to ``builder.cmpxchg`` are ``'monotonic'``.
    ``lmod`` and ``isize`` are accepted but not used by this function.
    """
    ordering = 'monotonic'
    exchanged = builder.cmpxchg(ptr, cmp, val, ordering, ordering)
    return builder.extract_value(exchanged, 0)
|
19
|
+
|
20
|
+
|
21
|
+
def declare_atomic_add_float32(lmod):
    """Return ``llvm.nvvm.atomic.load.add.f32.p0f32`` for ``lmod``.

    The function type is ``float (float*, float)`` with the pointer in
    address space 0.
    """
    f32 = ir.FloatType()
    signature = ir.FunctionType(f32, (ir.PointerType(f32, 0), f32))
    return cgutils.get_or_insert_function(
        lmod, signature, 'llvm.nvvm.atomic.load.add.f32.p0f32')
|
26
|
+
|
27
|
+
|
28
|
+
def declare_atomic_add_float64(lmod):
    """Return the double-precision atomic add function for ``lmod``.

    The flags on top of ``targetconfig.ConfigStack`` decide the symbol:
    ``llvm.nvvm.atomic.load.add.f64.p0f64`` when ``compute_capability`` is at
    least ``(6, 0)``, otherwise ``___numba_atomic_double_add``. Either way the
    function type is ``double (double*, double)``.
    """
    active_flags = targetconfig.ConfigStack().top()
    symbol = ('llvm.nvvm.atomic.load.add.f64.p0f64'
              if active_flags.compute_capability >= (6, 0)
              else '___numba_atomic_double_add')
    f64 = ir.DoubleType()
    signature = ir.FunctionType(f64, (ir.PointerType(f64), f64))
    return cgutils.get_or_insert_function(lmod, signature, symbol)
|
37
|
+
|
38
|
+
|
39
|
+
def declare_atomic_sub_float32(lmod):
    """Return ``___numba_atomic_float_sub`` (``float (float*, float)``)."""
    f32 = ir.FloatType()
    signature = ir.FunctionType(f32, (ir.PointerType(f32), f32))
    return cgutils.get_or_insert_function(lmod, signature,
                                          '___numba_atomic_float_sub')
|
44
|
+
|
45
|
+
|
46
|
+
def declare_atomic_sub_float64(lmod):
    """Return ``___numba_atomic_double_sub`` (``double (double*, double)``)."""
    f64 = ir.DoubleType()
    signature = ir.FunctionType(f64, (ir.PointerType(f64), f64))
    return cgutils.get_or_insert_function(lmod, signature,
                                          '___numba_atomic_double_sub')
|
51
|
+
|
52
|
+
|
53
|
+
def declare_atomic_inc_int32(lmod):
    """Return ``llvm.nvvm.atomic.load.inc.32.p0i32`` (``i32 (i32*, i32)``)."""
    i32 = ir.IntType(32)
    signature = ir.FunctionType(i32, (ir.PointerType(i32), i32))
    return cgutils.get_or_insert_function(
        lmod, signature, 'llvm.nvvm.atomic.load.inc.32.p0i32')
|
58
|
+
|
59
|
+
|
60
|
+
def declare_atomic_inc_int64(lmod):
    """Return ``___numba_atomic_u64_inc`` (``i64 (i64*, i64)``)."""
    i64 = ir.IntType(64)
    signature = ir.FunctionType(i64, (ir.PointerType(i64), i64))
    return cgutils.get_or_insert_function(lmod, signature,
                                          '___numba_atomic_u64_inc')
|
65
|
+
|
66
|
+
|
67
|
+
def declare_atomic_dec_int32(lmod):
    """Return ``llvm.nvvm.atomic.load.dec.32.p0i32`` (``i32 (i32*, i32)``)."""
    i32 = ir.IntType(32)
    signature = ir.FunctionType(i32, (ir.PointerType(i32), i32))
    return cgutils.get_or_insert_function(
        lmod, signature, 'llvm.nvvm.atomic.load.dec.32.p0i32')
|
72
|
+
|
73
|
+
|
74
|
+
def declare_atomic_dec_int64(lmod):
    """Return ``___numba_atomic_u64_dec`` (``i64 (i64*, i64)``)."""
    i64 = ir.IntType(64)
    signature = ir.FunctionType(i64, (ir.PointerType(i64), i64))
    return cgutils.get_or_insert_function(lmod, signature,
                                          '___numba_atomic_u64_dec')
|
79
|
+
|
80
|
+
|
81
|
+
def declare_atomic_max_float32(lmod):
    """Return ``___numba_atomic_float_max`` (``float (float*, float)``)."""
    f32 = ir.FloatType()
    signature = ir.FunctionType(f32, (ir.PointerType(f32), f32))
    return cgutils.get_or_insert_function(lmod, signature,
                                          '___numba_atomic_float_max')
|
86
|
+
|
87
|
+
|
88
|
+
def declare_atomic_max_float64(lmod):
    """Return ``___numba_atomic_double_max`` (``double (double*, double)``)."""
    f64 = ir.DoubleType()
    signature = ir.FunctionType(f64, (ir.PointerType(f64), f64))
    return cgutils.get_or_insert_function(lmod, signature,
                                          '___numba_atomic_double_max')
|
93
|
+
|
94
|
+
|
95
|
+
def declare_atomic_min_float32(lmod):
    """Return ``___numba_atomic_float_min`` (``float (float*, float)``)."""
    f32 = ir.FloatType()
    signature = ir.FunctionType(f32, (ir.PointerType(f32), f32))
    return cgutils.get_or_insert_function(lmod, signature,
                                          '___numba_atomic_float_min')
|
100
|
+
|
101
|
+
|
102
|
+
def declare_atomic_min_float64(lmod):
    """Return ``___numba_atomic_double_min`` (``double (double*, double)``)."""
    f64 = ir.DoubleType()
    signature = ir.FunctionType(f64, (ir.PointerType(f64), f64))
    return cgutils.get_or_insert_function(lmod, signature,
                                          '___numba_atomic_double_min')
|
107
|
+
|
108
|
+
|
109
|
+
def declare_atomic_nanmax_float32(lmod):
    """Return ``___numba_atomic_float_nanmax`` (``float (float*, float)``)."""
    f32 = ir.FloatType()
    signature = ir.FunctionType(f32, (ir.PointerType(f32), f32))
    return cgutils.get_or_insert_function(lmod, signature,
                                          '___numba_atomic_float_nanmax')
|
114
|
+
|
115
|
+
|
116
|
+
def declare_atomic_nanmax_float64(lmod):
    """Return ``___numba_atomic_double_nanmax``.

    The function type is ``double (double*, double)``.
    """
    f64 = ir.DoubleType()
    signature = ir.FunctionType(f64, (ir.PointerType(f64), f64))
    return cgutils.get_or_insert_function(lmod, signature,
                                          '___numba_atomic_double_nanmax')
|
121
|
+
|
122
|
+
|
123
|
+
def declare_atomic_nanmin_float32(lmod):
    """Return ``___numba_atomic_float_nanmin`` (``float (float*, float)``)."""
    f32 = ir.FloatType()
    signature = ir.FunctionType(f32, (ir.PointerType(f32), f32))
    return cgutils.get_or_insert_function(lmod, signature,
                                          '___numba_atomic_float_nanmin')
|
128
|
+
|
129
|
+
|
130
|
+
def declare_atomic_nanmin_float64(lmod):
    """Return ``___numba_atomic_double_nanmin``.

    The function type is ``double (double*, double)``.
    """
    f64 = ir.DoubleType()
    signature = ir.FunctionType(f64, (ir.PointerType(f64), f64))
    return cgutils.get_or_insert_function(lmod, signature,
                                          '___numba_atomic_double_nanmin')
|
135
|
+
|
136
|
+
|
137
|
+
def declare_cudaCGGetIntrinsicHandle(lmod):
    """Return ``cudaCGGetIntrinsicHandle`` (``i64 (i32)``) for ``lmod``."""
    signature = ir.FunctionType(ir.IntType(64), (ir.IntType(32),))
    return cgutils.get_or_insert_function(lmod, signature,
                                          'cudaCGGetIntrinsicHandle')
|
142
|
+
|
143
|
+
|
144
|
+
def declare_cudaCGSynchronize(lmod):
    """Return ``cudaCGSynchronize`` (``i32 (i64, i32)``) for ``lmod``."""
    i32 = ir.IntType(32)
    signature = ir.FunctionType(i32, (ir.IntType(64), i32))
    return cgutils.get_or_insert_function(lmod, signature,
                                          'cudaCGSynchronize')
|
149
|
+
|
150
|
+
|
151
|
+
def declare_string(builder, value):
    """Store ``value`` as a global constant and return a generic ``i8*`` to it.

    ``value`` is UTF-8 encoded and NUL-terminated, then placed in a global
    named ``_str`` in ``nvvm.ADDRSPACE_CONSTANT`` of the module that owns the
    builder's current function. The global has internal linkage and is marked
    constant. The returned value is the global address-space-cast to ``i8*``
    in the ``'generic'`` address space.
    """
    module = builder.basic_block.function.module
    payload = cgutils.make_bytearray(value.encode("utf-8") + b"\x00")

    const_global = cgutils.add_global_variable(
        module, payload.type, name="_str",
        addrspace=nvvm.ADDRSPACE_CONSTANT)
    const_global.linkage = 'internal'
    const_global.global_constant = True
    const_global.initializer = payload

    char_ptr = ir.PointerType(ir.IntType(8))
    return builder.addrspacecast(const_global, char_ptr, 'generic')
|
161
|
+
|
162
|
+
|
163
|
+
def declare_vprint(lmod):
    """Return ``vprintf`` (``i32 (i8*, i8*)``) for ``lmod``.

    The first argument is the format; the second points to the
    variable-length array of arguments that follow the format.
    """
    char_ptr = ir.PointerType(ir.IntType(8))
    signature = ir.FunctionType(ir.IntType(32), [char_ptr, char_ptr])
    return cgutils.get_or_insert_function(lmod, signature, "vprintf")
|
170
|
+
|
171
|
+
|
172
|
+
# -----------------------------------------------------------------------------
|
173
|
+
|
174
|
+
# Maps each short special-register name to the NVVM intrinsic that reads it.
# Every intrinsic name is the common 'llvm.nvvm.read.ptx.sreg.' prefix followed
# by the short name, so the table is generated from the list of short names.
SREG_MAPPING = {
    sreg: 'llvm.nvvm.read.ptx.sreg.' + sreg
    for sreg in (
        'tid.x', 'tid.y', 'tid.z',
        'ntid.x', 'ntid.y', 'ntid.z',
        'ctaid.x', 'ctaid.y', 'ctaid.z',
        'nctaid.x', 'nctaid.y', 'nctaid.z',
        'warpsize',
        'laneid',
    )
}
|
194
|
+
|
195
|
+
|
196
|
+
def call_sreg(builder, name):
    """Emit a call reading the special register ``name`` and return it.

    ``name`` must be a key of ``SREG_MAPPING``; the mapped intrinsic is
    obtained in ``builder.module`` with type ``i32 ()`` and called with no
    arguments.
    """
    reader_ty = ir.FunctionType(ir.IntType(32), ())
    reader = cgutils.get_or_insert_function(builder.module, reader_ty,
                                            SREG_MAPPING[name])
    return builder.call(reader, ())
|
201
|
+
|
202
|
+
|
203
|
+
class SRegBuilder(object):
    """Helper that reads special registers through ``call_sreg``.

    Each accessor takes the axis suffix (``'x'``, ``'y'`` or ``'z'``) and
    emits the read via the builder given at construction.
    """

    def __init__(self, builder):
        # Builder used for every instruction this helper emits.
        self.builder = builder

    def tid(self, xyz):
        """Read ``tid.<xyz>``."""
        return call_sreg(self.builder, 'tid.{}'.format(xyz))

    def ctaid(self, xyz):
        """Read ``ctaid.<xyz>``."""
        return call_sreg(self.builder, 'ctaid.{}'.format(xyz))

    def ntid(self, xyz):
        """Read ``ntid.<xyz>``."""
        return call_sreg(self.builder, 'ntid.{}'.format(xyz))

    def nctaid(self, xyz):
        """Read ``nctaid.<xyz>``."""
        return call_sreg(self.builder, 'nctaid.{}'.format(xyz))

    def getdim(self, xyz):
        """Return ``ntid * ctaid + tid`` along ``xyz`` as a 64-bit value.

        Each 32-bit register read is sign-extended to ``i64`` before the
        multiply and add.
        """
        emit = self.builder
        wide = ir.IntType(64)
        # Reads are issued in the order tid, ntid, ctaid.
        tid64 = emit.sext(self.tid(xyz), wide)
        ntid64 = emit.sext(self.ntid(xyz), wide)
        ctaid64 = emit.sext(self.ctaid(xyz), wide)
        return emit.add(emit.mul(ntid64, ctaid64), tid64)
|
226
|
+
|
227
|
+
|
228
|
+
def get_global_id(builder, dim):
    """Return the ``SRegBuilder.getdim`` value(s) for the first ``dim`` axes.

    Axes are taken in the order x, y, z. When ``dim`` is 1 the single value
    is returned directly; otherwise a list of values is returned.
    """
    per_axis = map(SRegBuilder(builder).getdim, 'xyz')
    # The lazy map plus islice ensures code is emitted only for the axes used.
    ids = list(itertools.islice(per_axis, dim))
    return ids[0] if dim == 1 else ids
|
@@ -0,0 +1,86 @@
|
|
1
|
+
from functools import singledispatch
|
2
|
+
from llvmlite import ir
|
3
|
+
from numba.core import types, cgutils
|
4
|
+
from numba.core.errors import NumbaWarning
|
5
|
+
from numba.core.imputils import Registry
|
6
|
+
from numba.cuda import nvvmutils
|
7
|
+
from warnings import warn
|
8
|
+
|
9
|
+
# Lowering registry for this module; ``lower`` is its registration decorator.
registry = Registry()
lower = registry.lower

# LLVM ``i8*`` type, used for the pointers passed to vprintf.
voidptr = ir.IntType(8).as_pointer()
|
13
|
+
|
14
|
+
|
15
|
+
# NOTE: we don't use @lower here since print_item() doesn't return a LLVM value
|
16
|
+
|
17
|
+
@singledispatch
def print_item(ty, context, builder, val):
    """
    Produce the printf pieces for one value of Numba type ``ty``.

    Registered implementations return a ``(format string, [arguments])``
    pair from which the final printf()-like call is assembled. This
    fallback, used when no implementation is registered for the type,
    always raises NotImplementedError.
    """
    message = "printing unimplemented for values of type %s" % (ty,)
    raise NotImplementedError(message)
|
26
|
+
|
27
|
+
|
28
|
+
@print_item.register(types.Integer)
@print_item.register(types.IntegerLiteral)
def int_print_impl(ty, context, builder, val):
    """Format an integer value for printing.

    Types in ``types.unsigned_domain`` are cast to ``uint64`` and printed
    with ``%llu``; all others are cast to ``int64`` and printed with
    ``%lld``.
    """
    is_unsigned = ty in types.unsigned_domain
    rawfmt, dsttype = (("%llu", types.uint64) if is_unsigned
                       else ("%lld", types.int64))
    widened = context.cast(builder, val, ty, dsttype)
    return rawfmt, [widened]
|
39
|
+
|
40
|
+
|
41
|
+
@print_item.register(types.Float)
def real_print_impl(ty, context, builder, val):
    """Format a float value: cast it to ``float64`` and print with ``%f``."""
    as_double = context.cast(builder, val, ty, types.float64)
    return "%f", [as_double]
|
45
|
+
|
46
|
+
|
47
|
+
@print_item.register(types.StringLiteral)
def const_print_impl(ty, context, builder, sigval):
    """Format a string literal: emit it as a constant printed with ``%s``.

    The text comes from ``ty.literal_value``; ``sigval`` is not used.
    """
    text = ty.literal_value
    assert isinstance(text, str)  # Ensured by lowering
    const_ptr = context.insert_string_const_addrspace(builder, text)
    return "%s", [const_ptr]
|
54
|
+
|
55
|
+
|
56
|
+
@lower(print, types.VarArg(types.Any))
def print_varargs(context, builder, sig, args):
    """Lower a ``print()`` call with arbitrary argument types to ``vprintf``.

    Each argument is dispatched to the matching ``print_item``
    implementation, which supplies a format fragment and the values to pass.
    Fragments are joined with single spaces and terminated with a newline.

    When more than 32 items are passed, a ``NumbaWarning`` is issued at
    compile time and every ``%`` in the format is escaped, so the kernel
    emits the raw format string instead of the formatted values.

    Returns the context's dummy value.
    """
    vprint = nvvmutils.declare_vprint(builder.module)

    formats = []
    values = []

    for argtype, argval in zip(sig.args, args):
        argfmt, argvals = print_item(argtype, context, builder, argval)
        formats.append(argfmt)
        values.extend(argvals)

    rawfmt = " ".join(formats) + "\n"
    if len(args) > 32:
        msg = ('CUDA print() cannot print more than 32 items. '
               'The raw format string will be emitted by the kernel instead.')
        warn(msg, NumbaWarning)

        # Doubling each '%' makes vprintf print the format text literally.
        rawfmt = rawfmt.replace('%', '%%')
    fmt = context.insert_string_const_addrspace(builder, rawfmt)

    # The values are packed into one anonymous struct on the stack and
    # passed to vprintf as an opaque pointer.
    array = cgutils.make_anonymous_struct(builder, values)
    arrayptr = cgutils.alloca_once_value(builder, array)

    # ``vprint`` was already declared above; it is not declared again here.
    builder.call(vprint, (fmt, builder.bitcast(arrayptr, voidptr)))

    return context.get_dummy_value()
|
@@ -0,0 +1,292 @@
|
|
1
|
+
import math
|
2
|
+
|
3
|
+
from numba import (config, cuda, float32, float64, uint32, int64, uint64,
|
4
|
+
from_dtype, jit)
|
5
|
+
|
6
|
+
import numpy as np
|
7
|
+
|
8
|
+
# This implementation is based upon the xoroshiro128+ and splitmix64 algorithms
|
9
|
+
# described at:
|
10
|
+
#
|
11
|
+
# http://xoroshiro.di.unimi.it/
|
12
|
+
#
|
13
|
+
# and originally implemented by David Blackman and Sebastiano Vigna.
|
14
|
+
#
|
15
|
+
# The implementations below are based on the C source code:
|
16
|
+
#
|
17
|
+
# * http://xoroshiro.di.unimi.it/xoroshiro128plus.c
|
18
|
+
# * http://xoroshiro.di.unimi.it/splitmix64.c
|
19
|
+
#
|
20
|
+
# Splitmix64 is used to generate the initial state of the xoroshiro128+
|
21
|
+
# generator to ensure that small seeds don't result in predictable output.
|
22
|
+
|
23
|
+
# **WARNING**: There is a lot of verbose casting in this file to ensure that
|
24
|
+
# NumPy casting conventions (which cast uint64 [op] int32 to float64) don't
|
25
|
+
# turn integers into floats when using these functions in the CUDA simulator.
|
26
|
+
#
|
27
|
+
# There are also no function type signatures to ensure that compilation is
|
28
|
+
# deferred so that import is quick, and Sphinx autodoc works. We are also
|
29
|
+
# using the CPU @jit decorator everywhere to create functions that work as
|
30
|
+
# both CPU and CUDA device functions.
|
31
|
+
|
32
|
+
# Record dtype holding one generator's state: two unsigned 64-bit words,
# ``s0`` and ``s1``, created with align=True.
xoroshiro128p_dtype = np.dtype(
    [
        ('s0', np.uint64),
        ('s1', np.uint64),
    ],
    align=True,
)
# Numba type derived from the NumPy record dtype.
xoroshiro128p_type = from_dtype(xoroshiro128p_dtype)
|
35
|
+
|
36
|
+
# When cudasim is enabled, Fake CUDA arrays are passed to some of the
|
37
|
+
# @jit-decorated functions. This required fallback to object mode. With
|
38
|
+
# Numba 0.59.0 object mode must be explicitly enabled.
|
39
|
+
# https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit
|
40
|
+
# In order to avoid the warning / future error, we explicitly specify that
|
41
|
+
# object mode with loop lifting is acceptable when using the simulator.
|
42
|
+
# Object mode with loop lifting is requested exactly when the simulator is
# enabled; nopython mode is requested otherwise.
_forceobj = config.ENABLE_CUDASIM
_looplift = _forceobj
_nopython = not _forceobj
|
44
|
+
|
45
|
+
|
46
|
+
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def init_xoroshiro128p_state(states, index, seed):
    '''Use SplitMix64 to generate an xoroshiro128p state from 64-bit seed.

    This ensures that manually set small seeds don't result in a predictable
    initial sequence from the random number generator.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :type seed: int64
    :param seed: seed value to use when initializing state
    '''
    # The index is cast to int64 and the seed to uint64 before use.
    index = int64(index)
    seed = uint64(seed)

    # Explicit uint64/uint32 casts keep every intermediate an integer (see
    # the casting warning at the top of the file).
    z = seed + uint64(0x9E3779B97F4A7C15)
    z = (z ^ (z >> uint32(30))) * uint64(0xBF58476D1CE4E5B9)
    z = (z ^ (z >> uint32(27))) * uint64(0x94D049BB133111EB)
    z = z ^ (z >> uint32(31))

    # Both state words receive the same mixed value.
    states[index]['s0'] = z
    states[index]['s1'] = z
|
70
|
+
|
71
|
+
|
72
|
+
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def rotl(x, k):
    '''Rotate the 64-bit value x left by k bits.'''
    x = uint64(x)
    k = uint32(k)
    # Bits shifted out on the left re-enter on the right.
    upper = x << k
    lower = x >> uint32(64 - k)
    return upper | lower
|
78
|
+
|
79
|
+
|
80
|
+
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_next(states, index):
    '''Advance the RNG in ``states[index]`` and return the next uint64.

    The returned value is the sum of the two state words as they were
    before the update.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: uint64
    '''
    index = int64(index)
    word0 = states[index]['s0']
    word1 = states[index]['s1']
    output = word0 + word1

    # The new state is derived from the old s0 and the xor of both words.
    mixed = word1 ^ word0
    rotated0 = uint64(rotl(word0, uint32(55)))
    states[index]['s0'] = rotated0 ^ mixed ^ (mixed << uint32(14))
    states[index]['s1'] = uint64(rotl(mixed, uint32(36)))

    return output
|
100
|
+
|
101
|
+
|
102
|
+
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_jump(states, index):
    '''Jump the RNG in ``states[index]`` ahead by 2**64 steps.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    '''
    index = int64(index)

    jump = (uint64(0xbeac0467eba5facb), uint64(0xd86b048b86aa9922))

    acc0 = uint64(0)
    acc1 = uint64(0)

    # For each bit of the two jump constants, fold the current state into
    # the accumulators when the bit is set, then step the generator once.
    for word in range(2):
        for bit in range(64):
            mask = uint64(1) << uint32(bit)
            if jump[word] & mask:
                acc0 ^= states[index]['s0']
                acc1 ^= states[index]['s1']
            xoroshiro128p_next(states, index)

    states[index]['s0'] = acc0
    states[index]['s1'] = acc1
|
127
|
+
|
128
|
+
|
129
|
+
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def uint64_to_unit_float64(x):
    '''Map a uint64 to a float64 value in the range [0.0, 1.0)'''
    x = uint64(x)
    # Keep the top 53 bits and scale them by 1 / 2**53.
    top_bits = x >> uint32(11)
    scale = float64(1) / (uint64(1) << uint32(53))
    return top_bits * scale
|
134
|
+
|
135
|
+
|
136
|
+
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def uint64_to_unit_float32(x):
    '''Convert uint64 to float32 value in the range [0.0, 1.0)

    The value is first converted with ``uint64_to_unit_float64`` and then
    narrowed to float32. Narrowing a float64 that lies just below 1.0 can
    round up to exactly 1.0, which would break the half-open range, so that
    case is clamped to the largest float32 below 1.0. All other results are
    the plain narrowed value.
    '''
    x = uint64(x)
    unit = float32(uint64_to_unit_float64(x))
    if unit >= float32(1.0):
        # 1 - 2**-24: the largest float32 strictly less than 1.0
        unit = float32(0.99999994)
    return unit
|
141
|
+
|
142
|
+
|
143
|
+
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_uniform_float32(states, index):
    '''Return a float32 in range [0.0, 1.0) and advance ``states[index]``.

    One uint64 is drawn from the generator and converted with
    ``uint64_to_unit_float32``.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: float32
    '''
    index = int64(index)
    raw = xoroshiro128p_next(states, index)
    return uint64_to_unit_float32(raw)
|
155
|
+
|
156
|
+
|
157
|
+
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_uniform_float64(states, index):
    '''Return a float64 in range [0.0, 1.0) and advance ``states[index]``.

    One uint64 is drawn from the generator and converted with
    ``uint64_to_unit_float64``.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: float64
    '''
    index = int64(index)
    raw = xoroshiro128p_next(states, index)
    return uint64_to_unit_float64(raw)
|
169
|
+
|
170
|
+
|
171
|
+
# 2*pi at double and single precision, used as the angle scale in the
# Box-Muller transforms.
TWO_PI_FLOAT64 = np.float64(2 * math.pi)
TWO_PI_FLOAT32 = np.float32(TWO_PI_FLOAT64)
|
173
|
+
|
174
|
+
|
175
|
+
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_normal_float32(states, index):
    '''Return a normally distributed float32 and advance ``states[index]``.

    The return value is drawn from a Gaussian of mean=0 and sigma=1 using the
    Box-Muller transform.  This advances the RNG sequence by two steps.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: float32
    '''
    index = int64(index)

    first = xoroshiro128p_uniform_float32(states, index)
    second = xoroshiro128p_uniform_float32(states, index)

    radius = math.sqrt(-float32(2.0) * math.log(first))
    angle = TWO_PI_FLOAT32 * second

    # Box-Muller yields a cosine and a sine variate from the same radius and
    # angle; only the cosine one is computed and returned here.
    return radius * math.cos(angle)
|
198
|
+
|
199
|
+
|
200
|
+
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_normal_float64(states, index):
    '''Return a normally distributed float64 and advance ``states[index]``.

    The return value is drawn from a Gaussian of mean=0 and sigma=1 using the
    Box-Muller transform.  This advances the RNG sequence by two steps.

    Both uniform inputs are drawn at float64 precision so that the result
    is not limited to the resolution of float32 uniforms.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: float64
    '''
    index = int64(index)

    u1 = xoroshiro128p_uniform_float64(states, index)
    u2 = xoroshiro128p_uniform_float64(states, index)

    radius = math.sqrt(-float64(2.0) * math.log(u1))
    angle = TWO_PI_FLOAT64 * u2

    # Box-Muller yields a cosine and a sine variate from the same radius and
    # angle; only the cosine one is computed and returned here.
    return radius * math.cos(angle)
|
223
|
+
|
224
|
+
|
225
|
+
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def init_xoroshiro128p_states_cpu(states, seed, subsequence_start):
    '''Fill ``states`` with a chain of generator states derived from ``seed``.

    ``states[0]`` is seeded with ``init_xoroshiro128p_state`` and jumped
    ``subsequence_start`` times.  Every later entry starts as a copy of its
    predecessor and is jumped once more.  Nothing is written when ``states``
    has no entries.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states to fill in place
    :type seed: uint64
    :param seed: starting seed for list of generators
    :type subsequence_start: uint64
    :param subsequence_start: number of jumps applied to the first state
    '''
    count = states.shape[0]
    seed = uint64(seed)
    subsequence_start = uint64(subsequence_start)

    if count < 1:
        return

    init_xoroshiro128p_state(states, 0, seed)

    # advance to starting subsequence number
    for _ in range(subsequence_start):
        xoroshiro128p_jump(states, 0)

    # populate the rest of the array
    for pos in range(1, count):
        states[pos] = states[pos - 1]  # take state of previous generator
        xoroshiro128p_jump(states, pos)  # and jump forward 2**64 steps
|
242
|
+
|
243
|
+
|
244
|
+
def init_xoroshiro128p_states(states, seed, subsequence_start=0, stream=0):
    '''Initialize RNG states on the GPU for parallel generators.

    This initializes the RNG states so that each state in the array
    corresponds to a subsequence separated by 2**64 steps from its
    neighbours in the main sequence.  Therefore, as long as no CUDA thread
    requests more than 2**64 random numbers, all of the RNG states produced
    by this function are guaranteed to be independent.

    The subsequence_start parameter can be used to advance the first RNG state
    by a multiple of 2**64 steps.

    :type states: 1D DeviceNDArray, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type seed: uint64
    :param seed: starting seed for list of generators
    :type subsequence_start: uint64
    :param subsequence_start: number of 2**64-step jumps applied to the
                              first state
    :type stream: CUDA stream
    :param stream: stream passed to the host-to-device copy of the states
    '''
    # Initialization on CPU is much faster than the GPU, so the states are
    # built in a host array and then copied into ``states``.
    host_states = np.empty(states.shape, dtype=xoroshiro128p_dtype)
    init_xoroshiro128p_states_cpu(host_states, seed, subsequence_start)

    states.copy_to_device(host_states, stream=stream)
|
267
|
+
|
268
|
+
|
269
|
+
def create_xoroshiro128p_states(n, seed, subsequence_start=0, stream=0):
    '''Returns a new device array initialized for n random number generators.

    This initializes the RNG states so that each state in the array
    corresponds to a subsequence separated by 2**64 steps from its
    neighbours in the main sequence.  Therefore, as long as no CUDA thread
    requests more than 2**64 random numbers, all of the RNG states produced
    by this function are guaranteed to be independent.

    The subsequence_start parameter can be used to advance the first RNG state
    by a multiple of 2**64 steps.

    :type n: int
    :param n: number of RNG states to create
    :type seed: uint64
    :param seed: starting seed for list of generators
    :type subsequence_start: uint64
    :param subsequence_start: number of 2**64-step jumps applied to the
                              first state
    :type stream: CUDA stream
    :param stream: stream used for the device allocation and for
                   initializing the states
    '''
    new_states = cuda.device_array(n, dtype=xoroshiro128p_dtype, stream=stream)
    init_xoroshiro128p_states(new_states, seed,
                              subsequence_start=subsequence_start,
                              stream=stream)
    return new_states
|
@@ -0,0 +1,38 @@
|
|
1
|
+
import sys
|
2
|
+
|
3
|
+
from .api import *
|
4
|
+
from .vector_types import vector_types
|
5
|
+
from .reduction import Reduce
|
6
|
+
from .cudadrv.devicearray import (device_array, device_array_like, pinned,
|
7
|
+
pinned_array, pinned_array_like,
|
8
|
+
mapped_array, to_device, auto_device)
|
9
|
+
from .cudadrv import devicearray
|
10
|
+
from .cudadrv.devices import require_context, gpus
|
11
|
+
from .cudadrv.devices import get_context as current_context
|
12
|
+
from .cudadrv.runtime import runtime
|
13
|
+
from numba.core import config
|
14
|
+
reduce = Reduce

# Register simulated vector types as module level variables: each type is
# bound under its own name and under every one of its aliases.
_sim_module = sys.modules[__name__]
for name, svty in vector_types.items():
    setattr(_sim_module, name, svty)
    for alias in svty.aliases:
        setattr(_sim_module, alias, svty)
# Remove the loop temporaries so they do not remain as module attributes.
del vector_types, name, svty, alias, _sim_module
|
22
|
+
|
23
|
+
# Ensure that any user code attempting to import cudadrv etc. gets the
# simulator's version and not the real version if the simulator is enabled.
if config.ENABLE_CUDASIM:
    from numba.cuda.simulator import cudadrv

    # Point the real driver package and its submodules at the simulator's
    # counterparts.
    sys.modules.update({
        'numba.cuda.cudadrv': cudadrv,
        'numba.cuda.cudadrv.devicearray': cudadrv.devicearray,
        'numba.cuda.cudadrv.devices': cudadrv.devices,
        'numba.cuda.cudadrv.driver': cudadrv.driver,
        'numba.cuda.cudadrv.runtime': cudadrv.runtime,
        'numba.cuda.cudadrv.drvapi': cudadrv.drvapi,
        'numba.cuda.cudadrv.error': cudadrv.error,
        'numba.cuda.cudadrv.nvvm': cudadrv.nvvm,
    })

    # The compiler module is imported only once the cudadrv entries above
    # are in place, and is then registered the same way.
    from . import compiler
    sys.modules['numba.cuda.compiler'] = compiler
|