numba-cuda 0.0.1__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.pth +1 -0
- _numba_cuda_redirector.py +74 -0
- numba_cuda/VERSION +1 -0
- numba_cuda/__init__.py +5 -0
- numba_cuda/_version.py +19 -0
- numba_cuda/numba/cuda/__init__.py +22 -0
- numba_cuda/numba/cuda/api.py +526 -0
- numba_cuda/numba/cuda/api_util.py +30 -0
- numba_cuda/numba/cuda/args.py +77 -0
- numba_cuda/numba/cuda/cg.py +62 -0
- numba_cuda/numba/cuda/codegen.py +378 -0
- numba_cuda/numba/cuda/compiler.py +422 -0
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
- numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
- numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
- numba_cuda/numba/cuda/cuda_paths.py +258 -0
- numba_cuda/numba/cuda/cudadecl.py +806 -0
- numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
- numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
- numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
- numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
- numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
- numba_cuda/numba/cuda/cudadrv/error.py +36 -0
- numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
- numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
- numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
- numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
- numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
- numba_cuda/numba/cuda/cudaimpl.py +1055 -0
- numba_cuda/numba/cuda/cudamath.py +140 -0
- numba_cuda/numba/cuda/decorators.py +189 -0
- numba_cuda/numba/cuda/descriptor.py +33 -0
- numba_cuda/numba/cuda/device_init.py +89 -0
- numba_cuda/numba/cuda/deviceufunc.py +908 -0
- numba_cuda/numba/cuda/dispatcher.py +1057 -0
- numba_cuda/numba/cuda/errors.py +59 -0
- numba_cuda/numba/cuda/extending.py +7 -0
- numba_cuda/numba/cuda/initialize.py +13 -0
- numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
- numba_cuda/numba/cuda/intrinsics.py +198 -0
- numba_cuda/numba/cuda/kernels/__init__.py +0 -0
- numba_cuda/numba/cuda/kernels/reduction.py +262 -0
- numba_cuda/numba/cuda/kernels/transpose.py +65 -0
- numba_cuda/numba/cuda/libdevice.py +3382 -0
- numba_cuda/numba/cuda/libdevicedecl.py +17 -0
- numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
- numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
- numba_cuda/numba/cuda/mathimpl.py +448 -0
- numba_cuda/numba/cuda/models.py +48 -0
- numba_cuda/numba/cuda/nvvmutils.py +235 -0
- numba_cuda/numba/cuda/printimpl.py +86 -0
- numba_cuda/numba/cuda/random.py +292 -0
- numba_cuda/numba/cuda/simulator/__init__.py +38 -0
- numba_cuda/numba/cuda/simulator/api.py +110 -0
- numba_cuda/numba/cuda/simulator/compiler.py +9 -0
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
- numba_cuda/numba/cuda/simulator/kernel.py +308 -0
- numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
- numba_cuda/numba/cuda/simulator/reduction.py +15 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
- numba_cuda/numba/cuda/simulator_init.py +17 -0
- numba_cuda/numba/cuda/stubs.py +902 -0
- numba_cuda/numba/cuda/target.py +440 -0
- numba_cuda/numba/cuda/testing.py +202 -0
- numba_cuda/numba/cuda/tests/__init__.py +58 -0
- numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
- numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
- numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
- numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
- numba_cuda/numba/cuda/tests/data/error.cu +7 -0
- numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
- numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
- numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
- numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
- numba_cuda/numba/cuda/types.py +37 -0
- numba_cuda/numba/cuda/ufuncs.py +662 -0
- numba_cuda/numba/cuda/vector_types.py +209 -0
- numba_cuda/numba/cuda/vectorizers.py +252 -0
- numba_cuda-0.0.12.dist-info/LICENSE +25 -0
- numba_cuda-0.0.12.dist-info/METADATA +68 -0
- numba_cuda-0.0.12.dist-info/RECORD +231 -0
- {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
- numba_cuda-0.0.1.dist-info/METADATA +0 -10
- numba_cuda-0.0.1.dist-info/RECORD +0 -5
- {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,806 @@
|
|
1
|
+
import operator
|
2
|
+
from numba.core import types
|
3
|
+
from numba.core.typing.npydecl import (parse_dtype, parse_shape,
|
4
|
+
register_number_classes,
|
5
|
+
register_numpy_ufunc,
|
6
|
+
trigonometric_functions,
|
7
|
+
comparison_functions,
|
8
|
+
math_operations,
|
9
|
+
bit_twiddling_functions)
|
10
|
+
from numba.core.typing.templates import (AttributeTemplate, ConcreteTemplate,
|
11
|
+
AbstractTemplate, CallableTemplate,
|
12
|
+
signature, Registry)
|
13
|
+
from numba.cuda.types import dim3
|
14
|
+
from numba.core.typeconv import Conversion
|
15
|
+
from numba import cuda
|
16
|
+
from numba.cuda.compiler import declare_device_function_template
|
17
|
+
|
18
|
+
# Template registry for this module, with short aliases for its three
# registration entry points (used as decorators throughout the file).
registry = Registry()
register = registry.register
register_attr = registry.register_attr
register_global = registry.register_global

# Hand the global-registration hook to the number-class registration helper.
register_number_classes(register_global)
|
24
|
+
|
25
|
+
|
26
|
+
class Cuda_array_decl(CallableTemplate):
    """Shared typing logic for array constructors taking ``(shape, dtype)``.

    The typer accepts a shape only when it is an integer literal or a tuple
    made up entirely of integer literals; any other shape yields ``None``.
    """

    def generic(self):
        def typer(shape, dtype):
            # Decide first whether the shape is made of integer literals
            # only; everything else is rejected before parsing.
            if isinstance(shape, types.Integer):
                literal_shape = isinstance(shape, types.IntegerLiteral)
            elif isinstance(shape, (types.Tuple, types.UniTuple)):
                literal_shape = all(isinstance(dim, types.IntegerLiteral)
                                    for dim in shape)
            else:
                literal_shape = False

            if not literal_shape:
                return None

            ndim = parse_shape(shape)
            nb_dtype = parse_dtype(dtype)
            if nb_dtype is None or ndim is None:
                return None

            return types.Array(dtype=nb_dtype, ndim=ndim, layout='C')

        return typer
|
48
|
+
|
49
|
+
|
50
|
+
@register
class Cuda_shared_array(Cuda_array_decl):
    """Typing for ``cuda.shared.array``; the typer comes from the base class."""

    key = cuda.shared.array
|
53
|
+
|
54
|
+
|
55
|
+
@register
class Cuda_local_array(Cuda_array_decl):
    """Typing for ``cuda.local.array``; the typer comes from the base class."""

    key = cuda.local.array
|
58
|
+
|
59
|
+
|
60
|
+
@register
class Cuda_const_array_like(CallableTemplate):
    """Typing for ``cuda.const.array_like``."""

    key = cuda.const.array_like

    def generic(self):
        # The result type is exactly the type of the single argument.
        def typer(ndarray):
            return ndarray

        return typer
|
68
|
+
|
69
|
+
|
70
|
+
@register
class Cuda_threadfence_device(ConcreteTemplate):
    """Typing for ``cuda.threadfence``: no arguments, returns ``none``."""

    key = cuda.threadfence
    cases = [signature(types.none)]
|
74
|
+
|
75
|
+
|
76
|
+
@register
class Cuda_threadfence_block(ConcreteTemplate):
    """Typing for ``cuda.threadfence_block``: no arguments, returns ``none``."""

    key = cuda.threadfence_block
    cases = [signature(types.none)]
|
80
|
+
|
81
|
+
|
82
|
+
@register
class Cuda_threadfence_system(ConcreteTemplate):
    """Typing for ``cuda.threadfence_system``: no arguments, returns ``none``."""

    key = cuda.threadfence_system
    cases = [signature(types.none)]
|
86
|
+
|
87
|
+
|
88
|
+
@register
class Cuda_syncwarp(ConcreteTemplate):
    """Typing for ``cuda.syncwarp``.

    Two forms are declared: no arguments, or a single ``i4`` argument. Both
    return ``none``.
    """

    key = cuda.syncwarp
    cases = [
        signature(types.none),
        signature(types.none, types.i4),
    ]
|
92
|
+
|
93
|
+
|
94
|
+
@register
class Cuda_shfl_sync_intrinsic(ConcreteTemplate):
    """Typing for ``cuda.shfl_sync_intrinsic``.

    Each case takes five arguments, all ``i4`` except the third, whose type
    is one of ``i4``, ``i8``, ``f4`` or ``f8``. The return type is a
    two-element tuple of that third-argument type and ``b1``.
    """

    key = cuda.shfl_sync_intrinsic
    cases = [
        signature(types.Tuple((value_ty, types.b1)),
                  types.i4, types.i4, value_ty, types.i4, types.i4)
        for value_ty in (types.i4, types.i8, types.f4, types.f8)
    ]
|
107
|
+
|
108
|
+
|
109
|
+
@register
class Cuda_vote_sync_intrinsic(ConcreteTemplate):
    """Typing for ``cuda.vote_sync_intrinsic``.

    Takes ``(i4, i4, b1)`` and returns a ``(i4, b1)`` tuple.
    """

    key = cuda.vote_sync_intrinsic
    cases = [
        signature(types.Tuple((types.i4, types.b1)),
                  types.i4, types.i4, types.b1),
    ]
|
114
|
+
|
115
|
+
|
116
|
+
@register
class Cuda_match_any_sync(ConcreteTemplate):
    """Typing for ``cuda.match_any_sync``.

    The first argument is ``i4``; the second is one of ``i4``, ``i8``,
    ``f4`` or ``f8``. The result is always ``i4``.
    """

    key = cuda.match_any_sync
    cases = [
        signature(types.i4, types.i4, value_ty)
        for value_ty in (types.i4, types.i8, types.f4, types.f8)
    ]
|
125
|
+
|
126
|
+
|
127
|
+
@register
class Cuda_match_all_sync(ConcreteTemplate):
    """Typing for ``cuda.match_all_sync``.

    The first argument is ``i4``; the second is one of ``i4``, ``i8``,
    ``f4`` or ``f8``. The result is always a ``(i4, b1)`` tuple.
    """

    key = cuda.match_all_sync
    cases = [
        signature(types.Tuple((types.i4, types.b1)), types.i4, value_ty)
        for value_ty in (types.i4, types.i8, types.f4, types.f8)
    ]
|
136
|
+
|
137
|
+
|
138
|
+
@register
class Cuda_activemask(ConcreteTemplate):
    """Typing for ``cuda.activemask``: no arguments, returns ``uint32``."""

    key = cuda.activemask
    cases = [signature(types.uint32)]
|
142
|
+
|
143
|
+
|
144
|
+
@register
class Cuda_lanemask_lt(ConcreteTemplate):
    """Typing for ``cuda.lanemask_lt``: no arguments, returns ``uint32``."""

    key = cuda.lanemask_lt
    cases = [signature(types.uint32)]
|
148
|
+
|
149
|
+
|
150
|
+
@register
class Cuda_popc(ConcreteTemplate):
    """
    Typing for ``cuda.popc``; the result type equals the argument type.

    Supported types from `llvm.popc`
    [here](http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics)
    """

    key = cuda.popc
    cases = [
        signature(int_ty, int_ty)
        for int_ty in (types.int8, types.int16, types.int32, types.int64,
                       types.uint8, types.uint16, types.uint32, types.uint64)
    ]
|
167
|
+
|
168
|
+
|
169
|
+
@register
class Cuda_fma(ConcreteTemplate):
    """
    Typing for ``cuda.fma``: three operands and a result of one float type.

    Supported types from `llvm.fma`
    [here](https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#standard-c-library-intrinics)
    """

    key = cuda.fma
    cases = [
        signature(float_ty, float_ty, float_ty, float_ty)
        for float_ty in (types.float32, types.float64)
    ]
|
180
|
+
|
181
|
+
|
182
|
+
@register
class Cuda_hfma(ConcreteTemplate):
    """Typing for ``cuda.fp16.hfma``: three ``float16`` in, ``float16`` out."""

    key = cuda.fp16.hfma
    cases = [
        signature(types.float16, types.float16, types.float16, types.float16),
    ]
|
188
|
+
|
189
|
+
|
190
|
+
@register
class Cuda_cbrt(ConcreteTemplate):
    """Typing for ``cuda.cbrt``: ``float32`` or ``float64``, same type out."""

    key = cuda.cbrt
    cases = [
        signature(float_ty, float_ty)
        for float_ty in (types.float32, types.float64)
    ]
|
198
|
+
|
199
|
+
|
200
|
+
@register
class Cuda_brev(ConcreteTemplate):
    """Typing for ``cuda.brev``: ``uint32`` or ``uint64``, same type out."""

    key = cuda.brev
    cases = [
        signature(uint_ty, uint_ty)
        for uint_ty in (types.uint32, types.uint64)
    ]
|
207
|
+
|
208
|
+
|
209
|
+
@register
class Cuda_clz(ConcreteTemplate):
    """
    Typing for ``cuda.clz``; the result type equals the argument type.

    Supported types from `llvm.ctlz`
    [here](http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics)
    """

    key = cuda.clz
    cases = [
        signature(int_ty, int_ty)
        for int_ty in (types.int8, types.int16, types.int32, types.int64,
                       types.uint8, types.uint16, types.uint32, types.uint64)
    ]
|
226
|
+
|
227
|
+
|
228
|
+
@register
class Cuda_ffs(ConcreteTemplate):
    """
    Typing for ``cuda.ffs``; the result is ``uint32`` for every argument type.

    Supported types from `llvm.cttz`
    [here](http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics)
    """

    key = cuda.ffs
    cases = [
        signature(types.uint32, int_ty)
        for int_ty in (types.int8, types.int16, types.int32, types.int64,
                       types.uint8, types.uint16, types.uint32, types.uint64)
    ]
|
245
|
+
|
246
|
+
|
247
|
+
@register
class Cuda_selp(AbstractTemplate):
    """Typing for ``cuda.selp(test, a, b)``.

    Both value operands must have the same type, taken from a fixed list of
    supported types; the result has that type. Otherwise no signature is
    produced.
    """

    key = cuda.selp

    def generic(self, args, kws):
        assert not kws
        predicate, lhs, rhs = args

        # per docs
        # http://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp
        supported_types = (types.float64, types.float32,
                           types.int16, types.uint16,
                           types.int32, types.uint32,
                           types.int64, types.uint64)

        if lhs != rhs:
            return None
        if lhs not in supported_types:
            return None

        return signature(lhs, predicate, lhs, lhs)
|
266
|
+
|
267
|
+
|
268
|
+
def _genfp16_unary(l_key):
    """Create and register a ``float16 -> float16`` template keyed on *l_key*.

    Returns the newly created template class.
    """
    fp16 = types.float16

    @register
    class Cuda_fp16_unary(ConcreteTemplate):
        key = l_key
        cases = [signature(fp16, fp16)]

    return Cuda_fp16_unary
|
275
|
+
|
276
|
+
|
277
|
+
def _genfp16_unary_operator(l_key):
    """Create a globally registered unary ``float16`` template for *l_key*.

    The template yields ``float16(float16)`` when called with exactly one
    ``float16`` argument and no signature otherwise. Returns the template
    class.
    """

    @register_global(l_key)
    class Cuda_fp16_unary(AbstractTemplate):
        key = l_key

        def generic(self, args, kws):
            assert not kws
            if len(args) != 1:
                return None

            [operand] = args
            if operand == types.float16:
                return signature(types.float16, types.float16)

    return Cuda_fp16_unary
|
288
|
+
|
289
|
+
|
290
|
+
def _genfp16_binary(l_key):
    """Create and register a ``float16(float16, float16)`` template.

    The template is keyed on *l_key*; the template class is returned.
    """
    fp16 = types.float16

    @register
    class Cuda_fp16_binary(ConcreteTemplate):
        key = l_key
        cases = [signature(fp16, fp16, fp16)]

    return Cuda_fp16_binary
|
297
|
+
|
298
|
+
|
299
|
+
@register_global(float)
class Float(AbstractTemplate):
    """Typing for the builtin ``float`` applied to a ``float16`` value.

    Exactly one argument is expected. When it is ``float16`` the result is
    typed as ``float16`` too; for any other type no signature is produced.
    """

    def generic(self, args, kws):
        assert not kws

        (operand,) = args

        if operand == types.float16:
            return signature(operand, operand)
|
309
|
+
|
310
|
+
|
311
|
+
def _genfp16_binary_comparison(l_key):
    """Create and register a ``b1(float16, float16)`` template for *l_key*.

    Returns the template class.
    """
    fp16 = types.float16

    @register
    class Cuda_fp16_cmp(ConcreteTemplate):
        key = l_key
        cases = [signature(types.b1, fp16, fp16)]

    return Cuda_fp16_cmp
|
320
|
+
|
321
|
+
# If multiple ConcreteTemplates provide typing for a single function, then
# function resolution will pick the first compatible typing it finds even if it
# involves inserting a cast that would be considered undesirable (in this
# specific case, float16s could be cast to float32s for comparisons).
#
# To work around this, we instead use an AbstractTemplate that implements
# exactly the casting logic that we desire. The AbstractTemplate gets
# considered in preference to ConcreteTemplates during typing.
#
# This is tracked as Issue #7863 (https://github.com/numba/numba/issues/7863) -
# once this is resolved it should be possible to replace this AbstractTemplate
# with a ConcreteTemplate to simplify the logic.


def _fp16_binary_operator(l_key, retty):
    """Create a globally registered binary ``float16`` template for *l_key*.

    The template applies when exactly two arguments are given and at least
    one of them is ``float16``. It then yields
    ``retty(float16, float16)`` provided the typing context rates the
    conversion between the operand types as exact, promote or safe.
    Returns the template class.
    """
    # Conversion ratings for which the operator is typed. The original notes
    # give fp16/fp16 as the exact case, promotion as the second case, and
    # int8 as an example of a safe conversion.
    accepted = (Conversion.exact, Conversion.promote, Conversion.safe)

    @register_global(l_key)
    class Cuda_fp16_operator(AbstractTemplate):
        key = l_key

        def generic(self, args, kws):
            assert not kws

            if len(args) != 2:
                return None

            lhs, rhs = args
            # The float16 operand is always passed as the second argument
            # of can_convert; the left operand takes precedence when both
            # operands are float16.
            if lhs == types.float16:
                convertible = self.context.can_convert(rhs, lhs)
            elif rhs == types.float16:
                convertible = self.context.can_convert(lhs, rhs)
            else:
                return None

            if convertible in accepted:
                return signature(retty, types.float16, types.float16)

    return Cuda_fp16_operator
|
364
|
+
|
365
|
+
|
366
|
+
def _genfp16_comparison_operator(op):
    """Register a ``float16`` operator template for *op* that returns ``b1``."""
    return _fp16_binary_operator(op, retty=types.b1)
|
368
|
+
|
369
|
+
|
370
|
+
def _genfp16_binary_operator(op):
    """Register a ``float16`` operator template for *op* returning ``float16``."""
    return _fp16_binary_operator(op, retty=types.float16)
|
372
|
+
|
373
|
+
|
374
|
+
# float16 arithmetic: the cuda.fp16 function, then the matching Python
# operator and its in-place form.
Cuda_hadd = _genfp16_binary(cuda.fp16.hadd)
Cuda_add = _genfp16_binary_operator(operator.add)
Cuda_iadd = _genfp16_binary_operator(operator.iadd)

Cuda_hsub = _genfp16_binary(cuda.fp16.hsub)
Cuda_sub = _genfp16_binary_operator(operator.sub)
Cuda_isub = _genfp16_binary_operator(operator.isub)

Cuda_hmul = _genfp16_binary(cuda.fp16.hmul)
Cuda_mul = _genfp16_binary_operator(operator.mul)
Cuda_imul = _genfp16_binary_operator(operator.imul)

Cuda_hmax = _genfp16_binary(cuda.fp16.hmax)
Cuda_hmin = _genfp16_binary(cuda.fp16.hmin)

# float16 unary functions and operators.
Cuda_hneg = _genfp16_unary(cuda.fp16.hneg)
Cuda_neg = _genfp16_unary_operator(operator.neg)
Cuda_habs = _genfp16_unary(cuda.fp16.habs)
Cuda_abs = _genfp16_unary_operator(abs)

# float16 comparisons: the cuda.fp16 function is bound to a name, while the
# operator template is created for its registration side effect only.
Cuda_heq = _genfp16_binary_comparison(cuda.fp16.heq)
_genfp16_comparison_operator(operator.eq)
Cuda_hne = _genfp16_binary_comparison(cuda.fp16.hne)
_genfp16_comparison_operator(operator.ne)
Cuda_hge = _genfp16_binary_comparison(cuda.fp16.hge)
_genfp16_comparison_operator(operator.ge)
Cuda_hgt = _genfp16_binary_comparison(cuda.fp16.hgt)
_genfp16_comparison_operator(operator.gt)
Cuda_hle = _genfp16_binary_comparison(cuda.fp16.hle)
_genfp16_comparison_operator(operator.le)
Cuda_hlt = _genfp16_binary_comparison(cuda.fp16.hlt)
_genfp16_comparison_operator(operator.lt)

# float16 true division, operator forms only.
_genfp16_binary_operator(operator.truediv)
_genfp16_binary_operator(operator.itruediv)
|
403
|
+
|
404
|
+
|
405
|
+
def _resolve_wrapped_unary(fname):
    """Return a function type for the unary wrapper ``__numba_wrapper_<fname>``.

    The wrapper is declared as taking one ``float16`` and returning
    ``float16``.
    """
    wrapper_name = f'__numba_wrapper_{fname}'
    arg_types = (types.float16,)
    decl = declare_device_function_template(wrapper_name,
                                            types.float16,
                                            arg_types)
    return types.Function(decl)
|
410
|
+
|
411
|
+
|
412
|
+
def _resolve_wrapped_binary(fname):
    """Return a function type for the binary wrapper ``__numba_wrapper_<fname>``.

    The wrapper is declared as taking two ``float16`` values and returning
    ``float16``.
    """
    wrapper_name = f'__numba_wrapper_{fname}'
    arg_types = (types.float16, types.float16,)
    decl = declare_device_function_template(wrapper_name,
                                            types.float16,
                                            arg_types)
    return types.Function(decl)
|
417
|
+
|
418
|
+
|
419
|
+
# Function types for the float16 wrapper functions: one per unary wrapper,
# followed by the single binary wrapper (hdiv).
hsin_device = _resolve_wrapped_unary('hsin')
hcos_device = _resolve_wrapped_unary('hcos')
hlog_device = _resolve_wrapped_unary('hlog')
hlog10_device = _resolve_wrapped_unary('hlog10')
hlog2_device = _resolve_wrapped_unary('hlog2')
hexp_device = _resolve_wrapped_unary('hexp')
hexp10_device = _resolve_wrapped_unary('hexp10')
hexp2_device = _resolve_wrapped_unary('hexp2')
hsqrt_device = _resolve_wrapped_unary('hsqrt')
hrsqrt_device = _resolve_wrapped_unary('hrsqrt')
hfloor_device = _resolve_wrapped_unary('hfloor')
hceil_device = _resolve_wrapped_unary('hceil')
hrcp_device = _resolve_wrapped_unary('hrcp')
hrint_device = _resolve_wrapped_unary('hrint')
htrunc_device = _resolve_wrapped_unary('htrunc')

hdiv_device = _resolve_wrapped_binary('hdiv')
|
435
|
+
|
436
|
+
|
437
|
+
# generate atomic operations
def _gen(l_key, supported_types):
    """Create and register an ``(ary, idx, val)`` template keyed on *l_key*.

    The template produces a signature only when the array's dtype is in
    *supported_types*. The value and the result are typed as the array's
    dtype. A 1-D array is indexed with ``intp``; an array with more
    dimensions keeps the index type it was given. Returns the template
    class.
    """

    @register
    class Cuda_atomic(AbstractTemplate):
        key = l_key

        def generic(self, args, kws):
            assert not kws
            # Unpacking all three enforces the arity; the value's own type
            # is not used.
            ary, idx, _ = args
            dty = ary.dtype

            if dty not in supported_types:
                return None

            if ary.ndim == 1:
                index_ty = types.intp
            elif ary.ndim > 1:
                index_ty = idx
            else:
                return None

            return signature(dty, ary, index_ty, dty)

    return Cuda_atomic
|
455
|
+
|
456
|
+
|
457
|
+
# dtype groups accepted by the atomic templates below.
unsigned_int_numba_types = (types.uint32, types.uint64)

integer_numba_types = (types.int32, types.uint32,
                       types.int64, types.uint64)

# The two float types followed by the four integer types.
all_numba_types = (types.float64, types.float32) + integer_numba_types

# Any supported dtype.
Cuda_atomic_add = _gen(cuda.atomic.add, all_numba_types)
Cuda_atomic_sub = _gen(cuda.atomic.sub, all_numba_types)
Cuda_atomic_max = _gen(cuda.atomic.max, all_numba_types)
Cuda_atomic_min = _gen(cuda.atomic.min, all_numba_types)
Cuda_atomic_nanmax = _gen(cuda.atomic.nanmax, all_numba_types)
Cuda_atomic_nanmin = _gen(cuda.atomic.nanmin, all_numba_types)

# Integer dtypes only.
Cuda_atomic_and = _gen(cuda.atomic.and_, integer_numba_types)
Cuda_atomic_or = _gen(cuda.atomic.or_, integer_numba_types)
Cuda_atomic_xor = _gen(cuda.atomic.xor, integer_numba_types)

# Unsigned integer dtypes only.
Cuda_atomic_inc = _gen(cuda.atomic.inc, unsigned_int_numba_types)
Cuda_atomic_dec = _gen(cuda.atomic.dec, unsigned_int_numba_types)

# Integer dtypes only.
Cuda_atomic_exch = _gen(cuda.atomic.exch, integer_numba_types)
|
478
|
+
|
479
|
+
|
480
|
+
@register
class Cuda_atomic_compare_and_swap(AbstractTemplate):
    """Typing for ``cuda.atomic.compare_and_swap(ary, old, val)``.

    A signature is produced only for 1-D arrays whose dtype is one of
    ``integer_numba_types``; ``old``, ``val`` and the result are typed as
    that dtype.
    """

    key = cuda.atomic.compare_and_swap

    def generic(self, args, kws):
        assert not kws
        # Unpacking enforces exactly three arguments; only the array's type
        # influences the signature.
        ary, _old, _val = args
        dty = ary.dtype

        if dty not in integer_numba_types:
            return None
        if ary.ndim != 1:
            return None

        return signature(dty, ary, dty, dty)
|
491
|
+
|
492
|
+
|
493
|
+
@register
class Cuda_atomic_cas(AbstractTemplate):
    """Typing for ``cuda.atomic.cas(ary, idx, old, val)``.

    A signature is produced only when the array's dtype is one of
    ``integer_numba_types``. A 1-D array is indexed with ``intp``; an array
    with more dimensions keeps the index type it was given. ``old``,
    ``val`` and the result are typed as the array's dtype.
    """

    key = cuda.atomic.cas

    def generic(self, args, kws):
        assert not kws
        # Unpacking enforces exactly four arguments.
        ary, idx, _old, _val = args
        dty = ary.dtype

        if dty not in integer_numba_types:
            return None

        if ary.ndim == 1:
            index_ty = types.intp
        elif ary.ndim > 1:
            index_ty = idx
        else:
            return None

        return signature(dty, ary, index_ty, dty, dty)
|
509
|
+
|
510
|
+
|
511
|
+
@register
class Cuda_nanosleep(ConcreteTemplate):
    """Typing template for ``cuda.nanosleep``."""

    key = cuda.nanosleep

    # Single accepted signature: one uint32 argument, void return.
    cases = [
        signature(types.void, types.uint32),
    ]
|
516
|
+
|
517
|
+
|
518
|
+
@register_attr
class Dim3_attrs(AttributeTemplate):
    """Attribute typing for ``dim3`` values: ``x``, ``y`` and ``z``."""

    key = dim3

    def resolve_x(self, mod):
        """Type of the ``x`` attribute: ``types.int32``."""
        return types.int32

    def resolve_y(self, mod):
        """Type of the ``y`` attribute: ``types.int32``."""
        return types.int32

    def resolve_z(self, mod):
        """Type of the ``z`` attribute: ``types.int32``."""
        return types.int32
|
530
|
+
|
531
|
+
|
532
|
+
@register_attr
class CudaSharedModuleTemplate(AttributeTemplate):
    """Attribute typing for the ``cuda.shared`` module."""

    key = types.Module(cuda.shared)

    def resolve_array(self, mod):
        """Type ``array`` as a function backed by ``Cuda_shared_array``."""
        return types.Function(Cuda_shared_array)
|
538
|
+
|
539
|
+
|
540
|
+
@register_attr
class CudaConstModuleTemplate(AttributeTemplate):
    """Attribute typing for the ``cuda.const`` module."""

    key = types.Module(cuda.const)

    def resolve_array_like(self, mod):
        """Type ``array_like`` as a function backed by ``Cuda_const_array_like``."""
        return types.Function(Cuda_const_array_like)
|
546
|
+
|
547
|
+
|
548
|
+
@register_attr
class CudaLocalModuleTemplate(AttributeTemplate):
    """Attribute typing for the ``cuda.local`` module."""

    key = types.Module(cuda.local)

    def resolve_array(self, mod):
        """Type ``array`` as a function backed by ``Cuda_local_array``."""
        return types.Function(Cuda_local_array)
|
554
|
+
|
555
|
+
|
556
|
+
@register_attr
class CudaAtomicTemplate(AttributeTemplate):
    """Attribute typing for the ``cuda.atomic`` module.

    Every ``resolve_<name>`` method returns a ``types.Function`` wrapping the
    typing template declared for ``cuda.atomic.<name>``.
    """

    key = types.Module(cuda.atomic)

    # -- arithmetic ---------------------------------------------------------

    def resolve_add(self, mod):
        return types.Function(Cuda_atomic_add)

    def resolve_sub(self, mod):
        return types.Function(Cuda_atomic_sub)

    # -- bitwise ------------------------------------------------------------

    def resolve_and_(self, mod):
        return types.Function(Cuda_atomic_and)

    def resolve_or_(self, mod):
        return types.Function(Cuda_atomic_or)

    def resolve_xor(self, mod):
        return types.Function(Cuda_atomic_xor)

    # -- increment / decrement / exchange -----------------------------------

    def resolve_inc(self, mod):
        return types.Function(Cuda_atomic_inc)

    def resolve_dec(self, mod):
        return types.Function(Cuda_atomic_dec)

    def resolve_exch(self, mod):
        return types.Function(Cuda_atomic_exch)

    # -- min / max ----------------------------------------------------------

    def resolve_max(self, mod):
        return types.Function(Cuda_atomic_max)

    def resolve_min(self, mod):
        return types.Function(Cuda_atomic_min)

    def resolve_nanmin(self, mod):
        return types.Function(Cuda_atomic_nanmin)

    def resolve_nanmax(self, mod):
        return types.Function(Cuda_atomic_nanmax)

    # -- compare-and-swap ---------------------------------------------------

    def resolve_compare_and_swap(self, mod):
        return types.Function(Cuda_atomic_compare_and_swap)

    def resolve_cas(self, mod):
        return types.Function(Cuda_atomic_cas)
|
601
|
+
|
602
|
+
|
603
|
+
@register_attr
class CudaFp16Template(AttributeTemplate):
    """Attribute typing for the ``cuda.fp16`` module.

    Two kinds of results are returned by the ``resolve_<name>`` methods:

    * ``types.Function(Cuda_<name>)`` for functions that have a typing
      template class in this file;
    * a ``<name>_device`` object returned as-is for the remaining functions.
      These objects are defined elsewhere in the file (presumably they are
      already Numba types -- confirm against their definition).
    """

    key = types.Module(cuda.fp16)

    # -- basic arithmetic ---------------------------------------------------

    def resolve_hadd(self, mod):
        return types.Function(Cuda_hadd)

    def resolve_hsub(self, mod):
        return types.Function(Cuda_hsub)

    def resolve_hmul(self, mod):
        return types.Function(Cuda_hmul)

    def resolve_hdiv(self, mod):
        return hdiv_device

    def resolve_hneg(self, mod):
        return types.Function(Cuda_hneg)

    def resolve_habs(self, mod):
        return types.Function(Cuda_habs)

    def resolve_hfma(self, mod):
        return types.Function(Cuda_hfma)

    # -- trigonometric, logarithmic and exponential -------------------------

    def resolve_hsin(self, mod):
        return hsin_device

    def resolve_hcos(self, mod):
        return hcos_device

    def resolve_hlog(self, mod):
        return hlog_device

    def resolve_hlog10(self, mod):
        return hlog10_device

    def resolve_hlog2(self, mod):
        return hlog2_device

    def resolve_hexp(self, mod):
        return hexp_device

    def resolve_hexp10(self, mod):
        return hexp10_device

    def resolve_hexp2(self, mod):
        return hexp2_device

    # -- rounding, roots and reciprocal -------------------------------------

    def resolve_hfloor(self, mod):
        return hfloor_device

    def resolve_hceil(self, mod):
        return hceil_device

    def resolve_hsqrt(self, mod):
        return hsqrt_device

    def resolve_hrsqrt(self, mod):
        return hrsqrt_device

    def resolve_hrcp(self, mod):
        return hrcp_device

    def resolve_hrint(self, mod):
        return hrint_device

    def resolve_htrunc(self, mod):
        return htrunc_device

    # -- comparisons --------------------------------------------------------

    def resolve_heq(self, mod):
        return types.Function(Cuda_heq)

    def resolve_hne(self, mod):
        return types.Function(Cuda_hne)

    def resolve_hge(self, mod):
        return types.Function(Cuda_hge)

    def resolve_hgt(self, mod):
        return types.Function(Cuda_hgt)

    def resolve_hle(self, mod):
        return types.Function(Cuda_hle)

    def resolve_hlt(self, mod):
        return types.Function(Cuda_hlt)

    # -- min / max ----------------------------------------------------------

    def resolve_hmax(self, mod):
        return types.Function(Cuda_hmax)

    def resolve_hmin(self, mod):
        return types.Function(Cuda_hmin)
|
696
|
+
|
697
|
+
|
698
|
+
@register_attr
class CudaModuleTemplate(AttributeTemplate):
    """Attribute typing for the top-level ``cuda`` module.

    The ``resolve_<name>`` methods return one of:

    * ``types.Module(...)`` for the submodules (``cg``, ``shared``,
      ``atomic``, ``fp16``, ``const``, ``local``);
    * ``dim3`` for the thread / block index and dimension attributes;
    * ``types.int32`` for ``laneid``;
    * ``types.Function(Cuda_<name>)`` for functions with a typing template.
    """

    key = types.Module(cuda)

    # -- cooperative groups submodule ---------------------------------------

    def resolve_cg(self, mod):
        return types.Module(cuda.cg)

    # -- thread / block indexing --------------------------------------------

    def resolve_threadIdx(self, mod):
        return dim3

    def resolve_blockIdx(self, mod):
        return dim3

    def resolve_blockDim(self, mod):
        return dim3

    def resolve_gridDim(self, mod):
        return dim3

    def resolve_laneid(self, mod):
        return types.int32

    # -- shared memory submodule --------------------------------------------

    def resolve_shared(self, mod):
        return types.Module(cuda.shared)

    # -- bit manipulation and math functions --------------------------------

    def resolve_popc(self, mod):
        return types.Function(Cuda_popc)

    def resolve_brev(self, mod):
        return types.Function(Cuda_brev)

    def resolve_clz(self, mod):
        return types.Function(Cuda_clz)

    def resolve_ffs(self, mod):
        return types.Function(Cuda_ffs)

    def resolve_fma(self, mod):
        return types.Function(Cuda_fma)

    def resolve_cbrt(self, mod):
        return types.Function(Cuda_cbrt)

    # -- memory fences ------------------------------------------------------

    def resolve_threadfence(self, mod):
        # Note the template name: the plain ``threadfence`` attribute maps to
        # the ``_device`` template class.
        return types.Function(Cuda_threadfence_device)

    def resolve_threadfence_block(self, mod):
        return types.Function(Cuda_threadfence_block)

    def resolve_threadfence_system(self, mod):
        return types.Function(Cuda_threadfence_system)

    # -- warp-level functions -----------------------------------------------

    def resolve_syncwarp(self, mod):
        return types.Function(Cuda_syncwarp)

    def resolve_shfl_sync_intrinsic(self, mod):
        return types.Function(Cuda_shfl_sync_intrinsic)

    def resolve_vote_sync_intrinsic(self, mod):
        return types.Function(Cuda_vote_sync_intrinsic)

    def resolve_match_any_sync(self, mod):
        return types.Function(Cuda_match_any_sync)

    def resolve_match_all_sync(self, mod):
        return types.Function(Cuda_match_all_sync)

    def resolve_activemask(self, mod):
        return types.Function(Cuda_activemask)

    def resolve_lanemask_lt(self, mod):
        return types.Function(Cuda_lanemask_lt)

    # -- miscellaneous functions --------------------------------------------

    def resolve_selp(self, mod):
        return types.Function(Cuda_selp)

    def resolve_nanosleep(self, mod):
        return types.Function(Cuda_nanosleep)

    # -- remaining submodules -----------------------------------------------

    def resolve_atomic(self, mod):
        return types.Module(cuda.atomic)

    def resolve_fp16(self, mod):
        return types.Module(cuda.fp16)

    def resolve_const(self, mod):
        return types.Module(cuda.const)

    def resolve_local(self, mod):
        return types.Module(cuda.local)
|
788
|
+
|
789
|
+
|
790
|
+
# Register the ``cuda`` module object itself as a global typed as a module.
register_global(cuda, types.Module(cuda))


# NumPy

# Every ufunc in these three groups is registered unconditionally.
for func in (
    *trigonometric_functions,
    *comparison_functions,
    *bit_twiddling_functions,
):
    register_numpy_ufunc(func, register_global)

# From math_operations only the logarithm ufuncs are registered.
for func in math_operations:
    if func not in ('log', 'log2', 'log10'):
        continue
    register_numpy_ufunc(func, register_global)
|