numba-cuda 0.0.1-py3-none-any.whl → 0.0.12-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.pth +1 -0
- _numba_cuda_redirector.py +74 -0
- numba_cuda/VERSION +1 -0
- numba_cuda/__init__.py +5 -0
- numba_cuda/_version.py +19 -0
- numba_cuda/numba/cuda/__init__.py +22 -0
- numba_cuda/numba/cuda/api.py +526 -0
- numba_cuda/numba/cuda/api_util.py +30 -0
- numba_cuda/numba/cuda/args.py +77 -0
- numba_cuda/numba/cuda/cg.py +62 -0
- numba_cuda/numba/cuda/codegen.py +378 -0
- numba_cuda/numba/cuda/compiler.py +422 -0
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
- numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
- numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
- numba_cuda/numba/cuda/cuda_paths.py +258 -0
- numba_cuda/numba/cuda/cudadecl.py +806 -0
- numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
- numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
- numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
- numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
- numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
- numba_cuda/numba/cuda/cudadrv/error.py +36 -0
- numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
- numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
- numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
- numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
- numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
- numba_cuda/numba/cuda/cudaimpl.py +1055 -0
- numba_cuda/numba/cuda/cudamath.py +140 -0
- numba_cuda/numba/cuda/decorators.py +189 -0
- numba_cuda/numba/cuda/descriptor.py +33 -0
- numba_cuda/numba/cuda/device_init.py +89 -0
- numba_cuda/numba/cuda/deviceufunc.py +908 -0
- numba_cuda/numba/cuda/dispatcher.py +1057 -0
- numba_cuda/numba/cuda/errors.py +59 -0
- numba_cuda/numba/cuda/extending.py +7 -0
- numba_cuda/numba/cuda/initialize.py +13 -0
- numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
- numba_cuda/numba/cuda/intrinsics.py +198 -0
- numba_cuda/numba/cuda/kernels/__init__.py +0 -0
- numba_cuda/numba/cuda/kernels/reduction.py +262 -0
- numba_cuda/numba/cuda/kernels/transpose.py +65 -0
- numba_cuda/numba/cuda/libdevice.py +3382 -0
- numba_cuda/numba/cuda/libdevicedecl.py +17 -0
- numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
- numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
- numba_cuda/numba/cuda/mathimpl.py +448 -0
- numba_cuda/numba/cuda/models.py +48 -0
- numba_cuda/numba/cuda/nvvmutils.py +235 -0
- numba_cuda/numba/cuda/printimpl.py +86 -0
- numba_cuda/numba/cuda/random.py +292 -0
- numba_cuda/numba/cuda/simulator/__init__.py +38 -0
- numba_cuda/numba/cuda/simulator/api.py +110 -0
- numba_cuda/numba/cuda/simulator/compiler.py +9 -0
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
- numba_cuda/numba/cuda/simulator/kernel.py +308 -0
- numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
- numba_cuda/numba/cuda/simulator/reduction.py +15 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
- numba_cuda/numba/cuda/simulator_init.py +17 -0
- numba_cuda/numba/cuda/stubs.py +902 -0
- numba_cuda/numba/cuda/target.py +440 -0
- numba_cuda/numba/cuda/testing.py +202 -0
- numba_cuda/numba/cuda/tests/__init__.py +58 -0
- numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
- numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
- numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
- numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
- numba_cuda/numba/cuda/tests/data/error.cu +7 -0
- numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
- numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
- numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
- numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
- numba_cuda/numba/cuda/types.py +37 -0
- numba_cuda/numba/cuda/ufuncs.py +662 -0
- numba_cuda/numba/cuda/vector_types.py +209 -0
- numba_cuda/numba/cuda/vectorizers.py +252 -0
- numba_cuda-0.0.12.dist-info/LICENSE +25 -0
- numba_cuda-0.0.12.dist-info/METADATA +68 -0
- numba_cuda-0.0.12.dist-info/RECORD +231 -0
- {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
- numba_cuda-0.0.1.dist-info/METADATA +0 -10
- numba_cuda-0.0.1.dist-info/RECORD +0 -5
- {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/vector_types.py
@@ -0,0 +1,209 @@
+# CUDA built-in Vector Types
+# https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#built-in-vector-types
+
+from typing import List, Tuple, Dict
+
+from numba import types
+from numba.core import cgutils
+from numba.core.extending import make_attribute_wrapper, models, register_model
+from numba.core.imputils import Registry as ImplRegistry
+from numba.core.typing.templates import ConcreteTemplate
+from numba.core.typing.templates import Registry as TypingRegistry
+from numba.core.typing.templates import signature
+from numba.cuda import stubs
+from numba.cuda.errors import CudaLoweringError
+
+typing_registry = TypingRegistry()
+impl_registry = ImplRegistry()
+
+register = typing_registry.register
+register_attr = typing_registry.register_attr
+register_global = typing_registry.register_global
+lower = impl_registry.lower
+
+
+class VectorType(types.Type):
+    def __init__(self, name, base_type, attr_names, user_facing_object):
+        self._base_type = base_type
+        self._attr_names = attr_names
+        self._user_facing_object = user_facing_object
+        super().__init__(name=name)
+
+    @property
+    def base_type(self):
+        return self._base_type
+
+    @property
+    def attr_names(self):
+        return self._attr_names
+
+    @property
+    def num_elements(self):
+        return len(self._attr_names)
+
+    @property
+    def user_facing_object(self):
+        return self._user_facing_object
+
+
+def make_vector_type(
+    name: str,
+    base_type: types.Type,
+    attr_names: Tuple[str, ...],
+    user_facing_object
+) -> types.Type:
+    """Create a vector type.
+
+    Parameters
+    ----------
+    name: str
+        The name of the type.
+    base_type: numba.types.Type
+        The primitive type for each element in the vector.
+    attr_names: tuple of str
+        Name for each attribute.
+    user_facing_object: object
+        The handle to be used in a CUDA kernel.
+    """
+
+    class _VectorType(VectorType):
+        """Internal instantiation of VectorType."""
+
+        pass
+
+    class VectorTypeModel(models.StructModel):
+        def __init__(self, dmm, fe_type):
+            members = [(attr_name, base_type) for attr_name in attr_names]
+            super().__init__(dmm, fe_type, members)
+
+    vector_type = _VectorType(name, base_type, attr_names, user_facing_object)
+    register_model(_VectorType)(VectorTypeModel)
+    for attr_name in attr_names:
+        make_attribute_wrapper(_VectorType, attr_name, attr_name)
+
+    return vector_type
+
+
+def enable_vector_type_ctor(
+    vector_type: VectorType, overloads: List[List[types.Type]]
+):
+    """Create typing and lowering for a vector type constructor.
+
+    Parameters
+    ----------
+    vector_type: VectorType
+        The type whose constructor to type and lower.
+    overloads: List of argument types
+        A list containing different overloads of the constructor. Each base type
+        in the argument list should be either a primitive type or a VectorType.
+    """
+    ctor = vector_type.user_facing_object
+
+    @register
+    class CtorTemplate(ConcreteTemplate):
+        key = ctor
+        cases = [signature(vector_type, *arglist) for arglist in overloads]
+
+    register_global(ctor, types.Function(CtorTemplate))
+
+    # Lowering
+
+    def make_lowering(fml_arg_list):
+        """Meta function to create a lowering for the constructor. Flattens
+        the arguments by converting vector_type into load instructions for each
+        of its attributes. Such as float2 -> float2.x, float2.y.
+        """
+
+        def lowering(context, builder, sig, actual_args):
+            # A list of elements to assign from
+            source_list = []
+            # Convert the list of argument types to a list of load IRs.
+            for argidx, fml_arg in enumerate(fml_arg_list):
+                if isinstance(fml_arg, VectorType):
+                    pxy = cgutils.create_struct_proxy(fml_arg)(
+                        context, builder, actual_args[argidx]
+                    )
+                    source_list += [
+                        getattr(pxy, attr) for attr in fml_arg.attr_names
+                    ]
+                else:
+                    # assumed primitive type
+                    source_list.append(actual_args[argidx])
+
+            if len(source_list) != vector_type.num_elements:
+                raise CudaLoweringError(
+                    f"Unmatched number of source elements ({len(source_list)}) "
+                    f"and target elements ({vector_type.num_elements})."
+                )
+
+            out = cgutils.create_struct_proxy(vector_type)(context, builder)
+
+            for attr_name, source in zip(vector_type.attr_names, source_list):
+                setattr(out, attr_name, source)
+            return out._getvalue()
+
+        return lowering
+
+    for arglist in overloads:
+        lowering = make_lowering(arglist)
+        lower(ctor, *arglist)(lowering)
+
+
+vector_types: Dict[str, VectorType] = {}
+
+
+def build_constructor_overloads(base_type, vty_name, num_elements, arglists, l):
+    """
+    For a given vector type, build a list of overloads for its constructor.
+    """
+
+    # TODO: speed up with memoization
+    if num_elements == 0:
+        arglists.append(l[:])
+
+    for i in range(1, num_elements + 1):
+        if i == 1:
+            # A 1-element component can be constructed from either a
+            # primitive type or another 1-element component.
+            l.append(base_type)
+            build_constructor_overloads(
+                base_type, vty_name, num_elements - i, arglists, l
+            )
+            l.pop(-1)
+
+            l.append(vector_types[f"{vty_name[:-1]}1"])
+            build_constructor_overloads(
+                base_type, vty_name, num_elements - i, arglists, l
+            )
+            l.pop(-1)
+        else:
+            l.append(vector_types[f"{vty_name[:-1]}{i}"])
+            build_constructor_overloads(
+                base_type, vty_name, num_elements - i, arglists, l
+            )
+            l.pop(-1)
+
+
+def _initialize():
+    """
+    Construct the vector types, populate the `vector_types` dictionary, and
+    enable the constructors.
+    """
+    vector_type_attribute_names = ("x", "y", "z", "w")
+    for stub in stubs._vector_type_stubs:
+        type_name = stub.__name__
+        base_type = getattr(types, type_name[:-2])
+        num_elements = int(type_name[-1])
+        attributes = vector_type_attribute_names[:num_elements]
+        vector_type = make_vector_type(type_name, base_type, attributes, stub)
+        vector_types[type_name] = vector_type
+
+    for vty in vector_types.values():
+        arglists, l = [], []
+        build_constructor_overloads(
+            vty.base_type, vty.name, vty.num_elements, arglists, l
+        )
+        enable_vector_type_ctor(vty, arglists)
+
+
+_initialize()
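The module above registers typing and lowering for the CUDA built-in vector types that the stubs expose under `numba.cuda` (names of the form `float32x2`, `int64x4`, and so on). A minimal usage sketch, assuming the `cuda.float64x2` stub and a CUDA-capable device are available:

```python
# Minimal sketch of using a built-in vector type inside a kernel; assumes the
# cuda.float64x2 stub wired up by the module above and a CUDA-capable device.
import numpy as np
from numba import cuda

@cuda.jit
def swap_components(out):
    # Construct a 2-element vector from two primitives, then read its
    # attributes, which the module above exposes as .x and .y.
    v = cuda.float64x2(1.0, 2.0)
    out[0] = v.y
    out[1] = v.x

result = cuda.device_array(2, dtype=np.float64)
swap_components[1, 1](result)
print(result.copy_to_host())  # expected: [2. 1.]
```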
numba_cuda/numba/cuda/vectorizers.py
@@ -0,0 +1,252 @@
+from numba import cuda
+from numpy import array as np_array
+from numba.cuda import deviceufunc
+from numba.cuda.deviceufunc import (UFuncMechanism, GeneralizedUFunc,
+                                    GUFuncCallSteps)
+
+
+class CUDAUFuncDispatcher(object):
+    """
+    Invoke the CUDA ufunc specialization for the given inputs.
+    """
+
+    def __init__(self, types_to_retty_kernels, pyfunc):
+        self.functions = types_to_retty_kernels
+        self.__name__ = pyfunc.__name__
+
+    def __call__(self, *args, **kws):
+        """
+        *args: numpy arrays or DeviceArrayBase (created by cuda.to_device).
+               Cannot mix the two types in one call.
+
+        **kws:
+            stream -- cuda stream; when defined, asynchronous mode is used.
+            out    -- output array. Can be a numpy array or DeviceArrayBase
+                      depending on the input arguments.  Type must match
+                      the input arguments.
+        """
+        return CUDAUFuncMechanism.call(self.functions, args, kws)
+
+    def reduce(self, arg, stream=0):
+        assert len(list(self.functions.keys())[0]) == 2, "must be a binary " \
+                                                         "ufunc"
+        assert arg.ndim == 1, "must use 1d array"
+
+        n = arg.shape[0]
+        gpu_mems = []
+
+        if n == 0:
+            raise TypeError("Reduction on an empty array.")
+        elif n == 1:  # nothing to do
+            return arg[0]
+
+        # always use a stream
+        stream = stream or cuda.stream()
+        with stream.auto_synchronize():
+            # transfer memory to device if necessary
+            if cuda.cudadrv.devicearray.is_cuda_ndarray(arg):
+                mem = arg
+            else:
+                mem = cuda.to_device(arg, stream)
+            # do reduction
+            out = self.__reduce(mem, gpu_mems, stream)
+            # use a small buffer to store the result element
+            buf = np_array((1,), dtype=arg.dtype)
+            out.copy_to_host(buf, stream=stream)
+
+        return buf[0]
+
+    def __reduce(self, mem, gpu_mems, stream):
+        n = mem.shape[0]
+        if n % 2 != 0:  # odd?
+            fatcut, thincut = mem.split(n - 1)
+            # prevent freeing during async mode
+            gpu_mems.append(fatcut)
+            gpu_mems.append(thincut)
+            # execute the kernel
+            out = self.__reduce(fatcut, gpu_mems, stream)
+            gpu_mems.append(out)
+            return self(out, thincut, out=out, stream=stream)
+        else:  # even?
+            left, right = mem.split(n // 2)
+            # prevent freeing during async mode
+            gpu_mems.append(left)
+            gpu_mems.append(right)
+            # execute the kernel
+            self(left, right, out=left, stream=stream)
+            if n // 2 > 1:
+                return self.__reduce(left, gpu_mems, stream)
+            else:
+                return left
+
+
+class _CUDAGUFuncCallSteps(GUFuncCallSteps):
+    __slots__ = [
+        '_stream',
+    ]
+
+    def __init__(self, nin, nout, args, kwargs):
+        super().__init__(nin, nout, args, kwargs)
+        self._stream = kwargs.get('stream', 0)
+
+    def is_device_array(self, obj):
+        return cuda.is_cuda_array(obj)
+
+    def as_device_array(self, obj):
+        # We don't want to call as_cuda_array on objects that are already Numba
+        # device arrays, because this results in exporting the array as a
+        # Producer then importing it as a Consumer, which causes a
+        # synchronization on the array's stream (if it has one) by default.
+        # When we have a Numba device array, we can simply return it.
+        if cuda.cudadrv.devicearray.is_cuda_ndarray(obj):
+            return obj
+        return cuda.as_cuda_array(obj)
+
+    def to_device(self, hostary):
+        return cuda.to_device(hostary, stream=self._stream)
+
+    def to_host(self, devary, hostary):
+        out = devary.copy_to_host(hostary, stream=self._stream)
+        return out
+
+    def allocate_device_array(self, shape, dtype):
+        return cuda.device_array(shape=shape, dtype=dtype, stream=self._stream)
+
+    def launch_kernel(self, kernel, nelem, args):
+        kernel.forall(nelem, stream=self._stream)(*args)
+
+
+class CUDAGeneralizedUFunc(GeneralizedUFunc):
+    def __init__(self, kernelmap, engine, pyfunc):
+        self.__name__ = pyfunc.__name__
+        super().__init__(kernelmap, engine)
+
+    @property
+    def _call_steps(self):
+        return _CUDAGUFuncCallSteps
+
+    def _broadcast_scalar_input(self, ary, shape):
+        return cuda.cudadrv.devicearray.DeviceNDArray(shape=shape,
+                                                      strides=(0,),
+                                                      dtype=ary.dtype,
+                                                      gpu_data=ary.gpu_data)
+
+    def _broadcast_add_axis(self, ary, newshape):
+        newax = len(newshape) - len(ary.shape)
+        # Add 0 strides for missing dimension
+        newstrides = (0,) * newax + ary.strides
+        return cuda.cudadrv.devicearray.DeviceNDArray(shape=newshape,
+                                                      strides=newstrides,
+                                                      dtype=ary.dtype,
+                                                      gpu_data=ary.gpu_data)
+
+
+class CUDAUFuncMechanism(UFuncMechanism):
+    """
+    Provide CUDA specialization
+    """
+    DEFAULT_STREAM = 0
+
+    def launch(self, func, count, stream, args):
+        func.forall(count, stream=stream)(*args)
+
+    def is_device_array(self, obj):
+        return cuda.is_cuda_array(obj)
+
+    def as_device_array(self, obj):
+        # We don't want to call as_cuda_array on objects that are already Numba
+        # device arrays, because this results in exporting the array as a
+        # Producer then importing it as a Consumer, which causes a
+        # synchronization on the array's stream (if it has one) by default.
+        # When we have a Numba device array, we can simply return it.
+        if cuda.cudadrv.devicearray.is_cuda_ndarray(obj):
+            return obj
+        return cuda.as_cuda_array(obj)
+
+    def to_device(self, hostary, stream):
+        return cuda.to_device(hostary, stream=stream)
+
+    def to_host(self, devary, stream):
+        return devary.copy_to_host(stream=stream)
+
+    def allocate_device_array(self, shape, dtype, stream):
+        return cuda.device_array(shape=shape, dtype=dtype, stream=stream)
+
+    def broadcast_device(self, ary, shape):
+        ax_differs = [ax for ax in range(len(shape))
+                      if ax >= ary.ndim
+                      or ary.shape[ax] != shape[ax]]
+
+        missingdim = len(shape) - len(ary.shape)
+        strides = [0] * missingdim + list(ary.strides)
+
+        for ax in ax_differs:
+            strides[ax] = 0
+
+        return cuda.cudadrv.devicearray.DeviceNDArray(shape=shape,
+                                                      strides=strides,
+                                                      dtype=ary.dtype,
+                                                      gpu_data=ary.gpu_data)
+
+
+vectorizer_stager_source = '''
+def __vectorized_{name}({args}, __out__):
+    __tid__ = __cuda__.grid(1)
+    if __tid__ < __out__.shape[0]:
+        __out__[__tid__] = __core__({argitems})
+'''
+
+
+class CUDAVectorize(deviceufunc.DeviceVectorize):
+    def _compile_core(self, sig):
+        cudevfn = cuda.jit(sig, device=True, inline=True)(self.pyfunc)
+        return cudevfn, cudevfn.overloads[sig.args].signature.return_type
+
+    def _get_globals(self, corefn):
+        glbl = self.pyfunc.__globals__.copy()
+        glbl.update({'__cuda__': cuda,
+                     '__core__': corefn})
+        return glbl
+
+    def _compile_kernel(self, fnobj, sig):
+        return cuda.jit(fnobj)
+
+    def build_ufunc(self):
+        return CUDAUFuncDispatcher(self.kernelmap, self.pyfunc)
+
+    @property
+    def _kernel_template(self):
+        return vectorizer_stager_source
+
+
+# ------------------------------------------------------------------------------
+# Generalized CUDA ufuncs
+
+_gufunc_stager_source = '''
+def __gufunc_{name}({args}):
+    __tid__ = __cuda__.grid(1)
+    if __tid__ < {checkedarg}:
+        __core__({argitems})
+'''
+
+
+class CUDAGUFuncVectorize(deviceufunc.DeviceGUFuncVectorize):
+    def build_ufunc(self):
+        engine = deviceufunc.GUFuncEngine(self.inputsig, self.outputsig)
+        return CUDAGeneralizedUFunc(kernelmap=self.kernelmap,
+                                    engine=engine,
+                                    pyfunc=self.pyfunc)
+
+    def _compile_kernel(self, fnobj, sig):
+        return cuda.jit(sig)(fnobj)
+
+    @property
+    def _kernel_template(self):
+        return _gufunc_stager_source
+
+    def _get_globals(self, sig):
+        corefn = cuda.jit(sig, device=True)(self.pyfunc)
+        glbls = self.py_func.__globals__.copy()
+        glbls.update({'__cuda__': cuda,
+                      '__core__': corefn})
+        return glbls
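`CUDAUFuncDispatcher` and `CUDAVectorize` above are the machinery behind `@vectorize(target='cuda')`. A short sketch of the call and `reduce` paths, assuming a CUDA-capable device:

```python
# Sketch of exercising CUDAUFuncDispatcher through @vectorize; assumes a
# CUDA-capable device is present.
import numpy as np
from numba import vectorize

@vectorize(['float32(float32, float32)'], target='cuda')
def add(a, b):
    return a + b

x = np.arange(8, dtype=np.float32)
print(add(x, x))      # elementwise launch via CUDAUFuncMechanism.call
print(add.reduce(x))  # pairwise reduction via CUDAUFuncDispatcher.reduce -> 28.0
```

`reduce` requires a binary ufunc and a 1-D array; it repeatedly splits the device buffer in half and applies the ufunc in place, which is why the odd-length case above peels off one element first.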
numba_cuda-0.0.12.dist-info/LICENSE
@@ -0,0 +1,25 @@
+Copyright (c) 2012, Anaconda, Inc.
+Copyright (c) 2024, NVIDIA CORPORATION.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
numba_cuda-0.0.12.dist-info/METADATA
@@ -0,0 +1,68 @@
+Metadata-Version: 2.1
+Name: numba-cuda
+Version: 0.0.12
+Summary: CUDA target for Numba
+Author: Anaconda Inc., NVIDIA Corporation
+License: BSD 2-clause
+Project-URL: Homepage, https://github.com/rapidsai/numba-cuda
+Project-URL: Documentation, https://github.com/rapidsai/numba-cuda/blob/main/README.md
+Project-URL: Repository, https://github.com/rapidsai/numba-cuda
+Project-URL: License, https://github.com/rapidsai/numba-cuda/blob/main/LICENSE
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+
+# Numba CUDA Target
+
+An out-of-tree CUDA target for Numba.
+
+This contains an entire copy of Numba's CUDA target (the `numba.cuda` module),
+and a mechanism to ensure the code from this module (`numba_cuda.numba.cuda`) is
+used as the `numba.cuda` module instead of the code from the `numba` package.
+
+This is presently in an early state and is published for testing and feedback.
+
+## Building / testing
+
+Install as an editable install:
+
+```
+pip install -e .
+```
+
+Running tests:
+
+```
+python -m numba.runtests numba.cuda.tests
+```
+
+This should discover the `numba.cuda` module from the `numba_cuda` package. You
+can check where `numba.cuda` files are being located by running
+
+```
+python -c "from numba import cuda; print(cuda.__file__)"
+```
+
+which will show a path like:
+
+```
+<path to numba-cuda repo>/numba_cuda/numba/cuda/__init__.py
+```
+
+## Branching strategy
+
+Presently the `main` branch is being used to target the exact behavior of the
+built-in CUDA target. New feature development and bug fixes should be applied to
+`develop`. Once the `main` branch is widely tested and confirmed to work well as
+a drop-in replacement for the built-in `numba.cuda`, the `develop` branch will
+be merged in and new feature development will proceed on `main`.
+
+### Current PR targets
+
+- PRs related to replacing the built-in CUDA target's features should target
+  `main`.
+- PRs adding new features and bug fixes should target `develop`.
+
+### Future PR targets
+
+- In future, all PRs should target the `main` branch.
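The `_numba_cuda_redirector.pth` and `_numba_cuda_redirector.py` entries in the file list implement the redirection mechanism the README describes. Their contents are not shown in this diff; purely as an illustration of the general technique (not the shipped implementation, and all names below are hypothetical), a `.pth`-based redirector can install a meta path finder along these lines:

```python
# Hypothetical sketch of a .pth-driven import redirector; NOT the shipped
# _numba_cuda_redirector.py. A site-packages .pth file may contain a single
# "import <module>" line that Python executes at interpreter startup,
# before numba is ever imported.
import importlib.util
import sys


class _RedirectFinder:
    """Serve 'numba.cuda' imports from the 'numba_cuda' package instead."""

    def find_spec(self, name, path=None, target=None):
        if name == "numba.cuda" or name.startswith("numba.cuda."):
            # Assumed layout: numba_cuda/numba/cuda mirrors numba/cuda.
            spec = importlib.util.find_spec("numba_cuda." + name)
            if spec is not None:
                spec.name = name  # register the module under its public name
            return spec
        return None  # defer everything else to the normal import machinery


sys.meta_path.insert(0, _RedirectFinder())
```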
|