numba-cuda 0.0.1__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. _numba_cuda_redirector.pth +1 -0
  2. _numba_cuda_redirector.py +74 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +5 -0
  5. numba_cuda/_version.py +19 -0
  6. numba_cuda/numba/cuda/__init__.py +22 -0
  7. numba_cuda/numba/cuda/api.py +526 -0
  8. numba_cuda/numba/cuda/api_util.py +30 -0
  9. numba_cuda/numba/cuda/args.py +77 -0
  10. numba_cuda/numba/cuda/cg.py +62 -0
  11. numba_cuda/numba/cuda/codegen.py +378 -0
  12. numba_cuda/numba/cuda/compiler.py +422 -0
  13. numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
  14. numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
  15. numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
  16. numba_cuda/numba/cuda/cuda_paths.py +258 -0
  17. numba_cuda/numba/cuda/cudadecl.py +806 -0
  18. numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
  19. numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
  20. numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
  21. numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
  22. numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
  23. numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
  24. numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
  25. numba_cuda/numba/cuda/cudadrv/error.py +36 -0
  26. numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
  27. numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
  28. numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
  29. numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
  30. numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
  31. numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
  32. numba_cuda/numba/cuda/cudaimpl.py +1055 -0
  33. numba_cuda/numba/cuda/cudamath.py +140 -0
  34. numba_cuda/numba/cuda/decorators.py +189 -0
  35. numba_cuda/numba/cuda/descriptor.py +33 -0
  36. numba_cuda/numba/cuda/device_init.py +89 -0
  37. numba_cuda/numba/cuda/deviceufunc.py +908 -0
  38. numba_cuda/numba/cuda/dispatcher.py +1057 -0
  39. numba_cuda/numba/cuda/errors.py +59 -0
  40. numba_cuda/numba/cuda/extending.py +7 -0
  41. numba_cuda/numba/cuda/initialize.py +13 -0
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
  43. numba_cuda/numba/cuda/intrinsics.py +198 -0
  44. numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  45. numba_cuda/numba/cuda/kernels/reduction.py +262 -0
  46. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  47. numba_cuda/numba/cuda/libdevice.py +3382 -0
  48. numba_cuda/numba/cuda/libdevicedecl.py +17 -0
  49. numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
  50. numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
  51. numba_cuda/numba/cuda/mathimpl.py +448 -0
  52. numba_cuda/numba/cuda/models.py +48 -0
  53. numba_cuda/numba/cuda/nvvmutils.py +235 -0
  54. numba_cuda/numba/cuda/printimpl.py +86 -0
  55. numba_cuda/numba/cuda/random.py +292 -0
  56. numba_cuda/numba/cuda/simulator/__init__.py +38 -0
  57. numba_cuda/numba/cuda/simulator/api.py +110 -0
  58. numba_cuda/numba/cuda/simulator/compiler.py +9 -0
  59. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
  60. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
  61. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
  62. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
  63. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
  64. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
  65. numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
  66. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
  67. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
  68. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
  69. numba_cuda/numba/cuda/simulator/kernel.py +308 -0
  70. numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
  71. numba_cuda/numba/cuda/simulator/reduction.py +15 -0
  72. numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
  73. numba_cuda/numba/cuda/simulator_init.py +17 -0
  74. numba_cuda/numba/cuda/stubs.py +902 -0
  75. numba_cuda/numba/cuda/target.py +440 -0
  76. numba_cuda/numba/cuda/testing.py +202 -0
  77. numba_cuda/numba/cuda/tests/__init__.py +58 -0
  78. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
  79. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
  80. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
  81. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
  88. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
  89. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
  90. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
  91. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
  92. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
  93. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
  94. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
  95. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
  98. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
  100. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
  101. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
  102. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
  103. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
  104. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
  105. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
  106. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
  107. numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
  108. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
  109. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
  110. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
  111. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
  112. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
  113. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
  115. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
  117. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
  118. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
  119. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
  120. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
  121. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
  123. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
  124. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
  126. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
  127. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
  129. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
  131. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
  132. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
  133. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
  134. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
  135. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
  136. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
  137. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
  138. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
  139. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
  140. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
  141. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
  142. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
  143. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
  144. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
  145. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
  148. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
  149. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
  150. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
  151. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
  152. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
  153. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
  154. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
  155. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
  156. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
  157. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
  158. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
  159. numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
  161. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
  162. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
  163. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
  164. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
  165. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
  166. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
  167. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
  168. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
  169. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
  170. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
  171. numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
  172. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
  173. numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
  174. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
  175. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
  176. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
  177. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  178. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
  179. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
  180. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
  182. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
  183. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
  184. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
  185. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
  186. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
  187. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
  188. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
  192. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
  193. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
  194. numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
  195. numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
  197. numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  198. numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
  199. numba_cuda/numba/cuda/tests/data/error.cu +7 -0
  200. numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
  201. numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
  202. numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
  203. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
  204. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
  206. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
  207. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
  208. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
  209. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
  210. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
  211. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
  212. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
  213. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
  214. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
  216. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
  217. numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
  218. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
  219. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
  220. numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
  221. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
  222. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
  223. numba_cuda/numba/cuda/types.py +37 -0
  224. numba_cuda/numba/cuda/ufuncs.py +662 -0
  225. numba_cuda/numba/cuda/vector_types.py +209 -0
  226. numba_cuda/numba/cuda/vectorizers.py +252 -0
  227. numba_cuda-0.0.12.dist-info/LICENSE +25 -0
  228. numba_cuda-0.0.12.dist-info/METADATA +68 -0
  229. numba_cuda-0.0.12.dist-info/RECORD +231 -0
  230. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
  231. numba_cuda-0.0.1.dist-info/METADATA +0 -10
  232. numba_cuda-0.0.1.dist-info/RECORD +0 -5
  233. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,209 @@
1
+ # CUDA built-in Vector Types
2
+ # https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#built-in-vector-types
3
+
4
+ from typing import List, Tuple, Dict
5
+
6
+ from numba import types
7
+ from numba.core import cgutils
8
+ from numba.core.extending import make_attribute_wrapper, models, register_model
9
+ from numba.core.imputils import Registry as ImplRegistry
10
+ from numba.core.typing.templates import ConcreteTemplate
11
+ from numba.core.typing.templates import Registry as TypingRegistry
12
+ from numba.core.typing.templates import signature
13
+ from numba.cuda import stubs
14
+ from numba.cuda.errors import CudaLoweringError
15
+
16
# Module-local registries: one collects the typing templates and one the
# lowering implementations declared in this file.
typing_registry = TypingRegistry()
impl_registry = ImplRegistry()

# Short aliases for the registration entry points used further down.
register = typing_registry.register
register_attr = typing_registry.register_attr
register_global = typing_registry.register_global
lower = impl_registry.lower
23
+
24
+
25
class VectorType(types.Type):
    """Numba type describing a vector made of same-typed, named elements.

    Parameters
    ----------
    name: str
        Name forwarded to the ``types.Type`` constructor.
    base_type:
        Type shared by every element of the vector.
    attr_names:
        One attribute name per element; its length defines the element count.
    user_facing_object:
        Object stored with the type and handed back unchanged by
        ``user_facing_object``.
    """

    def __init__(self, name, base_type, attr_names, user_facing_object):
        # Store the description first, then let the base class take the name.
        self._base_type = base_type
        self._attr_names = attr_names
        self._user_facing_object = user_facing_object
        super().__init__(name=name)

    @property
    def base_type(self):
        """Type of each element."""
        return self._base_type

    @property
    def attr_names(self):
        """Attribute names, one per element, as given at construction."""
        return self._attr_names

    @property
    def num_elements(self):
        """Number of elements, i.e. the number of attribute names."""
        return len(self._attr_names)

    @property
    def user_facing_object(self):
        """Object supplied at construction as the user-facing handle."""
        return self._user_facing_object
47
+
48
+
49
def make_vector_type(
    name: str,
    base_type: types.Type,
    attr_names: Tuple[str, ...],
    user_facing_object
) -> types.Type:
    """Build a vector type and register its data model and attributes.

    Parameters
    ----------
    name: str
        The name of the type.
    base_type: numba.types.Type
        The primitive type for each element in the vector.
    attr_names: tuple of str
        Name for each attribute.
    user_facing_object: object
        The handle to be used in cuda kernel.

    Returns
    -------
    An instance of a freshly created ``VectorType`` subclass. A new subclass
    is made on every call, so the model and attribute wrappers registered
    here are keyed on that subclass only.
    """

    class _VectorType(VectorType):
        """Internal instantiation of VectorType."""

    vector_type = _VectorType(name, base_type, attr_names, user_facing_object)

    @register_model(_VectorType)
    class VectorTypeModel(models.StructModel):
        """Struct data model with one ``base_type`` member per attribute."""

        def __init__(self, dmm, fe_type):
            fields = [(field, base_type) for field in attr_names]
            super().__init__(dmm, fe_type, fields)

    # Expose each struct member under the attribute of the same name.
    for field in attr_names:
        make_attribute_wrapper(_VectorType, field, field)

    return vector_type
85
+
86
+
87
def enable_vector_type_ctor(
    vector_type: VectorType, overloads: List[List[types.Type]]
):
    """Create typing and lowering for vector type constructor.

    Parameters
    ----------
    vector_type: VectorType
        The type whose constructor to type and lower.
    overloads: List of argument types
        A list containing different overloads of the constructor. Each base type
        in the argument list should either be primitive type or VectorType.

    Notes
    -----
    The lowering generated for an overload raises ``CudaLoweringError`` when
    the flattened arguments do not supply exactly
    ``vector_type.num_elements`` values.
    """
    ctor = vector_type.user_facing_object

    # Typing: one concrete signature per overload, each returning vector_type.
    @register
    class CtorTemplate(ConcreteTemplate):
        key = ctor
        cases = [signature(vector_type, *arglist) for arglist in overloads]

    register_global(ctor, types.Function(CtorTemplate))

    # Lowering

    def make_lowering(fml_arg_list):
        """Meta function to create a lowering for the constructor. Flattens
        the arguments by converting vector_type into load instructions for each
        of its attributes. Such as float2 -> float2.x, float2.y.
        """

        def lowering(context, builder, sig, actual_args):
            # A list of elements to assign from
            source_list = []
            # Convert the list of argument types to a list of load IRs.
            for argidx, fml_arg in enumerate(fml_arg_list):
                if isinstance(fml_arg, VectorType):
                    pxy = cgutils.create_struct_proxy(fml_arg)(
                        context, builder, actual_args[argidx]
                    )
                    source_list.extend(
                        getattr(pxy, attr) for attr in fml_arg.attr_names
                    )
                else:
                    # assumed primitive type
                    source_list.append(actual_args[argidx])

            if len(source_list) != vector_type.num_elements:
                # Both literals need the ``f`` prefix: adjacent string
                # literals are concatenated, but each is formatted on its own.
                raise CudaLoweringError(
                    f"Unmatched number of source elements ({len(source_list)}) "
                    f"and target elements ({vector_type.num_elements})."
                )

            out = cgutils.create_struct_proxy(vector_type)(context, builder)

            # Fill the target struct attribute by attribute, in order.
            for attr_name, source in zip(vector_type.attr_names, source_list):
                setattr(out, attr_name, source)
            return out._getvalue()

        return lowering

    # One lowering per overload, each closing over its own formal arguments.
    for arglist in overloads:
        lowering = make_lowering(arglist)
        lower(ctor, *arglist)(lowering)
150
+
151
+
152
# Maps a vector type's name to its VectorType instance; populated by
# `_initialize()` and read by `build_constructor_overloads()`.
vector_types : Dict[str, VectorType] = {}
153
+
154
+
155
def build_constructor_overloads(base_type, vty_name, num_elements, arglists, l):
    """
    Enumerate constructor argument lists for a vector type.

    Every ordered way of filling ``num_elements`` slots is generated, where a
    single slot may come from ``base_type`` or from the 1-element vector type
    and a run of ``k`` slots may come from the ``k``-element vector type of
    the same family (looked up in ``vector_types`` by replacing the last
    character of ``vty_name`` with ``k``).

    ``l`` is the prefix accumulated so far; it is mutated while recursing and
    restored before returning. Completed argument lists are appended to
    ``arglists`` as copies.
    """

    # TODO: speed up with memoization
    if num_elements == 0:
        # Every slot is filled: record a snapshot of the current prefix.
        arglists.append(l[:])
        return

    family = vty_name[:-1]

    def extend_with(component, consumed):
        # Try ``component`` as the next argument, then backtrack.
        l.append(component)
        build_constructor_overloads(
            base_type, vty_name, num_elements - consumed, arglists, l
        )
        l.pop()

    for width in range(1, num_elements + 1):
        if width == 1:
            # A single slot can also be filled by the primitive type itself.
            extend_with(base_type, width)
        extend_with(vector_types[f"{family}{width}"], width)
185
+
186
+
187
def _initialize():
    """
    Create a vector type for every stub in ``stubs._vector_type_stubs``,
    record it in `vector_types`, then enable the constructors.

    The work is split in two passes because building the constructor
    overloads of one type looks up other entries of `vector_types`.
    """
    component_names = ("x", "y", "z", "w")

    # Pass 1: create and record the types.
    for stub in stubs._vector_type_stubs:
        type_name = stub.__name__
        # The stub name minus its last two characters names the element type
        # in ``types``; its final character is the element count.
        element_type = getattr(types, type_name[:-2])
        count = int(type_name[-1])
        vector_types[type_name] = make_vector_type(
            type_name, element_type, component_names[:count], stub
        )

    # Pass 2: type and lower every constructor overload.
    for vty in vector_types.values():
        overloads = []
        build_constructor_overloads(
            vty.base_type, vty.name, vty.num_elements, overloads, []
        )
        enable_vector_type_ctor(vty, overloads)
207
+
208
+
209
+ _initialize()
@@ -0,0 +1,252 @@
1
+ from numba import cuda
2
+ from numpy import array as np_array
3
+ from numba.cuda import deviceufunc
4
+ from numba.cuda.deviceufunc import (UFuncMechanism, GeneralizedUFunc,
5
+ GUFuncCallSteps)
6
+
7
+
8
class CUDAUFuncDispatcher(object):
    """
    Invoke the CUDA ufunc specialization for the given inputs.
    """

    def __init__(self, types_to_retty_kernels, pyfunc):
        # Mapping handed to CUDAUFuncMechanism.call on every invocation;
        # `reduce` reads the length of its first key as the input count.
        self.functions = types_to_retty_kernels
        # Present the dispatcher under the wrapped Python function's name.
        self.__name__ = pyfunc.__name__

    def __call__(self, *args, **kws):
        """
        *args: numpy arrays or DeviceArrayBase (created by cuda.to_device).
               Cannot mix the two types in one call.

        **kws:
            stream -- cuda stream; when defined, asynchronous mode is used.
            out    -- output array. Can be a numpy array or DeviceArrayBase
                      depending on the input arguments.  Type must match
                      the input arguments.
        """
        return CUDAUFuncMechanism.call(self.functions, args, kws)

    def reduce(self, arg, stream=0):
        """
        Reduce the 1d array ``arg`` to a single element with this ufunc.

        Raises TypeError for an empty array; a one-element array returns
        ``arg[0]`` without touching the device. Otherwise the reduction runs
        on ``stream`` (a new stream is created when ``stream`` is falsy) and
        the result is copied back to the host before returning.

        The two checks below are ``assert`` statements, so they are skipped
        when Python runs with assertions disabled.
        """
        assert len(list(self.functions.keys())[0]) == 2, "must be a binary " \
                                                         "ufunc"
        assert arg.ndim == 1, "must use 1d array"

        n = arg.shape[0]
        # Keeps references to intermediate device arrays until the stream
        # has been synchronized.
        gpu_mems = []

        if n == 0:
            raise TypeError("Reduction on an empty array.")
        elif n == 1:    # nothing to do
            return arg[0]

        # always use a stream
        stream = stream or cuda.stream()
        with stream.auto_synchronize():
            # transfer memory to device if necessary
            # NOTE(review): a device array is used directly and __reduce
            # writes kernel output into pieces obtained from ``mem.split``;
            # presumably those are views, so the caller's array may be
            # overwritten -- confirm against DeviceNDArray.split.
            if cuda.cudadrv.devicearray.is_cuda_ndarray(arg):
                mem = arg
            else:
                mem = cuda.to_device(arg, stream)
            # do reduction
            out = self.__reduce(mem, gpu_mems, stream)
            # use a small buffer to store the result element
            buf = np_array((1,), dtype=arg.dtype)
            out.copy_to_host(buf, stream=stream)

        return buf[0]

    def __reduce(self, mem, gpu_mems, stream):
        """
        Recursive step of `reduce`.

        Odd length: split off the last element, reduce the rest, then combine
        that result with the split-off element. Even length: combine the two
        halves elementwise into the left half and recurse on it until its
        length is 1. Every intermediate array is appended to ``gpu_mems``.
        """
        n = mem.shape[0]
        if n % 2 != 0:  # odd?
            # presumably split(k) yields consecutive chunks of k elements,
            # here n - 1 elements followed by the single remainder -- confirm
            fatcut, thincut = mem.split(n - 1)
            # prevent freeing during async mode
            gpu_mems.append(fatcut)
            gpu_mems.append(thincut)
            # execute the kernel
            out = self.__reduce(fatcut, gpu_mems, stream)
            gpu_mems.append(out)
            return self(out, thincut, out=out, stream=stream)
        else:  # even?
            left, right = mem.split(n // 2)
            # prevent freeing during async mode
            gpu_mems.append(left)
            gpu_mems.append(right)
            # execute the kernel
            self(left, right, out=left, stream=stream)
            if n // 2 > 1:
                return self.__reduce(left, gpu_mems, stream)
            else:
                return left
81
+
82
+
83
class _CUDAGUFuncCallSteps(GUFuncCallSteps):
    """CUDA versions of the gufunc call steps.

    Every transfer, allocation and launch is issued on the stream taken from
    the ``stream`` keyword argument of the call (0 when it is absent).
    """

    __slots__ = ['_stream']

    def __init__(self, nin, nout, args, kwargs):
        super().__init__(nin, nout, args, kwargs)
        self._stream = kwargs.get('stream', 0)

    def is_device_array(self, obj):
        """Return ``cuda.is_cuda_array(obj)``."""
        return cuda.is_cuda_array(obj)

    def as_device_array(self, obj):
        """Return ``obj`` as a device array."""
        # Numba device arrays are returned untouched: routing them through
        # as_cuda_array would export the array as a Producer and import it
        # again as a Consumer, which by default synchronizes on the array's
        # stream (if it has one).
        if not cuda.cudadrv.devicearray.is_cuda_ndarray(obj):
            return cuda.as_cuda_array(obj)
        return obj

    def to_device(self, hostary):
        """Copy ``hostary`` to the device on the call's stream."""
        return cuda.to_device(hostary, stream=self._stream)

    def to_host(self, devary, hostary):
        """Copy ``devary`` into ``hostary`` on the call's stream."""
        return devary.copy_to_host(hostary, stream=self._stream)

    def allocate_device_array(self, shape, dtype):
        """Allocate a device array of ``shape``/``dtype`` on the call's stream."""
        return cuda.device_array(shape=shape, dtype=dtype, stream=self._stream)

    def launch_kernel(self, kernel, nelem, args):
        """Launch ``kernel`` over ``nelem`` elements via ``forall``."""
        configured = kernel.forall(nelem, stream=self._stream)
        configured(*args)
117
+
118
+
119
class CUDAGeneralizedUFunc(GeneralizedUFunc):
    """Generalized ufunc that uses the CUDA call steps and device arrays."""

    def __init__(self, kernelmap, engine, pyfunc):
        # Take the wrapped function's name before delegating to the base.
        self.__name__ = pyfunc.__name__
        super().__init__(kernelmap, engine)

    @property
    def _call_steps(self):
        """Class implementing the call steps for CUDA."""
        return _CUDAGUFuncCallSteps

    def _broadcast_scalar_input(self, ary, shape):
        """Wrap ``ary``'s device data as an array of ``shape`` with a single
        zero stride."""
        device_ndarray = cuda.cudadrv.devicearray.DeviceNDArray
        return device_ndarray(
            shape=shape,
            strides=(0,),
            dtype=ary.dtype,
            gpu_data=ary.gpu_data,
        )

    def _broadcast_add_axis(self, ary, newshape):
        """Wrap ``ary``'s device data as an array of ``newshape`` by
        prepending zero strides for the missing leading dimensions."""
        extra_dims = len(newshape) - len(ary.shape)
        padded_strides = (0,) * extra_dims + ary.strides
        device_ndarray = cuda.cudadrv.devicearray.DeviceNDArray
        return device_ndarray(
            shape=newshape,
            strides=padded_strides,
            dtype=ary.dtype,
            gpu_data=ary.gpu_data,
        )
142
+
143
+
144
class CUDAUFuncMechanism(UFuncMechanism):
    """
    Provide CUDA specialization
    """
    DEFAULT_STREAM = 0

    def launch(self, func, count, stream, args):
        """Run ``func`` over ``count`` elements on ``stream`` via ``forall``."""
        configured = func.forall(count, stream=stream)
        configured(*args)

    def is_device_array(self, obj):
        """Return ``cuda.is_cuda_array(obj)``."""
        return cuda.is_cuda_array(obj)

    def as_device_array(self, obj):
        """Return ``obj`` as a device array."""
        # Numba device arrays are returned untouched: routing them through
        # as_cuda_array would export the array as a Producer and import it
        # again as a Consumer, which by default synchronizes on the array's
        # stream (if it has one).
        if not cuda.cudadrv.devicearray.is_cuda_ndarray(obj):
            return cuda.as_cuda_array(obj)
        return obj

    def to_device(self, hostary, stream):
        """Copy ``hostary`` to the device on ``stream``."""
        return cuda.to_device(hostary, stream=stream)

    def to_host(self, devary, stream):
        """Copy ``devary`` back to the host on ``stream``."""
        return devary.copy_to_host(stream=stream)

    def allocate_device_array(self, shape, dtype, stream):
        """Allocate a device array of ``shape``/``dtype`` on ``stream``."""
        return cuda.device_array(shape=shape, dtype=dtype, stream=stream)

    def broadcast_device(self, ary, shape):
        """Wrap ``ary``'s device data as an array of ``shape``.

        Strides are padded with leading zeros for the missing dimensions, and
        the stride of every axis that ``ary`` lacks, or whose extent differs
        from ``shape`` at the same index, is set to zero.
        """
        mismatched = [
            axis for axis in range(len(shape))
            if not (axis < ary.ndim and ary.shape[axis] == shape[axis])
        ]

        padding = len(shape) - len(ary.shape)
        new_strides = [0] * padding + list(ary.strides)
        for axis in mismatched:
            new_strides[axis] = 0

        device_ndarray = cuda.cudadrv.devicearray.DeviceNDArray
        return device_ndarray(
            shape=shape,
            strides=new_strides,
            dtype=ary.dtype,
            gpu_data=ary.gpu_data,
        )
190
+
191
+
192
# Source template for the kernel wrapping a scalar core function: each thread
# takes its 1D grid index and, when that index is inside the output, stores
# the core function's result there. The text is formatted with ``name``,
# ``args`` and ``argitems`` and then compiled, so the indentation inside the
# literal is significant and must form a valid function body.
vectorizer_stager_source = '''
def __vectorized_{name}({args}, __out__):
    __tid__ = __cuda__.grid(1)
    if __tid__ < __out__.shape[0]:
        __out__[__tid__] = __core__({argitems})
'''
198
+
199
+
200
class CUDAVectorize(deviceufunc.DeviceVectorize):
    """Vectorize builder that compiles with ``cuda.jit``."""

    def _compile_core(self, sig):
        """Compile the Python function as an inlined device function for
        ``sig`` and return it together with the compiled return type."""
        device_fn = cuda.jit(sig, device=True, inline=True)(self.pyfunc)
        return_type = device_fn.overloads[sig.args].signature.return_type
        return device_fn, return_type

    def _get_globals(self, corefn):
        """Return a copy of the Python function's globals extended with the
        names the kernel template refers to."""
        namespace = self.pyfunc.__globals__.copy()
        namespace['__cuda__'] = cuda
        namespace['__core__'] = corefn
        return namespace

    def _compile_kernel(self, fnobj, sig):
        """Wrap ``fnobj`` with ``cuda.jit``; ``sig`` is not used here."""
        return cuda.jit(fnobj)

    def build_ufunc(self):
        """Return a dispatcher over the compiled kernels."""
        return CUDAUFuncDispatcher(self.kernelmap, self.pyfunc)

    @property
    def _kernel_template(self):
        """Source template used to generate the kernel."""
        return vectorizer_stager_source
220
+
221
+
222
+ # ------------------------------------------------------------------------------
223
+ # Generalized CUDA ufuncs
224
+
225
# Source template for the kernel wrapping a gufunc core function: each thread
# takes its 1D grid index and calls the core function when that index is
# below ``checkedarg``. The text is formatted with ``name``, ``args``,
# ``checkedarg`` and ``argitems`` and then compiled, so the indentation
# inside the literal is significant and must form a valid function body.
_gufunc_stager_source = '''
def __gufunc_{name}({args}):
    __tid__ = __cuda__.grid(1)
    if __tid__ < {checkedarg}:
        __core__({argitems})
'''
231
+
232
+
233
class CUDAGUFuncVectorize(deviceufunc.DeviceGUFuncVectorize):
    """Generalized-ufunc builder that compiles with ``cuda.jit``."""

    def build_ufunc(self):
        """Return a CUDA generalized ufunc over the compiled kernels."""
        gufunc_engine = deviceufunc.GUFuncEngine(self.inputsig, self.outputsig)
        return CUDAGeneralizedUFunc(
            kernelmap=self.kernelmap,
            engine=gufunc_engine,
            pyfunc=self.pyfunc,
        )

    def _compile_kernel(self, fnobj, sig):
        """Compile ``fnobj`` as a kernel with signature ``sig``."""
        return cuda.jit(sig)(fnobj)

    @property
    def _kernel_template(self):
        """Source template used to generate the kernel."""
        return _gufunc_stager_source

    def _get_globals(self, sig):
        """Compile the core device function for ``sig`` and return a copy of
        the Python function's globals extended with the names the kernel
        template refers to."""
        corefn = cuda.jit(sig, device=True)(self.pyfunc)
        # NOTE(review): reads `py_func` here but `pyfunc` elsewhere in this
        # class -- presumably both are provided by the base class; confirm.
        namespace = self.py_func.__globals__.copy()
        namespace['__cuda__'] = cuda
        namespace['__core__'] = corefn
        return namespace
@@ -0,0 +1,25 @@
1
+ Copyright (c) 2012, Anaconda, Inc.
2
+ Copyright (c) 2024, NVIDIA CORPORATION.
3
+ All rights reserved.
4
+
5
+ Redistribution and use in source and binary forms, with or without
6
+ modification, are permitted provided that the following conditions are
7
+ met:
8
+
9
+ Redistributions of source code must retain the above copyright notice,
10
+ this list of conditions and the following disclaimer.
11
+
12
+ Redistributions in binary form must reproduce the above copyright
13
+ notice, this list of conditions and the following disclaimer in the
14
+ documentation and/or other materials provided with the distribution.
15
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,68 @@
1
+ Metadata-Version: 2.1
2
+ Name: numba-cuda
3
+ Version: 0.0.12
4
+ Summary: CUDA target for Numba
5
+ Author: Anaconda Inc., NVIDIA Corporation
6
+ License: BSD 2-clause
7
+ Project-URL: Homepage, https://github.com/rapidsai/numba-cuda
8
+ Project-URL: Documentation, https://github.com/rapidsai/numba-cuda/blob/main/README.md
9
+ Project-URL: Repository, https://github.com/rapidsai/numba-cuda
10
+ Project-URL: License, https://github.com/rapidsai/numba-cuda/blob/main/LICENSE
11
+ Requires-Python: >=3.9
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+
15
+ # Numba CUDA Target
16
+
17
+ An out-of-tree CUDA target for Numba.
18
+
19
+ This contains an entire copy of Numba's CUDA target (the `numba.cuda` module),
20
+ and a mechanism to ensure the code from this module (`numba_cuda.numba.cuda`) is
21
+ used as the `numba.cuda` module instead of the code from the `numba` package.
22
+
23
+ This is presently in an early state and is published for testing and feedback.
24
+
25
+ ## Building / testing
26
+
27
+ Install as an editable install:
28
+
29
+ ```
30
+ pip install -e .
31
+ ```
32
+
33
+ Running tests:
34
+
35
+ ```
36
+ python -m numba.runtests numba.cuda.tests
37
+ ```
38
+
39
+ This should discover the `numba.cuda` module from the `numba_cuda` package. You
40
+ can check where `numba.cuda` files are being located by running
41
+
42
+ ```
43
+ python -c "from numba import cuda; print(cuda.__file__)"
44
+ ```
45
+
46
+ which will show a path like:
47
+
48
+ ```
49
+ <path to numba-cuda repo>/numba_cuda/numba/cuda/__init__.py
50
+ ```
51
+
52
+ ## Branching strategy
53
+
54
+ Presently the `main` branch is being used to target the exact behavior of the
55
+ built-in CUDA target. New feature development and bug fixes should be applied to
56
+ `develop`. Once the `main` branch is widely tested and confirmed to work well as
57
+ a drop-in replacement for the built-in `numba.cuda`, the `develop` branch will
58
+ be merged in and new feature development will proceed on `main`.
59
+
60
+ ### Current PR targets
61
+
62
+ - PRs related to replacing the built-in CUDA target's features should target
63
+ `main`.
64
+ - PRs adding new features and bug fixes should target `develop`.
65
+
66
+ ### Future PR targets
67
+
68
+ - In future, all PRs should target the `main` branch.