numba-cuda 0.0.1__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. _numba_cuda_redirector.pth +1 -0
  2. _numba_cuda_redirector.py +74 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +5 -0
  5. numba_cuda/_version.py +19 -0
  6. numba_cuda/numba/cuda/__init__.py +22 -0
  7. numba_cuda/numba/cuda/api.py +526 -0
  8. numba_cuda/numba/cuda/api_util.py +30 -0
  9. numba_cuda/numba/cuda/args.py +77 -0
  10. numba_cuda/numba/cuda/cg.py +62 -0
  11. numba_cuda/numba/cuda/codegen.py +378 -0
  12. numba_cuda/numba/cuda/compiler.py +422 -0
  13. numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
  14. numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
  15. numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
  16. numba_cuda/numba/cuda/cuda_paths.py +258 -0
  17. numba_cuda/numba/cuda/cudadecl.py +806 -0
  18. numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
  19. numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
  20. numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
  21. numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
  22. numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
  23. numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
  24. numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
  25. numba_cuda/numba/cuda/cudadrv/error.py +36 -0
  26. numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
  27. numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
  28. numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
  29. numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
  30. numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
  31. numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
  32. numba_cuda/numba/cuda/cudaimpl.py +1055 -0
  33. numba_cuda/numba/cuda/cudamath.py +140 -0
  34. numba_cuda/numba/cuda/decorators.py +189 -0
  35. numba_cuda/numba/cuda/descriptor.py +33 -0
  36. numba_cuda/numba/cuda/device_init.py +89 -0
  37. numba_cuda/numba/cuda/deviceufunc.py +908 -0
  38. numba_cuda/numba/cuda/dispatcher.py +1057 -0
  39. numba_cuda/numba/cuda/errors.py +59 -0
  40. numba_cuda/numba/cuda/extending.py +7 -0
  41. numba_cuda/numba/cuda/initialize.py +13 -0
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
  43. numba_cuda/numba/cuda/intrinsics.py +198 -0
  44. numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  45. numba_cuda/numba/cuda/kernels/reduction.py +262 -0
  46. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  47. numba_cuda/numba/cuda/libdevice.py +3382 -0
  48. numba_cuda/numba/cuda/libdevicedecl.py +17 -0
  49. numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
  50. numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
  51. numba_cuda/numba/cuda/mathimpl.py +448 -0
  52. numba_cuda/numba/cuda/models.py +48 -0
  53. numba_cuda/numba/cuda/nvvmutils.py +235 -0
  54. numba_cuda/numba/cuda/printimpl.py +86 -0
  55. numba_cuda/numba/cuda/random.py +292 -0
  56. numba_cuda/numba/cuda/simulator/__init__.py +38 -0
  57. numba_cuda/numba/cuda/simulator/api.py +110 -0
  58. numba_cuda/numba/cuda/simulator/compiler.py +9 -0
  59. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
  60. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
  61. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
  62. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
  63. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
  64. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
  65. numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
  66. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
  67. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
  68. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
  69. numba_cuda/numba/cuda/simulator/kernel.py +308 -0
  70. numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
  71. numba_cuda/numba/cuda/simulator/reduction.py +15 -0
  72. numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
  73. numba_cuda/numba/cuda/simulator_init.py +17 -0
  74. numba_cuda/numba/cuda/stubs.py +902 -0
  75. numba_cuda/numba/cuda/target.py +440 -0
  76. numba_cuda/numba/cuda/testing.py +202 -0
  77. numba_cuda/numba/cuda/tests/__init__.py +58 -0
  78. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
  79. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
  80. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
  81. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
  88. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
  89. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
  90. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
  91. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
  92. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
  93. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
  94. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
  95. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
  98. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
  100. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
  101. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
  102. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
  103. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
  104. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
  105. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
  106. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
  107. numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
  108. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
  109. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
  110. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
  111. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
  112. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
  113. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
  115. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
  117. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
  118. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
  119. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
  120. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
  121. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
  123. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
  124. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
  126. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
  127. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
  129. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
  131. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
  132. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
  133. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
  134. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
  135. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
  136. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
  137. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
  138. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
  139. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
  140. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
  141. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
  142. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
  143. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
  144. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
  145. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
  148. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
  149. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
  150. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
  151. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
  152. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
  153. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
  154. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
  155. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
  156. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
  157. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
  158. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
  159. numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
  161. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
  162. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
  163. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
  164. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
  165. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
  166. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
  167. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
  168. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
  169. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
  170. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
  171. numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
  172. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
  173. numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
  174. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
  175. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
  176. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
  177. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  178. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
  179. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
  180. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
  182. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
  183. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
  184. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
  185. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
  186. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
  187. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
  188. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
  192. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
  193. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
  194. numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
  195. numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
  197. numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  198. numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
  199. numba_cuda/numba/cuda/tests/data/error.cu +7 -0
  200. numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
  201. numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
  202. numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
  203. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
  204. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
  206. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
  207. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
  208. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
  209. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
  210. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
  211. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
  212. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
  213. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
  214. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
  216. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
  217. numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
  218. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
  219. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
  220. numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
  221. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
  222. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
  223. numba_cuda/numba/cuda/types.py +37 -0
  224. numba_cuda/numba/cuda/ufuncs.py +662 -0
  225. numba_cuda/numba/cuda/vector_types.py +209 -0
  226. numba_cuda/numba/cuda/vectorizers.py +252 -0
  227. numba_cuda-0.0.12.dist-info/LICENSE +25 -0
  228. numba_cuda-0.0.12.dist-info/METADATA +68 -0
  229. numba_cuda-0.0.12.dist-info/RECORD +231 -0
  230. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
  231. numba_cuda-0.0.1.dist-info/METADATA +0 -10
  232. numba_cuda-0.0.1.dist-info/RECORD +0 -5
  233. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,806 @@
1
+ import operator
2
+ from numba.core import types
3
+ from numba.core.typing.npydecl import (parse_dtype, parse_shape,
4
+ register_number_classes,
5
+ register_numpy_ufunc,
6
+ trigonometric_functions,
7
+ comparison_functions,
8
+ math_operations,
9
+ bit_twiddling_functions)
10
+ from numba.core.typing.templates import (AttributeTemplate, ConcreteTemplate,
11
+ AbstractTemplate, CallableTemplate,
12
+ signature, Registry)
13
+ from numba.cuda.types import dim3
14
+ from numba.core.typeconv import Conversion
15
+ from numba import cuda
16
+ from numba.cuda.compiler import declare_device_function_template
17
+
18
# Typing registry for this module. The three bound methods below are used as
# decorators throughout the file to add templates to the registry.
registry = Registry()
register = registry.register
register_attr = registry.register_attr
register_global = registry.register_global

# Hand the global-registration hook to the shared NumPy declaration helper so
# that the number classes are registered in this registry too.
register_number_classes(register_global)
24
+
25
+
26
class Cuda_array_decl(CallableTemplate):
    """Shared typing for array-creation calls taking ``(shape, dtype)``.

    Subclasses only supply ``key``. The call types as a C-layout
    ``types.Array`` when the shape is an integer literal (or a tuple made up
    solely of integer literals) and both shape and dtype can be parsed;
    otherwise typing yields ``None``.
    """

    def generic(self):
        def typer(shape, dtype):
            # Only integer literals and tuples of integer literals are valid
            # shapes; anything else is rejected up front.
            if isinstance(shape, types.Integer):
                shape_is_literal = isinstance(shape, types.IntegerLiteral)
            elif isinstance(shape, (types.Tuple, types.UniTuple)):
                shape_is_literal = all(
                    isinstance(dim, types.IntegerLiteral) for dim in shape
                )
            else:
                shape_is_literal = False

            if not shape_is_literal:
                return None

            ndim = parse_shape(shape)
            nb_dtype = parse_dtype(dtype)
            if nb_dtype is None or ndim is None:
                return None

            return types.Array(dtype=nb_dtype, ndim=ndim, layout='C')

        return typer
48
+
49
+
50
@register
class Cuda_shared_array(Cuda_array_decl):
    """Typing for ``cuda.shared.array``; the rules come from the base class."""

    key = cuda.shared.array
53
+
54
+
55
@register
class Cuda_local_array(Cuda_array_decl):
    """Typing for ``cuda.local.array``; the rules come from the base class."""

    key = cuda.local.array
58
+
59
+
60
@register
class Cuda_const_array_like(CallableTemplate):
    """Typing for ``cuda.const.array_like``.

    The call is typed as returning exactly the type of its single argument.
    """

    key = cuda.const.array_like

    def generic(self):
        def typer(ndarray):
            # Result type is the argument type, unchanged.
            return ndarray

        return typer
68
+
69
+
70
@register
class Cuda_threadfence_device(ConcreteTemplate):
    """Typing for ``cuda.threadfence``: no arguments, returns ``none``."""

    key = cuda.threadfence
    cases = [signature(types.none)]
74
+
75
+
76
@register
class Cuda_threadfence_block(ConcreteTemplate):
    """Typing for ``cuda.threadfence_block``: no arguments, returns ``none``."""

    key = cuda.threadfence_block
    cases = [signature(types.none)]
80
+
81
+
82
@register
class Cuda_threadfence_system(ConcreteTemplate):
    """Typing for ``cuda.threadfence_system``: no arguments, returns ``none``."""

    key = cuda.threadfence_system
    cases = [signature(types.none)]
86
+
87
+
88
@register
class Cuda_syncwarp(ConcreteTemplate):
    """Typing for ``cuda.syncwarp``.

    Two forms are accepted: no arguments, or a single ``i4`` argument. Both
    return ``none``.
    """

    key = cuda.syncwarp
    cases = [
        signature(types.none),
        signature(types.none, types.i4),
    ]
92
+
93
+
94
@register
class Cuda_shfl_sync_intrinsic(ConcreteTemplate):
    """Typing for ``cuda.shfl_sync_intrinsic``.

    Every case takes five arguments, all ``i4`` except the third, which is
    the value type (``i4``, ``i8``, ``f4`` or ``f8``). The result is a
    two-element tuple of that value type and ``b1``.
    """

    key = cuda.shfl_sync_intrinsic
    cases = [
        signature(types.Tuple((value_ty, types.b1)),
                  types.i4, types.i4, value_ty, types.i4, types.i4)
        for value_ty in (types.i4, types.i8, types.f4, types.f8)
    ]
107
+
108
+
109
@register
class Cuda_vote_sync_intrinsic(ConcreteTemplate):
    """Typing for ``cuda.vote_sync_intrinsic``.

    Takes ``(i4, i4, b1)`` and returns a tuple ``(i4, b1)``.
    """

    key = cuda.vote_sync_intrinsic
    cases = [
        signature(types.Tuple((types.i4, types.b1)),
                  types.i4, types.i4, types.b1),
    ]
114
+
115
+
116
@register
class Cuda_match_any_sync(ConcreteTemplate):
    """Typing for ``cuda.match_any_sync``.

    Takes an ``i4`` followed by a value of type ``i4``, ``i8``, ``f4`` or
    ``f8``; always returns ``i4``.
    """

    key = cuda.match_any_sync
    cases = [
        signature(types.i4, types.i4, value_ty)
        for value_ty in (types.i4, types.i8, types.f4, types.f8)
    ]
125
+
126
+
127
@register
class Cuda_match_all_sync(ConcreteTemplate):
    """Typing for ``cuda.match_all_sync``.

    Takes an ``i4`` followed by a value of type ``i4``, ``i8``, ``f4`` or
    ``f8``; always returns a tuple ``(i4, b1)``.
    """

    key = cuda.match_all_sync
    cases = [
        signature(types.Tuple((types.i4, types.b1)), types.i4, value_ty)
        for value_ty in (types.i4, types.i8, types.f4, types.f8)
    ]
136
+
137
+
138
@register
class Cuda_activemask(ConcreteTemplate):
    """Typing for ``cuda.activemask``: no arguments, returns ``uint32``."""

    key = cuda.activemask
    cases = [signature(types.uint32)]
142
+
143
+
144
@register
class Cuda_lanemask_lt(ConcreteTemplate):
    """Typing for ``cuda.lanemask_lt``: no arguments, returns ``uint32``."""

    key = cuda.lanemask_lt
    cases = [signature(types.uint32)]
148
+
149
+
150
@register
class Cuda_popc(ConcreteTemplate):
    """
    Typing for ``cuda.popc``: one integer argument, result of the same type.

    Supported types from `llvm.popc`
    [here](http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics)
    """

    key = cuda.popc
    cases = [
        signature(int_ty, int_ty)
        for int_ty in (types.int8, types.int16, types.int32, types.int64,
                       types.uint8, types.uint16, types.uint32, types.uint64)
    ]
167
+
168
+
169
@register
class Cuda_fma(ConcreteTemplate):
    """
    Typing for ``cuda.fma``: three operands and a result of one float type.

    Supported types from `llvm.fma`
    [here](https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#standard-c-library-intrinics)
    """

    key = cuda.fma
    cases = [
        signature(float_ty, float_ty, float_ty, float_ty)
        for float_ty in (types.float32, types.float64)
    ]
180
+
181
+
182
@register
class Cuda_hfma(ConcreteTemplate):
    """Typing for ``cuda.fp16.hfma``: three ``float16`` in, ``float16`` out."""

    key = cuda.fp16.hfma
    cases = [
        signature(types.float16, types.float16, types.float16, types.float16),
    ]
188
+
189
+
190
@register
class Cuda_cbrt(ConcreteTemplate):
    """Typing for ``cuda.cbrt``: ``float32`` or ``float64``, same type out."""

    key = cuda.cbrt
    cases = [
        signature(float_ty, float_ty)
        for float_ty in (types.float32, types.float64)
    ]
198
+
199
+
200
@register
class Cuda_brev(ConcreteTemplate):
    """Typing for ``cuda.brev``: ``uint32`` or ``uint64``, same type out."""

    key = cuda.brev
    cases = [
        signature(uint_ty, uint_ty)
        for uint_ty in (types.uint32, types.uint64)
    ]
207
+
208
+
209
@register
class Cuda_clz(ConcreteTemplate):
    """
    Typing for ``cuda.clz``: one integer argument, result of the same type.

    Supported types from `llvm.ctlz`
    [here](http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics)
    """

    key = cuda.clz
    cases = [
        signature(int_ty, int_ty)
        for int_ty in (types.int8, types.int16, types.int32, types.int64,
                       types.uint8, types.uint16, types.uint32, types.uint64)
    ]
226
+
227
+
228
@register
class Cuda_ffs(ConcreteTemplate):
    """
    Typing for ``cuda.ffs``: one integer argument, result always ``uint32``.

    Supported types from `llvm.cttz`
    [here](http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics)
    """

    key = cuda.ffs
    cases = [
        signature(types.uint32, int_ty)
        for int_ty in (types.int8, types.int16, types.int32, types.int64,
                       types.uint8, types.uint16, types.uint32, types.uint64)
    ]
245
+
246
+
247
@register
class Cuda_selp(AbstractTemplate):
    """Typing for ``cuda.selp(test, a, b)``.

    Both candidate values must have the same type, and that type must be one
    of the supported ones; the result then has that type. Any other
    combination leaves the call untyped by this template.
    """

    key = cuda.selp

    def generic(self, args, kws):
        assert not kws
        predicate, lhs, rhs = args

        # per docs
        # http://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp
        supported_types = (
            types.float64, types.float32,
            types.int16, types.uint16,
            types.int32, types.uint32,
            types.int64, types.uint64,
        )

        if lhs == rhs and lhs in supported_types:
            return signature(lhs, predicate, lhs, lhs)

        return None
266
+
267
+
268
def _genfp16_unary(l_key):
    """Create and register a ``float16 -> float16`` template keyed on *l_key*.

    Returns the registered template class.
    """
    fp16 = types.float16

    @register
    class Cuda_fp16_unary(ConcreteTemplate):
        key = l_key
        cases = [signature(fp16, fp16)]

    return Cuda_fp16_unary
275
+
276
+
277
def _genfp16_unary_operator(l_key):
    """Register a global ``float16`` unary typing for the callable *l_key*.

    The generated template only matches a call with exactly one ``float16``
    argument, typed as ``float16 -> float16``. Returns whatever the
    ``register_global(l_key)`` decorator returns for the template.
    """
    fp16 = types.float16

    @register_global(l_key)
    class Cuda_fp16_unary(AbstractTemplate):
        key = l_key

        def generic(self, args, kws):
            assert not kws
            if len(args) == 1 and args[0] == fp16:
                return signature(fp16, fp16)
            return None

    return Cuda_fp16_unary
288
+
289
+
290
def _genfp16_binary(l_key):
    """Create and register a ``(float16, float16) -> float16`` template.

    The template is keyed on *l_key*; the registered class is returned.
    """
    fp16 = types.float16

    @register
    class Cuda_fp16_binary(ConcreteTemplate):
        key = l_key
        cases = [signature(fp16, fp16, fp16)]

    return Cuda_fp16_binary
297
+
298
+
299
@register_global(float)
class Float(AbstractTemplate):
    """Typing for ``float(x)`` when ``x`` is a ``float16``.

    The call is typed as ``float16 -> float16``; any other argument type is
    left for other templates.
    """

    def generic(self, args, kws):
        assert not kws

        # Exactly one positional argument is expected; unpacking enforces it.
        (operand,) = args

        if operand == types.float16:
            return signature(operand, operand)

        return None
309
+
310
+
311
def _genfp16_binary_comparison(l_key):
    """Create and register a ``(float16, float16) -> b1`` template.

    The template is keyed on *l_key*; the registered class is returned.
    """
    fp16 = types.float16

    @register
    class Cuda_fp16_cmp(ConcreteTemplate):
        key = l_key
        cases = [signature(types.b1, fp16, fp16)]

    return Cuda_fp16_cmp
320
+
321
+ # If multiple ConcreteTemplates provide typing for a single function, then
322
+ # function resolution will pick the first compatible typing it finds even if it
323
+ # involves inserting a cast that would be considered undesirable (in this
324
+ # specific case, float16s could be cast to float32s for comparisons).
325
+ #
326
+ # To work around this, we instead use an AbstractTemplate that implements
327
+ # exactly the casting logic that we desire. The AbstractTemplate gets
328
+ # considered in preference to ConcreteTemplates during typing.
329
+ #
330
+ # This is tracked as Issue #7863 (https://github.com/numba/numba/issues/7863) -
331
+ # once this is resolved it should be possible to replace this AbstractTemplate
332
+ # with a ConcreteTemplate to simplify the logic.
333
+
334
+
335
def _fp16_binary_operator(l_key, retty):
    """Register a global binary-operator typing involving ``float16``.

    The generated template applies to two-argument calls of *l_key* where at
    least one operand is ``float16``. It asks the typing context how the
    other operand relates to ``float16`` and, if the answer is an exact,
    promoting or safe conversion, types the call as
    ``(float16, float16) -> retty``. Every other call is left untyped by
    this template.

    Returns whatever the ``register_global(l_key)`` decorator returns.
    """
    fp16 = types.float16

    @register_global(l_key)
    class Cuda_fp16_operator(AbstractTemplate):
        key = l_key

        def generic(self, args, kws):
            assert not kws

            if len(args) != 2:
                return None

            lhs, rhs = args

            # Whichever operand is float16 is passed second to can_convert;
            # the left operand takes precedence when both are float16.
            if lhs == fp16:
                convertible = self.context.can_convert(rhs, lhs)
            elif rhs == fp16:
                convertible = self.context.can_convert(lhs, rhs)
            else:
                return None

            # Three conversion ranks are accepted:
            #
            # 1. Conversion.exact   - both operands are fp16
            # 2. Conversion.promote - promotion between fp16 and the other
            #                         type
            # 3. Conversion.safe    - safe conversion (e.g. with int8)
            accepted = (Conversion.exact, Conversion.promote, Conversion.safe)
            if convertible in accepted:
                return signature(retty, fp16, fp16)

            return None

    return Cuda_fp16_operator
364
+
365
+
366
def _genfp16_comparison_operator(op):
    """Register float16 typing for comparison operator *op* (result ``b1``)."""
    return _fp16_binary_operator(op, retty=types.b1)
368
+
369
+
370
def _genfp16_binary_operator(op):
    """Register float16 typing for arithmetic operator *op* (result ``float16``)."""
    return _fp16_binary_operator(op, retty=types.float16)
372
+
373
+
374
# Instantiate the float16 typing templates. Each ``cuda.fp16`` intrinsic is
# followed by the Python operator(s) given matching float16 typing. The
# statement order is kept as-is because every call registers a template.

# Addition
Cuda_hadd = _genfp16_binary(cuda.fp16.hadd)
Cuda_add = _genfp16_binary_operator(operator.add)
Cuda_iadd = _genfp16_binary_operator(operator.iadd)

# Subtraction
Cuda_hsub = _genfp16_binary(cuda.fp16.hsub)
Cuda_sub = _genfp16_binary_operator(operator.sub)
Cuda_isub = _genfp16_binary_operator(operator.isub)

# Multiplication
Cuda_hmul = _genfp16_binary(cuda.fp16.hmul)
Cuda_mul = _genfp16_binary_operator(operator.mul)
Cuda_imul = _genfp16_binary_operator(operator.imul)

# Maximum / minimum
Cuda_hmax = _genfp16_binary(cuda.fp16.hmax)
Cuda_hmin = _genfp16_binary(cuda.fp16.hmin)

# Negation and absolute value
Cuda_hneg = _genfp16_unary(cuda.fp16.hneg)
Cuda_neg = _genfp16_unary_operator(operator.neg)
Cuda_habs = _genfp16_unary(cuda.fp16.habs)
Cuda_abs = _genfp16_unary_operator(abs)

# Comparisons: the operator templates are registered for their side effect
# only, so their return values are not bound to names.
Cuda_heq = _genfp16_binary_comparison(cuda.fp16.heq)
_genfp16_comparison_operator(operator.eq)
Cuda_hne = _genfp16_binary_comparison(cuda.fp16.hne)
_genfp16_comparison_operator(operator.ne)
Cuda_hge = _genfp16_binary_comparison(cuda.fp16.hge)
_genfp16_comparison_operator(operator.ge)
Cuda_hgt = _genfp16_binary_comparison(cuda.fp16.hgt)
_genfp16_comparison_operator(operator.gt)
Cuda_hle = _genfp16_binary_comparison(cuda.fp16.hle)
_genfp16_comparison_operator(operator.le)
Cuda_hlt = _genfp16_binary_comparison(cuda.fp16.hlt)
_genfp16_comparison_operator(operator.lt)

# True division (operator forms only)
_genfp16_binary_operator(operator.truediv)
_genfp16_binary_operator(operator.itruediv)
403
+
404
+
405
def _resolve_wrapped_unary(fname):
    """Declare the ``float16 -> float16`` device function wrapping *fname*.

    The declared function is named ``__numba_wrapper_<fname>``; the result is
    the ``types.Function`` built from its template.
    """
    wrapper_name = f'__numba_wrapper_{fname}'
    arg_types = (types.float16,)
    template = declare_device_function_template(wrapper_name,
                                                types.float16,
                                                arg_types)
    return types.Function(template)
410
+
411
+
412
def _resolve_wrapped_binary(fname):
    """Declare the ``(float16, float16) -> float16`` device function for *fname*.

    The declared function is named ``__numba_wrapper_<fname>``; the result is
    the ``types.Function`` built from its template.
    """
    wrapper_name = f'__numba_wrapper_{fname}'
    arg_types = (types.float16, types.float16,)
    template = declare_device_function_template(wrapper_name,
                                                types.float16,
                                                arg_types)
    return types.Function(template)
417
+
418
+
419
# Function types for the wrapped float16 device functions. Each name refers
# to the declaration of ``__numba_wrapper_<name>``.

# Unary: float16 -> float16
hsin_device = _resolve_wrapped_unary('hsin')
hcos_device = _resolve_wrapped_unary('hcos')
hlog_device = _resolve_wrapped_unary('hlog')
hlog10_device = _resolve_wrapped_unary('hlog10')
hlog2_device = _resolve_wrapped_unary('hlog2')
hexp_device = _resolve_wrapped_unary('hexp')
hexp10_device = _resolve_wrapped_unary('hexp10')
hexp2_device = _resolve_wrapped_unary('hexp2')
hsqrt_device = _resolve_wrapped_unary('hsqrt')
hrsqrt_device = _resolve_wrapped_unary('hrsqrt')
hfloor_device = _resolve_wrapped_unary('hfloor')
hceil_device = _resolve_wrapped_unary('hceil')
hrcp_device = _resolve_wrapped_unary('hrcp')
hrint_device = _resolve_wrapped_unary('hrint')
htrunc_device = _resolve_wrapped_unary('htrunc')

# Binary: (float16, float16) -> float16
hdiv_device = _resolve_wrapped_binary('hdiv')
435
+
436
+
437
# generate atomic operations
def _gen(l_key, supported_types):
    """Create and register the typing template for one atomic operation.

    The generated template types calls of the form ``(ary, idx, val)``:

    * the array's dtype must be in *supported_types*, otherwise the call is
      left untyped;
    * a 1-D array is indexed with ``types.intp``; an array with more than
      one dimension keeps the index type it was given;
    * the value and the result both take the array's dtype.

    Returns the registered template class.
    """

    @register
    class Cuda_atomic(AbstractTemplate):
        key = l_key

        def generic(self, args, kws):
            assert not kws
            ary, idx, _val = args
            elem_ty = ary.dtype

            if elem_ty not in supported_types:
                return None

            if ary.ndim == 1:
                index_ty = types.intp
            elif ary.ndim > 1:
                index_ty = idx
            else:
                return None

            return signature(elem_ty, ary, index_ty, elem_ty)

    return Cuda_atomic
455
+
456
+
457
# Element types accepted by the atomic operations, from the narrowest set to
# the widest. ``all_numba_types`` is the two float types followed by the
# integer types.
unsigned_int_numba_types = (types.uint32, types.uint64)

integer_numba_types = (types.int32, types.uint32,
                       types.int64, types.uint64)

all_numba_types = (types.float64, types.float32) + integer_numba_types

# One typing template per atomic operation; every call registers a template.
Cuda_atomic_add = _gen(cuda.atomic.add, all_numba_types)
Cuda_atomic_sub = _gen(cuda.atomic.sub, all_numba_types)
Cuda_atomic_max = _gen(cuda.atomic.max, all_numba_types)
Cuda_atomic_min = _gen(cuda.atomic.min, all_numba_types)
Cuda_atomic_nanmax = _gen(cuda.atomic.nanmax, all_numba_types)
Cuda_atomic_nanmin = _gen(cuda.atomic.nanmin, all_numba_types)
Cuda_atomic_and = _gen(cuda.atomic.and_, integer_numba_types)
Cuda_atomic_or = _gen(cuda.atomic.or_, integer_numba_types)
Cuda_atomic_xor = _gen(cuda.atomic.xor, integer_numba_types)
Cuda_atomic_inc = _gen(cuda.atomic.inc, unsigned_int_numba_types)
Cuda_atomic_dec = _gen(cuda.atomic.dec, unsigned_int_numba_types)
Cuda_atomic_exch = _gen(cuda.atomic.exch, integer_numba_types)
478
+
479
+
480
@register
class Cuda_atomic_compare_and_swap(AbstractTemplate):
    """Typing for ``cuda.atomic.compare_and_swap(ary, old, val)``.

    Only 1-D arrays whose dtype is in ``integer_numba_types`` are typed;
    ``old``, ``val`` and the result all take the array's dtype.
    """

    key = cuda.atomic.compare_and_swap

    def generic(self, args, kws):
        assert not kws
        ary, _old, _val = args
        dty = ary.dtype

        if dty not in integer_numba_types or ary.ndim != 1:
            return None

        return signature(dty, ary, dty, dty)
491
+
492
+
493
@register
class Cuda_atomic_cas(AbstractTemplate):
    """Typing for ``cuda.atomic.cas(ary, idx, old, val)``.

    The array's dtype must be in ``integer_numba_types``. A 1-D array is
    indexed with ``types.intp``; an array with more than one dimension keeps
    the index type it was given. ``old``, ``val`` and the result all take
    the array's dtype.
    """

    key = cuda.atomic.cas

    def generic(self, args, kws):
        assert not kws
        ary, idx, _old, _val = args
        dty = ary.dtype

        if dty not in integer_numba_types:
            return None

        if ary.ndim == 1:
            index_ty = types.intp
        elif ary.ndim > 1:
            index_ty = idx
        else:
            return None

        return signature(dty, ary, index_ty, dty, dty)
509
+
510
+
511
@register
class Cuda_nanosleep(ConcreteTemplate):
    """Typing for ``cuda.nanosleep``: one ``uint32`` argument, returns ``void``."""

    key = cuda.nanosleep
    cases = [signature(types.void, types.uint32)]
516
+
517
+
518
@register_attr
class Dim3_attrs(AttributeTemplate):
    """Attribute typing for ``dim3`` values: ``x``, ``y`` and ``z`` are ``int32``."""

    key = dim3

    def resolve_x(self, mod):
        """Type of the ``x`` attribute."""
        return types.int32

    def resolve_y(self, mod):
        """Type of the ``y`` attribute."""
        return types.int32

    def resolve_z(self, mod):
        """Type of the ``z`` attribute."""
        return types.int32
530
+
531
+
532
@register_attr
class CudaSharedModuleTemplate(AttributeTemplate):
    """Attribute typing for the ``cuda.shared`` module object."""

    key = types.Module(cuda.shared)

    def resolve_array(self, mod):
        """Type ``cuda.shared.array`` with the ``Cuda_shared_array`` template."""
        return types.Function(Cuda_shared_array)
538
+
539
+
540
@register_attr
class CudaConstModuleTemplate(AttributeTemplate):
    """Attribute typing for the ``cuda.const`` module object."""

    key = types.Module(cuda.const)

    def resolve_array_like(self, mod):
        """Type ``cuda.const.array_like`` with ``Cuda_const_array_like``."""
        return types.Function(Cuda_const_array_like)
546
+
547
+
548
@register_attr
class CudaLocalModuleTemplate(AttributeTemplate):
    """Attribute typing for the ``cuda.local`` module."""

    key = types.Module(cuda.local)

    def resolve_array(self, mod):
        """Type ``cuda.local.array`` as a function using its template."""
        return types.Function(Cuda_local_array)
554
+
555
+
556
@register_attr
class CudaAtomicTemplate(AttributeTemplate):
    """Attribute typing for the ``cuda.atomic`` module.

    Every ``resolve_<name>`` method maps the attribute ``cuda.atomic.<name>``
    to a ``types.Function`` wrapping the corresponding typing template.
    """

    key = types.Module(cuda.atomic)

    # -- arithmetic ---------------------------------------------------------

    def resolve_add(self, mod):
        return types.Function(Cuda_atomic_add)

    def resolve_sub(self, mod):
        return types.Function(Cuda_atomic_sub)

    # -- bitwise ------------------------------------------------------------

    def resolve_and_(self, mod):
        return types.Function(Cuda_atomic_and)

    def resolve_or_(self, mod):
        return types.Function(Cuda_atomic_or)

    def resolve_xor(self, mod):
        return types.Function(Cuda_atomic_xor)

    # -- increment / decrement / exchange -----------------------------------

    def resolve_inc(self, mod):
        return types.Function(Cuda_atomic_inc)

    def resolve_dec(self, mod):
        return types.Function(Cuda_atomic_dec)

    def resolve_exch(self, mod):
        return types.Function(Cuda_atomic_exch)

    # -- min / max (including the nan-prefixed variants) --------------------

    def resolve_max(self, mod):
        return types.Function(Cuda_atomic_max)

    def resolve_min(self, mod):
        return types.Function(Cuda_atomic_min)

    def resolve_nanmin(self, mod):
        return types.Function(Cuda_atomic_nanmin)

    def resolve_nanmax(self, mod):
        return types.Function(Cuda_atomic_nanmax)

    # -- compare-and-swap ---------------------------------------------------

    def resolve_compare_and_swap(self, mod):
        return types.Function(Cuda_atomic_compare_and_swap)

    def resolve_cas(self, mod):
        return types.Function(Cuda_atomic_cas)
601
+
602
+
603
@register_attr
class CudaFp16Template(AttributeTemplate):
    """Attribute typing for the ``cuda.fp16`` module.

    Two kinds of result are returned by the resolvers below: some attributes
    are typed as ``types.Function`` wrapping a ``Cuda_h*`` template, while
    others return a module-level ``h*_device`` object directly.
    """

    key = types.Module(cuda.fp16)

    # -- basic arithmetic ---------------------------------------------------

    def resolve_hadd(self, mod):
        return types.Function(Cuda_hadd)

    def resolve_hsub(self, mod):
        return types.Function(Cuda_hsub)

    def resolve_hmul(self, mod):
        return types.Function(Cuda_hmul)

    def resolve_hdiv(self, mod):
        return hdiv_device

    def resolve_hneg(self, mod):
        return types.Function(Cuda_hneg)

    def resolve_habs(self, mod):
        return types.Function(Cuda_habs)

    def resolve_hfma(self, mod):
        return types.Function(Cuda_hfma)

    # -- trigonometric ------------------------------------------------------

    def resolve_hsin(self, mod):
        return hsin_device

    def resolve_hcos(self, mod):
        return hcos_device

    # -- logarithms and exponentials ----------------------------------------

    def resolve_hlog(self, mod):
        return hlog_device

    def resolve_hlog10(self, mod):
        return hlog10_device

    def resolve_hlog2(self, mod):
        return hlog2_device

    def resolve_hexp(self, mod):
        return hexp_device

    def resolve_hexp10(self, mod):
        return hexp10_device

    def resolve_hexp2(self, mod):
        return hexp2_device

    # -- rounding, roots and reciprocals ------------------------------------

    def resolve_hfloor(self, mod):
        return hfloor_device

    def resolve_hceil(self, mod):
        return hceil_device

    def resolve_hsqrt(self, mod):
        return hsqrt_device

    def resolve_hrsqrt(self, mod):
        return hrsqrt_device

    def resolve_hrcp(self, mod):
        return hrcp_device

    def resolve_hrint(self, mod):
        return hrint_device

    def resolve_htrunc(self, mod):
        return htrunc_device

    # -- comparisons --------------------------------------------------------

    def resolve_heq(self, mod):
        return types.Function(Cuda_heq)

    def resolve_hne(self, mod):
        return types.Function(Cuda_hne)

    def resolve_hge(self, mod):
        return types.Function(Cuda_hge)

    def resolve_hgt(self, mod):
        return types.Function(Cuda_hgt)

    def resolve_hle(self, mod):
        return types.Function(Cuda_hle)

    def resolve_hlt(self, mod):
        return types.Function(Cuda_hlt)

    # -- min / max ----------------------------------------------------------

    def resolve_hmax(self, mod):
        return types.Function(Cuda_hmax)

    def resolve_hmin(self, mod):
        return types.Function(Cuda_hmin)
696
+
697
+
698
@register_attr
class CudaModuleTemplate(AttributeTemplate):
    """Attribute typing for the top-level ``cuda`` module.

    Attributes resolve to one of: a ``types.Module`` for a submodule, the
    ``dim3`` type for the thread/block/grid index objects, ``int32`` for
    ``laneid``, or a ``types.Function`` wrapping a typing template.
    """

    key = types.Module(cuda)

    # -- submodule typed as a module ----------------------------------------

    def resolve_cg(self, mod):
        return types.Module(cuda.cg)

    # -- thread / block / grid indexing -------------------------------------

    def resolve_threadIdx(self, mod):
        return dim3

    def resolve_blockIdx(self, mod):
        return dim3

    def resolve_blockDim(self, mod):
        return dim3

    def resolve_gridDim(self, mod):
        return dim3

    def resolve_laneid(self, mod):
        return types.int32

    def resolve_shared(self, mod):
        return types.Module(cuda.shared)

    # -- integer and floating-point intrinsics ------------------------------

    def resolve_popc(self, mod):
        return types.Function(Cuda_popc)

    def resolve_brev(self, mod):
        return types.Function(Cuda_brev)

    def resolve_clz(self, mod):
        return types.Function(Cuda_clz)

    def resolve_ffs(self, mod):
        return types.Function(Cuda_ffs)

    def resolve_fma(self, mod):
        return types.Function(Cuda_fma)

    def resolve_cbrt(self, mod):
        return types.Function(Cuda_cbrt)

    # -- fences and warp-level operations -----------------------------------

    def resolve_threadfence(self, mod):
        # The template for the plain ``threadfence`` carries a ``_device``
        # suffix, unlike its ``_block`` / ``_system`` siblings.
        return types.Function(Cuda_threadfence_device)

    def resolve_threadfence_block(self, mod):
        return types.Function(Cuda_threadfence_block)

    def resolve_threadfence_system(self, mod):
        return types.Function(Cuda_threadfence_system)

    def resolve_syncwarp(self, mod):
        return types.Function(Cuda_syncwarp)

    def resolve_shfl_sync_intrinsic(self, mod):
        return types.Function(Cuda_shfl_sync_intrinsic)

    def resolve_vote_sync_intrinsic(self, mod):
        return types.Function(Cuda_vote_sync_intrinsic)

    def resolve_match_any_sync(self, mod):
        return types.Function(Cuda_match_any_sync)

    def resolve_match_all_sync(self, mod):
        return types.Function(Cuda_match_all_sync)

    def resolve_activemask(self, mod):
        return types.Function(Cuda_activemask)

    def resolve_lanemask_lt(self, mod):
        return types.Function(Cuda_lanemask_lt)

    # -- miscellaneous functions --------------------------------------------

    def resolve_selp(self, mod):
        return types.Function(Cuda_selp)

    def resolve_nanosleep(self, mod):
        return types.Function(Cuda_nanosleep)

    # -- remaining submodules -----------------------------------------------

    def resolve_atomic(self, mod):
        return types.Module(cuda.atomic)

    def resolve_fp16(self, mod):
        return types.Module(cuda.fp16)

    def resolve_const(self, mod):
        return types.Module(cuda.const)

    def resolve_local(self, mod):
        return types.Module(cuda.local)
788
+
789
+
790
# Register the ``cuda`` module object together with its module type.
register_global(cuda, types.Module(cuda))


# NumPy

# These ufunc families are registered in full, in the order listed.
for _ufunc_names in (
    trigonometric_functions,
    comparison_functions,
    bit_twiddling_functions,
):
    for func in _ufunc_names:
        register_numpy_ufunc(func, register_global)

# Of the math operations, only the logarithms are registered here.
for func in math_operations:
    if func not in ('log', 'log2', 'log10'):
        continue
    register_numpy_ufunc(func, register_global)