numba-cuda 0.0.0__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. _numba_cuda_redirector.pth +1 -0
  2. _numba_cuda_redirector.py +74 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +5 -0
  5. numba_cuda/_version.py +19 -0
  6. numba_cuda/numba/cuda/__init__.py +22 -0
  7. numba_cuda/numba/cuda/api.py +526 -0
  8. numba_cuda/numba/cuda/api_util.py +30 -0
  9. numba_cuda/numba/cuda/args.py +77 -0
  10. numba_cuda/numba/cuda/cg.py +62 -0
  11. numba_cuda/numba/cuda/codegen.py +378 -0
  12. numba_cuda/numba/cuda/compiler.py +422 -0
  13. numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
  14. numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
  15. numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
  16. numba_cuda/numba/cuda/cuda_paths.py +258 -0
  17. numba_cuda/numba/cuda/cudadecl.py +806 -0
  18. numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
  19. numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
  20. numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
  21. numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
  22. numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
  23. numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
  24. numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
  25. numba_cuda/numba/cuda/cudadrv/error.py +36 -0
  26. numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
  27. numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
  28. numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
  29. numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
  30. numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
  31. numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
  32. numba_cuda/numba/cuda/cudaimpl.py +1055 -0
  33. numba_cuda/numba/cuda/cudamath.py +140 -0
  34. numba_cuda/numba/cuda/decorators.py +189 -0
  35. numba_cuda/numba/cuda/descriptor.py +33 -0
  36. numba_cuda/numba/cuda/device_init.py +89 -0
  37. numba_cuda/numba/cuda/deviceufunc.py +908 -0
  38. numba_cuda/numba/cuda/dispatcher.py +1057 -0
  39. numba_cuda/numba/cuda/errors.py +59 -0
  40. numba_cuda/numba/cuda/extending.py +7 -0
  41. numba_cuda/numba/cuda/initialize.py +13 -0
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
  43. numba_cuda/numba/cuda/intrinsics.py +198 -0
  44. numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  45. numba_cuda/numba/cuda/kernels/reduction.py +262 -0
  46. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  47. numba_cuda/numba/cuda/libdevice.py +3382 -0
  48. numba_cuda/numba/cuda/libdevicedecl.py +17 -0
  49. numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
  50. numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
  51. numba_cuda/numba/cuda/mathimpl.py +448 -0
  52. numba_cuda/numba/cuda/models.py +48 -0
  53. numba_cuda/numba/cuda/nvvmutils.py +235 -0
  54. numba_cuda/numba/cuda/printimpl.py +86 -0
  55. numba_cuda/numba/cuda/random.py +292 -0
  56. numba_cuda/numba/cuda/simulator/__init__.py +38 -0
  57. numba_cuda/numba/cuda/simulator/api.py +110 -0
  58. numba_cuda/numba/cuda/simulator/compiler.py +9 -0
  59. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
  60. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
  61. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
  62. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
  63. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
  64. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
  65. numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
  66. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
  67. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
  68. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
  69. numba_cuda/numba/cuda/simulator/kernel.py +308 -0
  70. numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
  71. numba_cuda/numba/cuda/simulator/reduction.py +15 -0
  72. numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
  73. numba_cuda/numba/cuda/simulator_init.py +17 -0
  74. numba_cuda/numba/cuda/stubs.py +902 -0
  75. numba_cuda/numba/cuda/target.py +440 -0
  76. numba_cuda/numba/cuda/testing.py +202 -0
  77. numba_cuda/numba/cuda/tests/__init__.py +58 -0
  78. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
  79. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
  80. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
  81. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
  88. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
  89. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
  90. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
  91. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
  92. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
  93. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
  94. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
  95. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
  98. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
  100. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
  101. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
  102. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
  103. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
  104. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
  105. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
  106. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
  107. numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
  108. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
  109. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
  110. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
  111. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
  112. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
  113. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
  115. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
  117. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
  118. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
  119. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
  120. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
  121. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
  123. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
  124. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
  126. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
  127. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
  129. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
  131. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
  132. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
  133. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
  134. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
  135. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
  136. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
  137. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
  138. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
  139. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
  140. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
  141. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
  142. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
  143. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
  144. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
  145. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
  148. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
  149. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
  150. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
  151. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
  152. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
  153. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
  154. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
  155. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
  156. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
  157. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
  158. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
  159. numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
  161. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
  162. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
  163. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
  164. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
  165. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
  166. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
  167. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
  168. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
  169. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
  170. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
  171. numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
  172. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
  173. numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
  174. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
  175. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
  176. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
  177. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  178. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
  179. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
  180. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
  182. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
  183. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
  184. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
  185. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
  186. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
  187. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
  188. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
  192. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
  193. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
  194. numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
  195. numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
  197. numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  198. numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
  199. numba_cuda/numba/cuda/tests/data/error.cu +7 -0
  200. numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
  201. numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
  202. numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
  203. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
  204. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
  206. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
  207. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
  208. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
  209. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
  210. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
  211. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
  212. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
  213. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
  214. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
  216. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
  217. numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
  218. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
  219. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
  220. numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
  221. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
  222. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
  223. numba_cuda/numba/cuda/types.py +37 -0
  224. numba_cuda/numba/cuda/ufuncs.py +662 -0
  225. numba_cuda/numba/cuda/vector_types.py +209 -0
  226. numba_cuda/numba/cuda/vectorizers.py +252 -0
  227. numba_cuda-0.0.12.dist-info/LICENSE +25 -0
  228. numba_cuda-0.0.12.dist-info/METADATA +68 -0
  229. numba_cuda-0.0.12.dist-info/RECORD +231 -0
  230. {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
  231. numba_cuda-0.0.0.dist-info/METADATA +0 -6
  232. numba_cuda-0.0.0.dist-info/RECORD +0 -5
  233. {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,495 @@
1
+ '''
2
+ Implements the cuda module as called from within an executing kernel
3
+ (@cuda.jit-decorated function).
4
+ '''
5
+
6
+ from contextlib import contextmanager
7
+ import sys
8
+ import threading
9
+ import traceback
10
+ from numba.core import types
11
+ import numpy as np
12
+
13
+ from numba.np import numpy_support
14
+
15
+ from .vector_types import vector_types
16
+
17
+
18
class Dim3:
    '''
    An (x, y, z) triple used to implement thread/block indices and
    dimensions in the simulator.
    '''
    def __init__(self, x, y, z):
        self.x, self.y, self.z = x, y, z

    def __iter__(self):
        # Components are produced in x, y, z order, so an instance can be
        # unpacked or converted with tuple() / list().
        yield from (self.x, self.y, self.z)

    def __str__(self):
        return '(%s, %s, %s)' % tuple(self)

    def __repr__(self):
        return 'Dim3(%s, %s, %s)' % tuple(self)
37
+
38
+
39
class GridGroup:
    '''
    Simulator implementation of the cooperative groups grid group.
    '''

    def sync(self):
        '''
        Synchronize the grid group by calling ``syncthreads()`` on the
        current thread object.
        '''
        # Only cooperative grids with a single block are supported, so
        # synchronizing the grid is the same as synchronizing the block.
        current = threading.current_thread()
        current.syncthreads()
49
+
50
+
51
class FakeCUDACg:
    '''
    Simulator implementation of the CUDA Cooperative Groups namespace.
    '''
    def this_grid(self):
        '''
        Return a newly created GridGroup.
        '''
        grid_group = GridGroup()
        return grid_group
57
+
58
+
59
class FakeCUDALocal:
    '''
    Simulator implementation of CUDA local arrays.
    '''
    def array(self, shape, dtype):
        '''
        Return a new uninitialized NumPy array of the given shape.

        A Numba type passed as *dtype* is converted to the corresponding
        NumPy dtype first; anything else is handed to NumPy unchanged.
        '''
        if isinstance(dtype, types.Type):
            np_dtype = numpy_support.as_dtype(dtype)
        else:
            np_dtype = dtype
        return np.empty(shape, np_dtype)
67
+
68
+
69
class FakeCUDAConst:
    '''
    Simulator implementation of CUDA constant arrays.
    '''
    def array_like(self, ary):
        '''
        Return *ary* itself; no copy is made.
        '''
        return ary
75
+
76
+
77
class FakeCUDAShared:
    '''
    Simulator implementation of CUDA shared arrays.

    Limitations: assumes that only one call to cuda.shared.array is on a line,
    and that that line is only executed once per thread. i.e.::

        a = cuda.shared.array(...); b = cuda.shared.array(...)

    will erroneously alias a and b, and::

        for i in range(10):
            sharedarrs[i] = cuda.shared.array(...)

    will alias all arrays created at that point (though it is not certain that
    this would be supported by Numba anyway).
    '''

    def __init__(self, dynshared_size):
        # Byte buffer backing every dynamic shared memory request.
        self._dynshared_size = dynshared_size
        self._dynshared = np.zeros(dynshared_size, dtype=np.byte)
        # Static allocations, keyed by the (filename, lineno) of the call.
        self._allocations = {}

    def array(self, shape, dtype):
        '''
        Return the shared array for the calling source location.

        A *shape* of 0 requests dynamic shared memory: the result is a view
        of the single dynamic buffer reinterpreted as *dtype*. Any other
        shape yields an array that is created on the first call from a given
        file and line and returned again on later calls from that location.
        '''
        if isinstance(dtype, types.Type):
            dtype = numpy_support.as_dtype(dtype)

        if shape == 0:
            # Use the largest whole number of elements that fits in the
            # buffer (Numpy complains if the buffer is not a multiple of the
            # element size).
            n_items = self._dynshared_size // dtype.itemsize
            return np.frombuffer(
                self._dynshared.data, dtype=dtype, count=n_items
            )

        # Static allocations are identified by source file and line number.
        # The reference frame is passed explicitly to work around
        # http://bugs.python.org/issue25108
        frames = traceback.extract_stack(sys._getframe())
        # frames[-1] is this method; frames[-2] is the code that called it.
        key = frames[-2][0:2]
        if key not in self._allocations:
            self._allocations[key] = np.empty(shape, dtype)
        return self._allocations[key]
122
+
123
+
124
# One module-level lock per simulated atomic operation. Each lock is taken by
# the FakeCUDAAtomic method(s) that name it.
(
    addlock,
    sublock,
    andlock,
    orlock,
    xorlock,
    maxlock,
    minlock,
    compare_and_swaplock,
    caslock,
    inclock,
    declock,
    exchlock,
) = (threading.Lock() for _ in range(12))
136
+
137
+
138
class FakeCUDAAtomic:
    '''
    Simulator implementation of the cuda.atomic operations.

    Every method performs a read-modify-write of ``array[index]`` while
    holding a module-level lock and returns the value read before the
    update. Each operation has its own lock, except that nanmax shares the
    lock used by max and nanmin shares the lock used by min.
    '''

    def add(self, array, index, val):
        '''Add *val* to ``array[index]``; return the previous value.'''
        with addlock:
            previous = array[index]
            array[index] += val
        return previous

    def sub(self, array, index, val):
        '''Subtract *val* from ``array[index]``; return the previous value.'''
        with sublock:
            previous = array[index]
            array[index] -= val
        return previous

    def and_(self, array, index, val):
        '''Bitwise-and *val* into ``array[index]``; return the previous
        value.'''
        with andlock:
            previous = array[index]
            array[index] &= val
        return previous

    def or_(self, array, index, val):
        '''Bitwise-or *val* into ``array[index]``; return the previous
        value.'''
        with orlock:
            previous = array[index]
            array[index] |= val
        return previous

    def xor(self, array, index, val):
        '''Bitwise-xor *val* into ``array[index]``; return the previous
        value.'''
        with xorlock:
            previous = array[index]
            array[index] ^= val
        return previous

    def inc(self, array, index, val):
        '''
        Increment ``array[index]`` by one, resetting it to 0 once the stored
        value has reached *val*; return the previous value.
        '''
        with inclock:
            previous = array[index]
            if previous >= val:
                array[index] = 0
            else:
                array[index] += 1
        return previous

    def dec(self, array, index, val):
        '''
        Decrement ``array[index]`` by one, resetting it to *val* when the
        stored value is 0 or greater than *val*; return the previous value.
        '''
        with declock:
            previous = array[index]
            if (previous == 0) or (previous > val):
                array[index] = val
            else:
                array[index] -= 1
        return previous

    def exch(self, array, index, val):
        '''Store *val* in ``array[index]``; return the previous value.'''
        with exchlock:
            previous = array[index]
            array[index] = val
        return previous

    def max(self, array, index, val):
        '''Store the larger of ``array[index]`` and *val*; return the
        previous value.'''
        with maxlock:
            previous = array[index]
            array[index] = max(previous, val)
        return previous

    def min(self, array, index, val):
        '''Store the smaller of ``array[index]`` and *val*; return the
        previous value.'''
        with minlock:
            previous = array[index]
            array[index] = min(previous, val)
        return previous

    def nanmax(self, array, index, val):
        '''Store ``np.nanmax`` of ``array[index]`` and *val*; return the
        previous value.'''
        with maxlock:
            previous = array[index]
            array[index] = np.nanmax([array[index], val])
        return previous

    def nanmin(self, array, index, val):
        '''Store ``np.nanmin`` of ``array[index]`` and *val*; return the
        previous value.'''
        with minlock:
            previous = array[index]
            array[index] = np.nanmin([array[index], val])
        return previous

    def compare_and_swap(self, array, old, val):
        '''
        Compare-and-swap on the first element of *array*: store *val* there
        if the element equals *old*. Return the value that was loaded.
        '''
        with compare_and_swaplock:
            # An all-zero index tuple addresses the first element whatever
            # the number of dimensions.
            first = (0,) * array.ndim
            current = array[first]
            if current == old:
                array[first] = val
            return current

    def cas(self, array, index, old, val):
        '''
        Compare-and-swap on ``array[index]``: store *val* there if the
        element equals *old*. Return the value that was loaded.
        '''
        with caslock:
            current = array[index]
            if current == old:
                array[index] = val
            return current
231
+
232
+
233
class FakeCUDAFp16:
    '''
    Simulator implementation of the cuda.fp16 operations.

    Arithmetic and comparison operations use the Python operators on the
    operands as given. The unary math functions call the matching NumPy
    function with ``dtype=np.float16``, or convert the result to
    ``np.float16`` where no such function is used.
    '''

    # Arithmetic

    def hadd(self, a, b):
        return a + b

    def hsub(self, a, b):
        return a - b

    def hmul(self, a, b):
        return a * b

    def hdiv(self, a, b):
        return a / b

    def hfma(self, a, b, c):
        return a * b + c

    def hneg(self, a):
        return -a

    def habs(self, a):
        return abs(a)

    # Math functions

    def hsin(self, x):
        return np.sin(x, dtype=np.float16)

    def hcos(self, x):
        return np.cos(x, dtype=np.float16)

    def hlog(self, x):
        return np.log(x, dtype=np.float16)

    def hlog2(self, x):
        return np.log2(x, dtype=np.float16)

    def hlog10(self, x):
        return np.log10(x, dtype=np.float16)

    def hexp(self, x):
        return np.exp(x, dtype=np.float16)

    def hexp2(self, x):
        return np.exp2(x, dtype=np.float16)

    def hexp10(self, x):
        return np.float16(10 ** x)

    def hsqrt(self, x):
        return np.sqrt(x, dtype=np.float16)

    def hrsqrt(self, x):
        return np.float16(x ** -0.5)

    def hrcp(self, x):
        return np.reciprocal(x, dtype=np.float16)

    # Rounding

    def hceil(self, x):
        return np.ceil(x, dtype=np.float16)

    def hfloor(self, x):
        # Round towards negative infinity. This previously called np.ceil,
        # which rounded in the wrong direction.
        return np.floor(x, dtype=np.float16)

    def htrunc(self, x):
        return np.trunc(x, dtype=np.float16)

    def hrint(self, x):
        return np.rint(x, dtype=np.float16)

    # Comparisons

    def heq(self, a, b):
        return a == b

    def hne(self, a, b):
        return a != b

    def hge(self, a, b):
        return a >= b

    def hgt(self, a, b):
        return a > b

    def hle(self, a, b):
        return a <= b

    def hlt(self, a, b):
        return a < b

    def hmax(self, a, b):
        return max(a, b)

    def hmin(self, a, b):
        return min(a, b)
323
+
324
+
325
class FakeCUDAModule:
    '''
    An instance of this class will be injected into the __globals__ for an
    executing function in order to implement calls to cuda.*. This will fail to
    work correctly if the user code does::

        from numba import cuda as something_else

    In other words, the CUDA module must be called cuda.
    '''

    def __init__(self, grid_dim, block_dim, dynshared_size):
        '''
        Build the fake module for one kernel launch.

        *grid_dim* and *block_dim* are unpacked into Dim3 objects, and
        *dynshared_size* is passed on to the FakeCUDAShared instance.
        '''
        self.gridDim = Dim3(*grid_dim)
        self.blockDim = Dim3(*block_dim)
        self._cg = FakeCUDACg()
        self._local = FakeCUDALocal()
        self._shared = FakeCUDAShared(dynshared_size)
        self._const = FakeCUDAConst()
        self._atomic = FakeCUDAAtomic()
        self._fp16 = FakeCUDAFp16()
        # Insert the vector types into the kernel context
        # Note that we need to do this in addition to exposing them as module
        # variables in `simulator.__init__.py`, because the test cases need
        # to access the actual cuda module as well as the fake cuda module
        # for vector types.
        for name, svty in vector_types.items():
            setattr(self, name, svty)
            for alias in svty.aliases:
                setattr(self, alias, svty)

    # Sub-namespaces (cuda.cg, cuda.local, ...)

    @property
    def cg(self):
        return self._cg

    @property
    def local(self):
        return self._local

    @property
    def shared(self):
        return self._shared

    @property
    def const(self):
        return self._const

    @property
    def atomic(self):
        return self._atomic

    @property
    def fp16(self):
        return self._fp16

    # Per-thread state, read from attributes of the current thread object

    @property
    def threadIdx(self):
        return threading.current_thread().threadIdx

    @property
    def blockIdx(self):
        return threading.current_thread().blockIdx

    @property
    def warpsize(self):
        return 32

    @property
    def laneid(self):
        return threading.current_thread().thread_id % self.warpsize

    # Synchronization

    def syncthreads(self):
        threading.current_thread().syncthreads()

    def threadfence(self):
        # No-op
        pass

    def threadfence_block(self):
        # No-op
        pass

    def threadfence_system(self):
        # No-op
        pass

    def syncthreads_count(self, val):
        return threading.current_thread().syncthreads_count(val)

    def syncthreads_and(self, val):
        return threading.current_thread().syncthreads_and(val)

    def syncthreads_or(self, val):
        return threading.current_thread().syncthreads_or(val)

    # Integer and math intrinsics

    def popc(self, val):
        '''Return the number of "1" characters in ``bin(val)``.'''
        return bin(val).count("1")

    def fma(self, a, b, c):
        return a * b + c

    def cbrt(self, a):
        '''
        Return the real cube root of *a*.
        '''
        # np.cbrt is used rather than ``a ** (1 / 3)`` because the power
        # form gives a complex result (Python floats) or NaN (NumPy floats)
        # for negative inputs, and is inexact for perfect cubes such as 64.
        return np.cbrt(a)

    def brev(self, val):
        '''Reverse the bits of *val*, formatted as a 32-digit binary
        string.'''
        return int('{:032b}'.format(val)[::-1], 2)

    def clz(self, val):
        '''Count the leading zeros of *val*, formatted as a 32-digit binary
        string.'''
        s = '{:032b}'.format(val)
        return len(s) - len(s.lstrip('0'))

    def ffs(self, val):
        '''Return the 1-based position of the least significant set bit of
        *val*, or 0 if no bit is set.'''
        # The algorithm is:
        # 1. Count the number of trailing zeros.
        # 2. Add 1, because the LSB is numbered 1 rather than 0, and so on.
        # 3. If we've counted 32 zeros (resulting in 33), there were no bits
        #    set so we need to return zero.
        s = '{:032b}'.format(val)
        r = (len(s) - len(s.rstrip('0')) + 1) % 33
        return r

    def selp(self, a, b, c):
        '''Return *b* if *a* is truthy, otherwise *c*.'''
        return b if a else c

    # Global indexing

    def grid(self, n):
        '''
        Return the global index of the current thread in *n* dimensions.

        The result is a single value for ``n == 1`` and a tuple for
        ``n == 2`` or ``n == 3``. Raises RuntimeError for any other *n*.
        '''
        bdim = self.blockDim
        bid = self.blockIdx
        tid = self.threadIdx
        x = bid.x * bdim.x + tid.x
        if n == 1:
            return x
        y = bid.y * bdim.y + tid.y
        if n == 2:
            return (x, y)
        z = bid.z * bdim.z + tid.z
        if n == 3:
            return (x, y, z)

        raise RuntimeError("Global ID has 1-3 dimensions. %d requested" % n)

    def gridsize(self, n):
        '''
        Return the total number of threads along each of *n* dimensions.

        The result is a single value for ``n == 1`` and a tuple for
        ``n == 2`` or ``n == 3``. Raises RuntimeError for any other *n*.
        '''
        bdim = self.blockDim
        gdim = self.gridDim
        x = bdim.x * gdim.x
        if n == 1:
            return x
        y = bdim.y * gdim.y
        if n == 2:
            return (x, y)
        z = bdim.z * gdim.z
        if n == 3:
            return (x, y, z)

        raise RuntimeError("Global grid has 1-3 dimensions. %d requested" % n)
478
+
479
+
480
@contextmanager
def swapped_cuda_module(fn, fake_cuda_module):
    '''
    Context manager that substitutes *fake_cuda_module* for the real
    ``numba.cuda`` module in the globals of *fn*.

    Every global of *fn* that is bound to the ``numba.cuda`` module object
    is rebound to *fake_cuda_module* on entry. The original bindings are
    restored on exit, including when the body raises.
    '''
    from numba import cuda

    namespace = fn.__globals__
    # Remember each global currently bound to the real cuda module
    originals = {
        name: value for name, value in namespace.items() if value is cuda
    }
    # Point those same names at the fake module
    namespace.update(dict.fromkeys(originals, fake_cuda_module))
    try:
        yield
    finally:
        # Put the real module back
        namespace.update(originals)
@@ -0,0 +1,15 @@
1
+ from functools import reduce as pyreduce
2
+
3
+
4
def Reduce(func):
    '''
    Wrap the binary function *func* in a reduction callable.

    The returned callable folds *func* over ``seq`` starting from ``init``.
    When ``res`` is given, the result is written to ``res[0]`` and None is
    returned; otherwise the result itself is returned.
    '''
    def reduce_wrapper(seq, res=None, init=0):
        total = pyreduce(func, seq, init)
        if res is None:
            return total
        res[0] = total
        return None
    return reduce_wrapper


# Lower-case alias for Reduce
reduce = Reduce
@@ -0,0 +1,58 @@
1
+ from numba import types
2
+ from numba.cuda.stubs import _vector_type_stubs
3
+
4
+
5
class SimulatedVectorType:
    '''
    Base class for the simulator's vector types.

    Subclasses supply ``name`` and ``num_elements``. Instances store their
    components as attributes named after the leading entries of
    ``attributes``.
    '''
    attributes = ['x', 'y', 'z', 'w']

    def __init__(self, *args):
        # Arguments may be scalars or other vector instances; vector
        # instances contribute each of their components in order.
        components = []
        for item in args:
            if isinstance(item, SimulatedVectorType):
                components.extend(item.as_list())
            else:
                components.append(item)

        count = len(components)
        self._attrs = self.attributes[:count]
        if self.num_elements != count:
            raise TypeError(
                f"{self.name} expects {self.num_elements}"
                f" elements, got {count}"
            )

        for attr, value in zip(self._attrs, components):
            setattr(self, attr, value)

    @property
    def name(self):
        # Supplied by subclasses
        raise NotImplementedError()

    @property
    def num_elements(self):
        # Supplied by subclasses
        raise NotImplementedError()

    def as_list(self):
        '''Return the components as a list, in attribute order.'''
        return [getattr(self, attr) for attr in self._attrs]
35
+
36
+
37
def make_simulated_vector_type(num_elements, name):
    '''
    Create and return a SimulatedVectorType subclass called *name* that has
    *num_elements* components.

    The new class has ``base_type`` set to ``types.float32`` and refers to
    itself through ``user_facing_object``.
    '''
    namespace = {
        "num_elements": num_elements,
        "base_type": types.float32,
        "name": name,
    }
    vector_cls = type(name, (SimulatedVectorType,), namespace)
    vector_cls.user_facing_object = vector_cls
    return vector_cls
45
+
46
+
47
def _initialize():
    '''
    Build a dict mapping each vector type stub's name to a simulated vector
    type that carries the stub's aliases.
    '''
    registry = {}
    for stub in _vector_type_stubs:
        type_name = stub.__name__
        # The last character of the stub name is used as the element count.
        simulated = make_simulated_vector_type(int(type_name[-1]), type_name)
        simulated.aliases = stub.aliases
        registry[type_name] = simulated
    return registry


# Simulated vector types, keyed by stub name
vector_types = _initialize()
@@ -0,0 +1,17 @@
1
+ # We import * from simulator here because * is imported from simulator_init by
2
+ # numba.cuda.__init__.
3
+ from .simulator import * # noqa: F403, F401
4
+
5
+
6
def is_available():
    """
    Report whether CUDA is available.

    Always returns True, because the simulator is always available.
    """
    return True
11
+
12
+
13
def cuda_error():
    """
    Return the CUDA driver initialization error, if any.

    Always returns None, because the simulator never fails to initialize.
    """
    return None