numba-cuda 0.0.1__py3-none-any.whl → 0.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.pth +1 -0
- _numba_cuda_redirector.py +74 -0
- numba_cuda/VERSION +1 -0
- numba_cuda/__init__.py +5 -0
- numba_cuda/_version.py +19 -0
- numba_cuda/numba/cuda/__init__.py +22 -0
- numba_cuda/numba/cuda/api.py +526 -0
- numba_cuda/numba/cuda/api_util.py +30 -0
- numba_cuda/numba/cuda/args.py +77 -0
- numba_cuda/numba/cuda/cg.py +62 -0
- numba_cuda/numba/cuda/codegen.py +378 -0
- numba_cuda/numba/cuda/compiler.py +422 -0
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
- numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
- numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
- numba_cuda/numba/cuda/cuda_paths.py +258 -0
- numba_cuda/numba/cuda/cudadecl.py +806 -0
- numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
- numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
- numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
- numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
- numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
- numba_cuda/numba/cuda/cudadrv/error.py +36 -0
- numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
- numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
- numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
- numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
- numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
- numba_cuda/numba/cuda/cudaimpl.py +1055 -0
- numba_cuda/numba/cuda/cudamath.py +140 -0
- numba_cuda/numba/cuda/decorators.py +189 -0
- numba_cuda/numba/cuda/descriptor.py +33 -0
- numba_cuda/numba/cuda/device_init.py +89 -0
- numba_cuda/numba/cuda/deviceufunc.py +908 -0
- numba_cuda/numba/cuda/dispatcher.py +1057 -0
- numba_cuda/numba/cuda/errors.py +59 -0
- numba_cuda/numba/cuda/extending.py +7 -0
- numba_cuda/numba/cuda/initialize.py +13 -0
- numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
- numba_cuda/numba/cuda/intrinsics.py +198 -0
- numba_cuda/numba/cuda/kernels/__init__.py +0 -0
- numba_cuda/numba/cuda/kernels/reduction.py +262 -0
- numba_cuda/numba/cuda/kernels/transpose.py +65 -0
- numba_cuda/numba/cuda/libdevice.py +3382 -0
- numba_cuda/numba/cuda/libdevicedecl.py +17 -0
- numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
- numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
- numba_cuda/numba/cuda/mathimpl.py +448 -0
- numba_cuda/numba/cuda/models.py +48 -0
- numba_cuda/numba/cuda/nvvmutils.py +235 -0
- numba_cuda/numba/cuda/printimpl.py +86 -0
- numba_cuda/numba/cuda/random.py +292 -0
- numba_cuda/numba/cuda/simulator/__init__.py +38 -0
- numba_cuda/numba/cuda/simulator/api.py +110 -0
- numba_cuda/numba/cuda/simulator/compiler.py +9 -0
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
- numba_cuda/numba/cuda/simulator/kernel.py +308 -0
- numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
- numba_cuda/numba/cuda/simulator/reduction.py +15 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
- numba_cuda/numba/cuda/simulator_init.py +17 -0
- numba_cuda/numba/cuda/stubs.py +902 -0
- numba_cuda/numba/cuda/target.py +440 -0
- numba_cuda/numba/cuda/testing.py +202 -0
- numba_cuda/numba/cuda/tests/__init__.py +58 -0
- numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
- numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
- numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
- numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
- numba_cuda/numba/cuda/tests/data/error.cu +7 -0
- numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
- numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
- numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
- numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
- numba_cuda/numba/cuda/types.py +37 -0
- numba_cuda/numba/cuda/ufuncs.py +662 -0
- numba_cuda/numba/cuda/vector_types.py +209 -0
- numba_cuda/numba/cuda/vectorizers.py +252 -0
- numba_cuda-0.0.13.dist-info/LICENSE +25 -0
- numba_cuda-0.0.13.dist-info/METADATA +69 -0
- numba_cuda-0.0.13.dist-info/RECORD +231 -0
- {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.13.dist-info}/WHEEL +1 -1
- numba_cuda-0.0.1.dist-info/METADATA +0 -10
- numba_cuda-0.0.1.dist-info/RECORD +0 -5
- {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.13.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu

```diff
@@ -0,0 +1,49 @@
+// magictoken.ex_mul_f32_f32.begin
+// Foreign function example: multiplication of a pair of floats
+
+extern "C" __device__ int
+mul_f32_f32(
+  float* return_value,
+  float x,
+  float y)
+{
+  // Compute result and store in caller-provided slot
+  *return_value = x * y;
+
+  // Signal that no Python exception occurred
+  return 0;
+}
+// magictoken.ex_mul_f32_f32.end
+
+
+// magictoken.ex_sum_reduce_proto.begin
+extern "C"
+__device__ int
+sum_reduce(
+  float* return_value,
+  float* array,
+  int n
+);
+// magictoken.ex_sum_reduce_proto.end
+
+
+// Performs a simple reduction on an array passed by pointer using the
+// ffi.from_buffer() method. Implements the prototype above.
+extern "C"
+__device__ int
+sum_reduce(
+  float* return_value,
+  float* array,
+  int n
+)
+{
+  double sum = 0.0;
+
+  for (size_t i = 0; i < n; ++i) {
+    sum += array[i];
+  }
+
+  *return_value = (float)sum;
+
+  return 0;
+}
```
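These functions follow the device-FFI convention Numba expects: an `extern "C"` function returning an `int` status code (0 meaning "no Python exception") and writing its result through the first pointer argument, while the Python-side declaration names only the logical signature. As a minimal sketch of the consuming side, mirroring `test_ffi.py` further down in this diff (the path to `functions.cu` is assumed):

```python
# Sketch of calling mul_f32_f32 from a Numba kernel (see test_ffi.py below).
from numba import cuda
import numpy as np

# Logical signature only: Numba synthesizes the status code and the
# return-value pointer to match the extern "C" ABI shown above.
mul = cuda.declare_device('mul_f32_f32', 'float32(float32, float32)')

@cuda.jit(link=['functions.cu'])  # assumed path to the source above
def multiply_vectors(r, x, y):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = mul(x[i], y[i])

x = np.arange(8, dtype=np.float32)
y = np.full(8, 2, dtype=np.float32)
r = np.zeros_like(x)
multiply_vectors[1, 8](r, x, y)
np.testing.assert_array_equal(r, x * y)
```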
numba_cuda/numba/cuda/tests/doc_examples/test_cg.py

```diff
@@ -0,0 +1,77 @@
+# Contents in this file are referenced from the sphinx-generated docs.
+# "magictoken" is used for markers as beginning and ending of example text.
+
+import unittest
+from numba.cuda.testing import (CUDATestCase, skip_on_cudasim,
+                                skip_if_cudadevrt_missing, skip_unless_cc_60,
+                                skip_if_mvc_enabled)
+
+
+@skip_if_cudadevrt_missing
+@skip_unless_cc_60
+@skip_if_mvc_enabled('CG not supported with MVC')
+@skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
+class TestCooperativeGroups(CUDATestCase):
+    def test_ex_grid_sync(self):
+        # magictoken.ex_grid_sync_kernel.begin
+        from numba import cuda, int32
+        import numpy as np
+
+        sig = (int32[:,::1],)
+
+        @cuda.jit(sig)
+        def sequential_rows(M):
+            col = cuda.grid(1)
+            g = cuda.cg.this_grid()
+
+            rows = M.shape[0]
+            cols = M.shape[1]
+
+            for row in range(1, rows):
+                opposite = cols - col - 1
+                # Each row's elements are one greater than the previous row
+                M[row, col] = M[row - 1, opposite] + 1
+                # Wait until all threads have written their column element,
+                # and that the write is visible to all other threads
+                g.sync()
+        # magictoken.ex_grid_sync_kernel.end
+
+        # magictoken.ex_grid_sync_data.begin
+        # Empty input data
+        A = np.zeros((1024, 1024), dtype=np.int32)
+        # A somewhat arbitrary choice (one warp), but generally smaller block sizes
+        # allow more blocks to be launched (noting that other limitations on
+        # occupancy apply such as shared memory size)
+        blockdim = 32
+        griddim = A.shape[1] // blockdim
+        # magictoken.ex_grid_sync_data.end
+
+        # Skip this test if the grid size used in the example is too large for
+        # a cooperative launch on the current GPU
+        mb = sequential_rows.overloads[sig].max_cooperative_grid_blocks(blockdim)
+        if mb < griddim:
+            self.skipTest('Device does not support a large enough coop grid')
+
+        # magictoken.ex_grid_sync_launch.begin
+        # Kernel launch - this is implicitly a cooperative launch
+        sequential_rows[griddim, blockdim](A)
+
+        # What do the results look like?
+        # print(A)
+        #
+        # [[   0    0    0 ...    0    0    0]
+        #  [   1    1    1 ...    1    1    1]
+        #  [   2    2    2 ...    2    2    2]
+        #  ...
+        #  [1021 1021 1021 ... 1021 1021 1021]
+        #  [1022 1022 1022 ... 1022 1022 1022]
+        #  [1023 1023 1023 ... 1023 1023 1023]]
+        # magictoken.ex_grid_sync_launch.end
+
+        # Sanity check - are the results what we expect?
+        reference = np.tile(np.arange(1024), (1024, 1)).T
+        np.testing.assert_equal(A, reference)
+
+
+if __name__ == '__main__':
+    unittest.main()
```
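A note on why this example needs a grid-wide sync: each thread writes `M[row, col]` but then reads `M[row - 1, cols - col - 1]`, a value produced by a different thread that generally resides in a different block. `cuda.syncthreads()` only synchronizes within one block, so the kernel uses `cg.this_grid()` / `g.sync()`, which is what makes the launch implicitly cooperative and why the `max_cooperative_grid_blocks()` guard above is needed.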
numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py

```diff
@@ -0,0 +1,76 @@
+import unittest
+
+from numba.cuda.testing import CUDATestCase, skip_on_cudasim
+from numba.tests.support import captured_stdout
+import numpy as np
+
+
+@skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
+class TestCpuGpuCompat(CUDATestCase):
+    """
+    Test compatibility of CPU and GPU functions
+    """
+
+    def setUp(self):
+        # Prevent output from this test showing up when running the test suite
+        self._captured_stdout = captured_stdout()
+        self._captured_stdout.__enter__()
+        super().setUp()
+
+    def tearDown(self):
+        # No exception type, value, or traceback
+        self._captured_stdout.__exit__(None, None, None)
+        super().tearDown()
+
+    def test_ex_cpu_gpu_compat(self):
+        # ex_cpu_gpu_compat.import.begin
+        from math import pi
+
+        import numba
+        from numba import cuda
+        # ex_cpu_gpu_compat.import.end
+
+        # ex_cpu_gpu_compat.allocate.begin
+        X = cuda.to_device([1, 10, 234])
+        Y = cuda.to_device([2, 2, 4014])
+        Z = cuda.to_device([3, 14, 2211])
+        results = cuda.to_device([0.0, 0.0, 0.0])
+        # ex_cpu_gpu_compat.allocate.end
+
+        # ex_cpu_gpu_compat.define.begin
+        @numba.jit
+        def business_logic(x, y, z):
+            return 4 * z * (2 * x - (4 * y) / 2 * pi)
+        # ex_cpu_gpu_compat.define.end
+
+        # ex_cpu_gpu_compat.cpurun.begin
+        print(business_logic(1, 2, 3))  # -126.79644737231007
+        # ex_cpu_gpu_compat.cpurun.end
+
+        # ex_cpu_gpu_compat.usegpu.begin
+        @cuda.jit
+        def f(res, xarr, yarr, zarr):
+            tid = cuda.grid(1)
+            if tid < len(xarr):
+                # The function decorated with numba.jit may be directly reused
+                res[tid] = business_logic(xarr[tid], yarr[tid], zarr[tid])
+        # ex_cpu_gpu_compat.usegpu.end
+
+        # ex_cpu_gpu_compat.launch.begin
+        f.forall(len(X))(results, X, Y, Z)
+        print(results)
+        # [-126.79644737231007, 416.28324559588634, -218912930.2987788]
+        # ex_cpu_gpu_compat.launch.end
+
+        expect = [
+            business_logic(x, y, z) for x, y, z in zip(X, Y, Z)
+        ]
+
+        np.testing.assert_equal(
+            expect,
+            results.copy_to_host()
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
```
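As a quick check on the value printed in the example: `business_logic(1, 2, 3)` evaluates to 4·3·(2·1 − (4·2)/2·π) = 12·(2 − 4π) = 24 − 48π ≈ −126.79644737231007, matching the comment in the test.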
numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py

```diff
@@ -0,0 +1,82 @@
+# Contents in this file are referenced from the sphinx-generated docs.
+# "magictoken" is used for markers as beginning and ending of example text.
+
+import unittest
+from numba.cuda.testing import (CUDATestCase, skip_on_cudasim)
+from numba.tests.support import skip_unless_cffi
+
+
+@skip_unless_cffi
+@skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
+class TestFFI(CUDATestCase):
+    def test_ex_linking_cu(self):
+        # magictoken.ex_linking_cu.begin
+        from numba import cuda
+        import numpy as np
+        import os
+
+        # Declaration of the foreign function
+        mul = cuda.declare_device('mul_f32_f32', 'float32(float32, float32)')
+
+        # Path to the source containing the foreign function
+        # (here assumed to be in a subdirectory called "ffi")
+        basedir = os.path.dirname(os.path.abspath(__file__))
+        functions_cu = os.path.join(basedir, 'ffi', 'functions.cu')
+
+        # Kernel that links in functions.cu and calls mul
+        @cuda.jit(link=[functions_cu])
+        def multiply_vectors(r, x, y):
+            i = cuda.grid(1)
+
+            if i < len(r):
+                r[i] = mul(x[i], y[i])
+
+        # Generate random data
+        N = 32
+        np.random.seed(1)
+        x = np.random.rand(N).astype(np.float32)
+        y = np.random.rand(N).astype(np.float32)
+        r = np.zeros_like(x)
+
+        # Run the kernel
+        multiply_vectors[1, 32](r, x, y)
+
+        # Sanity check - ensure the results match those expected
+        np.testing.assert_array_equal(r, x * y)
+        # magictoken.ex_linking_cu.end
+
+    def test_ex_from_buffer(self):
+        from numba import cuda
+        import os
+
+        basedir = os.path.dirname(os.path.abspath(__file__))
+        functions_cu = os.path.join(basedir, 'ffi', 'functions.cu')
+
+        # magictoken.ex_from_buffer_decl.begin
+        signature = 'float32(CPointer(float32), int32)'
+        sum_reduce = cuda.declare_device('sum_reduce', signature)
+        # magictoken.ex_from_buffer_decl.end
+
+        # magictoken.ex_from_buffer_kernel.begin
+        import cffi
+        ffi = cffi.FFI()
+
+        @cuda.jit(link=[functions_cu])
+        def reduction_caller(result, array):
+            array_ptr = ffi.from_buffer(array)
+            result[()] = sum_reduce(array_ptr, len(array))
+        # magictoken.ex_from_buffer_kernel.end
+
+        import numpy as np
+        x = np.arange(10).astype(np.float32)
+        r = np.ndarray((), dtype=np.float32)
+
+        reduction_caller[1, 1](r, x)
+
+        expected = np.sum(x)
+        actual = r[()]
+        np.testing.assert_allclose(expected, actual)
+
+
+if __name__ == '__main__':
+    unittest.main()
```
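In `test_ex_from_buffer` above, `ffi.from_buffer(array)` yields a raw pointer to the array's data, which is why the function is declared with a `CPointer(float32)` argument rather than an array type: the C implementation of `sum_reduce` in functions.cu receives only the pointer and an element count.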
numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py

```diff
@@ -0,0 +1,155 @@
+import unittest
+
+from numba.cuda.testing import (CUDATestCase, skip_if_cudadevrt_missing,
+                                skip_on_cudasim, skip_unless_cc_60,
+                                skip_if_mvc_enabled)
+from numba.tests.support import captured_stdout
+
+
+@skip_if_cudadevrt_missing
+@skip_unless_cc_60
+@skip_if_mvc_enabled('CG not supported with MVC')
+@skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
+class TestLaplace(CUDATestCase):
+    """
+    Test simple vector addition
+    """
+
+    def setUp(self):
+        # Prevent output from this test showing up when running the test suite
+        self._captured_stdout = captured_stdout()
+        self._captured_stdout.__enter__()
+        super().setUp()
+
+    def tearDown(self):
+        # No exception type, value, or traceback
+        self._captured_stdout.__exit__(None, None, None)
+        super().tearDown()
+
+    def test_ex_laplace(self):
+
+        # set True to regenerate the figures that
+        # accompany this example
+        plot = False
+
+        # ex_laplace.import.begin
+        import numpy as np
+        from numba import cuda
+        # ex_laplace.import.end
+
+        # ex_laplace.allocate.begin
+        # Use an odd problem size.
+        # This is so there can be an element truly in the "middle" for symmetry.
+        size = 1001
+        data = np.zeros(size)
+
+        # Middle element is made very hot
+        data[500] = 10000
+        buf_0 = cuda.to_device(data)
+
+        # This extra array is used for synchronization purposes
+        buf_1 = cuda.device_array_like(buf_0)
+
+        niter = 10000
+        # ex_laplace.allocate.end
+
+        if plot:
+            import matplotlib.pyplot as plt
+            fig, ax = plt.subplots(figsize=(16 * 0.66, 9 * 0.66))
+            plt.plot(
+                np.arange(len(buf_0)),
+                buf_0.copy_to_host(),
+                lw=3,
+                marker="*",
+                color='black'
+            )
+
+            plt.title('Initial State', fontsize=24)
+            plt.xlabel('Position', fontsize=24)
+            plt.ylabel('Temperature', fontsize=24)
+
+            ax.set_xticks(ax.get_xticks(), fontsize=16)
+            ax.set_yticks(ax.get_yticks(), fontsize=16)
+            plt.xlim(0, len(data))
+            plt.ylim(0, 10001)
+            plt.savefig('laplace_initial.svg')
+
+        # ex_laplace.kernel.begin
+        @cuda.jit
+        def solve_heat_equation(buf_0, buf_1, timesteps, k):
+            i = cuda.grid(1)
+
+            # Don't continue if our index is outside the domain
+            if i >= len(buf_0):
+                return
+
+            # Prepare to do a grid-wide synchronization later
+            grid = cuda.cg.this_grid()
+
+            for step in range(timesteps):
+                # Select the buffer from the previous timestep
+                if (step % 2) == 0:
+                    data = buf_0
+                    next_data = buf_1
+                else:
+                    data = buf_1
+                    next_data = buf_0
+
+                # Get the current temperature associated with this point
+                curr_temp = data[i]
+
+                # Apply formula from finite difference equation
+                if i == 0:
+                    # Left wall is held at T = 0
+                    next_temp = curr_temp + k * (data[i + 1] - (2 * curr_temp))
+                elif i == len(data) - 1:
+                    # Right wall is held at T = 0
+                    next_temp = curr_temp + k * (data[i - 1] - (2 * curr_temp))
+                else:
+                    # Interior points are a weighted average of their neighbors
+                    next_temp = curr_temp + k * (
+                        data[i - 1] - (2 * curr_temp) + data[i + 1]
+                    )
+
+                # Write new value to the next buffer
+                next_data[i] = next_temp
+
+                # Wait for every thread to write before moving on
+                grid.sync()
+        # ex_laplace.kernel.end
+
+        # ex_laplace.launch.begin
+        solve_heat_equation.forall(len(data))(
+            buf_0, buf_1, niter, 0.25
+        )
+        # ex_laplace.launch.end
+
+        results = buf_1.copy_to_host()
+        if plot:
+            fig, ax = plt.subplots(figsize=(16 * 0.66, 9 * 0.66))
+            plt.plot(
+                np.arange(len(results)),
+                results, lw=3,
+                marker="*",
+                color='black'
+            )
+            plt.title(f"T = {niter}", fontsize=24)
+            plt.xlabel('Position', fontsize=24)
+            plt.ylabel('Temperature', fontsize=24)
+
+            ax.set_xticks(ax.get_xticks(), fontsize=16)
+            ax.set_yticks(ax.get_yticks(), fontsize=16)
+
+            plt.ylim(0, max(results))
+            plt.xlim(0, len(results))
+            plt.savefig('laplace_final.svg')
+
+        # Integral over the domain should be equal to its initial value.
+        # Note that this should match the initial value of data[500] above, but
+        # we don't assign it to a variable because that would make the example
+        # code look a bit oddly verbose.
+        np.testing.assert_allclose(results.sum(), 10000)
+
+
+if __name__ == "__main__":
+    unittest.main()
```
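The kernel above is the explicit (FTCS) finite-difference scheme for the 1-D heat equation: at interior points it computes $u_i^{t+1} = u_i^t + k\,(u_{i-1}^t - 2u_i^t + u_{i+1}^t)$, which is numerically stable for $k \le 1/2$; the launch uses $k = 0.25$. The two buffers swap roles each step so reads always come from the previous timestep, with `grid.sync()` separating steps. The final assertion passes because the diffusion front has spread only about $\sqrt{2 k \cdot \mathrm{niter}} \approx 71$ points from the center, far short of the $T = 0$ walls 500 points away, so essentially no heat has leaked out of the domain.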
numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py

```diff
@@ -0,0 +1,173 @@
+"""
+Matrix multiplication example via `cuda.jit`.
+
+Reference: https://stackoverflow.com/a/64198479/13697228 by @RobertCrovella
+
+Contents in this file are referenced from the sphinx-generated docs.
+"magictoken" is used for markers as beginning and ending of example text.
+"""
+import unittest
+from numba.cuda.testing import CUDATestCase, skip_on_cudasim
+from numba.tests.support import captured_stdout
+
+
+@skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
+class TestMatMul(CUDATestCase):
+    """
+    Text matrix multiplication using simple, shared memory/square, and shared
+    memory/nonsquare cases.
+    """
+
+    def setUp(self):
+        # Prevent output from this test showing up when running the test suite
+        self._captured_stdout = captured_stdout()
+        self._captured_stdout.__enter__()
+        super().setUp()
+
+    def tearDown(self):
+        # No exception type, value, or traceback
+        self._captured_stdout.__exit__(None, None, None)
+        super().tearDown()
+
+    def test_ex_matmul(self):
+        """Test of matrix multiplication on various cases."""
+        # magictoken.ex_import.begin
+        from numba import cuda, float32
+        import numpy as np
+        import math
+        # magictoken.ex_import.end
+
+        # magictoken.ex_matmul.begin
+        @cuda.jit
+        def matmul(A, B, C):
+            """Perform square matrix multiplication of C = A * B."""
+            i, j = cuda.grid(2)
+            if i < C.shape[0] and j < C.shape[1]:
+                tmp = 0.
+                for k in range(A.shape[1]):
+                    tmp += A[i, k] * B[k, j]
+                C[i, j] = tmp
+        # magictoken.ex_matmul.end
+
+        # magictoken.ex_run_matmul.begin
+        x_h = np.arange(16).reshape([4, 4])
+        y_h = np.ones([4, 4])
+        z_h = np.zeros([4, 4])
+
+        x_d = cuda.to_device(x_h)
+        y_d = cuda.to_device(y_h)
+        z_d = cuda.to_device(z_h)
+
+        threadsperblock = (16, 16)
+        blockspergrid_x = math.ceil(z_h.shape[0] / threadsperblock[0])
+        blockspergrid_y = math.ceil(z_h.shape[1] / threadsperblock[1])
+        blockspergrid = (blockspergrid_x, blockspergrid_y)
+
+        matmul[blockspergrid, threadsperblock](x_d, y_d, z_d)
+        z_h = z_d.copy_to_host()
+        print(z_h)
+        print(x_h @ y_h)
+        # magictoken.ex_run_matmul.end
+
+        # magictoken.ex_fast_matmul.begin
+        # Controls threads per block and shared memory usage.
+        # The computation will be done on blocks of TPBxTPB elements.
+        # TPB should not be larger than 32 in this example
+        TPB = 16
+
+        @cuda.jit
+        def fast_matmul(A, B, C):
+            """
+            Perform matrix multiplication of C = A * B using CUDA shared memory.
+
+            Reference: https://stackoverflow.com/a/64198479/13697228 by @RobertCrovella
+            """
+            # Define an array in the shared memory
+            # The size and type of the arrays must be known at compile time
+            sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
+            sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
+
+            x, y = cuda.grid(2)
+
+            tx = cuda.threadIdx.x
+            ty = cuda.threadIdx.y
+            bpg = cuda.gridDim.x    # blocks per grid
+
+            # Each thread computes one element in the result matrix.
+            # The dot product is chunked into dot products of TPB-long vectors.
+            tmp = float32(0.)
+            for i in range(bpg):
+                # Preload data into shared memory
+                sA[ty, tx] = 0
+                sB[ty, tx] = 0
+                if y < A.shape[0] and (tx + i * TPB) < A.shape[1]:
+                    sA[ty, tx] = A[y, tx + i * TPB]
+                if x < B.shape[1] and (ty + i * TPB) < B.shape[0]:
+                    sB[ty, tx] = B[ty + i * TPB, x]
+
+                # Wait until all threads finish preloading
+                cuda.syncthreads()
+
+                # Computes partial product on the shared memory
+                for j in range(TPB):
+                    tmp += sA[ty, j] * sB[j, tx]
+
+                # Wait until all threads finish computing
+                cuda.syncthreads()
+            if y < C.shape[0] and x < C.shape[1]:
+                C[y, x] = tmp
+        # magictoken.ex_fast_matmul.end
+
+        # magictoken.ex_run_fast_matmul.begin
+        x_h = np.arange(16).reshape([4, 4])
+        y_h = np.ones([4, 4])
+        z_h = np.zeros([4, 4])
+
+        x_d = cuda.to_device(x_h)
+        y_d = cuda.to_device(y_h)
+        z_d = cuda.to_device(z_h)
+
+        threadsperblock = (TPB, TPB)
+        blockspergrid_x = math.ceil(z_h.shape[0] / threadsperblock[0])
+        blockspergrid_y = math.ceil(z_h.shape[1] / threadsperblock[1])
+        blockspergrid = (blockspergrid_x, blockspergrid_y)
+
+        fast_matmul[blockspergrid, threadsperblock](x_d, y_d, z_d)
+        z_h = z_d.copy_to_host()
+        print(z_h)
+        print(x_h @ y_h)
+        # magictoken.ex_run_fast_matmul.end
+
+        # fast_matmul test(s)
+        msg = "fast_matmul incorrect for shared memory, square case."
+        self.assertTrue(np.all(z_h == x_h @ y_h), msg=msg)
+
+        # magictoken.ex_run_nonsquare.begin
+        x_h = np.arange(115).reshape([5, 23])
+        y_h = np.ones([23, 7])
+        z_h = np.zeros([5, 7])
+
+        x_d = cuda.to_device(x_h)
+        y_d = cuda.to_device(y_h)
+        z_d = cuda.to_device(z_h)
+
+        threadsperblock = (TPB, TPB)
+        grid_y_max = max(x_h.shape[0], y_h.shape[0])
+        grid_x_max = max(x_h.shape[1], y_h.shape[1])
+        blockspergrid_x = math.ceil(grid_x_max / threadsperblock[0])
+        blockspergrid_y = math.ceil(grid_y_max / threadsperblock[1])
+        blockspergrid = (blockspergrid_x, blockspergrid_y)
+
+        fast_matmul[blockspergrid, threadsperblock](x_d, y_d, z_d)
+        z_h = z_d.copy_to_host()
+        print(z_h)
+        print(x_h @ y_h)
+        # magictoken.ex_run_nonsquare.end
+
+        # nonsquare fast_matmul test(s)
+        msg = "fast_matmul incorrect for shared memory, non-square case."
+        self.assertTrue(np.all(z_h == x_h @ y_h), msg=msg)
+
+
+if __name__ == '__main__':
+    unittest.main()
```
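Why the tiled version is worth the extra complexity: in the naive `matmul`, every thread streams a full row of A and column of B from global memory, roughly $2N^3$ global loads for $N \times N$ operands; `fast_matmul` stages TPB×TPB tiles in shared memory so each global element is fetched once per block rather than once per thread in the block, reducing global traffic to about $2N^3/\mathrm{TPB}$ (a 16× reduction with the settings above). The cap of TPB ≤ 32 noted in the comments follows from the CUDA limit of 1024 threads per block (32 × 32), at which point the two float32 tiles occupy 8 KB of shared memory per block.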