pyopencl 2024.2.6__cp39-cp39-macosx_11_0_arm64.whl → 2024.3__cp39-cp39-macosx_11_0_arm64.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published in their respective registries.

Potentially problematic release: this version of pyopencl might be problematic.

Files changed (38)
  1. pyopencl/__init__.py +127 -122
  2. pyopencl/_cl.cpython-39-darwin.so +0 -0
  3. pyopencl/_mymako.py +3 -3
  4. pyopencl/algorithm.py +10 -7
  5. pyopencl/array.py +50 -40
  6. pyopencl/bitonic_sort.py +3 -1
  7. pyopencl/bitonic_sort_templates.py +1 -1
  8. pyopencl/cache.py +23 -22
  9. pyopencl/capture_call.py +5 -4
  10. pyopencl/clrandom.py +1 -0
  11. pyopencl/compyte/dtypes.py +4 -4
  12. pyopencl/compyte/pyproject.toml +54 -0
  13. pyopencl/elementwise.py +9 -2
  14. pyopencl/invoker.py +11 -9
  15. pyopencl/ipython_ext.py +1 -1
  16. pyopencl/reduction.py +16 -10
  17. pyopencl/scan.py +38 -22
  18. pyopencl/tools.py +23 -13
  19. {pyopencl-2024.2.6.dist-info → pyopencl-2024.3.dist-info}/METADATA +11 -8
  20. pyopencl-2024.3.dist-info/RECORD +42 -0
  21. {pyopencl-2024.2.6.dist-info → pyopencl-2024.3.dist-info}/WHEEL +1 -1
  22. pyopencl/compyte/.git +0 -1
  23. pyopencl/compyte/ndarray/Makefile +0 -31
  24. pyopencl/compyte/ndarray/__init__.py +0 -0
  25. pyopencl/compyte/ndarray/gen_elemwise.py +0 -1907
  26. pyopencl/compyte/ndarray/gen_reduction.py +0 -1511
  27. pyopencl/compyte/ndarray/gpu_ndarray.h +0 -35
  28. pyopencl/compyte/ndarray/pygpu_language.h +0 -207
  29. pyopencl/compyte/ndarray/pygpu_language_cuda.cu +0 -622
  30. pyopencl/compyte/ndarray/pygpu_language_opencl.cpp +0 -317
  31. pyopencl/compyte/ndarray/pygpu_ndarray.cpp +0 -1546
  32. pyopencl/compyte/ndarray/pygpu_ndarray.h +0 -71
  33. pyopencl/compyte/ndarray/pygpu_ndarray_object.h +0 -232
  34. pyopencl/compyte/ndarray/setup_opencl.py +0 -101
  35. pyopencl/compyte/ndarray/test_gpu_elemwise.py +0 -411
  36. pyopencl/compyte/ndarray/test_gpu_ndarray.py +0 -487
  37. pyopencl-2024.2.6.dist-info/RECORD +0 -56
  38. {pyopencl-2024.2.6.dist-info → pyopencl-2024.3.dist-info}/licenses/LICENSE +0 -0
pyopencl/compyte/ndarray/gen_reduction.py
@@ -1,1511 +0,0 @@
1
- import numpy
2
- import StringIO
3
-
4
- _CL_MODE = False # "pyopencl" in __name__
5
-
6
-
7
- if _CL_MODE:
8
- # THIS IS NOT FINISHED
9
- import pyopencl as cl
10
- import pyopencl.array as cl_array
11
- from pyopencl.tools import dtype_to_ctype
12
- # import pyopencl._mymako as mako
13
- from pyopencl._cluda import CLUDA_PREAMBLE
14
-
15
- # TODO: use mako to get rid of the %if
16
- CLUDA_PREAMBLE = CLUDA_PREAMBLE[:455]
17
- CLUDA_PREAMBLE += """
18
- #define LDIM_0 get_local_id(0)
19
- #define LDIM_1 get_local_id(1)
20
- #define LDIM_2 get_local_id(2)
21
-
22
- #define GDIM_0 get_global_id(0)
23
- #define GDIM_1 get_global_id(1)
24
- #define GDIM_2 get_global_id(2)
25
- """
26
- # TODO, reuse the same context as the use used to create the memory.
27
- ctx = cl.create_some_context()
28
- queue = cl.CommandQueue(ctx)
29
- else:
30
- import pycuda.autoinit
31
- import pycuda.driver as driver
32
- from pycuda.compiler import SourceModule
33
- from pycuda.tools import dtype_to_ctype
34
- # import pycuda._mymako as mako
35
- from pycuda._cluda import CLUDA_PREAMBLE
36
- CLUDA_PREAMBLE += """
37
- #define LDIM_0 blockDim.x
38
- #define LDIM_1 blockDim.y
39
- #define LDIM_2 blockDim.z
40
-
41
- #define GDIM_0 gridDim.x
42
- #define GDIM_1 gridDim.y
43
- #define GDIM_2 gridDim.z
44
- """
45
-
46
- import logging
47
-
48
- import theano
49
- from theano import Apply, scalar
50
- from theano.sandbox.cuda import CudaNdarrayType
51
- from theano.tensor import TensorType
52
-
53
- _logger_name = 'compyte.gen_reduction'
54
- _logger = logging.getLogger(_logger_name)
55
- _logger.setLevel(logging.INFO)
56
- _logger.addHandler(logging.StreamHandler()) # TO REMOVE
57
-
58
-
59
- def warning(*msg):
60
- _logger.warning(_logger_name + 'WARNING: ' + ' '.join(str(m) for m in msg))
61
-
62
-
63
- def info(*msg):
64
- _logger.info(_logger_name + 'INFO: ' + ' '.join(str(m) for m in msg))
65
-
66
-
67
- def debug(*msg):
68
- _logger.debug(_logger_name + 'DEBUG: ' + ' '.join(str(m) for m in msg))
69
-
70
-
71
- import pygpu_ndarray as gpu_ndarray
72
-
73
-
74
- class GpuSum:
75
- """GpuSum is a Reduction along some dimensions by summation.
76
-
77
- The dimensions along which to sum is specified by the
78
- `reduce_mask` that you pass to the constructor. The `reduce_mask`
79
- is a tuple of booleans (actually integers 0 or 1) that specify for
80
- each input dimension, whether to reduce it (1) or not (0).
81
-
82
- For example:
83
-
84
- - reduce_mask == (1,) sums a vector to a scalar
85
-
86
- - reduce_mask == (1,0) computes the sum of each column in a matrix
87
-
88
- - reduce_mask == (0,1) computes the sum of each row in a matrix
89
-
90
- - reduce_mask == (1,1,1) computes the sum of all elements in a
91
- 3-tensor.
92
-
93
- :note: any reduce_mask of all zeros is a sort of 'copy', and may
94
- be removed during graph optimization
95
-
96
- """
97
- def __init__(self, reduce_mask, dtype):
98
- self.reduce_mask = tuple(reduce_mask)
99
- # input, output and accumulator dtype
100
- self.dtype = dtype_to_ctype(dtype)
101
-
102
- def __eq__(self, other):
103
- return (type(self) == type(other) and
104
- self.reduce_mask == other.reduce_mask)
105
-
106
- def __hash__(self):
107
- return hash(type(self)) ^ hash(self.reduce_mask)
108
-
109
- def __str__(self):
110
- return "GpuSum{%s}" % ','.join(str(i) for i in self.reduce_mask)
111
-
112
- def make_node(self, x):
113
- if (x.type.ndim != len(self.reduce_mask)):
114
- raise TypeError("x must have rank %i" % len(self.reduce_mask))
115
- o_broadcast = [x.type.broadcastable[i]
116
- for i in range(x.type.ndim) if not self.reduce_mask[i]]
117
- return Apply(self, [x], [CudaNdarrayType(o_broadcast)()])
118
-
119
- def perform(self, node, inp, out):
120
- x, = inp
121
- z, = out
122
- z[0] = x.reduce_sum(self.reduce_mask)
123
-
124
- def c_code(self, node, name, inp, out, sub):
125
- x, = inp
126
- z, = out
127
-
128
- nd_in = node.inputs[0].type.ndim
129
- nd_out = node.outputs[0].type.ndim
130
-
131
- assert nd_in - nd_out == sum(self.reduce_mask)
132
-
133
- sio = StringIO.StringIO()
134
- fail = sub['fail']
135
-
136
- #check input
137
- print("""
138
- if (%(x)s->nd != %(nd_in)s)
139
- {
140
- PyErr_Format(PyExc_TypeError,
141
- "required nd=%(nd_in)s, got nd=%%i", %(x)s->nd);
142
- %(fail)s;
143
- }
144
- """ % locals(), file=sio)
145
-
146
- #
147
- # alloc an output if we need one
148
- #
149
-
150
- # check the basics of out output
151
- print("""
152
- if ( !%(z)s
153
- || (%(z)s->nd != %(nd_out)s)
154
- """ % locals(), file=sio)
155
-
156
- #ensure that the output has the right non-reduced dimensions
157
- j = 0
158
- for i in range(nd_in):
159
- if not self.reduce_mask[i]:
160
- print((" || (CudaNdarray_HOST_DIMS(%(z)s)[%(j)s] !="
161
- "CudaNdarray_HOST_DIMS(%(x)s)[%(i)s]) " %
162
- locals()), file=sio)
163
- j += 1
164
-
165
- print("""
166
- )
167
- {
168
- """ % locals(), file=sio)
169
- print("int new_dims[%(nd_out)s]; " % locals(), file=sio)
170
-
171
- j = 0
172
- for i in range(nd_in):
173
- if not self.reduce_mask[i]:
174
- print(('new_dims[%(j)s] = CudaNdarray_HOST_DIMS'
175
- '(%(x)s)[%(i)s];' % locals()), file=sio)
176
- j += 1
177
-
178
- print("""
179
- Py_XDECREF(%(z)s);
180
- %(z)s = (CudaNdarray*) CudaNdarray_NewDims(%(nd_out)s, new_dims);
181
- if (NULL == %(z)s)
182
- {
183
- PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
184
- %(fail)s;
185
- }
186
- }
187
- """ % locals(), file=sio)
188
-
189
- # \begin bracket the reduction in a check that there is
190
- # actually work to do
191
- print("""
192
- if (CudaNdarray_SIZE(%(z)s))
193
- {
194
- """ % locals(), file=sio)
195
-
196
- #
197
- # Now perform the reduction
198
- #
199
-
200
- if all(i == 1 for i in self.reduce_mask):
201
- #check if the tensor is ccontiguous, if true, use the
202
- #c_c0de_reduce_ccontig code.
203
- #TODO: check if we are ccontiguous when we un-dimshuffle
204
- #TODO: if only some dims are ccontiguous, call version
205
- # with less dims.
206
-
207
- print('if(CudaNdarray_is_c_contiguous(%(x)s)){' % locals(), file=sio)
208
- self.c_code_reduce_ccontig(sio, node, name, x, z, fail)
209
- print("}else{", file=sio)
210
- getattr(self, 'c_code_reduce_%s' % (''.join(
211
- str(i) for i in self.reduce_mask)))(sio, node, name,
212
- x, z, fail)
213
- print("}", file=sio)
214
- else:
215
- getattr(self, 'c_code_reduce_%s' % (''.join(
216
- str(i) for i in self.reduce_mask)))(sio, node, name,
217
- x, z, fail)
218
-
219
- # \end bracket the reduction ...
220
- print("""
221
- }
222
- """ % locals(), file=sio)
223
-
224
- return sio.getvalue()
225
-
226
- def _makecall(self, node, name, x, z, fail, pattern=None):
227
- """Return a string for making a kernel call.
228
-
229
- The return value looks something like:
230
-
231
- .. code-block:: c
232
-
233
- if (verbose)
234
- printf("running kernel_reduce_sum_10_%(name)s\\n");
235
- int n_shared = sizeof(%(dtype)s) * n_threads.x;
236
- kernel_reduce_sum_10_%(name)s<<<n_blocks,
237
- n_threads, n_shared>>>(
238
- CudaNdarray_HOST_DIMS(%(x)s)[0],
239
- CudaNdarray_HOST_DIMS(%(x)s)[1],
240
- CudaNdarray_DEV_DATA(%(x)s),
241
- CudaNdarray_HOST_STRIDES(%(x)s)[0],
242
- CudaNdarray_HOST_STRIDES(%(x)s)[1],
243
- CudaNdarray_DEV_DATA(%(z)s),
244
- CudaNdarray_HOST_STRIDES(%(z)s)[0]
245
- );
246
- CNDA_THREAD_SYNC;
247
- if (cudaSuccess != cudaGetLastError())
248
- {
249
- PyErr_Format(PyExc_RuntimeError, "Cuda error: ... );
250
- %(fail)s;
251
- }
252
- """
253
- sio = StringIO.StringIO()
254
- if pattern is None:
255
- pattern = ''.join(str(c) for c in self.reduce_mask)
256
- ndim = len(self.reduce_mask)
257
- nd_out = ndim - sum(self.reduce_mask)
258
- print("""
259
- if (verbose)
260
- printf("running kernel_reduce_sum_%(pattern)s_%(name)s\\n");
261
- int n_shared = sizeof(%(dtype)s) * n_threads.x *
262
- n_threads.y * n_threads.z;
263
- if (verbose>1)
264
- printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d,"
265
- " nb_threads=%%d, n_blocks.x=%%d, n_blocks.y=%%d,"
266
- " nb_block=%%d, n_shared=%%d\\n",
267
- n_threads.x,n_threads.y,n_threads.z,
268
- n_threads.x*n_threads.y*n_threads.z,
269
- n_blocks.x,n_blocks.y,
270
- n_blocks.x*n_blocks.y, n_shared);
271
- kernel_reduce_sum_%(pattern)s_%(name)s<<<n_blocks,
272
- n_threads, n_shared>>>(
273
- """ % locals(), file=sio)
274
- for i in range(ndim):
275
- print("""
276
- CudaNdarray_HOST_DIMS(%(x)s)[%(i)s],
277
- """ % locals(), file=sio)
278
- print("""
279
- CudaNdarray_DEV_DATA(%(x)s)
280
- """ % locals(), file=sio)
281
- for i in range(ndim):
282
- print("""
283
- ,CudaNdarray_HOST_STRIDES(%(x)s)[%(i)s]
284
- """ % locals(), file=sio)
285
- print("""
286
- ,CudaNdarray_DEV_DATA(%(z)s)
287
- """ % locals(), file=sio)
288
- for i in range(nd_out):
289
- print("""
290
- ,CudaNdarray_HOST_STRIDES(%(z)s)[%(i)s]
291
- """ % locals(), file=sio)
292
- print("""
293
- );
294
- CNDA_THREAD_SYNC;
295
- cudaError_t sts = cudaGetLastError();
296
- if (cudaSuccess != sts)
297
- {
298
- PyErr_Format(PyExc_RuntimeError,
299
- "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
300
- "kernel_reduce_sum_%(pattern)s_%(name)s",
301
- cudaGetErrorString(sts),
302
- n_blocks.x,
303
- n_blocks.y,
304
- n_threads.x,
305
- n_threads.y,
306
- n_threads.z);
307
- %(fail)s;
308
- }
309
- """ % locals(), file=sio)
310
- return sio.getvalue()
311
-
312
- def _k_decl(self, nodename,
313
- pattern=None, ndim=None, reduce_mask=None):
314
- """Return a string to declare a kernel function
315
-
316
- .. code-block:: c
317
-
318
- __global__ void kernel_reduce_sum_110_%(nodename)s(
319
- const int d0,
320
- const int d1,
321
- const int d2,
322
- const %(dtype)s *A,
323
- const int sA0,
324
- const int sA1,
325
- const int sA2,
326
- %(dtype)s * Z,
327
- const int sZ0)
328
-
329
- """
330
- dtype = self.dtype
331
- if reduce_mask is None:
332
- reduce_mask = self.reduce_mask
333
- if ndim is None:
334
- ndim = len(reduce_mask)
335
- if pattern is None:
336
- pattern = ''.join(str(i) for i in reduce_mask)
337
- sio = StringIO.StringIO()
338
-
339
- print("""
340
- __global__ void kernel_reduce_sum_%(pattern)s_%(nodename)s(
341
- """ % locals(), file=sio)
342
-
343
- for i in range(ndim):
344
- print("""const int d%(i)s,""" % locals(), file=sio)
345
-
346
- print("""const %(dtype)s *A,""" % locals(), file=sio)
347
-
348
- for i in range(ndim):
349
- print("""const int sA%(i)s,""" % locals(), file=sio)
350
-
351
- print("""%(dtype)s * Z""" % locals(), file=sio)
352
-
353
- for i in range(ndim - sum(reduce_mask)):
354
- print(""", const int sZ%(i)s""" % locals(), file=sio)
355
-
356
- print(")", file=sio)
357
-
358
- return sio.getvalue()
359
-
360
- def _k_init(self, *args):
361
- dtype = self.dtype
362
- return """
363
- const int threadCount = blockDim.x * blockDim.y * blockDim.z;
364
- const int threadNum = threadIdx.z * blockDim.x * blockDim.y
365
- + threadIdx.y * blockDim.x + threadIdx.x;
366
- extern __shared__ %(dtype)s buf[];
367
- %(dtype)s mysum = 0.0f;
368
-
369
- if (warpSize != 32){ //TODO: set error code
370
- Z[0] = 666;
371
- return;
372
- }
373
-
374
- """ % locals()
375
-
376
- def _k_reduce_buf(self, z_pos):
377
- return """
378
- __syncthreads(); // some kernel do multiple reduction.
379
- buf[threadNum] = mysum;
380
- __syncthreads();
381
-
382
- // rest of function is handled by one warp
383
- if (threadNum < warpSize)
384
- {
385
- //round up all the partial sums into the first `warpSize` elements
386
- for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
387
- {
388
- mysum += buf[i];
389
- }
390
- buf[threadNum] = mysum;
391
- if (threadNum < 16)
392
- {
393
- //reduce so that threadNum 0 has the sum of everything
394
- if(threadNum + 16 < threadCount)
395
- buf[threadNum] += buf[threadNum+16];
396
- if(threadNum + 8 < threadCount)
397
- buf[threadNum] += buf[threadNum+8];
398
- if(threadNum + 4 < threadCount)
399
- buf[threadNum] += buf[threadNum+4];
400
- if(threadNum + 2 < threadCount)
401
- buf[threadNum] += buf[threadNum+2];
402
- if(threadNum + 1 < threadCount)
403
- buf[threadNum] += buf[threadNum+1];
404
- if (threadNum == 0)
405
- {
406
- %(z_pos)s = buf[0];
407
- }
408
- }
409
- }
410
- """ % locals()
411
- return """
412
- __syncthreads(); // some kernel do multiple reduction.
413
- buf[threadNum] = mysum;
414
- __syncthreads();
415
-
416
- // rest of function is handled by one warp
417
- if (threadNum < warpSize)
418
- {
419
- //round up all the partial sums into the first `warpSize` elements
420
- for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
421
- {
422
- mysum += buf[i];
423
- }
424
- buf[threadNum] = mysum;
425
- /*Comment this optimization as it don't work on Fermi GPU.
426
- TODO: find why it don't work or put the GPU compute capability into the version
427
- // no sync because only one warp is running
428
- if(threadCount >32)
429
- {
430
- buf[threadNum] += buf[threadNum+16];
431
- buf[threadNum] += buf[threadNum+8];
432
- buf[threadNum] += buf[threadNum+4];
433
- buf[threadNum] += buf[threadNum+2];
434
- buf[threadNum] += buf[threadNum+1];
435
- if (threadNum == 0)
436
- {
437
- %(z_pos)s = buf[0];
438
- }
439
-
440
- }
441
- else */
442
- if (threadNum < 16)
443
- {
444
- //reduce so that threadNum 0 has the sum of everything
445
- if(threadNum + 16 < threadCount)
446
- buf[threadNum] += buf[threadNum+16];
447
- if(threadNum + 8 < threadCount)
448
- buf[threadNum] += buf[threadNum+8];
449
- if(threadNum + 4 < threadCount)
450
- buf[threadNum] += buf[threadNum+4];
451
- if(threadNum + 2 < threadCount)
452
- buf[threadNum] += buf[threadNum+2];
453
- if(threadNum + 1 < threadCount)
454
- buf[threadNum] += buf[threadNum+1];
455
- if (threadNum == 0)
456
- {
457
- %(z_pos)s = buf[0];
458
- }
459
- }
460
- }
461
- """ % locals()
462
-
463
- # Threads must be organized as: threadNum%nb_reduce correspond to
464
- # the same sum
465
- # nb_reduce<=warpSize
466
- def _k_reduce_buf_multiple(self, z_pos, nb_reduce):
467
- return """
468
- __syncthreads(); // some kernel do multiple reduction.
469
- buf[threadNum] = mysum;
470
- __syncthreads();
471
-
472
- // rest of function is handled by one warp
473
- if (threadNum < %(nb_reduce)s)
474
- {
475
- //round up all the partial sums into the first `nb_reduce` elements
476
- for (int i = threadNum + %(nb_reduce)s;
477
- i < threadCount; i += %(nb_reduce)s)
478
- {
479
- mysum += buf[i];
480
- }
481
- %(z_pos)s = mysum;
482
- }
483
- """ % locals()
484
-
485
- def c_code_reduce_ccontig(self, sio, node, name, x, z, fail):
486
- print("""
487
- {
488
- if(CudaNdarray_SIZE(%(x)s)==0){
489
- cudaMemset(CudaNdarray_DEV_DATA(%(z)s),0,sizeof(%(dtype)s));
490
- }else{
491
- int verbose = 0;
492
- dim3 n_threads(
493
- std::min(CudaNdarray_SIZE(%(x)s),
494
- NUM_VECTOR_OP_THREADS_PER_BLOCK));
495
- dim3 n_blocks(1);
496
- if (verbose)
497
- printf("running kernel_reduce_sum_ccontig_%(name)s"
498
- " n_threads.x=%%d, size=%%d, ndim=%%d\\n",
499
- n_threads.x,CudaNdarray_SIZE(%(x)s),%(x)s->nd);
500
- int n_shared = sizeof(%(dtype)s) * n_threads.x;
501
- kernel_reduce_sum_ccontig_%(name)s<<<n_blocks,
502
- n_threads, n_shared>>>(
503
- CudaNdarray_SIZE(%(x)s),
504
- CudaNdarray_DEV_DATA(%(x)s),
505
- CudaNdarray_DEV_DATA(%(z)s));
506
- CNDA_THREAD_SYNC;
507
- cudaError_t sts = cudaGetLastError();
508
- if (cudaSuccess != sts)
509
- {
510
- PyErr_Format(PyExc_RuntimeError,
511
- "Cuda error: %%s: %%s. (grid: %%i x %%i;"
512
- " block: %%i x %%i x %%i)\\n",
513
- "kernel_reduce_sum_ccontig_%(name)s",
514
- cudaGetErrorString(sts),
515
- n_blocks.x,
516
- n_blocks.y,
517
- n_threads.x,
518
- n_threads.y,
519
- n_threads.z);
520
- %(fail)s;
521
- }
522
- }
523
- }
524
- """ % locals(), file=sio)
525
-
526
- def c_code_reduce_1(self, sio, node, name, x, z, fail):
527
- makecall = self._makecall(node, name, x, z, fail)
528
- print("""
529
- {
530
- int verbose = 0;
531
- dim3 n_threads(
532
- std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
533
- NUM_VECTOR_OP_THREADS_PER_BLOCK));
534
- dim3 n_blocks(1);
535
- %(makecall)s
536
- }
537
- """ % locals(), file=sio)
538
-
539
- def c_code_reduce_11(self, sio, node, name, x, z, fail):
540
- makecall = self._makecall(node, name, x, z, fail)
541
- print("""
542
- {
543
- int verbose = 0;
544
- dim3 n_threads(
545
- std::min(CudaNdarray_HOST_DIMS(%(x)s)[1],
546
- NUM_VECTOR_OP_THREADS_PER_BLOCK));
547
- while (n_threads.y * n_threads.x <=
548
- NUM_VECTOR_OP_THREADS_PER_BLOCK)
549
- ++n_threads.y;
550
- n_threads.y -= 1;
551
- if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[0])
552
- n_threads.y = CudaNdarray_HOST_DIMS(%(x)s)[0];
553
-
554
- dim3 n_blocks(1);
555
- %(makecall)s
556
- }
557
- """ % locals(), file=sio)
558
-
559
- def c_code_reduce_01X(self, sio, node, name, x, z, fail, N):
560
- """
561
- :param N: the number of 1 in the pattern N=1 -> 01, N=2 -> 011,
562
- N=3 ->0111 Work for N=1,2,3
563
- """
564
- assert N in [1, 2, 3]
565
- makecall = self._makecall(node, name, x, z, fail)
566
- N_pattern = ''.join(['1'] * N)
567
- param_dim = ",".join(["CudaNdarray_HOST_DIMS(%(x)s)[%(i)s]" % locals()
568
- for i in range(N + 1)])
569
- strides_dim = ",".join(
570
- ["CudaNdarray_HOST_STRIDES(%(x)s)[%(i)s]" % locals()
571
- for i in range(N + 1)])
572
- threads_y = """
573
- //get as many y threads as we can fit
574
- while (n_threads.x * (n_threads.y+1) <=
575
- NUM_VECTOR_OP_THREADS_PER_BLOCK)
576
- {
577
- if (n_threads.y < CudaNdarray_HOST_DIMS(%(x)s)[%(N)s-1])
578
- n_threads.y += 1;
579
- else
580
- break;
581
- }
582
- """ % locals()
583
- threads_z = """
584
- //get as many z threads as we can fit
585
- while (n_threads.x * n_threads.y * (n_threads.z+1) <=
586
- NUM_VECTOR_OP_THREADS_PER_BLOCK)
587
- {
588
- if (n_threads.z < CudaNdarray_HOST_DIMS(%(x)s)[%(N)s-2])
589
- n_threads.z += 1;
590
- else
591
- break;
592
- }
593
- """ % locals()
594
- if len(self.reduce_mask) == 2:
595
- threads_y = ''
596
- threads_z = ''
597
- if len(self.reduce_mask) == 3:
598
- threads_z = ''
599
- print("""
600
- {
601
- int verbose = 0;
602
- dim3 n_threads(
603
- std::min(CudaNdarray_HOST_DIMS(%(x)s)[%(N)s],
604
- NUM_VECTOR_OP_THREADS_PER_BLOCK));
605
- %(threads_y)s
606
- %(threads_z)s
607
- dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
608
- NUM_VECTOR_OP_BLOCKS));
609
- %(makecall)s
610
- }
611
- """ % locals(), file=sio)
612
-
613
- def c_code_reduce_01(self, sio, node, name, x, z, fail):
614
- self.c_code_reduce_01X(sio, node, name, x, z, fail, 1)
615
-
616
- def c_code_reduce_011(self, sio, node, name, x, z, fail):
617
- self.c_code_reduce_01X(sio, node, name, x, z, fail, 2)
618
-
619
- def c_code_reduce_0111(self, sio, node, name, x, z, fail):
620
- self.c_code_reduce_01X(sio, node, name, x, z, fail, 3)
621
-
622
- def c_code_reduce_10(self, sio, node, name, x, z, fail):
623
- print("""
624
- {
625
- int verbose = 0;
626
- dim3 n_threads(
627
- std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
628
- NUM_VECTOR_OP_THREADS_PER_BLOCK));
629
- dim3 n_blocks(1,
630
- std::min(CudaNdarray_HOST_DIMS(%(x)s)[1],
631
- NUM_VECTOR_OP_BLOCKS));
632
- if (verbose) {
633
- fprintf(stderr,
634
- "running kernel_reduce_sum_10_%(name)s n_blocks=(%%i,%%i)\\n",
635
- n_blocks.x,
636
- n_blocks.y);
637
- }
638
- assert(CudaNdarray_HOST_DIMS(%(x)s)[1] ==
639
- CudaNdarray_HOST_DIMS(%(z)s)[0]);
640
- int n_shared = sizeof(%(dtype)s) * n_threads.x;
641
- kernel_reduce_sum_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
642
- 1,
643
- CudaNdarray_HOST_DIMS(%(x)s)[0],
644
- CudaNdarray_HOST_DIMS(%(x)s)[1],
645
- CudaNdarray_DEV_DATA(%(x)s),
646
- 1,
647
- CudaNdarray_HOST_STRIDES(%(x)s)[0],
648
- CudaNdarray_HOST_STRIDES(%(x)s)[1],
649
- CudaNdarray_DEV_DATA(%(z)s),
650
- 1,
651
- CudaNdarray_HOST_STRIDES(%(z)s)[0]
652
- );
653
- CNDA_THREAD_SYNC;
654
- cudaError_t sts = cudaGetLastError();
655
- if (cudaSuccess != sts)
656
- {
657
- PyErr_Format(PyExc_RuntimeError,
658
- "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
659
- "kernel_reduce_sum_010_%(name)s",
660
- cudaGetErrorString(sts),
661
- n_blocks.x,
662
- n_blocks.y,
663
- n_threads.x,
664
- n_threads.y,
665
- n_threads.z);
666
- %(fail)s;
667
- }
668
- }
669
- """ % locals(), file=sio)
670
-
671
- def c_code_reduce_010(self, sio, node, name, x, z, fail):
672
- makecall = self._makecall(node, name, x, z, fail)
673
- makecall_inner = self._makecall(node, name, x, z,
674
- fail, pattern="010_inner")
675
- pattern = ''.join(str(i) for i in self.reduce_mask)
676
- print("""
677
- {
678
-
679
- // if the alternative is less buggy, consider not using this branch
680
- if (1)
681
- {
682
- // If there are a lot of summations to do, then we can use
683
- // simple parallelization - use each thread to do one sum.
684
- // we might as well launch blocks of 32 threads because that's
685
- // the warp size. we could schedule more threads if we were
686
- // maxing out the gridsize below, but the gridsize is way more
687
- // than the physical hardware and I think 32 threads
688
- // on a huge grid is enough to fully use the hardware.
689
- dim3 n_threads(32,1,1);
690
-
691
- // We kindof reshape the input implicitly to something 4D:
692
- // the shape A,B,C -> A, B, D, E
693
- // where C <= D*E < C+32
694
- // where E==32
695
-
696
- int A = CudaNdarray_HOST_DIMS(%(x)s)[0];
697
- int B = CudaNdarray_HOST_DIMS(%(x)s)[1];
698
- int C = CudaNdarray_HOST_DIMS(%(x)s)[2];
699
- int D = C/32;
700
- if (32*D < C) D+= 1;
701
- assert ((C <= 32*D) && (32*D < C+32));
702
-
703
- // The gridsize would ideally be (A, D). But we do the
704
- // following logic to make sure we don't ask for a grid that
705
- // is too big.
706
- dim3 n_blocks(A,D);
707
- if (n_blocks.x > NUM_VECTOR_OP_BLOCKS)
708
- n_blocks.x = NUM_VECTOR_OP_BLOCKS;
709
- if (n_blocks.x*n_blocks.y > NUM_VECTOR_OP_BLOCKS)
710
- n_blocks.y = NUM_VECTOR_OP_BLOCKS/n_blocks.x;
711
- int n_shared = 0;
712
- kernel_reduce_sum_010_AD_%(name)s<<<n_blocks,
713
- n_threads, n_shared>>>(
714
- A,B,C,D,
715
- CudaNdarray_DEV_DATA(%(x)s),
716
- CudaNdarray_HOST_STRIDES(%(x)s)[0],
717
- CudaNdarray_HOST_STRIDES(%(x)s)[1],
718
- CudaNdarray_HOST_STRIDES(%(x)s)[2],
719
- CudaNdarray_DEV_DATA(%(z)s),
720
- CudaNdarray_HOST_STRIDES(%(z)s)[0],
721
- CudaNdarray_HOST_STRIDES(%(z)s)[1]
722
- );
723
- CNDA_THREAD_SYNC;
724
- cudaError_t sts = cudaGetLastError();
725
- if (cudaSuccess != sts)
726
- {
727
- PyErr_Format(PyExc_RuntimeError,
728
- "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
729
- "kernel_reduce_sum_010_%(name)s",
730
- cudaGetErrorString(sts),
731
- n_blocks.x,
732
- n_blocks.y,
733
- n_threads.x,
734
- n_threads.y,
735
- n_threads.z);
736
- %(fail)s;
737
- }
738
- }
739
- else
740
- {
741
- int verbose = 2;
742
-
743
- dim3 n_threads(std::min(32,CudaNdarray_HOST_DIMS(%(x)s)[2]));
744
- while((n_threads.x*(n_threads.y+1) <=
745
- NUM_VECTOR_OP_THREADS_PER_BLOCK)
746
- && (n_threads.y<CudaNdarray_HOST_DIMS(%(x)s)[1])){
747
- n_threads.y++;
748
- }
749
-
750
- dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
751
- (int)NUM_VECTOR_OP_BLOCKS));
752
- n_blocks.y = std::min(
753
- ceil_intdiv(CudaNdarray_HOST_DIMS(%(x)s)[2],
754
- (int)n_threads.x),
755
- (int)(NUM_VECTOR_OP_BLOCKS / n_blocks.x)
756
- );
757
- if(std::min(std::min(CudaNdarray_HOST_STRIDES(%(x)s)[0],
758
- CudaNdarray_HOST_STRIDES(%(x)s)[1]),
759
- CudaNdarray_HOST_STRIDES(%(x)s)[2])
760
- ==CudaNdarray_HOST_STRIDES(%(x)s)[2]
761
- && n_blocks.y == ceil_intdiv(CudaNdarray_HOST_DIMS(%(x)s)[2],
762
- (int)n_threads.x)){
763
- if(verbose>1)
764
- printf("n_block.x.1=%%d, n_block.x.2=%%d,"
765
- " n_block.y.1=%%d, n_block.y.2=%%d,\\n",
766
- CudaNdarray_HOST_DIMS(%(x)s)[0],
767
- NUM_VECTOR_OP_BLOCKS,
768
- ceil_intdiv(CudaNdarray_HOST_DIMS(%(x)s)[2],
769
- (int)n_threads.x),
770
- (int)(NUM_VECTOR_OP_BLOCKS / n_blocks.x));
771
- assert(n_threads.x<=32);
772
- %(makecall_inner)s
773
- }else{
774
- n_threads.x = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1],
775
- (int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
776
- n_blocks.x = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
777
- (int)NUM_VECTOR_OP_BLOCKS);
778
- n_blocks.y = std::min(
779
- CudaNdarray_HOST_DIMS(%(x)s)[2],
780
- (int)(NUM_VECTOR_OP_BLOCKS / n_blocks.x)
781
- );
782
- %(makecall)s
783
- }
784
- CNDA_THREAD_SYNC;
785
- cudaError_t sts = cudaGetLastError();
786
- if (cudaSuccess != sts)
787
- {
788
- PyErr_Format(PyExc_RuntimeError,
789
- "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
790
- "kernel_reduce_sum_%(pattern)s_%(name)s",
791
- cudaGetErrorString(sts),
792
- n_blocks.x,
793
- n_blocks.y,
794
- n_threads.x,
795
- n_threads.y,
796
- n_threads.z);
797
- %(fail)s;
798
- }
799
- }
800
- }
801
- """ % locals(), file=sio)
802
-
803
- def c_code_reduce_0101(self, sio, node, name, x, z, fail):
804
- makecall = self._makecall(node, name, x, z, fail)
805
- print("""
806
- {
807
- int verbose = 0;
808
- dim3 n_threads(
809
- std::min(CudaNdarray_HOST_DIMS(%(x)s)[3],
810
- NUM_VECTOR_OP_THREADS_PER_BLOCK));
811
- while (n_threads.x * n_threads.y <=
812
- NUM_VECTOR_OP_THREADS_PER_BLOCK)
813
- {
814
- if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[1]) break;
815
- n_threads.y += 1;
816
- }
817
- n_threads.y -= 1;
818
- dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[0],
819
- CudaNdarray_HOST_DIMS(%(x)s)[2]);
820
- %(makecall)s
821
- }
822
- """ % locals(), file=sio)
823
-
824
- def c_code_reduce_100(self, sio, node, name, x, z, fail):
825
- makecall = self._makecall(node, name, x, z, fail)
826
- # use threadIdx.x for i0
827
- # use blockIdx.x for i1
828
- # use blockIdx.y for i2
829
- print("""
830
- {
831
- int verbose = 0;
832
- dim3 n_threads(
833
- std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
834
- NUM_VECTOR_OP_THREADS_PER_BLOCK));
835
- dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]);
836
- while (n_blocks.x * (n_blocks.y+1) <= NUM_VECTOR_OP_BLOCKS
837
- && n_blocks.y <= CudaNdarray_HOST_DIMS(%(x)s)[2])
838
- {
839
- n_blocks.y += 1;
840
- }
841
- %(makecall)s
842
- }
843
- """ % locals(), file=sio)
844
-
845
- def c_code_reduce_110(self, sio, node, name, x, z, fail):
846
- makecall = self._makecall(node, name, x, z, fail)
847
- print("""
848
- {
849
- int verbose = 0;
850
- dim3 n_threads(
851
- std::min(CudaNdarray_HOST_DIMS(%(x)s)[1],
852
- NUM_VECTOR_OP_THREADS_PER_BLOCK));
853
- while (n_threads.x*n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK)
854
- {
855
- if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[0])
856
- break;
857
- n_threads.y += 1;
858
- }
859
- n_threads.y -= 1;
860
-
861
- dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[2]);
862
- %(makecall)s
863
- }
864
- """ % locals(), file=sio)
865
-
866
- def c_code_reduce_001(self, sio, node, name, x, z, fail):
867
- makecall = self._makecall(node, name, x, z, fail)
868
- print("""
869
- {
870
- int verbose = 0;
871
- dim3 n_threads(
872
- std::min(CudaNdarray_HOST_DIMS(%(x)s)[2],
873
- NUM_VECTOR_OP_THREADS_PER_BLOCK));
874
- dim3 n_blocks(
875
- std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
876
- NUM_VECTOR_OP_BLOCKS));
877
- while (n_blocks.x * n_blocks.y <= NUM_VECTOR_OP_BLOCKS)
878
- {
879
- if (n_blocks.y > CudaNdarray_HOST_DIMS(%(x)s)[1])
880
- break;
881
- n_blocks.y += 1;
882
- }
883
- n_blocks.y -= 1;
884
- %(makecall)s
885
- }
886
- """ % locals(), file=sio)
887
-
888
- def c_code_reduce_111(self, sio, node, name, x, z, fail):
889
- makecall = self._makecall(node, name, x, z, fail)
890
- print("""
891
- {
892
- int verbose = 0;
893
- dim3 n_threads(
894
- std::min(CudaNdarray_HOST_DIMS(%(x)s)[2],
895
- NUM_VECTOR_OP_THREADS_PER_BLOCK));
896
-
897
- //get as many y threads as we can fit
898
- while (n_threads.x * n_threads.y <=
899
- NUM_VECTOR_OP_THREADS_PER_BLOCK)
900
- {
901
- if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[1])
902
- break;
903
- n_threads.y += 1;
904
- }
905
- n_threads.y -= 1;
906
-
907
- //get as many z threads as we can fit
908
- while (n_threads.x * n_threads.y * n_threads.z <=
909
- NUM_VECTOR_OP_THREADS_PER_BLOCK)
910
- {
911
- if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0])
912
- break;
913
- n_threads.z += 1;
914
- }
915
- n_threads.z -= 1;
916
-
917
- dim3 n_blocks(1,1,1);
918
- %(makecall)s
919
- }
920
- """ % locals(), file=sio)
921
-
922
- def c_code_reduce_0011(self, sio, node, name, x, z, fail):
923
- makecall = self._makecall(node, name, x, z, fail)
924
- print("""
925
- {
926
- int verbose = 0;
927
-
928
- dim3 n_blocks(
929
- std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
930
- NUM_VECTOR_OP_BLOCKS));
931
-
932
- while (n_blocks.x * n_blocks.y <= NUM_VECTOR_OP_BLOCKS &&
933
- n_blocks.y < CudaNdarray_HOST_DIMS(%(x)s)[1])
934
- {
935
- n_blocks.y += 1;
936
- }
937
-
938
- dim3 n_threads(
939
- std::min(CudaNdarray_HOST_DIMS(%(x)s)[3],
940
- NUM_VECTOR_OP_THREADS_PER_BLOCK));
941
- while (n_threads.x * n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK
942
- && n_threads.y < CudaNdarray_HOST_DIMS(%(x)s)[2]
943
- && n_threads.x * n_threads.y * sizeof(%(dtype)s) <=
944
- (15 * 1024 - 200))
945
- {
946
- n_threads.y += 1;
947
- }
948
-
949
- %(makecall)s
950
- }
951
- """ % locals(), file=sio)
952
-
953
- def c_code_reduce_1111(self, sio, node, name, x, z, fail):
954
- makecall = self._makecall(node, name, x, z, fail)
955
- print("""
956
- {
957
- int verbose = 0;
958
- dim3 n_threads(
959
- std::min(CudaNdarray_HOST_DIMS(%(x)s)[2],
960
- NUM_VECTOR_OP_THREADS_PER_BLOCK));
961
-
962
- //get as many y threads as we can fit
963
- while (n_threads.x * n_threads.y <=
964
- NUM_VECTOR_OP_THREADS_PER_BLOCK)
965
- {
966
- if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[1])
967
- break;
968
- n_threads.y += 1;
969
- }
970
- n_threads.y -= 1;
971
-
972
- //get as many z threads as we can fit
973
- while (n_threads.x * n_threads.y * n_threads.z <=
974
- NUM_VECTOR_OP_THREADS_PER_BLOCK)
975
- {
976
- if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0])
977
- break;
978
- n_threads.z += 1;
979
- }
980
- n_threads.z -= 1;
981
-
982
- dim3 n_blocks(1,1,1);
983
- %(makecall)s
984
- }
985
- """ % locals(), file=sio)
986
-
987
- def c_code_reduce_1011(self, sio, node, name, x, z, fail):
988
- makecall = self._makecall(node, name, x, z, fail)
989
- print("""
990
- {
991
- int verbose = 0;
992
- dim3 n_threads(
993
- std::min(CudaNdarray_HOST_DIMS(%(x)s)[3],
994
- NUM_VECTOR_OP_THREADS_PER_BLOCK));
995
-
996
- while (n_threads.x * (n_threads.y+1) <=
997
- NUM_VECTOR_OP_THREADS_PER_BLOCK)
998
- ++n_threads.y;
999
- if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[2])
1000
- n_threads.y = CudaNdarray_HOST_DIMS(%(x)s)[2];
1001
-
1002
- while (n_threads.x * n_threads.y * (n_threads.z+1) <=
1003
- NUM_VECTOR_OP_THREADS_PER_BLOCK)
1004
- ++n_threads.z;
1005
- if (n_threads.z > 64)
1006
- n_threads.z = 64;
1007
- if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0])
1008
- n_threads.z = CudaNdarray_HOST_DIMS(%(x)s)[0];
1009
-
1010
- dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]);
1011
- %(makecall)s
1012
- }
1013
- """ % locals(), file=sio)
1014
-
1015
- def c_code_cache_version(self):
1016
- return (21,)
1017
-
1018
- def c_support_code_apply(self, nodename, contig=False):
1019
- sio = StringIO.StringIO()
1020
- nd_in = len(self.reduce_mask)
1021
- dtype = self.dtype
1022
- if contig: # all(i == 1 for i in self.reduce_mask):
1023
- #this kernel is ok for up to a few thousand elements, but
1024
- # it only runs on ONE multiprocessor
1025
- reducebuf = self._k_reduce_buf('Z[0]')
1026
- print("""
1027
- __global__ void kernel_reduce_sum_ccontig_%(nodename)s(
1028
- const int d0,
1029
- const %(dtype)s *A,
1030
- %(dtype)s * Z)
1031
- {
1032
- const int threadCount = blockDim.x;
1033
- const int threadNum = threadIdx.x;
1034
- extern __shared__ %(dtype)s buf[];
1035
- %(dtype)s mysum = 0.0f;
1036
-
1037
- if (warpSize != 32)
1038
- {
1039
- return; //TODO: set error code
1040
- }
1041
-
1042
- for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
1043
- {
1044
- mysum += A[i0];
1045
- }
1046
- %(reducebuf)s
1047
- }
1048
- """ % locals(), file=sio)
1049
- if self.reduce_mask == (1,):
1050
- #this kernel is ok for up to a few thousand elements, but
1051
- # it only runs on ONE multiprocessor
1052
- reducebuf = self._k_reduce_buf('Z[0]')
1053
- decl = self._k_decl(nodename)
1054
- print("""
1055
- %(decl)s
1056
- {
1057
- const int threadCount = blockDim.x;
1058
- const int threadNum = threadIdx.x;
1059
- extern __shared__ %(dtype)s buf[];
1060
- %(dtype)s mysum = 0.0f;
1061
-
1062
- if (warpSize != 32)
1063
- {
1064
- return; //TODO: set error code
1065
- }
1066
-
1067
- for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
1068
- {
1069
- %(dtype)s Ai = A[i0 * sA0];
1070
- mysum += Ai;
1071
- }
1072
- %(reducebuf)s
1073
- }
1074
- """ % locals(), file=sio)
1075
- if self.reduce_mask == (1, 1):
1076
- #this kernel is ok for up to a few thousand elements, but
1077
- # it only runs on ONE multiprocessor
1078
- reducebuf = self._k_reduce_buf('Z[0]')
1079
- decl = self._k_decl(nodename)
1080
- init = self._k_init(nodename)
1081
- print(decl, file=sio)
1082
- print(" { ", file=sio)
1083
- print(init, file=sio)
1084
- print("""
1085
- for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
1086
- {
1087
- for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
1088
- {
1089
- %(dtype)s Ai = A[i0 * sA0 + i1 * sA1];
1090
- mysum += Ai;
1091
- }
1092
- }
1093
- """ % locals(), file=sio)
1094
- print(reducebuf, file=sio)
1095
- print(" } ", file=sio)
1096
-
1097
- #01, 011, 0111
1098
- if (0 == self.reduce_mask[0] and
1099
- all(self.reduce_mask[1:]) and nd_in in[2, 3, 4]):
1100
- # this kernel uses one block for each row.
1101
- # threads per block for each element per row.
1102
-
1103
- N_pattern = ''.join(['1'] * (nd_in - 1))
1104
- if nd_in == 2:
1105
- for_i1 = "for(int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)"
1106
- for_i2 = "int i2=0, sA2=0;"
1107
- for_i3 = "int i3=0, sA3=0;"
1108
- if nd_in == 3:
1109
- for_i1 = "for(int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)"
1110
- for_i2 = "for(int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)"
1111
- for_i3 = "int i3=0, sA3=0;"
1112
- if nd_in == 4:
1113
- for_i1 = "for(int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z)"
1114
- for_i2 = "for(int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)"
1115
- for_i3 = "for(int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)"
1116
-
1117
- reducebuf = self._k_reduce_buf('Z[i0 * sZ0]')
1118
- param_dim = ",".join(["const int d%(i)s" % locals()
1119
- for i in range(nd_in)])
1120
- param_strides = ",".join(["const int sA%(i)s" % locals()
1121
- for i in range(nd_in)])
1122
- decl = self._k_decl(nodename)
1123
- init = self._k_init(nodename)
1124
- print("""
1125
- %(decl)s{
1126
- %(init)s
1127
- for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
1128
- mysum = 0;
1129
- %(for_i1)s{
1130
- %(for_i2)s{
1131
- %(for_i3)s{
1132
- %(dtype)s Ai = A[i3 * sA3 + i2 * sA2 +
1133
- i1 * sA1 + i0 * sA0];
1134
- mysum += Ai;
1135
- }
1136
- }
1137
- }
1138
- %(reducebuf)s
1139
- }
1140
- }
1141
- """ % locals(), file=sio)
1142
- if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0):
1143
- # this kernel uses one block for each column,
1144
- # threads per block for each element per column.
1145
-
1146
- #TODO: This kernel is pretty inefficient in terms of
1147
- # reading, because if A is c_contiguous (typical
1148
- # case) then each warp is accessing non-contigous
1149
- # memory (a segment of a column).
1150
- reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2*sZ1]')
1151
- print("""
1152
- __global__ void kernel_reduce_sum_010_%(nodename)s(
1153
- const int d0,
1154
- const int d1,
1155
- const int d2,
1156
- const %(dtype)s *A, const int sA0,
1157
- const int sA1, const int sA2,
1158
- %(dtype)s * Z, const int sZ0, const int sZ1)
1159
- {
1160
- const int threadCount = blockDim.x;
1161
- const int threadNum = threadIdx.x;
1162
- extern __shared__ %(dtype)s buf[];
1163
-
1164
- if (warpSize != 32)
1165
- {
1166
- return; //TODO: set error code
1167
- }
1168
-
1169
-
1170
- for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
1171
- {
1172
- for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
1173
- {
1174
- %(dtype)s mysum = 0.0f;
1175
- for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
1176
- {
1177
- mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
1178
- }
1179
- %(reducebuf)s
1180
- }
1181
- }
1182
-
1183
- }
1184
- """ % locals(), file=sio)
1185
- if self.reduce_mask == (0, 1, 0):
1186
- print("""
1187
- __global__ void kernel_reduce_sum_010_AD_%(nodename)s(
1188
- const int A,
1189
- const int B,
1190
- const int C,
1191
- const int D,
1192
- //const int E, // THIS is 32
1193
- const %(dtype)s *X, const int sX0,
1194
- const int sX1, const int sX2,
1195
- %(dtype)s * Z, const int sZ0, const int sZ1)
1196
- {
1197
- const int threadCount = blockDim.x;
1198
- const int threadNum = threadIdx.x;
1199
- %(dtype)s mysum = 0.0f;
1200
-
1201
- if (warpSize != 32)
1202
- {
1203
- return; //TODO: set error code
1204
- }
1205
-
1206
- for (int a = blockIdx.x; a < A; a += gridDim.x)
1207
- {
1208
- for (int i2_D = blockIdx.y; i2_D < D; i2_D += gridDim.y)
1209
- {
1210
- int c = i2_D * 32 + threadIdx.x;
1211
- if (c < C)
1212
- {
1213
- mysum = 0;
1214
- for (int b = 0; b < B; ++b)
1215
- {
1216
- mysum += X[a * sX0 + b * sX1 + c * sX2];
1217
- }
1218
- Z[a * sZ0 + c * sZ1] = mysum;
1219
- }
1220
- }
1221
- }
1222
-
1223
- }
1224
- """ % locals(), file=sio)
1225
- if self.reduce_mask == (0, 1, 0):
1226
- #
1227
- # This kernel is optimized when the inner most dimensions
1228
- # have the smallest stride.
1229
-
1230
- # this kernel uses one block for multiple column(up to 32TODO),
1231
- # threads per block for each element per column.
1232
-
1233
- #thread.x = dim 2 contiguous
1234
- #thread.y = dim 1
1235
- #block.x = dim 0
1236
- #block.y = dim 1 rest
1237
- init = self._k_init(nodename)
1238
- decl = self._k_decl(nodename, pattern="010_inner")
1239
- reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]',
1240
- 'blockDim.x')
1241
- reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]',
1242
- 'blockDim.x')
1243
- print("""
1244
- %(decl)s
1245
- {
1246
- if(warpSize<blockDim.x){
1247
- //TODO: set error code
1248
- // need to be positive to work with unsigned
1249
- Z[0] = 666;
1250
- return;
1251
- }
1252
-
1253
- %(init)s
1254
- for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
1255
- {
1256
- for (int i2 = blockIdx.y*blockDim.x+threadIdx.x;
1257
- i2 < d2; i2 += gridDim.y*blockDim.x)
1258
- {
1259
- for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
1260
- {
1261
- mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
1262
- }
1263
- %(reducebuf)s
1264
- }
1265
- }
1266
- }
1267
- """ % locals(), file=sio)
1268
- if self.reduce_mask == (1, 1, 0):
1269
- # this kernel uses one block for each column,
1270
- # threads per block for each element per column.
1271
-
1272
- #TODO: This kernel is pretty inefficient in terms of
1273
- # reading, because if A is c_contiguous (typical
1274
- # case) then each warp is accessing non-contigous
1275
- # memory (a segment of a column).
1276
- reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]')
1277
- print("""
1278
- __global__ void kernel_reduce_sum_110_%(nodename)s(
1279
- const int d0,
1280
- const int d1,
1281
- const int d2,
1282
- const %(dtype)s *A, const int sA0,
1283
- const int sA1, const int sA2,
1284
- %(dtype)s * Z, const int sZ0)
1285
- {
1286
- const int threadCount = blockDim.x * blockDim.y;
1287
- const int threadNum = threadIdx.y * blockDim.x + threadIdx.x;
1288
- extern __shared__ %(dtype)s buf[];
1289
- %(dtype)s mysum = 0.0f;
1290
-
1291
- if (warpSize != 32)
1292
- {
1293
- //TODO: set error code
1294
- Z[blockIdx.x * sZ0] = 666;
1295
- return;
1296
- }
1297
-
1298
- for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
1299
- {
1300
- for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
1301
- {
1302
- %(dtype)s Ai = A[i0 * sA0 + i1 * sA1 +
1303
- blockIdx.x * sA2];
1304
- mysum += Ai;
1305
- }
1306
- }
1307
-
1308
- %(reducebuf)s
1309
- }
1310
- """ % locals(), file=sio)
1311
- if self.reduce_mask == (1, 0, 0):
1312
- reducebuf = self._k_reduce_buf('Z[i1 * sZ0 + i2 * sZ1]')
1313
- decl = self._k_decl(nodename)
1314
- init = self._k_init(nodename)
1315
- print("""
1316
- %(decl)s
1317
- {
1318
- %(init)s
1319
- for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
1320
- {
1321
- for (int i1 = blockIdx.x; i1 < d1; i1 += gridDim.x)
1322
- {
1323
- mysum = 0;
1324
- for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
1325
- {
1326
- mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
1327
- }
1328
- %(reducebuf)s
1329
- }
1330
- }
1331
- }
1332
- """ % locals(), file=sio)
1333
- if self.reduce_mask == (1, 1, 1):
1334
- reducebuf = self._k_reduce_buf('Z[0]')
1335
- decl = self._k_decl(nodename)
1336
- init = self._k_init(nodename)
1337
- print("""
1338
- %(decl)s
1339
- {
1340
- %(init)s
1341
-
1342
- for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
1343
- {
1344
- for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
1345
- {
1346
- for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
1347
- {
1348
- mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
1349
- }
1350
- }
1351
- }
1352
- """ % locals(), file=sio)
1353
- print(reducebuf, "}", file=sio)
1354
-
1355
- if self.reduce_mask == (0, 0, 1):
1356
- # this kernel uses one block for each row,
1357
- # threads per block for each element per row.
1358
- reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]')
1359
- print("""
1360
- __global__ void kernel_reduce_sum_001_%(nodename)s(
1361
- const int d0,
1362
- const int d1,
1363
- const int d2,
1364
- const %(dtype)s *A, const int sA0,
1365
- const int sA1, const int sA2,
1366
- %(dtype)s * Z, const int sZ0, const int sZ1)
1367
- {
1368
- const int threadCount = blockDim.x;
1369
- const int threadNum = threadIdx.x;
1370
- extern __shared__ %(dtype)s buf[];
1371
-
1372
- if (warpSize != 32)
1373
- {
1374
- return; //TODO: set error code
1375
- }
1376
-
1377
- for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
1378
- {
1379
- for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
1380
- {
1381
- %(dtype)s mysum = 0.0f;
1382
- for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
1383
- {
1384
- mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
1385
- }
1386
- %(reducebuf)s
1387
- }
1388
- }
1389
- }
1390
- """ % locals(), file=sio)
1391
- if self.reduce_mask == (0, 0, 1, 1):
1392
- # this kernel uses one block for each row,
1393
- # threads per block for each element per row.
1394
- reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]')
1395
- decl = self._k_decl(nodename)
1396
- init = self._k_init(nodename)
1397
- print("""
1398
- %(decl)s
1399
- {
1400
- %(init)s
1401
-
1402
- for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
1403
- {
1404
- for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
1405
- {
1406
- %(dtype)s mysum = 0.0f;
1407
- for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
1408
- {
1409
- for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
1410
- {
1411
- mysum += A[i0 * sA0 + i1 * sA1 +
1412
- i2 * sA2 + i3 * sA3];
1413
- }
1414
- }
1415
- %(reducebuf)s
1416
- }
1417
- }
1418
- }
1419
- """ % locals(), file=sio)
1420
- if self.reduce_mask == (0, 1, 0, 1):
1421
- # this kernel uses one block for each row,
1422
- # threads per block for each element per row.
1423
- reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2 * sZ1]')
1424
- decl = self._k_decl(nodename)
1425
- init = self._k_init(nodename)
1426
- print("""
1427
- %(decl)s
1428
- {
1429
- %(init)s
1430
-
1431
- for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
1432
- {
1433
- for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
1434
- {
1435
- %(dtype)s mysum = 0.0f;
1436
- for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
1437
- {
1438
- for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
1439
- {
1440
- mysum += A[i0 * sA0 + i1 * sA1 +
1441
- i2 * sA2 + i3 * sA3];
1442
- }
1443
- }
1444
- %(reducebuf)s
1445
- }
1446
- }
1447
- }
1448
- """ % locals(), file=sio)
1449
- if self.reduce_mask == (1, 1, 1, 1):
1450
- reducebuf = self._k_reduce_buf('Z[0]')
1451
- decl = self._k_decl(nodename)
1452
- init = self._k_init(nodename)
1453
- print("""
1454
- %(decl)s
1455
- {
1456
- %(init)s
1457
- mysum = 0;
1458
- for (int i0 = 0; i0 < d0; i0++)
1459
- for (int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z)
1460
- {
1461
- for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
1462
- {
1463
- for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
1464
- {
1465
- mysum += A[i0 * sA0 + i1 * sA1 +
1466
- i2 * sA2 + i3 * sA3];
1467
- }
1468
- }
1469
- }
1470
- %(reducebuf)s
1471
- }
1472
- """ % locals(), file=sio)
1473
- if self.reduce_mask == (1, 0, 1, 1):
1474
- reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]')
1475
- print("""
1476
- __global__ void kernel_reduce_sum_1011_%(nodename)s(
1477
- const int d0,
1478
- const int d1,
1479
- const int d2,
1480
- const int d3,
1481
- const %(dtype)s *A, const int sA0,
1482
- const int sA1, const int sA2, const int sA3,
1483
- %(dtype)s * Z, const int sZ0)
1484
- {
1485
- const int threadCount = blockDim.x * blockDim.y * blockDim.z;
1486
- const int threadNum = threadIdx.z * blockDim.x * blockDim.y +
1487
- threadIdx.y * blockDim.x + threadIdx.x;
1488
- extern __shared__ %(dtype)s buf[];
1489
- %(dtype)s mysum = 0.0f;
1490
-
1491
- if (warpSize != 32)
1492
- {
1493
- return; //TODO: set error code
1494
- }
1495
-
1496
- for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
1497
- {
1498
- for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
1499
- {
1500
- for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
1501
- {
1502
- %(dtype)sy Ai = A[i0 * sA0 + blockIdx.x * sA1 +
1503
- i2 * sA2 + i3 * sA3];
1504
- mysum += Ai;
1505
- }
1506
- }
1507
- }
1508
- %(reducebuf)s
1509
- }
1510
- """ % locals(), file=sio)
1511
- return sio.getvalue()
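
The removed GpuSum op (part of the deleted compyte/ndarray tree) sums over every axis whose reduce_mask entry is 1, as its docstring describes. A minimal NumPy sketch of that semantics, purely for illustration and not part of pyopencl or of the removed code:

    import numpy as np

    def sum_with_mask(a, reduce_mask):
        # Sum over every axis i for which reduce_mask[i] == 1,
        # mirroring the reduce_mask convention in the removed GpuSum docstring.
        axes = tuple(i for i, m in enumerate(reduce_mask) if m)
        return a.sum(axis=axes)

    m = np.arange(6.0).reshape(2, 3)
    sum_with_mask(m, (1, 0))   # column sums, shape (3,)
    sum_with_mask(m, (0, 1))   # row sums, shape (2,)
    sum_with_mask(m, (1, 1))   # sum of all elements (scalar)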