pyopencl 2024.2.7__cp38-cp38-macosx_11_0_arm64.whl → 2025.1__cp38-cp38-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyopencl might be problematic.

Files changed (40)
  1. pyopencl/__init__.py +127 -122
  2. pyopencl/_cl.cpython-38-darwin.so +0 -0
  3. pyopencl/_mymako.py +3 -3
  4. pyopencl/algorithm.py +10 -7
  5. pyopencl/array.py +58 -123
  6. pyopencl/bitonic_sort.py +3 -1
  7. pyopencl/bitonic_sort_templates.py +1 -1
  8. pyopencl/cache.py +23 -22
  9. pyopencl/capture_call.py +5 -4
  10. pyopencl/clrandom.py +1 -0
  11. pyopencl/cltypes.py +2 -2
  12. pyopencl/compyte/dtypes.py +4 -4
  13. pyopencl/compyte/pyproject.toml +54 -0
  14. pyopencl/elementwise.py +9 -2
  15. pyopencl/invoker.py +11 -9
  16. pyopencl/ipython_ext.py +1 -1
  17. pyopencl/reduction.py +16 -10
  18. pyopencl/scan.py +38 -22
  19. pyopencl/tools.py +23 -13
  20. pyopencl/version.py +1 -1
  21. {pyopencl-2024.2.7.dist-info → pyopencl-2025.1.dist-info}/METADATA +11 -8
  22. pyopencl-2025.1.dist-info/RECORD +42 -0
  23. {pyopencl-2024.2.7.dist-info → pyopencl-2025.1.dist-info}/WHEEL +1 -1
  24. pyopencl/compyte/.git +0 -1
  25. pyopencl/compyte/ndarray/Makefile +0 -31
  26. pyopencl/compyte/ndarray/__init__.py +0 -0
  27. pyopencl/compyte/ndarray/gen_elemwise.py +0 -1907
  28. pyopencl/compyte/ndarray/gen_reduction.py +0 -1511
  29. pyopencl/compyte/ndarray/gpu_ndarray.h +0 -35
  30. pyopencl/compyte/ndarray/pygpu_language.h +0 -207
  31. pyopencl/compyte/ndarray/pygpu_language_cuda.cu +0 -622
  32. pyopencl/compyte/ndarray/pygpu_language_opencl.cpp +0 -317
  33. pyopencl/compyte/ndarray/pygpu_ndarray.cpp +0 -1546
  34. pyopencl/compyte/ndarray/pygpu_ndarray.h +0 -71
  35. pyopencl/compyte/ndarray/pygpu_ndarray_object.h +0 -232
  36. pyopencl/compyte/ndarray/setup_opencl.py +0 -101
  37. pyopencl/compyte/ndarray/test_gpu_elemwise.py +0 -411
  38. pyopencl/compyte/ndarray/test_gpu_ndarray.py +0 -487
  39. pyopencl-2024.2.7.dist-info/RECORD +0 -56
  40. {pyopencl-2024.2.7.dist-info → pyopencl-2025.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,1907 +0,0 @@
1
- """
2
- This file implement 1 version of the elemwise op on the gpu.
3
-
4
- The elemwise fct are also used with scalar operation! So it can happen
5
- that ndim is 0 as with all scalar type.
6
- """
7
-
8
-
9
- import numpy
10
- import pygpu_ndarray as gpu_ndarray
11
- import StringIO
12
-
13
- _CL_MODE = hasattr(gpu_ndarray, "set_opencl_context")
14
-
15
-
16
- if _CL_MODE:
17
- # THIS IS NOT FINISHED
18
- import pyopencl as cl
19
- import pyopencl.array as cl_array
20
- from pyopencl.tools import dtype_to_ctype
21
- # import pyopencl._mymako as mako
22
- from pyopencl._cluda import CLUDA_PREAMBLE
23
-
24
- # TODO: use mako to get rid of the %if
25
- CLUDA_PREAMBLE = CLUDA_PREAMBLE[:455]
26
- CLUDA_PREAMBLE += """
27
- #define LDIM_0 get_local_size(0)
28
- #define LDIM_1 get_local_size(1)
29
- #define LDIM_2 get_local_size(2)
30
-
31
- #define GDIM_0 get_num_groups(0)
32
- #define GDIM_1 get_num_groups(1)
33
- #define GDIM_2 get_num_groups(2)
34
- """
35
- # TODO, reuse the same context as the use used to create the memory.
36
- ctx = cl.create_some_context()
37
- queue = cl.CommandQueue(ctx)
38
- else:
39
- import pycuda.autoinit
40
- import pycuda.driver as driver
41
- from pycuda.compiler import SourceModule
42
- from pycuda.tools import dtype_to_ctype
43
- # import pycuda._mymako as mako
44
- from pycuda._cluda import CLUDA_PREAMBLE
45
- CLUDA_PREAMBLE += """
46
- #define LDIM_0 blockDim.x
47
- #define LDIM_1 blockDim.y
48
- #define LDIM_2 blockDim.z
49
-
50
- #define GDIM_0 gridDim.x
51
- #define GDIM_1 gridDim.y
52
- #define GDIM_2 gridDim.z
53
- """
54
-
55
- import logging
56
-
57
- import theano
58
- from theano import Apply, scalar
59
- from theano.tensor import TensorType
60
-
61
- _logger_name = 'compyte.gen_elemwise'
62
- _logger = logging.getLogger(_logger_name)
63
- _logger.setLevel(logging.INFO)
64
- _logger.addHandler(logging.StreamHandler()) # TO REMOVE
65
-
66
-
67
- def warning(*msg):
68
- _logger.warning(_logger_name + 'WARNING: ' + ' '.join(str(m) for m in msg))
69
-
70
-
71
- def info(*msg):
72
- _logger.info(_logger_name + 'INFO: ' + ' '.join(str(m) for m in msg))
73
-
74
-
75
- def debug(*msg):
76
- _logger.debug(_logger_name + 'DEBUG: ' + ' '.join(str(m) for m in msg))
77
-
78
-
79
- if _CL_MODE:
80
- gpu_ndarray.set_opencl_context(ctx.obj_ptr)
81
-
82
-
83
- cast_int = numpy.intc
84
- cast_uint = numpy.uintc
85
-
86
-
87
- def _logical_scalar(x):
88
- return numpy.all(x.type.broadcastable)
89
-
90
-
91
- def get_str_list_logical_scalar(inputs, value_str='ii_i%i_value',
92
- data_str='ii_i%i_data[0]'):
93
- l = []
94
- for ipos, i in enumerate(inputs):
95
- if _logical_scalar(i):
96
- l += [value_str % ipos]
97
- else:
98
- l += [data_str % ipos]
99
- return l
100
-
101
-
102
- class WrapOpenCLFunction:
103
- def __init__(self, fct):
104
- self.fct = fct
105
-
106
- def _param_wrap(self, p):
107
- if isinstance(p, MyGpuNdArray):
108
- p = p.gpu_nd_array
109
- if isinstance(p, gpu_ndarray.GpuNdArrayObject):
110
- p = cl.MemoryObject.from_cl_mem_as_int(p.bytes)
111
- return p
112
-
113
- def set_block_shape(self, *shape):
114
- self.local_size = shape
115
-
116
- def param_set(self, *param):
117
- self.param = [self._param_wrap(p) for p in param]
118
-
119
- def launch_grid(self, *global_shape):
120
- global_size = global_shape + (1,)
121
-
122
- d = {"g_times_l": True}
123
- return self.fct(queue, global_size, self.local_size,
124
- *self.param, **d)
125
-
126
-
127
- def compile_gpu_code(code, fct_name):
128
- if _CL_MODE:
129
- # Compile the gpu function with pyopencl
130
- prg = cl.Program(ctx, code).build()
131
- fct2 = getattr(prg, fct_name)
132
-
133
- fct = WrapOpenCLFunction(fct2)
134
- else:
135
- # Compile the gpu function with pycuda
136
- mod = SourceModule(code)
137
- fct = mod.get_function(fct_name)
138
- return fct
139
-
140
-
141
- class ElemwiseAlgo:
142
- verbose = 0 # 1, 2 or 3 for more verbose output.
143
- cache_version = ()
144
- cache_version = ('debug', 14, verbose)
145
-
146
- def __init__(self, scalar_op, inplace_pattern={}):
147
- """
148
- :param scalar_op: the scalar operation to execute on each element.
149
- """
150
- self.scalar_op = scalar_op
151
- self.inplace_pattern = inplace_pattern
152
-
153
- def task_code(self, inputs, outputs, sio,
154
- nodename, iname=None, oname=None):
155
- if iname == None:
156
- iname = get_str_list_logical_scalar(inputs)
157
- if oname == None:
158
- oname = ['ii_o%i_data[0]' % ipos for ipos, i in enumerate(outputs)]
159
- print(self.scalar_op.c_code(
160
- Apply(self.scalar_op,
161
- [scalar.Scalar(dtype=input.type.dtype)()
162
- for input in inputs],
163
- [scalar.Scalar(dtype=output.type.dtype)()
164
- for output in outputs]),
165
- nodename + '_scalar_',
166
- iname,
167
- oname,
168
- sub=dict(fail='return;')), file=sio) # TODO: set a failure code somehow!!!
169
-
170
- def c_src_kernel(self, inputs, outputs, nodename, nd, static="static"):
171
- sio = StringIO.StringIO()
172
- #print 'C_SRC_KERNEL', sio.getvalue()
173
-
174
- for ipos, i in enumerate(inputs):
175
- print("// Input ", ipos, str(i.type), file=sio)
176
- for ipos, i in enumerate(outputs):
177
- print("// Output ", ipos, str(i.type), file=sio)
178
- print(static, (
179
- f"KERNEL void kernel_{nodename}_{nd}(unsigned int numEls"), file=sio)
180
- if (nd):
181
- print("\t,", ", ".join("const int dim%i" % i
182
- for i in range(nd)), file=sio)
183
- #declare inputs
184
- for ipos, i in enumerate(inputs):
185
- s = ", ".join(["GLOBAL_MEM const %s * i%i_data" % (
186
- dtype_to_ctype(i.dtype), ipos)] +
187
- list("int i%i_str_%i" % (ipos, d)
188
- for d in range(nd)))
189
- print("\t,", s, file=sio)
190
- #declare outputs
191
- for ipos, i in enumerate(outputs):
192
- s = ", ".join(["GLOBAL_MEM %s * o%i_data" % (
193
- dtype_to_ctype(i.dtype), ipos)]
194
- + list("int o%i_str_%i" % (ipos, d)
195
- for d in range(nd)))
196
- print("\t,", s, file=sio)
197
- #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d)
198
- # for d in xrange(nd))
199
- #print >> sio, "\t,", "float * o%i_data" % ipos
200
- print("\t)\n{", file=sio)
201
- print(" const int idx = GID_0 * LDIM_0 + LID_0;", file=sio)
202
- print(" const int numThreads = LDIM_0 * GDIM_0;", file=sio)
203
-
204
- # For each input that is a scalar which has been broadcasted
205
- # to a tensor, load it into a local variable
206
- for ipos, i in enumerate(inputs):
207
- if _logical_scalar(i):
208
- print(" const %s ii_i%i_value = i%i_data[0];" % (
209
- dtype_to_ctype(i.dtype), ipos, ipos), file=sio)
210
-
211
- #loop over the elements to be treated by this kernel call
212
- print(" for (int i = idx; i < numEls; i += numThreads) {", file=sio)
213
- # calculate the data pointers for all arguments
214
- print(" int ii = i;", file=sio)
215
- for ipos, i in enumerate(inputs):
216
- if not _logical_scalar(i):
217
- print((" GLOBAL_MEM const "
218
- "%s * ii_i%i_data = i%i_data;" % (
219
- dtype_to_ctype(i.dtype), ipos, ipos)), file=sio)
220
- for ipos, i in enumerate(outputs):
221
- print(" GLOBAL_MEM %s * ii_o%i_data = o%i_data;" % (
222
- dtype_to_ctype(i.dtype), ipos, ipos), file=sio)
223
- for d in range(nd - 1, -1, -1):
224
- if d > 0:
225
- print(" int pos%i = ii %% dim%i;" % (d, d), file=sio)
226
- print(" ii = ii / dim%i;" % d, file=sio)
227
- else:
228
- print(" int pos%i = ii;" % d, file=sio)
229
-
230
- for ipos, i in enumerate(inputs):
231
- if not _logical_scalar(i):
232
- print((" ii_i"
233
- "%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d)), file=sio)
234
- for ipos, i in enumerate(outputs):
235
- print(" ii_o%i_data += pos%i * o%i_str_%i;" % (
236
- ipos, d, ipos, d), file=sio)
237
-
238
- # perform the scalar operation on the input and output references
239
- #TODO: What if the scalar_op needs support_code??
240
- self.task_code(inputs, outputs, sio, nodename)
241
- print(" }", file=sio)
242
-
243
- #indent = " "*(4*d+7)
244
- #for ipos, i in enumerate(inputs):
245
- #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
246
- print("}", file=sio)
247
-
248
- #print sio.getvalue()
249
- return sio.getvalue()
250
-
251
- def c_src_kernel_Ccontiguous(self, inputs, outputs,
252
- nodename, static="static"):
253
- nd = outputs[0].type.ndim
254
- sio = StringIO.StringIO()
255
- #print 'C_SRC_KERNEL', sio.getvalue()
256
-
257
- for ipos, i in enumerate(inputs):
258
- print("// Input ", ipos, str(i.type), file=sio)
259
- for ipos, i in enumerate(outputs):
260
- print("// Output ", ipos, str(i.type), file=sio)
261
- print(static, ("KERNEL void kernel_%s_Ccontiguous"
262
- " (unsigned int numEls" % (nodename)), file=sio)
263
- #declare inputs
264
- for ipos, i in enumerate(inputs):
265
- print("\t,", "GLOBAL_MEM const %s * i%i_data" % (
266
- dtype_to_ctype(i.dtype), ipos), file=sio)
267
- #declare outputs
268
- for ipos, i in enumerate(outputs):
269
- print("\t,", "GLOBAL_MEM %s * o%i_data" % (
270
- dtype_to_ctype(i.dtype), ipos), file=sio)
271
- print("\t)\n{", file=sio)
272
- print(" const int idx = GID_0 * LDIM_0 + LID_0;", file=sio)
273
- print(" const int numThreads = LDIM_0 * GDIM_0;", file=sio)
274
-
275
- # For each input that is a scalar which has been broadcasted
276
- # to a tensor, load it into a local variable
277
- for ipos, i in enumerate(inputs):
278
- if _logical_scalar(i):
279
- print(" const %s ii_i%i_value = i%i_data[0];" % (
280
- dtype_to_ctype(i.dtype), ipos, ipos), file=sio)
281
-
282
- #loop over the elements to be treated by this kernel call
283
- print(" for (int i = idx; i < numEls; i += numThreads) {", file=sio)
284
- # perform the scalar operation on the input and output references
285
- #TODO: What if the scalar_op needs support_code??
286
- self.task_code(inputs, outputs, sio, nodename,
287
- iname=get_str_list_logical_scalar(
288
- inputs, data_str='i%i_data[i]'),
289
- oname=['o%i_data[i]' % ipos
290
- for ipos, i in enumerate(outputs)])
291
- print(" }", file=sio)
292
- print("}", file=sio)
293
-
294
- #print sio.getvalue()
295
- return sio.getvalue()
296
-
297
- def c_src_callkernel(self, inputs, outputs, nodename):
298
- #
299
- # This function serves three main goals:
300
- #
301
- # The first is stride unpacking:
302
- # it accepts input and output arguments as
303
- # float * , int*
304
- # pairs, and it constructs a kernel function call where inputs
305
- # and arguments are named like
306
- # float *, int, int, int ...
307
- #
308
- # The second is to recognize when any dimensions can be collapsed as
309
- # being contiguous. That mean that we can merge that dimensions with
310
- # another one for all inputs/outputs and have the same retusuls
311
- # (confusing... read code)
312
- #
313
- # The thrid is to make a special case for scalar element. We allow
314
- # the collapsing of them. In the ccontiguous and not contiguous case,
315
- # we use registers to lower the number of memory access.
316
-
317
- # TODO: make a special case for broadcasting, to store the
318
- # data in shared memory.
319
-
320
- nd = outputs[0].type.ndim
321
- nb_inputs = len(inputs)
322
- nb_outputs = len(outputs)
323
- d = dict()
324
- # input_params and output_params go into the function
325
- # declaration/definition
326
- input_params = ", ".join("const %s * i%i_data, const int * i%i_str" % (
327
- dtype_to_ctype(inputs[i].dtype), ipos, ipos)
328
- for ipos in range(len(inputs)))
329
- output_params = ", ".join("%s * o%i_data, const int * o%i_str" % (
330
- dtype_to_ctype(outputs[i].dtype),
331
- ipos, ipos)
332
- for ipos in range(len(outputs)))
333
-
334
- #input_args and output_args go into the recursive call.
335
- input_args = ", ".join("i%i_data, i%i_str" % (ipos, ipos)
336
- for ipos in range(len(inputs)))
337
- output_args = ", ".join("o%i_data, o%i_str" % (ipos, ipos)
338
- for ipos in range(len(outputs)))
339
-
340
- prod_dims = '*'.join(["dims[%i]" % di for di in range(nd)] + ['1'])
341
-
342
- sio = StringIO.StringIO()
343
- print("""
344
- static void can_collapse_%(nodename)s(int nd, const int * dims,
345
- const int * strides,
346
- int collapse[])
347
- {
348
- //can we collapse dims[i] and dims[i-1]
349
- for(int i=nd-1;i>0;i--){
350
- if(strides[i]*dims[i]==strides[i-1]){
351
- //the dims nd-1 are not strided again dimension nd
352
- collapse[i]=1;
353
- }else collapse[i]=0;
354
- }
355
- }
356
- """ % locals(), file=sio)
357
- print("""
358
- static int callkernel_%(nodename)s(unsigned int numEls, const int d,
359
- const int * dims,
360
- %(input_params)s,
361
- %(output_params)s)
362
- {
363
- numEls = %(prod_dims)s;
364
- """ % locals(), file=sio)
365
- if self.verbose:
366
- print("""
367
- std::cerr << "calling kernel_%(nodename)s w numEls" << numEls << " dims"<< d << "\\n";
368
- """ % locals(), file=sio)
369
- print('std::cerr << ' + " << ' ' << ".join(['" "']+list("dims[%i]"%di
370
- for di in range(nd)) + ["'\\n';"]), file=sio)
371
- if self.verbose > 1:
372
- for ipos in range(len(inputs)):
373
- print("""
374
- std::cerr << " %(ipos)s data strides" <<
375
- """ % locals() + " << ' ' << ".join(["i%s_data" % ipos]
376
- + list("i%s_str[%i]" % (ipos, di)
377
- for di in range(nd))) + ''' << "\\n"; ''', file=sio)
378
-
379
- for ipos in range(len(outputs)):
380
- print("""
381
- std::cerr << " %(ipos)s data strides" <<
382
- """ % locals() + " << ' ' << ".join(["o%s_data" % ipos]
383
- + list("o%s_str[%i]" % (ipos, di)
384
- for di in range(nd))) + ''' << "\\n"; ''', file=sio)
385
- # collapse dimension that are broadcast in all inputs.
386
- # need to be done before contiguous collapse as it will break it.
387
- # do the dimensions and the strides
388
- print("""
389
- int local_dims[%(nd)s];
390
- int local_str[%(nb_inputs)s][%(nd)s];
391
- int local_ostr[%(nb_inputs)s][%(nd)s];
392
- int nd_collapse = %(nd)s;
393
- for(int i=0;i<%(nd)s;i++){//init new dim
394
- local_dims[i]=dims[i];
395
- }
396
- """ % locals(), file=sio)
397
- for ipos in range(len(inputs)):
398
- print("""
399
- for(int i=0;i<%(nd)s;i++){//init new strides
400
- local_str[%(ipos)s][i]=i%(ipos)s_str[i];
401
- }
402
- """ % locals(), file=sio)
403
- for ipos in range(len(outputs)):
404
- print("""
405
- for(int i=0;i<%(nd)s;i++){//init new strides
406
- local_ostr[%(ipos)s][i]=o%(ipos)s_str[i];
407
- }
408
- """ % locals(), file=sio)
409
- if self.verbose > 2:
410
- print('std::cerr <<"before broadcast collapse\\n";', file=sio)
411
- print('std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; ', file=sio)
412
- print('std::cerr << "local_dims";', file=sio)
413
- for d in range(nd):
414
- print('std::cerr << " " << local_dims[%(d)s]; ' % locals(), file=sio)
415
- print('std::cerr << "\\n";', file=sio)
416
-
417
- for ipos in range(len(inputs)):
418
- print('std::cerr << " local_str inputs %(ipos)s: " <<' % locals()+' << " " << '.join(["local_str[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";', file=sio)
419
- for ipos in range(len(outputs)):
420
- print('std::cerr << " local_ostr inputs %(ipos)s: " <<' % locals()+' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";', file=sio)
421
-
422
- print("""
423
- for(int id=0;id<nd_collapse;id++){
424
-
425
- bool all_broadcast=true;
426
- for(int input_id=0;input_id<%(nb_inputs)s;input_id++){
427
- if(local_str[input_id][id]!=0 || local_dims[id]!=1) all_broadcast= false;
428
- }
429
- for(int input_id=0;input_id<%(nb_outputs)s;input_id++){
430
- if(local_ostr[input_id][id]!=0 || local_dims[id]!=1) all_broadcast= false;
431
- }
432
- if(all_broadcast){
433
- for(int j=id+1;j<nd_collapse;j++)//remove dims i from the array
434
- local_dims[j-1]=local_dims[j];
435
- for(int input_id=0;input_id<%(nb_inputs)s;input_id++){
436
- for(int j=id+1;j<nd_collapse;j++){//remove dims i from the array
437
- local_str[input_id][j-1]=local_str[input_id][j];
438
- }
439
- }
440
- for(int output_id=0;output_id<%(nb_outputs)s;output_id++){
441
- for(int j=id+1;j<nd_collapse;j++){//remove dims i from the array
442
- local_ostr[output_id][j-1]=local_ostr[output_id][j];
443
- }
444
- }
445
- nd_collapse--; id--;
446
- }
447
- }
448
- """ % locals(), file=sio)
449
-
450
- if self.verbose > 2:
451
- print('std::cerr <<"after broadcast collapse\\n";', file=sio)
452
- print('std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; ', file=sio)
453
- print('std::cerr << "local_dims";', file=sio)
454
- for d in range(nd):
455
- print('std::cerr << " " << local_dims[%(d)s]; ' % locals(), file=sio)
456
- print('std::cerr << "\\n";', file=sio)
457
-
458
- for ipos in range(len(inputs)):
459
- print('std::cerr << " local_str %(ipos)s: " <<' % locals()+' << " " << '.join(["local_str[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";', file=sio)
460
- for ipos in range(len(outputs)):
461
- print('std::cerr << " local_ostr %(ipos)s: " <<' % locals()+' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";', file=sio)
462
- # collapse contiguous dimensions (ignoring scalars, generic version(collapse any dimensions, right, left, middle))
463
- # this is a good idea because we make less index calculation in the gpu.
464
-
465
- print("int nd_collapse_[%(nd)s] = {" % locals() +','.join(['1' for x in range(nd)]) +"};", file=sio)
466
- for ipos in range(len(inputs)):
467
- if not _logical_scalar(inputs[ipos]):
468
- print("""
469
- int nd_collapse_%(ipos)s[%(nd)s] = {""" % locals() +','.join(['1' for x in range(nd)]) +"};", file=sio)
470
- print("""
471
- can_collapse_%(nodename)s(nd_collapse, local_dims, local_str[%(ipos)s], nd_collapse_%(ipos)s);
472
- for(int i=0;i<nd_collapse;i++){
473
- if(nd_collapse_%(ipos)s[i]==0)
474
- nd_collapse_[i]=0;
475
- }
476
- """ % locals(), file=sio)
477
- if self.verbose > 1:
478
- print("""
479
- std::cerr<< "nd_collapse_%(ipos)s "<<
480
- """ % locals(), file=sio)
481
- print(' << " " << '.join(
482
- ["nd_collapse_%(ipos)s[" % locals() + str(i) + "]"
483
- for i in range(nd)]), file=sio)
484
- print('<< "\\n";', file=sio)
485
- print("""
486
- std::cerr<< "nd_collapse_ "<<
487
- """ % locals(), file=sio)
488
- print(' << " " << '.join(
489
- ["nd_collapse_[" % locals() + str(i) + "]"
490
- for i in range(nd)]), file=sio)
491
- print('<< "\\n";', file=sio)
492
-
493
- # update the local stride.
494
- for ipos in range(len(inputs)):
495
- print("""
496
- for(int i=nd_collapse-1;i>0;i--){
497
- if(nd_collapse_[i]==1){
498
- local_str[%(ipos)s][i-1]=local_str[%(ipos)s][i];//set new strides
499
- for(int j=i+1;j<nd_collapse;j++)//remove stride i from the array
500
- local_str[%(ipos)s][j-1]=local_str[%(ipos)s][j];
501
- }
502
- }
503
- """ % locals(), file=sio)
504
-
505
- for ipos in range(len(outputs)):
506
- print("""
507
- for(int i=nd_collapse-1;i>0;i--){
508
- if(nd_collapse_[i]==1){
509
- local_ostr[%(ipos)s][i-1]=local_ostr[%(ipos)s][i];//set new strides
510
- for(int j=i+1;j<nd_collapse;j++)//remove stride i from the array
511
- local_ostr[%(ipos)s][j-1]=local_ostr[%(ipos)s][j];
512
- }
513
- }
514
- """ % locals(), file=sio)
515
-
516
- # update the local dims.
517
- print("""
518
- for(int i=nd_collapse-1;i>0;i--){
519
- if(nd_collapse_[i]==1){
520
- local_dims[i-1]*=local_dims[i];//set new dims
521
- for(int j=i+1;j<nd_collapse;j++)//remove dims i from the array
522
- local_dims[j-1]=local_dims[j];
523
- }
524
- }
525
- """ % locals(), file=sio)
526
-
527
- #update the new number of dim
528
- print("""
529
- for(int i=1, end=nd_collapse;i<end;i++){
530
- if(nd_collapse_[i]==1)nd_collapse--;
531
- }
532
- if(nd_collapse == 1 """ % locals(), file=sio)
533
- l = ["local_str[%(ipos)s][nd_collapse-1]==1 " % locals()
534
- for ipos in range(len(inputs))
535
- if not _logical_scalar(inputs[ipos])]
536
- l += ["local_ostr[%(ipos)s][nd_collapse-1]==1 " % locals()
537
- for ipos in range(len(outputs))
538
- if not _logical_scalar(outputs[ipos])]
539
- if len(l) > 0:
540
- print(" && ", " && ".join(l), file=sio)
541
- print("""){nd_collapse=0;} """, file=sio)
542
-
543
- if self.verbose:
544
- print('std::cerr <<"after can_collapse\\n";', file=sio)
545
- print("""std::cerr << "nd_collapse " << nd_collapse << "\\n"; """ % locals(), file=sio)
546
- if self.verbose > 1:
547
- for d in range(nd):
548
- print('std::cerr << " " << local_dims[%(d)s]; ' % locals(), file=sio)
549
- print('std::cerr << "\\n";', file=sio)
550
-
551
- for ipos in range(len(inputs)):
552
- print(('std::cerr << " local_str %(ipos)s: " <<' %
553
- locals() + ' << " " << '.join(
554
- ["local_str[%(ipos)s][%(x)s]" % locals()
555
- for x in range(nd)]) + '<<"\\n";'), file=sio)
556
- for ipos in range(len(outputs)):
557
- print(('std::cerr << " local_ostr %(ipos)s: " <<' %
558
- locals() + ' << " " << '.join(
559
- ["local_ostr[%(ipos)s][%(x)s]" % locals()
560
- for x in range(nd)]) + '<<"\\n";'), file=sio)
561
-
562
- def launch_Ccontiguous(nodename, scalar_op):
563
- kernel_call_args = ["numEls"]
564
- for ipos in range(len(inputs)):
565
- kernel_call_args.append("i%i_data" % ipos)
566
- for ipos in range(len(outputs)):
567
- kernel_call_args.append("o%i_data" % ipos)
568
- kernel_call_args = ", ".join(kernel_call_args)
569
- verb = ""
570
- if self.verbose:
571
- verb = 'std::cerr << " Running ccontiguous version\\n";'
572
- print("""
573
- //first use at least a full warp
574
- int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE
575
-
576
- //next start adding multiprocessors
577
- int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS
578
-
579
- // next start adding more warps per multiprocessor
580
- if (threads_per_block * n_blocks < numEls)
581
- threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
582
- kernel_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
583
-
584
- //std::cerr << "calling callkernel returned\\n";
585
- """ % locals(), file=sio)
586
-
587
- print("""
588
- CNDA_THREAD_SYNC;
589
- cudaError_t err = cudaGetLastError();
590
- if( cudaSuccess != err)
591
- {
592
- PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: %%s\\n",
593
- "GpuElemwise %(nodename)s", cudaGetErrorString(err),
594
- n_blocks, threads_per_block,
595
- "kernel_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s)");
596
- return -1;
597
-
598
- }
599
- %(verb)s
600
- return 0;
601
- """ % locals(), file=sio)
602
-
603
- def launch_General(nodename, scalar_op, force_nd):
604
- # kernel_call_args are used to invoke the cuda kernel
605
- local = "local_"
606
- kernel_call_args = ["numEls"]
607
- kernel_call_args.extend(local + "dims[%i]" % di
608
- for di in range(force_nd))
609
- for ipos in range(len(inputs)):
610
- kernel_call_args += ["i%i_data" % ipos] + list(
611
- local + "str[%i][%i]" % (ipos, di)
612
- for di in range(force_nd))
613
- #strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
614
- #kernel_call_args.append( "%s, i%i_data" % (strides, ipos))
615
- for ipos in range(len(outputs)):
616
- kernel_call_args += ["o%i_data" % ipos] + list(
617
- local + "ostr[%i][%i]" % (ipos, di)
618
- for di in range(force_nd))
619
- #strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
620
- #kernel_call_args.append( "%s, o%i_data" % (strides, ipos))
621
- if self.verbose:
622
- print("""
623
- std::cerr << " Running general version with %(force_nd)s dims\\n";
624
- """ % locals(), file=sio)
625
- print("std::cerr << "+ ' << " " << '.join(
626
- kernel_call_args)+' << "\\n";', file=sio)
627
- #std::cerr << numEls << dims[0] << i0_data, i0_str[0] << o0_data, o0_str[0]\n;
628
-
629
- kernel_call_args = ", ".join(kernel_call_args)
630
-
631
- print("""
632
- //first use at least a full warp
633
- int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE
634
-
635
- //next start adding multiprocessors
636
- int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS
637
-
638
- // next start adding more warps per multiprocessor
639
- if (threads_per_block * n_blocks < numEls)
640
- threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
641
-
642
- kernel_%(nodename)s_%(force_nd)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
643
- """ % locals(), file=sio)
644
- print("""
645
- CNDA_THREAD_SYNC;
646
- cudaError_t err = cudaGetLastError();
647
- if( cudaSuccess != err)
648
- {
649
- PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: %%s\\n",
650
- "GpuElemwise %(nodename)s", cudaGetErrorString(err),
651
- n_blocks, threads_per_block,
652
- "kernel_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s)");
653
- return -1;
654
-
655
- }
656
- return 0;
657
- """ % locals(), file=sio)
658
-
659
- print("if(numEls==0) return 0;", file=sio)
660
- print("switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {"%locals(), file=sio)
661
- print("case 0: {", file=sio)
662
- launch_Ccontiguous(nodename, scalar_op)
663
- print(" } break;", file=sio)
664
- for i in range(1, nd + 1):
665
- print("case " + str(i) + ": {", file=sio)
666
- launch_General(nodename, scalar_op, i)
667
- print(" } break;", file=sio)
668
-
669
- print("}", file=sio) # end case
670
- print("return -2;", file=sio) # should not get to this point
671
- print("}", file=sio) # end fct
672
-
673
- #N.B. cudaGetLastError is called by c_code
674
- return sio.getvalue()
675
-
676
- def c_support_code_apply(self, inputs, outputs, nodename):
677
- nd = outputs[0].type.ndim
678
- return "".join(
679
- CLUDA_PREAMBLE,
680
- [self.c_src_kernel(inputs, outputs, nodename, x)
681
- for x in range(1, nd + 1)] +
682
- [self.c_src_kernel_Ccontiguous(inputs, outputs, nodename),
683
- self.c_src_callkernel(inputs, outputs, nodename),
684
- ])
685
-
686
- def c_code(self, ninputs, noutputs, nodename, inputs, outputs, sub):
687
- d = dict(sub)
688
- nd = noutputs[0].type.ndim
689
- d.update(locals())
690
- sio = StringIO.StringIO()
691
- nin = len(inputs)
692
- nout = len(outputs)
693
- fail = sub['fail']
694
- opname = str(self.scalar_op)
695
- initial_dims = ','.join('1' for i in range(nd))
696
- if 1 or self.scalar_op == scalar.pow:
697
- print("""
698
- //std::cerr << "C_CODE %(opname)s START\\n";
699
- //standard elemwise size checks
700
- """ % locals(), file=sio)
701
- print("""
702
- int dims[%(nd)s] = {%(initial_dims)s};
703
- """ % locals(), file=sio)
704
-
705
- #check that all inputs have valid dimensions
706
- emitted_inames = {}
707
- for id, iname in enumerate(inputs):
708
- if iname in emitted_inames:
709
- assert emitted_inames[iname] is ninputs[id]
710
- continue
711
- broadcasts = ', '.join(map(str, list(map(int,
712
- ninputs[id].broadcastable))))
713
- nd = ninputs[id].ndim
714
- print("""
715
- int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s};
716
- """ % locals(), file=sio)
717
- emitted_inames[iname] = ninputs[id]
718
- #check that all inputs have valid dimensions
719
- emitted_inames = {}
720
- for id, iname in enumerate(inputs):
721
- if iname in emitted_inames:
722
- continue
723
- print("""
724
- //std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
725
- if (%(nd)s != %(iname)s->nd)
726
- {
727
- PyErr_Format(PyExc_TypeError, "need %(nd)s dims, not %%i", %(iname)s->nd);
728
- %(fail)s;
729
- }
730
- for (int i = 0; i< %(nd)s; ++i)
731
- {
732
- dims[i] = (dims[i] == 1) ? CudaNdarray_HOST_DIMS(%(iname)s)[i] : dims[i];
733
- if ((!(broadcasts_%(iname)s[i] && CudaNdarray_HOST_DIMS(%(iname)s)[i] == 1))&& (dims[i] != CudaNdarray_HOST_DIMS(%(iname)s)[i]))
734
- {
735
- //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
736
- PyErr_Format(PyExc_ValueError, "GpuElemwise. Input dimension mis-match. One of your inputs has shape[%%i] == %%i, but the output's size on that axis is %%i.",
737
- i,
738
- CudaNdarray_HOST_DIMS(%(iname)s)[i],
739
- dims[i]
740
- );
741
- %(fail)s;
742
- }
743
- }
744
- """ % locals(), file=sio)
745
- emitted_inames[iname] = True
746
-
747
- #check that all outputs have valid dimensions
748
- for idx, oname in enumerate(outputs):
749
- if idx not in list(self.inplace_pattern.keys()):
750
- print("""
751
- for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
752
- if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i])
753
- {
754
- Py_DECREF(%(oname)s);
755
- %(oname)s = NULL;
756
- }
757
- }
758
- if (NULL == %(oname)s)
759
- {
760
- %(oname)s = (CudaNdarray*)CudaNdarray_New();
761
- if (!%(oname)s)
762
- {
763
- //error string already set
764
- %(fail)s;
765
- }
766
- if (CudaNdarray_alloc_contiguous(%(oname)s, %(nd)s, dims))
767
- {
768
- //error string already set
769
- Py_DECREF(%(oname)s);
770
- %(oname)s = NULL;
771
- %(fail)s;
772
- }
773
- }
774
- //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n";
775
- //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
776
- """ % locals(), file=sio)
777
- else:
778
- input_idx = self.inplace_pattern[idx]
779
- iname = inputs[input_idx]
780
- print("""
781
- Py_XDECREF(%(oname)s);
782
- %(oname)s = %(iname)s;
783
- Py_INCREF(%(oname)s);
784
- for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
785
- if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i])
786
- {
787
- Py_DECREF(%(oname)s);
788
- %(oname)s = NULL;
789
- %(fail)s;
790
- }
791
- }
792
- //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n";
793
- //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
794
- """ % locals(), file=sio)
795
-
796
- print("""
797
- {
798
- //new block so that failure gotos don't skip over variable initialization
799
- //std::cerr << "calling callkernel\\n";
800
- if (callkernel_%(nodename)s(1, 0, dims
801
- """ % locals(), file=sio)
802
- for iname in inputs:
803
- print("""
804
- , CudaNdarray_DEV_DATA(%(iname)s), CudaNdarray_HOST_STRIDES(%(iname)s)
805
- """ % locals(), file=sio)
806
- for oname in outputs:
807
- print("""
808
- , CudaNdarray_DEV_DATA(%(oname)s), CudaNdarray_HOST_STRIDES(%(oname)s)
809
- """ % locals(), file=sio)
810
- print("""
811
- ))
812
- {
813
- // error
814
- """, file=sio)
815
- for oname in outputs:
816
- print("""
817
- Py_DECREF(%(oname)s);
818
- %(oname)s = NULL;
819
- """ % locals(), file=sio)
820
- print("""
821
- %(fail)s;
822
- }
823
- else // no error
824
- {
825
- }
826
- }
827
- //std::cerr << "C_CODE %(opname)s END\\n";
828
- """ % locals(), file=sio)
829
- #print sio.getvalue()
830
- return sio.getvalue()
831
-
832
- def c_support_code(self):
833
- return """
834
- #define INTDIV_POW2(a, b) (a >> b)
835
- #define INTMOD_POW2(a, b) (a & ((1<<b)-1))
836
- """
837
-
838
- def dummy_holder_for_code_not_used():
839
-
840
- def c_src_kernel_tiling(self, inputs, outputs, nodename):
841
- """ The kernel applies to problems with <= 5 dimensions """
842
-
843
- #The kernel is intended to be structured roughly like this:
844
- """
845
- static __global__ void kernel()
846
- {
847
- for (int v = blockIdx.y; v < dim0; v += gridDim.x)
848
- {
849
- for (int w = blockIdx.y; w < dim1; w += gridDim.y)
850
- {
851
- for (int x = threadIdx.x; x < dim2; x += blockDim.x)
852
- {
853
- for (int y = threadIdx.y; y < dim3; y += blockDim.y)
854
- {
855
- for (int z = threadIdx.z; z < dim4; z += blockDim.z)
856
- {
857
- out[v * out_stride[0] + ...] = f(in1[...], in2[...])
858
- }
859
- }
860
- }
861
- }
862
- }
863
- }
864
-
865
- """
866
-
867
- nd = outputs[0].type.ndim
868
- sio = StringIO.StringIO()
869
- #print 'C_SRC_KERNEL', sio.getvalue()
870
-
871
- if nd in (4,):
872
- # print some leading comments to make the code easier to read
873
- for ipos, i in enumerate(inputs):
874
- print("// Input ", ipos, str(i.type), file=sio)
875
- for ipos, i in enumerate(outputs):
876
- print("// Output ", ipos, str(i.type), file=sio)
877
- print("""static __global__ void kernel_{}_{}(
878
- unsigned int numEls""".format(
879
- nodename,
880
- 'tiling%i' % nd), file=sio)
881
- if (nd):
882
- print("\t,", ", ".join("const int dim%i" % i
883
- for i in range(nd)), file=sio)
884
- #declare inputs
885
- for ipos, i in enumerate(inputs):
886
- s = ", ".join(["const float * i%i_data" % ipos] + list(
887
- "int i%i_str_%i" % (ipos, d) for d in range(nd)))
888
- print("\t,", s, file=sio)
889
- #declare outputs
890
- for ipos, i in enumerate(outputs):
891
- s = ", ".join(["float * o%i_data" % ipos] + list(
892
- "int o%i_str_%i" % (ipos, d) for d in range(nd)))
893
- print("\t,", s, file=sio)
894
- #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
895
- #print >> sio, "\t,", "float * o%i_data" % ipos
896
- print("\t)\n{", file=sio)
897
-
898
- # For each input that is a scalar which has been broadcasted to a tensor,
899
- # load it into a local variable
900
- print(" __shared__ float value0[%i];" % len(inputs), file=sio)
901
- print(" __shared__ int shared_dims[%(nd)s];" % locals(), file=sio)
902
- #print >> sio, " __shared__ int shared_i_str[%(n_in)s][%(nd)s]"
903
- print(" if ((threadIdx.x == 0) && (threadIdx.y == 0)) {", file=sio)
904
- for ipos, i in enumerate(inputs):
905
- if _logical_scalar(i):
906
- print(" value0[%i] = i%i_data[0];" % (ipos,
907
- ipos), file=sio)
908
- for ipos in range(nd):
909
- print(" shared_dims[%i] = dim%i;" % (ipos, ipos), file=sio)
910
- print(" }", file=sio)
911
- print(" __syncthreads();", file=sio)
912
-
913
- if (nd == 4):
914
- print("""
915
- for (int pos0 = blockIdx.x; pos0 < shared_dims[0]; pos0 += gridDim.x)
916
- {
917
- for (int pos1 = blockIdx.y; pos1 < shared_dims[1]; pos1 += gridDim.y)
918
- {
919
- //for (int pos2 = threadIdx.x; pos2 < shared_dims[2]; pos2 += blockDim.x)
920
- for (int pos2 = threadIdx.y; pos2 < shared_dims[2]; pos2 += blockDim.y)
921
- {
922
- //for (int pos3 = threadIdx.y; pos3 < shared_dims[3]; pos3 += blockDim.y)
923
- for (int pos3 = threadIdx.x; pos3 < shared_dims[3]; pos3 += blockDim.x)
924
- {
925
- """, file=sio)
926
- else:
927
- raise NotImplementedError()
928
-
929
- for ipos, i in enumerate(inputs):
930
- if not _logical_scalar(i):
931
- print(" const float * ii_i%i_data = i%i_data;" % (ipos, ipos), file=sio)
932
- for ipos, i in enumerate(outputs):
933
- print(" float * ii_o%i_data = o%i_data;" % (ipos, ipos), file=sio)
934
- for d in range(nd):
935
- for ipos, i in enumerate(inputs):
936
- if not _logical_scalar(i):
937
- print(" ii_i%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d), file=sio)
938
- for ipos, i in enumerate(outputs):
939
- print(" ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d), file=sio)
940
-
941
- # perform the scalar operation on the input and output references
942
- #TODO: What if the scalar_op needs support_code??
943
- self.task_code(inputs, outputs, sio, nodename,
944
- iname=get_str_list_logical_scalar(
945
- inputs, value_str='value0[%i]'))
946
- print(" }" * nd, file=sio)
947
-
948
- #TODO: insert runtime stride checks that select the best loop order either here, or in
949
- # the host code that launched the kernel (host code probably better spot)
950
-
951
- #indent = " "*(4*d+7)
952
- #for ipos, i in enumerate(inputs):
953
- #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
954
- print("}", file=sio)
955
-
956
- print(sio.getvalue())
957
- return sio.getvalue()
958
-
959
- def c_src_kernel_tiling_less_registers(self, inputs, outputs, nodename):
960
- """ The kernel applies to problems with <= 5 dimensions """
961
-
962
- nd = outputs[0].type.ndim
963
- n_in = len(inputs)
964
- n_out = len(outputs)
965
- sio = StringIO.StringIO()
966
-
967
- if nd not in (2,):
968
- return sio.getvalue()
969
-
970
- # print some leading comments to make the code easier to read
971
- for ipos, i in enumerate(inputs):
972
- print("// Input ", ipos, str(i.type), file=sio)
973
- for ipos, i in enumerate(outputs):
974
- print("// Output ", ipos, str(i.type), file=sio)
975
- print("static __global__ void kernel_%s_%s(unsigned int numEls" %(
976
- nodename,
977
- 'tiling%i_less_registers'%nd), file=sio)
978
- if (nd):
979
- print("\t,", ", ".join("const int dim%i" % i
980
- for i in range(nd)), file=sio)
981
- #declare inputs
982
- for ipos, i in enumerate(inputs):
983
- s = ", ".join(["const float * i%i_data_0" % ipos] + list(
984
- "int i%i_str_%i" % (ipos, d) for d in range(nd)))
985
- print("\t,", s, file=sio)
986
- #declare outputs
987
- for ipos, i in enumerate(outputs):
988
- s = ", ".join(["float * o%i_data_0" % ipos] + list(
989
- "int o%i_str_%i" % (ipos, d) for d in range(nd)))
990
- print("\t,", s, file=sio)
991
- #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
992
- #print >> sio, "\t,", "float * o%i_data" % ipos
993
- print("\t)\n{", file=sio)
994
-
995
- # TODO: Setting these to true makes the function fail SOMETIMES. I don't know why yet.
996
- use_shared_stride = False
997
- use_shared_limits = False
998
-
999
- def decl_limits(nd):
1000
- if use_shared_limits:
1001
- print("__shared__ float * limits[%(nd)s];" % locals(), file=sio)
1002
-
1003
- def stride(io, p, d):
1004
- if use_shared_stride:
1005
- return "s%s_str[%i][%i]" % (io, p, d)
1006
- else:
1007
- return "%s%i_str_%i" % (io, p, d)
1008
-
1009
- def limits(d):
1010
- if use_shared_limits:
1011
- return "limits[%i]" % d
1012
- else:
1013
- return "limits%i" % d
1014
-
1015
- def decl_shared_stride(nin, nout, nd):
1016
- if not use_shared_stride:
1017
- return
1018
- print("""
1019
- __shared__ int si_str[%(nin)s][%(nd)s];
1020
- __shared__ int so_str[%(nout)s][%(nd)s];
1021
- if ((threadIdx.x == 0) && (threadIdx.y == 0)) {
1022
- """ % locals(), file=sio)
1023
- for i in range(nin):
1024
- for d in range(nd):
1025
- print("si_str[%(i)s][%(d)s] = i%(i)s_str_%(d)s;" % locals(), file=sio)
1026
- for i in range(n_out):
1027
- for d in range(nd):
1028
- print("so_str[%(i)s][%(d)s] = o%(i)s_str_%(d)s;" % locals(), file=sio)
1029
- print("} __syncthreads();", file=sio)
1030
-
1031
- def calc_limit(d):
1032
- s = stride('o', 0, d)
1033
- lname = limits(d)
1034
- if use_shared_limits:
1035
- print("if ((threadIdx.x == 0) && (threadIdx.y == 0)) {", file=sio)
1036
- if d == 0:
1037
- print("%(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals(), file=sio)
1038
- else:
1039
- dm1 = d - 1
1040
- print("%(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals(), file=sio)
1041
- print("} __syncthreads();", file=sio)
1042
- else:
1043
- if d == 0:
1044
- print("const float * %(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals(), file=sio)
1045
- else:
1046
- dm1 = d - 1
1047
- print("const float * %(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals(), file=sio)
1048
-
1049
- def decl_ptrs(d, offset):
1050
- dm1 = d - 1
1051
- assert dm1 >= 0
1052
- for i in range(n_in):
1053
- s = stride('i', i, d)
1054
- print("const float * i%(i)s_data_%(d)s = i%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals(), file=sio)
1055
- for i in range(n_out):
1056
- s = stride('o', i, d)
1057
- print("float * o%(i)s_data_%(d)s = o%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals(), file=sio)
1058
-
1059
- def inc_ptrs(d, amt):
1060
- for i in range(n_in):
1061
- s = stride('i', i, d)
1062
- print("i%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals(), file=sio)
1063
- for i in range(n_out):
1064
- s = stride('o', i, d)
1065
- print("o%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals(), file=sio)
1066
-
1067
- def while_limit(d):
1068
- lname = limits(d)
1069
- print("while (o0_data_%(d)s < %(lname)s) { " % locals(), file=sio)
1070
-
1071
- def end_while(d):
1072
- print("}", file=sio)
1073
-
1074
- def task_code(d):
1075
- self.task_code(inputs, outputs, sio, nodename,
1076
- iname=['i%i_data_%i[0]' % (ipos, d)
1077
- for ipos, i in enumerate(inputs)],
1078
- oname=['o%i_data_%i[0]' % (ipos, d)
1079
- for ipos, i in enumerate(outputs)])
1080
-
1081
- if nd == 4:
1082
- decl_shared_stride(n_in, n_out, nd)
1083
- decl_limits(nd)
1084
- calc_limit(0)
1085
- inc_ptrs(0, 'blockIdx.x')
1086
- while_limit(0)
1087
- if 1:
1088
- calc_limit(1)
1089
- decl_ptrs(1, 'blockIdx.y')
1090
- while_limit(1)
1091
- if 1:
1092
- calc_limit(2)
1093
- decl_ptrs(2, 'threadIdx.y')
1094
- while_limit(2)
1095
- if 1:
1096
- calc_limit(3)
1097
- decl_ptrs(3, 'threadIdx.x')
1098
- while_limit(3)
1099
- if 1:
1100
- task_code(3)
1101
- inc_ptrs(3, 'blockDim.x')
1102
- end_while(3)
1103
- inc_ptrs(2, 'blockDim.y')
1104
- end_while(2)
1105
- inc_ptrs(1, 'gridDim.y')
1106
- end_while(1)
1107
- inc_ptrs(0, 'gridDim.x')
1108
- end_while(0)
1109
-
1110
- print("}", file=sio)
1111
- print(sio.getvalue())
1112
- return sio.getvalue()
1113
-
1114
-
1115
- def elemwise_collapses(inputs, outputs, out_shape=None, verbose=0):
1116
- """
1117
- This collapse dimensions that are not needed when computing
1118
- elemwise. This is usefull as it lower the indexing computation
1119
- that is heavier on gpu then on cpu.
1120
-
1121
- This is a generic version. It collapse dimensions at any place in
1122
- the shape. It handle broadcasted dimensions correctly.
1123
-
1124
- There is no special handling needed for broadcasted scalar at this level.
1125
-
1126
- @return: ndims, tuple(dims, strides) after collapsing.
1127
- """
1128
- in_out = inputs + outputs
1129
- del inputs
1130
- if out_shape is not None:
1131
- local_dims = tuple(out_shape)
1132
- else:
1133
- # TODO, use the right algo here or make the parameter not optional
1134
- # We should always have the same shape for all outputs
1135
- # If there is more then one outputs
1136
- local_dims = tuple(outputs[0].shape)
1137
- del outputs
1138
- nd_orig = len(local_dims)
1139
- if nd_orig == 1:
1140
- # This have a lower overhead
1141
- all_c_contig = True
1142
- for inp in in_out:
1143
- if not inp.flags['C_CONTIGUOUS'] or inp.shape != local_dims:
1144
- all_c_contig = False
1145
- break
1146
- if all_c_contig:
1147
- return 0, (local_dims, [])
1148
-
1149
- collapsable = [1] * nd_orig
1150
-
1151
- local_str = [None] * len(in_out)
1152
- nd_collapse = nd_orig
1153
- for ipos in range(len(in_out)):
1154
- inp = in_out[ipos]
1155
- assert len(inp.shape) == nd_orig, "All inputs/outputs must have the same number of dimensions. You must broadcast before calling elemwise_collapse"
1156
- local_str[ipos] = list(inp.strides)
1157
- # We set the strides of broacastable dims to 0
1158
- # This make indexing in gpu simpler and is needed
1159
- # For collapsing the dimensions.
1160
- for dim_pos in range(inp.ndim):
1161
- if inp.shape[dim_pos] == 1:
1162
- local_str[ipos][dim_pos] = 0
1163
-
1164
- if nd_orig == 1:
1165
- # We already covered the contiguous case before
1166
- # So we are sure it is not contiguous
1167
- # TODO: Add a test that f contiguous are also collapsed by the first case.
1168
- # I think that for 1d array when the flags f contiguous is true, c contiguous is also true.
1169
- return 1, (local_dims, local_str)
1170
-
1171
- if verbose > 2:
1172
- print("before broadcast collapse")
1173
- print(" nd_collapse", nd_collapse)
1174
- print(" local_dims", local_dims)
1175
- for ipos in range(len(local_str)):
1176
- print(" local_str inputs", ipos, local_str[ipos])
1177
- local_dims = list(local_dims)
1178
- # Collapse dimension that are broadcast in all inputs.
1179
- # need to be done before contiguous collapse as it will break it.
1180
- # Update the dimensions and the strides
1181
- for id in range(nd_collapse):
1182
- if local_dims[id] == 1:
1183
- # remove dims i from the array
1184
- for j in range(id + 1, nd_collapse):
1185
- local_dims[j - 1] = local_dims[j]
1186
- # remove dims i from the array
1187
- for input_id in range(len(in_out)):
1188
- for j in range(id + 1, nd_collapse):
1189
- local_str[input_id][j - 1] = local_str[input_id][j]
1190
- nd_collapse -= 1
1191
- id -= 1 # TODO: what is this? How this work?
1192
-
1193
- if verbose > 2:
1194
- print("after broadcast collapse")
1195
- print(" nd_collapse", nd_collapse)
1196
- print(" local_dims", local_dims)
1197
- for ipos in range(len(local_str)):
1198
- print(" local_str inputs", ipos, local_str[ipos])
1199
-
1200
- nd_collapse_ = [1] * nd_orig
1201
- for ipos in range(len(local_str)):
1202
- # Can we collapse dims[i] and dims[i-1]?
1203
- strides = local_str[ipos]
1204
- for i in range(nd_collapse - 1, 0, -1):
1205
- if strides[i] * local_dims[i] != strides[i - 1]:
1206
- # The dims nd-1 are not strided again dimension nd
1207
- nd_collapse_[i] = 0
1208
-
1209
- if verbose > 1:
1210
- print("nd_collapse_", nd_collapse_)
1211
-
1212
- nd_collapse2 = nd_collapse
1213
- for i in range(nd_collapse - 1, 0, -1):
1214
- if nd_collapse_[i] == 1:
1215
- # update the local dims.
1216
- local_dims[i - 1] *= local_dims[i]
1217
- for j in range(i + 1, nd_collapse):
1218
- local_dims[j - 1] = local_dims[j]
1219
-
1220
- # update the local stride.
1221
- for ipos in range(len(local_str)):
1222
- local_str[ipos][i - 1] = local_str[ipos][i] # set new strides
1223
- # remove stride i from the array
1224
- for j in range(i + 1, nd_collapse):
1225
- local_str[ipos][j - 1] = local_str[ipos][j]
1226
-
1227
- # update the new number of dim
1228
- nd_collapse2 -= 1
1229
- nd_collapse = nd_collapse2
1230
-
1231
- if nd_collapse == 1:
1232
- l = [local_str[ipos][nd_collapse - 1] == in_out[ipos].itemsize
1233
- for ipos in range(len(local_str))]
1234
- if all(l):
1235
- nd_collapse = 0
1236
-
1237
- if verbose:
1238
- print("end collapsing")
1239
- print(" nd_collapse", nd_collapse)
1240
- if verbose > 1:
1241
- print(" local_dims", local_dims)
1242
- for ipos in range(len(local_str)):
1243
- print(" local_str inputs", ipos, local_str[ipos])
1244
-
1245
- return nd_collapse, (local_dims, local_str)
1246
-
1247
-
1248
- def reduction_collapses(inout, axis, verbose=0):
1249
- """
1250
- This collapse dimensions that are not needed when computing
1251
- reduction. This is usefull as it lower the indexing computation
1252
- that is heavier on gpu then on cpu.
1253
-
1254
- This is a generic version. It collapse dimensions at any place in
1255
- the shape.
1256
- @param: inout: tuple(input, output)
1257
- @param: axis: None, interger, list of 1 interger
1258
- The axis over witch we will do reduction.
1259
- @return: (ndims, (input dims, input strides, input pattern), out strides)
1260
- after collapsing.
1261
-
1262
- :note: we suppose that we can always collapse the output dimensions.
1263
- """
1264
- input = inout[0]
1265
- out = inout[1]
1266
- # Some quick check. It is faster then the full version.
1267
- if axis is None:
1268
- # The output size is always 1, so we don't care about this strides
1269
- if (input.flags['C_CONTIGUOUS'] or input.flags['F_CONTIGUOUS']):
1270
- return 0, ((input.size,), (input.itemsize,), axis), (0,)
1271
- if input.ndim == 1:
1272
- assert axis == [0] or axis == 0 or axis is None
1273
- # not c contiguous as the first if should have catched it.
1274
- return 1, (input.shape, input.strides, axis), (0,)
1275
-
1276
- if not isinstance(axis, (list, tuple)):
1277
- local_axis = [axis]
1278
- else:
1279
- local_axis = list(axis)
1280
-
1281
- # This is needed for the computing of the output strides
1282
- assert axis is None or len(local_axis) == 1
1283
-
1284
- local_dims = list(input.shape)
1285
- local_str = list(input.strides)
1286
- out_strides = list(out.strides)
1287
-
1288
- nd_orig = len(local_dims)
1289
- collapsable = [1] * nd_orig
1290
- nd_collapse = nd_orig
1291
-
1292
- if verbose > 2:
1293
- print("before broadcast collapse")
1294
- print(" nd_collapse", nd_collapse)
1295
- print(" local_dims", local_dims)
1296
- print(" local_str inputs", local_str)
1297
- print(" local_axis", local_axis)
1298
-
1299
- # Collapse dimension that are broadcast in all inputs.
1300
- # need to be done before contiguous collapse as it will break it.
1301
- # Update the dimensions and the strides
1302
- for id in range(nd_collapse):
1303
- if local_dims[id] == 1:
1304
- for j in range(id + 1, nd_collapse):
1305
- # remove dims i from the array
1306
- local_dims[j - 1] = local_dims[j]
1307
- # remove strides i from the array
1308
- local_str[j - 1] = local_str[j]
1309
- # remove output strides i from the array
1310
- if axis is not None:
1311
- out_strides[j - 2] = out_strides[j - 1]
1312
- if id in local_axis:
1313
- local_axis.remove(id)
1314
- for axis_pos in range(len(local_axis)):
1315
- if local_axis[axis_pos] > id:
1316
- local_axis[axis_pos] -= 1
1317
-
1318
- nd_collapse -= 1
1319
- id -= 1 # TODO: how this work?
1320
-
1321
- if verbose > 2:
1322
- print("after broadcast collapse")
1323
- print(" nd_collapse", nd_collapse)
1324
- print(" local_dims", local_dims)
1325
- print(" local_str inputs", local_str)
1326
- print(" local_axis", local_axis)
1327
- print(" out_strides", out_strides)
1328
-
1329
- nd_collapse_ = [1] * nd_orig
1330
- # Can we collapse dims[i] and dims[i-1]?
1331
- for i in range(nd_collapse - 1, 0, -1):
1332
- if (local_str[i] * local_dims[i] != local_str[i - 1]):
1333
- # The dims nd-1 are not strided again dimension nd
1334
- nd_collapse_[i] = 0
1335
- elif (i in local_axis) != ((i - 1) in local_axis):
1336
- nd_collapse_[i] = 0
1337
-
1338
- if verbose > 1:
1339
- print("nd_collapse_", nd_collapse_)
1340
-
1341
- nd_collapse2 = nd_collapse
1342
- for i in range(nd_collapse - 1, 0, -1):
1343
- if nd_collapse_[i] == 1:
1344
- # update the local dims.
1345
- local_dims[i - 1] *= local_dims[i]
1346
- # set new strides
1347
- local_str[i - 1] = local_str[i]
1348
- #remove the old dims and strides
1349
- for j in range(i + 1, nd_collapse):
1350
- local_dims[j - 1] = local_dims[j]
1351
- local_str[j - 1] = local_str[j]
1352
- if axis is not None:
1353
- out_strides[j - 2] = out_strides[j - 1]
1354
-
1355
- if i in local_axis:
1356
- local_axis.remove(i)
1357
- for axis_pos in range(len(local_axis)):
1358
- if local_axis[axis_pos] > i:
1359
- local_axis[axis_pos] -= 1
1360
-
1361
- # update the new number of dim
1362
- nd_collapse2 -= 1
1363
-
1364
- nd_collapse = nd_collapse2
1365
-
1366
- if nd_collapse == 1:
1367
- if local_str[nd_collapse - 1] == input.itemsize:
1368
- nd_collapse = 0
1369
-
1370
- if verbose:
1371
- print("end collapsing")
1372
- print(" nd_collapse", nd_collapse)
1373
- if verbose > 1:
1374
- print(" local_dims", local_dims)
1375
- print(" local_str inputs", local_str)
1376
- print(" local_axis", local_axis)
1377
- print(" out_strides", out_strides)
1378
-
1379
- #print input.shape, input.strides
1380
- #print nd_collapse, (local_dims, local_str, local_axis)
1381
- local_dims = local_dims[:nd_collapse]
1382
- local_str = local_str[:nd_collapse]
1383
- out_strides = out_strides[:nd_collapse]
1384
- return nd_collapse, (local_dims, local_str, local_axis), out_strides
1385
-
1386
-
1387
- def call_elemwise(fct, input_vals, block=None, grid=None, out=None,
1388
- out_shape=None,
1389
- strides=None):
1390
- """ Call an elemwise gpu function with gived inputs and block size.
1391
-
1392
- :param fct: The gpu function to call
1393
- :param input_vals: a list of inputs to pass to fct
1394
- :param block: int, the size of the block wanted
1395
- :param grid: int, the size of the grid wanted
1396
- :param out: Optional, the preallocated output. Must have the right shape
1397
- and dtype.
1398
-
1399
- :param out_shape: Optional, if provided, we will suppose that the output,
1400
- have this shape event if it is not true.
1401
- :param strides: Optional, if provided, we will use those strides for
1402
- the inputs and outputs.
1403
-
1404
- :note: param out_shape and strides are used for the collapsing of
1405
- dimensions.
1406
- """
1407
- inp = input_vals[0]
1408
-
1409
- # Get the output and output shape to us
1410
- if out_shape is None and out is None:
1411
- out_shape = list(inp.shape)
1412
- for i in input_vals[1:]:
1413
- # dtype checked by pycuda before gpu call
1414
- for s_i in range(len(inp.shape)):
1415
- assert (inp.shape[s_i] == i.shape[s_i]
1416
- or inp.shape[s_i] == 1
1417
- or i.shape[s_i] == 1)
1418
- out_shape[s_i] = max(out_shape[s_i], inp.shape[s_i],
1419
- i.shape[s_i])
1420
- if out is None:
1421
- out = gpu_ndarray.empty(out_shape, dtype=inp.dtype)
1422
- elif out_shape is None:
1423
- out_shape = out.shape
1424
-
1425
- # Arg: nb element
1426
- args = [cast_uint(out.size)]
1427
- # Arg: output shape to the arguments.
1428
- for i in range(len(out_shape)):
1429
- args.append(cast_int(out_shape[i]))
1430
-
1431
- # for each inputs and the output
1432
- # add its ptr and strides
1433
- nd = len(out_shape)
1434
- idx = 0
1435
- for i in list(input_vals) + [out]:
1436
- itemsize = i.dtype.itemsize
1437
- args.append(i)
1438
- for j in range(nd):
1439
- # We force a stride of 0 for broadcastable dimensions
1440
- # This lower the index computation in the kernel.
1441
- if strides is not None:
1442
- # strides should have a strides of 0 for broadcasting.
1443
- args.append(cast_int(strides[idx][j] / itemsize))
1444
- elif i.shape[j] == 1:
1445
- args.append(cast_int(0))
1446
- else:
1447
- args.append(cast_int(i.strides[j] / itemsize))
1448
- idx += 1
1449
- out_size = out.size
1450
- # First use at least a full warp
1451
- if block is None:
1452
- block_ = min(32, out_size)
1453
- else:
1454
- block_ = block
1455
- # Next start adding multiprocessors
1456
- if grid is None:
1457
- grid_ = min(out_size / block_ + (out_size % block_ != 0), 60)
1458
- else:
1459
- grid_ = grid
1460
- # Next start adding more warps per multiprocessor
1461
- if block is None:
1462
- if block_ * grid_ < out_size:
1463
- block_ = min(out_size / grid_, 512)
1464
-
1465
- # We bypass the pycuda wrapper gpu function call.
1466
- # by calling directly the gpu function.
1467
- # This is faster and lower the overhead.
1468
- # Here is code that allow you to use the pycuda fct call.
1469
- # d = {"block":(block_,1,1), "grid":(grid_,1)}
1470
- # fct(*args, **d)
1471
- fct.set_block_shape(block_, 1, 1) # time_kernel
1472
- fct.param_set(*args)
1473
- fct.launch_grid(grid_, 1)
1474
- return out
1475
-
1476
-
1477
- class MyGpuNdArray():
1478
- _compiled_fct = {}
1479
-
1480
- def __init__(self, gpu_nd_array):
1481
- #assert isinstance(gpu_nd_array, gpu_ndarray.GpuNdArrayObject)
1482
- self.gpu_nd_array = gpu_nd_array
1483
- self.ctype = dtype_to_ctype(self.gpu_nd_array.dtype)
1484
-
1485
- @staticmethod
1486
- def gen_fct(op, inputs, nd, nodename="TestNodeName",
1487
- collapse=True):
1488
- if _CL_MODE:
1489
- npy_ty = "typedef float npy_float32;\n"
1490
- else:
1491
- npy_ty = "typedef double npy_float64;\n typedef float npy_float32;\n"
1492
-
1493
- # Generate the gpu functions
1494
- nb_in = len(inputs)
1495
- fcts = [None]
1496
- for nd in range(1, nd + 1): # 1 to nd
1497
- out = op(*[TensorType(i.gpu_nd_array.dtype,
1498
- (False,) * nd)() for i in inputs])
1499
- out_dtype = out.dtype
1500
- node = out.owner
1501
- elemwise_algo = ElemwiseAlgo(node.op.scalar_op)
1502
-
1503
- code = (CLUDA_PREAMBLE +
1504
- npy_ty +
1505
- elemwise_algo.c_src_kernel(node.inputs,
1506
- node.outputs,
1507
- nodename, nd,
1508
- static=""))
1509
- fct_name = "kernel_%s_%d" % (nodename, nd)
1510
- fct = compile_gpu_code(code, fct_name)
1511
- fcts.append(fct)
1512
-
1513
- # All inputs/outputs C contiguous case
1514
- code = (npy_ty +
1515
- CLUDA_PREAMBLE +
1516
- elemwise_algo.c_src_kernel_Ccontiguous(
1517
- node.inputs, node.outputs, nodename, static=""))
1518
- fct_name = "kernel_%s_Ccontiguous" % nodename
1519
- fcts[0] = compile_gpu_code(code, fct_name)
1520
-
1521
- def call_fct2(inputs, out=None):
1522
- " Do dimensions collapsing before call the gpu code "
1523
- assert len(inputs) == nb_in
1524
- # dtype checked by pycuda
1525
- # TODO: assert nb dim?
1526
-
1527
- inp = inputs[0]
1528
-
1529
- # Compute the output shape.
1530
- out_shape = list(inp.shape)
1531
- for i in inputs[1:]:
1532
- for s_i in range(len(inp.shape)):
1533
- assert (inp.shape[s_i] == i.shape[s_i]
1534
- or inp.shape[s_i] == 1
1535
- or i.shape[s_i] == 1)
1536
- out_shape[s_i] = max(out_shape[s_i], i.shape[s_i])
1537
- # Create the output object
1538
- if (out is None
1539
- or out.dtype != out_dtype
1540
- or out.shape != tuple(out_shape)):
1541
- out = MyGpuNdArray(gpu_ndarray.empty(out_shape,
1542
- dtype=out_dtype))
1543
-
1544
- if collapse:
1545
- # Do the collapsing.
1546
- nd_col, info = elemwise_collapses(list(inputs), [out])
1547
- # The next two lines are useful to force a call to the
1548
- # c contiguous version:
1549
- #nd_col = 0
1550
- #info = [[],[]]
1551
- out = call_elemwise(fcts[nd_col], inputs,
1552
- out=out, out_shape=info[0][:nd_col],
1553
- strides=info[1])
1554
- else:
1555
- out = call_elemwise(fcts[-1], inputs, out=out,
1556
- out_shape=out_shape)
1557
- return out
1558
- return call_fct2
1559
-
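call_fct2 relies on elemwise_collapses (defined elsewhere in this file) to merge adjacent dimensions before launching, so that, for example, a contiguous 3-d operation can run through the 1-d kernel. A simplified single-array sketch of the idea; judging from its call site, the real routine collapses all inputs and the output together, and strides here are in elements:

    def collapse_dims(shape, strides):
        # Merge dimensions d and d+1 whenever they are laid out
        # contiguously, i.e. strides[d] == strides[d + 1] * shape[d + 1].
        shape, strides = list(shape), list(strides)
        d = 0
        while d + 1 < len(shape):
            if strides[d] == strides[d + 1] * shape[d + 1]:
                shape[d + 1] *= shape[d]
                del shape[d], strides[d]
            else:
                d += 1
        return tuple(shape), tuple(strides)

    # A C-contiguous (2, 3, 4) array collapses to a single dimension of
    # 24 elements, so the 1-d kernel can be used.
    assert collapse_dims((2, 3, 4), (12, 4, 1)) == ((24,), (1,))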
1560
- def __elemwise2__(self, other, name, op):
1561
- """ Call this code on this op with 2 inputs """
1562
- nd = len(self.gpu_nd_array.shape) # self.gpu_nd_array.ndim
1563
- assert nd == len(other.gpu_nd_array.shape) # ndim
1564
- tag = (name + '_' + str(self.gpu_nd_array.dtype)
1565
- + str(self.gpu_nd_array.ndim))
1566
- tag += ('_' + str(other.gpu_nd_array.dtype)
1567
- + str(other.gpu_nd_array.ndim))
1568
- fct = self._compiled_fct.get(tag, None)
1569
- if fct is None:
1570
- # print "compile", tag
1571
- fct = MyGpuNdArray.gen_fct(op, [self, other], nd)
1572
- self._compiled_fct[tag] = fct
1573
- return fct((self, other))
1574
-
1575
- @classmethod
1576
- def __elemwise__(cls, inputs, name, op, out=None):
1577
- """ Call this code on this op with * inputs """
1578
- nd = len(inputs[0].gpu_nd_array.shape) # self.gpu_nd_array.ndim
1579
- for i in inputs[1:]:
1580
- assert nd == len(i.gpu_nd_array.shape) # ndim
1581
- nb = len(inputs)
1582
- tag = name + "_".join([str(i.gpu_nd_array.dtype) +
1583
- str(i.gpu_nd_array.ndim) for i in inputs])
1584
- fct = cls._compiled_fct.get(tag, None)
1585
- if fct is None:
1586
- # print "compile", tag
1587
- fct = MyGpuNdArray.gen_fct(op, inputs, nd)
1588
- cls._compiled_fct[tag] = fct
1589
- return fct(inputs, out=out)
1590
-
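Both wrappers memoize the generated kernels in _compiled_fct, keyed by the op name together with the dtype and ndim of every operand, so code generation and compilation happen once per combination. The caching pattern, reduced to its essentials with a stand-in compile step (illustrative names only):

    import numpy

    _cache = {}

    def get_kernel(name, arrays, compile_fct):
        # Same tag layout as above: op name plus dtype/ndim of each operand.
        tag = name + "_".join(str(a.dtype) + str(a.ndim) for a in arrays)
        fct = _cache.get(tag, None)
        if fct is None:
            fct = compile_fct()          # expensive: codegen + GPU build
            _cache[tag] = fct
        return fct

    a = numpy.zeros((4, 4), dtype=numpy.float32)
    k = get_kernel("add", [a, a], lambda: object())
    assert get_kernel("add", [a, a], lambda: object()) is k   # cache hit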
1591
- base = property(lambda self: self.gpu_nd_array.base)
1592
- bytes = property(lambda self: self.gpu_nd_array.bytes)
1593
- dtype = property(lambda self: self.gpu_nd_array.dtype)
1594
- flags = property(lambda self: self.gpu_nd_array.flags)
1595
- itemsize = property(lambda self: self.gpu_nd_array.itemsize)
1596
- ndim = property(lambda self: self.gpu_nd_array.ndim,
1597
- doc="number of dimensions")
1598
- offset = property(lambda self: self.gpu_nd_array.offset)
1599
- shape = property(lambda self: self.gpu_nd_array.shape)
1600
- size = property(lambda self: self.gpu_nd_array.size)
1601
- strides = property(lambda self: self.gpu_nd_array.strides)
1602
-
1603
- def __array__(self):
1604
- return numpy.asarray(self.gpu_nd_array)
1605
-
1606
- def copy(self):
1607
- return MyGpuNdArray(self.gpu_nd_array.copy())
1608
-
1609
- def view(self):
1610
- return MyGpuNdArray(self.gpu_nd_array.view())
1611
-
1612
- def __copy__(self):
1613
- return MyGpuNdArray(self.gpu_nd_array.__copy__())
1614
-
1615
- def __deepcopy__(self):
1616
- return MyGpuNdArray(self.gpu_nd_array.__deepcopy__())
1617
-
1618
- @property
1619
- def gpudata(self):
1620
- # TODO: Add this assert when PyCUDA/PyOpenCL can use the bytes
1621
- # attribute. Without this assert, old code that doesn't support
1622
- # strides can receive strided objects as input and no
1623
- # error will be raised.
1624
-
1625
- #assert (self.gpu_nd_array.flags['C_CONTIGUOUS'] or
1626
- # self.gpu_nd_array.flags['F_CONTIGUOUS'])
1627
-
1628
- # TODO: find a way to pass to a pycuda/pyopencl function the
1629
- # bytes + offset directly.
1630
- return self.bytes + self.offset
1631
-
1632
- def __getitem__(self, *inputs):
1633
- return MyGpuNdArray(self.gpu_nd_array.__getitem__(*inputs))
1634
-
1635
- def __add__(self, other):
1636
- return self.__elemwise2__(other, "add", theano.tensor.add)
1637
-
1638
- def __sub__(self, other):
1639
- return self.__elemwise2__(other, "sub", theano.tensor.sub)
1640
-
1641
- def __mul__(self, other):
1642
- return self.__elemwise2__(other, "mul", theano.tensor.mul)
1643
-
1644
- def __div__(self, other):
1645
- assert (str(self.gpu_nd_array.dtype).startswith("float") or
1646
- str(other.gpu_nd_array.dtype).startswith("float"))
1647
- return self.__elemwise2__(other, "true_div", theano.tensor.true_div)
1648
-
1649
- @classmethod
1650
- def add(cls, x, y, out=None):
1651
- """ add all inputs togethers element-wise """
1652
- return cls.__elemwise__([x, y], "add", theano.tensor.add, out=out)
1653
-
1654
- @classmethod
1655
- def adds(cls, *inputs):
1656
- """ add all inputs togethers element-wise """
1657
- return cls.__elemwise__(inputs, "add", theano.tensor.add)
1658
-
1659
- @classmethod
1660
- def multiplys(cls, *inputs):
1661
- """ multiply all inputs togethers element-wise """
1662
- return cls.__elemwise__(inputs, "mul", theano.tensor.mul)
1663
-
1664
- def sum(self, axis=None, collapse=True):
1665
- from . import gen_reduction
1666
- max_thread_per_block = 512
1667
- max_block = 4096
1668
- if isinstance(axis, (list, tuple)):
1669
- if len(axis) == 1:
1670
- axis = axis[0]
1671
- else:
1672
- assert len(axis) == self.ndim
1673
- axis.sort()
1674
- assert axis == list(range(self.ndim))
1675
- axis = None
1676
-
1677
- # TODO: Why this?
1678
- if self.size == 0:
1679
- make_out = gpu_ndarray.zeros
1680
- else:
1681
- make_out = gpu_ndarray.empty
1682
-
1683
- if axis is None:
1684
- out = make_out((), self.dtype)
1685
- out = MyGpuNdArray(out)
1686
- else:
1687
- out_shape = [self.shape[i] for i in range(self.ndim)
1688
- if i != axis]
1689
- out = make_out(out_shape, self.dtype)
1690
- out = MyGpuNdArray(out)
1691
-
1692
- if self.size == 0:
1693
- return out
1694
-
1695
- args_set = False
1696
-
1697
- if collapse:
1698
- coll_ndim, (coll_shape, coll_strides, coll_axis), coll_out_str = (
1699
- reduction_collapses([self, out], axis))
1700
- else:
1701
- coll_ndim = self.ndim
1702
- coll_shape = self.shape
1703
- coll_strides = self.strides
1704
- coll_axis = [axis]
1705
- coll_out_str = out.strides
1706
-
1707
- if axis is not None:
1708
- coll_axis = coll_axis[0]
1709
-
1710
- args_set = False
1711
-
1712
- if coll_ndim == 0:
1713
- sum_op = gen_reduction.GpuSum([1], self.dtype)
1714
- c_code = sum_op.c_support_code_apply("nodename", contig=True)
1715
- fctname = "kernel_reduce_sum_ccontig_nodename"
1716
- fct = compile_gpu_code(c_code, fctname)
1717
- block_ = min(coll_shape[0], max_thread_per_block)
1718
- block = (block_, 1, 1)
1719
-
1720
- grid = (1, 1)
1721
- shared_ = self.dtype.itemsize * block_
1722
- args = [cast_int(coll_shape[0]), self, out]
1723
- args_set = True
1724
- elif axis is None:
1725
- pattern = [1] * coll_ndim
1726
- str_pattern = [str(i) for i in pattern]
1727
- sum_op = gen_reduction.GpuSum(pattern, self.dtype)
1728
- c_code = sum_op.c_support_code_apply("nodename")
1729
- if not c_code:
1730
- raise NotImplementedError(
1731
- "GpuNdArray sum case not implemented")
1732
- fctname = "kernel_reduce_sum_" + "".join(str_pattern) + "_nodename"
1733
- fct = compile_gpu_code(c_code, fctname)
1734
- if coll_ndim == 1:
1735
- bx = min(max_thread_per_block, coll_shape[0])
1736
- block = (bx, 1, 1)
1737
- block_ = bx
1738
- elif coll_ndim == 2:
1739
- bx = min(max_thread_per_block, coll_shape[1])
1740
- by = min(max_thread_per_block // coll_shape[1], coll_shape[0])
1741
- by = max(by, 1)
1742
- block = (bx, by, 1)
1743
- block_ = bx * by
1744
- elif coll_ndim == 3:
1745
- bx = min(max_thread_per_block, coll_shape[2])
1746
- by = min(max_thread_per_block // bx, coll_shape[1])
1747
- bz = min(max_thread_per_block // (bx * by), coll_shape[0])
1748
- by = max(by, 1)
1749
- bz = min(max(bz, 1), 64)
1750
- block = (bx, by, bz)
1751
- block_ = bx * by * bz
1752
- elif coll_ndim == 4:
1753
- bx = min(max_thread_per_block, coll_shape[3])
1754
- by = min(max_thread_per_block // bx, coll_shape[2])
1755
- bz = min(max_thread_per_block // (bx * by), coll_shape[1])
1756
- by = max(by, 1)
1757
- bz = min(max(bz, 1), 64)
1758
- block = (bx, by, bz)
1759
- block_ = bx * by * bz
1760
- grid = (1, 1)
1761
- shared_ = self.dtype.itemsize * block_
1762
- elif coll_ndim in [1, 2, 3]:
1763
- if coll_ndim == 1:
1764
- assert coll_axis == 0
1765
- # pattern 1
1766
- sum_op = gen_reduction.GpuSum([1], self.dtype)
1767
- fctname = "kernel_reduce_sum_1_nodename"
1768
-
1769
- grid = (1, 1)
1770
-
1771
- block_ = min(max_thread_per_block, coll_shape[0])
1772
- block = (block_, 1, 1)
1773
- elif coll_ndim == 3 and coll_axis == 0:
1774
- # pattern 100
1775
- sum_op = gen_reduction.GpuSum([1, 0, 0], self.dtype)
1776
- fctname = "kernel_reduce_sum_100_nodename"
1777
-
1778
- gx = min(coll_shape[1], max_block)
1779
- gy = min(max_block // (gx * coll_shape[2]), coll_shape[2])
1780
- gy = max(gy, 1)
1781
- grid = (gx, gy)
1782
-
1783
- block_ = min(max_thread_per_block, coll_shape[0])
1784
- block = (block_, 1, 1)
1785
- elif coll_ndim == 3 and coll_axis == 1:
1786
- # pattern 010
1787
- sum_op = gen_reduction.GpuSum([0, 1, 0], self.dtype)
1788
- fctname = "kernel_reduce_sum_010_AD_nodename"
1789
-
1790
- A = coll_shape[0]
1791
- B = coll_shape[1]
1792
- C = coll_shape[2]
1793
- D = C / 32
1794
- if (32 * D < C):
1795
- D += 1
1796
- assert ((C <= 32 * D) and (32 * D < C + 32))
1797
- shared_ = 0
1798
-
1799
- gx = min(A, max_block)
1800
- gy = min(max_block // (D * A), D)
1801
- gy = max(gy, 1)
1802
- grid = (gx, gy)
1803
-
1804
- block = (32, 1, 1)
1805
- block_ = 32
1806
-
1807
- args_set = True
1808
- # input shape
1809
- args = [cast_int(A), cast_int(B),
1810
- cast_int(C), cast_int(D)]
1811
- # input
1812
- args.append(self)
1813
- # input strides
1814
- args += [cast_int(i / self.dtype.itemsize)
1815
- for i in coll_strides]
1816
- # output
1817
- args.append(out)
1818
- # output strides
1819
- args.append(cast_int(coll_out_str[0] / out.dtype.itemsize))
1820
- args.append(cast_int(coll_out_str[1] / out.dtype.itemsize))
1821
- elif coll_ndim == 3 and coll_axis == 2:
1822
- # pattern 001
1823
- sum_op = gen_reduction.GpuSum([0, 0, 1], self.dtype)
1824
- fctname = "kernel_reduce_sum_001_nodename"
1825
-
1826
- gx = min(coll_shape[0], max_block)
1827
- gy = min(max_block // (gx * coll_shape[1]), coll_shape[1])
1828
- gy = max(gy, 1)
1829
- grid = (gx, gy)
1830
-
1831
- block_ = min(max_thread_per_block, coll_shape[2])
1832
- block = (block_, 1, 1)
1833
- elif coll_axis == 0:
1834
- # pattern 10
1835
- sum_op = gen_reduction.GpuSum([1, 0], self.dtype)
1836
- fctname = "kernel_reduce_sum_010_nodename"
1837
- block_ = min(coll_shape[1], max_thread_per_block)
1838
- block = (block_, 1, 1)
1839
- grid = (1, coll_shape[0])
1840
- args_set = True
1841
- # input shape
1842
- args = [cast_int(1)]
1843
- args += [cast_int(i) for i in coll_shape]
1844
- # input
1845
- args.append(self)
1846
- # input strides
1847
- args.append(cast_int(1))
1848
- args += [cast_int(i / self.dtype.itemsize)
1849
- for i in coll_strides]
1850
- # output
1851
- args.append(out)
1852
- # output strides
1853
- args.append(cast_int(1))
1854
- # We must take the last dimension in the case of
1855
- # dimension collapsing.
1856
- args.append(cast_int(coll_out_str[-1] / out.dtype.itemsize))
1857
- elif coll_axis == 1:
1858
- # pattern 01
1859
- sum_op = gen_reduction.GpuSum([0, 1], self.dtype)
1860
- fctname = "kernel_reduce_sum_01_nodename"
1861
- block_ = min(coll_shape[1], max_thread_per_block)
1862
- block = (block_, 1, 1)
1863
- grid = (1, min(coll_shape[0], max_block))
1864
- else:
1865
- raise Exception("Bad axis")
1866
-
1867
- c_code = sum_op.c_support_code_apply("nodename")
1868
- fct = compile_gpu_code(c_code, fctname)
1869
-
1870
- shared_ = self.dtype.itemsize * block_
1871
- else:
1872
- raise Exception("Not implemented")
1873
-
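Each branch above selects a generated kernel by a pattern string in which 1 marks a reduced axis and 0 a kept one, so kernel_reduce_sum_010_nodename reduces the middle axis of a 3-d array and kernel_reduce_sum_111_nodename reduces everything. A small illustrative helper (not part of this file) showing how such a pattern can be derived:

    def reduce_pattern(ndim, axis):
        # axis=None means a full reduction, i.e. every axis is reduced.
        if axis is None:
            return "1" * ndim
        return "".join("1" if d == axis else "0" for d in range(ndim))

    assert reduce_pattern(3, 1) == "010"    # kernel_reduce_sum_010_nodename
    assert reduce_pattern(2, None) == "11"  # kernel_reduce_sum_11_nodename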
1874
- if not args_set:
1875
- # input shape
1876
- args = [cast_int(i) for i in coll_shape]
1877
- # input
1878
- args.append(self)
1879
- # input strides
1880
- args += [cast_int(i / self.dtype.itemsize)
1881
- for i in coll_strides]
1882
- # output
1883
- args.append(out)
1884
- # output strides
1885
- args += [cast_int(i / self.dtype.itemsize)
1886
- for i in coll_out_str]
1887
-
1888
- pycuda._driver.Context.synchronize()
1889
- #print fctname, block, grid, shared_, axis
1890
- #print self.ndim, self.shape, self.strides, axis, out.strides
1891
- #print coll_ndim, coll_shape, coll_strides, coll_axis, coll_out_str
1892
- #print args
1893
-
1894
- if False:
1895
- d = {"block": block,
1896
- "shared": shared_,
1897
- "grid": grid}
1898
- fct(*args, **d)
1899
- else:
1900
- # We bypass the pycuda wrapper gpu function call.
1901
- # by calling directly the gpu function.
1902
- # This is faster and lower the overhead.
1903
- fct.set_block_shape(*block)
1904
- fct.set_shared_size(shared_)
1905
- fct.param_set(*args)
1906
- fct.launch_grid(*grid)
1907
- return out
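The reduction branches above also pick a thread-block shape by filling the fastest-varying axis first and then adding threads along the slower axes, keeping the product under max_thread_per_block (512) and the z extent under 64; one shared-memory slot per thread (itemsize * block size) is then reserved. A standalone sketch of that shaping rule for the full-reduction case, with illustrative names:

    def reduce_block_shape(shape, max_threads=512, max_z=64):
        # Loosely mirrors the full-reduction branches above: x gets as
        # many threads as possible, y and z take what is left.
        bx = min(max_threads, shape[-1])
        by = max(min(max_threads // bx, shape[-2]), 1) if len(shape) >= 2 else 1
        bz = 1
        if len(shape) >= 3:
            bz = min(max(min(max_threads // (bx * by), shape[-3]), 1), max_z)
        return bx, by, bz

    # Reducing a (10, 20, 700) array: the x dimension already saturates
    # the 512-thread budget, so y and z stay at 1.
    assert reduce_block_shape((10, 20, 700)) == (512, 1, 1)
    # A small (8, 8, 8) array fits 8 * 8 * 8 = 512 threads exactly.
    assert reduce_block_shape((8, 8, 8)) == (8, 8, 8)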