froog 0.4.2__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
froog/ops_gpu.py DELETED
@@ -1,598 +0,0 @@
1
- # _______ ______ _______ _______ _______
2
- # | || _ | | || || |
3
- # | ___|| | || | _ || _ || ___|
4
- # | |___ | |_||_ | | | || | | || | __
5
- # | ___|| __ || |_| || |_| || || |
6
- # | | | | | || || || |_| |
7
- # |___| |___| |_||_______||_______||_______|
8
- #
9
- # OpenCL kernels
10
-
11
- import numpy as np
12
- from .tensor import Function, register
13
- import pyopencl as cl
14
- import functools
15
-
16
- def buffer_new(ctx, shape):
17
- res_g = cl.Buffer(ctx.cl_ctx, cl.mem_flags.WRITE_ONLY, 4*np.prod(shape))
18
- res_g.shape = shape
19
- res_g.dtype = np.float32
20
- return res_g
21
-
22
- def buffer_zeros(ctx, shape):
23
- res_g = cl.Buffer(ctx.cl_ctx, cl.mem_flags.WRITE_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=np.zeros(shape))
24
- res_g.shape = shape
25
- res_g.dtype = np.float32
26
- return res_g
27
-
28
- def buffer_like(ctx, x):
29
- return buffer_new(ctx, x.shape)
30
-
31
- @functools.lru_cache
32
- def clbuild(cl_ctx, prg):
33
- return cl.Program(cl_ctx, prg).build()
34
-
35
- def binary_op(ctx, code, x, y):
36
- # if len(x.shape) != len(y.shape):
37
- # raise Exception(f"shape mismatch in binop {code}: {x.shape} {y.shape}")
38
- xdiv = 1
39
- ydiv = 1
40
- if x.shape != y.shape:
41
- # special case broadcasting
42
- if len(y.shape) == 4 and x.shape[0:2] == y.shape[0:2] and y.shape[2] == 1 and y.shape[3] == 1:
43
- ydiv = x.shape[2] * x.shape[3]
44
- elif len(y.shape) == 4 and x.shape[0:2] == y.shape[0:2] and x.shape[2] == 1 and x.shape[3] == 1:
45
- xdiv = y.shape[2] * y.shape[3]
46
- elif np.prod(y.shape) == 1:
47
- ydiv = np.prod(x.shape)
48
- else:
49
- raise Exception(f"binary op shape mismatch: {x.shape} != {y.shape}")
50
- ret = buffer_like(ctx, x if np.prod(x.shape) >= np.prod(y.shape) else y)
51
- prg = clbuild(ctx.cl_ctx, """
52
- __kernel void binop( __global const float *a_g, __global const float *b_g, __global float *res_g, int xdiv, int ydiv) {
53
- int gid = get_global_id(0);
54
- float a = a_g[gid/xdiv];
55
- float b = b_g[gid/ydiv];
56
- res_g[gid] = """+code+""";
57
- }
58
- """)
59
- prg.binop(ctx.cl_queue, [np.prod(ret.shape)], None, x, y, ret, np.int32(xdiv), np.int32(ydiv))
60
- return ret
61
-
62
- def unary_op(ctx, code, x):
63
- ret = buffer_like(ctx, x)
64
- prg = clbuild(ctx.cl_ctx, """
65
- __kernel void unop(
66
- __global const float *a_g, __global float *res_g)
67
- {
68
- int gid = get_global_id(0);
69
- float a = a_g[gid];
70
- res_g[gid] = """+code+""";
71
- }
72
- """)
73
- prg.unop(ctx.cl_queue, [np.prod(ret.shape)], None, x, ret)
74
- return ret
75
-
76
- @functools.lru_cache
77
- def cl_pooling_krnl_build(cl_ctx, iter_op, result_op, init_val=0):
78
- prg = """
79
- __kernel void subsample(
80
- __global float *output, __global const float *input, uint2 osize, uint2 isize, uint2 kernel_size, int nelem
81
- ) {
82
- int3 gid = (int3)(get_global_id(2), get_global_id(1), get_global_id(0));
83
- int oid = gid.x + osize.x*(gid.y + osize.y*gid.z);
84
- float group_res = """+str(init_val)+""";
85
- for (uint j=0; j<kernel_size.y; ++j) {
86
- for (uint i=0; i<kernel_size.x; ++i) {
87
- int iid = (gid.x*kernel_size.x+i) + isize.x*((gid.y*kernel_size.y+j) + isize.y*gid.z);
88
- if (iid < nelem)
89
- """+iter_op+""";
90
- }
91
- }
92
- output[oid] = """+result_op+""";
93
- }
94
- """
95
- return clbuild(cl_ctx, prg)
96
-
97
- def pooling_op(ctx, input, kernel_size, iter_op, result_op, init_val=0):
98
- N, C, Y, X = input.shape
99
- py,px = kernel_size
100
- ret = buffer_new(ctx, (N, C, Y//py, X//px))
101
- osize = np.array((X//px, Y//py), dtype=cl.cltypes.uint2)
102
- isize = np.array((X, Y), dtype=cl.cltypes.uint2)
103
- ksize = np.array((px,py), dtype=cl.cltypes.uint2)
104
- prg = cl_pooling_krnl_build(ctx.cl_ctx, iter_op, result_op, init_val=init_val)
105
- prg.subsample(ctx.cl_queue, (N*C, Y//py, X//px), None, ret, input, osize, isize, ksize, np.int32(input.size))
106
- ctx.data = np.empty((N, C, Y, X)) # set shape expectation on tensor instance
107
- return ret
108
-
109
- class Add(Function):
110
- @staticmethod
111
- def forward(ctx, x, y):
112
- return binary_op(ctx, 'a+b', x, y)
113
-
114
- @staticmethod
115
- def backward(ctx, grad_output):
116
- return grad_output, grad_output
117
- register('add', Add, gpu=True)
118
-
119
- class Sub(Function):
120
- @staticmethod
121
- def forward(ctx, x, y):
122
- return binary_op(ctx, 'a-b', x, y)
123
-
124
- @staticmethod
125
- def backward(ctx, grad_output):
126
- not_grad_output = unary_op(ctx, '-a', grad_output)
127
- return grad_output, not_grad_output
128
- register('sub', Sub, gpu=True)
129
-
130
- class Mul(Function):
131
- @staticmethod
132
- def forward(ctx, x, y):
133
- ctx.save_for_backward(x, y)
134
-
135
- return binary_op(ctx, 'a*b', x, y)
136
-
137
- @staticmethod
138
- def backward(ctx, grad_output):
139
- x,y = ctx.saved_tensors
140
- return binary_op(ctx, 'a*b', y, grad_output), binary_op(ctx, 'a*b', x, grad_output)
141
- register('mul', Mul, gpu=True)
142
-
143
- class Pow(Function):
144
- @staticmethod
145
- def forward(ctx, x, y):
146
- ctx.save_for_backward(x, y)
147
- return binary_op(ctx, 'pow(a,b)', x, y)
148
-
149
- @staticmethod
150
- def backward(ctx, grad_output):
151
- x,y = ctx.saved_tensors
152
- gradx = binary_op(ctx, 'a*b', grad_output,
153
- binary_op(ctx, 'b * (pow((float)a, (float)(b-1.0)));', x, y))
154
- grady = binary_op(ctx, 'a*b', grad_output,
155
- binary_op(ctx, 'pow((float)a, (float)b) * log(a);', x, y))
156
- return gradx, grady
157
- register('pow', Pow, gpu=True)
158
-
159
- class Sum(Function):
160
- @staticmethod
161
- def forward(ctx, input):
162
- ctx.save_for_backward(input)
163
- ret = buffer_new(ctx, (1,)) # buffer of size 1, which will hold the sum
164
- prg = clbuild(ctx.cl_ctx, """
165
- __kernel void sum(
166
- __global const float *a_g, int sz, __global float *res_g)
167
- {
168
- float out = 0.0;
169
- for (int x = 0; x < sz; x++) {
170
- out += a_g[x];
171
- }
172
- res_g[0] = out;
173
- }
174
- """)
175
- prg.sum(ctx.cl_queue, [input.shape[0]], None, input, np.int32(np.prod(input.shape)), ret)
176
- return ret
177
-
178
- @staticmethod
179
- def backward(ctx, grad_output):
180
- input, = ctx.saved_tensors
181
- ret = buffer_like(ctx, input)
182
- prg = clbuild(ctx.cl_ctx, """
183
- __kernel void fill(
184
- __global const float *a_g, __global float *res_g)
185
- {
186
- int gid = get_global_id(0);
187
- res_g[gid] = a_g[0];
188
- }
189
- """)
190
- prg.fill(ctx.cl_queue, [np.prod(ret.shape)], None, grad_output, ret)
191
- return ret
192
- register('sum', Sum, gpu=True)
193
-
194
- class Dot(Function):
195
- """
196
- A[gid_y * size + i] accesses an element in the row gid_y of matrix A and the column i
197
- """
198
- @staticmethod
199
- def forward(ctx, input, weight):
200
- assert input.shape[1] == weight.shape[0] # inner dims must match for dot product
201
- isize = np.int32(input.shape[0])
202
- msize = np.int32(input.shape[1])
203
- osize = np.int32(weight.shape[1])
204
- one = np.int32(1)
205
- ret = buffer_new(ctx, (isize, osize))
206
-
207
- prg = clbuild(ctx.cl_ctx, """
208
- __kernel void matmul(
209
- __global const float *input,
210
- __global const float *weight,
211
- __global float *res,
212
- int input_row_size,
213
- int input_col_size,
214
- int msize,
215
- int weight_row_size,
216
- int weight_col_size,
217
- int osize
218
- )
219
- {
220
- int gid_y = get_global_id(0); // row index
221
- int gid_x = get_global_id(1); // col index
222
-
223
- float acc = 0.0;
224
- for (int i = 0; i < msize; i++) {
225
- acc += input[gid_y * input_row_size + i * input_col_size] * weight[gid_x * weight_row_size + i * weight_col_size];
226
- }
227
- res[gid_y * osize + gid_x] = acc;
228
- }
229
- """)
230
- ctx.save_for_backward(input, weight, prg)
231
- # (isize,msize) x (msize,osize) = (isize,osize)
232
- prg.matmul(ctx.cl_queue, [isize, osize], None,
233
- input, weight, ret,
234
- msize, one, msize, one, osize, osize)
235
- return ret
236
-
237
- @staticmethod
238
- def backward(ctx, grad_output):
239
- input, weight, prg = ctx.saved_tensors
240
- isize = np.int32(input.shape[0])
241
- msize = np.int32(input.shape[1])
242
- osize = np.int32(weight.shape[1])
243
- one = np.int32(1)
244
-
245
- grad_input = buffer_like(ctx, input)
246
- grad_weight = buffer_like(ctx, weight)
247
-
248
- # (isize,osize) x (msize,osize) = (isize,msize)
249
- prg.matmul(ctx.cl_queue, [isize, msize], None,
250
- grad_output, weight, grad_input,
251
- osize, one, osize, osize, one, msize)
252
-
253
- # (isize,msize) x (isize,osize) = (msize,osize)
254
- prg.matmul(ctx.cl_queue, [msize, osize], None,
255
- input, grad_output, grad_weight,
256
- one, msize, isize, one, osize, osize)
257
-
258
- return grad_input, grad_weight
259
- register('dot', Dot, gpu=True)
260
- register('matmul', Dot, gpu=True)
261
-
262
- # ***** SIMPLE OPS ********
263
-
264
- class Reshape(Function):
265
- @staticmethod
266
- def forward(ctx, x, shape):
267
- ctx.save_for_backward(x.shape)
268
- ss = list(shape)
269
-
270
- # ???
271
- tsum = 1
272
- for s in ss:
273
- if s != -1:
274
- tsum *= s
275
- for i,s in enumerate(ss):
276
- if s == -1:
277
- ss[i] = np.prod(x.shape) // tsum
278
- assert np.prod(x.shape) == np.prod(ss)
279
- x.shape = tuple(ss)
280
- return x
281
-
282
- @staticmethod
283
- def backward(ctx, grad_output):
284
- in_shape, = ctx.saved_tensors
285
- grad_output.shape = in_shape
286
- return grad_output
287
- register('reshape', Reshape, gpu=True)
288
-
289
- # ***** ACTIVATION OPS ********
290
-
291
- class ReLU(Function):
292
- @staticmethod
293
- def forward(ctx, input):
294
- ctx.save_for_backward(input)
295
- return unary_op(ctx, 'max(a, (float)0.);', input)
296
-
297
- @staticmethod
298
- def backward(ctx, grad_output):
299
- input, = ctx.saved_tensors
300
- return binary_op(ctx, 'a * (b >= 0);', grad_output, input)
301
- register('relu', ReLU, gpu=True)
302
-
303
- class LogSoftmax(Function):
304
- @staticmethod
305
- def forward(ctx, input):
306
- # first find max values for numerical stability
307
- max_vals = buffer_new(ctx, (input.shape[0],))
308
- prg = clbuild(ctx.cl_ctx, """
309
- __kernel void max_vals(
310
- __global const float *a_g, int sz, __global float *res_g)
311
- {
312
- int gid = get_global_id(0);
313
- int gidsz = gid*sz;
314
- float max_val = -INFINITY;
315
- for (int x = 0; x < sz; x++) {
316
- max_val = max(max_val, a_g[gidsz+x]);
317
- }
318
- res_g[gid] = max_val;
319
- }
320
- """)
321
- prg.max_vals(ctx.cl_queue, [input.shape[0]], None, input, np.int32(input.shape[1]), max_vals)
322
-
323
- # compute exp(x - max) and sum
324
- lsum = buffer_new(ctx, (input.shape[0],))
325
- prg = clbuild(ctx.cl_ctx, """
326
- __kernel void logsoftmax(
327
- __global const float *a_g, __global const float *max_vals, int sz, __global float *res_g)
328
- {
329
- int gid = get_global_id(0);
330
- int gidsz = gid*sz;
331
- float max_val = max_vals[gid];
332
- float out = 0.0;
333
- for (int x = 0; x < sz; x++) {
334
- out += exp(a_g[gidsz+x] - max_val);
335
- }
336
- res_g[gid] = log(out) + max_val;
337
- }
338
- """)
339
- prg.logsoftmax(ctx.cl_queue, [input.shape[0]], None, input, max_vals, np.int32(input.shape[1]), lsum)
340
-
341
- # compute final output
342
- output = buffer_like(ctx, input)
343
- prg = clbuild(ctx.cl_ctx, """
344
- __kernel void lsmsub(
345
- __global const float *a_g, __global const float *b_g, int sz, __global float *res_g)
346
- {
347
- int gid = get_global_id(0);
348
- int gid2 = get_global_id(1);
349
- res_g[gid*sz + gid2] = a_g[gid*sz + gid2] - b_g[gid];
350
- }
351
- """)
352
- prg.lsmsub(ctx.cl_queue, [input.shape[0], input.shape[1]], None, input, lsum, np.int32(input.shape[1]), output)
353
- ctx.save_for_backward(output)
354
- return output
355
-
356
- @staticmethod
357
- def backward(ctx, grad_output):
358
- output, = ctx.saved_tensors
359
-
360
- grad_input = buffer_like(ctx, grad_output)
361
- prg = clbuild(ctx.cl_ctx, """
362
- __kernel void lsmsub2(
363
- __global const float *grad_output, __global const float *output, int sz, __global float *grad_input)
364
- {
365
- int gid = get_global_id(0);
366
- int gidsz = gid*sz;
367
- int gid2 = get_global_id(1);
368
- // TODO: this is repeated in many kernels
369
- float acc = 0.0;
370
- for (int x = 0; x < sz; x++) {
371
- acc += grad_output[gidsz + x];
372
- }
373
- grad_input[gidsz + gid2] = grad_output[gidsz + gid2] - exp(output[gidsz + gid2]) * acc;
374
- }
375
- """)
376
- prg.lsmsub2(ctx.cl_queue, [grad_output.shape[0], grad_output.shape[1]], None,
377
- grad_output, output, np.int32(grad_output.shape[1]), grad_input)
378
-
379
- return grad_input
380
- register('logsoftmax', LogSoftmax, gpu=True)
381
-
382
- class Sigmoid(Function):
383
- @staticmethod
384
- def forward(ctx, input):
385
- ret = unary_op(ctx, '1./(1+exp(-a))', input)
386
- ctx.save_for_backward(ret)
387
- return ret
388
-
389
- @staticmethod
390
- def backward(ctx, grad_output):
391
- ret, = ctx.saved_tensors
392
- return binary_op(ctx, 'a * (b * (1 - b));', grad_output, ret)
393
- register('sigmoid', Sigmoid, gpu=True)
394
-
395
- # ***** CONV OPS ********
396
-
397
- class Conv2D(Function):
398
- @staticmethod
399
- def forward(ctx, x, w, stride=1, groups=1):
400
- if type(ctx.stride) == int: # ctx stores function params
401
- ctx.stride = (ctx.stride, ctx.stride)
402
- cout,cin,H,W = w.shape
403
- ys,xs = ctx.stride
404
- bs,cin_,iy,ix = x.shape
405
- oy,ox = (iy-(H-ys))//ys, (ix-(W-xs))//xs
406
- assert cin*ctx.groups == cin_
407
- assert cout % ctx.groups == 0
408
- rcout = cout//ctx.groups
409
- # output buffer
410
- ret = buffer_new(ctx, (bs, cout, oy, ox))
411
- prg = clbuild(ctx.cl_ctx, """
412
- __kernel void conv(__global const float *input, __global const float *weight, __global float *output,
413
- int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs) {
414
- int B = get_global_id(0)/(groups*rcout); // range 0-bs
415
- int g = (get_global_id(0)/rcout)%groups;
416
- int c = get_global_id(0) % rcout;
417
- int Y = get_global_id(1); // range 0-oy
418
- int X = get_global_id(2); // range 0-ox
419
- int IY = Y*ys;
420
- int IX = X*xs;
421
-
422
- // input = (bs, groups, cin, iy, ix)
423
- // weight = (groups, rcout, cin, H, W)
424
- // output = (bs, groups, rcout, oy, ox)
425
- float acc = 0.0;
426
- for (int ci = 0; ci < cin; ci++) {
427
- for (int y = IY; y < IY+H; y++) {
428
- for (int x = IX; x < IX+W; x++) {
429
- acc += input[B*groups*cin*iy*ix + g*cin*iy*ix + ci*iy*ix + y*ix + x] * \
430
- weight[g*rcout*cin*H*W + c*cin*H*W + ci*H*W + (y-IY)*W + (x-IX)];
431
- }
432
- }
433
- }
434
- output[B*groups*rcout*oy*ox + g*rcout*oy*ox + c*oy*ox + Y*ox + X] = acc;
435
- }
436
- """)
437
-
438
- prg.conv(ctx.cl_queue, [bs*groups*rcout, oy, ox], None,
439
- x, w, ret,
440
- np.int32(H), np.int32(W),
441
- np.int32(groups), np.int32(rcout), np.int32(cin),
442
- np.int32(oy), np.int32(ox),
443
- np.int32(iy), np.int32(ix),
444
- np.int32(ys), np.int32(xs)
445
- )
446
- return ret
447
- @staticmethod
448
- def backward(ctx, grad_output):
449
- raise Exception("not implemented")
450
-
451
- register('conv2d', Conv2D, gpu=True)
452
-
453
- class Pad2D(Function):
454
- @staticmethod
455
- def forward(ctx, x, padding=None):
456
- bs,cin,iy,ix = x.shape
457
- oy,ox = iy+padding[0]+padding[1], ix+padding[2]+padding[3] # top, bottom, left, right
458
- ret = buffer_zeros(ctx, (bs, cin, oy, ox))
459
-
460
- prg = clbuild(ctx.cl_ctx, """
461
- __kernel void pad2d(
462
- __global const float *input, __global float *output,
463
- int cin, int py, int px, int oy, int ox, int iy, int ix
464
- )
465
- {
466
- int B = get_global_id(0);
467
- int C = get_global_id(1);
468
- int Y = get_global_id(2);
469
- int iptr = B*cin*iy*ix + C*iy*ix + Y*ix;
470
- int optr = B*cin*oy*ox + C*oy*ox + (Y+py)*ox + px;
471
- for (int x = 0; x < ix; x++) {
472
- output[optr+x] = input[iptr+x];
473
- }
474
- }
475
- """)
476
- prg.pad2d(ctx.cl_queue, [bs, cin, iy], None,
477
- x, ret,
478
- np.int32(cin), np.int32(padding[0]), np.int32(padding[2]),
479
- np.int32(oy), np.int32(ox), np.int32(iy), np.int32(ix)
480
- )
481
- return ret
482
-
483
- @staticmethod
484
- def backward(ctx, grad_output):
485
- raise Exception("write this")
486
- register('pad2d', Pad2D, gpu=True)
487
-
488
- class AvgPool2D(Function):
489
- @staticmethod
490
- def forward(ctx, input, kernel_size=(2, 2)):
491
- iter_op = "group_res += input[iid]"
492
- result_op = "group_res / (kernel_size.x * kernel_size.y)"
493
- return pooling_op(ctx, input, kernel_size, iter_op, result_op)
494
-
495
- @staticmethod
496
- def backward(ctx, grad_output):
497
- # for average pooling, we need to distribute the gradient evenly across all elements in the pooling window
498
- input_shape = ctx.data.shape
499
- N, C, Y, X = input_shape
500
- py, px = ctx.kernel_size
501
- ret = buffer_zeros(ctx, input_shape)
502
-
503
- prg = clbuild(ctx.cl_ctx, """
504
- __kernel void avgpool_backward(
505
- __global float *grad_input, __global const float *grad_output,
506
- uint2 osize, uint2 isize, uint2 kernel_size, int nelem
507
- ) {
508
- int3 gid = (int3)(get_global_id(2), get_global_id(1), get_global_id(0));
509
- int oid = gid.x + osize.x*(gid.y + osize.y*gid.z);
510
- float grad = grad_output[oid] / (kernel_size.x * kernel_size.y);
511
-
512
- for (uint j=0; j<kernel_size.y; ++j) {
513
- for (uint i=0; i<kernel_size.x; ++i) {
514
- int iid = (gid.x*kernel_size.x+i) + isize.x*((gid.y*kernel_size.y+j) + isize.y*gid.z);
515
- if (iid < nelem)
516
- grad_input[iid] += grad;
517
- }
518
- }
519
- }
520
- """)
521
-
522
- osize = np.array((X//px, Y//py), dtype=cl.cltypes.uint2)
523
- isize = np.array((X, Y), dtype=cl.cltypes.uint2)
524
- ksize = np.array((px,py), dtype=cl.cltypes.uint2)
525
-
526
- prg.avgpool_backward(ctx.cl_queue, (N*C, Y//py, X//px), None, ret, grad_output, osize, isize, ksize, np.int32(input_shape.size))
527
-
528
- return ret
529
- register('avg_pool2d', AvgPool2D, gpu=True)
530
-
531
- class MaxPool2D(Function):
532
- @staticmethod
533
- def forward(ctx, input, kernel_size=(2, 2)):
534
- init_val = "FLT_MIN"
535
- iter_op = "group_res = max(group_res, input[iid])"
536
- result_op = "group_res"
537
- ret = pooling_op(ctx, input, kernel_size, iter_op, result_op, init_val=init_val)
538
-
539
- # save indices of max elements for backward pass
540
- indices = buffer_new(ctx, ret.shape)
541
- prg = clbuild(ctx.cl_ctx, """
542
- __kernel void maxpool_indices(
543
- __global const float *input, __global float *output, __global int *indices,
544
- uint2 osize, uint2 isize, uint2 kernel_size, int nelem
545
- ) {
546
- int3 gid = (int3)(get_global_id(2), get_global_id(1), get_global_id(0));
547
- int oid = gid.x + osize.x*(gid.y + osize.y*gid.z);
548
- float max_val = -INFINITY;
549
- int max_idx = 0;
550
-
551
- for (uint j=0; j<kernel_size.y; ++j) {
552
- for (uint i=0; i<kernel_size.x; ++i) {
553
- int iid = (gid.x*kernel_size.x+i) + isize.x*((gid.y*kernel_size.y+j) + isize.y*gid.z);
554
- if (iid < nelem) {
555
- float val = input[iid];
556
- if (val > max_val) {
557
- max_val = val;
558
- max_idx = iid;
559
- }
560
- }
561
- }
562
- }
563
- indices[oid] = max_idx;
564
- }
565
- """)
566
-
567
- N, C, Y, X = input.shape
568
- py, px = kernel_size
569
- osize = np.array((X//px, Y//py), dtype=cl.cltypes.uint2)
570
- isize = np.array((X, Y), dtype=cl.cltypes.uint2)
571
- ksize = np.array((px,py), dtype=cl.cltypes.uint2)
572
-
573
- prg.maxpool_indices(ctx.cl_queue, (N*C, Y//py, X//px), None, input, ret, indices, osize, isize, ksize, np.int32(input.size))
574
-
575
- ctx.save_for_backward(indices)
576
- return ret
577
-
578
- @staticmethod
579
- def backward(ctx, grad_output):
580
- indices, = ctx.saved_tensors
581
- input_shape = ctx.data.shape
582
- ret = buffer_zeros(ctx, input_shape)
583
- prg = clbuild(ctx.cl_ctx, """
584
- __kernel void maxpool_backward(
585
- __global float *grad_input, __global const float *grad_output,
586
- __global const int *indices, int nelem
587
- ) {
588
- int gid = get_global_id(0);
589
- if (gid < nelem) {
590
- int idx = indices[gid];
591
- grad_input[idx] += grad_output[gid];
592
- }
593
- }
594
- """)
595
-
596
- prg.maxpool_backward(ctx.cl_queue, [np.prod(grad_output.shape)], None, ret, grad_output, indices, np.int32(grad_output.size))
597
- return ret
598
- register('max_pool2d', MaxPool2D, gpu=True)
@@ -1 +0,0 @@
1
- Copyright (c) 2023 Kevin Buhler