froog 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- froog/__init__.py +34 -1
- froog/{gradcheck.py → gradient.py} +4 -11
- froog/ops.py +354 -87
- froog/optim.py +104 -32
- froog/tensor.py +219 -219
- froog/utils.py +8 -7
- froog-0.5.0.dist-info/METADATA +205 -0
- froog-0.5.0.dist-info/RECORD +10 -0
- {froog-0.4.0.dist-info → froog-0.5.0.dist-info}/WHEEL +1 -1
- froog/nn.py +0 -60
- froog/ops_gpu.py +0 -598
- froog-0.4.0.dist-info/LICENSE +0 -1
- froog-0.4.0.dist-info/METADATA +0 -293
- froog-0.4.0.dist-info/RECORD +0 -13
- {froog-0.4.0.dist-info → froog-0.5.0.dist-info}/top_level.txt +0 -0
froog/ops_gpu.py
DELETED
@@ -1,598 +0,0 @@
|
|
1
|
-
# _______ ______ _______ _______ _______
|
2
|
-
# | || _ | | || || |
|
3
|
-
# | ___|| | || | _ || _ || ___|
|
4
|
-
# | |___ | |_||_ | | | || | | || | __
|
5
|
-
# | ___|| __ || |_| || |_| || || |
|
6
|
-
# | | | | | || || || |_| |
|
7
|
-
# |___| |___| |_||_______||_______||_______|
|
8
|
-
#
|
9
|
-
# OpenCL kernels
|
10
|
-
|
11
|
-
import numpy as np
|
12
|
-
from .tensor import Function, register
|
13
|
-
import pyopencl as cl
|
14
|
-
import functools
|
15
|
-
|
16
|
-
def buffer_new(ctx, shape):
  """Allocate an uninitialised OpenCL buffer sized for float32 data of `shape`.

  The returned buffer is tagged with `shape` and `dtype` attributes so the
  rest of this module can treat it like a lightweight tensor.

  Args:
    ctx: object exposing the OpenCL context as `ctx.cl_ctx`.
    shape: sequence of dimension sizes.

  Returns:
    A `cl.Buffer` of `4 * prod(shape)` bytes.
  """
  # np.prod yields a NumPy scalar (a float for an empty shape); the byte size must be a plain int
  nbytes = 4 * int(np.prod(shape))
  # READ_WRITE: these buffers are written by one kernel and read by later ones;
  # reading a WRITE_ONLY buffer inside a kernel is undefined behaviour in OpenCL
  res_g = cl.Buffer(ctx.cl_ctx, cl.mem_flags.READ_WRITE, nbytes)
  res_g.shape = shape
  res_g.dtype = np.float32
  return res_g
|
21
|
-
|
22
|
-
def buffer_zeros(ctx, shape):
  """Allocate a zero-filled float32 OpenCL buffer of `shape`.

  Args:
    ctx: object exposing the OpenCL context as `ctx.cl_ctx`.
    shape: sequence of dimension sizes.

  Returns:
    A `cl.Buffer` initialised from a host array of zeros and tagged with
    `shape` and `dtype` attributes.
  """
  # host data must be float32 to match the dtype tag; np.zeros defaults to float64,
  # which would allocate twice the bytes the kernels expect
  host_zeros = np.zeros(shape, dtype=np.float32)
  # READ_WRITE: kernels accumulate into these buffers with `+=`, which reads as well as writes
  flags = cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR
  res_g = cl.Buffer(ctx.cl_ctx, flags, hostbuf=host_zeros)
  res_g.shape = shape
  res_g.dtype = np.float32
  return res_g
|
27
|
-
|
28
|
-
def buffer_like(ctx, x):
  """Allocate a new uninitialised buffer with the same shape as `x`.

  Args:
    ctx: context object forwarded to `buffer_new`.
    x: any object with a `shape` attribute.

  Returns:
    Whatever `buffer_new(ctx, x.shape)` returns.
  """
  target_shape = x.shape
  return buffer_new(ctx, target_shape)
|
30
|
-
|
31
|
-
@functools.lru_cache(maxsize=128)
def clbuild(cl_ctx, prg):
  """Compile OpenCL source `prg` for `cl_ctx`, memoising the result.

  The LRU cache is keyed on the (context, source) pair, so asking for the
  same kernel text again reuses the already-built program.

  Args:
    cl_ctx: the OpenCL context to build for.
    prg: OpenCL C source code as a string.

  Returns:
    The built `cl.Program`.
  """
  program = cl.Program(cl_ctx, prg)
  return program.build()
|
34
|
-
|
35
|
-
def binary_op(ctx, code, x, y):
  """Run an element-wise binary OpenCL kernel over two buffers.

  `code` is an OpenCL C expression over the floats `a` (element of `x`) and
  `b` (element of `y`); its value is stored in the result buffer.

  Broadcasting is limited to these cases:
    * identical shapes;
    * one 4-D operand whose last two dimensions are 1, against an operand
      sharing its first two dimensions;
    * either operand holding a single element.

  Args:
    ctx: object exposing `cl_ctx` and `cl_queue`.
    code: OpenCL C expression string, e.g. 'a+b'.
    x: first operand buffer (needs a `shape` attribute).
    y: second operand buffer (needs a `shape` attribute).

  Returns:
    A new buffer shaped like the larger operand.

  Raises:
    Exception: if the shapes differ and match none of the broadcast cases.
  """
  # The kernel reads a_g[gid/xdiv] and b_g[gid/ydiv]; a divisor greater than 1
  # makes consecutive output elements reuse the same input element.
  xdiv = 1
  ydiv = 1
  if x.shape != y.shape:
    # special case broadcasting
    if len(y.shape) == 4 and x.shape[0:2] == y.shape[0:2] and y.shape[2] == 1 and y.shape[3] == 1:
      ydiv = x.shape[2] * x.shape[3]
    elif len(y.shape) == 4 and x.shape[0:2] == y.shape[0:2] and x.shape[2] == 1 and x.shape[3] == 1:
      xdiv = y.shape[2] * y.shape[3]
    elif np.prod(y.shape) == 1:
      ydiv = np.prod(x.shape)
    elif np.prod(x.shape) == 1:
      # mirror of the case above: a single-element x is broadcast over all of y
      xdiv = np.prod(y.shape)
    else:
      raise Exception(f"binary op shape mismatch: {x.shape} != {y.shape}")
  ret = buffer_like(ctx, x if np.prod(x.shape) >= np.prod(y.shape) else y)
  prg = clbuild(ctx.cl_ctx, """
__kernel void binop( __global const float *a_g, __global const float *b_g, __global float *res_g, int xdiv, int ydiv) {
int gid = get_global_id(0);
float a = a_g[gid/xdiv];
float b = b_g[gid/ydiv];
res_g[gid] = """+code+""";
}
""")
  # one work item per output element
  prg.binop(ctx.cl_queue, [np.prod(ret.shape)], None, x, y, ret, np.int32(xdiv), np.int32(ydiv))
  return ret
|
61
|
-
|
62
|
-
def unary_op(ctx, code, x):
  """Run an element-wise unary OpenCL kernel over `x`.

  `code` is an OpenCL C expression over the float `a` (one element of `x`);
  its value becomes the matching element of the result.

  Args:
    ctx: object exposing `cl_ctx` and `cl_queue`.
    code: OpenCL C expression string, e.g. '-a'.
    x: input buffer (needs a `shape` attribute).

  Returns:
    A new buffer with the same shape as `x`.
  """
  out = buffer_like(ctx, x)
  kernel_src = """
__kernel void unop(
__global const float *a_g, __global float *res_g)
{
int gid = get_global_id(0);
float a = a_g[gid];
res_g[gid] = """+code+""";
}
"""
  program = clbuild(ctx.cl_ctx, kernel_src)
  # one work item per element
  global_size = [np.prod(out.shape)]
  program.unop(ctx.cl_queue, global_size, None, x, out)
  return out
|
75
|
-
|
76
|
-
@functools.lru_cache(maxsize=128)
def cl_pooling_krnl_build(cl_ctx, iter_op, result_op, init_val=0):
  """Build (and cache) the `subsample` pooling kernel for a given reduction.

  The kernel walks every input element of one pooling window, applying
  `iter_op` to each, and finally stores `result_op` in the output.

  Args:
    cl_ctx: the OpenCL context to build for.
    iter_op: OpenCL C statement (without trailing ';') run per window
      element; it may use `group_res`, `input` and `iid`.
    result_op: OpenCL C expression written to `output[oid]`.
    init_val: initial value of the accumulator `group_res`, inserted via `str()`.

  Returns:
    The built program as returned by `clbuild`.
  """
  pieces = [
    """
__kernel void subsample(
__global float *output, __global const float *input, uint2 osize, uint2 isize, uint2 kernel_size, int nelem
) {
int3 gid = (int3)(get_global_id(2), get_global_id(1), get_global_id(0));
int oid = gid.x + osize.x*(gid.y + osize.y*gid.z);
float group_res = """,
    str(init_val),
    """;
for (uint j=0; j<kernel_size.y; ++j) {
for (uint i=0; i<kernel_size.x; ++i) {
int iid = (gid.x*kernel_size.x+i) + isize.x*((gid.y*kernel_size.y+j) + isize.y*gid.z);
if (iid < nelem)
""",
    iter_op,
    """;
}
}
output[oid] = """,
    result_op,
    """;
}
""",
  ]
  source = "".join(pieces)
  return clbuild(cl_ctx, source)
|
96
|
-
|
97
|
-
def pooling_op(ctx, input, kernel_size, iter_op, result_op, init_val=0):
  """Apply a windowed pooling reduction to a 4-D buffer.

  The last two dimensions of `input` are divided into non-overlapping
  windows of `kernel_size`; rows/columns left over after floor division are
  ignored.

  Args:
    ctx: object exposing `cl_ctx` and `cl_queue`; `ctx.data` is set as a side effect.
    input: 4-D buffer; its shape is unpacked as (N, C, Y, X).
    kernel_size: (py, px) window size.
    iter_op: per-element OpenCL statement, see `cl_pooling_krnl_build`.
    result_op: OpenCL expression producing the pooled value.
    init_val: initial accumulator value.

  Returns:
    A new buffer of shape (N, C, Y//py, X//px).
  """
  N, C, Y, X = input.shape
  py, px = kernel_size
  out_y, out_x = Y//py, X//px
  ret = buffer_new(ctx, (N, C, out_y, out_x))
  # the kernel expects (x, y) ordered pairs
  osize = np.array((out_x, out_y), dtype=cl.cltypes.uint2)
  isize = np.array((X, Y), dtype=cl.cltypes.uint2)
  ksize = np.array((px, py), dtype=cl.cltypes.uint2)
  prg = cl_pooling_krnl_build(ctx.cl_ctx, iter_op, result_op, init_val=init_val)
  # Element count for the kernel's `iid < nelem` guard. Derived from the shape:
  # a pyopencl buffer's `size` is its length in bytes, not in elements.
  nelem = np.int32(N*C*Y*X)
  prg.subsample(ctx.cl_queue, (N*C, out_y, out_x), None, ret, input, osize, isize, ksize, nelem)
  ctx.data = np.empty((N, C, Y, X)) # set shape expectation on tensor instance
  return ret
|
108
|
-
|
109
|
-
class Add(Function):
  """Element-wise addition on the GPU."""

  @staticmethod
  def forward(ctx, x, y):
    """Return a buffer holding x + y (see `binary_op` for broadcasting)."""
    total = binary_op(ctx, "a+b", x, y)
    return total

  @staticmethod
  def backward(ctx, grad_output):
    """Pass the incoming gradient unchanged to both operands."""
    grad_x = grad_output
    grad_y = grad_output
    return grad_x, grad_y
register("add", Add, gpu=True)
|
118
|
-
|
119
|
-
class Sub(Function):
  """Element-wise subtraction on the GPU."""

  @staticmethod
  def forward(ctx, x, y):
    """Return a buffer holding x - y (see `binary_op` for broadcasting)."""
    difference = binary_op(ctx, "a-b", x, y)
    return difference

  @staticmethod
  def backward(ctx, grad_output):
    """Return (grad_output, -grad_output) for x and y respectively."""
    negated = unary_op(ctx, "-a", grad_output)
    return grad_output, negated
register("sub", Sub, gpu=True)
|
129
|
-
|
130
|
-
class Mul(Function):
  """Element-wise multiplication on the GPU."""

  @staticmethod
  def forward(ctx, x, y):
    """Save both operands for backward and return x * y."""
    ctx.save_for_backward(x, y)
    product = binary_op(ctx, "a*b", x, y)
    return product

  @staticmethod
  def backward(ctx, grad_output):
    """Return (y * grad_output, x * grad_output)."""
    x, y = ctx.saved_tensors
    grad_x = binary_op(ctx, "a*b", y, grad_output)
    grad_y = binary_op(ctx, "a*b", x, grad_output)
    return grad_x, grad_y
register("mul", Mul, gpu=True)
|
142
|
-
|
143
|
-
class Pow(Function):
  """Element-wise power x ** y on the GPU."""

  @staticmethod
  def forward(ctx, x, y):
    """Save both operands for backward and return pow(x, y)."""
    ctx.save_for_backward(x, y)
    powered = binary_op(ctx, "pow(a,b)", x, y)
    return powered

  @staticmethod
  def backward(ctx, grad_output):
    """Return the gradients with respect to base and exponent.

    d/dx = y * x**(y-1) and d/dy = x**y * log(x), each multiplied by
    `grad_output`.
    """
    x, y = ctx.saved_tensors
    # local derivative w.r.t. the base, then chain with the incoming gradient
    dbase = binary_op(ctx, "b * (pow((float)a, (float)(b-1.0)));", x, y)
    gradx = binary_op(ctx, "a*b", grad_output, dbase)
    # local derivative w.r.t. the exponent, then chain with the incoming gradient
    dexp = binary_op(ctx, "pow((float)a, (float)b) * log(a);", x, y)
    grady = binary_op(ctx, "a*b", grad_output, dexp)
    return gradx, grady
register("pow", Pow, gpu=True)
|
158
|
-
|
159
|
-
class Sum(Function):
  """Sum of all elements of a buffer, producing a single-element buffer."""

  @staticmethod
  def forward(ctx, input):
    """Return a buffer of shape (1,) holding the sum of every element of `input`."""
    ctx.save_for_backward(input)
    ret = buffer_new(ctx, (1,)) # buffer of size 1, which will hold the sum
    prg = clbuild(ctx.cl_ctx, """
__kernel void sum(
__global const float *a_g, int sz, __global float *res_g)
{
float out = 0.0;
for (int x = 0; x < sz; x++) {
out += a_g[x];
}
res_g[0] = out;
}
""")
    # The kernel ignores its global id and reduces the whole buffer serially, so a
    # single work item is enough; launching more only repeats the same work and
    # makes every work item write res_g[0].
    prg.sum(ctx.cl_queue, [1], None, input, np.int32(np.prod(input.shape)), ret)
    return ret

  @staticmethod
  def backward(ctx, grad_output):
    """Broadcast the scalar gradient to a buffer shaped like the saved input."""
    input, = ctx.saved_tensors
    ret = buffer_like(ctx, input)
    prg = clbuild(ctx.cl_ctx, """
__kernel void fill(
__global const float *a_g, __global float *res_g)
{
int gid = get_global_id(0);
res_g[gid] = a_g[0];
}
""")
    # one work item per input element, each copying grad_output[0]
    prg.fill(ctx.cl_queue, [np.prod(ret.shape)], None, grad_output, ret)
    return ret
register('sum', Sum, gpu=True)
|
193
|
-
|
194
|
-
class Dot(Function):
  """
  2-D matrix product on the GPU.

  One `matmul` kernel serves the forward product and both gradients: each
  operand is addressed as base[gid * row_stride + i * col_stride], so passing
  swapped strides reads an operand as its transpose.
  """
  @staticmethod
  def forward(ctx, input, weight):
    """Return input @ weight; saves both operands and the built program."""
    # inner dims must match for dot product
    assert input.shape[1] == weight.shape[0]
    rows = np.int32(input.shape[0])
    inner = np.int32(input.shape[1])
    cols = np.int32(weight.shape[1])
    unit = np.int32(1)
    ret = buffer_new(ctx, (rows, cols))

    kernel_src = """
__kernel void matmul(
__global const float *input,
__global const float *weight,
__global float *res,
int input_row_size,
int input_col_size,
int msize,
int weight_row_size,
int weight_col_size,
int osize
)
{
int gid_y = get_global_id(0); // row index
int gid_x = get_global_id(1); // col index

float acc = 0.0;
for (int i = 0; i < msize; i++) {
acc += input[gid_y * input_row_size + i * input_col_size] * weight[gid_x * weight_row_size + i * weight_col_size];
}
res[gid_y * osize + gid_x] = acc;
}
"""
    prg = clbuild(ctx.cl_ctx, kernel_src)
    ctx.save_for_backward(input, weight, prg)
    # (rows,inner) x (inner,cols) = (rows,cols)
    prg.matmul(
      ctx.cl_queue, [rows, cols], None,
      input, weight, ret,
      inner, unit, inner, unit, cols, cols,
    )
    return ret

  @staticmethod
  def backward(ctx, grad_output):
    """Return (grad_input, grad_weight) using the program saved by forward."""
    input, weight, prg = ctx.saved_tensors
    rows = np.int32(input.shape[0])
    inner = np.int32(input.shape[1])
    cols = np.int32(weight.shape[1])
    unit = np.int32(1)

    grad_input = buffer_like(ctx, input)
    grad_weight = buffer_like(ctx, weight)

    # grad_input = grad_output x weight^T : (rows,cols) x (inner,cols) = (rows,inner)
    prg.matmul(
      ctx.cl_queue, [rows, inner], None,
      grad_output, weight, grad_input,
      cols, unit, cols, cols, unit, inner,
    )

    # grad_weight = input^T x grad_output : (rows,inner) x (rows,cols) = (inner,cols)
    prg.matmul(
      ctx.cl_queue, [inner, cols], None,
      input, grad_output, grad_weight,
      unit, inner, rows, unit, cols, cols,
    )

    return grad_input, grad_weight
register("dot", Dot, gpu=True)
register("matmul", Dot, gpu=True)
|
261
|
-
|
262
|
-
# ***** SIMPLE OPS ********
|
263
|
-
|
264
|
-
class Reshape(Function):
  """Change the shape tag of a buffer without touching its data."""

  @staticmethod
  def forward(ctx, x, shape):
    """Set `x.shape` to `shape` and return `x` itself (no copy is made).

    Entries equal to -1 are replaced by the total element count divided by
    the product of the remaining entries. The original shape is saved for
    backward.
    """
    ctx.save_for_backward(x.shape)
    dims = list(shape)

    # product of all explicitly given dimensions
    known = 1
    for dim in (d for d in dims if d != -1):
      known *= dim
    # fill in every -1 placeholder from the total element count
    dims = [np.prod(x.shape) // known if dim == -1 else dim for dim in dims]
    assert np.prod(x.shape) == np.prod(dims)
    x.shape = tuple(dims)
    return x

  @staticmethod
  def backward(ctx, grad_output):
    """Restore the saved input shape on `grad_output` and return it."""
    original_shape, = ctx.saved_tensors
    grad_output.shape = original_shape
    return grad_output
register("reshape", Reshape, gpu=True)
|
288
|
-
|
289
|
-
# ***** ACTIVATION OPS ********
|
290
|
-
|
291
|
-
class ReLU(Function):
  """Rectified linear unit, max(x, 0), on the GPU."""

  @staticmethod
  def forward(ctx, input):
    """Save `input` for backward and return max(input, 0) element-wise."""
    ctx.save_for_backward(input)
    activated = unary_op(ctx, "max(a, (float)0.);", input)
    return activated

  @staticmethod
  def backward(ctx, grad_output):
    """Pass the gradient through where the saved input is >= 0, zero elsewhere."""
    saved_input, = ctx.saved_tensors
    masked = binary_op(ctx, "a * (b >= 0);", grad_output, saved_input)
    return masked
register("relu", ReLU, gpu=True)
|
302
|
-
|
303
|
-
class LogSoftmax(Function):
  """Row-wise log-softmax of a 2-D buffer on the GPU."""

  @staticmethod
  def forward(ctx, input):
    """Return input - logsumexp(input) per row and save the output for backward.

    Runs three kernels: per-row maximum, per-row log-sum-exp (shifted by the
    maximum for numerical stability), and the final subtraction.
    """
    n_rows = input.shape[0]
    row_len = np.int32(input.shape[1])

    # step 1: per-row maximum
    row_max = buffer_new(ctx, (n_rows,))
    max_prg = clbuild(ctx.cl_ctx, """
__kernel void max_vals(
__global const float *a_g, int sz, __global float *res_g)
{
int gid = get_global_id(0);
int gidsz = gid*sz;
float max_val = -INFINITY;
for (int x = 0; x < sz; x++) {
max_val = max(max_val, a_g[gidsz+x]);
}
res_g[gid] = max_val;
}
""")
    max_prg.max_vals(ctx.cl_queue, [n_rows], None, input, row_len, row_max)

    # step 2: per-row log(sum(exp(x - max))) + max
    row_lse = buffer_new(ctx, (n_rows,))
    lse_prg = clbuild(ctx.cl_ctx, """
__kernel void logsoftmax(
__global const float *a_g, __global const float *max_vals, int sz, __global float *res_g)
{
int gid = get_global_id(0);
int gidsz = gid*sz;
float max_val = max_vals[gid];
float out = 0.0;
for (int x = 0; x < sz; x++) {
out += exp(a_g[gidsz+x] - max_val);
}
res_g[gid] = log(out) + max_val;
}
""")
    lse_prg.logsoftmax(ctx.cl_queue, [n_rows], None, input, row_max, row_len, row_lse)

    # step 3: subtract each row's log-sum-exp from its elements
    output = buffer_like(ctx, input)
    sub_prg = clbuild(ctx.cl_ctx, """
__kernel void lsmsub(
__global const float *a_g, __global const float *b_g, int sz, __global float *res_g)
{
int gid = get_global_id(0);
int gid2 = get_global_id(1);
res_g[gid*sz + gid2] = a_g[gid*sz + gid2] - b_g[gid];
}
""")
    sub_prg.lsmsub(ctx.cl_queue, [n_rows, input.shape[1]], None, input, row_lse, row_len, output)
    ctx.save_for_backward(output)
    return output

  @staticmethod
  def backward(ctx, grad_output):
    """Return grad_output - exp(output) * rowsum(grad_output)."""
    output, = ctx.saved_tensors

    grad_input = buffer_like(ctx, grad_output)
    bwd_prg = clbuild(ctx.cl_ctx, """
__kernel void lsmsub2(
__global const float *grad_output, __global const float *output, int sz, __global float *grad_input)
{
int gid = get_global_id(0);
int gidsz = gid*sz;
int gid2 = get_global_id(1);
// TODO: this is repeated in many kernels
float acc = 0.0;
for (int x = 0; x < sz; x++) {
acc += grad_output[gidsz + x];
}
grad_input[gidsz + gid2] = grad_output[gidsz + gid2] - exp(output[gidsz + gid2]) * acc;
}
""")
    # one work item per element; each recomputes its row's gradient sum
    launch_shape = [grad_output.shape[0], grad_output.shape[1]]
    bwd_prg.lsmsub2(
      ctx.cl_queue, launch_shape, None,
      grad_output, output, np.int32(grad_output.shape[1]), grad_input,
    )

    return grad_input
register("logsoftmax", LogSoftmax, gpu=True)
|
381
|
-
|
382
|
-
class Sigmoid(Function):
  """Logistic sigmoid, 1 / (1 + exp(-x)), on the GPU."""

  @staticmethod
  def forward(ctx, input):
    """Return sigmoid(input) and save the result for backward."""
    activated = unary_op(ctx, "1./(1+exp(-a))", input)
    ctx.save_for_backward(activated)
    return activated

  @staticmethod
  def backward(ctx, grad_output):
    """Return grad_output * s * (1 - s), where s is the saved forward output."""
    activated, = ctx.saved_tensors
    grad_input = binary_op(ctx, "a * (b * (1 - b));", grad_output, activated)
    return grad_input
register("sigmoid", Sigmoid, gpu=True)
|
394
|
-
|
395
|
-
# ***** CONV OPS ********
|
396
|
-
|
397
|
-
class Conv2D(Function):
  """Grouped 2-D convolution (forward only) on the GPU."""

  @staticmethod
  def forward(ctx, x, w, stride=1, groups=1):
    """Convolve `x` with the filters `w`.

    The stride and group count are read from `ctx.stride` and `ctx.groups`
    (ctx stores the function params); an int stride is expanded to a
    (ys, xs) pair and written back to `ctx.stride`.

    Args:
      ctx: object exposing `cl_ctx`, `cl_queue`, `stride` and `groups`.
      x: input buffer; shape unpacked as (bs, cin*groups, iy, ix).
      w: weight buffer; shape unpacked as (cout, cin, H, W).
      stride: kept for interface compatibility; `ctx.stride` is what is used.
      groups: kept for interface compatibility; `ctx.groups` is what is used.

    Returns:
      A new buffer of shape (bs, cout, oy, ox), where
      oy = (iy-(H-ys))//ys and ox = (ix-(W-xs))//xs.
    """
    if isinstance(ctx.stride, int): # ctx stores function params
      ctx.stride = (ctx.stride, ctx.stride)
    # Use ctx.groups everywhere: the asserts and rcout below already rely on it, so
    # sizing the launch from the `groups` argument could disagree with them.
    n_groups = ctx.groups
    cout,cin,H,W = w.shape
    ys,xs = ctx.stride
    bs,cin_,iy,ix = x.shape
    oy,ox = (iy-(H-ys))//ys, (ix-(W-xs))//xs
    assert cin*n_groups == cin_
    assert cout % n_groups == 0
    rcout = cout//n_groups
    # output buffer
    ret = buffer_new(ctx, (bs, cout, oy, ox))
    prg = clbuild(ctx.cl_ctx, """
__kernel void conv(__global const float *input, __global const float *weight, __global float *output,
int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs) {
int B = get_global_id(0)/(groups*rcout); // range 0-bs
int g = (get_global_id(0)/rcout)%groups;
int c = get_global_id(0) % rcout;
int Y = get_global_id(1); // range 0-oy
int X = get_global_id(2); // range 0-ox
int IY = Y*ys;
int IX = X*xs;

// input = (bs, groups, cin, iy, ix)
// weight = (groups, rcout, cin, H, W)
// output = (bs, groups, rcout, oy, ox)
float acc = 0.0;
for (int ci = 0; ci < cin; ci++) {
for (int y = IY; y < IY+H; y++) {
for (int x = IX; x < IX+W; x++) {
acc += input[B*groups*cin*iy*ix + g*cin*iy*ix + ci*iy*ix + y*ix + x] * \
weight[g*rcout*cin*H*W + c*cin*H*W + ci*H*W + (y-IY)*W + (x-IX)];
}
}
}
output[B*groups*rcout*oy*ox + g*rcout*oy*ox + c*oy*ox + Y*ox + X] = acc;
}
""")

    # one work item per output element: (batch*group*channel-in-group, oy, ox)
    prg.conv(ctx.cl_queue, [bs*n_groups*rcout, oy, ox], None,
      x, w, ret,
      np.int32(H), np.int32(W),
      np.int32(n_groups), np.int32(rcout), np.int32(cin),
      np.int32(oy), np.int32(ox),
      np.int32(iy), np.int32(ix),
      np.int32(ys), np.int32(xs)
    )
    return ret

  @staticmethod
  def backward(ctx, grad_output):
    """Not available on the GPU yet.

    Raises:
      NotImplementedError: always (a subclass of Exception, so existing
        `except Exception` handlers keep working).
    """
    raise NotImplementedError("not implemented")

register('conv2d', Conv2D, gpu=True)
|
452
|
-
|
453
|
-
class Pad2D(Function):
  """Zero-pad the last two dimensions of a 4-D buffer (forward only)."""

  @staticmethod
  def forward(ctx, x, padding=None):
    """Copy `x` into a larger zero-filled buffer.

    Args:
      ctx: object exposing `cl_ctx` and `cl_queue`.
      x: input buffer; shape unpacked as (bs, cin, iy, ix).
      padding: indexable of four ints ordered top, bottom, left, right.

    Returns:
      A new buffer of shape (bs, cin, iy+top+bottom, ix+left+right).
    """
    bs, cin, iy, ix = x.shape
    # top, bottom, left, right
    pad_top, pad_bottom, pad_left, pad_right = padding[0], padding[1], padding[2], padding[3]
    oy = iy + pad_top + pad_bottom
    ox = ix + pad_left + pad_right
    ret = buffer_zeros(ctx, (bs, cin, oy, ox))

    kernel_src = """
__kernel void pad2d(
__global const float *input, __global float *output,
int cin, int py, int px, int oy, int ox, int iy, int ix
)
{
int B = get_global_id(0);
int C = get_global_id(1);
int Y = get_global_id(2);
int iptr = B*cin*iy*ix + C*iy*ix + Y*ix;
int optr = B*cin*oy*ox + C*oy*ox + (Y+py)*ox + px;
for (int x = 0; x < ix; x++) {
output[optr+x] = input[iptr+x];
}
}
"""
    prg = clbuild(ctx.cl_ctx, kernel_src)
    # one work item per input row; each copies ix elements to the shifted position
    prg.pad2d(
      ctx.cl_queue, [bs, cin, iy], None,
      x, ret,
      np.int32(cin), np.int32(pad_top), np.int32(pad_left),
      np.int32(oy), np.int32(ox), np.int32(iy), np.int32(ix),
    )
    return ret

  @staticmethod
  def backward(ctx, grad_output):
    """Not written yet; always raises."""
    raise Exception("write this")
register("pad2d", Pad2D, gpu=True)
|
487
|
-
|
488
|
-
class AvgPool2D(Function):
  """Average pooling over non-overlapping 2-D windows on the GPU."""

  @staticmethod
  def forward(ctx, input, kernel_size=(2, 2)):
    """Return the mean of each `kernel_size` window of `input` (see `pooling_op`)."""
    iter_op = "group_res += input[iid]"
    result_op = "group_res / (kernel_size.x * kernel_size.y)"
    return pooling_op(ctx, input, kernel_size, iter_op, result_op)

  @staticmethod
  def backward(ctx, grad_output):
    """Spread each output gradient evenly over its pooling window.

    Reads the input shape from `ctx.data.shape` (set by `pooling_op`) and the
    window size from `ctx.kernel_size`.

    Returns:
      A zero-initialised buffer shaped like the forward input with
      grad_output / (kx * ky) added to every element of each window.
    """
    # for average pooling, we need to distribute the gradient evenly across all elements in the pooling window
    input_shape = ctx.data.shape
    N, C, Y, X = input_shape
    py, px = ctx.kernel_size
    ret = buffer_zeros(ctx, input_shape)

    prg = clbuild(ctx.cl_ctx, """
__kernel void avgpool_backward(
__global float *grad_input, __global const float *grad_output,
uint2 osize, uint2 isize, uint2 kernel_size, int nelem
) {
int3 gid = (int3)(get_global_id(2), get_global_id(1), get_global_id(0));
int oid = gid.x + osize.x*(gid.y + osize.y*gid.z);
float grad = grad_output[oid] / (kernel_size.x * kernel_size.y);

for (uint j=0; j<kernel_size.y; ++j) {
for (uint i=0; i<kernel_size.x; ++i) {
int iid = (gid.x*kernel_size.x+i) + isize.x*((gid.y*kernel_size.y+j) + isize.y*gid.z);
if (iid < nelem)
grad_input[iid] += grad;
}
}
}
""")

    # the kernel expects (x, y) ordered pairs
    osize = np.array((X//px, Y//py), dtype=cl.cltypes.uint2)
    isize = np.array((X, Y), dtype=cl.cltypes.uint2)
    ksize = np.array((px,py), dtype=cl.cltypes.uint2)
    # input_shape is a plain tuple (it has no `.size`), so count elements via np.prod
    nelem = np.int32(np.prod(input_shape))

    prg.avgpool_backward(ctx.cl_queue, (N*C, Y//py, X//px), None, ret, grad_output, osize, isize, ksize, nelem)

    return ret
register('avg_pool2d', AvgPool2D, gpu=True)
|
530
|
-
|
531
|
-
class MaxPool2D(Function):
  """Max pooling over non-overlapping 2-D windows on the GPU."""

  @staticmethod
  def forward(ctx, input, kernel_size=(2, 2)):
    """Return the maximum of each `kernel_size` window of `input`.

    A second kernel records, for every output element, the flat input index
    of the window's maximum; those indices are saved for backward.
    """
    # Start from -INFINITY, matching the index kernel below. FLT_MIN is the smallest
    # *positive* float, so it would win over every value in an all-negative window.
    init_val = "-INFINITY"
    iter_op = "group_res = max(group_res, input[iid])"
    result_op = "group_res"
    ret = pooling_op(ctx, input, kernel_size, iter_op, result_op, init_val=init_val)

    # save indices of max elements for backward pass
    # NOTE: buffer_new tags the buffer float32, but the kernel stores 4-byte ints in it
    indices = buffer_new(ctx, ret.shape)
    prg = clbuild(ctx.cl_ctx, """
__kernel void maxpool_indices(
__global const float *input, __global float *output, __global int *indices,
uint2 osize, uint2 isize, uint2 kernel_size, int nelem
) {
int3 gid = (int3)(get_global_id(2), get_global_id(1), get_global_id(0));
int oid = gid.x + osize.x*(gid.y + osize.y*gid.z);
float max_val = -INFINITY;
int max_idx = 0;

for (uint j=0; j<kernel_size.y; ++j) {
for (uint i=0; i<kernel_size.x; ++i) {
int iid = (gid.x*kernel_size.x+i) + isize.x*((gid.y*kernel_size.y+j) + isize.y*gid.z);
if (iid < nelem) {
float val = input[iid];
if (val > max_val) {
max_val = val;
max_idx = iid;
}
}
}
}
indices[oid] = max_idx;
}
""")

    N, C, Y, X = input.shape
    py, px = kernel_size
    # the kernel expects (x, y) ordered pairs
    osize = np.array((X//px, Y//py), dtype=cl.cltypes.uint2)
    isize = np.array((X, Y), dtype=cl.cltypes.uint2)
    ksize = np.array((px,py), dtype=cl.cltypes.uint2)
    # element count for the bounds check; a pyopencl buffer's `size` is in bytes
    nelem = np.int32(N*C*Y*X)

    # `ret` is passed as the kernel's `output` argument, which the kernel does not write
    prg.maxpool_indices(ctx.cl_queue, (N*C, Y//py, X//px), None, input, ret, indices, osize, isize, ksize, nelem)

    ctx.save_for_backward(indices)
    return ret

  @staticmethod
  def backward(ctx, grad_output):
    """Route each output gradient to the input position that held the maximum.

    Reads the input shape from `ctx.data.shape` (set by `pooling_op`).

    Returns:
      A zero-initialised buffer shaped like the forward input, with each
      output gradient added at its saved index.
    """
    indices, = ctx.saved_tensors
    input_shape = ctx.data.shape
    ret = buffer_zeros(ctx, input_shape)
    prg = clbuild(ctx.cl_ctx, """
__kernel void maxpool_backward(
__global float *grad_input, __global const float *grad_output,
__global const int *indices, int nelem
) {
int gid = get_global_id(0);
if (gid < nelem) {
int idx = indices[gid];
grad_input[idx] += grad_output[gid];
}
}
""")

    # one work item per output-gradient element; the guard uses the same element count
    n_out = np.prod(grad_output.shape)
    prg.maxpool_backward(ctx.cl_queue, [n_out], None, ret, grad_output, indices, np.int32(n_out))
    return ret
register('max_pool2d', MaxPool2D, gpu=True)
|
froog-0.4.0.dist-info/LICENSE
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
Copyright (c) 2023 Kevin Buhler
|