tensor_stream-opencl 0.2.3 → 0.2.4
- checksums.yaml +4 -4
- data/lib/tensor_stream/opencl/array_ops.rb +10 -4
- data/lib/tensor_stream/opencl/kernels/conv2d.cl +9 -6
- data/lib/tensor_stream/opencl/kernels/conv2d_backprop_filter.cl +15 -5
- data/lib/tensor_stream/opencl/kernels/conv2d_backprop_input.cl +13 -5
- data/lib/tensor_stream/opencl/math_ops.rb +11 -4
- data/lib/tensor_stream/opencl/nn_ops.rb +78 -9
- data/lib/tensor_stream/opencl/opencl_evaluator.rb +15 -5
- data/lib/tensor_stream/opencl/version.rb +1 -1
- data/samples/mnist_data_2.2.rb +5 -5
- data/samples/mnist_data_2.3.rb +6 -5
- data/samples/mnist_data_3.0.rb +145 -0
- data/tensor_stream-opencl.gemspec +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3e4aa123289372c651cd4da3e7c206abc4f9f67a551d4062180c5cf6555dc243
+  data.tar.gz: 6517954207c85f56cd08b2892b0119d4bb7a35e2d4bd9b9cacc5d3c9ccfb9e42
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7f61d61be79dd1e06ebfdc77ed2dff9e717e0cdb292160fe20c9ca08693d867e1b0e0350c71db5d24feb4671a26e793f44d6b80762c384193c1985b6b1616376
+  data.tar.gz: 72c32530717fac8ff947ce4b204535755134bde14e0f70d0d120ff101b5654843312186317cb480fd5e1c620a25328a3590b1f35193faf1d196e7ad631d169b0
data/lib/tensor_stream/opencl/array_ops.rb
CHANGED
@@ -422,10 +422,16 @@ module TensorStream
         a = inputs[0]
         if a.data_type != tensor.data_type
           buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
-
-
-
-
+          work_group = if inputs[0].shape.size > 2
+                         [ inputs[0].shape.reduce(:*) / inputs[0].shape.last, inputs[0].shape.last]
+                       else
+                         m, n = inputs[0].shape
+                         [m || 1, n || 1]
+                       end
+
+          cl_m = OpenCL::Int1.new(work_group[0])
+          cl_n = OpenCL::Int1.new(work_group[1])
+
           event_wait_list = build_event_wait_list(inputs)
           buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
           buffer
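A note on the change above: rather than assuming a rank ≤ 2 input, the cast op now folds all leading dimensions into the first work dimension. A minimal standalone sketch of the same flattening rule (plain Ruby; the helper name is illustrative, not part of the gem):

    # Collapse an arbitrary-rank shape into a 2-D [rows, cols] work group,
    # mirroring the branch added above.
    def flatten_work_group(shape)
      if shape.size > 2
        [shape.reduce(:*) / shape.last, shape.last]
      else
        m, n = shape
        [m || 1, n || 1]
      end
    end

    flatten_work_group([32, 28, 28, 3]) # => [25088, 3], i.e. 32 * 28 * 28 rows
    flatten_work_group([5, 7])          # => [5, 7]
    flatten_work_group([9])             # => [9, 1]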
data/lib/tensor_stream/opencl/kernels/conv2d.cl
CHANGED
@@ -1,27 +1,30 @@
 % ctype = dtype_to_c_type(dtype)
 
-__kernel void conv2d(const int height, const int width, __global const <%= ctype %> *images, __global const <%= ctype %> *filter, __global <%= ctype %> *output) {
+__kernel void conv2d(const int height, const int width, const int out_height, const int out_width, __global const <%= ctype %> *images, __global const <%= ctype %> *filter, __global <%= ctype %> *output) {
   // Get the index of the current element to be processed
   const int batch_index = get_global_id(0);
   const int h_index = get_global_id(1);
   const int w_index = get_global_id(2);
-  const int h_index_with_stride = h_index * <%= stride[0] %>;
-  const int w_index_with_stride = w_index * <%= stride[1] %>;
+  const int h_index_with_stride = h_index * <%= stride[0] %> - <%= padding[0] %>;
+  const int w_index_with_stride = w_index * <%= stride[1] %> - <%= padding[1] %>;
 
   const int image_index = batch_index * height * width * <%= ch %>;
   const int image_row_width = width * <%= ch %>;
+  const int out_image_row_size = out_height * out_width * <%= out_ch %>;
 
   for (int out_channel_index = 0; out_channel_index < <%= out_ch %>; out_channel_index++) {
     <%= ctype %> sum = 0;
     for (int channel_index = 0; channel_index < <%= ch %>; channel_index++) {
       for(int y = 0; y < <%= fh %>; y++) {
         for (int x = 0; x < <%= fw %>; x++) {
-          if ( (h_index_with_stride + y) < height && (w_index_with_stride + x) < width
-
+          if ( (h_index_with_stride + y) < height && (w_index_with_stride + x) < width &&
+               (h_index_with_stride + y) >= 0 && (w_index_with_stride + x) >= 0) {
+            <%= ctype %> f = filter[y*<%= fw * ch * out_ch %> + x*<%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
+            sum += images[image_index + (h_index_with_stride + y)*image_row_width + (w_index_with_stride + x)*<%= ch %> + channel_index] * f;
          }
        }
      }
    }
-    output[batch_index *
+    output[batch_index * out_image_row_size + h_index * out_width * <%= out_ch %> + w_index * <%= out_ch %> + out_channel_index ] = sum;
  }
}
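The essential change in this kernel: padding[0]/padding[1] shift the sampling window, so h_index_with_stride and w_index_with_stride can now go negative and the bounds check gains >= 0 guards; the output is also indexed by the explicit out_height/out_width rather than the input size. The output dimensions follow the standard padded-convolution formula, sketched here in Ruby (helper name illustrative):

    # Output size of a padded, strided convolution along one axis;
    # this mirrors the out_h / out_w computation added in nn_ops.rb below.
    def conv_out_dim(input, filter, pad_before, pad_after, stride)
      (input - filter + pad_before + pad_after) / stride + 1
    end

    conv_out_dim(28, 5, 2, 2, 1) # => 28 ('SAME', stride 1: size preserved)
    conv_out_dim(28, 5, 1, 2, 2) # => 14 ('SAME', stride 2: size halved)
    conv_out_dim(28, 5, 0, 0, 1) # => 24 ('VALID': no padding)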
data/lib/tensor_stream/opencl/kernels/conv2d_backprop_filter.cl
CHANGED
@@ -1,21 +1,31 @@
 % ctype = dtype_to_c_type(dtype)
 
-__kernel void conv2d_backprop_filter(const int batch_size, const int height, const int width, __global const <%= ctype %> *images, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
+__kernel void conv2d_backprop_filter(const int batch_size, const int height, const int width, const int out_height, const int out_width, __global const <%= ctype %> *images, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
   // Get the index of the current element to be processed
   const int fh_index = get_global_id(0);
   const int fw_index = get_global_id(1);
   const int f_out_channel = get_global_id(2);
   const int image_size = height * width * <%= ch %>;
-  const int grad_image_row_width =
+  const int grad_image_row_width = out_width * <%= out_ch %>;
+  const int grad_image_size = out_height * out_width * <%= out_ch %>;
 
   for(int channel = 0; channel < <%= ch %>; channel++) {
     <%= ctype %> grad_sum = 0.0;
     for(int batch = 0; batch < batch_size; batch++) {
-
+      int image_index = batch * grad_image_size;
       for(int y = 0; y < height; y++) {
         for (int x = 0; x < width; x++) {
-
-
+          int y_offset = y - fh_index + <%= padding[0] %>;
+          int x_offset = x - fw_index + <%= padding[1] %>;
+          int y_offset_end = y + (<%= fh %> - fh_index - 1) - <%= padding[2] %>;
+          int x_offset_end = x + (<%= fw %> - fw_index - 1) - <%= padding[3] %>;
+
+          if ( (y_offset % <%= stride[0] %>) == 0
+               && (x_offset % <%= stride[1] %>) == 0
+               && (y_offset >= 0) && (x_offset >= 0)
+               && (y_offset_end < height)
+               && (x_offset_end < width)) {
+            <%= ctype %> image_grad = grad[image_index + (y_offset / <%= stride[0] %>) * grad_image_row_width + ( x_offset / <%= stride[1] %>) * <%= out_ch %> + f_out_channel];
             grad_sum += images[batch * image_size + y * width * <%= ch %> + x * <%= ch %> + channel] * image_grad;
           }
        }
data/lib/tensor_stream/opencl/kernels/conv2d_backprop_input.cl
CHANGED
@@ -1,6 +1,6 @@
 % ctype = dtype_to_c_type(dtype)
 
-__kernel void conv2d_backprop_input(const int height, const int width, __global const <%= ctype %> *filter, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
+__kernel void conv2d_backprop_input(const int height, const int width, const int out_height, const int out_width, __global const <%= ctype %> *filter, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
   // Get the index of the current element to be processed
   int batch_index = get_global_id(0);
   int h_index = get_global_id(1); // orig image y
@@ -8,8 +8,8 @@ __kernel void conv2d_backprop_input(const int height, const int width, __global
 
   int h_index_with_stride = h_index / <%= stride[0] %>;
   int w_index_with_stride = w_index / <%= stride[1] %>;
-  int grad_height =
-  int grad_width =
+  int grad_height = out_height;
+  int grad_width = out_width;
 
   int image_index = batch_index * grad_height * grad_width * <%= out_ch %>;
   int image_row_width = grad_width * <%= out_ch %>;
@@ -19,8 +19,16 @@ __kernel void conv2d_backprop_input(const int height, const int width, __global
   for (int out_channel_index = 0; out_channel_index < <%= out_ch %>; out_channel_index++) {
     for(int y = 0; y < <%= fh %>; y++) {
       for (int x = 0; x < <%= fw %>; x++) {
-
-
+        int y_offset = h_index - y + <%= padding[0] %>;
+        int x_offset = w_index - x + <%= padding[1] %>;
+
+        if ( ( y_offset >= 0) && (x_offset >= 0) &&
+             ( y_offset % <%= stride[0] %> == 0) &&
+             ( x_offset % <%= stride[1] %> == 0) &&
+             ( h_index + (<%= fh %> - y - 1) < (height + <%= padding[2] %>)) &&
+             ( w_index + (<%= fw %> - x - 1) < (width + <%= padding[3] %>))
+           ) {
+          <%= ctype %> imag_grad = grad[image_index + ( y_offset / <%= stride[0] %>) * image_row_width + ( x_offset / <%= stride[1] %>) * <%= out_ch %> + out_channel_index];
           g += imag_grad * filter[y * <%= fw * ch * out_ch %> + x * <%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
         }
       }
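This kernel routes gradients backwards: each input pixel (h_index, w_index) collects gradient from every output position whose filter window covered it, and the y_offset % stride == 0 test keeps only offsets that land on the strided output grid. A small Ruby sketch of the same mapping for one axis (assumed, illustrative helper):

    # Output rows that contribute gradient to input row h, mirroring the
    # kernel's test: y_offset = h - y + pad_top must be a non-negative
    # multiple of the stride that maps inside the output grid.
    def contributing_out_rows(h, fh, pad_top, stride, out_h)
      (0...fh).filter_map do |y|
        off = h - y + pad_top
        off / stride if off >= 0 && (off % stride).zero? && off / stride < out_h
      end
    end

    contributing_out_rows(0, 5, 2, 2, 14) # => [1, 0] (only filter rows 0 and 2 reach row 0)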
data/lib/tensor_stream/opencl/math_ops.rb
CHANGED
@@ -14,10 +14,15 @@ module TensorStream
       if inputs.size == 1
         inputs[0]
       else
-
-
-
-
+        work_group = if inputs[0].shape.size > 2
+                       [ inputs[0].shape.reduce(:*) / inputs[0].shape.last, inputs[0].shape.last]
+                     else
+                       m, n = inputs[0].shape
+                       [m || 1, n || 1]
+                     end
+
+        cl_m = OpenCL::Int1.new(work_group[0])
+        cl_n = OpenCL::Int1.new(work_group[1])
         cl_switch = OpenCL::Int1.new(0)
         dtype = tensor.data_type
 
@@ -68,6 +73,7 @@ module TensorStream
 
       raise "#{tensor.inputs[0].name} rank must be greater than 1" if a.shape.size < 2
       raise "#{tensor.inputs[1].name} rank must be greater than 1" if b.shape.size < 2
+      raise "#{tensor.inputs[0].name} unsupported rank" if b.shape.size != 2 || a.shape.size != 2
       raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
 
       dtype = tensor.data_type
@@ -162,6 +168,7 @@ module TensorStream
 
       axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
       input = complete_eval(value, child_context)
+
       value = value.buffer.reshape(*value.shape.reverse)
       rank = input.shape.size - 1
data/lib/tensor_stream/opencl/nn_ops.rb
CHANGED
@@ -220,6 +220,9 @@ module TensorStream
       output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
       m, n = a.shape
+
+      raise "unsupported rank " if a.shape.size > 2
+
       work_group = [m]
       n = m if n.nil?
       cl_n = OpenCL::Int1.new(n || 1)
@@ -236,6 +239,9 @@ module TensorStream
       output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
       m, n = a.shape
+
+      raise "unsupported rank " if a.shape.size > 2
+
       work_group = [m]
       n = m if n.nil?
       cl_n = OpenCL::Int1.new(n || 1)
@@ -254,6 +260,9 @@ module TensorStream
       output_buffer_backprop = _create_result_buffer(tensor.data_type, a.shape, "#{tensor.name}_2")
       rank = a.shape.size - 1
       m, n = a.shape
+
+      raise "unsupported rank " if a.shape.size > 2
+
       work_group = [m]
       n = m if n.nil?
       cl_n = OpenCL::Int1.new(n || 1)
@@ -276,6 +285,9 @@ module TensorStream
       output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
       m, n = a.shape
+
+      raise "unsupported rank " if a.shape.size > 2
+
       work_group = [m]
       n = m if n.nil?
       cl_n = OpenCL::Int1.new(n || 1)
@@ -305,6 +317,9 @@ module TensorStream
       output_buffer_backprop = _create_result_buffer(tensor.data_type, a.shape, "#{tensor.name}_2")
       rank = a.shape.size - 1
       m, n = a.shape
+
+      raise "unsupported rank " if a.shape.size > 2
+
       work_group = [m]
       n = m if n.nil?
       cl_n = OpenCL::Int1.new(n || 1)
@@ -326,6 +341,7 @@ module TensorStream
       output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
       m, n = a.shape
+      raise "unsupported rank " if a.shape.size > 2
       work_group = [m]
       n = m if n.nil?
       cl_n = OpenCL::Int1.new(n || 1)
@@ -353,19 +369,29 @@ module TensorStream
 
       raise TensorStream::ValueError, " Current implementation does not yet support strides in the batch and depth dimensions." if strides[0] != 1 || strides[3] != 1
 
+      padding_option = tensor.options[:padding]
+      padding = conv2d_padding_options(padding_option, filter_shape, height, width, height_stride, width_stride)
       event_wait_list = build_event_wait_list(inputs)
 
-      f_height, f_width,
-
+      f_height, f_width, _in_channels, out_channels = filter_shape
+
+      out_h = (height - f_height + (padding[0] + padding[2])) / height_stride + 1
+      out_w = (width - f_width + (padding[1] + padding[3])) / width_stride + 1
+
+      out_shape = [batch, out_h, out_w, out_channels]
       output_buffer = _create_result_buffer(tensor.data_type, out_shape, tensor.name)
 
       cl_image_height = OpenCL::Int1.new(height)
       cl_image_width = OpenCL::Int1.new(width)
+      cl_out_height = OpenCL::Int1.new(out_h)
+      cl_out_width = OpenCL::Int1.new(out_w)
 
-      work_dimen = [batch,
+      work_dimen = [batch, out_h, out_w]
 
-      output_buffer.op = _cl_program("conv2d", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channel, out_ch: out_channels, stride: [height_stride, width_stride]
-
+      output_buffer.op = _cl_program("conv2d", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channel, out_ch: out_channels, stride: [height_stride, width_stride], padding: padding).
+                         send(:conv2d, _opencl_queue, work_dimen, cl_image_height, cl_image_width,
+                              cl_out_height, cl_out_width, inputs[0].cl_buffer,
+                              inputs[1].cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
       output_buffer
     end
 
@@ -385,13 +411,22 @@ module TensorStream
       batch, height, width, channels = image_shape
       f_height, f_width, in_channels, out_channels = filter_shape
 
+      padding_option = tensor.options[:padding]
+      padding = conv2d_padding_options(padding_option, filter_shape, height, width, height_stride, width_stride)
       work_dimen = [batch, height, width]
 
+      out_h = (height - f_height + (padding[0] + padding[2])) / height_stride + 1
+      out_w = (width - f_width + (padding[1] + padding[3])) / width_stride + 1
+
       cl_image_height = OpenCL::Int1.new(height)
       cl_image_width = OpenCL::Int1.new(width)
+      cl_out_height = OpenCL::Int1.new(out_h)
+      cl_out_width = OpenCL::Int1.new(out_w)
 
-      output_buffer.op = _cl_program("conv2d_backprop_input", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: out_channels, stride: [height_stride, width_stride] ).
-
+      output_buffer.op = _cl_program("conv2d_backprop_input", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: out_channels, stride: [height_stride, width_stride], padding: padding).
+                         send(:conv2d_backprop_input, _opencl_queue, work_dimen, cl_image_height, cl_image_width,
+                              cl_out_height, cl_out_width, filter.cl_buffer, grad.cl_buffer,
+                              output_buffer.cl_buffer, event_wait_list: event_wait_list)
       output_buffer
     end
 
@@ -411,14 +446,48 @@ module TensorStream
       f_height, f_width, input_channels, output_channels = filter_shape
       work_dimen = [f_height, f_width, output_channels]
 
+      padding_option = tensor.options[:padding]
+      padding = conv2d_padding_options(padding_option, filter_shape, height, width, height_stride, width_stride)
+
+      out_h = (height - f_height + (padding[0] + padding[2])) / height_stride + 1
+      out_w = (width - f_width + (padding[1] + padding[3])) / width_stride + 1
+
       cl_batch_size = OpenCL::Int1.new(batch_size)
       cl_image_height = OpenCL::Int1.new(height)
       cl_image_width = OpenCL::Int1.new(width)
+      cl_out_height = OpenCL::Int1.new(out_h)
+      cl_out_width = OpenCL::Int1.new(out_w)
 
-      output_buffer.op = _cl_program("conv2d_backprop_filter", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: output_channels, stride: [height_stride, width_stride]
-
+      output_buffer.op = _cl_program("conv2d_backprop_filter", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: output_channels, stride: [height_stride, width_stride], padding: padding ).
+                         send(:conv2d_backprop_filter, _opencl_queue, work_dimen, cl_batch_size, cl_image_height, cl_image_width,
+                              cl_out_height, cl_out_width, images.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
       output_buffer
     end
+
+    def conv2d_padding_options(padding_option, filter_shape, height, width, h_stride, w_stride)
+      case padding_option
+      when 'SAME'
+        [
+          calc_pad(height, h_stride, filter_shape[0]),
+          calc_pad(width, w_stride, filter_shape[1]),
+          calc_pad(height, h_stride, filter_shape[0], true),
+          calc_pad(width, w_stride, filter_shape[1], true)
+        ]
+      when 'VALID'
+        [0, 0, 0, 0]
+      else
+        raise TensorStream::ValueError, "Unsupported padding value #{padding_option}, valid values 'SAME', 'VALID'"
+      end
+    end
+
+    def calc_pad(w, stride, f_shape, ceil = false)
+      r = ((w / stride - 1) * stride - w + f_shape)
+      if ceil
+        r.odd? ? r / 2 + 1 : r / 2
+      else
+        r / 2
+      end
+    end
   end
  end
end
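The new conv2d_padding_options helper returns padding as [top, left, bottom, right]; with 'SAME' the total pad is split between the two sides, the odd remainder going to the bottom/right, and with 'VALID' it is all zeros. Checking the arithmetic against the MNIST sample below (28x28 images, 5x5 filters), using the calc_pad just added:

    calc_pad(28, 1, 5)        # => 2   r = 27*1 - 28 + 5 = 4, floor half
    calc_pad(28, 1, 5, true)  # => 2   total pad 4, out = (28 - 5 + 4) / 1 + 1 = 28
    calc_pad(28, 2, 5)        # => 1   r = 13*2 - 28 + 5 = 3, floor half
    calc_pad(28, 2, 5, true)  # => 2   total pad 3, out = (28 - 5 + 3) / 2 + 1 = 14

so a stride-1 'SAME' convolution keeps 28x28 and a stride-2 one yields 14x14, matching the layer sizes in mnist_data_3.0.rb.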
data/lib/tensor_stream/opencl/opencl_evaluator.rb
CHANGED
@@ -112,6 +112,7 @@ module TensorStream
       result = complete_eval(tensor, execution_context)
       # puts "-------------------wait finish------------------------"
       _opencl_queue.finish
+      # puts "-------------------done finish------------------------"
       read_final_result(result)
     end
 
@@ -170,6 +171,7 @@ module TensorStream
       events = build_event_wait_list([buffer])
       # puts "** wait #{tensor.name} **"
       OpenCL.wait_for_events(events) unless events.empty?
+      # puts "** done #{tensor.name} **"
       buffer
     end
 
@@ -449,6 +451,7 @@ module TensorStream
       events = build_event_wait_list(inputs)
       # puts "** wait for event flow_group**"
       OpenCL.wait_for_events(events) unless events.empty?
+      # puts "** done for event flow_group**"
       nil
     end
 
@@ -461,9 +464,7 @@ module TensorStream
       return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
       return @context[cache_key] if @context.key?(cache_key)
 
-      # puts "opencl eval #{object_id} #{tensor.name}"
       invoke(tensor, child_context).tap do |result|
-        # puts "result done opencl #{object_id}: #{tensor.name}"
         if tensor.breakpoint
           a = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
           b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
@@ -569,6 +570,9 @@ module TensorStream
         [m, result_shape.reduce(:*) / m]
       elsif result_shape.size <= 2
         [m || 1, n || 1]
+      elsif (b.shape.size == 1) && (result_shape.last == b.shape.last)
+        last_dim = b.shape.last
+        [result_shape.reduce(:*) / last_dim, last_dim]
       else
         raise "rank > 2 not supported for now"
       end
@@ -614,9 +618,15 @@ module TensorStream
       output_buffer = _create_result_buffer(tensor.data_type, p.shape, tensor.name)
 
       m, n = p.shape
-
-
-
+
+      work_group = if p.shape.size > 2
+                     [m, p.shape.reduce(:*) / m]
+                   else
+                     [ m || 1, n || 1]
+                   end
+
+      cl_m = OpenCL::Int1.new(work_group[0])
+      cl_n = OpenCL::Int1.new(work_group[1])
 
       event_wait_list = build_event_wait_list([a, b, p]) # add dependency wait list
       output_buffer.op = _cl_program(op_name.to_s, dtype: dtype).
data/samples/mnist_data_2.2.rb
CHANGED
@@ -29,19 +29,19 @@ M = 60
 N = 30
 
 
-w1 = tf.variable(tf.
+w1 = tf.variable(tf.truncated_normal([784, K], stddev: 0.1))
 b1 = tf.variable(tf.ones([K])/10)
 
-w2 = tf.variable(tf.
+w2 = tf.variable(tf.truncated_normal([K, L], stddev: 0.1))
 b2 = tf.variable(tf.ones([L])/10)
 
-w3 = tf.variable(tf.
+w3 = tf.variable(tf.truncated_normal([L, M], stddev: 0.1))
 b3 = tf.variable(tf.ones([M])/10)
 
-w4 = tf.variable(tf.
+w4 = tf.variable(tf.truncated_normal([M, N], stddev: 0.1))
 b4 = tf.variable(tf.ones([N])/10)
 
-w5 = tf.variable(tf.
+w5 = tf.variable(tf.truncated_normal([N, 10], stddev: 0.1))
 b5 = tf.variable(tf.zeros([10]))
 
 x_ = tf.reshape(x, [-1, 784])
data/samples/mnist_data_2.3.rb
CHANGED
@@ -1,6 +1,7 @@
 # A ruby port of the example code discussed by Martin Gorner in
 # "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)"
 #
+# Five layers with relu and learning rate decay
 # https://www.youtube.com/watch?v=u4alGiomYP4
 #
 # Requirements:
@@ -35,19 +36,19 @@ M = 60
 N = 30
 
 
-w1 = tf.variable(tf.
+w1 = tf.variable(tf.truncated_normal([784, K], stddev: 0.1))
 b1 = tf.variable(tf.ones([K])/10)
 
-w2 = tf.variable(tf.
+w2 = tf.variable(tf.truncated_normal([K, L], stddev: 0.1))
 b2 = tf.variable(tf.ones([L])/10)
 
-w3 = tf.variable(tf.
+w3 = tf.variable(tf.truncated_normal([L, M], stddev: 0.1))
 b3 = tf.variable(tf.ones([M])/10)
 
-w4 = tf.variable(tf.
+w4 = tf.variable(tf.truncated_normal([M, N], stddev: 0.1))
 b4 = tf.variable(tf.ones([N])/10)
 
-w5 = tf.variable(tf.
+w5 = tf.variable(tf.truncated_normal([N, 10], stddev: 0.1))
 b5 = tf.variable(tf.zeros([10]))
 
 x_ = tf.reshape(x, [-1, 784])
data/samples/mnist_data_3.0.rb
ADDED
@@ -0,0 +1,145 @@
+# A ruby port of the example code discussed by Martin Gorner in
+# "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)"
+#
+# https://www.youtube.com/watch?v=u4alGiomYP4
+#
+# Requirements:
+#   mnist-learn gem
+#   opencl_ruby_ffi gem
+require "bundler/setup"
+require 'tensor_stream'
+require 'mnist-learn'
+require 'pry-byebug'
+
+# Enable OpenCL hardware accelerated computation, not using OpenCL can be very slow
+require 'tensor_stream/opencl'
+
+tf = TensorStream
+puts "Tensorstream version #{tf.__version__} with OpenCL lib #{TensorStream::Opencl::VERSION}"
+
+# Import MNIST data
+puts "downloading mnist data"
+# Download images and labels into mnist.test (10K images+labels) and mnist.train (60K images+labels)
+mnist = Mnist.read_data_sets('/tmp/data', one_hot: true)
+puts "downloading finished"
+
+# neural network structure for this sample:
+#
+# · · · · · · · · · ·    (input data, 1-deep)               X [batch, 28, 28, 1]
+# @ @ @ @ @ @ @ @ @ @ -- conv. layer 5x5x1=>4 stride 1      W1 [5, 5, 1, 4]       B1 [4]
+# ∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶                                       Y1 [batch, 28, 28, 4]
+# @ @ @ @ @ @ @ @     -- conv. layer 5x5x4=>8 stride 2      W2 [5, 5, 4, 8]       B2 [8]
+# ∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶                                           Y2 [batch, 14, 14, 8]
+# @ @ @ @ @ @         -- conv. layer 4x4x8=>12 stride 2     W3 [4, 4, 8, 12]      B3 [12]
+# ∶∶∶∶∶∶∶∶∶∶∶                                               Y3 [batch, 7, 7, 12] => reshaped to YY [batch, 7*7*12]
+# \x/x\x\x/           -- fully connected layer (relu)       W4 [7*7*12, 200]      B4 [200]
+# · · · ·                                                   Y4 [batch, 200]
+# \x/x\x/             -- fully connected layer (softmax)    W5 [200, 10]          B5 [10]
+# · · ·                                                     Y [batch, 10]
+
+
+# input X: 28x28 grayscale images, the first dimension (None) will index the images in the mini-batch
+x = tf.placeholder(:float32, shape: [nil, 28, 28, 1])
+
+# correct answers will go here
+y_ = tf.placeholder(:float32, shape: [nil, 10])
+
+# step for variable learning rate
+step = tf.placeholder(:int32)
+
+pkeep = tf.placeholder(tf.float32)
+
+# three convolutional layers with their channel counts, and a
+# fully connected layer (the last layer has 10 softmax neurons)
+
+K = 4 # first convolutional layer output depth
+L = 8 # second convolutional layer output depth
+M = 12 # third convolutional layer
+N = 200 # fully connected layer
+
+
+w1 = tf.variable(tf.truncated_normal([6, 6, 1, K], stddev: 0.1))
+b1 = tf.variable(tf.ones([K])/10)
+
+w2 = tf.variable(tf.truncated_normal([5, 5, K, L], stddev: 0.1))
+b2 = tf.variable(tf.ones([L])/10)
+
+w3 = tf.variable(tf.truncated_normal([4, 4, L, M], stddev: 0.1))
+b3 = tf.variable(tf.ones([M])/10)
+
+w4 = tf.variable(tf.truncated_normal([7 * 7 * M, N], stddev: 0.1))
+b4 = tf.variable(tf.ones([N])/10)
+
+w5 = tf.variable(tf.truncated_normal([N, 10], stddev: 0.1))
+b5 = tf.variable(tf.ones([10])/10)
+
+# The model
+stride = 1 # output is 28x28
+y1 = tf.nn.relu(tf.nn.conv2d(tf.reshape(x, [-1, 28, 28, 1]), w1, [1, stride, stride, 1], 'SAME') + b1)
+stride = 2 # output is 14x14
+y2 = tf.nn.relu(tf.nn.conv2d(y1, w2, [1, stride, stride, 1], 'SAME') + b2)
+stride = 2 # output is 7x7
+y3 = tf.nn.relu(tf.nn.conv2d(y2, w3, [1, stride, stride, 1], 'SAME') + b3)
+
+# reshape the output from the third convolution for the fully connected layer
+yy = tf.reshape(y3, [-1, 7 * 7 * M])
+y4 = tf.nn.relu(tf.matmul(yy, w4) + b4)
+
+# dropout to prevent overfitting
+yy4 = tf.nn.dropout(y4, pkeep)
+
+ylogits = tf.matmul(yy4, w5) + b5
+
+# model
+y = tf.nn.softmax(ylogits)
+
+
+# cross-entropy loss function (= -sum(Y_i * log(Yi)) ), normalised for batches of 100 images
+# TensorFlow provides the softmax_cross_entropy_with_logits function to avoid numerical stability
+# problems with log(0) which is NaN
+cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits: ylogits, labels: y_)
+cross_entropy = tf.reduce_mean(cross_entropy)*100
+
+is_correct = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+accuracy = tf.reduce_mean(tf.cast(is_correct, :float32))
+
+# training step, learning rate = 0.003 with exponential decay
+lr = 0.0001.t + tf.train.exponential_decay(0.003, step, 2000, 1/Math::E)
+train_step = TensorStream::Train::AdamOptimizer.new(lr).minimize(cross_entropy)
+
+sess = tf.session
+# Add ops to save and restore all the variables.
+
+init = tf.global_variables_initializer
+
+sess.run(init)
+mnist_train = mnist.train
+test_data = { x => mnist.test.images, y_ => mnist.test.labels, pkeep => 1.0 }
+
+
+(0..10001).each do |i|
+  # load batch of images and correct answers
+  batch_x, batch_y = mnist_train.next_batch(100)
+  train_data = { x => batch_x, y_ => batch_y, step => i, pkeep => 0.75 }
+
+  # train
+  sess.run(train_step, feed_dict: train_data)
+
+  if (i % 10 == 0)
+    # File.write("profile.json", TensorStream::ReportTool.profile_for(sess).to_json)
+    # success? print training accuracy, loss and current learning rate
+    a_train, c_train, l = sess.run([accuracy, cross_entropy, lr], feed_dict: { x => batch_x, y_ => batch_y, step => i, pkeep => 1.0 })
+    puts "#{i}: accuracy:#{a_train} loss:#{c_train} (lr:#{l})"
+  end
+
+  if (i % 100 == 0)
+    # success on test data? (test_data already feeds pkeep => 1.0)
+    a_test, c_test = sess.run([accuracy, cross_entropy], feed_dict: test_data)
+    puts("#{i}: ******** test accuracy: #{a_test} test loss: #{c_test}")
+  end
+end
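Assuming the development dependencies (mnist-learn, opencl_ruby_ffi, pry-byebug) are installed via bundler, the new sample should run from the repository root with something like:

    bundle exec ruby samples/mnist_data_3.0.rb

The first run downloads the MNIST set to /tmp/data before training starts.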
data/tensor_stream-opencl.gemspec
CHANGED
@@ -39,7 +39,7 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "awesome_print"
   spec.add_development_dependency "mnist-learn"
   spec.add_development_dependency "simplecov"
-  spec.add_dependency "tensor_stream", "~> 0.9.
+  spec.add_dependency "tensor_stream", "~> 0.9.8"
   spec.add_dependency "opencl_ruby_ffi"
   spec.add_dependency "oily_png"
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tensor_stream-opencl
 version: !ruby/object:Gem::Version
-  version: 0.2.3
+  version: 0.2.4
 platform: ruby
 authors:
 - Joseph Dayo
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-11-
+date: 2018-11-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -114,14 +114,14 @@ dependencies:
   requirements:
   - - "~>"
   - !ruby/object:Gem::Version
-    version: 0.9.
+    version: 0.9.8
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
   requirements:
   - - "~>"
   - !ruby/object:Gem::Version
-    version: 0.9.
+    version: 0.9.8
 - !ruby/object:Gem::Dependency
   name: opencl_ruby_ffi
   requirement: !ruby/object:Gem::Requirement
@@ -252,6 +252,7 @@ files:
 - samples/mnist_data_2.1.rb
 - samples/mnist_data_2.2.rb
 - samples/mnist_data_2.3.rb
+- samples/mnist_data_3.0.rb
 - samples/multigpu.rb
 - samples/nearest_neighbor.rb
 - samples/rnn.rb