tensor_stream-opencl 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/tensor_stream/opencl/array_ops.rb +10 -4
- data/lib/tensor_stream/opencl/kernels/conv2d.cl +9 -6
- data/lib/tensor_stream/opencl/kernels/conv2d_backprop_filter.cl +15 -5
- data/lib/tensor_stream/opencl/kernels/conv2d_backprop_input.cl +13 -5
- data/lib/tensor_stream/opencl/math_ops.rb +11 -4
- data/lib/tensor_stream/opencl/nn_ops.rb +78 -9
- data/lib/tensor_stream/opencl/opencl_evaluator.rb +15 -5
- data/lib/tensor_stream/opencl/version.rb +1 -1
- data/samples/mnist_data_2.2.rb +5 -5
- data/samples/mnist_data_2.3.rb +6 -5
- data/samples/mnist_data_3.0.rb +145 -0
- data/tensor_stream-opencl.gemspec +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3e4aa123289372c651cd4da3e7c206abc4f9f67a551d4062180c5cf6555dc243
+  data.tar.gz: 6517954207c85f56cd08b2892b0119d4bb7a35e2d4bd9b9cacc5d3c9ccfb9e42
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7f61d61be79dd1e06ebfdc77ed2dff9e717e0cdb292160fe20c9ca08693d867e1b0e0350c71db5d24feb4671a26e793f44d6b80762c384193c1985b6b1616376
+  data.tar.gz: 72c32530717fac8ff947ce4b204535755134bde14e0f70d0d120ff101b5654843312186317cb480fd5e1c620a25328a3590b1f35193faf1d196e7ad631d169b0
data/lib/tensor_stream/opencl/array_ops.rb
CHANGED
@@ -422,10 +422,16 @@ module TensorStream
         a = inputs[0]
         if a.data_type != tensor.data_type
           buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
-
-
-
-
+          work_group = if inputs[0].shape.size > 2
+                         [ inputs[0].shape.reduce(:*) / inputs[0].shape.last, inputs[0].shape.last]
+                       else
+                         m, n = inputs[0].shape
+                         [m || 1, n || 1]
+                       end
+
+          cl_m = OpenCL::Int1.new(work_group[0])
+          cl_n = OpenCL::Int1.new(work_group[1])
+
           event_wait_list = build_event_wait_list(inputs)
           buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
           buffer
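The cast op (and the reduction path in math_ops.rb below, which gains the same block) now derives a 2-D work group from a tensor of any rank by folding every leading dimension into the first axis. A standalone Ruby sketch of that computation; work_group_for is a hypothetical name and shape stands in for inputs[0].shape:

    # Hypothetical helper mirroring the logic added above: rank > 2 shapes are
    # flattened so the kernel can treat the tensor as a 2-D range.
    def work_group_for(shape)
      if shape.size > 2
        [shape.reduce(:*) / shape.last, shape.last]
      else
        m, n = shape
        [m || 1, n || 1]
      end
    end

    work_group_for([32, 28, 28, 3]) # => [25088, 3]
    work_group_for([10, 5])         # => [10, 5]
    work_group_for([7])             # => [7, 1]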
data/lib/tensor_stream/opencl/kernels/conv2d.cl
CHANGED
@@ -1,27 +1,30 @@
 % ctype = dtype_to_c_type(dtype)
 
-__kernel void conv2d(const int height, const int width, __global const <%= ctype %> *images, __global const <%= ctype %> *filter, __global <%= ctype %> *output) {
+__kernel void conv2d(const int height, const int width, const int out_height, const int out_width, __global const <%= ctype %> *images, __global const <%= ctype %> *filter, __global <%= ctype %> *output) {
   // Get the index of the current element to be processed
   const int batch_index = get_global_id(0);
   const int h_index = get_global_id(1);
   const int w_index = get_global_id(2);
-  const int h_index_with_stride = h_index * <%= stride[0] %>;
-  const int w_index_with_stride = w_index * <%= stride[1] %>;
+  const int h_index_with_stride = h_index * <%= stride[0] %> - <%= padding[0] %>;
+  const int w_index_with_stride = w_index * <%= stride[1] %> - <%= padding[1] %>;
 
   const int image_index = batch_index * height * width * <%= ch %>;
   const int image_row_width = width * <%= ch %>;
+  const int out_image_row_size = out_height * out_width * <%= out_ch %>;
 
   for (int out_channel_index = 0; out_channel_index < <%= out_ch %>; out_channel_index++) {
     <%= ctype %> sum = 0;
     for (int channel_index = 0; channel_index < <%= ch %>; channel_index++) {
       for(int y = 0; y < <%= fh %>; y++) {
         for (int x = 0; x < <%= fw %>; x++) {
-          if ( (h_index_with_stride + y) < height && (w_index_with_stride + x) < width
-
+          if ( (h_index_with_stride + y) < height && (w_index_with_stride + x) < width &&
+               (h_index_with_stride + y) >= 0 && (w_index_with_stride + x) >=0) {
+            <%= ctype %> f = filter[y*<%= fw * ch * out_ch %> + x*<%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
+            sum += images[image_index + (h_index_with_stride + y)*image_row_width + (w_index_with_stride + x)*<%= ch %> + channel_index] * f;
           }
         }
      }
    }
-    output[batch_index *
+    output[batch_index * out_image_row_size + h_index * out_width * <%= out_ch %> + w_index * <%= out_ch %> + out_channel_index ] = sum;
  }
}
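With padding in play, h_index_with_stride and w_index_with_stride can go negative, which is why the kernel now also checks >= 0 before reading an image tap; taps that fall outside the image act as implicit zeros. A minimal plain-Ruby model of that bounds test (conv_taps is a hypothetical helper, not part of the gem):

    # Which filter taps actually read image memory for one output pixel.
    def conv_taps(h_out, w_out, stride, padding, fh, fw, height, width)
      h0 = h_out * stride[0] - padding[0]
      w0 = w_out * stride[1] - padding[1]
      taps = []
      fh.times do |y|
        fw.times do |x|
          hy, wx = h0 + y, w0 + x
          taps << [hy, wx] if hy >= 0 && hy < height && wx >= 0 && wx < width
        end
      end
      taps
    end

    # 3x3 filter at the top-left corner of a 5x5 image with padding [1, 1]:
    conv_taps(0, 0, [1, 1], [1, 1], 3, 3, 5, 5)
    # => [[0, 0], [0, 1], [1, 0], [1, 1]] (the out-of-image taps are dropped)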
data/lib/tensor_stream/opencl/kernels/conv2d_backprop_filter.cl
CHANGED
@@ -1,21 +1,31 @@
 % ctype = dtype_to_c_type(dtype)
 
-__kernel void conv2d_backprop_filter(const int batch_size, const int height, const int width, __global const <%= ctype %> *images, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
+__kernel void conv2d_backprop_filter(const int batch_size, const int height, const int width, const int out_height, const int out_width, __global const <%= ctype %> *images, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
   // Get the index of the current element to be processed
   const int fh_index = get_global_id(0);
   const int fw_index = get_global_id(1);
   const int f_out_channel = get_global_id(2);
   const int image_size = height * width * <%= ch %>;
-  const int grad_image_row_width =
+  const int grad_image_row_width = out_width * <%= out_ch %>;
+  const int grad_image_size = out_height * out_width * <%= out_ch %>;
 
   for(int channel = 0; channel < <%= ch %>; channel++) {
     <%= ctype %> grad_sum = 0.0;
     for(int batch = 0; batch < batch_size; batch++) {
-
+      int image_index = batch * grad_image_size;
       for(int y = 0; y < height; y++) {
         for (int x = 0; x < width; x++) {
-
-
+          int y_offset = y - fh_index + <%= padding[0] %>;
+          int x_offset = x - fw_index + <%= padding[1] %>;
+          int y_offset_end = y + (<%= fh %> - fh_index - 1) - <%= padding[2] %>;
+          int x_offset_end = x + (<%= fw %> - fw_index - 1) - <%= padding[3] %>;
+
+          if ( (y_offset % <%= stride[0]%>) == 0
+               && (x_offset % <%= stride[1]%>) == 0
+               && (y_offset >=0) && (x_offset >= 0)
+               && (y_offset_end < height)
+               && (x_offset_end < width)) {
+            <%= ctype %> image_grad = grad[image_index + (y_offset / <%= stride[0] %>) * grad_image_row_width + ( x_offset / <%= stride[1] %>) * <%= out_ch %> + f_out_channel];
             grad_sum += images[batch * image_size + y * width * <%= ch %> + x * <%= ch %> + channel] * image_grad;
           }
         }
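The filter-gradient kernel now accepts an (image pixel, filter tap) pair only when the offsets line up with the stride and the shifted filter window still fits inside the padded image. A plain-Ruby model of that test (tap_contributes? is a hypothetical name, not part of the gem):

    def tap_contributes?(y, x, fh_index, fw_index, fh, fw, height, width, stride, padding)
      y_offset = y - fh_index + padding[0]
      x_offset = x - fw_index + padding[1]
      y_offset_end = y + (fh - fh_index - 1) - padding[2]
      x_offset_end = x + (fw - fw_index - 1) - padding[3]

      (y_offset % stride[0]).zero? && (x_offset % stride[1]).zero? &&
        y_offset >= 0 && x_offset >= 0 &&
        y_offset_end < height && x_offset_end < width
    end

    # 3x3 filter over a 5x5 image, stride 1, 'SAME' padding [1, 1, 1, 1]:
    tap_contributes?(0, 0, 0, 0, 3, 3, 5, 5, [1, 1], [1, 1, 1, 1]) # => true
    tap_contributes?(4, 4, 0, 0, 3, 3, 5, 5, [1, 1], [1, 1, 1, 1]) # => false (window runs off the image)
    tap_contributes?(4, 4, 2, 2, 3, 3, 5, 5, [1, 1], [1, 1, 1, 1]) # => true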
data/lib/tensor_stream/opencl/kernels/conv2d_backprop_input.cl
CHANGED
@@ -1,6 +1,6 @@
 % ctype = dtype_to_c_type(dtype)
 
-__kernel void conv2d_backprop_input(const int height, const int width, __global const <%= ctype %> *filter, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
+__kernel void conv2d_backprop_input(const int height, const int width, const int out_height, const int out_width, __global const <%= ctype %> *filter, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
   // Get the index of the current element to be processed
   int batch_index = get_global_id(0);
   int h_index = get_global_id(1); // orig image y
@@ -8,8 +8,8 @@ __kernel void conv2d_backprop_input(const int height, const int width, __global
 
   int h_index_with_stride = h_index / <%= stride[0] %>;
   int w_index_with_stride = w_index / <%= stride[1] %>;
-  int grad_height =
-  int grad_width =
+  int grad_height = out_height;
+  int grad_width = out_width;
 
   int image_index = batch_index * grad_height * grad_width * <%= out_ch %>;
   int image_row_width = grad_width * <%= out_ch %>;
@@ -19,8 +19,16 @@ __kernel void conv2d_backprop_input(const int height, const int width, __global
   for (int out_channel_index = 0; out_channel_index < <%= out_ch %>; out_channel_index++) {
     for(int y = 0; y < <%= fh %>; y++) {
       for (int x = 0; x < <%= fw %>; x++) {
-
-
+        int y_offset = h_index - y + <%= padding[0] %>;
+        int x_offset = w_index - x + <%= padding[1] %>;
+
+        if ( ( y_offset >= 0) && (x_offset >= 0) &&
+             ( y_offset % <%= stride[0]%> == 0) &&
+             ( x_offset % <%= stride[1]%> == 0) &&
+             ( h_index + (<%= fh %> - y - 1) < (height + <%= padding[2] %>)) &&
+             ( w_index + (<%= fw %> - x - 1) < (width + <%= padding[3] %>))
+           ) {
+          <%= ctype %> imag_grad = grad[image_index + ( y_offset / <%= stride[0] %>) * image_row_width + ( x_offset / <%= stride[1] %>) * <%= out_ch %> + out_channel_index];
           g += imag_grad * filter[y * <%= fw * ch * out_ch %> + x * <%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
         }
       }
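The input-gradient kernel routes each output gradient back to the input pixels it came from: pixel (h, w) receives gradient from output cell ((h - y + pad) / stride) only when that offset is non-negative and divisible by the stride. A Ruby sketch of the same test, collecting the contributing gradient coordinates (contributing_grads is a hypothetical helper):

    def contributing_grads(h, w, fh, fw, height, width, stride, padding)
      coords = []
      fh.times do |y|
        fw.times do |x|
          y_off = h - y + padding[0]
          x_off = w - x + padding[1]
          next unless y_off >= 0 && x_off >= 0
          next unless (y_off % stride[0]).zero? && (x_off % stride[1]).zero?
          next unless h + (fh - y - 1) < height + padding[2]
          next unless w + (fw - x - 1) < width + padding[3]
          coords << [y_off / stride[0], x_off / stride[1]]
        end
      end
      coords
    end

    # 3x3 filter, 5x5 image, stride 2, 'SAME' padding [1, 1, 1, 1]:
    contributing_grads(3, 3, 3, 3, 5, 5, [2, 2], [1, 1, 1, 1])
    # => [[2, 2], [2, 1], [1, 2], [1, 1]] (one grad cell per filter tap that lines up)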
data/lib/tensor_stream/opencl/math_ops.rb
CHANGED
@@ -14,10 +14,15 @@ module TensorStream
         if inputs.size == 1
           inputs[0]
         else
-
-
-
-
+          work_group = if inputs[0].shape.size > 2
+                         [ inputs[0].shape.reduce(:*) / inputs[0].shape.last, inputs[0].shape.last]
+                       else
+                         m, n = inputs[0].shape
+                         [m || 1, n || 1]
+                       end
+
+          cl_m = OpenCL::Int1.new(work_group[0])
+          cl_n = OpenCL::Int1.new(work_group[1])
           cl_switch = OpenCL::Int1.new(0)
           dtype = tensor.data_type
 
@@ -68,6 +73,7 @@ module TensorStream
 
         raise "#{tensor.inputs[0].name} rank must be greater than 1" if a.shape.size < 2
         raise "#{tensor.inputs[1].name} rank must be greater than 1" if b.shape.size < 2
+        raise "#{tensor.inputs[0].name} unsupported rank" if b.shape.size != 2 || a.shape.size!=2
         raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
 
         dtype = tensor.data_type
@@ -162,6 +168,7 @@ module TensorStream
 
         axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
         input = complete_eval(value, child_context)
+
         value = value.buffer.reshape(*value.shape.reverse)
         rank = input.shape.size - 1
 
data/lib/tensor_stream/opencl/nn_ops.rb
CHANGED
@@ -220,6 +220,9 @@ module TensorStream
         output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
         m, n = a.shape
+
+        raise "unsupported rank " if a.shape.size > 2
+
         work_group = [m]
         n = m if n.nil?
         cl_n = OpenCL::Int1.new(n || 1)
@@ -236,6 +239,9 @@ module TensorStream
         output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
         m, n = a.shape
+
+        raise "unsupported rank " if a.shape.size > 2
+
         work_group = [m]
         n = m if n.nil?
         cl_n = OpenCL::Int1.new(n || 1)
@@ -254,6 +260,9 @@ module TensorStream
         output_buffer_backprop = _create_result_buffer(tensor.data_type, a.shape, "#{tensor.name}_2")
         rank = a.shape.size - 1
         m, n = a.shape
+
+        raise "unsupported rank " if a.shape.size > 2
+
         work_group = [m]
         n = m if n.nil?
         cl_n = OpenCL::Int1.new(n || 1)
@@ -276,6 +285,9 @@ module TensorStream
         output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
         m, n = a.shape
+
+        raise "unsupported rank " if a.shape.size > 2
+
         work_group = [m]
         n = m if n.nil?
         cl_n = OpenCL::Int1.new(n || 1)
@@ -305,6 +317,9 @@ module TensorStream
         output_buffer_backprop = _create_result_buffer(tensor.data_type, a.shape, "#{tensor.name}_2")
         rank = a.shape.size - 1
         m, n = a.shape
+
+        raise "unsupported rank " if a.shape.size > 2
+
         work_group = [m]
         n = m if n.nil?
         cl_n = OpenCL::Int1.new(n || 1)
@@ -326,6 +341,7 @@ module TensorStream
         output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
         m, n = a.shape
+        raise "unsupported rank " if a.shape.size > 2
         work_group = [m]
         n = m if n.nil?
         cl_n = OpenCL::Int1.new(n || 1)
@@ -353,19 +369,29 @@ module TensorStream
 
         raise TensorStream::ValueError, " Current implementation does not yet support strides in the batch and depth dimensions." if strides[0] != 1 || strides[3] != 1
 
+        padding_option = tensor.options[:padding]
+        padding = conv2d_padding_options(padding_option, filter_shape, height, width, height_stride, width_stride)
         event_wait_list = build_event_wait_list(inputs)
 
-        f_height, f_width,
-
+        f_height, f_width, _in_channels, out_channels = filter_shape
+
+        out_h = (height - f_height + (padding[0] + padding[2])) / height_stride + 1
+        out_w = (width - f_width + (padding[1] + padding[3])) / width_stride + 1
+
+        out_shape = [batch, out_h, out_w, out_channels]
         output_buffer = _create_result_buffer(tensor.data_type, out_shape, tensor.name)
 
         cl_image_height = OpenCL::Int1.new(height)
         cl_image_width = OpenCL::Int1.new(width)
+        cl_out_height = OpenCL::Int1.new(out_h)
+        cl_out_width = OpenCL::Int1.new(out_w)
 
-        work_dimen = [batch,
+        work_dimen = [batch, out_h, out_w]
 
-        output_buffer.op = _cl_program("conv2d", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channel, out_ch: out_channels, stride: [height_stride, width_stride]
-
+        output_buffer.op = _cl_program("conv2d", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channel, out_ch: out_channels, stride: [height_stride, width_stride], padding: padding).
+          send(:conv2d, _opencl_queue, work_dimen, cl_image_height, cl_image_width,
+               cl_out_height, cl_out_width, inputs[0].cl_buffer,
+               inputs[1].cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
         output_buffer
       end
 
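For reference, the output-size arithmetic above with hypothetical numbers (28x28 input, 5x5 filter, stride 2; integer division throughout):

    # 'SAME'  => padding [1, 1, 2, 2] (see conv2d_padding_options below)
    out_h = (28 - 5 + (1 + 2)) / 2 + 1  # => 14
    # 'VALID' => padding [0, 0, 0, 0]
    out_h = (28 - 5 + (0 + 0)) / 2 + 1  # => 12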
@@ -385,13 +411,22 @@ module TensorStream
         batch, height, width, channels = image_shape
         f_height, f_width, in_channels, out_channels = filter_shape
 
+        padding_option = tensor.options[:padding]
+        padding = conv2d_padding_options(padding_option, filter_shape, height, width, height_stride, width_stride)
         work_dimen = [batch, height, width]
 
+        out_h = (height - f_height + (padding[0] + padding[2])) / height_stride + 1
+        out_w = (width - f_width + (padding[1] + padding[3])) / width_stride + 1
+
         cl_image_height = OpenCL::Int1.new(height)
         cl_image_width = OpenCL::Int1.new(width)
+        cl_out_height = OpenCL::Int1.new(out_h)
+        cl_out_width = OpenCL::Int1.new(out_w)
 
-        output_buffer.op = _cl_program("conv2d_backprop_input", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: out_channels, stride: [height_stride, width_stride] ).
-
+        output_buffer.op = _cl_program("conv2d_backprop_input", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: out_channels, stride: [height_stride, width_stride], padding: padding).
+          send(:conv2d_backprop_input, _opencl_queue, work_dimen, cl_image_height, cl_image_width,
+               cl_out_height, cl_out_width, filter.cl_buffer, grad.cl_buffer,
+               output_buffer.cl_buffer, event_wait_list: event_wait_list)
         output_buffer
       end
 
@@ -411,14 +446,48 @@ module TensorStream
         f_height, f_width, input_channels, output_channels = filter_shape
         work_dimen = [f_height, f_width, output_channels]
 
+        padding_option = tensor.options[:padding]
+        padding = conv2d_padding_options(padding_option, filter_shape, height, width, height_stride, width_stride)
+
+        out_h = (height - f_height + (padding[0] + padding[2])) / height_stride + 1
+        out_w = (width - f_width + (padding[1] + padding[3])) / width_stride + 1
+
         cl_batch_size = OpenCL::Int1.new(batch_size)
         cl_image_height = OpenCL::Int1.new(height)
         cl_image_width = OpenCL::Int1.new(width)
+        cl_out_height = OpenCL::Int1.new(out_h)
+        cl_out_width = OpenCL::Int1.new(out_w)
 
-        output_buffer.op = _cl_program("conv2d_backprop_filter", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: output_channels, stride: [height_stride, width_stride]
-
+        output_buffer.op = _cl_program("conv2d_backprop_filter", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: output_channels, stride: [height_stride, width_stride], padding: padding ).
+          send(:conv2d_backprop_filter, _opencl_queue, work_dimen, cl_batch_size, cl_image_height, cl_image_width,
+               cl_out_height, cl_out_width, images.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
         output_buffer
       end
+
+      def conv2d_padding_options(padding_option, filter_shape, height, width, h_stride, w_stride)
+        case padding_option
+        when 'SAME'
+          [
+            calc_pad(height, h_stride, filter_shape[0]),
+            calc_pad(width, w_stride, filter_shape[1]),
+            calc_pad(height, h_stride, filter_shape[0], true),
+            calc_pad(width, w_stride, filter_shape[1], true)
+          ]
+        when 'VALID'
+          [0, 0, 0, 0]
+        else
+          raise TensorStream::ValueError, "Unsupported padding value #{padding_option}, valid values 'SAME', 'VALID'"
+        end
+      end
+
+      def calc_pad(w, stride, f_shape, ceil = false)
+        r = ((w / stride - 1) * stride - w + f_shape)
+        if ceil
+          r.odd? ? r / 2 + 1 : r / 2
+        else
+          r / 2
+        end
+      end
     end
   end
 end
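A quick check of calc_pad with the same hypothetical 28x28 / 5x5 / stride-2 case: r = ((28 / 2 - 1) * 2 - 28 + 5) = 3, so the floor half goes to the top/left and the ceil half to the bottom/right, reproducing the [1, 1, 2, 2] padding used above:

    calc_pad(28, 2, 5)        # => 1  (floor half of r = 3)
    calc_pad(28, 2, 5, true)  # => 2  (r is odd, so r / 2 + 1)
    calc_pad(28, 1, 6)        # => 2  (r = 5; matches the 6x6 first layer in mnist_data_3.0.rb)
    calc_pad(28, 1, 6, true)  # => 3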
data/lib/tensor_stream/opencl/opencl_evaluator.rb
CHANGED
@@ -112,6 +112,7 @@ module TensorStream
       result = complete_eval(tensor, execution_context)
       # puts "-------------------wait finish------------------------"
       _opencl_queue.finish
+      # puts "-------------------done finish------------------------"
       read_final_result(result)
     end
 
@@ -170,6 +171,7 @@ module TensorStream
       events = build_event_wait_list([buffer])
       # puts "** wait #{tensor.name} **"
       OpenCL.wait_for_events(events) unless events.empty?
+      # puts "** done #{tensor.name} **"
       buffer
     end
 
@@ -449,6 +451,7 @@ module TensorStream
       events = build_event_wait_list(inputs)
       # puts "** wait for event flow_group**"
       OpenCL.wait_for_events(events) unless events.empty?
+      # puts "** done for event flow_group**"
       nil
     end
 
@@ -461,9 +464,7 @@ module TensorStream
       return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
       return @context[cache_key] if @context.key?(cache_key)
 
-      # puts "opencl eval #{object_id} #{tensor.name}"
       invoke(tensor, child_context).tap do |result|
-        # puts "result done opencl #{object_id}: #{tensor.name}"
         if tensor.breakpoint
           a = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
           b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
@@ -569,6 +570,9 @@ module TensorStream
         [m, result_shape.reduce(:*) / m]
       elsif result_shape.size <= 2
         [m || 1, n || 1]
+      elsif (b.shape.size == 1) && (result_shape.last == b.shape.last)
+        last_dim = b.shape.last
+        [result_shape.reduce(:*) / last_dim, last_dim]
       else
         raise "rank > 2 not supported for now"
       end
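The new elsif covers broadcasting a rank-1 operand (typically a bias vector) against a higher-rank result by collapsing all leading dimensions into the first work dimension. A sketch with hypothetical shapes:

    result_shape = [100, 14, 14, 4]   # e.g. a conv2d output
    b_shape = [4]                     # rank-1 bias matching the last dimension
    last_dim = b_shape.last
    [result_shape.reduce(:*) / last_dim, last_dim]  # => [19600, 4]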
@@ -614,9 +618,15 @@ module TensorStream
       output_buffer = _create_result_buffer(tensor.data_type, p.shape, tensor.name)
 
       m, n = p.shape
-
-
-
+
+      work_group = if p.shape.size > 2
+                     [m, p.shape.reduce(:*) / m]
+                   else
+                     [ m || 1, n || 1]
+                   end
+
+      cl_m = OpenCL::Int1.new(work_group[0])
+      cl_n = OpenCL::Int1.new(work_group[1])
 
       event_wait_list = build_event_wait_list([a, b, p]) # add dependency wait list
       output_buffer.op = _cl_program(op_name.to_s, dtype: dtype).
data/samples/mnist_data_2.2.rb
CHANGED
@@ -29,19 +29,19 @@ M = 60
 N = 30
 
 
-w1 = tf.variable(tf.
+w1 = tf.variable(tf.truncated_normal([784, K], stddev: 0.1))
 b1 = tf.variable(tf.ones([K])/10)
 
-w2 = tf.variable(tf.
+w2 = tf.variable(tf.truncated_normal([K, L], stddev: 0.1))
 b2 = tf.variable(tf.ones([L])/10)
 
-w3 = tf.variable(tf.
+w3 = tf.variable(tf.truncated_normal([L, M], stddev: 0.1))
 b3 = tf.variable(tf.ones([M])/10)
 
-w4 = tf.variable(tf.
+w4 = tf.variable(tf.truncated_normal([M, N], stddev: 0.1))
 b4 = tf.variable(tf.ones([N])/10)
 
-w5 = tf.variable(tf.
+w5 = tf.variable(tf.truncated_normal([N, 10], stddev: 0.1))
 b5 = tf.variable(tf.zeros([10]))
 
 x_ = tf.reshape(x, [-1, 784])
data/samples/mnist_data_2.3.rb
CHANGED
@@ -1,6 +1,7 @@
 # A ruby port of the example code discussed by Martin Gorner in
 # "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)""
 #
+# Five Layers with relu decay
 # https://www.youtube.com/watch?v=u4alGiomYP4
 #
 # Requirements:
@@ -35,19 +36,19 @@ M = 60
 N = 30
 
 
-w1 = tf.variable(tf.
+w1 = tf.variable(tf.truncated_normal([784, K], stddev: 0.1))
 b1 = tf.variable(tf.ones([K])/10)
 
-w2 = tf.variable(tf.
+w2 = tf.variable(tf.truncated_normal([K, L], stddev: 0.1))
 b2 = tf.variable(tf.ones([L])/10)
 
-w3 = tf.variable(tf.
+w3 = tf.variable(tf.truncated_normal([L, M], stddev: 0.1))
 b3 = tf.variable(tf.ones([M])/10)
 
-w4 = tf.variable(tf.
+w4 = tf.variable(tf.truncated_normal([M, N], stddev: 0.1))
 b4 = tf.variable(tf.ones([N])/10)
 
-w5 = tf.variable(tf.
+w5 = tf.variable(tf.truncated_normal([N, 10], stddev: 0.1))
 b5 = tf.variable(tf.zeros([10]))
 
 x_ = tf.reshape(x, [-1, 784])
data/samples/mnist_data_3.0.rb
ADDED
@@ -0,0 +1,145 @@
+# A ruby port of the example code discussed by Martin Gorner in
+# "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)""
+#
+# https://www.youtube.com/watch?v=u4alGiomYP4
+#
+# Requirements:
+#   mnist-learn gem
+#   opencl_ruby_ffi gem
+require "bundler/setup"
+require 'tensor_stream'
+require 'mnist-learn'
+require 'pry-byebug'
+
+# Enable OpenCL hardware accelerated computation, not using OpenCL can be very slow
+require 'tensor_stream/opencl'
+
+tf = TensorStream
+puts "Tensorstream version #{tf.__version__} with OpenCL lib #{TensorStream::Opencl::VERSION}"
+
+# Import MNIST data
+puts "downloading minst data"
+# Download images and labels into mnist.test (10K images+labels) and mnist.train (60K images+labels)
+mnist = Mnist.read_data_sets('/tmp/data', one_hot: true)
+puts "downloading finished"
+
+# neural network structure for this sample:
+#
+# · · · · · · · · · ·      (input data, 1-deep)              X [batch, 28, 28, 1]
+# @ @ @ @ @ @ @ @ @ @   -- conv. layer 5x5x1=>4 stride 1     W1 [5, 5, 1, 4]    B1 [4]
+# ∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶                                        Y1 [batch, 28, 28, 4]
+#   @ @ @ @ @ @ @ @     -- conv. layer 5x5x4=>8 stride 2     W2 [5, 5, 4, 8]    B2 [8]
+#   ∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶                                          Y2 [batch, 14, 14, 8]
+#     @ @ @ @ @ @       -- conv. layer 4x4x8=>12 stride 2    W3 [4, 4, 8, 12]   B3 [12]
+#     ∶∶∶∶∶∶∶∶∶∶∶                                            Y3 [batch, 7, 7, 12] => reshaped to YY [batch, 7*7*12]
+#      \x/x\x\x/        -- fully connected layer (relu)      W4 [7*7*12, 200]   B4 [200]
+#       · · · ·                                              Y4 [batch, 200]
+#       \x/x\x/         -- fully connected layer (softmax)   W5 [200, 10]       B5 [10]
+#        · · ·                                               Y [batch, 10]
+
+
+# input X: 28x28 grayscale images, the first dimension (None) will index the images in the mini-batch
+x = tf.placeholder(:float32, shape: [nil, 28, 28, 1])
+
+# correct answers will go here
+y_ = tf.placeholder(:float32, shape: [nil, 10])
+
+# step for variable learning rate
+step = tf.placeholder(:int32)
+
+pkeep = tf.placeholder(tf.float32)
+
+# three convolutional layers with their channel counts, and a
+# fully connected layer (tha last layer has 10 softmax neurons)
+
+K = 4 # first convolutional layer output depth
+L = 8 # second convolutional layer output depth
+M = 12 # third convolutional layer
+N = 200 # fully connected layer
+
+
+w1 = tf.variable(tf.truncated_normal([6, 6, 1, K], stddev: 0.1))
+b1 = tf.variable(tf.ones([K])/10)
+
+w2 = tf.variable(tf.truncated_normal([5, 5, K, L], stddev: 0.1))
+b2 = tf.variable(tf.ones([L])/10)
+
+w3 = tf.variable(tf.truncated_normal([4, 4, L, M], stddev: 0.1))
+b3 = tf.variable(tf.ones([M])/10)
+
+w4 = tf.variable(tf.truncated_normal([7 * 7 * M, N], stddev: 0.1))
+b4 = tf.variable(tf.ones([N])/10)
+
+w5 = tf.variable(tf.truncated_normal([N, 10], stddev: 0.1))
+b5 = tf.variable(tf.ones([10])/10)
+
+# The model
+stride = 1 # output is 28x28
+y1 = tf.nn.relu(tf.nn.conv2d(tf.reshape(x, [-1, 28, 28, 1]), w1, [1, stride, stride, 1], 'SAME') + b1)
+stride = 2 # output is 14x14
+y2 = tf.nn.relu(tf.nn.conv2d(y1, w2, [1, stride, stride, 1], 'SAME') + b2)
+stride = 2 # output is 7x7
+y3 = tf.nn.relu(tf.nn.conv2d(y2, w3, [1, stride, stride, 1], 'SAME') + b3)
+
+# reshape the output from the third convolution for the fully connected layer
+yy = tf.reshape(y3, [-1, 7 * 7 * M])
+y4 = tf.nn.relu(tf.matmul(yy, w4) + b4)
+
+# dropout to prevent overfitting
+yy4 = tf.nn.dropout(y4, pkeep)
+
+ylogits = tf.matmul(yy4, w5) + b5
+
+# model
+y = tf.nn.softmax(ylogits)
+
+
+
+# training step, learning rate = 0.003
+
+
+# cross-entropy loss function (= -sum(Y_i * log(Yi)) ), normalised for batches of 100 images
+# TensorFlow provides the softmax_cross_entropy_with_logits function to avoid numerical stability
+# problems with log(0) which is NaN
+cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits: ylogits, labels: y_)
+cross_entropy = tf.reduce_mean(cross_entropy)*100
+
+is_correct = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+accuracy = tf.reduce_mean(tf.cast(is_correct, :float32))
+
+# training step, learning rate = 0.003
+lr = 0.0001.t + tf.train.exponential_decay(0.003, step, 2000, 1/Math::E)
+train_step = TensorStream::Train::AdamOptimizer.new(lr).minimize(cross_entropy)
+
+sess = tf.session
+# Add ops to save and restore all the variables.
+
+init = tf.global_variables_initializer
+
+sess.run(init)
+mnist_train = mnist.train
+test_data = { x => mnist.test.images, y_ => mnist.test.labels, pkeep => 1.0 }
+
+
+(0..10001).each do |i|
+  # load batch of images and correct answers
+  batch_x, batch_y = mnist_train.next_batch(100)
+  train_data = { x => batch_x, y_ => batch_y, step => i, pkeep => 0.75 }
+
+  # train
+  sess.run(train_step, feed_dict: train_data)
+
+  if (i % 10 == 0)
+    # File.write("profile.json", TensorStream::ReportTool.profile_for(sess).to_json)
+    # success? add code to print it
+    a_train, c_train, l = sess.run([accuracy, cross_entropy, lr], feed_dict: { x => batch_x, y_ => batch_y, step => i, pkeep => 1.0})
+    puts "#{i}: accuracy:#{a_train} loss:#{c_train} (lr:#{l})"
+  end
+
+  if (i % 100 == 0)
+    # success on test data?
+    a_test, c_test = sess.run([accuracy, cross_entropy], feed_dict: test_data, pkeep => 1.0)
+    puts("#{i}: ******** test accuracy: #{a_test} test loss: #{c_test}")
+  end
+end
+
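Assuming the gem's repository layout, the new sample can be run from the project root with `bundle exec ruby samples/mnist_data_3.0.rb`; it expects the mnist-learn and opencl_ruby_ffi gems listed in its header.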
data/tensor_stream-opencl.gemspec
CHANGED
@@ -39,7 +39,7 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "awesome_print"
   spec.add_development_dependency "mnist-learn"
   spec.add_development_dependency "simplecov"
-  spec.add_dependency "tensor_stream", "~> 0.9.
+  spec.add_dependency "tensor_stream", "~> 0.9.8"
   spec.add_dependency "opencl_ruby_ffi"
   spec.add_dependency "oily_png"
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tensor_stream-opencl
 version: !ruby/object:Gem::Version
-  version: 0.2.3
+  version: 0.2.4
 platform: ruby
 authors:
 - Joseph Dayo
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-11-
+date: 2018-11-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -114,14 +114,14 @@ dependencies:
   requirements:
   - - "~>"
   - !ruby/object:Gem::Version
-    version: 0.9.
+    version: 0.9.8
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
   requirements:
   - - "~>"
   - !ruby/object:Gem::Version
-    version: 0.9.
+    version: 0.9.8
 - !ruby/object:Gem::Dependency
   name: opencl_ruby_ffi
   requirement: !ruby/object:Gem::Requirement
@@ -252,6 +252,7 @@ files:
 - samples/mnist_data_2.1.rb
 - samples/mnist_data_2.2.rb
 - samples/mnist_data_2.3.rb
+- samples/mnist_data_3.0.rb
 - samples/multigpu.rb
 - samples/nearest_neighbor.rb
 - samples/rnn.rb