tensor_stream-opencl 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7888f5385dcec4b9a747128d1b25d0cb6d7fb01ab12f1c1b70706d0df5770903
4
- data.tar.gz: 1129196d93a7b194d9de888fe49b43b0839c00589fb0987c0841ad1ac1693a4c
3
+ metadata.gz: 3e4aa123289372c651cd4da3e7c206abc4f9f67a551d4062180c5cf6555dc243
4
+ data.tar.gz: 6517954207c85f56cd08b2892b0119d4bb7a35e2d4bd9b9cacc5d3c9ccfb9e42
5
5
  SHA512:
6
- metadata.gz: 727d97b9aa1402ed9681eb71fe0c0cdcc966e8a9e2f08b7480fdefa503509e26872b187a0b184d31a5b0cee86dc96e1d97f4fce14b378fa4a7c8ce4679b06421
7
- data.tar.gz: 0e2f1601543d582042d0911222c1209f2df2c08139b7acc8ee347b5d7e236cd451a1b00032fb75d6f532105fb8f4ee49e53614071d111e0fabc8ed44010c4301
6
+ metadata.gz: 7f61d61be79dd1e06ebfdc77ed2dff9e717e0cdb292160fe20c9ca08693d867e1b0e0350c71db5d24feb4671a26e793f44d6b80762c384193c1985b6b1616376
7
+ data.tar.gz: 72c32530717fac8ff947ce4b204535755134bde14e0f70d0d120ff101b5654843312186317cb480fd5e1c620a25328a3590b1f35193faf1d196e7ad631d169b0
@@ -422,10 +422,16 @@ module TensorStream
422
422
  a = inputs[0]
423
423
  if a.data_type != tensor.data_type
424
424
  buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
425
- m, n = a.shape
426
- cl_m = OpenCL::Int1.new(m || 1)
427
- cl_n = OpenCL::Int1.new(n || 1)
428
- work_group = [m || 1, n || 1]
425
+ work_group = if inputs[0].shape.size > 2
426
+ [ inputs[0].shape.reduce(:*) / inputs[0].shape.last, inputs[0].shape.last]
427
+ else
428
+ m, n = inputs[0].shape
429
+ [m || 1, n || 1]
430
+ end
431
+
432
+ cl_m = OpenCL::Int1.new(work_group[0])
433
+ cl_n = OpenCL::Int1.new(work_group[1])
434
+
429
435
  event_wait_list = build_event_wait_list(inputs)
430
436
  buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
431
437
  buffer
@@ -1,27 +1,30 @@
1
1
  % ctype = dtype_to_c_type(dtype)
2
2
 
3
- __kernel void conv2d(const int height, const int width, __global const <%= ctype %> *images, __global const <%= ctype %> *filter, __global <%= ctype %> *output) {
3
+ __kernel void conv2d(const int height, const int width, const int out_height, const int out_width, __global const <%= ctype %> *images, __global const <%= ctype %> *filter, __global <%= ctype %> *output) {
4
4
  // Get the index of the current element to be processed
5
5
  const int batch_index = get_global_id(0);
6
6
  const int h_index = get_global_id(1);
7
7
  const int w_index = get_global_id(2);
8
- const int h_index_with_stride = h_index * <%= stride[0] %>;
9
- const int w_index_with_stride = w_index * <%= stride[1] %>;
8
+ const int h_index_with_stride = h_index * <%= stride[0] %> - <%= padding[0] %>;
9
+ const int w_index_with_stride = w_index * <%= stride[1] %> - <%= padding[1] %>;
10
10
 
11
11
  const int image_index = batch_index * height * width * <%= ch %>;
12
12
  const int image_row_width = width * <%= ch %>;
13
+ const int out_image_row_size = out_height * out_width * <%= out_ch %>;
13
14
 
14
15
  for (int out_channel_index = 0; out_channel_index < <%= out_ch %>; out_channel_index++) {
15
16
  <%= ctype %> sum = 0;
16
17
  for (int channel_index = 0; channel_index < <%= ch %>; channel_index++) {
17
18
  for(int y = 0; y < <%= fh %>; y++) {
18
19
  for (int x = 0; x < <%= fw %>; x++) {
19
- if ( (h_index_with_stride + y) < height && (w_index_with_stride + x) < width) {
20
- sum += images[image_index + (h_index_with_stride + y)*image_row_width + (w_index_with_stride + x)*<%= ch %> + channel_index] * filter[y*<%= fw * ch * out_ch %> + x*<%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
20
+ if ( (h_index_with_stride + y) < height && (w_index_with_stride + x) < width &&
21
+ (h_index_with_stride + y) >= 0 && (w_index_with_stride + x) >=0) {
22
+ <%= ctype %> f = filter[y*<%= fw * ch * out_ch %> + x*<%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
23
+ sum += images[image_index + (h_index_with_stride + y)*image_row_width + (w_index_with_stride + x)*<%= ch %> + channel_index] * f;
21
24
  }
22
25
  }
23
26
  }
24
27
  }
25
- output[batch_index * (height/<%= stride[0] %>) * (width/<%= stride[1] %>) * <%= out_ch %> + h_index * (width/<%= stride[1] %>) * <%= out_ch %> + w_index * <%= out_ch %> + out_channel_index ] = sum;
28
+ output[batch_index * out_image_row_size + h_index * out_width * <%= out_ch %> + w_index * <%= out_ch %> + out_channel_index ] = sum;
26
29
  }
27
30
  }
@@ -1,21 +1,31 @@
1
1
  % ctype = dtype_to_c_type(dtype)
2
2
 
3
- __kernel void conv2d_backprop_filter(const int batch_size, const int height, const int width, __global const <%= ctype %> *images, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
3
+ __kernel void conv2d_backprop_filter(const int batch_size, const int height, const int width, const int out_height, const int out_width, __global const <%= ctype %> *images, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
4
4
  // Get the index of the current element to be processed
5
5
  const int fh_index = get_global_id(0);
6
6
  const int fw_index = get_global_id(1);
7
7
  const int f_out_channel = get_global_id(2);
8
8
  const int image_size = height * width * <%= ch %>;
9
- const int grad_image_row_width = width * <%= out_ch %>;
9
+ const int grad_image_row_width = out_width * <%= out_ch %>;
10
+ const int grad_image_size = out_height * out_width * <%= out_ch %>;
10
11
 
11
12
  for(int channel = 0; channel < <%= ch %>; channel++) {
12
13
  <%= ctype %> grad_sum = 0.0;
13
14
  for(int batch = 0; batch < batch_size; batch++) {
14
- const int image_index = batch * height * width * <%= out_ch %>;
15
+ int image_index = batch * grad_image_size;
15
16
  for(int y = 0; y < height; y++) {
16
17
  for (int x = 0; x < width; x++) {
17
- if ( ((y - fh_index) % <%= stride[0]%>) == 0 && ((x - fw_index) % <%= stride[1]%>) == 0 && fh_index <= y && fw_index <= x) {
18
- const <%= ctype %> image_grad = grad[image_index + ((y - fh_index) / <%= stride[0] %>) * grad_image_row_width + ((x - fw_index) / <%= stride[1] %>) * <%= out_ch %> + f_out_channel];
18
+ int y_offset = y - fh_index + <%= padding[0] %>;
19
+ int x_offset = x - fw_index + <%= padding[1] %>;
20
+ int y_offset_end = y + (<%= fh %> - fh_index - 1) - <%= padding[2] %>;
21
+ int x_offset_end = x + (<%= fw %> - fw_index - 1) - <%= padding[3] %>;
22
+
23
+ if ( (y_offset % <%= stride[0]%>) == 0
24
+ && (x_offset % <%= stride[1]%>) == 0
25
+ && (y_offset >=0) && (x_offset >= 0)
26
+ && (y_offset_end < height)
27
+ && (x_offset_end < width)) {
28
+ <%= ctype %> image_grad = grad[image_index + (y_offset / <%= stride[0] %>) * grad_image_row_width + ( x_offset / <%= stride[1] %>) * <%= out_ch %> + f_out_channel];
19
29
  grad_sum += images[batch * image_size + y * width * <%= ch %> + x * <%= ch %> + channel] * image_grad;
20
30
  }
21
31
  }
@@ -1,6 +1,6 @@
1
1
  % ctype = dtype_to_c_type(dtype)
2
2
 
3
- __kernel void conv2d_backprop_input(const int height, const int width, __global const <%= ctype %> *filter, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
3
+ __kernel void conv2d_backprop_input(const int height, const int width, const int out_height, const int out_width, __global const <%= ctype %> *filter, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
4
4
  // Get the index of the current element to be processed
5
5
  int batch_index = get_global_id(0);
6
6
  int h_index = get_global_id(1); // orig image y
@@ -8,8 +8,8 @@ __kernel void conv2d_backprop_input(const int height, const int width, __global
8
8
 
9
9
  int h_index_with_stride = h_index / <%= stride[0] %>;
10
10
  int w_index_with_stride = w_index / <%= stride[1] %>;
11
- int grad_height = height / <%= stride[0] %>;
12
- int grad_width = width / <%= stride[1] %>;
11
+ int grad_height = out_height;
12
+ int grad_width = out_width;
13
13
 
14
14
  int image_index = batch_index * grad_height * grad_width * <%= out_ch %>;
15
15
  int image_row_width = grad_width * <%= out_ch %>;
@@ -19,8 +19,16 @@ __kernel void conv2d_backprop_input(const int height, const int width, __global
19
19
  for (int out_channel_index = 0; out_channel_index < <%= out_ch %>; out_channel_index++) {
20
20
  for(int y = 0; y < <%= fh %>; y++) {
21
21
  for (int x = 0; x < <%= fw %>; x++) {
22
- if ( (y <= h_index) && (x <= w_index) && ( (h_index - y) % <%= stride[0]%> == 0) && ( (w_index - x) % <%= stride[1]%> == 0)) {
23
- <%= ctype %> imag_grad = grad[image_index + ( (h_index - y) / <%= stride[0] %>) * image_row_width + ( (w_index - x) / <%= stride[1] %>) * <%= out_ch %> + out_channel_index];
22
+ int y_offset = h_index - y + <%= padding[0] %>;
23
+ int x_offset = w_index - x + <%= padding[1] %>;
24
+
25
+ if ( ( y_offset >= 0) && (x_offset >= 0) &&
26
+ ( y_offset % <%= stride[0]%> == 0) &&
27
+ ( x_offset % <%= stride[1]%> == 0) &&
28
+ ( h_index + (<%= fh %> - y - 1) < (height + <%= padding[2] %>)) &&
29
+ ( w_index + (<%= fw %> - x - 1) < (width + <%= padding[3] %>))
30
+ ) {
31
+ <%= ctype %> imag_grad = grad[image_index + ( y_offset / <%= stride[0] %>) * image_row_width + ( x_offset / <%= stride[1] %>) * <%= out_ch %> + out_channel_index];
24
32
  g += imag_grad * filter[y * <%= fw * ch * out_ch %> + x * <%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
25
33
  }
26
34
  }
@@ -14,10 +14,15 @@ module TensorStream
14
14
  if inputs.size == 1
15
15
  inputs[0]
16
16
  else
17
- m, n = inputs[0].shape
18
- work_group = [m || 1, n || 1]
19
- cl_m = OpenCL::Int1.new(m || 1)
20
- cl_n = OpenCL::Int1.new(n || 1)
17
+ work_group = if inputs[0].shape.size > 2
18
+ [ inputs[0].shape.reduce(:*) / inputs[0].shape.last, inputs[0].shape.last]
19
+ else
20
+ m, n = inputs[0].shape
21
+ [m || 1, n || 1]
22
+ end
23
+
24
+ cl_m = OpenCL::Int1.new(work_group[0])
25
+ cl_n = OpenCL::Int1.new(work_group[1])
21
26
  cl_switch = OpenCL::Int1.new(0)
22
27
  dtype = tensor.data_type
23
28
 
@@ -68,6 +73,7 @@ module TensorStream
68
73
 
69
74
  raise "#{tensor.inputs[0].name} rank must be greater than 1" if a.shape.size < 2
70
75
  raise "#{tensor.inputs[1].name} rank must be greater than 1" if b.shape.size < 2
76
+ raise "#{tensor.inputs[0].name} unsupported rank" if b.shape.size != 2 || a.shape.size!=2
71
77
  raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
72
78
 
73
79
  dtype = tensor.data_type
@@ -162,6 +168,7 @@ module TensorStream
162
168
 
163
169
  axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
164
170
  input = complete_eval(value, child_context)
171
+
165
172
  value = value.buffer.reshape(*value.shape.reverse)
166
173
  rank = input.shape.size - 1
167
174
 
@@ -220,6 +220,9 @@ module TensorStream
220
220
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
221
221
 
222
222
  m, n = a.shape
223
+
224
+ raise "unsupported rank " if a.shape.size > 2
225
+
223
226
  work_group = [m]
224
227
  n = m if n.nil?
225
228
  cl_n = OpenCL::Int1.new(n || 1)
@@ -236,6 +239,9 @@ module TensorStream
236
239
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
237
240
 
238
241
  m, n = a.shape
242
+
243
+ raise "unsupported rank " if a.shape.size > 2
244
+
239
245
  work_group = [m]
240
246
  n = m if n.nil?
241
247
  cl_n = OpenCL::Int1.new(n || 1)
@@ -254,6 +260,9 @@ module TensorStream
254
260
  output_buffer_backprop = _create_result_buffer(tensor.data_type, a.shape, "#{tensor.name}_2")
255
261
  rank = a.shape.size - 1
256
262
  m, n = a.shape
263
+
264
+ raise "unsupported rank " if a.shape.size > 2
265
+
257
266
  work_group = [m]
258
267
  n = m if n.nil?
259
268
  cl_n = OpenCL::Int1.new(n || 1)
@@ -276,6 +285,9 @@ module TensorStream
276
285
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
277
286
 
278
287
  m, n = a.shape
288
+
289
+ raise "unsupported rank " if a.shape.size > 2
290
+
279
291
  work_group = [m]
280
292
  n = m if n.nil?
281
293
  cl_n = OpenCL::Int1.new(n || 1)
@@ -305,6 +317,9 @@ module TensorStream
305
317
  output_buffer_backprop = _create_result_buffer(tensor.data_type, a.shape, "#{tensor.name}_2")
306
318
  rank = a.shape.size - 1
307
319
  m, n = a.shape
320
+
321
+ raise "unsupported rank " if a.shape.size > 2
322
+
308
323
  work_group = [m]
309
324
  n = m if n.nil?
310
325
  cl_n = OpenCL::Int1.new(n || 1)
@@ -326,6 +341,7 @@ module TensorStream
326
341
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
327
342
 
328
343
  m, n = a.shape
344
+ raise "unsupported rank " if a.shape.size > 2
329
345
  work_group = [m]
330
346
  n = m if n.nil?
331
347
  cl_n = OpenCL::Int1.new(n || 1)
@@ -353,19 +369,29 @@ module TensorStream
353
369
 
354
370
  raise TensorStream::ValueError, " Current implementation does not yet support strides in the batch and depth dimensions." if strides[0] != 1 || strides[3] != 1
355
371
 
372
+ padding_option = tensor.options[:padding]
373
+ padding = conv2d_padding_options(padding_option, filter_shape, height, width, height_stride, width_stride)
356
374
  event_wait_list = build_event_wait_list(inputs)
357
375
 
358
- f_height, f_width, in_channels, out_channels = filter_shape
359
- out_shape = [batch, height / height_stride, width / width_stride, out_channels]
376
+ f_height, f_width, _in_channels, out_channels = filter_shape
377
+
378
+ out_h = (height - f_height + (padding[0] + padding[2])) / height_stride + 1
379
+ out_w = (width - f_width + (padding[1] + padding[3])) / width_stride + 1
380
+
381
+ out_shape = [batch, out_h, out_w, out_channels]
360
382
  output_buffer = _create_result_buffer(tensor.data_type, out_shape, tensor.name)
361
383
 
362
384
  cl_image_height = OpenCL::Int1.new(height)
363
385
  cl_image_width = OpenCL::Int1.new(width)
386
+ cl_out_height = OpenCL::Int1.new(out_h)
387
+ cl_out_width = OpenCL::Int1.new(out_w)
364
388
 
365
- work_dimen = [batch, height / height_stride, width / width_stride]
389
+ work_dimen = [batch, out_h, out_w]
366
390
 
367
- output_buffer.op = _cl_program("conv2d", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channel, out_ch: out_channels, stride: [height_stride, width_stride] ).send(:conv2d, _opencl_queue, work_dimen, cl_image_height, cl_image_width, inputs[0].cl_buffer,
368
- inputs[1].cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
391
+ output_buffer.op = _cl_program("conv2d", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channel, out_ch: out_channels, stride: [height_stride, width_stride], padding: padding).
392
+ send(:conv2d, _opencl_queue, work_dimen, cl_image_height, cl_image_width,
393
+ cl_out_height, cl_out_width, inputs[0].cl_buffer,
394
+ inputs[1].cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
369
395
  output_buffer
370
396
  end
371
397
 
@@ -385,13 +411,22 @@ module TensorStream
385
411
  batch, height, width, channels = image_shape
386
412
  f_height, f_width, in_channels, out_channels = filter_shape
387
413
 
414
+ padding_option = tensor.options[:padding]
415
+ padding = conv2d_padding_options(padding_option, filter_shape, height, width, height_stride, width_stride)
388
416
  work_dimen = [batch, height, width]
389
417
 
418
+ out_h = (height - f_height + (padding[0] + padding[2])) / height_stride + 1
419
+ out_w = (width - f_width + (padding[1] + padding[3])) / width_stride + 1
420
+
390
421
  cl_image_height = OpenCL::Int1.new(height)
391
422
  cl_image_width = OpenCL::Int1.new(width)
423
+ cl_out_height = OpenCL::Int1.new(out_h)
424
+ cl_out_width = OpenCL::Int1.new(out_w)
392
425
 
393
- output_buffer.op = _cl_program("conv2d_backprop_input", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: out_channels, stride: [height_stride, width_stride] ).send(:conv2d_backprop_input, _opencl_queue, work_dimen, cl_image_height, cl_image_width,
394
- filter.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
426
+ output_buffer.op = _cl_program("conv2d_backprop_input", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: out_channels, stride: [height_stride, width_stride], padding: padding).
427
+ send(:conv2d_backprop_input, _opencl_queue, work_dimen, cl_image_height, cl_image_width,
428
+ cl_out_height, cl_out_width, filter.cl_buffer, grad.cl_buffer,
429
+ output_buffer.cl_buffer, event_wait_list: event_wait_list)
395
430
  output_buffer
396
431
  end
397
432
 
@@ -411,14 +446,48 @@ module TensorStream
411
446
  f_height, f_width, input_channels, output_channels = filter_shape
412
447
  work_dimen = [f_height, f_width, output_channels]
413
448
 
449
+ padding_option = tensor.options[:padding]
450
+ padding = conv2d_padding_options(padding_option, filter_shape, height, width, height_stride, width_stride)
451
+
452
+ out_h = (height - f_height + (padding[0] + padding[2])) / height_stride + 1
453
+ out_w = (width - f_width + (padding[1] + padding[3])) / width_stride + 1
454
+
414
455
  cl_batch_size = OpenCL::Int1.new(batch_size)
415
456
  cl_image_height = OpenCL::Int1.new(height)
416
457
  cl_image_width = OpenCL::Int1.new(width)
458
+ cl_out_height = OpenCL::Int1.new(out_h)
459
+ cl_out_width = OpenCL::Int1.new(out_w)
417
460
 
418
- output_buffer.op = _cl_program("conv2d_backprop_filter", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: output_channels, stride: [height_stride, width_stride] ).send(:conv2d_backprop_filter, _opencl_queue, work_dimen, cl_batch_size, cl_image_height, cl_image_width,
419
- images.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
461
+ output_buffer.op = _cl_program("conv2d_backprop_filter", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: output_channels, stride: [height_stride, width_stride], padding: padding ).
462
+ send(:conv2d_backprop_filter, _opencl_queue, work_dimen, cl_batch_size, cl_image_height, cl_image_width,
463
+ cl_out_height, cl_out_width, images.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
420
464
  output_buffer
421
465
  end
466
+
467
+ def conv2d_padding_options(padding_option, filter_shape, height, width, h_stride, w_stride)
468
+ case padding_option
469
+ when 'SAME'
470
+ [
471
+ calc_pad(height, h_stride, filter_shape[0]),
472
+ calc_pad(width, w_stride, filter_shape[1]),
473
+ calc_pad(height, h_stride, filter_shape[0], true),
474
+ calc_pad(width, w_stride, filter_shape[1], true)
475
+ ]
476
+ when 'VALID'
477
+ [0, 0, 0, 0]
478
+ else
479
+ raise TensorStream::ValueError, "Unsupported padding value #{padding_option}, valid values 'SAME', 'VALID'"
480
+ end
481
+ end
482
+
483
+ def calc_pad(w, stride, f_shape, ceil = false)
484
+ r = ((w / stride - 1) * stride - w + f_shape)
485
+ if ceil
486
+ r.odd? ? r / 2 + 1 : r / 2
487
+ else
488
+ r / 2
489
+ end
490
+ end
422
491
  end
423
492
  end
424
493
  end
@@ -112,6 +112,7 @@ module TensorStream
112
112
  result = complete_eval(tensor, execution_context)
113
113
  # puts "-------------------wait finish------------------------"
114
114
  _opencl_queue.finish
115
+ # puts "-------------------done finish------------------------"
115
116
  read_final_result(result)
116
117
  end
117
118
 
@@ -170,6 +171,7 @@ module TensorStream
170
171
  events = build_event_wait_list([buffer])
171
172
  # puts "** wait #{tensor.name} **"
172
173
  OpenCL.wait_for_events(events) unless events.empty?
174
+ # puts "** done #{tensor.name} **"
173
175
  buffer
174
176
  end
175
177
 
@@ -449,6 +451,7 @@ module TensorStream
449
451
  events = build_event_wait_list(inputs)
450
452
  # puts "** wait for event flow_group**"
451
453
  OpenCL.wait_for_events(events) unless events.empty?
454
+ # puts "** done for event flow_group**"
452
455
  nil
453
456
  end
454
457
 
@@ -461,9 +464,7 @@ module TensorStream
461
464
  return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
462
465
  return @context[cache_key] if @context.key?(cache_key)
463
466
 
464
- # puts "opencl eval #{object_id} #{tensor.name}"
465
467
  invoke(tensor, child_context).tap do |result|
466
- # puts "result done opencl #{object_id}: #{tensor.name}"
467
468
  if tensor.breakpoint
468
469
  a = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
469
470
  b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
@@ -569,6 +570,9 @@ module TensorStream
569
570
  [m, result_shape.reduce(:*) / m]
570
571
  elsif result_shape.size <= 2
571
572
  [m || 1, n || 1]
573
+ elsif (b.shape.size == 1) && (result_shape.last == b.shape.last)
574
+ last_dim = b.shape.last
575
+ [result_shape.reduce(:*) / last_dim, last_dim]
572
576
  else
573
577
  raise "rank > 2 not supported for now"
574
578
  end
@@ -614,9 +618,15 @@ module TensorStream
614
618
  output_buffer = _create_result_buffer(tensor.data_type, p.shape, tensor.name)
615
619
 
616
620
  m, n = p.shape
617
- work_group = [m || 1, n || 1]
618
- cl_m = OpenCL::Int1.new(m || 1)
619
- cl_n = OpenCL::Int1.new(n || 1)
621
+
622
+ work_group = if p.shape.size > 2
623
+ [m, p.shape.reduce(:*) / m]
624
+ else
625
+ [ m || 1, n || 1]
626
+ end
627
+
628
+ cl_m = OpenCL::Int1.new(work_group[0])
629
+ cl_n = OpenCL::Int1.new(work_group[1])
620
630
 
621
631
  event_wait_list = build_event_wait_list([a, b, p]) # add dependency wait list
622
632
  output_buffer.op = _cl_program(op_name.to_s, dtype: dtype).
@@ -1,5 +1,5 @@
1
1
  module TensorStream
2
2
  module Opencl
3
- VERSION = "0.2.3"
3
+ VERSION = "0.2.4"
4
4
  end
5
5
  end
@@ -29,19 +29,19 @@ M = 60
29
29
  N = 30
30
30
 
31
31
 
32
- w1 = tf.variable(tf.random_normal([784, K]))
32
+ w1 = tf.variable(tf.truncated_normal([784, K], stddev: 0.1))
33
33
  b1 = tf.variable(tf.ones([K])/10)
34
34
 
35
- w2 = tf.variable(tf.random_normal([K, L]))
35
+ w2 = tf.variable(tf.truncated_normal([K, L], stddev: 0.1))
36
36
  b2 = tf.variable(tf.ones([L])/10)
37
37
 
38
- w3 = tf.variable(tf.random_normal([L, M]))
38
+ w3 = tf.variable(tf.truncated_normal([L, M], stddev: 0.1))
39
39
  b3 = tf.variable(tf.ones([M])/10)
40
40
 
41
- w4 = tf.variable(tf.random_normal([M, N]))
41
+ w4 = tf.variable(tf.truncated_normal([M, N], stddev: 0.1))
42
42
  b4 = tf.variable(tf.ones([N])/10)
43
43
 
44
- w5 = tf.variable(tf.random_normal([N, 10]))
44
+ w5 = tf.variable(tf.truncated_normal([N, 10], stddev: 0.1))
45
45
  b5 = tf.variable(tf.zeros([10]))
46
46
 
47
47
  x_ = tf.reshape(x, [-1, 784])
@@ -1,6 +1,7 @@
1
1
  # A ruby port of the example code discussed by Martin Gorner in
2
2
  # "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)""
3
3
  #
4
+ # Five Layers with relu decay
4
5
  # https://www.youtube.com/watch?v=u4alGiomYP4
5
6
  #
6
7
  # Requirements:
@@ -35,19 +36,19 @@ M = 60
35
36
  N = 30
36
37
 
37
38
 
38
- w1 = tf.variable(tf.random_normal([784, K]))
39
+ w1 = tf.variable(tf.truncated_normal([784, K], stddev: 0.1))
39
40
  b1 = tf.variable(tf.ones([K])/10)
40
41
 
41
- w2 = tf.variable(tf.random_normal([K, L]))
42
+ w2 = tf.variable(tf.truncated_normal([K, L], stddev: 0.1))
42
43
  b2 = tf.variable(tf.ones([L])/10)
43
44
 
44
- w3 = tf.variable(tf.random_normal([L, M]))
45
+ w3 = tf.variable(tf.truncated_normal([L, M], stddev: 0.1))
45
46
  b3 = tf.variable(tf.ones([M])/10)
46
47
 
47
- w4 = tf.variable(tf.random_normal([M, N]))
48
+ w4 = tf.variable(tf.truncated_normal([M, N], stddev: 0.1))
48
49
  b4 = tf.variable(tf.ones([N])/10)
49
50
 
50
- w5 = tf.variable(tf.random_normal([N, 10]))
51
+ w5 = tf.variable(tf.truncated_normal([N, 10], stddev: 0.1))
51
52
  b5 = tf.variable(tf.zeros([10]))
52
53
 
53
54
  x_ = tf.reshape(x, [-1, 784])
@@ -0,0 +1,145 @@
1
+ # A ruby port of the example code discussed by Martin Gorner in
2
+ # "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)""
3
+ #
4
+ # https://www.youtube.com/watch?v=u4alGiomYP4
5
+ #
6
+ # Requirements:
7
+ # mnist-learn gem
8
+ # opencl_ruby_ffi gem
9
+ require "bundler/setup"
10
+ require 'tensor_stream'
11
+ require 'mnist-learn'
12
+ require 'pry-byebug'
13
+
14
+ # Enable OpenCL hardware accelerated computation, not using OpenCL can be very slow
15
+ require 'tensor_stream/opencl'
16
+
17
+ tf = TensorStream
18
+ puts "Tensorstream version #{tf.__version__} with OpenCL lib #{TensorStream::Opencl::VERSION}"
19
+
20
+ # Import MNIST data
21
+ puts "downloading minst data"
22
+ # Download images and labels into mnist.test (10K images+labels) and mnist.train (60K images+labels)
23
+ mnist = Mnist.read_data_sets('/tmp/data', one_hot: true)
24
+ puts "downloading finished"
25
+
26
+ # neural network structure for this sample:
27
+ #
28
+ # · · · · · · · · · · (input data, 1-deep) X [batch, 28, 28, 1]
29
+ # @ @ @ @ @ @ @ @ @ @ -- conv. layer 5x5x1=>4 stride 1 W1 [5, 5, 1, 4] B1 [4]
30
+ # ∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶ Y1 [batch, 28, 28, 4]
31
+ # @ @ @ @ @ @ @ @ -- conv. layer 5x5x4=>8 stride 2 W2 [5, 5, 4, 8] B2 [8]
32
+ # ∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶ Y2 [batch, 14, 14, 8]
33
+ # @ @ @ @ @ @ -- conv. layer 4x4x8=>12 stride 2 W3 [4, 4, 8, 12] B3 [12]
34
+ # ∶∶∶∶∶∶∶∶∶∶∶ Y3 [batch, 7, 7, 12] => reshaped to YY [batch, 7*7*12]
35
+ # \x/x\x\x/ -- fully connected layer (relu) W4 [7*7*12, 200] B4 [200]
36
+ # · · · · Y4 [batch, 200]
37
+ # \x/x\x/ -- fully connected layer (softmax) W5 [200, 10] B5 [10]
38
+ # · · · Y [batch, 10]
39
+
40
+
41
+ # input X: 28x28 grayscale images, the first dimension (None) will index the images in the mini-batch
42
+ x = tf.placeholder(:float32, shape: [nil, 28, 28, 1])
43
+
44
+ # correct answers will go here
45
+ y_ = tf.placeholder(:float32, shape: [nil, 10])
46
+
47
+ # step for variable learning rate
48
+ step = tf.placeholder(:int32)
49
+
50
+ pkeep = tf.placeholder(tf.float32)
51
+
52
+ # three convolutional layers with their channel counts, and a
53
+ # fully connected layer (the last layer has 10 softmax neurons)
54
+
55
+ K = 4 # first convolutional layer output depth
56
+ L = 8 # second convolutional layer output depth
57
+ M = 12 # third convolutional layer
58
+ N = 200 # fully connected layer
59
+
60
+
61
+ w1 = tf.variable(tf.truncated_normal([6, 6, 1, K], stddev: 0.1))
62
+ b1 = tf.variable(tf.ones([K])/10)
63
+
64
+ w2 = tf.variable(tf.truncated_normal([5, 5, K, L], stddev: 0.1))
65
+ b2 = tf.variable(tf.ones([L])/10)
66
+
67
+ w3 = tf.variable(tf.truncated_normal([4, 4, L, M], stddev: 0.1))
68
+ b3 = tf.variable(tf.ones([M])/10)
69
+
70
+ w4 = tf.variable(tf.truncated_normal([7 * 7 * M, N], stddev: 0.1))
71
+ b4 = tf.variable(tf.ones([N])/10)
72
+
73
+ w5 = tf.variable(tf.truncated_normal([N, 10], stddev: 0.1))
74
+ b5 = tf.variable(tf.ones([10])/10)
75
+
76
+ # The model
77
+ stride = 1 # output is 28x28
78
+ y1 = tf.nn.relu(tf.nn.conv2d(tf.reshape(x, [-1, 28, 28, 1]), w1, [1, stride, stride, 1], 'SAME') + b1)
79
+ stride = 2 # output is 14x14
80
+ y2 = tf.nn.relu(tf.nn.conv2d(y1, w2, [1, stride, stride, 1], 'SAME') + b2)
81
+ stride = 2 # output is 7x7
82
+ y3 = tf.nn.relu(tf.nn.conv2d(y2, w3, [1, stride, stride, 1], 'SAME') + b3)
83
+
84
+ # reshape the output from the third convolution for the fully connected layer
85
+ yy = tf.reshape(y3, [-1, 7 * 7 * M])
86
+ y4 = tf.nn.relu(tf.matmul(yy, w4) + b4)
87
+
88
+ # dropout to prevent overfitting
89
+ yy4 = tf.nn.dropout(y4, pkeep)
90
+
91
+ ylogits = tf.matmul(yy4, w5) + b5
92
+
93
+ # model
94
+ y = tf.nn.softmax(ylogits)
95
+
96
+
97
+
98
+ # training step, learning rate = 0.003
99
+
100
+
101
+ # cross-entropy loss function (= -sum(Y_i * log(Yi)) ), normalised for batches of 100 images
102
+ # TensorFlow provides the softmax_cross_entropy_with_logits function to avoid numerical stability
103
+ # problems with log(0) which is NaN
104
+ cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits: ylogits, labels: y_)
105
+ cross_entropy = tf.reduce_mean(cross_entropy)*100
106
+
107
+ is_correct = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
108
+ accuracy = tf.reduce_mean(tf.cast(is_correct, :float32))
109
+
110
+ # training step, learning rate = 0.003
111
+ lr = 0.0001.t + tf.train.exponential_decay(0.003, step, 2000, 1/Math::E)
112
+ train_step = TensorStream::Train::AdamOptimizer.new(lr).minimize(cross_entropy)
113
+
114
+ sess = tf.session
115
+ # Add ops to save and restore all the variables.
116
+
117
+ init = tf.global_variables_initializer
118
+
119
+ sess.run(init)
120
+ mnist_train = mnist.train
121
+ test_data = { x => mnist.test.images, y_ => mnist.test.labels, pkeep => 1.0 }
122
+
123
+
124
+ (0..10001).each do |i|
125
+ # load batch of images and correct answers
126
+ batch_x, batch_y = mnist_train.next_batch(100)
127
+ train_data = { x => batch_x, y_ => batch_y, step => i, pkeep => 0.75 }
128
+
129
+ # train
130
+ sess.run(train_step, feed_dict: train_data)
131
+
132
+ if (i % 10 == 0)
133
+ # File.write("profile.json", TensorStream::ReportTool.profile_for(sess).to_json)
134
+ # success? add code to print it
135
+ a_train, c_train, l = sess.run([accuracy, cross_entropy, lr], feed_dict: { x => batch_x, y_ => batch_y, step => i, pkeep => 1.0})
136
+ puts "#{i}: accuracy:#{a_train} loss:#{c_train} (lr:#{l})"
137
+ end
138
+
139
+ if (i % 100 == 0)
140
+ # success on test data?
141
+ a_test, c_test = sess.run([accuracy, cross_entropy], feed_dict: test_data, pkeep => 1.0)
142
+ puts("#{i}: ******** test accuracy: #{a_test} test loss: #{c_test}")
143
+ end
144
+ end
145
+
@@ -39,7 +39,7 @@ Gem::Specification.new do |spec|
39
39
  spec.add_development_dependency "awesome_print"
40
40
  spec.add_development_dependency "mnist-learn"
41
41
  spec.add_development_dependency "simplecov"
42
- spec.add_dependency "tensor_stream", "~> 0.9.7"
42
+ spec.add_dependency "tensor_stream", "~> 0.9.8"
43
43
  spec.add_dependency "opencl_ruby_ffi"
44
44
  spec.add_dependency "oily_png"
45
45
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tensor_stream-opencl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Joseph Dayo
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-11-19 00:00:00.000000000 Z
11
+ date: 2018-11-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -114,14 +114,14 @@ dependencies:
114
114
  requirements:
115
115
  - - "~>"
116
116
  - !ruby/object:Gem::Version
117
- version: 0.9.7
117
+ version: 0.9.8
118
118
  type: :runtime
119
119
  prerelease: false
120
120
  version_requirements: !ruby/object:Gem::Requirement
121
121
  requirements:
122
122
  - - "~>"
123
123
  - !ruby/object:Gem::Version
124
- version: 0.9.7
124
+ version: 0.9.8
125
125
  - !ruby/object:Gem::Dependency
126
126
  name: opencl_ruby_ffi
127
127
  requirement: !ruby/object:Gem::Requirement
@@ -252,6 +252,7 @@ files:
252
252
  - samples/mnist_data_2.1.rb
253
253
  - samples/mnist_data_2.2.rb
254
254
  - samples/mnist_data_2.3.rb
255
+ - samples/mnist_data_3.0.rb
255
256
  - samples/multigpu.rb
256
257
  - samples/nearest_neighbor.rb
257
258
  - samples/rnn.rb