tensor_stream-opencl 0.2.3 → 0.2.4

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 7888f5385dcec4b9a747128d1b25d0cb6d7fb01ab12f1c1b70706d0df5770903
- data.tar.gz: 1129196d93a7b194d9de888fe49b43b0839c00589fb0987c0841ad1ac1693a4c
+ metadata.gz: 3e4aa123289372c651cd4da3e7c206abc4f9f67a551d4062180c5cf6555dc243
+ data.tar.gz: 6517954207c85f56cd08b2892b0119d4bb7a35e2d4bd9b9cacc5d3c9ccfb9e42
  SHA512:
- metadata.gz: 727d97b9aa1402ed9681eb71fe0c0cdcc966e8a9e2f08b7480fdefa503509e26872b187a0b184d31a5b0cee86dc96e1d97f4fce14b378fa4a7c8ce4679b06421
- data.tar.gz: 0e2f1601543d582042d0911222c1209f2df2c08139b7acc8ee347b5d7e236cd451a1b00032fb75d6f532105fb8f4ee49e53614071d111e0fabc8ed44010c4301
+ metadata.gz: 7f61d61be79dd1e06ebfdc77ed2dff9e717e0cdb292160fe20c9ca08693d867e1b0e0350c71db5d24feb4671a26e793f44d6b80762c384193c1985b6b1616376
+ data.tar.gz: 72c32530717fac8ff947ce4b204535755134bde14e0f70d0d120ff101b5654843312186317cb480fd5e1c620a25328a3590b1f35193faf1d196e7ad631d169b0
@@ -422,10 +422,16 @@ module TensorStream
  a = inputs[0]
  if a.data_type != tensor.data_type
  buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
- m, n = a.shape
- cl_m = OpenCL::Int1.new(m || 1)
- cl_n = OpenCL::Int1.new(n || 1)
- work_group = [m || 1, n || 1]
+ work_group = if inputs[0].shape.size > 2
+ [ inputs[0].shape.reduce(:*) / inputs[0].shape.last, inputs[0].shape.last]
+ else
+ m, n = inputs[0].shape
+ [m || 1, n || 1]
+ end
+
+ cl_m = OpenCL::Int1.new(work_group[0])
+ cl_n = OpenCL::Int1.new(work_group[1])
+
  event_wait_list = build_event_wait_list(inputs)
  buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
  buffer
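
For reference, the new work_group logic flattens tensors of rank greater than 2 into a two-dimensional range so the existing 2-D cast kernel can still be dispatched over every element. A minimal sketch of the same idea in plain Ruby (work_group_for is a hypothetical helper written for illustration, not part of the gem):

    def work_group_for(shape)
      if shape.size > 2
        # collapse all leading dimensions, keep the innermost one, e.g. [2, 3, 4] => [6, 4]
        [shape.reduce(:*) / shape.last, shape.last]
      else
        m, n = shape
        [m || 1, n || 1] # scalars and vectors fall back to a width of 1
      end
    end

    work_group_for([2, 3, 4]) # => [6, 4]
    work_group_for([5])       # => [5, 1]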
@@ -1,27 +1,30 @@
  % ctype = dtype_to_c_type(dtype)

- __kernel void conv2d(const int height, const int width, __global const <%= ctype %> *images, __global const <%= ctype %> *filter, __global <%= ctype %> *output) {
+ __kernel void conv2d(const int height, const int width, const int out_height, const int out_width, __global const <%= ctype %> *images, __global const <%= ctype %> *filter, __global <%= ctype %> *output) {
  // Get the index of the current element to be processed
  const int batch_index = get_global_id(0);
  const int h_index = get_global_id(1);
  const int w_index = get_global_id(2);
- const int h_index_with_stride = h_index * <%= stride[0] %>;
- const int w_index_with_stride = w_index * <%= stride[1] %>;
+ const int h_index_with_stride = h_index * <%= stride[0] %> - <%= padding[0] %>;
+ const int w_index_with_stride = w_index * <%= stride[1] %> - <%= padding[1] %>;

  const int image_index = batch_index * height * width * <%= ch %>;
  const int image_row_width = width * <%= ch %>;
+ const int out_image_row_size = out_height * out_width * <%= out_ch %>;

  for (int out_channel_index = 0; out_channel_index < <%= out_ch %>; out_channel_index++) {
  <%= ctype %> sum = 0;
  for (int channel_index = 0; channel_index < <%= ch %>; channel_index++) {
  for(int y = 0; y < <%= fh %>; y++) {
  for (int x = 0; x < <%= fw %>; x++) {
- if ( (h_index_with_stride + y) < height && (w_index_with_stride + x) < width) {
- sum += images[image_index + (h_index_with_stride + y)*image_row_width + (w_index_with_stride + x)*<%= ch %> + channel_index] * filter[y*<%= fw * ch * out_ch %> + x*<%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
+ if ( (h_index_with_stride + y) < height && (w_index_with_stride + x) < width &&
+ (h_index_with_stride + y) >= 0 && (w_index_with_stride + x) >= 0) {
+ <%= ctype %> f = filter[y*<%= fw * ch * out_ch %> + x*<%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
+ sum += images[image_index + (h_index_with_stride + y)*image_row_width + (w_index_with_stride + x)*<%= ch %> + channel_index] * f;
  }
  }
  }
  }
- output[batch_index * (height/<%= stride[0] %>) * (width/<%= stride[1] %>) * <%= out_ch %> + h_index * (width/<%= stride[1] %>) * <%= out_ch %> + w_index * <%= out_ch %> + out_channel_index ] = sum;
+ output[batch_index * out_image_row_size + h_index * out_width * <%= out_ch %> + w_index * <%= out_ch %> + out_channel_index ] = sum;
  }
  }
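
As a sanity check of the padded indexing above (assuming the padding template variable is laid out as [top, left, bottom, right], which is how the Ruby side below builds it): with stride 1 and padding[0] = 2, the output row h_index = 0 starts reading at h_index_with_stride = 0 * 1 - 2 = -2, and the new (h_index_with_stride + y) >= 0 guard simply skips the filter rows that fall inside the padded border instead of reading outside the image buffer.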
@@ -1,21 +1,31 @@
  % ctype = dtype_to_c_type(dtype)

- __kernel void conv2d_backprop_filter(const int batch_size, const int height, const int width, __global const <%= ctype %> *images, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
+ __kernel void conv2d_backprop_filter(const int batch_size, const int height, const int width, const int out_height, const int out_width, __global const <%= ctype %> *images, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
  // Get the index of the current element to be processed
  const int fh_index = get_global_id(0);
  const int fw_index = get_global_id(1);
  const int f_out_channel = get_global_id(2);
  const int image_size = height * width * <%= ch %>;
- const int grad_image_row_width = width * <%= out_ch %>;
+ const int grad_image_row_width = out_width * <%= out_ch %>;
+ const int grad_image_size = out_height * out_width * <%= out_ch %>;

  for(int channel = 0; channel < <%= ch %>; channel++) {
  <%= ctype %> grad_sum = 0.0;
  for(int batch = 0; batch < batch_size; batch++) {
- const int image_index = batch * height * width * <%= out_ch %>;
+ int image_index = batch * grad_image_size;
  for(int y = 0; y < height; y++) {
  for (int x = 0; x < width; x++) {
- if ( ((y - fh_index) % <%= stride[0]%>) == 0 && ((x - fw_index) % <%= stride[1]%>) == 0 && fh_index <= y && fw_index <= x) {
- const <%= ctype %> image_grad = grad[image_index + ((y - fh_index) / <%= stride[0] %>) * grad_image_row_width + ((x - fw_index) / <%= stride[1] %>) * <%= out_ch %> + f_out_channel];
+ int y_offset = y - fh_index + <%= padding[0] %>;
+ int x_offset = x - fw_index + <%= padding[1] %>;
+ int y_offset_end = y + (<%= fh %> - fh_index - 1) - <%= padding[2] %>;
+ int x_offset_end = x + (<%= fw %> - fw_index - 1) - <%= padding[3] %>;
+
+ if ( (y_offset % <%= stride[0]%>) == 0
+ && (x_offset % <%= stride[1]%>) == 0
+ && (y_offset >= 0) && (x_offset >= 0)
+ && (y_offset_end < height)
+ && (x_offset_end < width)) {
+ <%= ctype %> image_grad = grad[image_index + (y_offset / <%= stride[0] %>) * grad_image_row_width + ( x_offset / <%= stride[1] %>) * <%= out_ch %> + f_out_channel];
  grad_sum += images[batch * image_size + y * width * <%= ch %> + x * <%= ch %> + channel] * image_grad;
  }
  }
@@ -1,6 +1,6 @@
  % ctype = dtype_to_c_type(dtype)

- __kernel void conv2d_backprop_input(const int height, const int width, __global const <%= ctype %> *filter, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
+ __kernel void conv2d_backprop_input(const int height, const int width, const int out_height, const int out_width, __global const <%= ctype %> *filter, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
  // Get the index of the current element to be processed
  int batch_index = get_global_id(0);
  int h_index = get_global_id(1); // orig image y
@@ -8,8 +8,8 @@ __kernel void conv2d_backprop_input(const int height, const int width, __global

  int h_index_with_stride = h_index / <%= stride[0] %>;
  int w_index_with_stride = w_index / <%= stride[1] %>;
- int grad_height = height / <%= stride[0] %>;
- int grad_width = width / <%= stride[1] %>;
+ int grad_height = out_height;
+ int grad_width = out_width;

  int image_index = batch_index * grad_height * grad_width * <%= out_ch %>;
  int image_row_width = grad_width * <%= out_ch %>;
@@ -19,8 +19,16 @@ __kernel void conv2d_backprop_input(const int height, const int width, __global
  for (int out_channel_index = 0; out_channel_index < <%= out_ch %>; out_channel_index++) {
  for(int y = 0; y < <%= fh %>; y++) {
  for (int x = 0; x < <%= fw %>; x++) {
- if ( (y <= h_index) && (x <= w_index) && ( (h_index - y) % <%= stride[0]%> == 0) && ( (w_index - x) % <%= stride[1]%> == 0)) {
- <%= ctype %> imag_grad = grad[image_index + ( (h_index - y) / <%= stride[0] %>) * image_row_width + ( (w_index - x) / <%= stride[1] %>) * <%= out_ch %> + out_channel_index];
+ int y_offset = h_index - y + <%= padding[0] %>;
+ int x_offset = w_index - x + <%= padding[1] %>;
+
+ if ( ( y_offset >= 0) && (x_offset >= 0) &&
+ ( y_offset % <%= stride[0]%> == 0) &&
+ ( x_offset % <%= stride[1]%> == 0) &&
+ ( h_index + (<%= fh %> - y - 1) < (height + <%= padding[2] %>)) &&
+ ( w_index + (<%= fw %> - x - 1) < (width + <%= padding[3] %>))
+ ) {
+ <%= ctype %> imag_grad = grad[image_index + ( y_offset / <%= stride[0] %>) * image_row_width + ( x_offset / <%= stride[1] %>) * <%= out_ch %> + out_channel_index];
  g += imag_grad * filter[y * <%= fw * ch * out_ch %> + x * <%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
  }
  }
@@ -14,10 +14,15 @@ module TensorStream
  if inputs.size == 1
  inputs[0]
  else
- m, n = inputs[0].shape
- work_group = [m || 1, n || 1]
- cl_m = OpenCL::Int1.new(m || 1)
- cl_n = OpenCL::Int1.new(n || 1)
+ work_group = if inputs[0].shape.size > 2
+ [ inputs[0].shape.reduce(:*) / inputs[0].shape.last, inputs[0].shape.last]
+ else
+ m, n = inputs[0].shape
+ [m || 1, n || 1]
+ end
+
+ cl_m = OpenCL::Int1.new(work_group[0])
+ cl_n = OpenCL::Int1.new(work_group[1])
  cl_switch = OpenCL::Int1.new(0)
  dtype = tensor.data_type

@@ -68,6 +73,7 @@ module TensorStream

  raise "#{tensor.inputs[0].name} rank must be greater than 1" if a.shape.size < 2
  raise "#{tensor.inputs[1].name} rank must be greater than 1" if b.shape.size < 2
+ raise "#{tensor.inputs[0].name} unsupported rank" if b.shape.size != 2 || a.shape.size != 2
  raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v

  dtype = tensor.data_type
@@ -162,6 +168,7 @@ module TensorStream

  axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
  input = complete_eval(value, child_context)
+
  value = value.buffer.reshape(*value.shape.reverse)
  rank = input.shape.size - 1

@@ -220,6 +220,9 @@ module TensorStream
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)

  m, n = a.shape
+
+ raise "unsupported rank " if a.shape.size > 2
+
  work_group = [m]
  n = m if n.nil?
  cl_n = OpenCL::Int1.new(n || 1)
@@ -236,6 +239,9 @@ module TensorStream
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)

  m, n = a.shape
+
+ raise "unsupported rank " if a.shape.size > 2
+
  work_group = [m]
  n = m if n.nil?
  cl_n = OpenCL::Int1.new(n || 1)
@@ -254,6 +260,9 @@ module TensorStream
  output_buffer_backprop = _create_result_buffer(tensor.data_type, a.shape, "#{tensor.name}_2")
  rank = a.shape.size - 1
  m, n = a.shape
+
+ raise "unsupported rank " if a.shape.size > 2
+
  work_group = [m]
  n = m if n.nil?
  cl_n = OpenCL::Int1.new(n || 1)
@@ -276,6 +285,9 @@ module TensorStream
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)

  m, n = a.shape
+
+ raise "unsupported rank " if a.shape.size > 2
+
  work_group = [m]
  n = m if n.nil?
  cl_n = OpenCL::Int1.new(n || 1)
@@ -305,6 +317,9 @@ module TensorStream
  output_buffer_backprop = _create_result_buffer(tensor.data_type, a.shape, "#{tensor.name}_2")
  rank = a.shape.size - 1
  m, n = a.shape
+
+ raise "unsupported rank " if a.shape.size > 2
+
  work_group = [m]
  n = m if n.nil?
  cl_n = OpenCL::Int1.new(n || 1)
@@ -326,6 +341,7 @@ module TensorStream
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)

  m, n = a.shape
+ raise "unsupported rank " if a.shape.size > 2
  work_group = [m]
  n = m if n.nil?
  cl_n = OpenCL::Int1.new(n || 1)
@@ -353,19 +369,29 @@ module TensorStream

  raise TensorStream::ValueError, " Current implementation does not yet support strides in the batch and depth dimensions." if strides[0] != 1 || strides[3] != 1

+ padding_option = tensor.options[:padding]
+ padding = conv2d_padding_options(padding_option, filter_shape, height, width, height_stride, width_stride)
  event_wait_list = build_event_wait_list(inputs)

- f_height, f_width, in_channels, out_channels = filter_shape
- out_shape = [batch, height / height_stride, width / width_stride, out_channels]
+ f_height, f_width, _in_channels, out_channels = filter_shape
+
+ out_h = (height - f_height + (padding[0] + padding[2])) / height_stride + 1
+ out_w = (width - f_width + (padding[1] + padding[3])) / width_stride + 1
+
+ out_shape = [batch, out_h, out_w, out_channels]
  output_buffer = _create_result_buffer(tensor.data_type, out_shape, tensor.name)

  cl_image_height = OpenCL::Int1.new(height)
  cl_image_width = OpenCL::Int1.new(width)
+ cl_out_height = OpenCL::Int1.new(out_h)
+ cl_out_width = OpenCL::Int1.new(out_w)

- work_dimen = [batch, height / height_stride, width / width_stride]
+ work_dimen = [batch, out_h, out_w]

- output_buffer.op = _cl_program("conv2d", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channel, out_ch: out_channels, stride: [height_stride, width_stride] ).send(:conv2d, _opencl_queue, work_dimen, cl_image_height, cl_image_width, inputs[0].cl_buffer,
- inputs[1].cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ output_buffer.op = _cl_program("conv2d", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channel, out_ch: out_channels, stride: [height_stride, width_stride], padding: padding).
+ send(:conv2d, _opencl_queue, work_dimen, cl_image_height, cl_image_width,
+ cl_out_height, cl_out_width, inputs[0].cl_buffer,
+ inputs[1].cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  output_buffer
  end

@@ -385,13 +411,22 @@ module TensorStream
  batch, height, width, channels = image_shape
  f_height, f_width, in_channels, out_channels = filter_shape

+ padding_option = tensor.options[:padding]
+ padding = conv2d_padding_options(padding_option, filter_shape, height, width, height_stride, width_stride)
  work_dimen = [batch, height, width]

+ out_h = (height - f_height + (padding[0] + padding[2])) / height_stride + 1
+ out_w = (width - f_width + (padding[1] + padding[3])) / width_stride + 1
+
  cl_image_height = OpenCL::Int1.new(height)
  cl_image_width = OpenCL::Int1.new(width)
+ cl_out_height = OpenCL::Int1.new(out_h)
+ cl_out_width = OpenCL::Int1.new(out_w)

- output_buffer.op = _cl_program("conv2d_backprop_input", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: out_channels, stride: [height_stride, width_stride] ).send(:conv2d_backprop_input, _opencl_queue, work_dimen, cl_image_height, cl_image_width,
- filter.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ output_buffer.op = _cl_program("conv2d_backprop_input", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: out_channels, stride: [height_stride, width_stride], padding: padding).
+ send(:conv2d_backprop_input, _opencl_queue, work_dimen, cl_image_height, cl_image_width,
+ cl_out_height, cl_out_width, filter.cl_buffer, grad.cl_buffer,
+ output_buffer.cl_buffer, event_wait_list: event_wait_list)
  output_buffer
  end

@@ -411,14 +446,48 @@ module TensorStream
  f_height, f_width, input_channels, output_channels = filter_shape
  work_dimen = [f_height, f_width, output_channels]

+ padding_option = tensor.options[:padding]
+ padding = conv2d_padding_options(padding_option, filter_shape, height, width, height_stride, width_stride)
+
+ out_h = (height - f_height + (padding[0] + padding[2])) / height_stride + 1
+ out_w = (width - f_width + (padding[1] + padding[3])) / width_stride + 1
+
  cl_batch_size = OpenCL::Int1.new(batch_size)
  cl_image_height = OpenCL::Int1.new(height)
  cl_image_width = OpenCL::Int1.new(width)
+ cl_out_height = OpenCL::Int1.new(out_h)
+ cl_out_width = OpenCL::Int1.new(out_w)

- output_buffer.op = _cl_program("conv2d_backprop_filter", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: output_channels, stride: [height_stride, width_stride] ).send(:conv2d_backprop_filter, _opencl_queue, work_dimen, cl_batch_size, cl_image_height, cl_image_width,
- images.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ output_buffer.op = _cl_program("conv2d_backprop_filter", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: output_channels, stride: [height_stride, width_stride], padding: padding ).
+ send(:conv2d_backprop_filter, _opencl_queue, work_dimen, cl_batch_size, cl_image_height, cl_image_width,
+ cl_out_height, cl_out_width, images.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  output_buffer
  end
+
+ def conv2d_padding_options(padding_option, filter_shape, height, width, h_stride, w_stride)
+ case padding_option
+ when 'SAME'
+ [
+ calc_pad(height, h_stride, filter_shape[0]),
+ calc_pad(width, w_stride, filter_shape[1]),
+ calc_pad(height, h_stride, filter_shape[0], true),
+ calc_pad(width, w_stride, filter_shape[1], true)
+ ]
+ when 'VALID'
+ [0, 0, 0, 0]
+ else
+ raise TensorStream::ValueError, "Unsupported padding value #{padding_option}, valid values 'SAME', 'VALID'"
+ end
+ end
+
+ def calc_pad(w, stride, f_shape, ceil = false)
+ r = ((w / stride - 1) * stride - w + f_shape)
+ if ceil
+ r.odd? ? r / 2 + 1 : r / 2
+ else
+ r / 2
+ end
+ end
  end
  end
  end
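
For intuition, here is the SAME-padding arithmetic above worked by hand (just the formulas applied to concrete numbers, not code from the gem): for a 28x28 input, a 5x5 filter and stride 2,

    r          = (28 / 2 - 1) * 2 - 28 + 5      # => 3   (calc_pad intermediate)
    pad_before = 3 / 2                          # => 1   (calc_pad with ceil = false)
    pad_after  = 3 / 2 + 1                      # => 2   (calc_pad with ceil = true, since r is odd)
    out_h      = (28 - 5 + (1 + 2)) / 2 + 1     # => 14

which matches the ceil(28 / 2) = 14 output size expected for 'SAME' padding, so the stride-2 layers in the new MNIST sample below produce the 14x14 and then 7x7 feature maps its comments describe.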
@@ -112,6 +112,7 @@ module TensorStream
  result = complete_eval(tensor, execution_context)
  # puts "-------------------wait finish------------------------"
  _opencl_queue.finish
+ # puts "-------------------done finish------------------------"
  read_final_result(result)
  end

@@ -170,6 +171,7 @@ module TensorStream
  events = build_event_wait_list([buffer])
  # puts "** wait #{tensor.name} **"
  OpenCL.wait_for_events(events) unless events.empty?
+ # puts "** done #{tensor.name} **"
  buffer
  end

@@ -449,6 +451,7 @@ module TensorStream
  events = build_event_wait_list(inputs)
  # puts "** wait for event flow_group**"
  OpenCL.wait_for_events(events) unless events.empty?
+ # puts "** done for event flow_group**"
  nil
  end

@@ -461,9 +464,7 @@ module TensorStream
  return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
  return @context[cache_key] if @context.key?(cache_key)

- # puts "opencl eval #{object_id} #{tensor.name}"
  invoke(tensor, child_context).tap do |result|
- # puts "result done opencl #{object_id}: #{tensor.name}"
  if tensor.breakpoint
  a = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
  b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
@@ -569,6 +570,9 @@ module TensorStream
  [m, result_shape.reduce(:*) / m]
  elsif result_shape.size <= 2
  [m || 1, n || 1]
+ elsif (b.shape.size == 1) && (result_shape.last == b.shape.last)
+ last_dim = b.shape.last
+ [result_shape.reduce(:*) / last_dim, last_dim]
  else
  raise "rank > 2 not supported for now"
  end
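
The new elsif branch covers broadcasting a rank-1 tensor against a higher-rank result, the typical per-channel bias case: a result shape of [32, 7, 7, 12] paired with a [12] bias now maps to the work group [32 * 7 * 7, 12] = [1568, 12] and runs through the existing 2-D kernel instead of hitting the "rank > 2 not supported for now" error.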
@@ -614,9 +618,15 @@ module TensorStream
  output_buffer = _create_result_buffer(tensor.data_type, p.shape, tensor.name)

  m, n = p.shape
- work_group = [m || 1, n || 1]
- cl_m = OpenCL::Int1.new(m || 1)
- cl_n = OpenCL::Int1.new(n || 1)
+
+ work_group = if p.shape.size > 2
+ [m, p.shape.reduce(:*) / m]
+ else
+ [ m || 1, n || 1]
+ end
+
+ cl_m = OpenCL::Int1.new(work_group[0])
+ cl_n = OpenCL::Int1.new(work_group[1])

  event_wait_list = build_event_wait_list([a, b, p]) # add dependency wait list
  output_buffer.op = _cl_program(op_name.to_s, dtype: dtype).
@@ -1,5 +1,5 @@
  module TensorStream
  module Opencl
- VERSION = "0.2.3"
+ VERSION = "0.2.4"
  end
  end
@@ -29,19 +29,19 @@ M = 60
  N = 30


- w1 = tf.variable(tf.random_normal([784, K]))
+ w1 = tf.variable(tf.truncated_normal([784, K], stddev: 0.1))
  b1 = tf.variable(tf.ones([K])/10)

- w2 = tf.variable(tf.random_normal([K, L]))
+ w2 = tf.variable(tf.truncated_normal([K, L], stddev: 0.1))
  b2 = tf.variable(tf.ones([L])/10)

- w3 = tf.variable(tf.random_normal([L, M]))
+ w3 = tf.variable(tf.truncated_normal([L, M], stddev: 0.1))
  b3 = tf.variable(tf.ones([M])/10)

- w4 = tf.variable(tf.random_normal([M, N]))
+ w4 = tf.variable(tf.truncated_normal([M, N], stddev: 0.1))
  b4 = tf.variable(tf.ones([N])/10)

- w5 = tf.variable(tf.random_normal([N, 10]))
+ w5 = tf.variable(tf.truncated_normal([N, 10], stddev: 0.1))
  b5 = tf.variable(tf.zeros([10]))

  x_ = tf.reshape(x, [-1, 784])
@@ -1,6 +1,7 @@
  # A ruby port of the example code discussed by Martin Gorner in
  # "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)""
  #
+ # Five Layers with relu decay
  # https://www.youtube.com/watch?v=u4alGiomYP4
  #
  # Requirements:
@@ -35,19 +36,19 @@ M = 60
  N = 30


- w1 = tf.variable(tf.random_normal([784, K]))
+ w1 = tf.variable(tf.truncated_normal([784, K], stddev: 0.1))
  b1 = tf.variable(tf.ones([K])/10)

- w2 = tf.variable(tf.random_normal([K, L]))
+ w2 = tf.variable(tf.truncated_normal([K, L], stddev: 0.1))
  b2 = tf.variable(tf.ones([L])/10)

- w3 = tf.variable(tf.random_normal([L, M]))
+ w3 = tf.variable(tf.truncated_normal([L, M], stddev: 0.1))
  b3 = tf.variable(tf.ones([M])/10)

- w4 = tf.variable(tf.random_normal([M, N]))
+ w4 = tf.variable(tf.truncated_normal([M, N], stddev: 0.1))
  b4 = tf.variable(tf.ones([N])/10)

- w5 = tf.variable(tf.random_normal([N, 10]))
+ w5 = tf.variable(tf.truncated_normal([N, 10], stddev: 0.1))
  b5 = tf.variable(tf.zeros([10]))

  x_ = tf.reshape(x, [-1, 784])
@@ -0,0 +1,145 @@
+ # A ruby port of the example code discussed by Martin Gorner in
+ # "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)""
+ #
+ # https://www.youtube.com/watch?v=u4alGiomYP4
+ #
+ # Requirements:
+ # mnist-learn gem
+ # opencl_ruby_ffi gem
+ require "bundler/setup"
+ require 'tensor_stream'
+ require 'mnist-learn'
+ require 'pry-byebug'
+
+ # Enable OpenCL hardware accelerated computation, not using OpenCL can be very slow
+ require 'tensor_stream/opencl'
+
+ tf = TensorStream
+ puts "Tensorstream version #{tf.__version__} with OpenCL lib #{TensorStream::Opencl::VERSION}"
+
+ # Import MNIST data
+ puts "downloading minst data"
+ # Download images and labels into mnist.test (10K images+labels) and mnist.train (60K images+labels)
+ mnist = Mnist.read_data_sets('/tmp/data', one_hot: true)
+ puts "downloading finished"
+
+ # neural network structure for this sample:
+ #
+ # · · · · · · · · · · (input data, 1-deep) X [batch, 28, 28, 1]
+ # @ @ @ @ @ @ @ @ @ @ -- conv. layer 5x5x1=>4 stride 1 W1 [5, 5, 1, 4] B1 [4]
+ # ∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶ Y1 [batch, 28, 28, 4]
+ # @ @ @ @ @ @ @ @ -- conv. layer 5x5x4=>8 stride 2 W2 [5, 5, 4, 8] B2 [8]
+ # ∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶ Y2 [batch, 14, 14, 8]
+ # @ @ @ @ @ @ -- conv. layer 4x4x8=>12 stride 2 W3 [4, 4, 8, 12] B3 [12]
+ # ∶∶∶∶∶∶∶∶∶∶∶ Y3 [batch, 7, 7, 12] => reshaped to YY [batch, 7*7*12]
+ # \x/x\x\x/ -- fully connected layer (relu) W4 [7*7*12, 200] B4 [200]
+ # · · · · Y4 [batch, 200]
+ # \x/x\x/ -- fully connected layer (softmax) W5 [200, 10] B5 [10]
+ # · · · Y [batch, 10]
+
+
+ # input X: 28x28 grayscale images, the first dimension (None) will index the images in the mini-batch
+ x = tf.placeholder(:float32, shape: [nil, 28, 28, 1])
+
+ # correct answers will go here
+ y_ = tf.placeholder(:float32, shape: [nil, 10])
+
+ # step for variable learning rate
+ step = tf.placeholder(:int32)
+
+ pkeep = tf.placeholder(tf.float32)
+
+ # three convolutional layers with their channel counts, and a
+ # fully connected layer (tha last layer has 10 softmax neurons)
+
+ K = 4 # first convolutional layer output depth
+ L = 8 # second convolutional layer output depth
+ M = 12 # third convolutional layer
+ N = 200 # fully connected layer
+
+
+ w1 = tf.variable(tf.truncated_normal([6, 6, 1, K], stddev: 0.1))
+ b1 = tf.variable(tf.ones([K])/10)
+
+ w2 = tf.variable(tf.truncated_normal([5, 5, K, L], stddev: 0.1))
+ b2 = tf.variable(tf.ones([L])/10)
+
+ w3 = tf.variable(tf.truncated_normal([4, 4, L, M], stddev: 0.1))
+ b3 = tf.variable(tf.ones([M])/10)
+
+ w4 = tf.variable(tf.truncated_normal([7 * 7 * M, N], stddev: 0.1))
+ b4 = tf.variable(tf.ones([N])/10)
+
+ w5 = tf.variable(tf.truncated_normal([N, 10], stddev: 0.1))
+ b5 = tf.variable(tf.ones([10])/10)
+
+ # The model
+ stride = 1 # output is 28x28
+ y1 = tf.nn.relu(tf.nn.conv2d(tf.reshape(x, [-1, 28, 28, 1]), w1, [1, stride, stride, 1], 'SAME') + b1)
+ stride = 2 # output is 14x14
+ y2 = tf.nn.relu(tf.nn.conv2d(y1, w2, [1, stride, stride, 1], 'SAME') + b2)
+ stride = 2 # output is 7x7
+ y3 = tf.nn.relu(tf.nn.conv2d(y2, w3, [1, stride, stride, 1], 'SAME') + b3)
+
+ # reshape the output from the third convolution for the fully connected layer
+ yy = tf.reshape(y3, [-1, 7 * 7 * M])
+ y4 = tf.nn.relu(tf.matmul(yy, w4) + b4)
+
+ # dropout to prevent overfitting
+ yy4 = tf.nn.dropout(y4, pkeep)
+
+ ylogits = tf.matmul(yy4, w5) + b5
+
+ # model
+ y = tf.nn.softmax(ylogits)
+
+
+
+ # training step, learning rate = 0.003
+
+
+ # cross-entropy loss function (= -sum(Y_i * log(Yi)) ), normalised for batches of 100 images
+ # TensorFlow provides the softmax_cross_entropy_with_logits function to avoid numerical stability
+ # problems with log(0) which is NaN
+ cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits: ylogits, labels: y_)
+ cross_entropy = tf.reduce_mean(cross_entropy)*100
+
+ is_correct = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+ accuracy = tf.reduce_mean(tf.cast(is_correct, :float32))
+
+ # training step, learning rate = 0.003
+ lr = 0.0001.t + tf.train.exponential_decay(0.003, step, 2000, 1/Math::E)
+ train_step = TensorStream::Train::AdamOptimizer.new(lr).minimize(cross_entropy)
+
+ sess = tf.session
+ # Add ops to save and restore all the variables.
+
+ init = tf.global_variables_initializer
+
+ sess.run(init)
+ mnist_train = mnist.train
+ test_data = { x => mnist.test.images, y_ => mnist.test.labels, pkeep => 1.0 }
+
+
+ (0..10001).each do |i|
+ # load batch of images and correct answers
+ batch_x, batch_y = mnist_train.next_batch(100)
+ train_data = { x => batch_x, y_ => batch_y, step => i, pkeep => 0.75 }
+
+ # train
+ sess.run(train_step, feed_dict: train_data)
+
+ if (i % 10 == 0)
+ # File.write("profile.json", TensorStream::ReportTool.profile_for(sess).to_json)
+ # success? add code to print it
+ a_train, c_train, l = sess.run([accuracy, cross_entropy, lr], feed_dict: { x => batch_x, y_ => batch_y, step => i, pkeep => 1.0})
+ puts "#{i}: accuracy:#{a_train} loss:#{c_train} (lr:#{l})"
+ end
+
+ if (i % 100 == 0)
+ # success on test data?
+ a_test, c_test = sess.run([accuracy, cross_entropy], feed_dict: test_data, pkeep => 1.0)
+ puts("#{i}: ******** test accuracy: #{a_test} test loss: #{c_test}")
+ end
+ end
+
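
Assuming the gems listed in its require block (mnist-learn, opencl_ruby_ffi, pry-byebug) are installed, the new sample should run like the existing MNIST samples, e.g. bundle exec ruby samples/mnist_data_3.0.rb, and it exercises the 'SAME'-padded conv2d, conv2d_backprop_input and conv2d_backprop_filter paths added above.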
@@ -39,7 +39,7 @@ Gem::Specification.new do |spec|
  spec.add_development_dependency "awesome_print"
  spec.add_development_dependency "mnist-learn"
  spec.add_development_dependency "simplecov"
- spec.add_dependency "tensor_stream", "~> 0.9.7"
+ spec.add_dependency "tensor_stream", "~> 0.9.8"
  spec.add_dependency "opencl_ruby_ffi"
  spec.add_dependency "oily_png"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: tensor_stream-opencl
  version: !ruby/object:Gem::Version
- version: 0.2.3
+ version: 0.2.4
  platform: ruby
  authors:
  - Joseph Dayo
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2018-11-19 00:00:00.000000000 Z
+ date: 2018-11-25 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler
@@ -114,14 +114,14 @@ dependencies:
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: 0.9.7
+ version: 0.9.8
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: 0.9.7
+ version: 0.9.8
  - !ruby/object:Gem::Dependency
  name: opencl_ruby_ffi
  requirement: !ruby/object:Gem::Requirement
@@ -252,6 +252,7 @@ files:
  - samples/mnist_data_2.1.rb
  - samples/mnist_data_2.2.rb
  - samples/mnist_data_2.3.rb
+ - samples/mnist_data_3.0.rb
  - samples/multigpu.rb
  - samples/nearest_neighbor.rb
  - samples/rnn.rb