tensor_stream 0.3.0 → 0.4.0

This diff shows the changes between publicly released versions of this package as they appear in their public registry. It is provided for informational purposes only.
Files changed (59)
  1. checksums.yaml +4 -4
  2. data/.circleci/config.yml +7 -7
  3. data/CHANGELOG.md +13 -0
  4. data/Dockerfile +25 -0
  5. data/Rakefile +6 -0
  6. data/benchmark/benchmark.rb +16 -57
  7. data/benchmark_intel.txt +21 -0
  8. data/benchmark_nvidia.txt +33 -0
  9. data/lib/tensor_stream.rb +4 -173
  10. data/lib/tensor_stream/debugging/debugging.rb +20 -0
  11. data/lib/tensor_stream/evaluator/kernels/abs.cl +9 -5
  12. data/lib/tensor_stream/evaluator/kernels/add.cl +2 -4
  13. data/lib/tensor_stream/evaluator/kernels/argmax.cl +2 -9
  14. data/lib/tensor_stream/evaluator/kernels/argmin.cl +2 -9
  15. data/lib/tensor_stream/evaluator/kernels/cast.cl +3 -8
  16. data/lib/tensor_stream/evaluator/kernels/cond.cl.erb +1 -1
  17. data/lib/tensor_stream/evaluator/kernels/cos.cl +2 -1
  18. data/lib/tensor_stream/evaluator/kernels/div.cl.erb +2 -4
  19. data/lib/tensor_stream/evaluator/kernels/exp.cl +2 -1
  20. data/lib/tensor_stream/evaluator/kernels/gemm.cl +8 -39
  21. data/lib/tensor_stream/evaluator/kernels/log.cl +2 -1
  22. data/lib/tensor_stream/evaluator/kernels/log1p.cl +2 -1
  23. data/lib/tensor_stream/evaluator/kernels/max.cl +4 -49
  24. data/lib/tensor_stream/evaluator/kernels/mul.cl +2 -4
  25. data/lib/tensor_stream/evaluator/kernels/negate.cl +2 -9
  26. data/lib/tensor_stream/evaluator/kernels/pow.cl +4 -88
  27. data/lib/tensor_stream/evaluator/kernels/reciprocal.cl +2 -9
  28. data/lib/tensor_stream/evaluator/kernels/round.cl +2 -1
  29. data/lib/tensor_stream/evaluator/kernels/sigmoid.cl +2 -1
  30. data/lib/tensor_stream/evaluator/kernels/sigmoid_grad.cl +6 -5
  31. data/lib/tensor_stream/evaluator/kernels/sign.cl +12 -14
  32. data/lib/tensor_stream/evaluator/kernels/sin.cl +2 -1
  33. data/lib/tensor_stream/evaluator/kernels/softmax.cl +26 -0
  34. data/lib/tensor_stream/evaluator/kernels/softmax_grad.cl +46 -0
  35. data/lib/tensor_stream/evaluator/kernels/sqrt.cl +2 -1
  36. data/lib/tensor_stream/evaluator/kernels/square.cl +2 -8
  37. data/lib/tensor_stream/evaluator/kernels/sub.cl +2 -4
  38. data/lib/tensor_stream/evaluator/kernels/tan.cl +2 -1
  39. data/lib/tensor_stream/evaluator/kernels/tanh.cl +2 -1
  40. data/lib/tensor_stream/evaluator/kernels/tanh_grad.cl +2 -1
  41. data/lib/tensor_stream/evaluator/kernels/where.cl +2 -9
  42. data/lib/tensor_stream/evaluator/opencl_evaluator.rb +108 -58
  43. data/lib/tensor_stream/evaluator/opencl_template_helper.rb +40 -5
  44. data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +35 -0
  45. data/lib/tensor_stream/evaluator/ruby_evaluator.rb +30 -9
  46. data/lib/tensor_stream/graph_serializers/graphml.rb +1 -1
  47. data/lib/tensor_stream/graph_serializers/pbtext.rb +4 -0
  48. data/lib/tensor_stream/math_gradients.rb +6 -5
  49. data/lib/tensor_stream/nn/nn_ops.rb +18 -2
  50. data/lib/tensor_stream/ops.rb +237 -44
  51. data/lib/tensor_stream/tensor.rb +16 -2
  52. data/lib/tensor_stream/utils.rb +205 -0
  53. data/lib/tensor_stream/variable.rb +2 -1
  54. data/lib/tensor_stream/version.rb +1 -1
  55. data/samples/error.graphml +2755 -0
  56. data/{test_samples → samples}/iris.rb +18 -24
  57. data/samples/logistic_regression.rb +0 -1
  58. data/test_samples/raw_neural_net_sample.rb +80 -23
  59. metadata +11 -3
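
Note: most of the kernel (.cl) changes that follow share one pattern: the separate hand-written *_fp and *_int kernels are replaced by a single ERB template (lines starting with % plus <%= %> tags) that the evaluator renders once per requested data type via dtype_to_c_type. A rough sketch of that rendering step, for illustration only; the helper mapping and the kernel body here are assumptions, not the gem's exact code:

    require 'erb'

    # Illustrative stand-in for the gem's OpenclTemplateHelper; the exact
    # dtype -> C type mapping here is an assumption.
    def dtype_to_c_type(dtype)
      { float32: 'float', float64: 'double', int32: 'int' }.fetch(dtype.to_sym)
    end

    template = <<~CL
      % c_dtype = dtype_to_c_type(dtype)
      __kernel void sin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
        // kernel body omitted
      }
    CL

    dtype = :float32
    # trim_mode '%' lets lines that start with % run as plain Ruby,
    # matching how the .cl.erb templates below are written.
    puts ERB.new(template, trim_mode: '%').result(binding)
    # renders: __kernel void sin_float32(... __global const float *A, __global float *C) { ... }
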
@@ -1,4 +1,5 @@
- __kernel void round_fp(const int M, const int N, __global const float *A, __global float *C) {
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void round_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -1,5 +1,6 @@
+ % c_dtype = dtype_to_c_type(dtype)
 
- __kernel void sigmoid_fp(const int M, const int N, __global const float *A, __global float *C) {
+ __kernel void sigmoid_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -1,14 +1,15 @@
+ % c_dtype = dtype_to_c_type(dtype)
 
- float sigmoid(float x) {
+ float sigmoid(<%= c_dtype %> x) {
  return 1.0f/(1.0f + exp(-x));
  }
 
- float sigmoid_grad(float x, float g) {
+ float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
  return g * sigmoid(x) * ( 1.0f - sigmoid(x));
  }
 
  // same dimension add floating point op
- __kernel void sigmoid_grad_fp(const int M, const int N, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+ __kernel void sigmoid_grad_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -17,7 +18,7 @@ float sigmoid_grad(float x, float g) {
  }
 
  // 1D + Scalar floating point add op
- __kernel void sigmoid_grad_c_fp(const int M, const int N, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+ __kernel void sigmoid_grad_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -30,7 +31,7 @@ float sigmoid_grad(float x, float g) {
  }
 
  // 1D + Scalar floating point add op broadcast
- __kernel void sigmoid_grad_b_fp(const int M, const int N, const int M2, const int N2, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+ __kernel void sigmoid_grad_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -1,23 +1,21 @@
- __kernel void sign_fp(const int M, const int N, __global const float *A, __global float *C) {
+ % c_dtype = dtype_to_c_type(dtype)
+
+ __kernel void sign_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
- float value = A[globalRow * N + globalCol];
+ <%= c_dtype %> value = A[globalRow * N + globalCol];
+ % if is_floating_point?(dtype)
  if (isnan(value) || value == 0.0f) {
  C[globalRow * N + globalCol] = 0.0;
  } else {
  C[globalRow * N + globalCol] = value < 0 ? -1.0 : 1.0;
  }
- }
-
- __kernel void sign_int(const int M, const int N, __global const int *A, __global int *C) {
- // Get the index of the current element to be processed
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
- float value = A[globalRow * N + globalCol];
- if (isnan(value) || value == 0) {
- C[globalRow * N + globalCol] = 0;
- } else {
- C[globalRow * N + globalCol] = value < 0 ? -1 : 1;
- }
+ % else
+ if (value == 0) {
+ C[globalRow * N + globalCol] = 0;
+ } else {
+ C[globalRow * N + globalCol] = value < 0 ? -1 : 1;
+ }
+ % end
  }
@@ -1,5 +1,6 @@
 
- __kernel void sin_fp(const int M, const int N, __global const float *A, __global float *C) {
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void sin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -0,0 +1,26 @@
+ // First naive implementation
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void softmax_<%= dtype %>(const int N,
+ const __global <%= c_dtype %>* A,
+ __global <%= c_dtype %>* C) {
+
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+
+ // Compute a single element (loop over K)
+ <%= c_dtype %> acc = 0.0f;
+ <%= c_dtype %> max = <%= min_value_for(dtype) %>;
+
+ for (int k=0; k<N; k++) {
+ max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
+ }
+
+ for (int k=0; k<N; k++) {
+ acc += exp(A[globalRow*N + k] - max);
+ }
+
+ // Store the result
+ for (int k=0; k < N; k++) {
+ C[globalRow*N + k] = exp(A[globalRow*N + k] - max) / acc;
+ }
+ }
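
The new softmax kernel is the usual numerically stable formulation: each row is shifted by its row maximum before exponentiation so exp never overflows, then normalized by the sum of exponentials. The same per-row computation in plain Ruby, for reference only:

    # Row-wise softmax with the same max-shift trick the kernel uses.
    def softmax(row)
      max = row.max
      exps = row.map { |x| Math.exp(x - max) }
      sum = exps.sum
      exps.map { |e| e / sum }
    end

    softmax([1.0, 2.0, 3.0]) # => [0.0900..., 0.2447..., 0.6652...]
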
@@ -0,0 +1,46 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void softmax_grad_<%= dtype %>(const int N,
+ const __global <%= c_dtype %>* A,
+ const __global <%= c_dtype %>* G,
+ __global <%= c_dtype %>* C) {
+
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+
+ // Compute a single element (loop over K)
+ float acc = 0.0f;
+ float max = FLT_MIN;
+ float row[<%= size %>];
+ float grads[<%= size %>][<%= size %>];
+
+ for (int k=0; k<N; k++) {
+ max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
+ }
+
+ for (int k=0; k<N; k++) {
+ acc += exp(A[globalRow*N + k] - max);
+ }
+
+ // Store the result
+ for (int k=0; k < N; k++) {
+ row[k] = exp(A[globalRow*N + k] - max) / acc;
+ }
+
+ for (int a=0; a < N; a++) {
+ for(int b=0; b < N; b++) {
+ if (a != b) {
+ grads[a][b] = -row[a] * row[b];
+ } else {
+ grads[a][b] = row[a] * (1.0f - row[a]);
+ }
+ }
+ }
+
+ for (int k=0; k < N; k++) {
+ float total_grad = 0.0f;
+ for (int a = 0; a < N; a++) {
+ total_grad += grads[a][k] * G[globalRow*N + a];
+ }
+ C[globalRow*N + k] = total_grad;
+ }
+ }
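
The gradient kernel materializes the full per-row softmax Jacobian (grads[a][b] is -row[a]*row[b] off the diagonal and row[a]*(1 - row[a]) on it) and then folds the incoming gradient G through it. A compact Ruby rendering of the same math, for reference only:

    # Given a row's softmax output s and upstream gradient g,
    # C[k] = sum over a of J[a][k] * g[a], where J[a][b] = s[a] * ((a == b ? 1.0 : 0.0) - s[b]).
    def softmax_grad(s, g)
      n = s.size
      (0...n).map do |k|
        (0...n).reduce(0.0) { |acc, a| acc + s[a] * ((a == k ? 1.0 : 0.0) - s[k]) * g[a] }
      end
    end
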
@@ -1,5 +1,6 @@
+ % c_dtype = dtype_to_c_type(dtype)
 
- __kernel void sqrt_fp(const int M, const int N, __global const float *A, __global float *C) {
+ __kernel void sqrt_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -1,12 +1,6 @@
- __kernel void square_fp(const int M, const int N, __global const float *A, __global float *C) {
- // Get the index of the current element to be processed
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] * A[globalRow * N + globalCol];
- }
+ % c_dtype = dtype_to_c_type(dtype)
 
- __kernel void square_int(const int M, const int N, __global const int *A, __global int *C) {
+ __kernel void square_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -1,5 +1,3 @@
- % %w[fp int].product(%w[sub]).each do |dtype, fname|
  % c_dtype = dtype_to_c_type(dtype)
- % op = operator_to_c(fname)
- <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: fname, dtype: dtype, result_t: c_dtype %>
- % end
+ % op = operator_to_c('sub')
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'sub', dtype: dtype, result_t: c_dtype %>
@@ -1,4 +1,5 @@
- __kernel void tan_fp(const int M, const int N, __global const float *A, __global float *C) {
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void tan_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -1,4 +1,5 @@
- __kernel void tanh_fp(const int M, const int N, __global const float *A, __global float *C) {
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void tanh_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -1,4 +1,5 @@
- __kernel void tanh_grad_fp(const int M, const int N, __global const float *A, __global float *C) {
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void tanh_grad_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -1,12 +1,5 @@
- __kernel void where_fp(const int M, const int N, __global const int *PRED, __global const float *A, __global const float *B, __global float *C) {
- // Get the index of the current element to be processed
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
- C[globalRow * N + globalCol] = PRED[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
- }
-
- __kernel void where_int(const int M, const int N, __global const int *PRED, __global const int *A, __global const int *B, __global int *C) {
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void where_<%= dtype %>(const int M, const int N, __global const int *PRED, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -41,20 +41,18 @@ module TensorStream
  @preferred_device = preferred_device
  @retain = context[:retain] || []
  @thread_pool = thread_pool || Concurrent::ImmediateExecutor.new
-
+ @context[:_cache][:_cl_buffers] ||= {} if @context[:_cache]
  @context[:compute_history] = [] if log_intermediates
  end
 
  # opencl evaluator main entrypoint
  def run(tensor, execution_context)
  _create_opencl_context
- # _prepare_kernels
-
+ create_command_queue
  read_final_result(complete_eval(tensor, execution_context))
  end
 
  def complete_eval(tensor, context)
- create_command_queue
  buffer = _run(tensor, context)
  if buffer.is_a?(Array)
  buffer = buffer.collect do |b|
@@ -66,7 +64,6 @@ module TensorStream
  return buffer if buffer.nil? || buffer.buffer.size.zero?
  _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: [buffer.op].compact)
  end
-
  _opencl_queue.finish
  buffer
  end
@@ -91,15 +88,18 @@ module TensorStream
  @preferred_device
  else
  device, _score, _platform, _index = choose_best_device
+ # puts "using #{device.name}"
  device
  end
  end
+ @context[:cl_device] = opencl_device
  @context[:_cache][:_opencl_context] ||= OpenCL.create_context(opencl_device)
  end
 
  def choose_best_device
  @best_device ||= begin
  devices = OpenCL.platforms.flat_map do |p|
+
  p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
  score = 0
  if d.type.to_s == 'CPU'
@@ -108,13 +108,18 @@ module TensorStream
  score += 4
  end
 
+ if d.platform.name == 'NVIDIA CUDA'
+ score += 1000
+ end
+
  score += d.max_compute_units
+ score += d.max_clock_frequency
 
  [d, score, p.name, index]
  end
  end
+ devices.sort { |a| a[1] }.reverse.first
  end
- devices.max { |a| a[1] }
  end
 
  def create_command_queue
@@ -137,11 +142,13 @@ module TensorStream
  File.join(File.dirname(__FILE__), 'kernels', "#{kernel}.#{extension}")
  end
 
- def _cl_program(kernel)
- @context[:_cache]["_opencl_kernel_#{kernel}"] ||= begin
+ def _cl_program(kernel, args = {})
+ suffix = args.collect { |k,v| "#{k}.#{v}"}.join('.')
+ @context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}"] ||= begin
  filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
  source = File.read(filename)
- source = OpenclTemplateHelper.new(source).generate
+ source = OpenclTemplateHelper.new(source).generate(args)
+ File.write("/tmp/#{kernel}.#{suffix}.cl", source)
  program = _opencl_context.create_program_with_source(source)
  program.build
  rescue OpenCL::Error::BUILD_PROGRAM_FAILURE => e
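
Because each data type now produces different OpenCL source, compiled programs are cached per kernel name plus a suffix derived from the template arguments, so for example _cl_program('gemm', dtype: :float32) and _cl_program('gemm', dtype: :int32) are compiled and cached separately. The suffix is simply the args hash flattened into a dotted string, as a small illustration:

    # Mirrors the cache-key construction shown in the hunk above.
    args = { dtype: :float32 }
    suffix = args.collect { |k, v| "#{k}.#{v}" }.join('.')
    cache_key = "_opencl_kernel_gemm.#{suffix}"
    # => "_opencl_kernel_gemm.dtype.float32"
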
@@ -152,7 +159,9 @@ module TensorStream
 
  def _run(tensor, execution_context)
  return tensor if tensor.is_a?(OpenCLBuffer)
- return tensor.map { |t| _run(t, execution_context) } if tensor.is_a?(Array)
+ if tensor.is_a?(Array) && tensor.size > 0 && tensor[0].is_a?(Tensor)
+ return tensor.map { |t| _run(t, execution_context) }
+ end
 
  return tensor if retain.include?(tensor) # if var is in retain don't eval to value
 
@@ -180,10 +189,11 @@ module TensorStream
 
  def eval_operation(tensor, child_context)
  return @context[tensor.name] if @context.key?(tensor.name)
-
+ cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}"
+ return @context[cache_key] if @context.key?(cache_key)
  a = resolve_placeholder(tensor.items[0], child_context) if tensor.items && tensor.items[0]
  b = resolve_placeholder(tensor.items[1], child_context) if tensor.items && tensor.items[1]
-
+ # puts tensor.name
  case tensor.operation
  when :concat
  input_a = read_final_result(complete_eval(a, child_context))
@@ -238,7 +248,6 @@ module TensorStream
  when :assign_add
  a = _run(a, child_context)
  b = _run(b, child_context)
-
  value = execute_2_operand_func('add', tensor, a, b, child_context)
  assign_var(tensor, value, child_context)
  when :assign_sub
@@ -290,8 +299,8 @@ module TensorStream
  raise "#{tensor.items[1].name} rank must be greater than 1" if b.shape.size < 2
  raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
 
- dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
- a, b = type_cast(a, b)
+ dtype = tensor.data_type
+ a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
  output_buffer = _create_result_buffer(a.data_type, result_shape, tensor.name)
 
  cl_m = OpenCL::Int1.new(m)
@@ -301,7 +310,7 @@ module TensorStream
  transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
  transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
 
- output_buffer.op = _cl_program('gemm').send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
+ output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
  output_buffer
  when :mul
  execute_2_operand_func('mul', tensor, a, b, child_context)
@@ -311,14 +320,12 @@ module TensorStream
  a = _run(a, child_context)
  if a.data_type != tensor.data_type
  buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
- s_dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(a.data_type) ? 'fp' : 'int'
- t_dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
  m, n = a.shape
  cl_m = OpenCL::Int1.new(m || 1)
  cl_n = OpenCL::Int1.new(n || 1)
  work_group = [m || 1, n || 1]
 
- buffer.op = _cl_program("cast").send(:"cast_#{s_dtype}_#{t_dtype}",_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer)
+ buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer)
  buffer
  else
  a
@@ -355,6 +362,34 @@ module TensorStream
  execute_func('log1p', tensor, a, child_context)
  when :round
  execute_func('round', tensor, a, child_context)
+ when :softmax
+ a = _run(a, child_context)
+ event_wait_list = [a.op].compact
+ dtype = tensor.data_type
+ output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
+
+ m, n = a.shape
+ work_group = [m]
+ n = m if n.nil?
+ cl_n = OpenCL::Int1.new(n || 1)
+
+ event = _cl_program("softmax", dtype: dtype).send(:"softmax_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ output_buffer.op = event
+ output_buffer
+ when :softmax_grad
+ a = _run(a, child_context)
+ grad = _run(b, child_context)
+ event_wait_list = [a.op].compact
+ dtype = tensor.data_type
+ output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
+
+ m, n = a.shape
+ work_group = [m]
+ n = m if n.nil?
+ cl_n = OpenCL::Int1.new(n || 1)
+ event = _cl_program("softmax_grad", dtype: dtype, size: n).send(:"softmax_grad_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ output_buffer.op = event
+ output_buffer
  when :sigmoid_grad
  execute_2_operand_func('sigmoid_grad', tensor, a, b, child_context)
  when :truncate
@@ -381,6 +416,14 @@ module TensorStream
  end
  end
  end
+ when :check_numerics
+ a = complete_eval(a, child_context)
+ name = tensor.options[:name]
+
+ a.buffer.each do |item|
+ raise "#{name} Invalid Argument" if item.nan? || item.infinite?
+ end
+ a
  when :zeros, :ones, :zeros_like, :ones_like
  shape = if %i[zeros_like ones_like].include?(tensor.operation)
  _run(a, child_context).shape
@@ -551,6 +594,7 @@ module TensorStream
  else
  raise "unknown op #{tensor.operation}"
  end.tap do |result|
+ # puts "#{tensor.to_math(true,1)} = #{read_final_result(complete_eval(result, child_context))}"
  if tensor.breakpoint
  a = read_final_result(complete_eval(a, child_context))
  b = read_final_result(complete_eval(b, child_context))
@@ -568,11 +612,13 @@ module TensorStream
  value: result
  }
  end
+ @context[:_cache][cache_key] = @context[cache_key] if tensor.is_const
  @context[tensor.name] = result
  end
  rescue EvaluatorExcecutionException => e
  raise e
  rescue StandardError => e
+ _opencl_queue.finish # dump queue
  puts e.message
  puts e.backtrace.join("\n")
 
@@ -612,8 +658,12 @@ module TensorStream
  def assign_var(tensor, b, child_context)
  assign = tensor.items[0] || tensor
  buffer = complete_eval(b, child_context)
+
  if assign.buffer
- assign.buffer.op = _opencl_queue.enqueue_write_buffer(assign.buffer.cl_buffer, buffer.buffer)
+ buffer = type_cast(buffer, assign.data_type, name: "#{tensor.name}/cast_#{tensor.name}_#{tensor.data_type}")
+ if assign.buffer.cl_buffer != buffer.cl_buffer
+ assign.buffer.op = _opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list: [buffer.op, assign.buffer.op])
+ end
  else
  assign.buffer = convert_to_opencl(read_final_result(buffer), buffer.shape, data_type: tensor.data_type, name: tensor.name)
  end
@@ -624,8 +674,8 @@ module TensorStream
  def execute_2_operand_func(op_name, tensor, input_a, input_b, child_context, prog_name = nil)
  a = _run(input_a, child_context)
  b = _run(input_b, child_context)
- a, b = type_cast(a, b)
- dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
+ a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
+ dtype = tensor.data_type
  result_shape = TensorShape.infer_shape(a.shape, b.shape)
 
  output_buffer = _create_result_buffer(tensor.data_type, result_shape, tensor.name)
@@ -646,9 +696,9 @@ module TensorStream
  else
  raise "rank > 2 not supported!"
  end
- _cl_program("#{prog_name || op_name}").send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ _cl_program("#{prog_name || op_name}", dtype: dtype).send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  else
- _cl_program("#{prog_name || op_name}").send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ _cl_program("#{prog_name || op_name}", dtype: dtype).send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  end
 
  output_buffer.op = event
@@ -660,8 +710,8 @@ module TensorStream
  a = _run(input_a, child_context)
  b = _run(input_b, child_context)
 
- a, b = type_cast(a, b)
- dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
+ a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
+ dtype = tensor.data_type
 
  output_buffer = _create_result_buffer(tensor.data_type, p.shape, tensor.name)
 
@@ -671,14 +721,14 @@ module TensorStream
  cl_n = OpenCL::Int1.new(n || 1)
 
  event_wait_list = [a.op, b.op, p.op].compact # add dependency wait list
- output_buffer.op = _cl_program("#{op_name}").send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ output_buffer.op = _cl_program("#{op_name}", dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  output_buffer
  end
 
  def execute_func(op_name, tensor, a, child_context)
  a = _run(a, child_context)
- event_wait_list = [a.op].compact
- dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
+ event_wait_list = [a.op].compact
+ dtype = tensor.data_type
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
  m, n = a.shape
@@ -686,43 +736,37 @@ module TensorStream
  cl_m = OpenCL::Int1.new(m || 1)
  cl_n = OpenCL::Int1.new(n || 1)
 
- event = _cl_program("#{op_name}").send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ event = _cl_program("#{op_name}", dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  output_buffer.op = event
  output_buffer
  end
 
- def type_cast(a, b)
+ def auto_type_cast(a, b, name: nil)
  return [a, b] if a.data_type == b.data_type
  m, n = b.shape
  work_group = [m || 1, n || 1]
- buffer = buffer_for(b.shape, b.data_type)
- if (TensorStream::Ops::FLOATING_POINT_TYPES.include?(a.data_type.to_sym))
- if TensorStream::Ops::INTEGER_TYPES.include?(b.data_type.to_sym)
- cl_m = OpenCL::Int1.new(m || 1)
- cl_n = OpenCL::Int1.new(n || 1)
+ event_wait_list = [b.op].compact
+ buffer = _create_result_buffer(b.data_type, b.shape, name)
 
- _cl_program("cast").cast_int_fp(_opencl_queue, work_group, cl_m, cl_n, b.cl_buffer, buffer.cl_buffer)
- return [a, buffer]
- end
- elsif TensorStream::Ops::INTEGER_TYPES.include?(a.data_type.to_sym)
- if TensorStream::Ops::FLOATING_POINT_TYPES.include?(b.data_type.to_sym)
- cl_m = OpenCL::Int1.new(m || 1)
- cl_n = OpenCL::Int1.new(n || 1)
- _cl_program("cast").cast_fp_int(_opencl_queue, work_group, cl_m, cl_n, b.cl_buffer, buffer.cl_buffer)
- return [a, buffer]
- end
- end
+ cl_m = OpenCL::Int1.new(m || 1)
+ cl_n = OpenCL::Int1.new(n || 1)
 
- [a, b]
+ buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: b.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, b.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
+ [a, buffer]
  end
 
- def buffer_for(shape, data_type)
- size = shape.empty? ? 1 : shape.reduce(:*)
+ def type_cast(source, data_type, name: nil)
+ return source if source.data_type == data_type
+ m, n = source.shape
+ work_group = [m || 1, n || 1]
+ event_wait_list = [source.op].compact
+ buffer = _create_result_buffer(data_type, source.shape, name)
 
- buffer = allocate_narray_for_type(data_type, size)
+ cl_m = OpenCL::Int1.new(m || 1)
+ cl_n = OpenCL::Int1.new(n || 1)
 
- cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
- OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
+ buffer.op = _cl_program("cast", source_dt: source.data_type, target_dt: data_type).cast(_opencl_queue, work_group, cl_m, cl_n, source.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
+ buffer
  end
 
  end
@@ -786,11 +830,16 @@ module TensorStream
  end
 
  def allocate_narray_for_type(data_type, narray_size)
- if TensorStream::Ops::FLOATING_POINT_TYPES.include?(data_type.to_sym) || TensorStream::Ops::FLOATING_POINT_TYPES.include?(data_type.to_sym)
+ case data_type
+ when :float, :float32
  NArray.sfloat(narray_size)
- elsif TensorStream::Ops::INTEGER_TYPES.include?(data_type.to_sym) || TensorStream::Ops::INTEGER_TYPES.include?(data_type.to_sym)
+ when :float64
+ NArray.float(narray_size)
+ when :int, :int32, :int64
  NArray.int(narray_size)
- elsif data_type.to_sym == :boolean
+ when :int16
+ NArray.sint(narray_size)
+ when :boolean
  NArray.int(narray_size)
  else
  raise "unsupported type #{data_type}"
@@ -798,7 +847,7 @@ module TensorStream
  end
 
  def _create_result_buffer(data_type, shape, name)
- @context[:_cache]["_result_#{name}_#{shape.join('_')}"] ||= begin
+ @context[:_cache][:_cl_buffers]["_result_#{name}_#{shape.join('_')}"] ||= begin
  size = shape.empty? ? 1 : shape.reduce(:*)
  buffer = allocate_narray_for_type(data_type, size)
  cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
@@ -843,7 +892,8 @@ module TensorStream
  input = complete_eval(a, child_context)
  axis = read_final_result(complete_eval(b, child_context))
  if axis.nil?
- convert_to_opencl(input.buffer.send(func), [], data_type: tensor.data_type, name: tensor.name)
+ red = input.buffer.send(func)
+ convert_to_opencl(red, [], data_type: tensor.data_type, name: tensor.name)
  else
  return input if input.shape.empty?
  value = input.buffer.reshape(*input.shape.reverse)