tensor_stream 0.3.0 → 0.4.0

Files changed (59)
  1. checksums.yaml +4 -4
  2. data/.circleci/config.yml +7 -7
  3. data/CHANGELOG.md +13 -0
  4. data/Dockerfile +25 -0
  5. data/Rakefile +6 -0
  6. data/benchmark/benchmark.rb +16 -57
  7. data/benchmark_intel.txt +21 -0
  8. data/benchmark_nvidia.txt +33 -0
  9. data/lib/tensor_stream.rb +4 -173
  10. data/lib/tensor_stream/debugging/debugging.rb +20 -0
  11. data/lib/tensor_stream/evaluator/kernels/abs.cl +9 -5
  12. data/lib/tensor_stream/evaluator/kernels/add.cl +2 -4
  13. data/lib/tensor_stream/evaluator/kernels/argmax.cl +2 -9
  14. data/lib/tensor_stream/evaluator/kernels/argmin.cl +2 -9
  15. data/lib/tensor_stream/evaluator/kernels/cast.cl +3 -8
  16. data/lib/tensor_stream/evaluator/kernels/cond.cl.erb +1 -1
  17. data/lib/tensor_stream/evaluator/kernels/cos.cl +2 -1
  18. data/lib/tensor_stream/evaluator/kernels/div.cl.erb +2 -4
  19. data/lib/tensor_stream/evaluator/kernels/exp.cl +2 -1
  20. data/lib/tensor_stream/evaluator/kernels/gemm.cl +8 -39
  21. data/lib/tensor_stream/evaluator/kernels/log.cl +2 -1
  22. data/lib/tensor_stream/evaluator/kernels/log1p.cl +2 -1
  23. data/lib/tensor_stream/evaluator/kernels/max.cl +4 -49
  24. data/lib/tensor_stream/evaluator/kernels/mul.cl +2 -4
  25. data/lib/tensor_stream/evaluator/kernels/negate.cl +2 -9
  26. data/lib/tensor_stream/evaluator/kernels/pow.cl +4 -88
  27. data/lib/tensor_stream/evaluator/kernels/reciprocal.cl +2 -9
  28. data/lib/tensor_stream/evaluator/kernels/round.cl +2 -1
  29. data/lib/tensor_stream/evaluator/kernels/sigmoid.cl +2 -1
  30. data/lib/tensor_stream/evaluator/kernels/sigmoid_grad.cl +6 -5
  31. data/lib/tensor_stream/evaluator/kernels/sign.cl +12 -14
  32. data/lib/tensor_stream/evaluator/kernels/sin.cl +2 -1
  33. data/lib/tensor_stream/evaluator/kernels/softmax.cl +26 -0
  34. data/lib/tensor_stream/evaluator/kernels/softmax_grad.cl +46 -0
  35. data/lib/tensor_stream/evaluator/kernels/sqrt.cl +2 -1
  36. data/lib/tensor_stream/evaluator/kernels/square.cl +2 -8
  37. data/lib/tensor_stream/evaluator/kernels/sub.cl +2 -4
  38. data/lib/tensor_stream/evaluator/kernels/tan.cl +2 -1
  39. data/lib/tensor_stream/evaluator/kernels/tanh.cl +2 -1
  40. data/lib/tensor_stream/evaluator/kernels/tanh_grad.cl +2 -1
  41. data/lib/tensor_stream/evaluator/kernels/where.cl +2 -9
  42. data/lib/tensor_stream/evaluator/opencl_evaluator.rb +108 -58
  43. data/lib/tensor_stream/evaluator/opencl_template_helper.rb +40 -5
  44. data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +35 -0
  45. data/lib/tensor_stream/evaluator/ruby_evaluator.rb +30 -9
  46. data/lib/tensor_stream/graph_serializers/graphml.rb +1 -1
  47. data/lib/tensor_stream/graph_serializers/pbtext.rb +4 -0
  48. data/lib/tensor_stream/math_gradients.rb +6 -5
  49. data/lib/tensor_stream/nn/nn_ops.rb +18 -2
  50. data/lib/tensor_stream/ops.rb +237 -44
  51. data/lib/tensor_stream/tensor.rb +16 -2
  52. data/lib/tensor_stream/utils.rb +205 -0
  53. data/lib/tensor_stream/variable.rb +2 -1
  54. data/lib/tensor_stream/version.rb +1 -1
  55. data/samples/error.graphml +2755 -0
  56. data/{test_samples → samples}/iris.rb +18 -24
  57. data/samples/logistic_regression.rb +0 -1
  58. data/test_samples/raw_neural_net_sample.rb +80 -23
  59. metadata +11 -3
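
The diffs below share one theme: the hand-written per-type kernel variants (round_fp, round_int, sign_fp, sign_int, ...) are collapsed into single ERB-templated kernels that are rendered per data type, and the OpenCL evaluator passes the dtype (and other template arguments) down when compiling each program. For reference, a minimal Ruby sketch of the template helpers those kernels assume (dtype_to_c_type, is_floating_point?); the real implementations live in data/lib/tensor_stream/evaluator/opencl_template_helper.rb and may differ:

  # Hedged sketch only -- illustrates the mapping the .cl.erb templates rely on.
  def dtype_to_c_type(dtype)
    case dtype.to_sym
    when :float64             then 'double'
    when :float, :float32     then 'float'
    when :int, :int32, :int64 then 'int'
    else raise "unknown dtype #{dtype}"
    end
  end

  def is_floating_point?(dtype)
    %i[float float32 float64].include?(dtype.to_sym)
  end
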
data/lib/tensor_stream/evaluator/kernels/round.cl
@@ -1,4 +1,5 @@
- __kernel void round_fp(const int M, const int N, __global const float *A, __global float *C) {
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void round_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    // Get the index of the current element to be processed
    const int globalRow = get_global_id(0); // Row ID of C (0..M)
    const int globalCol = get_global_id(1); // Col ID of C (0..N)
data/lib/tensor_stream/evaluator/kernels/sigmoid.cl
@@ -1,5 +1,6 @@
+ % c_dtype = dtype_to_c_type(dtype)
 
- __kernel void sigmoid_fp(const int M, const int N, __global const float *A, __global float *C) {
+ __kernel void sigmoid_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    // Get the index of the current element to be processed
    const int globalRow = get_global_id(0); // Row ID of C (0..M)
    const int globalCol = get_global_id(1); // Col ID of C (0..N)
data/lib/tensor_stream/evaluator/kernels/sigmoid_grad.cl
@@ -1,14 +1,15 @@
+ % c_dtype = dtype_to_c_type(dtype)
 
- float sigmoid(float x) {
+ float sigmoid(<%= c_dtype %> x) {
    return 1.0f/(1.0f + exp(-x));
  }
 
- float sigmoid_grad(float x, float g) {
+ float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
    return g * sigmoid(x) * ( 1.0f - sigmoid(x));
  }
 
  // same dimension add floating point op
- __kernel void sigmoid_grad_fp(const int M, const int N, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+ __kernel void sigmoid_grad_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    // Get the index of the current element to be processed
    const int globalRow = get_global_id(0); // Row ID of C (0..M)
    const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -17,7 +18,7 @@ float sigmoid_grad(float x, float g) {
  }
 
  // 1D + Scalar floating point add op
- __kernel void sigmoid_grad_c_fp(const int M, const int N, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+ __kernel void sigmoid_grad_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    // Get the index of the current element to be processed
    const int globalRow = get_global_id(0); // Row ID of C (0..M)
    const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -30,7 +31,7 @@ float sigmoid_grad(float x, float g) {
  }
 
  // 1D + Scalar floating point add op broadcast
- __kernel void sigmoid_grad_b_fp(const int M, const int N, const int M2, const int N2, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+ __kernel void sigmoid_grad_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    // Get the index of the current element to be processed
    const int globalRow = get_global_id(0); // Row ID of C (0..M)
    const int globalCol = get_global_id(1); // Col ID of C (0..N)
data/lib/tensor_stream/evaluator/kernels/sign.cl
@@ -1,23 +1,21 @@
- __kernel void sign_fp(const int M, const int N, __global const float *A, __global float *C) {
+ % c_dtype = dtype_to_c_type(dtype)
+
+ __kernel void sign_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    // Get the index of the current element to be processed
    const int globalRow = get_global_id(0); // Row ID of C (0..M)
    const int globalCol = get_global_id(1); // Col ID of C (0..N)
- float value = A[globalRow * N + globalCol];
+ <%= c_dtype %> value = A[globalRow * N + globalCol];
+ % if is_floating_point?(dtype)
    if (isnan(value) || value == 0.0f) {
      C[globalRow * N + globalCol] = 0.0;
    } else {
      C[globalRow * N + globalCol] = value < 0 ? -1.0 : 1.0;
    }
- }
-
- __kernel void sign_int(const int M, const int N, __global const int *A, __global int *C) {
- // Get the index of the current element to be processed
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
- float value = A[globalRow * N + globalCol];
- if (isnan(value) || value == 0) {
- C[globalRow * N + globalCol] = 0;
- } else {
- C[globalRow * N + globalCol] = value < 0 ? -1 : 1;
- }
+ % else
+ if (value == 0) {
+ C[globalRow * N + globalCol] = 0;
+ } else {
+ C[globalRow * N + globalCol] = value < 0 ? -1 : 1;
+ }
+ % end
  }
data/lib/tensor_stream/evaluator/kernels/sin.cl
@@ -1,5 +1,6 @@
 
- __kernel void sin_fp(const int M, const int N, __global const float *A, __global float *C) {
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void sin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    // Get the index of the current element to be processed
    const int globalRow = get_global_id(0); // Row ID of C (0..M)
    const int globalCol = get_global_id(1); // Col ID of C (0..N)
data/lib/tensor_stream/evaluator/kernels/softmax.cl
@@ -0,0 +1,26 @@
+ // First naive implementation
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void softmax_<%= dtype %>(const int N,
+ const __global <%= c_dtype %>* A,
+ __global <%= c_dtype %>* C) {
+
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+
+ // Compute a single element (loop over K)
+ <%= c_dtype %> acc = 0.0f;
+ <%= c_dtype %> max = <%= min_value_for(dtype) %>;
+
+ for (int k=0; k<N; k++) {
+ max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
+ }
+
+ for (int k=0; k<N; k++) {
+ acc += exp(A[globalRow*N + k] - max);
+ }
+
+ // Store the result
+ for (int k=0; k < N; k++) {
+ C[globalRow*N + k] = exp(A[globalRow*N + k] - max) / acc;
+ }
+ }
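
The kernel above subtracts each row's maximum before calling exp, the usual guard against overflow when exponentiating large activations. A hedged Ruby sketch of the same row-wise computation, for reference only (not part of the gem):

  # Numerically stable softmax over one row, mirroring the kernel's three loops.
  def stable_softmax(row)
    m = row.max                            # row maximum
    exps = row.map { |x| Math.exp(x - m) } # shift before exponentiating
    sum = exps.sum
    exps.map { |e| e / sum }               # normalize
  end

  stable_softmax([1.0, 2.0, 3.0]) # => approx [0.090, 0.245, 0.665]
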
data/lib/tensor_stream/evaluator/kernels/softmax_grad.cl
@@ -0,0 +1,46 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void softmax_grad_<%= dtype %>(const int N,
+ const __global <%= c_dtype %>* A,
+ const __global <%= c_dtype %>* G,
+ __global <%= c_dtype %>* C) {
+
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+
+ // Compute a single element (loop over K)
+ float acc = 0.0f;
+ float max = FLT_MIN;
+ float row[<%= size %>];
+ float grads[<%= size %>][<%= size %>];
+
+ for (int k=0; k<N; k++) {
+ max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
+ }
+
+ for (int k=0; k<N; k++) {
+ acc += exp(A[globalRow*N + k] - max);
+ }
+
+ // Store the result
+ for (int k=0; k < N; k++) {
+ row[k] = exp(A[globalRow*N + k] - max) / acc;
+ }
+
+ for (int a=0; a < N; a++) {
+ for(int b=0; b < N; b++) {
+ if (a != b) {
+ grads[a][b] = -row[a] * row[b];
+ } else {
+ grads[a][b] = row[a] * (1.0f - row[a]);
+ }
+ }
+ }
+
+ for (int k=0; k < N; k++) {
+ float total_grad = 0.0f;
+ for (int a = 0; a < N; a++) {
+ total_grad += grads[a][k] * G[globalRow*N + a];
+ }
+ C[globalRow*N + k] = total_grad;
+ }
+ }
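
The gradient kernel builds the softmax Jacobian row by row (grads[a][b] is -s_a * s_b off the diagonal and s_a * (1 - s_a) on it) and then applies it to the incoming gradient G. A hedged Ruby sketch of the same math, reusing stable_softmax from above; illustration only:

  # Multiply the softmax Jacobian by the upstream gradient for one row.
  def softmax_gradient(row, grad)
    s = stable_softmax(row)
    n = s.size
    (0...n).map do |k|
      (0...n).reduce(0.0) do |total, a|
        jac = (a == k ? s[a] * (1.0 - s[a]) : -s[a] * s[k]) # Jacobian entry
        total + jac * grad[a]
      end
    end
  end
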
data/lib/tensor_stream/evaluator/kernels/sqrt.cl
@@ -1,5 +1,6 @@
+ % c_dtype = dtype_to_c_type(dtype)
 
- __kernel void sqrt_fp(const int M, const int N, __global const float *A, __global float *C) {
+ __kernel void sqrt_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    // Get the index of the current element to be processed
    const int globalRow = get_global_id(0); // Row ID of C (0..M)
    const int globalCol = get_global_id(1); // Col ID of C (0..N)
data/lib/tensor_stream/evaluator/kernels/square.cl
@@ -1,12 +1,6 @@
- __kernel void square_fp(const int M, const int N, __global const float *A, __global float *C) {
- // Get the index of the current element to be processed
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] * A[globalRow * N + globalCol];
- }
+ % c_dtype = dtype_to_c_type(dtype)
 
- __kernel void square_int(const int M, const int N, __global const int *A, __global int *C) {
+ __kernel void square_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    // Get the index of the current element to be processed
    const int globalRow = get_global_id(0); // Row ID of C (0..M)
    const int globalCol = get_global_id(1); // Col ID of C (0..N)
data/lib/tensor_stream/evaluator/kernels/sub.cl
@@ -1,5 +1,3 @@
- % %w[fp int].product(%w[sub]).each do |dtype, fname|
  % c_dtype = dtype_to_c_type(dtype)
- % op = operator_to_c(fname)
- <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: fname, dtype: dtype, result_t: c_dtype %>
- % end
+ % op = operator_to_c('sub')
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'sub', dtype: dtype, result_t: c_dtype %>
data/lib/tensor_stream/evaluator/kernels/tan.cl
@@ -1,4 +1,5 @@
- __kernel void tan_fp(const int M, const int N, __global const float *A, __global float *C) {
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void tan_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    // Get the index of the current element to be processed
    const int globalRow = get_global_id(0); // Row ID of C (0..M)
    const int globalCol = get_global_id(1); // Col ID of C (0..N)
data/lib/tensor_stream/evaluator/kernels/tanh.cl
@@ -1,4 +1,5 @@
- __kernel void tanh_fp(const int M, const int N, __global const float *A, __global float *C) {
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void tanh_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    // Get the index of the current element to be processed
    const int globalRow = get_global_id(0); // Row ID of C (0..M)
    const int globalCol = get_global_id(1); // Col ID of C (0..N)
data/lib/tensor_stream/evaluator/kernels/tanh_grad.cl
@@ -1,4 +1,5 @@
- __kernel void tanh_grad_fp(const int M, const int N, __global const float *A, __global float *C) {
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void tanh_grad_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    // Get the index of the current element to be processed
    const int globalRow = get_global_id(0); // Row ID of C (0..M)
    const int globalCol = get_global_id(1); // Col ID of C (0..N)
data/lib/tensor_stream/evaluator/kernels/where.cl
@@ -1,12 +1,5 @@
- __kernel void where_fp(const int M, const int N, __global const int *PRED, __global const float *A, __global const float *B, __global float *C) {
- // Get the index of the current element to be processed
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
- C[globalRow * N + globalCol] = PRED[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
- }
-
- __kernel void where_int(const int M, const int N, __global const int *PRED, __global const int *A, __global const int *B, __global int *C) {
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void where_<%= dtype %>(const int M, const int N, __global const int *PRED, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    // Get the index of the current element to be processed
    const int globalRow = get_global_id(0); // Row ID of C (0..M)
    const int globalCol = get_global_id(1); // Col ID of C (0..N)
data/lib/tensor_stream/evaluator/opencl_evaluator.rb
@@ -41,20 +41,18 @@ module TensorStream
  @preferred_device = preferred_device
  @retain = context[:retain] || []
  @thread_pool = thread_pool || Concurrent::ImmediateExecutor.new
-
+ @context[:_cache][:_cl_buffers] ||= {} if @context[:_cache]
  @context[:compute_history] = [] if log_intermediates
  end
 
  # opencl evaluator main entrypoint
  def run(tensor, execution_context)
  _create_opencl_context
- # _prepare_kernels
-
+ create_command_queue
  read_final_result(complete_eval(tensor, execution_context))
  end
 
  def complete_eval(tensor, context)
- create_command_queue
  buffer = _run(tensor, context)
  if buffer.is_a?(Array)
  buffer = buffer.collect do |b|
@@ -66,7 +64,6 @@ module TensorStream
  return buffer if buffer.nil? || buffer.buffer.size.zero?
  _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: [buffer.op].compact)
  end
-
  _opencl_queue.finish
  buffer
  end
@@ -91,15 +88,18 @@ module TensorStream
  @preferred_device
  else
  device, _score, _platform, _index = choose_best_device
+ # puts "using #{device.name}"
  device
  end
  end
+ @context[:cl_device] = opencl_device
  @context[:_cache][:_opencl_context] ||= OpenCL.create_context(opencl_device)
  end
 
  def choose_best_device
  @best_device ||= begin
  devices = OpenCL.platforms.flat_map do |p|
+
  p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
  score = 0
  if d.type.to_s == 'CPU'
@@ -108,13 +108,18 @@ module TensorStream
  score += 4
  end
 
+ if d.platform.name == 'NVIDIA CUDA'
+ score += 1000
+ end
+
  score += d.max_compute_units
+ score += d.max_clock_frequency
 
  [d, score, p.name, index]
  end
  end
+ devices.sort { |a| a[1] }.reverse.first
  end
- devices.max { |a| a[1] }
  end
 
  def create_command_queue
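
Device selection now scores every available OpenCL device (a large bonus for the NVIDIA CUDA platform, plus compute units and clock frequency) and picks the highest score. A hedged Ruby sketch of that idea, not the gem's actual code; the device objects are only assumed to respond to the same methods used in the hunk above:

  # Illustrative device picker: highest combined score wins.
  def pick_best_device(opencl_devices)
    opencl_devices.map do |d|
      score = 0
      score += 1_000 if d.platform.name == 'NVIDIA CUDA' # strongly prefer CUDA devices
      score += d.max_compute_units + d.max_clock_frequency
      [d, score]
    end.max_by { |_device, score| score }.first
  end
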
@@ -137,11 +142,13 @@ module TensorStream
  File.join(File.dirname(__FILE__), 'kernels', "#{kernel}.#{extension}")
  end
 
- def _cl_program(kernel)
- @context[:_cache]["_opencl_kernel_#{kernel}"] ||= begin
+ def _cl_program(kernel, args = {})
+ suffix = args.collect { |k,v| "#{k}.#{v}"}.join('.')
+ @context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}"] ||= begin
  filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
  source = File.read(filename)
- source = OpenclTemplateHelper.new(source).generate
+ source = OpenclTemplateHelper.new(source).generate(args)
+ File.write("/tmp/#{kernel}.#{suffix}.cl", source)
  program = _opencl_context.create_program_with_source(source)
  program.build
  rescue OpenCL::Error::BUILD_PROGRAM_FAILURE => e
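
_cl_program now accepts template arguments and caches one compiled program per kernel-plus-arguments combination, so a float32 and an int32 build of the same kernel no longer collide in the cache. A small illustration of how the cache key is derived (same expression as in the hunk above; the kernel name is just an example):

  args = { dtype: :float32 }
  suffix = args.collect { |k, v| "#{k}.#{v}" }.join('.')
  key = "_opencl_kernel_gemm.#{suffix}"
  # => "_opencl_kernel_gemm.dtype.float32"
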
@@ -152,7 +159,9 @@ module TensorStream
 
  def _run(tensor, execution_context)
  return tensor if tensor.is_a?(OpenCLBuffer)
- return tensor.map { |t| _run(t, execution_context) } if tensor.is_a?(Array)
+ if tensor.is_a?(Array) && tensor.size > 0 && tensor[0].is_a?(Tensor)
+ return tensor.map { |t| _run(t, execution_context) }
+ end
 
  return tensor if retain.include?(tensor) # if var is in retain don't eval to value
 
@@ -180,10 +189,11 @@ module TensorStream
 
  def eval_operation(tensor, child_context)
  return @context[tensor.name] if @context.key?(tensor.name)
-
+ cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}"
+ return @context[cache_key] if @context.key?(cache_key)
  a = resolve_placeholder(tensor.items[0], child_context) if tensor.items && tensor.items[0]
  b = resolve_placeholder(tensor.items[1], child_context) if tensor.items && tensor.items[1]
-
+ # puts tensor.name
  case tensor.operation
  when :concat
  input_a = read_final_result(complete_eval(a, child_context))
@@ -238,7 +248,6 @@ module TensorStream
  when :assign_add
  a = _run(a, child_context)
  b = _run(b, child_context)
-
  value = execute_2_operand_func('add', tensor, a, b, child_context)
  assign_var(tensor, value, child_context)
  when :assign_sub
@@ -290,8 +299,8 @@ module TensorStream
  raise "#{tensor.items[1].name} rank must be greater than 1" if b.shape.size < 2
  raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
 
- dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
- a, b = type_cast(a, b)
+ dtype = tensor.data_type
+ a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
  output_buffer = _create_result_buffer(a.data_type, result_shape, tensor.name)
 
  cl_m = OpenCL::Int1.new(m)
@@ -301,7 +310,7 @@ module TensorStream
  transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
  transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
 
- output_buffer.op = _cl_program('gemm').send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
+ output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
  output_buffer
  when :mul
  execute_2_operand_func('mul', tensor, a, b, child_context)
@@ -311,14 +320,12 @@ module TensorStream
  a = _run(a, child_context)
  if a.data_type != tensor.data_type
  buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
- s_dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(a.data_type) ? 'fp' : 'int'
- t_dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
  m, n = a.shape
  cl_m = OpenCL::Int1.new(m || 1)
  cl_n = OpenCL::Int1.new(n || 1)
  work_group = [m || 1, n || 1]
 
- buffer.op = _cl_program("cast").send(:"cast_#{s_dtype}_#{t_dtype}",_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer)
+ buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer)
  buffer
  else
  a
@@ -355,6 +362,34 @@ module TensorStream
  execute_func('log1p', tensor, a, child_context)
  when :round
  execute_func('round', tensor, a, child_context)
+ when :softmax
+ a = _run(a, child_context)
+ event_wait_list = [a.op].compact
+ dtype = tensor.data_type
+ output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
+
+ m, n = a.shape
+ work_group = [m]
+ n = m if n.nil?
+ cl_n = OpenCL::Int1.new(n || 1)
+
+ event = _cl_program("softmax", dtype: dtype).send(:"softmax_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ output_buffer.op = event
+ output_buffer
+ when :softmax_grad
+ a = _run(a, child_context)
+ grad = _run(b, child_context)
+ event_wait_list = [a.op].compact
+ dtype = tensor.data_type
+ output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
+
+ m, n = a.shape
+ work_group = [m]
+ n = m if n.nil?
+ cl_n = OpenCL::Int1.new(n || 1)
+ event = _cl_program("softmax_grad", dtype: dtype, size: n).send(:"softmax_grad_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ output_buffer.op = event
+ output_buffer
  when :sigmoid_grad
  execute_2_operand_func('sigmoid_grad', tensor, a, b, child_context)
  when :truncate
@@ -381,6 +416,14 @@ module TensorStream
  end
  end
  end
+ when :check_numerics
+ a = complete_eval(a, child_context)
+ name = tensor.options[:name]
+
+ a.buffer.each do |item|
+ raise "#{name} Invalid Argument" if item.nan? || item.infinite?
+ end
+ a
  when :zeros, :ones, :zeros_like, :ones_like
  shape = if %i[zeros_like ones_like].include?(tensor.operation)
  _run(a, child_context).shape
@@ -551,6 +594,7 @@ module TensorStream
  else
  raise "unknown op #{tensor.operation}"
  end.tap do |result|
+ # puts "#{tensor.to_math(true,1)} = #{read_final_result(complete_eval(result, child_context))}"
  if tensor.breakpoint
  a = read_final_result(complete_eval(a, child_context))
  b = read_final_result(complete_eval(b, child_context))
@@ -568,11 +612,13 @@ module TensorStream
  value: result
  }
  end
+ @context[:_cache][cache_key] = @context[cache_key] if tensor.is_const
  @context[tensor.name] = result
  end
  rescue EvaluatorExcecutionException => e
  raise e
  rescue StandardError => e
+ _opencl_queue.finish # dump queue
  puts e.message
  puts e.backtrace.join("\n")
 
@@ -612,8 +658,12 @@ module TensorStream
  def assign_var(tensor, b, child_context)
  assign = tensor.items[0] || tensor
  buffer = complete_eval(b, child_context)
+
  if assign.buffer
- assign.buffer.op = _opencl_queue.enqueue_write_buffer(assign.buffer.cl_buffer, buffer.buffer)
+ buffer = type_cast(buffer, assign.data_type, name: "#{tensor.name}/cast_#{tensor.name}_#{tensor.data_type}")
+ if assign.buffer.cl_buffer != buffer.cl_buffer
+ assign.buffer.op = _opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list: [buffer.op, assign.buffer.op])
+ end
  else
  assign.buffer = convert_to_opencl(read_final_result(buffer), buffer.shape, data_type: tensor.data_type, name: tensor.name)
  end
@@ -624,8 +674,8 @@ module TensorStream
  def execute_2_operand_func(op_name, tensor, input_a, input_b, child_context, prog_name = nil)
  a = _run(input_a, child_context)
  b = _run(input_b, child_context)
- a, b = type_cast(a, b)
- dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
+ a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
+ dtype = tensor.data_type
  result_shape = TensorShape.infer_shape(a.shape, b.shape)
 
  output_buffer = _create_result_buffer(tensor.data_type, result_shape, tensor.name)
@@ -646,9 +696,9 @@ module TensorStream
  else
  raise "rank > 2 not supported!"
  end
- _cl_program("#{prog_name || op_name}").send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ _cl_program("#{prog_name || op_name}", dtype: dtype).send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  else
- _cl_program("#{prog_name || op_name}").send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ _cl_program("#{prog_name || op_name}", dtype: dtype).send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  end
 
  output_buffer.op = event
@@ -660,8 +710,8 @@ module TensorStream
  a = _run(input_a, child_context)
  b = _run(input_b, child_context)
 
- a, b = type_cast(a, b)
- dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
+ a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
+ dtype = tensor.data_type
 
  output_buffer = _create_result_buffer(tensor.data_type, p.shape, tensor.name)
 
@@ -671,14 +721,14 @@ module TensorStream
  cl_n = OpenCL::Int1.new(n || 1)
 
  event_wait_list = [a.op, b.op, p.op].compact # add dependency wait list
- output_buffer.op = _cl_program("#{op_name}").send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ output_buffer.op = _cl_program("#{op_name}", dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  output_buffer
  end
 
  def execute_func(op_name, tensor, a, child_context)
  a = _run(a, child_context)
- event_wait_list = [a.op].compact
- dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
+ event_wait_list = [a.op].compact
+ dtype = tensor.data_type
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
  m, n = a.shape
@@ -686,43 +736,37 @@ module TensorStream
  cl_m = OpenCL::Int1.new(m || 1)
  cl_n = OpenCL::Int1.new(n || 1)
 
- event = _cl_program("#{op_name}").send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ event = _cl_program("#{op_name}", dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  output_buffer.op = event
  output_buffer
  end
 
- def type_cast(a, b)
+ def auto_type_cast(a, b, name: nil)
  return [a, b] if a.data_type == b.data_type
  m, n = b.shape
  work_group = [m || 1, n || 1]
- buffer = buffer_for(b.shape, b.data_type)
- if (TensorStream::Ops::FLOATING_POINT_TYPES.include?(a.data_type.to_sym))
- if TensorStream::Ops::INTEGER_TYPES.include?(b.data_type.to_sym)
- cl_m = OpenCL::Int1.new(m || 1)
- cl_n = OpenCL::Int1.new(n || 1)
+ event_wait_list = [b.op].compact
+ buffer = _create_result_buffer(b.data_type, b.shape, name)
 
- _cl_program("cast").cast_int_fp(_opencl_queue, work_group, cl_m, cl_n, b.cl_buffer, buffer.cl_buffer)
- return [a, buffer]
- end
- elsif TensorStream::Ops::INTEGER_TYPES.include?(a.data_type.to_sym)
- if TensorStream::Ops::FLOATING_POINT_TYPES.include?(b.data_type.to_sym)
- cl_m = OpenCL::Int1.new(m || 1)
- cl_n = OpenCL::Int1.new(n || 1)
- _cl_program("cast").cast_fp_int(_opencl_queue, work_group, cl_m, cl_n, b.cl_buffer, buffer.cl_buffer)
- return [a, buffer]
- end
- end
+ cl_m = OpenCL::Int1.new(m || 1)
+ cl_n = OpenCL::Int1.new(n || 1)
 
- [a, b]
+ buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: b.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, b.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
+ [a, buffer]
  end
 
- def buffer_for(shape, data_type)
- size = shape.empty? ? 1 : shape.reduce(:*)
+ def type_cast(source, data_type, name: nil)
+ return source if source.data_type == data_type
+ m, n = source.shape
+ work_group = [m || 1, n || 1]
+ event_wait_list = [source.op].compact
+ buffer = _create_result_buffer(data_type, source.shape, name)
 
- buffer = allocate_narray_for_type(data_type, size)
+ cl_m = OpenCL::Int1.new(m || 1)
+ cl_n = OpenCL::Int1.new(n || 1)
 
- cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
- OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
+ buffer.op = _cl_program("cast", source_dt: source.data_type, target_dt: data_type).cast(_opencl_queue, work_group, cl_m, cl_n, source.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
+ buffer
  end
 
  def wrap_opencl(tensor, data_type: nil, name: nil)
@@ -786,11 +830,16 @@ module TensorStream
  end
 
  def allocate_narray_for_type(data_type, narray_size)
- if TensorStream::Ops::FLOATING_POINT_TYPES.include?(data_type.to_sym) || TensorStream::Ops::FLOATING_POINT_TYPES.include?(data_type.to_sym)
+ case data_type
+ when :float, :float32
  NArray.sfloat(narray_size)
- elsif TensorStream::Ops::INTEGER_TYPES.include?(data_type.to_sym) || TensorStream::Ops::INTEGER_TYPES.include?(data_type.to_sym)
+ when :float64
+ NArray.float(narray_size)
+ when :int, :int32, :int64
  NArray.int(narray_size)
- elsif data_type.to_sym == :boolean
+ when :int16
+ NArray.sint(narray_size)
+ when :boolean
  NArray.int(narray_size)
  else
  raise "unsupported type #{data_type}"
@@ -798,7 +847,7 @@ module TensorStream
  end
 
  def _create_result_buffer(data_type, shape, name)
- @context[:_cache]["_result_#{name}_#{shape.join('_')}"] ||= begin
+ @context[:_cache][:_cl_buffers]["_result_#{name}_#{shape.join('_')}"] ||= begin
  size = shape.empty? ? 1 : shape.reduce(:*)
  buffer = allocate_narray_for_type(data_type, size)
  cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
@@ -843,7 +892,8 @@ module TensorStream
  input = complete_eval(a, child_context)
  axis = read_final_result(complete_eval(b, child_context))
  if axis.nil?
- convert_to_opencl(input.buffer.send(func), [], data_type: tensor.data_type, name: tensor.name)
+ red = input.buffer.send(func)
+ convert_to_opencl(red, [], data_type: tensor.data_type, name: tensor.name)
  else
  return input if input.shape.empty?
  value = input.buffer.reshape(*input.shape.reverse)