tensor_stream 0.5.1 → 0.6.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
- SHA1:
-   metadata.gz: f14dd6388d5cdd10827cebde01a9cbca0686b653
-   data.tar.gz: d2ccba35defe6474a21bd75fcb09f8d49ce42e79
+ SHA256:
+   metadata.gz: 6d647cef8f32fa7b3c10460365adfc55ccdd9872e71d453df090986349b615f5
+   data.tar.gz: baa7be71775bc5d39396343b6d4c32943cf3b79d5a2e591c885bd6fc9314883e
  SHA512:
-   metadata.gz: 244026aae6ce13d8e932deada3c169b5320df517eb5dd7db5ea8c06c1cdedc9c9829d7f149261602f502c284ffe65ae831845016d9425250b0ad9d7d66fc6a0e
-   data.tar.gz: 91811c88a464604f5ca1e776f86d0342dc316ee016d920d40d8a228e2978f6a275783c5613a97cf21af1ba9256c951ad3db777b66fae20e5d1f8f9659f170301
+   metadata.gz: d3207ef919464e696d03fe7bbd264ba606565bf09d66796d461b687f047e8b2b969259bcffaf7524677d6b22398ba32025ca8f94c33756cda3a3bb37f535a902
+   data.tar.gz: 735dbd55e54237619bb9c6653818dade6df257fb71a58cc4abd540a3053d51d318d734ba149c3889bd6403a5219166c291534a66528b83e481ba1240e2696e49
data/CHANGELOG.md CHANGED
@@ -4,6 +4,15 @@ All notable changes to this project will be documented in this file.
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+ ## [0.6.0] - 2018-07-21
+ ### Added
+ - [NEW OP] fill, floor_div, dynamic_stitch, mod, range, size, squared_difference
+
+ ### Fixes
+ - [General] Some auto-differentiation fixes
+ - [softmax_cross_entropy_with_logits_v2] Use numerically stable way of calculating values
+ - Other fixes related to shape computation
+
  ## [0.5.1] - 2018-06-27
  ### Added
  - Added support for control_dependencies
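The new element-wise ops listed above follow TensorFlow-style naming. A minimal sketch of how they might be exercised through the gem's TensorFlow-like API (the `ts.floor_div`, `ts.mod`, `ts.squared_difference` and `ts.size` calls plus the session setup below are assumptions based on that naming convention, not taken from this diff):

    require 'tensor_stream'

    ts = TensorStream
    a  = ts.constant([[7.0, 8.0], [9.0, 10.0]])
    b  = ts.constant([[2.0, 3.0], [4.0, 5.0]])

    sess = ts.session
    sess.run(ts.floor_div(a, b))           # element-wise floored quotient
    sess.run(ts.mod(a, b))                 # element-wise remainder
    sess.run(ts.squared_difference(a, b))  # (a - b) * (a - b), element-wise
    sess.run(ts.size(a))                   # total element count => 4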
@@ -0,0 +1,36 @@
+ TensorStream::Evaluator::OpenclEvaluator
+ TensorStream::Evaluator::RubyEvaluator
+ model name : AMD Ryzen 3 1300X Quad-Core Processor
+ OpenCL device AMD Accelerated Parallel Processing Ellesmere
+ Rehearsal --------------------------------------------------------------
+ pure ruby ooo matmul : 1.480000 0.000000 1.480000 ( 1.486855)
+ opencl ooo matmul : 0.190000 0.130000 0.320000 ( 0.332605)
+ pure ruby softmax : 0.280000 0.000000 0.280000 ( 0.278398)
+ opencl softmax : 0.040000 0.020000 0.060000 ( 0.070980)
+ pure ruby matmul : 0.730000 0.000000 0.730000 ( 0.726565)
+ opencl matmul : 0.020000 0.010000 0.030000 ( 0.046762)
+ pure ruby : 2.550000 0.000000 2.550000 ( 2.544265)
+ opencl : 0.290000 0.020000 0.310000 ( 0.318674)
+ pure ruby single function: 0.370000 0.000000 0.370000 ( 0.374805)
+ opencl singlefunction: 0.190000 0.050000 0.240000 ( 0.239913)
+ pure ruby pow float: 0.090000 0.000000 0.090000 ( 0.093488)
+ opencl pow float: 0.100000 0.010000 0.110000 ( 0.110532)
+ pure ruby pow int: 0.030000 0.000000 0.030000 ( 0.022236)
+ opencl pow int: 0.090000 0.010000 0.100000 ( 0.111199)
+ ----------------------------------------------------- total: 6.700000sec
+
+ user system total real
+ pure ruby ooo matmul : 1.460000 0.000000 1.460000 ( 1.468597)
+ opencl ooo matmul : 0.040000 0.000000 0.040000 ( 0.053625)
+ pure ruby softmax : 0.280000 0.000000 0.280000 ( 0.280252)
+ opencl softmax : 0.020000 0.010000 0.030000 ( 0.043143)
+ pure ruby matmul : 0.700000 0.000000 0.700000 ( 0.703540)
+ opencl matmul : 0.030000 0.000000 0.030000 ( 0.037716)
+ pure ruby : 2.540000 0.000000 2.540000 ( 2.539661)
+ opencl : 0.150000 0.000000 0.150000 ( 0.164203)
+ pure ruby single function: 0.350000 0.000000 0.350000 ( 0.351883)
+ opencl singlefunction: 0.090000 0.010000 0.100000 ( 0.092359)
+ pure ruby pow float: 0.080000 0.000000 0.080000 ( 0.080484)
+ opencl pow float: 0.030000 0.000000 0.030000 ( 0.032691)
+ pure ruby pow int: 0.020000 0.000000 0.020000 ( 0.019487)
+ opencl pow int: 0.020000 0.000000 0.020000 ( 0.026782)
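The two-pass Rehearsal/real layout above is what Ruby's Benchmark.bmbm prints. A rough sketch of how such a comparison could be driven, assuming a session can be created per evaluator (the evaluator-selection argument to `session` is an assumption; only `Benchmark.bmbm` itself is standard library):

    require 'benchmark'
    require 'tensor_stream'

    ts = TensorStream
    a = ts.constant(Array.new(128) { Array.new(128) { rand } })
    b = ts.constant(Array.new(128) { Array.new(128) { rand } })
    product = ts.matmul(a, b)

    ruby_sess   = ts.session(:ruby_evaluator)    # assumed evaluator selector
    opencl_sess = ts.session(:opencl_evaluator)  # assumed evaluator selector

    Benchmark.bmbm do |x|
      x.report('pure ruby matmul :') { ruby_sess.run(product) }
      x.report('opencl matmul :')    { opencl_sess.run(product) }
    end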
@@ -0,0 +1,28 @@
+ module TensorStream
+   # Defines a TensorStream control flow op
+   class DynamicStitch < Operation
+     attr_accessor :ops
+
+     def initialize(flow_type, inputs, ops = nil, options = {})
+       setup_initial_state(options)
+
+       @operation = :"flow_#{flow_type}"
+       @inputs = inputs
+
+       @data_type = Tensor.detect_type(inputs[1])
+       @name = [@graph.get_name_scope, options[:name] || set_name].compact.join('/')
+       @ops = ops
+       @shape = TensorShape.new([inputs.size])
+       @graph.add_node(self)
+     end
+
+     def set_data_type(_passed_data_type)
+       :unknown
+     end
+
+     def run
+       eval
+     end
+   end
+ end
+
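dynamic_stitch interleaves several data tensors into a single tensor, placing each element at the position named by the matching index tensor. Assuming the op follows the TensorFlow semantics its name suggests (the `ts.dynamic_stitch` call below is an assumption, not shown in this diff), a tiny illustration:

    ts = TensorStream

    indices = [ts.constant([0, 2]), ts.constant([1, 3])]
    data    = [ts.constant([10, 30]), ts.constant([20, 40])]

    # 10 goes to position 0, 30 to position 2, 20 to 1, 40 to 3,
    # so the stitched result is [10, 20, 30, 40]
    ts.session.run(ts.dynamic_stitch(indices, data))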
@@ -79,7 +79,6 @@ module TensorStream
  @ops[op.to_sym] = { options: options, block: block }
  end
  else
-
  @ops[opcode.to_sym] = { options: options, block: block }
  end
  end
@@ -87,16 +86,24 @@ module TensorStream
  ##
  # gets all supported ops for this Evaluator class
  def self.ops
- @ops ||={}
+ @ops ||= {}
  end
 
  def invoke(tensor, execution_context)
+ return eval_tensor(tensor, execution_context) unless tensor.is_a?(Operation)
+
  if self.class.ops.key?(tensor.operation.to_sym)
  op = self.class.ops[tensor.operation.to_sym]
+
  op_options = op[:options]
  resolved_inputs = tensor.inputs.map do |i|
  next if i.nil?
- if @context[:_cache][:placement][tensor.name] != @context[:_cache][:placement][i.name] # tensor is on another device or evaluator
+
+ if i.is_a?(Array)
+ next i.collect { |sub_item| sub_item.is_a?(Tensor) ? invoke(sub_item, execution_context) : sub_item }
+ end
+
+ if !op_options[:noop] && @context[:_cache][:placement][tensor.name] != @context[:_cache][:placement][i.name] # tensor is on another device or evaluator
  cache_key = "#{tensor.graph.object_id}_#{i.name}:#{object_id}"
  next @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
 
@@ -116,6 +123,28 @@ module TensorStream
 
  protected
 
+ def get_broadcast_gradient_args(input_a, input_b)
+ return [[], []] if input_a == input_b
+
+ input_a_args = []
+ input_b_args = []
+
+ input_a = input_b.size.times.map { |i| i < input_a.size ? input_a[i] : nil }.reverse if input_a.size < input_b.size
+ input_b = input_a.size.times.map { |i| i < input_b.size ? input_b[i] : nil }.reverse if input_a.size > input_b.size
+
+ input_a.reverse.zip(input_b.reverse).each_with_index do |item, index|
+ a, b = item
+
+ if a.nil? || b && (a < b)
+ input_a_args << input_b.size - index - 1
+ elsif b.nil? || a && (a > b)
+ input_b_args << input_a.size - index - 1
+ end
+ end
+
+ [input_a_args.reverse, input_b_args.reverse]
+ end
+
  ##
  # converts from a ruby Buffer object to the evaluator's native buffer format
  def convert_from_buffer(tensor, result)
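The added get_broadcast_gradient_args walks both shapes from the trailing dimension and reports, for each operand, the axes along which its gradient has to be summed because broadcasting expanded it. Two illustrative calls (the shapes are example values chosen here):

    # [3, 1] broadcast against [3, 4]: the first operand was stretched along
    # axis 1, so only its gradient needs a reduction there.
    get_broadcast_gradient_args([3, 1], [3, 4])   # => [[1], []]

    # a shorter shape is left-padded before comparison, so every leading
    # broadcast axis ends up in its reduction list.
    get_broadcast_gradient_args([4], [2, 3, 4])   # => [[0, 1], []]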
@@ -0,0 +1,48 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ % fname = 'floor_div'
+ % result_t = c_dtype
+ // floor division, same-shape operands
+ __kernel void <%= fname%>_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   C[globalRow * N + globalCol] = (int)(A[globalRow * N + globalCol] / B[globalRow * N + globalCol]);
+ }
+
+ // floor division, tensor against scalar
+ __kernel void <%=fname%>_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   if (switch_op == 0) {
+     C[globalRow * N + globalCol] = (int)(A[globalRow * N + globalCol] / B[0]);
+   } else {
+     C[globalRow * N + globalCol] = (int)(B[0] / A[globalRow * N + globalCol]);
+   }
+ }
+
+ // floor division with broadcast
+ __kernel void <%= fname%>_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   int b_m_index = globalRow;
+   int b_n_index = globalCol;
+
+   if (b_m_index >= M2) {
+     b_m_index = b_m_index % M2;
+   }
+
+   if (b_n_index >= N2) {
+     b_n_index = b_n_index % N2;
+   }
+
+   if (switch_op == 0) {
+     C[globalRow * N + globalCol] = (int)(A[globalRow * N + globalCol] / B[b_m_index * N2 + b_n_index]);
+   } else {
+     C[globalRow * N + globalCol] = (int)(B[b_m_index * N2 + b_n_index] / A[globalRow * N + globalCol]);
+   }
+ }
@@ -0,0 +1,3 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ % op = operator_to_c('mod')
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'mod', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -0,0 +1,53 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ // squared difference, same-shape operands
+ __kernel void squared_difference_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+   <%= c_dtype %> x = A[globalRow * N + globalCol];
+   <%= c_dtype %> y = B[globalRow * N + globalCol];
+   C[globalRow * N + globalCol] = (x - y) * (x - y);
+ }
+
+ // squared difference, tensor against scalar
+ __kernel void squared_difference_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   <%= c_dtype %> x = A[globalRow * N + globalCol];
+   <%= c_dtype %> y = B[0];
+
+   if (switch_op == 0) {
+     C[globalRow * N + globalCol] = (x - y) * (x - y);
+   } else {
+     C[globalRow * N + globalCol] = (y - x) * (y - x);
+   }
+ }
+
+ // squared difference with broadcast
+ __kernel void squared_difference_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   int b_m_index = globalRow;
+   int b_n_index = globalCol;
+
+   if (b_m_index >= M2) {
+     b_m_index = b_m_index % M2;
+   }
+
+   if (b_n_index >= N2) {
+     b_n_index = b_n_index % N2;
+   }
+
+   <%= c_dtype %> x = A[globalRow * N + globalCol];
+   <%= c_dtype %> y = B[b_m_index * N2 + b_n_index];
+
+   if (switch_op == 0) {
+     C[globalRow * N + globalCol] = (x - y) * (x - y);
+   } else {
+     C[globalRow * N + globalCol] = (y - x) * (y - x);
+   }
+ }
@@ -25,8 +25,7 @@ module TensorStream
  op.command_queue.finish
  self.dirty = false
  end
-
- result = buffer.reshape(*shape.reverse).to_a
+ result = buffer.reshape(*shape.map { |s| s.to_i}.reverse).to_a
 
  if data_type == :boolean
  result = process_function_op(result, ->(a, _b) { a != 0 })
@@ -109,7 +109,9 @@ module TensorStream
  b
  end
  else
- return buffer if buffer.nil? || buffer.buffer.size.zero?
+ return buffer if buffer.nil?
+ return [] if buffer.buffer.nil?
+ return buffer if buffer.buffer.size.zero?
  _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: [buffer.op].compact)
  end
  _opencl_queue.finish
@@ -202,6 +204,7 @@ module TensorStream
  suffix = args.collect { |k,v| "#{k}.#{v}"}.join('.')
  @context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"] ||= begin
  filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
+ raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
  source = File.read(filename)
  source = OpenclTemplateHelper.new(source).generate(args)
  # File.write("/tmp/#{kernel}.#{suffix}.cl", source)
@@ -251,13 +254,13 @@ module TensorStream
  execute_func('log', tensor, inputs[0], context)
  end
 
- register_op :cond do |context, tensor, inputs|
+ register_op :cond, noop: true do |context, tensor, inputs|
  pred = complete_eval(tensor.options[:pred], context)
 
  if all_true?(pred.buffer)
- inputs[0]
+ complete_eval(inputs[0], context)
  else
- inputs[1]
+ complete_eval(inputs[1], context)
  end
  end
 
@@ -285,12 +288,20 @@ module TensorStream
  end
  end
 
- %i[max add div sub mul pow sigmoid_grad].each do |op|
+ %i[max add div sub mod mul pow sigmoid_grad squared_difference].each do |op|
  register_op op, noop: true do |context, tensor, inputs|
  execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context)
  end
  end
 
+ register_op :floor_div, noop: true do |context, tensor, inputs|
+ if fp_type?(tensor.data_type)
+ execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1], context)
+ else
+ execute_2_operand_func('div', tensor, inputs[0], inputs[1], context)
+ end
+ end
+
  register_op :where, noop: true do |context, tensor, inputs|
  pred = tensor.options[:pred]
  execute_cond_func('where', tensor, pred, inputs[0], inputs[1], context)
@@ -479,11 +490,12 @@ module TensorStream
  end
 
  register_op :broadcast_gradient_args, buffer: true do |_context, tensor, inputs|
- wrap_opencl(get_broadcast_gradient_args(inputs[0].buffer.to_a, inputs[1].buffer.to_a), data_type: inputs[0].data_type, name: tensor.name)
+ rx, ry = get_broadcast_gradient_args(inputs[0].buffer.to_a, inputs[1].buffer.to_a)
+ [ wrap_opencl(rx, data_type: :int32, name: "#{tensor.name}"), wrap_opencl(ry, data_type: :int32, name: "#{tensor.name}:1")]
  end
 
  register_op :shape do |_context, tensor, inputs|
- wrap_opencl(inputs[0].shape, name: tensor.name, data_type: tensor.options[:out_type] || :float32)
+ wrap_opencl(inputs[0].shape, name: tensor.name, data_type: tensor.data_type)
  end
 
  register_op :reshape, buffer: true do |_context, _tensor, inputs|
@@ -504,6 +516,10 @@ module TensorStream
  inputs
  end
 
+ register_op :size do |_context, tensor, inputs|
+ wrap_opencl(inputs[0].buffer.size, name: tensor.name, data_type: tensor.options[:out_type] || :int32)
+ end
+
  %i[sum mean].each do |op|
  register_op op, noop: true do |context, tensor, inputs|
  reduction(context, tensor, inputs[0], inputs[1], op.to_sym)
@@ -534,8 +550,9 @@ module TensorStream
  end
 
  def eval_operation(tensor, child_context)
- return @context[tensor.name] if @context.key?(tensor.name)
+
  cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
+ return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
  return @context[cache_key] if @context.key?(cache_key)
  # puts tensor.name
  invoke(tensor, child_context).tap do |result|
@@ -559,8 +576,8 @@ module TensorStream
  value: result
  }
  end
- @context[:_cache][cache_key] = @context[cache_key] if tensor.is_const
- @context[tensor.name] = result
+ @context[cache_key] = result
+ @context[:_cache][cache_key] = result if tensor.is_const
  end
  rescue EvaluatorExcecutionException => e
  raise e
@@ -628,6 +645,7 @@ module TensorStream
  a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
  dtype = tensor.data_type
  result_shape = TensorShape.infer_shape(a.shape, b.shape)
+ return _create_result_buffer(dtype, [0], "out_#{tensor.name}") if result_shape == [0]
 
  output_buffer = _create_result_buffer(tensor.data_type, result_shape, "out_#{tensor.name}")
  a, b, prog, switch_operands = select_program(a, b, op_name)
@@ -799,8 +817,9 @@ module TensorStream
  end
 
  def _create_result_buffer(data_type, shape, name)
+ return OpenCLBuffer.new(data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]
  @context[:_cache][:_cl_buffers]["_result_#{name}_#{shape.join('_')}:#{object_id}"] ||= begin
- size = shape.empty? ? 1 : shape.reduce(:*)
+ size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
  buffer = allocate_narray_for_type(data_type, size)
  cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
  OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
@@ -840,6 +859,17 @@ module TensorStream
  end
  end
 
+ def _reduced_shape(input_shape, axes)
+ return [] if axes.nil? # reduce to scalar
+ axes = [ axes ] unless axes.is_a?(Array)
+ return input_shape if axes.empty?
+
+ axes.each do |dimen|
+ input_shape[dimen] = 1
+ end
+ input_shape
+ end
+
  def reduction(child_context, tensor, a, b, func)
  input = complete_eval(a, child_context)
  axis = read_final_result(complete_eval(b, child_context))
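The added _reduced_shape keeps the input's rank but collapses every reduced axis to 1, which is the shape a keepdims-style reduction reports. A few illustrative calls (example shapes only):

    _reduced_shape([2, 3, 4], 1)      # => [2, 1, 4]
    _reduced_shape([2, 3, 4], [0, 2]) # => [1, 3, 1]
    _reduced_shape([2, 3, 4], nil)    # => []   (reduce to a scalar)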
@@ -853,7 +883,8 @@ module TensorStream
 
  if axis.is_a?(Array)
  axis.map{ |x| rank - x.abs }.sort.reverse.each do |x|
- value = value.send(func, x)
+
+ value = value.send(func, x.to_i)
  end
  else
  value = value.send(func, rank - axis.abs)
@@ -867,7 +898,7 @@ module TensorStream
  end
  end
 
  if tensor.options[:keepdims]
- new_shape = reduced_shape(input.shape.dup, axis)
+ new_shape = _reduced_shape(input.shape.dup, axis)
  end
  convert_to_opencl(value.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
@@ -948,17 +979,6 @@ module TensorStream
  shape.is_a?(Array) ? shape.size : 0
  end
 
- def get_broadcast_gradient_args(input_a, input_b)
- return [] if get_rank(input_b).zero? && get_rank(input_a).zero?
- return nil if get_rank(input_b).zero?
- # ruby scalar
- if get_rank(input_a).zero?
- _broadcast_gradient_op(input_b, input_a, 0, true)
- elsif get_rank(input_a) > 0
- _broadcast_gradient_op(input_a, input_b, 0)
- end
- end
-
  def concat_array(values, axis)
  combined_array = values.shift
  axis = get_rank(combined_array) - 1 if axis == -1