tensor_stream 0.5.1 → 0.6.0

This diff shows the contents of publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
- SHA1:
-   metadata.gz: f14dd6388d5cdd10827cebde01a9cbca0686b653
-   data.tar.gz: d2ccba35defe6474a21bd75fcb09f8d49ce42e79
+ SHA256:
+   metadata.gz: 6d647cef8f32fa7b3c10460365adfc55ccdd9872e71d453df090986349b615f5
+   data.tar.gz: baa7be71775bc5d39396343b6d4c32943cf3b79d5a2e591c885bd6fc9314883e
  SHA512:
-   metadata.gz: 244026aae6ce13d8e932deada3c169b5320df517eb5dd7db5ea8c06c1cdedc9c9829d7f149261602f502c284ffe65ae831845016d9425250b0ad9d7d66fc6a0e
-   data.tar.gz: 91811c88a464604f5ca1e776f86d0342dc316ee016d920d40d8a228e2978f6a275783c5613a97cf21af1ba9256c951ad3db777b66fae20e5d1f8f9659f170301
+   metadata.gz: d3207ef919464e696d03fe7bbd264ba606565bf09d66796d461b687f047e8b2b969259bcffaf7524677d6b22398ba32025ca8f94c33756cda3a3bb37f535a902
+   data.tar.gz: 735dbd55e54237619bb9c6653818dade6df257fb71a58cc4abd540a3053d51d318d734ba149c3889bd6403a5219166c291534a66528b83e481ba1240e2696e49
data/CHANGELOG.md CHANGED
@@ -4,6 +4,15 @@ All notable changes to this project will be documented in this file.
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+ ## [0.6.0] - 2018-07-21
+ ### Added
+ - [NEW OP] fill, floor_div, dynamic_stitch, mod, range, size, squared_difference
+
+ ### Fixes
+ - [General] Some auto-differentiation fixes
+ - [softmax_cross_entropy_with_logits_v2] Use a numerically stable way of calculating values
+ - Other fixes related to shape computation
+
  ## [0.5.1] - 2018-06-27
  ### Added
  - Added support for control_dependencies
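
The new 0.6.0 ops follow their TensorFlow namesakes. A minimal usage sketch (op names come from the changelog above; exact signatures in the released gem may differ slightly):

    require 'tensor_stream'

    ts = TensorStream
    a = ts.constant([[7.0, 3.0], [4.0, 9.0]])
    b = ts.constant([[2.0, 2.0], [3.0, 2.0]])

    sess = ts.session
    sess.run(ts.floor_div(a, b))          # element-wise quotient, fractional part dropped
    sess.run(ts.mod(a, b))                # element-wise remainder
    sess.run(ts.squared_difference(a, b)) # element-wise (a - b)**2
    sess.run(ts.fill([2, 2], 1.0))        # 2x2 tensor filled with 1.0
    sess.run(ts.range(0, 10, 2))          # [0, 2, 4, 6, 8]
    sess.run(ts.size(a))                  # total element count, here 4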
@@ -0,0 +1,36 @@
+ TensorStream::Evaluator::OpenclEvaluator
+ TensorStream::Evaluator::RubyEvaluator
+ model name : AMD Ryzen 3 1300X Quad-Core Processor
+ OpenCL device AMD Accelerated Parallel Processing Ellesmere
+ Rehearsal --------------------------------------------------------------
+ pure ruby ooo matmul     :   1.480000   0.000000   1.480000 (  1.486855)
+ opencl ooo matmul        :   0.190000   0.130000   0.320000 (  0.332605)
+ pure ruby softmax        :   0.280000   0.000000   0.280000 (  0.278398)
+ opencl softmax           :   0.040000   0.020000   0.060000 (  0.070980)
+ pure ruby matmul         :   0.730000   0.000000   0.730000 (  0.726565)
+ opencl matmul            :   0.020000   0.010000   0.030000 (  0.046762)
+ pure ruby                :   2.550000   0.000000   2.550000 (  2.544265)
+ opencl                   :   0.290000   0.020000   0.310000 (  0.318674)
+ pure ruby single function:   0.370000   0.000000   0.370000 (  0.374805)
+ opencl singlefunction    :   0.190000   0.050000   0.240000 (  0.239913)
+ pure ruby pow float      :   0.090000   0.000000   0.090000 (  0.093488)
+ opencl pow float         :   0.100000   0.010000   0.110000 (  0.110532)
+ pure ruby pow int        :   0.030000   0.000000   0.030000 (  0.022236)
+ opencl pow int           :   0.090000   0.010000   0.100000 (  0.111199)
+ ----------------------------------------------------- total: 6.700000sec
+
+                                user     system      total        real
+ pure ruby ooo matmul     :   1.460000   0.000000   1.460000 (  1.468597)
+ opencl ooo matmul        :   0.040000   0.000000   0.040000 (  0.053625)
+ pure ruby softmax        :   0.280000   0.000000   0.280000 (  0.280252)
+ opencl softmax           :   0.020000   0.010000   0.030000 (  0.043143)
+ pure ruby matmul         :   0.700000   0.000000   0.700000 (  0.703540)
+ opencl matmul            :   0.030000   0.000000   0.030000 (  0.037716)
+ pure ruby                :   2.540000   0.000000   2.540000 (  2.539661)
+ opencl                   :   0.150000   0.000000   0.150000 (  0.164203)
+ pure ruby single function:   0.350000   0.000000   0.350000 (  0.351883)
+ opencl singlefunction    :   0.090000   0.010000   0.100000 (  0.092359)
+ pure ruby pow float      :   0.080000   0.000000   0.080000 (  0.080484)
+ opencl pow float         :   0.030000   0.000000   0.030000 (  0.032691)
+ pure ruby pow int        :   0.020000   0.000000   0.020000 (  0.019487)
+ opencl pow int           :   0.020000   0.000000   0.020000 (  0.026782)
@@ -0,0 +1,28 @@
+ module TensorStream
+   # Defines a TensorStream control flow op
+   class DynamicStitch < Operation
+     attr_accessor :ops
+
+     def initialize(flow_type, inputs, ops = nil, options = {})
+       setup_initial_state(options)
+
+       @operation = :"flow_#{flow_type}"
+       @inputs = inputs
+
+       @data_type = Tensor.detect_type(inputs[1])
+       @name = [@graph.get_name_scope, options[:name] || set_name].compact.join('/')
+       @ops = ops
+       @shape = TensorShape.new([inputs.size])
+       @graph.add_node(self)
+     end
+
+     def set_data_type(_passed_data_type)
+       :unknown
+     end
+
+     def run
+       eval
+     end
+   end
+ end
+
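
The class above is only graph plumbing; the actual stitching happens in the evaluators. For reference, dynamic_stitch scatters each value of every data slice to the output position named by the matching index slice, as in this pure-Ruby sketch of the semantics (reference behavior only, not the gem's implementation):

    def dynamic_stitch_reference(indices, data)
      merged = []
      indices.each_with_index do |index_group, m|
        Array(index_group).each_with_index do |dest, i|
          merged[dest] = Array(data[m])[i] # last write wins on duplicate indices
        end
      end
      merged
    end

    dynamic_stitch_reference([[0, 2], [1, 3]], [[10, 30], [20, 40]])
    # => [10, 20, 30, 40]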
@@ -79,7 +79,6 @@ module TensorStream
        @ops[op.to_sym] = { options: options, block: block }
      end
    else
-
      @ops[opcode.to_sym] = { options: options, block: block }
    end
  end
@@ -87,16 +86,24 @@ module TensorStream
  ##
  # gets all supported ops for this Evaluator class
  def self.ops
-   @ops ||={}
+   @ops ||= {}
  end
 
  def invoke(tensor, execution_context)
+   return eval_tensor(tensor, execution_context) unless tensor.is_a?(Operation)
+
    if self.class.ops.key?(tensor.operation.to_sym)
      op = self.class.ops[tensor.operation.to_sym]
+
      op_options = op[:options]
      resolved_inputs = tensor.inputs.map do |i|
        next if i.nil?
-       if @context[:_cache][:placement][tensor.name] != @context[:_cache][:placement][i.name] # tensor is on another device or evaluator
+
+       if i.is_a?(Array)
+         next i.collect { |sub_item| sub_item.is_a?(Tensor) ? invoke(sub_item, execution_context) : sub_item }
+       end
+
+       if !op_options[:noop] && @context[:_cache][:placement][tensor.name] != @context[:_cache][:placement][i.name] # tensor is on another device or evaluator
          cache_key = "#{tensor.graph.object_id}_#{i.name}:#{object_id}"
          next @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
 
@@ -116,6 +123,28 @@ module TensorStream
 
  protected
 
+ def get_broadcast_gradient_args(input_a, input_b)
+   return [[], []] if input_a == input_b
+
+   input_a_args = []
+   input_b_args = []
+
+   input_a = input_b.size.times.map { |i| i < input_a.size ? input_a[i] : nil }.reverse if input_a.size < input_b.size
+   input_b = input_a.size.times.map { |i| i < input_b.size ? input_b[i] : nil }.reverse if input_a.size > input_b.size
+
+   input_a.reverse.zip(input_b.reverse).each_with_index do |item, index|
+     a, b = item
+
+     if a.nil? || b && (a < b)
+       input_a_args << input_b.size - index - 1
+     elsif b.nil? || a && (a > b)
+       input_b_args << input_a.size - index - 1
+     end
+   end
+
+   [input_a_args.reverse, input_b_args.reverse]
+ end
+
  ##
  # converts from a ruby Buffer object to the evaluator's native buffer format
  def convert_from_buffer(tensor, result)
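
For each input shape, the helper above returns the axes of the broadcast result along which that input was expanded, so gradient code knows where to sum. Worked input/output pairs for the method as written (it is protected, so shown as plain pairs rather than a runnable call site):

    get_broadcast_gradient_args([3, 1], [3, 4])  # => [[1], []]  first shape broadcast along axis 1
    get_broadcast_gradient_args([4], [2, 4])     # => [[0], []]  first shape gained a leading axis
    get_broadcast_gradient_args([2, 4], [2, 4])  # => [[], []]   identical shapes, nothing to reduce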
@@ -0,0 +1,48 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ % fname = 'floor_div'
+ % result_t = c_dtype
+ // same-dimension floor_div op
+ __kernel void <%= fname %>_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   C[globalRow * N + globalCol] = (int)(A[globalRow * N + globalCol] / B[globalRow * N + globalCol]);
+ }
+
+ // 1D + scalar floor_div op
+ __kernel void <%= fname %>_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   if (switch_op == 0) {
+     C[globalRow * N + globalCol] = (int)(A[globalRow * N + globalCol] / B[0]);
+   } else {
+     C[globalRow * N + globalCol] = (int)(B[0] / A[globalRow * N + globalCol]);
+   }
+ }
+
+ // 1D + scalar floor_div op, broadcast
+ __kernel void <%= fname %>_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   int b_m_index = globalRow;
+   int b_n_index = globalCol;
+
+   if (b_m_index >= M2) {
+     b_m_index = b_m_index % M2;
+   }
+
+   if (b_n_index >= N2) {
+     b_n_index = b_n_index % N2;
+   }
+
+   if (switch_op == 0) {
+     C[globalRow * N + globalCol] = (int)(A[globalRow * N + globalCol] / B[b_m_index * N2 + b_n_index]);
+   } else {
+     C[globalRow * N + globalCol] = (int)(B[b_m_index * N2 + b_n_index] / A[globalRow * N + globalCol]);
+   }
+ }
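
One caveat worth noting: the (int) cast in these kernels truncates toward zero rather than flooring, so for negative quotients the result differs from a mathematical floor. The per-element semantics, expressed in Ruby:

    def floor_div_kernel_semantics(a, b)
      (a / b).to_i # Float#to_i also truncates toward zero, matching the C-style cast
    end

    floor_div_kernel_semantics(7.0, 2.0)  # => 3
    floor_div_kernel_semantics(-7.0, 2.0) # => -3 (a true floor would give -4)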
@@ -0,0 +1,3 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ % op = operator_to_c('mod')
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'mod', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -0,0 +1,53 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ // same-dimension squared_difference op
+ __kernel void squared_difference_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+   <%= c_dtype %> x = A[globalRow * N + globalCol];
+   <%= c_dtype %> y = B[globalRow * N + globalCol];
+   C[globalRow * N + globalCol] = (x - y) * (x - y);
+ }
+
+ // 1D + scalar squared_difference op
+ __kernel void squared_difference_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   <%= c_dtype %> x = A[globalRow * N + globalCol];
+   <%= c_dtype %> y = B[0];
+
+   if (switch_op == 0) {
+     C[globalRow * N + globalCol] = (x - y) * (x - y);
+   } else {
+     C[globalRow * N + globalCol] = (y - x) * (y - x);
+   }
+ }
+
+ // 1D + scalar squared_difference op, broadcast
+ __kernel void squared_difference_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   int b_m_index = globalRow;
+   int b_n_index = globalCol;
+
+   if (b_m_index >= M2) {
+     b_m_index = b_m_index % M2;
+   }
+
+   if (b_n_index >= N2) {
+     b_n_index = b_n_index % N2;
+   }
+
+   <%= c_dtype %> x = A[globalRow * N + globalCol];
+   <%= c_dtype %> y = B[b_m_index * N2 + b_n_index];
+
+   if (switch_op == 0) {
+     C[globalRow * N + globalCol] = (x - y) * (x - y);
+   } else {
+     C[globalRow * N + globalCol] = (y - x) * (y - x);
+   }
+ }
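
Because (x - y)^2 equals (y - x)^2, the two switch_op branches above compute the same value; the operand-order flag only matters for non-commutative kernels such as floor_div. Per-element semantics in Ruby:

    def squared_difference(x, y)
      (x - y)**2
    end

    squared_difference(3.0, 5.0) # => 4.0, same as squared_difference(5.0, 3.0)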
@@ -25,8 +25,7 @@ module TensorStream
    op.command_queue.finish
    self.dirty = false
  end
-
- result = buffer.reshape(*shape.reverse).to_a
+ result = buffer.reshape(*shape.map { |s| s.to_i }.reverse).to_a
 
  if data_type == :boolean
    result = process_function_op(result, ->(a, _b) { a != 0 })
@@ -109,7 +109,9 @@ module TensorStream
      b
    end
  else
-   return buffer if buffer.nil? || buffer.buffer.size.zero?
+   return buffer if buffer.nil?
+   return [] if buffer.buffer.nil?
+   return buffer if buffer.buffer.size.zero?
    _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: [buffer.op].compact)
  end
  _opencl_queue.finish
@@ -202,6 +204,7 @@ module TensorStream
  suffix = args.collect { |k,v| "#{k}.#{v}"}.join('.')
  @context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"] ||= begin
    filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
+   raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
    source = File.read(filename)
    source = OpenclTemplateHelper.new(source).generate(args)
    # File.write("/tmp/#{kernel}.#{suffix}.cl", source)
@@ -251,13 +254,13 @@ module TensorStream
    execute_func('log', tensor, inputs[0], context)
  end
 
- register_op :cond do |context, tensor, inputs|
+ register_op :cond, noop: true do |context, tensor, inputs|
    pred = complete_eval(tensor.options[:pred], context)
 
    if all_true?(pred.buffer)
-     inputs[0]
+     complete_eval(inputs[0], context)
    else
-     inputs[1]
+     complete_eval(inputs[1], context)
    end
  end
 
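Registering :cond as a noop means its branch inputs now arrive unevaluated; the block forces only the branch selected by pred via complete_eval, so the untaken branch never runs. A usage sketch, assuming a TensorFlow-style cond(pred, if_true, if_false) front-end (the front-end signature is not shown in this diff):

    x = ts.constant(2.0)
    y = ts.constant(3.0)
    out = ts.cond(x < y, x + 1.0, y * 2.0)
    ts.session.run(out) # only the x + 1.0 branch is evaluated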
@@ -285,12 +288,20 @@ module TensorStream
    end
  end
 
- %i[max add div sub mul pow sigmoid_grad].each do |op|
+ %i[max add div sub mod mul pow sigmoid_grad squared_difference].each do |op|
    register_op op, noop: true do |context, tensor, inputs|
      execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context)
    end
  end
 
+ register_op :floor_div, noop: true do |context, tensor, inputs|
+   if fp_type?(tensor.data_type)
+     execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1], context)
+   else
+     execute_2_operand_func('div', tensor, inputs[0], inputs[1], context)
+   end
+ end
+
  register_op :where, noop: true do |context, tensor, inputs|
    pred = tensor.options[:pred]
    execute_cond_func('where', tensor, pred, inputs[0], inputs[1], context)
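
Note the fallback in the floor_div registration above: integer tensors reuse the plain div kernel, since integer division already discards the fractional part; only floating-point dtypes need the dedicated kernel. In plain Ruby terms:

    7 / 2     # => 3   (Integer#/ already truncates)
    7.0 / 2.0 # => 3.5 (floats need the floor_div kernel to produce 3)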
@@ -479,11 +490,12 @@ module TensorStream
  end
 
  register_op :broadcast_gradient_args, buffer: true do |_context, tensor, inputs|
-   wrap_opencl(get_broadcast_gradient_args(inputs[0].buffer.to_a, inputs[1].buffer.to_a), data_type: inputs[0].data_type, name: tensor.name)
+   rx, ry = get_broadcast_gradient_args(inputs[0].buffer.to_a, inputs[1].buffer.to_a)
+   [wrap_opencl(rx, data_type: :int32, name: "#{tensor.name}"), wrap_opencl(ry, data_type: :int32, name: "#{tensor.name}:1")]
  end
 
  register_op :shape do |_context, tensor, inputs|
-   wrap_opencl(inputs[0].shape, name: tensor.name, data_type: tensor.options[:out_type] || :float32)
+   wrap_opencl(inputs[0].shape, name: tensor.name, data_type: tensor.data_type)
  end
 
  register_op :reshape, buffer: true do |_context, _tensor, inputs|
@@ -504,6 +516,10 @@ module TensorStream
    inputs
  end
 
+ register_op :size do |_context, tensor, inputs|
+   wrap_opencl(inputs[0].buffer.size, name: tensor.name, data_type: tensor.options[:out_type] || :int32)
+ end
+
  %i[sum mean].each do |op|
    register_op op, noop: true do |context, tensor, inputs|
      reduction(context, tensor, inputs[0], inputs[1], op.to_sym)
@@ -534,8 +550,9 @@ module TensorStream
  end
 
  def eval_operation(tensor, child_context)
-   return @context[tensor.name] if @context.key?(tensor.name)
+
    cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
+   return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
    return @context[cache_key] if @context.key?(cache_key)
    # puts tensor.name
    invoke(tensor, child_context).tap do |result|
@@ -559,8 +576,8 @@ module TensorStream
        value: result
      }
    end
-   @context[:_cache][cache_key] = @context[cache_key] if tensor.is_const
-   @context[tensor.name] = result
+   @context[cache_key] = result
+   @context[:_cache][cache_key] = result if tensor.is_const
  end
  rescue EvaluatorExcecutionException => e
    raise e
@@ -628,6 +645,7 @@ module TensorStream
  a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
  dtype = tensor.data_type
  result_shape = TensorShape.infer_shape(a.shape, b.shape)
+ return _create_result_buffer(dtype, [0], "out_#{tensor.name}") if result_shape == [0]
 
  output_buffer = _create_result_buffer(tensor.data_type, result_shape, "out_#{tensor.name}")
  a, b, prog, switch_operands = select_program(a, b, op_name)
@@ -799,8 +817,9 @@ module TensorStream
  end
 
  def _create_result_buffer(data_type, shape, name)
+   return OpenCLBuffer.new(data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]
    @context[:_cache][:_cl_buffers]["_result_#{name}_#{shape.join('_')}:#{object_id}"] ||= begin
-     size = shape.empty? ? 1 : shape.reduce(:*)
+     size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
      buffer = allocate_narray_for_type(data_type, size)
      cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
      OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
@@ -840,6 +859,17 @@ module TensorStream
    end
  end
 
+ def _reduced_shape(input_shape, axes)
+   return [] if axes.nil? # reduce to scalar
+   axes = [axes] unless axes.is_a?(Array)
+   return input_shape if axes.empty?
+
+   axes.each do |dimen|
+     input_shape[dimen] = 1
+   end
+   input_shape
+ end
+
  def reduction(child_context, tensor, a, b, func)
    input = complete_eval(a, child_context)
    axis = read_final_result(complete_eval(b, child_context))
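
_reduced_shape above implements keepdims semantics: each reduced axis collapses to size 1 instead of disappearing. Worked examples for the method as written:

    _reduced_shape([2, 3, 4], 1)      # => [2, 1, 4]
    _reduced_shape([2, 3, 4], [0, 2]) # => [1, 3, 1]
    _reduced_shape([2, 3, 4], nil)    # => []  (reduce to scalar)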
@@ -853,7 +883,8 @@ module TensorStream
 
  if axis.is_a?(Array)
    axis.map { |x| rank - x.abs }.sort.reverse.each do |x|
-     value = value.send(func, x)
+
+     value = value.send(func, x.to_i)
    end
  else
    value = value.send(func, rank - axis.abs)
@@ -867,7 +898,7 @@ module TensorStream
  end
 
  if tensor.options[:keepdims]
-   new_shape = reduced_shape(input.shape.dup, axis)
+   new_shape = _reduced_shape(input.shape.dup, axis)
  end
 
  convert_to_opencl(value.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
@@ -948,17 +979,6 @@ module TensorStream
    shape.is_a?(Array) ? shape.size : 0
  end
 
- def get_broadcast_gradient_args(input_a, input_b)
-   return [] if get_rank(input_b).zero? && get_rank(input_a).zero?
-   return nil if get_rank(input_b).zero?
-   # ruby scalar
-   if get_rank(input_a).zero?
-     _broadcast_gradient_op(input_b, input_a, 0, true)
-   elsif get_rank(input_a) > 0
-     _broadcast_gradient_op(input_a, input_b, 0)
-   end
- end
-
  def concat_array(values, axis)
    combined_array = values.shift
    axis = get_rank(combined_array) - 1 if axis == -1