tensor_stream-opencl 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +11 -4
  3. data/benchmark/benchmark.rb +91 -0
  4. data/benchmark_intel.txt +36 -0
  5. data/lib/tensor_stream/opencl/array_ops.rb +395 -0
  6. data/lib/tensor_stream/opencl/images_ops.rb +62 -0
  7. data/lib/tensor_stream/opencl/kernels/abs.cl +6 -8
  8. data/lib/tensor_stream/opencl/kernels/acos.cl +3 -4
  9. data/lib/tensor_stream/opencl/kernels/apply_adadelta.cl +2 -4
  10. data/lib/tensor_stream/opencl/kernels/apply_adagrad.cl +12 -0
  11. data/lib/tensor_stream/opencl/kernels/apply_adam.cl +2 -5
  12. data/lib/tensor_stream/opencl/kernels/apply_centered_rms_prop.cl +19 -0
  13. data/lib/tensor_stream/opencl/kernels/apply_gradient.cl +3 -4
  14. data/lib/tensor_stream/opencl/kernels/apply_momentum.cl +2 -4
  15. data/lib/tensor_stream/opencl/kernels/apply_rms_prop.cl +16 -0
  16. data/lib/tensor_stream/opencl/kernels/asin.cl +3 -4
  17. data/lib/tensor_stream/opencl/kernels/ceil.cl +3 -4
  18. data/lib/tensor_stream/opencl/kernels/concat.cl +21 -0
  19. data/lib/tensor_stream/opencl/kernels/cos.cl +3 -5
  20. data/lib/tensor_stream/opencl/kernels/exp.cl +3 -5
  21. data/lib/tensor_stream/opencl/kernels/floor.cl +3 -4
  22. data/lib/tensor_stream/opencl/kernels/log.cl +3 -4
  23. data/lib/tensor_stream/opencl/kernels/log1p.cl +3 -4
  24. data/lib/tensor_stream/opencl/kernels/negate.cl +3 -4
  25. data/lib/tensor_stream/opencl/kernels/reciprocal.cl +3 -4
  26. data/lib/tensor_stream/opencl/kernels/sigmoid.cl +3 -4
  27. data/lib/tensor_stream/opencl/kernels/sign.cl +7 -8
  28. data/lib/tensor_stream/opencl/kernels/sin.cl +3 -4
  29. data/lib/tensor_stream/opencl/kernels/split.cl +17 -0
  30. data/lib/tensor_stream/opencl/kernels/split_n.cl +18 -0
  31. data/lib/tensor_stream/opencl/kernels/sqrt.cl +3 -4
  32. data/lib/tensor_stream/opencl/kernels/square.cl +3 -4
  33. data/lib/tensor_stream/opencl/kernels/tan.cl +3 -4
  34. data/lib/tensor_stream/opencl/kernels/tanh.cl +3 -4
  35. data/lib/tensor_stream/opencl/kernels/tanh_grad.cl +3 -4
  36. data/lib/tensor_stream/opencl/kernels/unpack.cl +23 -0
  37. data/lib/tensor_stream/opencl/nn_ops.rb +111 -26
  38. data/lib/tensor_stream/opencl/opencl_buffer.rb +9 -0
  39. data/lib/tensor_stream/opencl/opencl_evaluator.rb +129 -172
  40. data/lib/tensor_stream/opencl/version.rb +1 -1
  41. data/samples/iris.data +150 -0
  42. data/samples/iris.rb +110 -0
  43. data/samples/mnist_data.rb +65 -0
  44. data/samples/multigpu.rb +73 -0
  45. data/samples/nearest_neighbor.rb +56 -0
  46. data/samples/rnn.rb +108 -0
  47. data/tensor_stream-opencl.gemspec +4 -1
  48. metadata +62 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dccdd97c6bdddfa8e1100dc135e7eb74d78218c3e91c75a0ef06e69be5b5ab2e
4
- data.tar.gz: 52b061f6e1eb393ab9d0f54d7feebd497d0ad6b9d735eb1ec602f21cb1fcbd79
3
+ metadata.gz: 80aa4c8e84193ba879b9c7863b8103cd345b6591ec0a534162c53965609f1bd1
4
+ data.tar.gz: 88840b00a6c4a71540d837a4e20378cf2aafe4efda1990d2f978f401cae35c83
5
5
  SHA512:
6
- metadata.gz: '0990739a203b75ca8900cefb77781675abc866a1fa9a5a2aefe19fbd528f06a83ba2e06e4ddb0d04f41cf76b460fedc6cf7bfd70e05816e0346ed96fb9c022d4'
7
- data.tar.gz: 8b438e82d0d3d9234053b12154d49b84397fe0e0cb1a1a4cd9159bb957e0132e91b0aa8f6ffb4df119e20947f49d1a9639b53ff793b3c2645e88ab99daeef2dd
6
+ metadata.gz: c8c74bbc136ea42c8a01506a0b606bbde8a83a922026caef917d3eb8dbad1c41298fea37040e846ea1eee0683af35f0dd24df7d5449dac75c3e175ed07d94d49
7
+ data.tar.gz: 2235974d1d8dc5cfe9117991cb5ea4dff2b75409e26e20b197414613484a68b482c41796ff74699b041dbcf5f963721ffffa2f5f1a9f28e1f05b5bb96a081039
data/Gemfile.lock CHANGED
@@ -1,13 +1,15 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- tensor_stream-opencl (0.1.3)
4
+ tensor_stream-opencl (0.2.0)
5
+ oily_png
5
6
  opencl_ruby_ffi
6
- tensor_stream
7
+ tensor_stream (~> 0.9.0)
7
8
 
8
9
  GEM
9
10
  remote: https://rubygems.org/
10
11
  specs:
12
+ awesome_print (1.8.0)
11
13
  byebug (10.0.2)
12
14
  chunky_png (1.3.10)
13
15
  coderay (1.1.2)
@@ -16,10 +18,13 @@ GEM
16
18
  diff-lcs (1.3)
17
19
  ffi (1.9.25)
18
20
  method_source (0.9.0)
21
+ mnist-learn (0.1.1)
19
22
  narray (0.6.1.2)
20
- narray_ffi (1.4.3)
23
+ narray_ffi (1.4.4)
21
24
  ffi (~> 1.9, >= 1.9.3)
22
25
  narray (~> 0.6, >= 0.6.0.8)
26
+ oily_png (1.2.1)
27
+ chunky_png (~> 1.3.7)
23
28
  opencl_ruby_ffi (1.3.4)
24
29
  ffi (~> 1.9, >= 1.9.3)
25
30
  narray (~> 0.6, >= 0.6.0.8)
@@ -44,7 +49,7 @@ GEM
44
49
  diff-lcs (>= 1.2.0, < 2.0)
45
50
  rspec-support (~> 3.8.0)
46
51
  rspec-support (3.8.0)
47
- tensor_stream (0.8.5)
52
+ tensor_stream (0.9.0)
48
53
  chunky_png
49
54
  concurrent-ruby
50
55
  deep_merge
@@ -53,7 +58,9 @@ PLATFORMS
53
58
  ruby
54
59
 
55
60
  DEPENDENCIES
61
+ awesome_print
56
62
  bundler (~> 1.16)
63
+ mnist-learn
57
64
  pry-byebug
58
65
  rake (~> 10.0)
59
66
  rspec (~> 3.0)
@@ -0,0 +1,91 @@
1
+ require "bundler/setup"
2
+ require 'tensor_stream'
3
+ require 'benchmark'
4
+ require 'pry-byebug'
5
+ require 'awesome_print'
6
+ require 'tensor_stream/opencl'
7
+
8
+ def tr(t, places = 1)
9
+ if t.is_a?(Array)
10
+ return t.collect do |v|
11
+ tr(v, places)
12
+ end
13
+ end
14
+
15
+ return t unless t.is_a?(Float)
16
+
17
+ t.round(places)
18
+ end
19
+
20
+ tf = TensorStream
21
+
22
+ srand(5)
23
+ seed = 5
24
+ tf.set_random_seed(seed)
25
+
26
+ SHAPES = [32, 32]
27
+
28
+ sess = tf.session(:ruby_evaluator)
29
+
30
+ a = tf.constant(sess.run(tf.random_uniform(SHAPES)))
31
+ a_int = tf.constant([
32
+ [1, 2, 3, 4, 4, 1, 4, 8, 3, 4, 1, 1],
33
+ [2, 2, 3, 4, 4, 1, 1, 1, 1, 4, 1, 1],
34
+ [3, 2, 3, 4, 0, 1, 1, 2, 1, 1, 2, 1],
35
+ [4, 2, 3, 4, 0, 1, 1, 0, 1, 1, 3, 1],
36
+ [4, 2, 3, 4, 0, 1, 1, 0, 1, 1, 4, 1],
37
+ [4, 2, 3, 4, 0, 1, 1, 0, 0, 1, 5, 1],
38
+ [4, 2, 3, 4, 0, 1, 1, 0, 0, 1, 6, 1],
39
+ [4, 2, 3, 4, 0, 1, 1, 0, 0, 0, 0, 1],
40
+ [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 6, 1],
41
+ [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 1],
42
+ [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 2],
43
+ [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 2],
44
+ ])
45
+
46
+ b = tf.constant(sess.run(tf.random_uniform(SHAPES)))
47
+
48
+ c = tf.constant(sess.run(tf.random_uniform(SHAPES)))
49
+
50
+ d = tf.constant(sess.run(tf.random_uniform(SHAPES)))
51
+
52
+ p = tf.placeholder('float')
53
+ q = tf.placeholder('float')
54
+
55
+ model = -tf.sin(a.dot(b + p) + c).dot(a) + tf.cos(a.dot(d + q))
56
+ single_function_test = (tf.sigmoid(a * p) * tf.sigmoid(b * q)) + c
57
+ pow_f = tf.pow(a, 3)
58
+ pow_i = tf.pow(a_int, 3)
59
+ matmul = tf.matmul(a, b)
60
+ out_of_order = tf.matmul(a, b) + tf.matmul(c, d)
61
+ softmax = tf.nn.softmax(a)
62
+ add_n = tf.add_n([a,b,c,d])
63
+ split = tf.split(a, 4)
64
+
65
+ puts TensorStream::Evaluator.default_evaluators
66
+
67
+ sess2 = tf.session
68
+
69
+ puts `cat /proc/cpuinfo | grep "model name" | head -1`
70
+ device = TensorStream::Evaluator::OpenclEvaluator.default_device.native_device
71
+ puts "OpenCL device #{device.platform.to_s} #{device.name}"
72
+ Benchmark.bmbm do |x|
73
+ x.report("pure ruby split :") { 100.times do sess.run(split) end }
74
+ x.report("opencl split :") { 100.times do sess2.run(split) end }
75
+ x.report("pure ruby add_n :") { 100.times do sess.run(add_n) end }
76
+ x.report("opencl add_n :") { 100.times do sess2.run(add_n) end }
77
+ x.report("pure ruby ooo matmul :") { 100.times do sess.run(out_of_order) end }
78
+ x.report("opencl ooo matmul :") { 100.times do sess2.run(out_of_order) end }
79
+ x.report("pure ruby softmax :") { 100.times do sess.run(softmax) end }
80
+ x.report("opencl softmax :") { 100.times do sess2.run(softmax) end }
81
+ x.report("pure ruby matmul :") { 100.times do sess.run(matmul) end }
82
+ x.report("opencl matmul :") { 100.times do sess2.run(matmul) end }
83
+ x.report("pure ruby :") { 100.times do sess.run(model, feed_dict: { p => rand, q => rand }) end }
84
+ x.report("opencl :") { 100.times do sess2.run(model, feed_dict: { p => rand, q => rand }) end }
85
+ x.report("pure ruby single function:") { 100.times do sess.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
86
+ x.report("opencl singlefunction:") { 100.times do sess2.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
87
+ x.report("pure ruby pow float:") { 100.times do sess.run(pow_f, feed_dict: { p => rand, q => rand }) end }
88
+ x.report("opencl pow float:") { 100.times do sess2.run(pow_f, feed_dict: { p => rand, q => rand }) end }
89
+ x.report("pure ruby pow int:") { 100.times do sess.run(pow_i, feed_dict: { p => rand, q => rand }) end }
90
+ x.report("opencl pow int:") { 100.times do sess2.run(pow_i, feed_dict: { p => rand, q => rand }) end }
91
+ end
@@ -0,0 +1,36 @@
1
+ TensorStream::Evaluator::OpenclEvaluator
2
+ TensorStream::Evaluator::RubyEvaluator
3
+ model name : Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz
4
+ OpenCL device Intel Gen OCL Driver Intel(R) HD Graphics Skylake ULT GT2
5
+ Rehearsal --------------------------------------------------------------
6
+ pure ruby ooo matmul : 1.800000 0.000000 1.800000 ( 1.803752)
7
+ opencl ooo matmul : 0.520000 0.050000 0.570000 ( 0.630992)
8
+ pure ruby softmax : 0.300000 0.000000 0.300000 ( 0.303185)
9
+ opencl softmax : 0.180000 0.010000 0.190000 ( 0.200246)
10
+ pure ruby matmul : 0.860000 0.010000 0.870000 ( 0.869387)
11
+ opencl matmul : 0.260000 0.020000 0.280000 ( 0.335164)
12
+ pure ruby : 2.960000 0.020000 2.980000 ( 2.980800)
13
+ opencl : 1.050000 0.090000 1.140000 ( 1.258354)
14
+ pure ruby single function: 0.460000 0.000000 0.460000 ( 0.464543)
15
+ opencl singlefunction: 0.570000 0.020000 0.590000 ( 0.590300)
16
+ pure ruby pow float: 0.120000 0.000000 0.120000 ( 0.123025)
17
+ opencl pow float: 0.290000 0.010000 0.300000 ( 0.316175)
18
+ pure ruby pow int: 0.020000 0.000000 0.020000 ( 0.021570)
19
+ opencl pow int: 0.180000 0.000000 0.180000 ( 0.194088)
20
+ ----------------------------------------------------- total: 9.800000sec
21
+
22
+ user system total real
23
+ pure ruby ooo matmul : 1.860000 0.000000 1.860000 ( 1.866387)
24
+ opencl ooo matmul : 0.410000 0.040000 0.450000 ( 0.505565)
25
+ pure ruby softmax : 0.300000 0.000000 0.300000 ( 0.298407)
26
+ opencl softmax : 0.120000 0.000000 0.120000 ( 0.128033)
27
+ pure ruby matmul : 0.830000 0.000000 0.830000 ( 0.836471)
28
+ opencl matmul : 0.240000 0.010000 0.250000 ( 0.269629)
29
+ pure ruby : 2.950000 0.000000 2.950000 ( 2.947306)
30
+ opencl : 0.930000 0.100000 1.030000 ( 1.205344)
31
+ pure ruby single function: 0.650000 0.000000 0.650000 ( 0.642834)
32
+ opencl singlefunction: 0.840000 0.040000 0.880000 ( 1.097814)
33
+ pure ruby pow float: 0.140000 0.000000 0.140000 ( 0.140097)
34
+ opencl pow float: 0.190000 0.010000 0.200000 ( 0.269772)
35
+ pure ruby pow int: 0.030000 0.000000 0.030000 ( 0.030491)
36
+ opencl pow int: 0.040000 0.010000 0.050000 ( 0.084335)
@@ -0,0 +1,395 @@
1
+ module TensorStream
2
+ module OpenCLHelpers
3
+ # Collection of math functions for interfacing with OpenCL kernels
4
+ module ArrayOps
5
+ def ArrayOps.included(klass)
6
+ klass.class_eval do
7
+ register_op :expand_dims, buffer: true do |_context, tensor, inputs|
8
+ axis = inputs[1].buffer[0]
9
+ shape = inputs[0].shape.dup
10
+ axis = -axis if axis == shape.size
11
+ new_shape = shape.insert(axis, 1).compact
12
+ new_buf = inputs[0].buffer.reshape(*new_shape.reverse)
13
+ convert_to_opencl(new_buf, new_shape, data_type: inputs[0].data_type, name: tensor.name)
14
+ end
15
+
16
+ register_op :fill, buffer: true do |_context, tensor, inputs|
17
+ shape = inputs[0]
18
+ value = inputs[1]
19
+
20
+ narray_size = shape.buffer.to_a.reduce(:*) || 1
21
+ cl_buffer = get_cached_buffer(tensor.name, shape.buffer.to_a)
22
+
23
+ buffer = if cl_buffer
24
+ cl_buffer.buffer
25
+ else
26
+ allocate_narray_for_type(tensor.data_type, narray_size)
27
+ end
28
+
29
+ buffer.fill!(value.buffer[0])
30
+ convert_to_opencl(buffer, shape.buffer.to_a, data_type: tensor.data_type, name: tensor.name)
31
+ end
32
+
33
+ register_op :split do |context, tensor, inputs|
34
+ value, num_split, axis = inputs
35
+ value_shape = value.shape
36
+ axis = read_final_result(complete_eval(axis, context))
37
+ num_split = read_final_result(complete_eval(num_split, context))
38
+
39
+ multipliers = value_shape.dup.drop(1).reverse.inject([1]) do |a, s|
40
+ a << s * a.last
41
+ end.reverse
42
+
43
+ outputs = if !num_split.is_a?(Array) # scalar split
44
+ split_target = value_shape[axis]
45
+ raise TensorStream::ValueError, "#{num_split} does not divide #{split_target} evenly" if split_target % num_split != 0
46
+
47
+ piece_size = split_target / num_split
48
+
49
+ new_shape = value_shape.dup
50
+ new_shape[axis] = piece_size
51
+
52
+ if axis.zero? # axis zero fast copy path
53
+ Array.new(num_split) do |index|
54
+ _create_result_sub_buffer(value, index, tensor.data_type, new_shape, "#{tensor.name}/out_#{index}_#{num_split}")
55
+ end
56
+ else
57
+ # create buffers for each piece
58
+ work_buffer = _create_result_buffer(tensor.data_type, value_shape, "#{tensor.name}/out")
59
+ piece_size = new_shape.reduce(:*)
60
+ work_group = [num_split, piece_size]
61
+
62
+ divisors = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
63
+ a << s * a.last
64
+ end.reverse
65
+
66
+ cl_piece_size = OpenCL::Int1.new(piece_size)
67
+ event_wait_list = build_event_wait_list(inputs)
68
+ step = value_shape[axis] / num_split
69
+ event = _cl_program('split', step: step, axis: axis, mul: multipliers, dest: divisors, data_type: tensor.data_type).split(_opencl_queue, work_group,
70
+ cl_piece_size,
71
+ value.cl_buffer,
72
+ work_buffer.cl_buffer,
73
+ event_wait_list: event_wait_list)
74
+ work_buffer.op = event
75
+
76
+ Array.new(num_split) do |index|
77
+ _create_result_sub_buffer(work_buffer, index, tensor.data_type, new_shape, "#{tensor.name}/out_#{index}_#{num_split}")
78
+ end
79
+ end
80
+ else
81
+ raise TensorStream::ValueError, "#{num_split} does not divide #{value_shape[axis]} evenly" if num_split.reduce(:+) != value_shape[axis]
82
+ # compute shapes of individual output buffers
83
+ new_shapes = num_split.each_with_index.collect do |num, index|
84
+ new_shape = value_shape.dup
85
+ new_shape[axis] = num
86
+ new_shape
87
+ end
88
+ if axis.zero? # axis zero fast copy path
89
+ start = 0
90
+ out = []
91
+ new_shapes.each_with_index do |new_shape, index|
92
+ element_count = new_shape.reduce(:*) || 1
93
+ region_size_in_bytes = element_count * value.buffer.element_size
94
+ out << _create_variable_result_sub_buffer(value, index, start, region_size_in_bytes, tensor.data_type, new_shape, "#{tensor.name}/out_#{index}_#{new_shape.join('.')}")
95
+ start += region_size_in_bytes
96
+ end
97
+ out
98
+ else
99
+ # create buffers for each piece
100
+ work_buffer = _create_result_buffer(tensor.data_type, value_shape, "#{tensor.name}/out")
101
+ out = []
102
+ start = 0
103
+
104
+ steps = num_split.dup.reverse.drop(1).inject([0]) do |a, s|
105
+ a << s + a.last
106
+ end
107
+
108
+ offsets = new_shapes.dup.reverse.drop(1).inject([0]) do |a, shape|
109
+ size_bytes = shape.reduce(:*) || 1
110
+ a << a.last + size_bytes
111
+ end
112
+
113
+ events = new_shapes.each_with_index.collect do |shape, index|
114
+ offset = offsets[index]
115
+ step = steps[index]
116
+ divisors = shape.dup.drop(1).reverse.inject([1]) do |a, s|
117
+ a << s * a.last
118
+ end.reverse
119
+ piece_size = shape.reduce(:*) || 1
120
+ work_group = [piece_size]
121
+ cl_offset = OpenCL::Int1.new(offset)
122
+
123
+ _cl_program('split_n', axis: axis,
124
+ div: divisors,
125
+ mul: multipliers,
126
+ step: step,
127
+ data_type: tensor.data_type).
128
+ split(_opencl_queue,
129
+ work_group,
130
+ cl_offset,
131
+ value.cl_buffer,
132
+ work_buffer.cl_buffer,
133
+ event_wait_list: event_wait_list)
134
+ end
135
+ work_buffer.op = events
136
+ new_shapes.each_with_index do |new_shape, index|
137
+ element_count = new_shape.reduce(:*) || 1
138
+ region_size_in_bytes = element_count * work_buffer.buffer.element_size
139
+ out << _create_variable_result_sub_buffer(work_buffer, index, start, region_size_in_bytes, tensor.data_type, new_shape, "#{tensor.name}/out_#{index}_#{new_shape.join('.')}")
140
+ start += region_size_in_bytes
141
+ end
142
+ out
143
+ end
144
+ end
145
+
146
+ TensorStream::Evaluator::OutputGroup.new(outputs, outputs.map(&:data_type))
147
+ end
148
+
149
+ register_op :concat do |context, tensor, inputs|
150
+ axis = inputs.shift
151
+ shape = inputs[0].shape
152
+
153
+ normal_shape = inputs[0].shape.dup
154
+
155
+ axis = read_final_result(_run(axis, context))
156
+ axis = normal_shape.size - 1 if axis == -1
157
+
158
+ divisors = normal_shape.dup.drop(1).reverse.inject([1]) do |a, s|
159
+ a << s * a.last
160
+ end.reverse
161
+
162
+ new_shape = inputs[0].shape.dup
163
+ new_shape[axis] = 0
164
+ inputs.each do |input|
165
+ new_shape[axis] += input.shape[axis]
166
+ end
167
+
168
+ multipliers = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
169
+ a << s * a.last
170
+ end.reverse
171
+
172
+ output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
173
+ ops = if axis.zero? # fast path
174
+ inputs.each_with_index.map do |input, index|
175
+ next if input.empty_value?
176
+ start = index * input.buffer.size * input.buffer.element_size
177
+ region = [input.buffer.size * input.buffer.element_size, 1, 1]
178
+ event_wait_list = build_event_wait_list(input)
179
+ _opencl_queue.enqueue_copy_buffer_rect(input.cl_buffer, output_buffer.cl_buffer,
180
+ region, dst_origin: [start, 0, 0], event_wait_list: event_wait_list)
181
+ end.compact
182
+ else
183
+ elem_size = shape.empty? ? 1 : shape.reduce(:*)
184
+ cl_n = OpenCL::Int1.new(elem_size)
185
+
186
+ steps = inputs.map(&:shape).reverse.drop(1).inject([0]) do |a, shape|
187
+ a << shape[axis] + a.last
188
+ end
189
+
190
+ work_group = [elem_size]
191
+ event_wait_list = build_event_wait_list(inputs)
192
+
193
+ inputs.each_with_index.map do |input, index|
194
+ cl_index = OpenCL::Int1.new(index)
195
+ step = OpenCL::Int1.new(steps[index])
196
+ _cl_program('concat', data_type: tensor.data_type, divisors: divisors, multipliers: multipliers, axis: axis).
197
+ concat(_opencl_queue, work_group, cl_n, cl_index, step, input.cl_buffer,
198
+ output_buffer.cl_buffer, event_wait_list: event_wait_list)
199
+ end
200
+ end
201
+ output_buffer.op = ops
202
+ output_buffer
203
+ end
204
+
205
+ register_op :stack do |_context, tensor, inputs|
206
+ axis = tensor.options[:axis] || 0
207
+ shape = inputs[0].shape
208
+ rank = shape.size + 1
209
+ elem_size = shape.empty? ? 1 : shape.reduce(:*)
210
+
211
+ new_shape = [inputs.size]
212
+ shape.inject(new_shape) { |ns, s| ns << s }
213
+
214
+ divisors = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
215
+ a << s * a.last
216
+ end.reverse
217
+
218
+ axis = rank + axis if axis < 0
219
+ rotated_shape = Array.new(axis + 1) { new_shape.shift }
220
+ new_shape = rotated_shape.rotate! + new_shape
221
+
222
+ output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
223
+ multipliers = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
224
+ a << s * a.last
225
+ end.reverse
226
+
227
+ cl_n = OpenCL::Int1.new(elem_size)
228
+ work_group = [elem_size]
229
+
230
+ ops = if axis.zero? # fast path if axis == 0
231
+ step = multipliers[0]
232
+ inputs.each_with_index.map do |input, index|
233
+ start = index * step * input.buffer.element_size
234
+ region = [input.buffer.size * input.buffer.element_size, 1, 1]
235
+ _opencl_queue.enqueue_copy_buffer_rect(input.cl_buffer, output_buffer.cl_buffer, region, dst_origin: [start, 0, 0], event_wait_list: input.op)
236
+ end
237
+ else
238
+ event_wait_list = build_event_wait_list(inputs)
239
+ inputs.each_with_index.map do |input, index|
240
+ cl_index = OpenCL::Int1.new(index)
241
+ _cl_program('pack', data_type: tensor.data_type, divisors: divisors, multipliers: multipliers, axis: axis).pack(_opencl_queue, work_group, cl_n, cl_index, input.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
242
+ end
243
+ end
244
+
245
+ output_buffer.op = ops
246
+ output_buffer
247
+ end
248
+
249
+ register_op :unstack do |context, tensor, inputs|
250
+ value = inputs[0]
251
+ axis = tensor.options[:axis] || 0
252
+ new_shape = value.shape.dup
253
+ rank = new_shape.size - 1
254
+
255
+ elem_size = new_shape.empty? ? 1 : new_shape.reduce(:*)
256
+
257
+ divisors = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
258
+ a << s * a.last
259
+ end.reverse
260
+
261
+ axis = rank + axis if axis < 0
262
+ rotated_shape = Array.new(axis + 1) { new_shape.shift }
263
+ new_shape = rotated_shape.rotate!(-1) + new_shape
264
+
265
+ multipliers = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
266
+ a << s * a.last
267
+ end.reverse
268
+
269
+ step = multipliers[0]
270
+ sub_shape = new_shape.dup
271
+ sub_shape.shift
272
+
273
+ outputs = if axis.zero? # shortcut for axis == 0
274
+ Array.new(new_shape[0]) do |index|
275
+ _create_result_sub_buffer(value, index, tensor.data_type, sub_shape, "#{tensor.name}/out_#{index}")
276
+ end
277
+ else
278
+ output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
279
+ cl_n = OpenCL::Int1.new(elem_size)
280
+ work_group = [elem_size]
281
+ event_wait_list = build_event_wait_list(inputs)
282
+ ops = inputs.each_with_index.map do |input, index|
283
+ cl_index = OpenCL::Int1.new(index)
284
+ _cl_program('unpack', data_type: tensor.data_type, divisors: divisors, multipliers: multipliers, axis: axis).unpack(_opencl_queue, work_group, cl_n, cl_index, input.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
285
+ end
286
+ output_buffer.op = ops
287
+ Array.new(new_shape[0]) do |index|
288
+ _create_result_sub_buffer(output_buffer, index, tensor.data_type, sub_shape, "#{tensor.name}/out_#{index}")
289
+ end
290
+ end
291
+
292
+ TensorStream::Evaluator::OutputGroup.new(outputs, outputs.map(&:data_type))
293
+ end
294
+
295
+ register_op :index, noop: true do |context, tensor, inputs|
296
+ a = _run(inputs[0], context)
297
+ index = read_final_result(_run(inputs[1], context))
298
+
299
+ if a.is_a?(TensorStream::Evaluator::OutputGroup)
300
+ a.outputs[index]
301
+ elsif a.is_a?(Array)
302
+ a[index]
303
+ else
304
+ new_shape = a.shape.dup
305
+ new_shape.shift
306
+ input_a = read_final_result(a)
307
+ convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
308
+ end
309
+ end
310
+
311
+ register_op :shape do |_context, tensor, inputs|
312
+ wrap_opencl(inputs[0].shape, name: tensor.name, data_type: tensor.data_type)
313
+ end
314
+
315
+ register_op :shape_n do |_context, tensor, inputs|
316
+ shapes = inputs.collect do |input|
317
+ wrap_opencl(input.shape, name: tensor.name, data_type: tensor.data_type)
318
+ end
319
+ TensorStream::Evaluator::OutputGroup.new(shapes, shapes.map { tensor.data_type })
320
+ end
321
+
322
+ register_op :reshape do |context, tensor, inputs|
323
+ arr = inputs[0]
324
+ new_shape = read_final_result(complete_eval(inputs[1], context))
325
+
326
+ shape = if new_shape.size.zero? && arr.buffer.size == 1
327
+ new_shape
328
+ else
329
+ TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
330
+ end
331
+
332
+ OpenCLBuffer.new(name: tensor.name, data_type: tensor.data_type,
333
+ shape: shape, buffer: arr.buffer,
334
+ cl_buffer: arr.cl_buffer,
335
+ op: arr.op)
336
+ end
337
+
338
+ register_op :transpose, buffer: true do |_context, tensor, inputs|
339
+ t_param = Array.new(inputs[0].shape.size) { |index| index }.reverse
340
+
341
+ if inputs[0].shape.size == 2 && inputs[1].nil?
342
+ transposed = inputs[0].buffer.reshape(*inputs[0].shape.reverse).transpose(*t_param)
343
+ res = convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
344
+ res
345
+ else
346
+ rank = inputs[0].shape.size
347
+ perm = inputs[1].nil? ? (0...rank).to_a.reverse : inputs[1].buffer
348
+ new_shape = perm.map { |p| inputs[0].shape[p] }.to_a
349
+ output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
350
+ transpose_with_perm(inputs[0].buffer, output_buffer.buffer, inputs[0].shape, new_shape, perm)
351
+
352
+ write_op = _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
353
+ output_buffer.op = write_op
354
+ output_buffer
355
+ end
356
+ end
357
+
358
+ register_op :slice, noop: true do |context, tensor, inputs|
359
+ input_a = complete_eval(inputs[0], context)
360
+ input_b = read_final_result(complete_eval(inputs[1], context))
361
+ size = tensor.options[:size]
362
+
363
+ shape = input_a.shape
364
+
365
+ slice_param = input_b.zip(size).collect.with_index { | p, index| p[1] = (p[1] == -1) ? shape[index] : p[1] ; p[0]..p[0] + p[1] - 1 }.reverse
366
+
367
+ new_buf = input_a.buffer.reshape(*input_a.shape.reverse)
368
+ sliced = new_buf.slice[*slice_param]
369
+ convert_to_opencl(sliced.flatten, sliced.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
370
+ end
371
+
372
+ register_op :rank do |_context, tensor, inputs|
373
+ wrap_opencl(inputs[0].shape.size, data_type: tensor.data_type, name: tensor.name)
374
+ end
375
+
376
+ register_op :cast do |_context, tensor, inputs|
377
+ a = inputs[0]
378
+ if a.data_type != tensor.data_type
379
+ buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
380
+ m, n = a.shape
381
+ cl_m = OpenCL::Int1.new(m || 1)
382
+ cl_n = OpenCL::Int1.new(n || 1)
383
+ work_group = [m || 1, n || 1]
384
+ event_wait_list = build_event_wait_list(inputs)
385
+ buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
386
+ buffer
387
+ else
388
+ a
389
+ end
390
+ end
391
+ end
392
+ end
393
+ end
394
+ end
395
+ end