tensor_stream-opencl 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d2b7fa8bb0dcaa041e5ec390418ffac2bb40d777086e9ebe771f9f42d79ccabc
4
- data.tar.gz: e21e0cf638e177da495e9748087808c51aee15ac10212c856ee789ae8c32d458
3
+ metadata.gz: b433e9e7ab38a517c21b57065e5a43b112640fd7c419fb7baa2f3319128cdacf
4
+ data.tar.gz: fab7d48513cb0f8481e151d18b088782918cb1539b59586613a00c4d5f5aeed2
5
5
  SHA512:
6
- metadata.gz: 31a1cc671716dee18d6841e39f3bf43c5326a423be59e118e5acf35777aaed9b29acdfdd3425d6ce48f5f5c5f8e5692fae687a592828be0a458f337498965008
7
- data.tar.gz: ecef0d771882d662996aa31cfa4eaf9e8cef720c2b05eb48f872ce9c3fbe9e07dfc99427332a6bcbf5fb8054f7f00d27fc55184462a559be834635aaad358c30
6
+ metadata.gz: 04d106f5ee5fac49eba20ff143bb2212a1cafd5140fc04cee20958ffea0c5909d352824948badf16ec5bc8ca2a7b13b4dcf7748eb03cbd6dc8a466c6ae0f5040
7
+ data.tar.gz: e17171f28641ce3496c0b338b6913c96e10d9fd5ce93b7980dae6edef00e63e5f7c4dcb60ed04fed5271a474b4940d069ebcf6a00bbfd3c4e6eafa2c0c4f26ed
@@ -4,6 +4,25 @@ require 'benchmark'
4
4
  require 'pry-byebug'
5
5
  require 'awesome_print'
6
6
  require 'tensor_stream/opencl'
7
+ require 'rbconfig'
8
+
9
+ def os
10
+ @os ||= (
11
+ host_os = RbConfig::CONFIG['host_os']
12
+ case host_os
13
+ when /mswin|msys|mingw|cygwin|bccwin|wince|emc/
14
+ :windows
15
+ when /darwin|mac os/
16
+ :macosx
17
+ when /linux/
18
+ :linux
19
+ when /solaris|bsd/
20
+ :unix
21
+ else
22
+ raise Error::WebDriverError, "unknown os: #{host_os.inspect}"
23
+ end
24
+ )
25
+ end
7
26
 
8
27
  def tr(t, places = 1)
9
28
  if t.is_a?(Array)
@@ -77,49 +96,56 @@ conv2d_grad = tf.gradients(conv2d, [sample_image, sample_filter])
77
96
 
78
97
  bias_add = tf.nn.bias_add(large_tensor, large_tensor_bias)
79
98
  bias_add_grad = tf.gradients(bias_add, [large_tensor_bias])
99
+ dropout = tf.nn.dropout(large_tensor, 0.8)
80
100
 
81
101
  puts TensorStream::Evaluator.default_evaluators
82
102
 
83
103
  sess2 = tf.session
84
104
 
85
- puts `cat /proc/cpuinfo | grep "model name" | head -1`
105
+ if os == :macosx
106
+ puts `sysctl -n machdep.cpu.brand_string`
107
+ else
108
+ puts `cat /proc/cpuinfo | grep "model name" | head -1`
109
+ end
86
110
  device = TensorStream::Evaluator::OpenclEvaluator.default_device.native_device
87
111
  puts "OpenCL device #{device.platform.to_s} #{device.name}"
88
112
  Benchmark.bmbm do |x|
89
- x.report("pure ruby argmin :") { 100.times do sess.run(argmin) end }
90
- x.report("opencl argmin :") { 100.times do sess2.run(argmin) end }
91
- x.report("pure ruby bias_add_grad :") { 100.times do sess.run(bias_add_grad) end }
92
- x.report("opencl bias_add_grad :") { 100.times do sess2.run(bias_add_grad) end }
93
- x.report("pure ruby bias_add :") { 100.times do sess.run(bias_add) end }
94
- x.report("opencl bias_add :") { 100.times do sess2.run(bias_add) end }
95
- x.report("pure ruby conv2d_backprop :") { 100.times do sess.run(conv2d_grad) end }
96
- x.report("opencl conv2d_backprop :") { 100.times do sess2.run(conv2d_grad) end }
97
- x.report("pure ruby conv2d :") { 100.times do sess.run(conv2d) end }
98
- x.report("opencl conv2d :") { 100.times do sess2.run(conv2d) end }
99
- x.report("pure ruby arr index :") { 100.times do sess.run(index) end }
100
- x.report("opencl arr index :") { 100.times do sess2.run(index) end }
101
- x.report("pure ruby min :") { 100.times do sess.run(min) end }
102
- x.report("opencl min :") { 100.times do sess2.run(min) end }
103
- x.report("pure ruby sum :") { 100.times do sess.run(sum) end }
104
- x.report("opencl sum :") { 100.times do sess2.run(sum) end }
105
- x.report("pure ruby sum axis 1 :") { 100.times do sess.run(sum_axis_1) end }
106
- x.report("opencl sum axis 1 :") { 100.times do sess2.run(sum_axis_1) end }
107
- x.report("pure ruby split :") { 100.times do sess.run(split) end }
108
- x.report("opencl split :") { 100.times do sess2.run(split) end }
109
- x.report("pure ruby add_n :") { 100.times do sess.run(add_n) end }
110
- x.report("opencl add_n :") { 100.times do sess2.run(add_n) end }
111
- x.report("pure ruby ooo matmul :") { 100.times do sess.run(out_of_order) end }
112
- x.report("opencl ooo matmul :") { 100.times do sess2.run(out_of_order) end }
113
- x.report("pure ruby softmax :") { 100.times do sess.run(softmax) end }
114
- x.report("opencl softmax :") { 100.times do sess2.run(softmax) end }
115
- x.report("pure ruby matmul :") { 100.times do sess.run(matmul) end }
116
- x.report("opencl matmul :") { 100.times do sess2.run(matmul) end }
117
- x.report("pure ruby :") { 100.times do sess.run(model, feed_dict: { p => rand, q => rand }) end }
118
- x.report("opencl :") { 100.times do sess2.run(model, feed_dict: { p => rand, q => rand }) end }
119
- x.report("pure ruby single function:") { 100.times do sess.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
120
- x.report("opencl singlefunction:") { 100.times do sess2.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
121
- x.report("pure ruby pow float:") { 100.times do sess.run(pow_f, feed_dict: { p => rand, q => rand }) end }
122
- x.report("opencl pow float:") { 100.times do sess2.run(pow_f, feed_dict: { p => rand, q => rand }) end }
123
- x.report("pure ruby pow int:") { 100.times do sess.run(pow_i, feed_dict: { p => rand, q => rand }) end }
124
- x.report("opencl pow int:") { 100.times do sess2.run(pow_i, feed_dict: { p => rand, q => rand }) end }
113
+ x.report("ruby argmin :") { 100.times do sess.run(argmin) end }
114
+ x.report("opencl argmin :") { 100.times do sess2.run(argmin) end }
115
+ x.report("ruby bias_add_grad :") { 100.times do sess.run(bias_add_grad) end }
116
+ x.report("opencl bias_add_grad :") { 100.times do sess2.run(bias_add_grad) end }
117
+ x.report("ruby bias_add :") { 100.times do sess.run(bias_add) end }
118
+ x.report("opencl bias_add :") { 100.times do sess2.run(bias_add) end }
119
+ x.report("ruby conv2d_backprop :") { 100.times do sess.run(conv2d_grad) end }
120
+ x.report("opencl conv2d_backprop :") { 100.times do sess2.run(conv2d_grad) end }
121
+ x.report("ruby conv2d :") { 100.times do sess.run(conv2d) end }
122
+ x.report("opencl conv2d :") { 100.times do sess2.run(conv2d) end }
123
+ x.report("ruby arr index :") { 100.times do sess.run(index) end }
124
+ x.report("opencl arr index :") { 100.times do sess2.run(index) end }
125
+ x.report("ruby min :") { 100.times do sess.run(min) end }
126
+ x.report("opencl min :") { 100.times do sess2.run(min) end }
127
+ x.report("ruby sum :") { 100.times do sess.run(sum) end }
128
+ x.report("opencl sum :") { 100.times do sess2.run(sum) end }
129
+ x.report("ruby sum axis 1 :") { 100.times do sess.run(sum_axis_1) end }
130
+ x.report("opencl sum axis 1 :") { 100.times do sess2.run(sum_axis_1) end }
131
+ x.report("ruby split :") { 100.times do sess.run(split) end }
132
+ x.report("opencl split :") { 100.times do sess2.run(split) end }
133
+ x.report("ruby add_n :") { 100.times do sess.run(add_n) end }
134
+ x.report("opencl add_n :") { 100.times do sess2.run(add_n) end }
135
+ x.report("ruby ooo matmul :") { 100.times do sess.run(out_of_order) end }
136
+ x.report("opencl ooo matmul :") { 100.times do sess2.run(out_of_order) end }
137
+ x.report("ruby softmax :") { 100.times do sess.run(softmax) end }
138
+ x.report("opencl softmax :") { 100.times do sess2.run(softmax) end }
139
+ x.report("ruby matmul :") { 100.times do sess.run(matmul) end }
140
+ x.report("opencl matmul :") { 100.times do sess2.run(matmul) end }
141
+ x.report("ruby :") { 100.times do sess.run(model, feed_dict: { p => rand, q => rand }) end }
142
+ x.report("opencl :") { 100.times do sess2.run(model, feed_dict: { p => rand, q => rand }) end }
143
+ x.report("ruby single function :") { 100.times do sess.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
144
+ x.report("opencl single function :") { 100.times do sess2.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
145
+ x.report("ruby pow float :") { 100.times do sess.run(pow_f, feed_dict: { p => rand, q => rand }) end }
146
+ x.report("opencl pow float :") { 100.times do sess2.run(pow_f, feed_dict: { p => rand, q => rand }) end }
147
+ x.report("ruby pow int :") { 100.times do sess.run(pow_i, feed_dict: { p => rand, q => rand }) end }
148
+ x.report("opencl pow int :") { 100.times do sess2.run(pow_i, feed_dict: { p => rand, q => rand }) end }
149
+ x.report("ruby dropout :") { 100.times do sess.run(dropout) end }
150
+ x.report("opencl dropout :") { 100.times do sess2.run(dropout) end }
125
151
  end
@@ -0,0 +1,80 @@
1
+ TensorStream::Evaluator::OpenclEvaluator
2
+ TensorStream::Evaluator::RubyEvaluator
3
+ Intel(R) Core(TM) i5-5575R CPU @ 2.80GHz
4
+ OpenCL device Apple Intel(R) Iris(TM) Pro Graphics 6200
5
+ Rehearsal ------------------------------------------------------------
6
+ ruby argmin : 0.940000 0.030000 0.970000 ( 1.197240)
7
+ opencl argmin : 0.070000 0.020000 0.090000 ( 0.093035)
8
+ ruby bias_add_grad : 2.390000 0.060000 2.450000 ( 2.558622)
9
+ opencl bias_add_grad : 0.020000 0.010000 0.030000 ( 0.030563)
10
+ ruby bias_add : 2.530000 0.070000 2.600000 ( 2.749423)
11
+ opencl bias_add : 0.150000 0.030000 0.180000 ( 0.191476)
12
+ ruby conv2d_backprop : 4.020000 0.060000 4.080000 ( 5.306408)
13
+ opencl conv2d_backprop : 0.040000 0.030000 0.070000 ( 0.077737)
14
+ ruby conv2d : 0.890000 0.010000 0.900000 ( 0.963062)
15
+ opencl conv2d : 0.030000 0.010000 0.040000 ( 0.042274)
16
+ ruby arr index : 0.000000 0.000000 0.000000 ( 0.004072)
17
+ opencl arr index : 0.010000 0.010000 0.020000 ( 0.023981)
18
+ ruby min : 3.710000 0.040000 3.750000 ( 4.329215)
19
+ opencl min : 0.160000 0.030000 0.190000 ( 0.191062)
20
+ ruby sum : 6.930000 0.080000 7.010000 ( 7.467194)
21
+ opencl sum : 0.010000 0.010000 0.020000 ( 0.034392)
22
+ ruby sum axis 1 : 6.920000 0.070000 6.990000 ( 7.412997)
23
+ opencl sum axis 1 : 0.020000 0.020000 0.040000 ( 0.027614)
24
+ ruby split : 0.020000 0.000000 0.020000 ( 0.022597)
25
+ opencl split : 0.060000 0.040000 0.100000 ( 0.099309)
26
+ ruby add_n : 0.150000 0.000000 0.150000 ( 0.162702)
27
+ opencl add_n : 0.020000 0.020000 0.040000 ( 0.033757)
28
+ ruby ooo matmul : 1.670000 0.010000 1.680000 ( 1.738712)
29
+ opencl ooo matmul : 0.020000 0.010000 0.030000 ( 0.029647)
30
+ ruby softmax : 0.030000 0.000000 0.030000 ( 0.033050)
31
+ opencl softmax : 0.020000 0.010000 0.030000 ( 0.030572)
32
+ ruby matmul : 0.820000 0.010000 0.830000 ( 0.851559)
33
+ opencl matmul : 0.010000 0.010000 0.020000 ( 0.026167)
34
+ ruby : 2.860000 0.020000 2.880000 ( 3.033034)
35
+ opencl : 0.220000 0.070000 0.290000 ( 0.240857)
36
+ ruby single function : 0.380000 0.000000 0.380000 ( 0.398911)
37
+ opencl single function : 0.150000 0.050000 0.200000 ( 0.162006)
38
+ ruby pow float : 0.090000 0.000000 0.090000 ( 0.098400)
39
+ opencl pow float : 0.020000 0.020000 0.040000 ( 0.033370)
40
+ ruby pow int : 0.020000 0.000000 0.020000 ( 0.023459)
41
+ opencl pow int : 0.020000 0.010000 0.030000 ( 0.030894)
42
+ -------------------------------------------------- total: 36.290000sec
43
+
44
+ user system total real
45
+ ruby argmin : 0.880000 0.010000 0.890000 ( 0.933367)
46
+ opencl argmin : 0.010000 0.010000 0.020000 ( 0.023140)
47
+ ruby bias_add_grad : 2.350000 0.050000 2.400000 ( 2.539349)
48
+ opencl bias_add_grad : 0.010000 0.010000 0.020000 ( 0.024700)
49
+ ruby bias_add : 2.510000 0.060000 2.570000 ( 2.667330)
50
+ opencl bias_add : 0.150000 0.020000 0.170000 ( 0.184056)
51
+ ruby conv2d_backprop : 3.910000 0.040000 3.950000 ( 4.320383)
52
+ opencl conv2d_backprop : 0.030000 0.020000 0.050000 ( 0.058036)
53
+ ruby conv2d : 0.910000 0.020000 0.930000 ( 1.120605)
54
+ opencl conv2d : 0.020000 0.010000 0.030000 ( 0.034972)
55
+ ruby arr index : 0.000000 0.000000 0.000000 ( 0.004119)
56
+ opencl arr index : 0.020000 0.010000 0.030000 ( 0.024126)
57
+ ruby min : 3.670000 0.030000 3.700000 ( 4.024439)
58
+ opencl min : 0.140000 0.030000 0.170000 ( 0.178683)
59
+ ruby sum : 6.920000 0.050000 6.970000 ( 7.314338)
60
+ opencl sum : 0.010000 0.020000 0.030000 ( 0.024655)
61
+ ruby sum axis 1 : 6.900000 0.050000 6.950000 ( 7.332897)
62
+ opencl sum axis 1 : 0.020000 0.020000 0.040000 ( 0.026150)
63
+ ruby split : 0.010000 0.000000 0.010000 ( 0.018866)
64
+ opencl split : 0.050000 0.040000 0.090000 ( 0.096327)
65
+ ruby add_n : 0.140000 0.000000 0.140000 ( 0.151006)
66
+ opencl add_n : 0.020000 0.010000 0.030000 ( 0.025622)
67
+ ruby ooo matmul : 1.670000 0.010000 1.680000 ( 1.732486)
68
+ opencl ooo matmul : 0.020000 0.020000 0.040000 ( 0.027051)
69
+ ruby softmax : 0.030000 0.000000 0.030000 ( 0.032848)
70
+ opencl softmax : 0.010000 0.010000 0.020000 ( 0.026403)
71
+ ruby matmul : 0.810000 0.000000 0.810000 ( 0.866297)
72
+ opencl matmul : 0.020000 0.020000 0.040000 ( 0.026677)
73
+ ruby : 2.870000 0.020000 2.890000 ( 3.237224)
74
+ opencl : 0.240000 0.080000 0.320000 ( 0.302463)
75
+ ruby single function : 0.390000 0.010000 0.400000 ( 0.470700)
76
+ opencl single function : 0.150000 0.060000 0.210000 ( 0.228528)
77
+ ruby pow float : 0.090000 0.000000 0.090000 ( 0.113073)
78
+ opencl pow float : 0.020000 0.010000 0.030000 ( 0.036938)
79
+ ruby pow int : 0.020000 0.000000 0.020000 ( 0.023728)
80
+ opencl pow int : 0.020000 0.020000 0.040000 ( 0.031909)
@@ -1,5 +1,6 @@
1
1
  require "tensor_stream/opencl/version"
2
2
  require 'tensor_stream'
3
+ require "tensor_stream/opencl/utils"
3
4
  require "tensor_stream/opencl/opencl_evaluator"
4
5
 
5
6
  module TensorStream
@@ -10,13 +10,13 @@ module TensorStream
10
10
  shape = if %i[zeros_like ones_like].include?(tensor.operation)
11
11
  inputs[0].shape
12
12
  elsif !inputs[0].nil?
13
- read_final_result(complete_eval(inputs[0], context))
13
+ complete_eval(inputs[0], context).buffer.to_a
14
14
  else
15
15
  tensor.shape.shape
16
16
  end
17
17
  cache_key = "cons_#{tensor.name}_#{tensor.data_type}_#{shape}"
18
18
  @context[:_cache][:_cl_buffers][cache_key] ||= begin
19
- buffer = allocate_narray_for_type(tensor.data_type, shape.reduce(:*) || 1)
19
+ buffer = OpenCLBuffer.allocate_narray_for_type(tensor.data_type, shape.reduce(:*) || 1)
20
20
  if %i[zeros zeros_like].include?(tensor.operation)
21
21
  buffer.fill!(0)
22
22
  else
@@ -47,7 +47,7 @@ module TensorStream
47
47
  buffer = if cl_buffer
48
48
  cl_buffer.buffer
49
49
  else
50
- allocate_narray_for_type(tensor.data_type, narray_size)
50
+ OpenCLBuffer.allocate_narray_for_type(tensor.data_type, narray_size)
51
51
  end
52
52
 
53
53
  buffer.fill!(value.buffer[0])
@@ -365,8 +365,8 @@ module TensorStream
365
365
  end
366
366
 
367
367
  register_op :reshape do |context, tensor, inputs|
368
- arr = inputs[0]
369
- new_shape = read_final_result(complete_eval(inputs[1], context))
368
+ arr, new_shape = inputs
369
+ new_shape = complete_eval(new_shape, context).buffer.to_a
370
370
 
371
371
  shape = if new_shape.size.zero? && arr.buffer.size == 1
372
372
  new_shape
@@ -389,9 +389,9 @@ module TensorStream
389
389
  res
390
390
  else
391
391
  rank = inputs[0].shape.size
392
- perm = inputs[1].nil? ? (0...rank).to_a.reverse : inputs[1].buffer
392
+ perm = inputs[1].nil? ? (0...rank).to_a.reverse : inputs[1].buffer!
393
393
  new_shape = perm.map { |p| inputs[0].shape[p] }.to_a
394
- output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
394
+ output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name, allocate_host: true)
395
395
  transpose_with_perm(inputs[0].buffer, output_buffer.buffer, inputs[0].shape, new_shape, perm)
396
396
 
397
397
  write_op = _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
@@ -442,6 +442,36 @@ module TensorStream
442
442
  a
443
443
  end
444
444
  end
445
+
446
+ register_op :range do |context, tensor, inputs|
447
+ start, limit, delta = complete_eval(inputs, context).map { |p| p.buffer.to_a.first }
448
+
449
+ if limit.zero?
450
+ limit = start
451
+ start = 0
452
+ end
453
+
454
+ raise " delta !=0 " if delta.zero?
455
+ raise " Requires start <= limit when delta > 0" if (start > limit) && delta > 0
456
+ raise " Requires start >= limit when delta < 0" if (start < limit) && delta < 0
457
+ cache_key = "range_#{start}_#{limit}_#{delta}_#{tensor.data_type}"
458
+
459
+ @context[:_cache][:_cl_buffers][cache_key] ||= begin
460
+ delta = fp_type?(tensor.options[:output_type]) ? delta.to_f : delta.to_i
461
+ cur_step = fp_type?(tensor.options[:output_type]) ? start.to_f : start.to_i
462
+ r = []
463
+ Kernel.loop do
464
+ break if start == limit
465
+ break if (start < limit) && (cur_step >= limit)
466
+ break if (start > limit) && (cur_step <= limit)
467
+
468
+ r << cur_step
469
+ cur_step += delta
470
+ end
471
+ r
472
+ convert_to_opencl(r, [r.size], data_type: tensor.options[:output_type], name: tensor.name)
473
+ end
474
+ end
445
475
  end
446
476
  end
447
477
  end
@@ -24,7 +24,7 @@ module TensorStream
24
24
  end
25
25
  end
26
26
 
27
- output_buffer = _create_result_buffer(tensor.data_type, [image.height, image.width, channels], "out_#{tensor.name}")
27
+ output_buffer = _create_result_buffer(tensor.data_type, [image.height, image.width, channels], "out_#{tensor.name}", allocate_host: true)
28
28
 
29
29
  image.grayscale! if channels == 1
30
30
  image.pixels.each_with_index do |pixel, index|
@@ -6,8 +6,9 @@ __kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
6
6
  __global <%= c_dtype %>* C) {
7
7
 
8
8
  // Get the index of the current element to be processed
9
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
10
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
9
+ const int index = get_global_id(0);
10
+ const int globalRow = get_global_id(1); // Row ID of C (0..M)
11
+ const int globalCol = get_global_id(2); // Col ID of C (0..N)
11
12
 
12
13
  // Compute a single element (loop over K)
13
14
  <%= c_dtype %> acc = 0.0f;
@@ -16,9 +17,9 @@ __kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
16
17
  int b_index = k*N + globalCol;
17
18
  <% if ta %>a_index = M*k + globalRow;<% end %>
18
19
  <% if tb %>b_index = globalCol*K + k;<% end %>
19
- acc += A[a_index] * B[b_index];
20
+ acc += A[a_index + index * <%= n_a %>] * B[b_index + index * <%= n_b %>];
20
21
  }
21
22
 
22
23
  // Store the result
23
- C[globalRow*N + globalCol] = acc;
24
+ C[index * <%= n %> + globalRow*N + globalCol] = acc;
24
25
  }
@@ -0,0 +1,7 @@
1
+ % c_dtype = dtype_to_c_type(dtype)
2
+ __kernel void random_uniform_<%= dtype %>(const int seed_ptr, const float min, const float max, __global const <%= c_dtype %> *rand_table, __global <%= c_dtype %> *C) {
3
+ // Get the index of the current element to be processed
4
+ const int id = get_global_id(0);
5
+ <%= c_dtype %> rand_value = rand_table[ (seed_ptr + id) % <%= tsize %>];
6
+ C[id] = rand_value * (max - min) + min;
7
+ }
@@ -54,26 +54,28 @@ module TensorStream
54
54
  register_op :mat_mul do |_context, tensor, inputs|
55
55
  a, b = inputs
56
56
 
57
- m = a.shape[0]
58
- n = b.shape[1]
59
- v = b.shape[0]
60
- k = a.shape[1]
57
+ a_matrix_shape = a.shape.dup
58
+ b_matrix_shape = b.shape.dup
59
+
60
+ k = a_matrix_shape.pop
61
+ m = a_matrix_shape.pop
62
+ n = b_matrix_shape.pop
63
+ v = b_matrix_shape.pop
61
64
 
62
65
  if tensor.options[:transpose_a]
63
- m = a.shape[1]
64
- k = a.shape[0]
66
+ m, k = k, m
65
67
  end
66
68
 
67
69
  if tensor.options[:transpose_b]
68
- n = b.shape[0]
69
- v = b.shape[1]
70
+ n, v = v, n
70
71
  end
71
72
 
72
- result_shape = [m, n]
73
+ result_shape = [a_matrix_shape.first, m, n].compact
74
+ work_group = [a_matrix_shape.first || 1, m, n]
73
75
 
74
76
  raise "#{tensor.inputs[0].name} rank must be greater than 1" if a.shape.size < 2
75
77
  raise "#{tensor.inputs[1].name} rank must be greater than 1" if b.shape.size < 2
76
- raise "#{tensor.inputs[0].name} unsupported rank" if b.shape.size != 2 || a.shape.size!=2
78
+ raise "#{tensor.inputs[0].name} unsupported rank" if b.shape.size > 3 || a.shape.size > 3
77
79
  raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
78
80
 
79
81
  dtype = tensor.data_type
@@ -85,7 +87,7 @@ module TensorStream
85
87
  cl_k = OpenCL::Int1.new(k)
86
88
 
87
89
  event_wait_list = build_event_wait_list([a, b])
88
- output_buffer.op = _cl_program('gemm', ta: !!tensor.options[:transpose_a], tb: !!tensor.options[:transpose_b], dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
90
+ output_buffer.op = _cl_program('gemm', ta: !!tensor.options[:transpose_a], tb: !!tensor.options[:transpose_b], n: m * n, n_a: m * k, n_b: n * v, dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_k, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
89
91
 
90
92
  output_buffer
91
93
  end
@@ -1,7 +1,48 @@
1
1
  module TensorStream
2
2
  # Buffer used by the OpenCL evaluator
3
3
  class OpenCLBuffer < Buffer
4
+ class LazyBuffer
5
+ attr_reader :data_type
6
+
7
+ def initialize(data_type, size)
8
+ @data_type = data_type
9
+ @size = size
10
+ end
11
+
12
+ def size
13
+ @size
14
+ end
15
+
16
+ def element_size
17
+ buffer_size_for_type(@data_type)
18
+ end
19
+
20
+ def buffer_size_for_type(data_type)
21
+ case data_type
22
+ when :float, :float32, :float16
23
+ 4
24
+ when :float64
25
+ 8
26
+ when :int, :int32, :int64, :uint64, :uint32 # NArray does not have 64 bit int types
27
+ 4
28
+ when :int16, :uint16
29
+ 2
30
+ when :uint8, :int8
31
+ 1
32
+ when :boolean
33
+ 1
34
+ when :string
35
+ 1
36
+ when :unknown
37
+ nil
38
+ else
39
+ raise "unsupported type #{data_type}"
40
+ end
41
+ end
42
+ end
43
+
4
44
  include ArrayOpsHelper
45
+ include TensorStream::CLEventHelpers
5
46
 
6
47
  attr_accessor :shape, :buffer, :cl_buffer, :op, :owner
7
48
 
@@ -24,15 +65,33 @@ module TensorStream
24
65
  end
25
66
 
26
67
  def inspect
27
- "CLBuffer(shape: #{shape || "?"} data_type: #{data_type}, cl_allocated: #{cl_buffer ? cl_buffer.size : 'unallocated'}) -> raw: #{buffer.to_a}"
68
+ "CLBuffer(name: #{name} shape: #{shape || "?"} data_type: #{data_type}, cl_allocated: #{cl_buffer ? cl_buffer.size : 'unallocated'}) -> raw: #{buffer.to_a}"
69
+ end
70
+
71
+ def buffer!
72
+ return buffer if buffer.is_a?(NArray)
73
+
74
+ @buffer = OpenCLBuffer.allocate_narray_for_type(buffer.data_type, buffer.size) if buffer.is_a?(LazyBuffer)
75
+
76
+ command_queue.enqueue_read_buffer(cl_buffer, @buffer, blocking: true, event_wait_list: build_event_wait_list([self]))
77
+ @buffer
78
+ end
79
+
80
+ def command_queue
81
+ @command_queue ||= begin
82
+ first_op = op.is_a?(Array) ? op.first : op
83
+ first_op.command_queue
84
+ end
28
85
  end
29
86
 
30
87
  def to_ruby
88
+ buffer! if buffer.is_a?(LazyBuffer)
89
+
31
90
  return [] if buffer.empty?
32
91
 
33
92
  if dirty
34
- op.command_queue.enqueue_read_buffer(cl_buffer, buffer, event_wait_list: [op].compact)
35
- op.command_queue.finish
93
+ command_queue.enqueue_read_buffer(cl_buffer, buffer, event_wait_list: [op].compact)
94
+ command_queue.finish
36
95
  self.dirty = false
37
96
  end
38
97
 
@@ -54,5 +113,28 @@ module TensorStream
54
113
  def self.nil_buffer(owner, name, data_type)
55
114
  OpenCLBuffer.new(owner, name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil)
56
115
  end
116
+
117
+ def self.allocate_narray_for_type(data_type, narray_size)
118
+ case data_type
119
+ when :float, :float32, :float16
120
+ NArray.sfloat(narray_size)
121
+ when :float64
122
+ NArray.float(narray_size)
123
+ when :int, :int32, :int64, :uint64, :uint32 # NArray does not have 64 bit int types
124
+ NArray.int(narray_size)
125
+ when :int16, :uint16
126
+ NArray.sint(narray_size)
127
+ when :uint8, :int8
128
+ NArray.byte(narray_size)
129
+ when :boolean
130
+ NArray.byte(narray_size)
131
+ when :string
132
+ NArray.byte(narray_size)
133
+ when :unknown
134
+ nil
135
+ else
136
+ raise "unsupported type #{data_type}"
137
+ end
138
+ end
57
139
  end
58
140
  end
@@ -13,6 +13,7 @@ require 'tensor_stream/opencl/math_ops'
13
13
  require 'tensor_stream/opencl/nn_ops'
14
14
  require 'tensor_stream/opencl/images_ops'
15
15
  require 'tensor_stream/opencl/array_ops'
16
+ require 'tensor_stream/opencl/random_ops'
16
17
  require 'tensor_stream/helpers/op_helper'
17
18
 
18
19
  module TensorStream
@@ -49,6 +50,8 @@ module TensorStream
49
50
  include TensorStream::OpenCLHelpers::NNOps
50
51
  include TensorStream::OpenCLHelpers::ImagesOps
51
52
  include TensorStream::OpenCLHelpers::ArrayOps
53
+ include TensorStream::OpenCLHelpers::RandomOps
54
+ include TensorStream::CLEventHelpers
52
55
 
53
56
  def initialize(session, device, thread_pool: nil, log_intermediates: false)
54
57
  super
@@ -159,6 +162,9 @@ module TensorStream
159
162
  return [] if buffer.buffer.nil?
160
163
  return buffer if buffer.buffer.size.zero?
161
164
 
165
+ # lazy allocate
166
+ buffer.buffer = OpenCLBuffer.allocate_narray_for_type(buffer.buffer.data_type, buffer.buffer.size) if buffer.buffer.is_a?(OpenCLBuffer::LazyBuffer)
167
+
162
168
  buffer.op = _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
163
169
  buffer
164
170
  end
@@ -167,12 +173,19 @@ module TensorStream
167
173
  def complete_eval(tensor, context)
168
174
  return nil if tensor.nil?
169
175
 
170
- buffer = enqueue_buffer_read(tensor, context)
171
- events = build_event_wait_list([buffer])
176
+ buffers = if tensor.is_a?(Array)
177
+ tensor.map { |t|
178
+ enqueue_buffer_read(t, context)
179
+ }
180
+ else
181
+ [enqueue_buffer_read(tensor, context)]
182
+ end
183
+
184
+ events = build_event_wait_list(buffers)
172
185
  # puts "** wait #{tensor.name} **"
173
186
  OpenCL.wait_for_events(events) unless events.empty?
174
187
  # puts "** done #{tensor.name} **"
175
- buffer
188
+ tensor.is_a?(Array) ? buffers : buffers.first
176
189
  end
177
190
 
178
191
  def self.query_devices_with_score
@@ -355,9 +368,13 @@ module TensorStream
355
368
 
356
369
  register_op :identity do |_context, tensor, inputs|
357
370
  value = inputs[0]
358
- buffer = OpenCLBuffer.new(self, name: tensor.name, data_type: tensor.data_type, shape: value.shape, buffer: value.buffer, cl_buffer: value.cl_buffer)
359
- buffer.op = build_event_wait_list(inputs)
360
- buffer
371
+ if value.is_a?(OutputGroup)
372
+ value
373
+ else
374
+ buffer = OpenCLBuffer.new(self, name: tensor.name, data_type: tensor.data_type, shape: value.shape, buffer: value.buffer, cl_buffer: value.cl_buffer)
375
+ buffer.op = build_event_wait_list(inputs)
376
+ buffer
377
+ end
361
378
  end
362
379
 
363
380
  register_op :assign, noop: true do |context, tensor, inputs|
@@ -773,9 +790,9 @@ module TensorStream
773
790
  value
774
791
  elsif data_type == :string && shape.empty?
775
792
  cl_buffer_size = value[0].bytesize
776
- allocate_narray_for_type(data_type, value[0].bytesize)
793
+ OpenCLBuffer.allocate_narray_for_type(data_type, value[0].bytesize)
777
794
  else
778
- allocate_narray_for_type(data_type, narray_size)
795
+ OpenCLBuffer.allocate_narray_for_type(data_type, narray_size)
779
796
  end
780
797
 
781
798
  return nil if buffer.nil?
@@ -818,39 +835,17 @@ module TensorStream
818
835
  cl_object
819
836
  end
820
837
 
821
- def allocate_narray_for_type(data_type, narray_size)
822
- case data_type
823
- when :float, :float32, :float16
824
- NArray.sfloat(narray_size)
825
- when :float64
826
- NArray.float(narray_size)
827
- when :int, :int32, :int64, :uint64, :uint32 # NArray does not have 64 bit int types
828
- NArray.int(narray_size)
829
- when :int16, :uint16
830
- NArray.sint(narray_size)
831
- when :uint8, :int8
832
- NArray.byte(narray_size)
833
- when :boolean
834
- NArray.byte(narray_size)
835
- when :string
836
- NArray.byte(narray_size)
837
- when :unknown
838
- nil
839
- else
840
- raise "unsupported type #{data_type}"
841
- end
842
- end
843
-
844
- def _create_result_buffer(data_type, shape, name)
838
+ def _create_result_buffer(data_type, shape, name, allocate_host: false)
845
839
  return OpenCLBuffer.nil_buffer(self, name, data_type) if shape == [0]
846
840
 
847
841
  cache_key = "_result_#{name}_#{shape.join('_')}:#{object_id}"
848
842
  @context[:_cache][:_cl_buffers][cache_key] ||= begin
849
843
  # puts "create result buffer #{cache_key}"
850
844
  size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
851
- buffer = allocate_narray_for_type(data_type, size)
852
- cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
853
- OpenCLBuffer.new(self, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
845
+ lazy_buffer = !allocate_host ? OpenCLBuffer::LazyBuffer.new(data_type, size) : OpenCLBuffer.allocate_narray_for_type(data_type, size)
846
+ cl_buffer = _opencl_context.create_buffer(size * lazy_buffer.element_size)
847
+
848
+ OpenCLBuffer.new(self, data_type: data_type, shape: shape, buffer: lazy_buffer, cl_buffer: cl_buffer, name: name)
854
849
  end
855
850
  end
856
851
 
@@ -859,7 +854,7 @@ module TensorStream
859
854
  cache_key ="_sub_result_#{parent_buffer.object_id}_#{name}_#{index}:#{object_id}"
860
855
  @context[:_cache][:_cl_buffers][cache_key] ||= begin
861
856
  size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
862
- buffer = allocate_narray_for_type(data_type, size)
857
+ buffer = OpenCLBuffer.allocate_narray_for_type(data_type, size)
863
858
 
864
859
  if parent_buffer.cl_buffer.associated_memobject.nil?
865
860
  start = index * buffer.size * buffer.element_size
@@ -890,7 +885,7 @@ module TensorStream
890
885
  cache_key = "_sub_result_#{parent_buffer.object_id}_#{name}_#{index}:#{object_id}"
891
886
  @context[:_cache][:_cl_buffers][cache_key] ||= begin
892
887
  size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
893
- buffer = allocate_narray_for_type(data_type, size)
888
+ buffer = OpenCLBuffer.allocate_narray_for_type(data_type, size)
894
889
 
895
890
  if parent_buffer.cl_buffer.associated_memobject.nil?
896
891
  region = OpenCL::BufferRegion::new(start, region_size_in_bytes)
@@ -980,14 +975,6 @@ module TensorStream
980
975
  shape.is_a?(Array) ? shape.size : 0
981
976
  end
982
977
 
983
- def build_event_wait_list(inputs)
984
- if inputs.is_a?(Array)
985
- inputs.flatten.compact.map(&:op).compact.uniq
986
- else
987
- inputs.op ? [inputs.op] : []
988
- end
989
- end
990
-
991
978
  def resolve_placeholder(placeholder, _execution_context = {})
992
979
  return nil if placeholder.nil?
993
980
  return placeholder unless placeholder.is_a?(Placeholder)
@@ -0,0 +1,54 @@
1
module TensorStream
  module OpenCLHelpers
    # Random number ops for the OpenCL evaluator.
    #
    # Including this module registers the +random_uniform+ op on the including
    # class (which must respond to +register_op+) and defines the
    # +_get_randomizer+ helper on it.
    module RandomOps
      # Number of host-generated random values kept in the shared table that
      # the random_uniform kernel reads from.
      RAND_TABLE_SIZE = 1024 * 1024

      def self.included(klass)
        klass.class_eval do
          # Fills a result buffer of the requested shape from the cached random
          # table, scaled with :minval/:maxval (defaults 0 and 1).
          register_op :random_uniform do |context, tensor, inputs|
            maxval = tensor.options.fetch(:maxval, 1)
            minval = tensor.options.fetch(:minval, 0)
            seed = tensor.options[:seed]

            cl_cache = @context[:_cache][:_cl_buffers]

            # The table is generated once and reused by every later call.
            # NOTE(review): the cache key is not data_type specific although the
            # buffer is created with the first caller's data_type - confirm this
            # is safe when ops of different data types share the table.
            rand_buffer = cl_cache["_rand"] ||= begin
              cl_cache["_rand_seed_ptr"] = 0
              random = _get_randomizer(tensor, seed)
              rand_table = Array.new(RAND_TABLE_SIZE) { random.rand }
              convert_to_opencl(rand_table, [RAND_TABLE_SIZE], data_type: tensor.data_type, name: "rand_#{tensor.data_type}")
            end

            # NOTE(review): this stores an Integer next to buffer objects in the
            # _cl_buffers cache; anything iterating that cache must skip it.
            cl_cache["_rand_seed_ptr"] ||= 0
            seed_ptr = cl_cache["_rand_seed_ptr"]

            shape = read_final_result(complete_eval(inputs[0], context)) || tensor.shape.shape

            # An empty shape (scalar) reduces to nil but still produces one value.
            element_count = shape.reduce(:*) || 1
            workgroup = [element_count]
            cl_seed_ptr = OpenCL::Int1.new(seed_ptr)
            cl_min = OpenCL::Float1.new(minval)
            cl_max = OpenCL::Float1.new(maxval)

            # Advance by the number of values consumed, so that the next call
            # (including after a scalar draw) starts at a fresh table position.
            cl_cache["_rand_seed_ptr"] = (seed_ptr + element_count) % RAND_TABLE_SIZE

            buffer = _create_result_buffer(tensor.data_type, shape, tensor.name)
            program = _cl_program("random_uniform", dtype: tensor.data_type, tsize: RAND_TABLE_SIZE)
            buffer.op = program.send(:"random_uniform_#{tensor.data_type}", _opencl_queue, workgroup, cl_seed_ptr, cl_min, cl_max, rand_buffer.cl_buffer, buffer.cl_buffer)
            buffer
          end

          # Picks the Random instance used to fill the table:
          # * graph seed and op seed  -> a new generator seeded with their XOR
          # * graph seed only         -> one generator per graph, kept on the session
          # * op seed only            -> one generator per operation, kept on the session
          # * neither                 -> a new unseeded generator
          def _get_randomizer(tensor, seed)
            graph_seed = tensor.graph.random_seed

            if graph_seed && seed
              Random.new(graph_seed ^ seed)
            elsif graph_seed
              @session.randomizer[tensor.graph.object_id] ||= Random.new(graph_seed)
            elsif seed
              @session.randomizer[tensor.operation] ||= Random.new(seed)
            else
              Random.new
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
module TensorStream
  ##
  # Reporting helpers for the OpenCL state cached by a session.
  class OpenCLUtil
    ##
    # initializes a OpenCL helper class based on a session
    def initialize(session)
      @session = session
    end

    ##
    # Retrieves OpenCL memory usage.
    #
    # Sums +cl_buffer.size+ over the buffers cached in the session's last
    # context, counting each distinct cl_buffer object once (several cache
    # entries can refer to the same cl_buffer). Cache entries that are not
    # buffers (for example the Integer random table pointer) and entries
    # without a cl_buffer are ignored. Returns 0 when nothing is cached yet.
    def get_memory_usage
      session_context = @session.last_session_context
      cached = session_context ? session_context.dig(:_cache, :_cl_buffers) : nil
      return 0 unless cached

      cached.values
            .select { |entry| entry.respond_to?(:cl_buffer) }
            .map(&:cl_buffer)
            .compact
            .uniq(&:object_id) # identity, not equality: same object counted once
            .sum(&:size)
    end
  end

  # Helpers for collecting the events a kernel invocation has to wait on.
  module CLEventHelpers
    ##
    # Returns the unique, non-nil +op+ values of +inputs+.
    #
    # +inputs+ may be a single object responding to +op+, a (possibly nested)
    # array of such objects, or nil; nil elements are skipped.
    def build_event_wait_list(inputs)
      [inputs].flatten.compact.map(&:op).compact.uniq
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  module TensorStream
2
2
  module Opencl
3
- VERSION = "0.3.0"
3
+ VERSION = "0.3.1"
4
4
  end
5
5
  end
@@ -53,11 +53,16 @@ b5 = tf.variable(tf.zeros([10]))
53
53
 
54
54
  x_ = tf.reshape(x, [-1, 784])
55
55
 
56
+
56
57
  y1 = tf.nn.relu(tf.matmul(x_, w1) + b1)
57
- y2 = tf.nn.relu(tf.matmul(y1, w2) + b2)
58
- y3 = tf.nn.relu(tf.matmul(y2, w3) + b3)
59
- y4 = tf.nn.relu(tf.matmul(y3, w4) + b4)
60
- ylogits = tf.matmul(y4, w5) + b5
58
+ Y1d = tf.nn.dropout(y1, pkeep)
59
+ y2 = tf.nn.relu(tf.matmul(Y1d, w2) + b2)
60
+ Y2d = tf.nn.dropout(y2, pkeep)
61
+ y3 = tf.nn.relu(tf.matmul(Y2d, w3) + b3)
62
+ Y3d = tf.nn.dropout(y3, pkeep)
63
+ y4 = tf.nn.relu(tf.matmul(Y3d, w4) + b4)
64
+ Y4d = tf.nn.dropout(y4, pkeep)
65
+ ylogits = tf.matmul(Y4d, w5) + b5
61
66
 
62
67
  # model
63
68
  y = tf.nn.softmax(ylogits)
@@ -85,8 +85,8 @@ y3 = tf.nn.relu(tf.nn.conv2d(y2, w3, [1, stride, stride, 1], 'SAME') + b3)
85
85
  # reshape the output from the third convolution for the fully connected layer
86
86
  yy = tf.reshape(y3, [-1, 7 * 7 * M])
87
87
  y4 = tf.nn.relu(tf.matmul(yy, w4) + b4)
88
-
89
- ylogits = tf.matmul(y4, w5) + b5
88
+ YY4 = tf.nn.dropout(y4, pkeep)
89
+ ylogits = tf.matmul(YY4, w5) + b5
90
90
 
91
91
  # model
92
92
  y = tf.nn.softmax(ylogits, name: 'out')
@@ -39,7 +39,7 @@ Gem::Specification.new do |spec|
39
39
  spec.add_development_dependency "awesome_print"
40
40
  spec.add_development_dependency "mnist-learn"
41
41
  spec.add_development_dependency "simplecov"
42
- spec.add_dependency "tensor_stream", "1.0.6"
42
+ spec.add_dependency "tensor_stream", "1.0.7"
43
43
  spec.add_dependency "opencl_ruby_ffi"
44
44
  spec.add_dependency "oily_png"
45
45
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tensor_stream-opencl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Joseph Dayo
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-03-23 00:00:00.000000000 Z
11
+ date: 2019-04-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -114,14 +114,14 @@ dependencies:
114
114
  requirements:
115
115
  - - '='
116
116
  - !ruby/object:Gem::Version
117
- version: 1.0.6
117
+ version: 1.0.7
118
118
  type: :runtime
119
119
  prerelease: false
120
120
  version_requirements: !ruby/object:Gem::Requirement
121
121
  requirements:
122
122
  - - '='
123
123
  - !ruby/object:Gem::Version
124
- version: 1.0.6
124
+ version: 1.0.7
125
125
  - !ruby/object:Gem::Dependency
126
126
  name: opencl_ruby_ffi
127
127
  requirement: !ruby/object:Gem::Requirement
@@ -167,6 +167,7 @@ files:
167
167
  - README.md
168
168
  - Rakefile
169
169
  - benchmark/benchmark.rb
170
+ - benchmark_imac2015_iris.txt
170
171
  - benchmark_intel.txt
171
172
  - benchmark_ryzen.txt
172
173
  - benchmark_ryzen_nvidia.txt
@@ -219,6 +220,7 @@ files:
219
220
  - lib/tensor_stream/opencl/kernels/pack.cl
220
221
  - lib/tensor_stream/opencl/kernels/pow.cl
221
222
  - lib/tensor_stream/opencl/kernels/prod.cl
223
+ - lib/tensor_stream/opencl/kernels/random_uniform.cl
222
224
  - lib/tensor_stream/opencl/kernels/real_div.cl
223
225
  - lib/tensor_stream/opencl/kernels/reciprocal.cl
224
226
  - lib/tensor_stream/opencl/kernels/reduce_axis.cl
@@ -250,6 +252,8 @@ files:
250
252
  - lib/tensor_stream/opencl/opencl_device.rb
251
253
  - lib/tensor_stream/opencl/opencl_evaluator.rb
252
254
  - lib/tensor_stream/opencl/opencl_template_helper.rb
255
+ - lib/tensor_stream/opencl/random_ops.rb
256
+ - lib/tensor_stream/opencl/utils.rb
253
257
  - lib/tensor_stream/opencl/version.rb
254
258
  - samples/build_mnist_model.rb
255
259
  - samples/classify.rb