tensor_stream-opencl 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d2b7fa8bb0dcaa041e5ec390418ffac2bb40d777086e9ebe771f9f42d79ccabc
4
- data.tar.gz: e21e0cf638e177da495e9748087808c51aee15ac10212c856ee789ae8c32d458
3
+ metadata.gz: b433e9e7ab38a517c21b57065e5a43b112640fd7c419fb7baa2f3319128cdacf
4
+ data.tar.gz: fab7d48513cb0f8481e151d18b088782918cb1539b59586613a00c4d5f5aeed2
5
5
  SHA512:
6
- metadata.gz: 31a1cc671716dee18d6841e39f3bf43c5326a423be59e118e5acf35777aaed9b29acdfdd3425d6ce48f5f5c5f8e5692fae687a592828be0a458f337498965008
7
- data.tar.gz: ecef0d771882d662996aa31cfa4eaf9e8cef720c2b05eb48f872ce9c3fbe9e07dfc99427332a6bcbf5fb8054f7f00d27fc55184462a559be834635aaad358c30
6
+ metadata.gz: 04d106f5ee5fac49eba20ff143bb2212a1cafd5140fc04cee20958ffea0c5909d352824948badf16ec5bc8ca2a7b13b4dcf7748eb03cbd6dc8a466c6ae0f5040
7
+ data.tar.gz: e17171f28641ce3496c0b338b6913c96e10d9fd5ce93b7980dae6edef00e63e5f7c4dcb60ed04fed5271a474b4940d069ebcf6a00bbfd3c4e6eafa2c0c4f26ed
@@ -4,6 +4,25 @@ require 'benchmark'
4
4
  require 'pry-byebug'
5
5
  require 'awesome_print'
6
6
  require 'tensor_stream/opencl'
7
+ require 'rbconfig'
8
+
9
# Detects the host operating system family from RbConfig's 'host_os' entry.
#
# The result is memoized in @os, so detection runs at most once.
#
# @return [Symbol] one of :windows, :macosx, :linux or :unix
# @raise [RuntimeError] when host_os matches none of the known patterns
def os
  @os ||= begin
    host_os = RbConfig::CONFIG['host_os']
    case host_os
    when /mswin|msys|mingw|cygwin|bccwin|wince|emc/ then :windows
    when /darwin|mac os/ then :macosx
    when /linux/ then :linux
    when /solaris|bsd/ then :unix
    else
      # The previous Error::WebDriverError constant is not defined by anything
      # this script visibly requires, so this branch surfaced as a NameError
      # instead of the message below. A plain RuntimeError keeps the intent.
      raise "unknown os: #{host_os.inspect}"
    end
  end
end
7
26
 
8
27
  def tr(t, places = 1)
9
28
  if t.is_a?(Array)
@@ -77,49 +96,56 @@ conv2d_grad = tf.gradients(conv2d, [sample_image, sample_filter])
77
96
 
78
97
  bias_add = tf.nn.bias_add(large_tensor, large_tensor_bias)
79
98
  bias_add_grad = tf.gradients(bias_add, [large_tensor_bias])
99
+ dropout = tf.nn.dropout(large_tensor, 0.8)
80
100
 
81
101
  puts TensorStream::Evaluator.default_evaluators
82
102
 
83
103
  sess2 = tf.session
84
104
 
85
- puts `cat /proc/cpuinfo | grep "model name" | head -1`
105
+ if os == :macosx
106
+ puts `sysctl -n machdep.cpu.brand_string`
107
+ else
108
+ puts `cat /proc/cpuinfo | grep "model name" | head -1`
109
+ end
86
110
  device = TensorStream::Evaluator::OpenclEvaluator.default_device.native_device
87
111
  puts "OpenCL device #{device.platform.to_s} #{device.name}"
88
112
  Benchmark.bmbm do |x|
89
- x.report("pure ruby argmin :") { 100.times do sess.run(argmin) end }
90
- x.report("opencl argmin :") { 100.times do sess2.run(argmin) end }
91
- x.report("pure ruby bias_add_grad :") { 100.times do sess.run(bias_add_grad) end }
92
- x.report("opencl bias_add_grad :") { 100.times do sess2.run(bias_add_grad) end }
93
- x.report("pure ruby bias_add :") { 100.times do sess.run(bias_add) end }
94
- x.report("opencl bias_add :") { 100.times do sess2.run(bias_add) end }
95
- x.report("pure ruby conv2d_backprop :") { 100.times do sess.run(conv2d_grad) end }
96
- x.report("opencl conv2d_backprop :") { 100.times do sess2.run(conv2d_grad) end }
97
- x.report("pure ruby conv2d :") { 100.times do sess.run(conv2d) end }
98
- x.report("opencl conv2d :") { 100.times do sess2.run(conv2d) end }
99
- x.report("pure ruby arr index :") { 100.times do sess.run(index) end }
100
- x.report("opencl arr index :") { 100.times do sess2.run(index) end }
101
- x.report("pure ruby min :") { 100.times do sess.run(min) end }
102
- x.report("opencl min :") { 100.times do sess2.run(min) end }
103
- x.report("pure ruby sum :") { 100.times do sess.run(sum) end }
104
- x.report("opencl sum :") { 100.times do sess2.run(sum) end }
105
- x.report("pure ruby sum axis 1 :") { 100.times do sess.run(sum_axis_1) end }
106
- x.report("opencl sum axis 1 :") { 100.times do sess2.run(sum_axis_1) end }
107
- x.report("pure ruby split :") { 100.times do sess.run(split) end }
108
- x.report("opencl split :") { 100.times do sess2.run(split) end }
109
- x.report("pure ruby add_n :") { 100.times do sess.run(add_n) end }
110
- x.report("opencl add_n :") { 100.times do sess2.run(add_n) end }
111
- x.report("pure ruby ooo matmul :") { 100.times do sess.run(out_of_order) end }
112
- x.report("opencl ooo matmul :") { 100.times do sess2.run(out_of_order) end }
113
- x.report("pure ruby softmax :") { 100.times do sess.run(softmax) end }
114
- x.report("opencl softmax :") { 100.times do sess2.run(softmax) end }
115
- x.report("pure ruby matmul :") { 100.times do sess.run(matmul) end }
116
- x.report("opencl matmul :") { 100.times do sess2.run(matmul) end }
117
- x.report("pure ruby :") { 100.times do sess.run(model, feed_dict: { p => rand, q => rand }) end }
118
- x.report("opencl :") { 100.times do sess2.run(model, feed_dict: { p => rand, q => rand }) end }
119
- x.report("pure ruby single function:") { 100.times do sess.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
120
- x.report("opencl singlefunction:") { 100.times do sess2.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
121
- x.report("pure ruby pow float:") { 100.times do sess.run(pow_f, feed_dict: { p => rand, q => rand }) end }
122
- x.report("opencl pow float:") { 100.times do sess2.run(pow_f, feed_dict: { p => rand, q => rand }) end }
123
- x.report("pure ruby pow int:") { 100.times do sess.run(pow_i, feed_dict: { p => rand, q => rand }) end }
124
- x.report("opencl pow int:") { 100.times do sess2.run(pow_i, feed_dict: { p => rand, q => rand }) end }
113
+ x.report("ruby argmin :") { 100.times do sess.run(argmin) end }
114
+ x.report("opencl argmin :") { 100.times do sess2.run(argmin) end }
115
+ x.report("ruby bias_add_grad :") { 100.times do sess.run(bias_add_grad) end }
116
+ x.report("opencl bias_add_grad :") { 100.times do sess2.run(bias_add_grad) end }
117
+ x.report("ruby bias_add :") { 100.times do sess.run(bias_add) end }
118
+ x.report("opencl bias_add :") { 100.times do sess2.run(bias_add) end }
119
+ x.report("ruby conv2d_backprop :") { 100.times do sess.run(conv2d_grad) end }
120
+ x.report("opencl conv2d_backprop :") { 100.times do sess2.run(conv2d_grad) end }
121
+ x.report("ruby conv2d :") { 100.times do sess.run(conv2d) end }
122
+ x.report("opencl conv2d :") { 100.times do sess2.run(conv2d) end }
123
+ x.report("ruby arr index :") { 100.times do sess.run(index) end }
124
+ x.report("opencl arr index :") { 100.times do sess2.run(index) end }
125
+ x.report("ruby min :") { 100.times do sess.run(min) end }
126
+ x.report("opencl min :") { 100.times do sess2.run(min) end }
127
+ x.report("ruby sum :") { 100.times do sess.run(sum) end }
128
+ x.report("opencl sum :") { 100.times do sess2.run(sum) end }
129
+ x.report("ruby sum axis 1 :") { 100.times do sess.run(sum_axis_1) end }
130
+ x.report("opencl sum axis 1 :") { 100.times do sess2.run(sum_axis_1) end }
131
+ x.report("ruby split :") { 100.times do sess.run(split) end }
132
+ x.report("opencl split :") { 100.times do sess2.run(split) end }
133
+ x.report("ruby add_n :") { 100.times do sess.run(add_n) end }
134
+ x.report("opencl add_n :") { 100.times do sess2.run(add_n) end }
135
+ x.report("ruby ooo matmul :") { 100.times do sess.run(out_of_order) end }
136
+ x.report("opencl ooo matmul :") { 100.times do sess2.run(out_of_order) end }
137
+ x.report("ruby softmax :") { 100.times do sess.run(softmax) end }
138
+ x.report("opencl softmax :") { 100.times do sess2.run(softmax) end }
139
+ x.report("ruby matmul :") { 100.times do sess.run(matmul) end }
140
+ x.report("opencl matmul :") { 100.times do sess2.run(matmul) end }
141
+ x.report("ruby :") { 100.times do sess.run(model, feed_dict: { p => rand, q => rand }) end }
142
+ x.report("opencl :") { 100.times do sess2.run(model, feed_dict: { p => rand, q => rand }) end }
143
+ x.report("ruby single function :") { 100.times do sess.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
144
+ x.report("opencl single function :") { 100.times do sess2.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
145
+ x.report("ruby pow float :") { 100.times do sess.run(pow_f, feed_dict: { p => rand, q => rand }) end }
146
+ x.report("opencl pow float :") { 100.times do sess2.run(pow_f, feed_dict: { p => rand, q => rand }) end }
147
+ x.report("ruby pow int :") { 100.times do sess.run(pow_i, feed_dict: { p => rand, q => rand }) end }
148
+ x.report("opencl pow int :") { 100.times do sess2.run(pow_i, feed_dict: { p => rand, q => rand }) end }
149
+ x.report("ruby dropout :") { 100.times do sess.run(dropout) end }
150
+ x.report("opencl dropout :") { 100.times do sess2.run(dropout) end }
125
151
  end
@@ -0,0 +1,80 @@
1
+ TensorStream::Evaluator::OpenclEvaluator
2
+ TensorStream::Evaluator::RubyEvaluator
3
+ Intel(R) Core(TM) i5-5575R CPU @ 2.80GHz
4
+ OpenCL device Apple Intel(R) Iris(TM) Pro Graphics 6200
5
+ Rehearsal ------------------------------------------------------------
6
+ ruby argmin : 0.940000 0.030000 0.970000 ( 1.197240)
7
+ opencl argmin : 0.070000 0.020000 0.090000 ( 0.093035)
8
+ ruby bias_add_grad : 2.390000 0.060000 2.450000 ( 2.558622)
9
+ opencl bias_add_grad : 0.020000 0.010000 0.030000 ( 0.030563)
10
+ ruby bias_add : 2.530000 0.070000 2.600000 ( 2.749423)
11
+ opencl bias_add : 0.150000 0.030000 0.180000 ( 0.191476)
12
+ ruby conv2d_backprop : 4.020000 0.060000 4.080000 ( 5.306408)
13
+ opencl conv2d_backprop : 0.040000 0.030000 0.070000 ( 0.077737)
14
+ ruby conv2d : 0.890000 0.010000 0.900000 ( 0.963062)
15
+ opencl conv2d : 0.030000 0.010000 0.040000 ( 0.042274)
16
+ ruby arr index : 0.000000 0.000000 0.000000 ( 0.004072)
17
+ opencl arr index : 0.010000 0.010000 0.020000 ( 0.023981)
18
+ ruby min : 3.710000 0.040000 3.750000 ( 4.329215)
19
+ opencl min : 0.160000 0.030000 0.190000 ( 0.191062)
20
+ ruby sum : 6.930000 0.080000 7.010000 ( 7.467194)
21
+ opencl sum : 0.010000 0.010000 0.020000 ( 0.034392)
22
+ ruby sum axis 1 : 6.920000 0.070000 6.990000 ( 7.412997)
23
+ opencl sum axis 1 : 0.020000 0.020000 0.040000 ( 0.027614)
24
+ ruby split : 0.020000 0.000000 0.020000 ( 0.022597)
25
+ opencl split : 0.060000 0.040000 0.100000 ( 0.099309)
26
+ ruby add_n : 0.150000 0.000000 0.150000 ( 0.162702)
27
+ opencl add_n : 0.020000 0.020000 0.040000 ( 0.033757)
28
+ ruby ooo matmul : 1.670000 0.010000 1.680000 ( 1.738712)
29
+ opencl ooo matmul : 0.020000 0.010000 0.030000 ( 0.029647)
30
+ ruby softmax : 0.030000 0.000000 0.030000 ( 0.033050)
31
+ opencl softmax : 0.020000 0.010000 0.030000 ( 0.030572)
32
+ ruby matmul : 0.820000 0.010000 0.830000 ( 0.851559)
33
+ opencl matmul : 0.010000 0.010000 0.020000 ( 0.026167)
34
+ ruby : 2.860000 0.020000 2.880000 ( 3.033034)
35
+ opencl : 0.220000 0.070000 0.290000 ( 0.240857)
36
+ ruby single function : 0.380000 0.000000 0.380000 ( 0.398911)
37
+ opencl single function : 0.150000 0.050000 0.200000 ( 0.162006)
38
+ ruby pow float : 0.090000 0.000000 0.090000 ( 0.098400)
39
+ opencl pow float : 0.020000 0.020000 0.040000 ( 0.033370)
40
+ ruby pow int : 0.020000 0.000000 0.020000 ( 0.023459)
41
+ opencl pow int : 0.020000 0.010000 0.030000 ( 0.030894)
42
+ -------------------------------------------------- total: 36.290000sec
43
+
44
+ user system total real
45
+ ruby argmin : 0.880000 0.010000 0.890000 ( 0.933367)
46
+ opencl argmin : 0.010000 0.010000 0.020000 ( 0.023140)
47
+ ruby bias_add_grad : 2.350000 0.050000 2.400000 ( 2.539349)
48
+ opencl bias_add_grad : 0.010000 0.010000 0.020000 ( 0.024700)
49
+ ruby bias_add : 2.510000 0.060000 2.570000 ( 2.667330)
50
+ opencl bias_add : 0.150000 0.020000 0.170000 ( 0.184056)
51
+ ruby conv2d_backprop : 3.910000 0.040000 3.950000 ( 4.320383)
52
+ opencl conv2d_backprop : 0.030000 0.020000 0.050000 ( 0.058036)
53
+ ruby conv2d : 0.910000 0.020000 0.930000 ( 1.120605)
54
+ opencl conv2d : 0.020000 0.010000 0.030000 ( 0.034972)
55
+ ruby arr index : 0.000000 0.000000 0.000000 ( 0.004119)
56
+ opencl arr index : 0.020000 0.010000 0.030000 ( 0.024126)
57
+ ruby min : 3.670000 0.030000 3.700000 ( 4.024439)
58
+ opencl min : 0.140000 0.030000 0.170000 ( 0.178683)
59
+ ruby sum : 6.920000 0.050000 6.970000 ( 7.314338)
60
+ opencl sum : 0.010000 0.020000 0.030000 ( 0.024655)
61
+ ruby sum axis 1 : 6.900000 0.050000 6.950000 ( 7.332897)
62
+ opencl sum axis 1 : 0.020000 0.020000 0.040000 ( 0.026150)
63
+ ruby split : 0.010000 0.000000 0.010000 ( 0.018866)
64
+ opencl split : 0.050000 0.040000 0.090000 ( 0.096327)
65
+ ruby add_n : 0.140000 0.000000 0.140000 ( 0.151006)
66
+ opencl add_n : 0.020000 0.010000 0.030000 ( 0.025622)
67
+ ruby ooo matmul : 1.670000 0.010000 1.680000 ( 1.732486)
68
+ opencl ooo matmul : 0.020000 0.020000 0.040000 ( 0.027051)
69
+ ruby softmax : 0.030000 0.000000 0.030000 ( 0.032848)
70
+ opencl softmax : 0.010000 0.010000 0.020000 ( 0.026403)
71
+ ruby matmul : 0.810000 0.000000 0.810000 ( 0.866297)
72
+ opencl matmul : 0.020000 0.020000 0.040000 ( 0.026677)
73
+ ruby : 2.870000 0.020000 2.890000 ( 3.237224)
74
+ opencl : 0.240000 0.080000 0.320000 ( 0.302463)
75
+ ruby single function : 0.390000 0.010000 0.400000 ( 0.470700)
76
+ opencl single function : 0.150000 0.060000 0.210000 ( 0.228528)
77
+ ruby pow float : 0.090000 0.000000 0.090000 ( 0.113073)
78
+ opencl pow float : 0.020000 0.010000 0.030000 ( 0.036938)
79
+ ruby pow int : 0.020000 0.000000 0.020000 ( 0.023728)
80
+ opencl pow int : 0.020000 0.020000 0.040000 ( 0.031909)
@@ -1,5 +1,6 @@
1
1
  require "tensor_stream/opencl/version"
2
2
  require 'tensor_stream'
3
+ require "tensor_stream/opencl/utils"
3
4
  require "tensor_stream/opencl/opencl_evaluator"
4
5
 
5
6
  module TensorStream
@@ -10,13 +10,13 @@ module TensorStream
10
10
  shape = if %i[zeros_like ones_like].include?(tensor.operation)
11
11
  inputs[0].shape
12
12
  elsif !inputs[0].nil?
13
- read_final_result(complete_eval(inputs[0], context))
13
+ complete_eval(inputs[0], context).buffer.to_a
14
14
  else
15
15
  tensor.shape.shape
16
16
  end
17
17
  cache_key = "cons_#{tensor.name}_#{tensor.data_type}_#{shape}"
18
18
  @context[:_cache][:_cl_buffers][cache_key] ||= begin
19
- buffer = allocate_narray_for_type(tensor.data_type, shape.reduce(:*) || 1)
19
+ buffer = OpenCLBuffer.allocate_narray_for_type(tensor.data_type, shape.reduce(:*) || 1)
20
20
  if %i[zeros zeros_like].include?(tensor.operation)
21
21
  buffer.fill!(0)
22
22
  else
@@ -47,7 +47,7 @@ module TensorStream
47
47
  buffer = if cl_buffer
48
48
  cl_buffer.buffer
49
49
  else
50
- allocate_narray_for_type(tensor.data_type, narray_size)
50
+ OpenCLBuffer.allocate_narray_for_type(tensor.data_type, narray_size)
51
51
  end
52
52
 
53
53
  buffer.fill!(value.buffer[0])
@@ -365,8 +365,8 @@ module TensorStream
365
365
  end
366
366
 
367
367
  register_op :reshape do |context, tensor, inputs|
368
- arr = inputs[0]
369
- new_shape = read_final_result(complete_eval(inputs[1], context))
368
+ arr, new_shape = inputs
369
+ new_shape = complete_eval(new_shape, context).buffer.to_a
370
370
 
371
371
  shape = if new_shape.size.zero? && arr.buffer.size == 1
372
372
  new_shape
@@ -389,9 +389,9 @@ module TensorStream
389
389
  res
390
390
  else
391
391
  rank = inputs[0].shape.size
392
- perm = inputs[1].nil? ? (0...rank).to_a.reverse : inputs[1].buffer
392
+ perm = inputs[1].nil? ? (0...rank).to_a.reverse : inputs[1].buffer!
393
393
  new_shape = perm.map { |p| inputs[0].shape[p] }.to_a
394
- output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
394
+ output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name, allocate_host: true)
395
395
  transpose_with_perm(inputs[0].buffer, output_buffer.buffer, inputs[0].shape, new_shape, perm)
396
396
 
397
397
  write_op = _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
@@ -442,6 +442,36 @@ module TensorStream
442
442
  a
443
443
  end
444
444
  end
445
+
446
# Builds the sequence start, start + delta, ... (stopping before limit) on the
# host and uploads it through convert_to_opencl. Results are cached per
# start/limit/delta/data type.
#
# Raises RuntimeError when the step is zero once cast to the output type, or
# when the sign of delta does not move start towards limit.
register_op :range do |context, tensor, inputs|
  start, limit, delta = complete_eval(inputs, context).map { |p| p.buffer.to_a.first }

  # A zero limit is treated as the single-argument form: count from 0 to `start`.
  if limit.zero?
    limit = start
    start = 0
  end

  # Cast the step to the output type *before* validating it. A fractional
  # delta (e.g. 0.5) with a non floating point output type truncates to 0,
  # which previously passed the zero check and made the loop below spin forever.
  # NOTE(review): assumes fp_type? is a pure predicate on the type symbol — confirm.
  floating = fp_type?(tensor.options[:output_type])
  step = floating ? delta.to_f : delta.to_i

  raise " delta !=0 " if step.zero?
  raise " Requires start <= limit when delta > 0" if (start > limit) && delta > 0
  raise " Requires start >= limit when delta < 0" if (start < limit) && delta < 0
  cache_key = "range_#{start}_#{limit}_#{delta}_#{tensor.data_type}"

  @context[:_cache][:_cl_buffers][cache_key] ||= begin
    cur_step = floating ? start.to_f : start.to_i
    r = []
    Kernel.loop do
      break if start == limit
      break if (start < limit) && (cur_step >= limit)
      break if (start > limit) && (cur_step <= limit)

      r << cur_step
      cur_step += step
    end
    convert_to_opencl(r, [r.size], data_type: tensor.options[:output_type], name: tensor.name)
  end
end
445
475
  end
446
476
  end
447
477
  end
@@ -24,7 +24,7 @@ module TensorStream
24
24
  end
25
25
  end
26
26
 
27
- output_buffer = _create_result_buffer(tensor.data_type, [image.height, image.width, channels], "out_#{tensor.name}")
27
+ output_buffer = _create_result_buffer(tensor.data_type, [image.height, image.width, channels], "out_#{tensor.name}", allocate_host: true)
28
28
 
29
29
  image.grayscale! if channels == 1
30
30
  image.pixels.each_with_index do |pixel, index|
@@ -6,8 +6,9 @@ __kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
6
6
  __global <%= c_dtype %>* C) {
7
7
 
8
8
  // Get the index of the current element to be processed
9
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
10
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
9
+ const int index = get_global_id(0);
10
+ const int globalRow = get_global_id(1); // Row ID of C (0..M)
11
+ const int globalCol = get_global_id(2); // Col ID of C (0..N)
11
12
 
12
13
  // Compute a single element (loop over K)
13
14
  <%= c_dtype %> acc = 0.0f;
@@ -16,9 +17,9 @@ __kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
16
17
  int b_index = k*N + globalCol;
17
18
  <% if ta %>a_index = M*k + globalRow;<% end %>
18
19
  <% if tb %>b_index = globalCol*K + k;<% end %>
19
- acc += A[a_index] * B[b_index];
20
+ acc += A[a_index + index * <%= n_a %>] * B[b_index + index * <%= n_b %>];
20
21
  }
21
22
 
22
23
  // Store the result
23
- C[globalRow*N + globalCol] = acc;
24
+ C[index * <%= n %> + globalRow*N + globalCol] = acc;
24
25
  }
@@ -0,0 +1,7 @@
1
+ % c_dtype = dtype_to_c_type(dtype)
2
+ __kernel void random_uniform_<%= dtype %>(const int seed_ptr, const float min, const float max, __global const <%= c_dtype %> *rand_table, __global <%= c_dtype %> *C) {
3
+ // Get the index of the current element to be processed
4
+ const int id = get_global_id(0);
5
+ <%= c_dtype %> rand_value = rand_table[ (seed_ptr + id) % <%= tsize %>];
6
+ C[id] = rand_value * (max - min) + min;
7
+ }
@@ -54,26 +54,28 @@ module TensorStream
54
54
  register_op :mat_mul do |_context, tensor, inputs|
55
55
  a, b = inputs
56
56
 
57
- m = a.shape[0]
58
- n = b.shape[1]
59
- v = b.shape[0]
60
- k = a.shape[1]
57
+ a_matrix_shape = a.shape.dup
58
+ b_matrix_shape = b.shape.dup
59
+
60
+ k = a_matrix_shape.pop
61
+ m = a_matrix_shape.pop
62
+ n = b_matrix_shape.pop
63
+ v = b_matrix_shape.pop
61
64
 
62
65
  if tensor.options[:transpose_a]
63
- m = a.shape[1]
64
- k = a.shape[0]
66
+ m, k = k, m
65
67
  end
66
68
 
67
69
  if tensor.options[:transpose_b]
68
- n = b.shape[0]
69
- v = b.shape[1]
70
+ n, v = v, n
70
71
  end
71
72
 
72
- result_shape = [m, n]
73
+ result_shape = [a_matrix_shape.first, m, n].compact
74
+ work_group = [a_matrix_shape.first || 1, m, n]
73
75
 
74
76
  raise "#{tensor.inputs[0].name} rank must be greater than 1" if a.shape.size < 2
75
77
  raise "#{tensor.inputs[1].name} rank must be greater than 1" if b.shape.size < 2
76
- raise "#{tensor.inputs[0].name} unsupported rank" if b.shape.size != 2 || a.shape.size!=2
78
+ raise "#{tensor.inputs[0].name} unsupported rank" if b.shape.size > 3 || a.shape.size > 3
77
79
  raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
78
80
 
79
81
  dtype = tensor.data_type
@@ -85,7 +87,7 @@ module TensorStream
85
87
  cl_k = OpenCL::Int1.new(k)
86
88
 
87
89
  event_wait_list = build_event_wait_list([a, b])
88
- output_buffer.op = _cl_program('gemm', ta: !!tensor.options[:transpose_a], tb: !!tensor.options[:transpose_b], dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
90
+ output_buffer.op = _cl_program('gemm', ta: !!tensor.options[:transpose_a], tb: !!tensor.options[:transpose_b], n: m * n, n_a: m * k, n_b: n * v, dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_k, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
89
91
 
90
92
  output_buffer
91
93
  end
@@ -1,7 +1,48 @@
1
1
  module TensorStream
2
2
  # Buffer used by the OpenCL evaluator
3
3
  class OpenCLBuffer < Buffer
4
# Stand-in for a host-side buffer that has not been allocated yet.
#
# It records only the element data type and the element count, and answers
# the size queries (size, element_size) needed to compute a byte length.
class LazyBuffer
  # Bytes per element for each supported data type. :unknown is present with
  # a nil size; any type missing from this table is rejected as unsupported.
  ELEMENT_SIZES = {
    float: 4, float32: 4, float16: 4,
    float64: 8,
    int: 4, int32: 4, int64: 4, uint64: 4, uint32: 4, # NArray does not have 64 bit int types
    int16: 2, uint16: 2,
    uint8: 1, int8: 1,
    boolean: 1,
    string: 1,
    unknown: nil
  }.freeze

  # data_type: element type symbol; size: number of elements.
  attr_reader :data_type, :size

  # @param data_type [Symbol] element type of the buffer
  # @param size [Integer] number of elements
  def initialize(data_type, size)
    @data_type = data_type
    @size = size
  end

  # @return [Integer, nil] bytes per element for this buffer's data type
  def element_size
    buffer_size_for_type(@data_type)
  end

  # @param data_type [Symbol] element type to look up
  # @return [Integer, nil] bytes per element, nil for :unknown
  # @raise [RuntimeError] when the data type is not supported
  def buffer_size_for_type(data_type)
    ELEMENT_SIZES.fetch(data_type) { raise "unsupported type #{data_type}" }
  end
end
43
+
4
44
  include ArrayOpsHelper
45
+ include TensorStream::CLEventHelpers
5
46
 
6
47
  attr_accessor :shape, :buffer, :cl_buffer, :op, :owner
7
48
 
@@ -24,15 +65,33 @@ module TensorStream
24
65
  end
25
66
 
26
67
  def inspect
27
- "CLBuffer(shape: #{shape || "?"} data_type: #{data_type}, cl_allocated: #{cl_buffer ? cl_buffer.size : 'unallocated'}) -> raw: #{buffer.to_a}"
68
+ "CLBuffer(name: #{name} shape: #{shape || "?"} data_type: #{data_type}, cl_allocated: #{cl_buffer ? cl_buffer.size : 'unallocated'}) -> raw: #{buffer.to_a}"
69
+ end
70
+
71
# Returns the host-side NArray for this buffer.
#
# An NArray that is already present is returned as-is. A LazyBuffer
# placeholder is first replaced by a freshly allocated NArray of the same
# data type and size; the buffer is then filled by a blocking read of
# cl_buffer that waits on this buffer's pending events.
def buffer!
  current = buffer
  return current if current.is_a?(NArray)

  if current.is_a?(LazyBuffer)
    @buffer = OpenCLBuffer.allocate_narray_for_type(current.data_type, current.size)
  end

  queue = command_queue
  queue.enqueue_read_buffer(cl_buffer, @buffer, blocking: true, event_wait_list: build_event_wait_list([self]))
  @buffer
end
79
+
80
# Command queue of the op attached to this buffer, memoized after the first
# lookup. When op is an Array, the queue of its first element is used.
def command_queue
  return @command_queue if @command_queue

  source_op = op
  source_op = source_op.first if source_op.is_a?(Array)
  @command_queue = source_op.command_queue
end
29
86
 
30
87
  def to_ruby
88
+ buffer! if buffer.is_a?(LazyBuffer)
89
+
31
90
  return [] if buffer.empty?
32
91
 
33
92
  if dirty
34
- op.command_queue.enqueue_read_buffer(cl_buffer, buffer, event_wait_list: [op].compact)
35
- op.command_queue.finish
93
+ command_queue.enqueue_read_buffer(cl_buffer, buffer, event_wait_list: [op].compact)
94
+ command_queue.finish
36
95
  self.dirty = false
37
96
  end
38
97
 
@@ -54,5 +113,28 @@ module TensorStream
54
113
  def self.nil_buffer(owner, name, data_type)
55
114
  OpenCLBuffer.new(owner, name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil)
56
115
  end
116
+
117
# Allocates an NArray of narray_size elements whose element type matches
# data_type.
#
# Returns nil for :unknown and raises RuntimeError for any data type that
# is not listed below.
def self.allocate_narray_for_type(data_type, narray_size)
  case data_type
  when :unknown
    nil
  when :float64
    NArray.float(narray_size)
  when :float, :float32, :float16
    NArray.sfloat(narray_size)
  when :int16, :uint16
    NArray.sint(narray_size)
  when :int, :int32, :int64, :uint64, :uint32 # NArray does not have 64 bit int types
    NArray.int(narray_size)
  when :uint8, :int8, :boolean, :string
    NArray.byte(narray_size)
  else
    raise "unsupported type #{data_type}"
  end
end
57
139
  end
58
140
  end
@@ -13,6 +13,7 @@ require 'tensor_stream/opencl/math_ops'
13
13
  require 'tensor_stream/opencl/nn_ops'
14
14
  require 'tensor_stream/opencl/images_ops'
15
15
  require 'tensor_stream/opencl/array_ops'
16
+ require 'tensor_stream/opencl/random_ops'
16
17
  require 'tensor_stream/helpers/op_helper'
17
18
 
18
19
  module TensorStream
@@ -49,6 +50,8 @@ module TensorStream
49
50
  include TensorStream::OpenCLHelpers::NNOps
50
51
  include TensorStream::OpenCLHelpers::ImagesOps
51
52
  include TensorStream::OpenCLHelpers::ArrayOps
53
+ include TensorStream::OpenCLHelpers::RandomOps
54
+ include TensorStream::CLEventHelpers
52
55
 
53
56
  def initialize(session, device, thread_pool: nil, log_intermediates: false)
54
57
  super
@@ -159,6 +162,9 @@ module TensorStream
159
162
  return [] if buffer.buffer.nil?
160
163
  return buffer if buffer.buffer.size.zero?
161
164
 
165
+ # lazy allocate
166
+ buffer.buffer = OpenCLBuffer.allocate_narray_for_type(buffer.buffer.data_type, buffer.buffer.size) if buffer.buffer.is_a?(OpenCLBuffer::LazyBuffer)
167
+
162
168
  buffer.op = _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
163
169
  buffer
164
170
  end
@@ -167,12 +173,19 @@ module TensorStream
167
173
  def complete_eval(tensor, context)
168
174
  return nil if tensor.nil?
169
175
 
170
- buffer = enqueue_buffer_read(tensor, context)
171
- events = build_event_wait_list([buffer])
176
+ buffers = if tensor.is_a?(Array)
177
+ tensor.map { |t|
178
+ enqueue_buffer_read(t, context)
179
+ }
180
+ else
181
+ [enqueue_buffer_read(tensor, context)]
182
+ end
183
+
184
+ events = build_event_wait_list(buffers)
172
185
  # puts "** wait #{tensor.name} **"
173
186
  OpenCL.wait_for_events(events) unless events.empty?
174
187
  # puts "** done #{tensor.name} **"
175
- buffer
188
+ tensor.is_a?(Array) ? buffers : buffers.first
176
189
  end
177
190
 
178
191
  def self.query_devices_with_score
@@ -355,9 +368,13 @@ module TensorStream
355
368
 
356
369
  register_op :identity do |_context, tensor, inputs|
357
370
  value = inputs[0]
358
- buffer = OpenCLBuffer.new(self, name: tensor.name, data_type: tensor.data_type, shape: value.shape, buffer: value.buffer, cl_buffer: value.cl_buffer)
359
- buffer.op = build_event_wait_list(inputs)
360
- buffer
371
+ if value.is_a?(OutputGroup)
372
+ value
373
+ else
374
+ buffer = OpenCLBuffer.new(self, name: tensor.name, data_type: tensor.data_type, shape: value.shape, buffer: value.buffer, cl_buffer: value.cl_buffer)
375
+ buffer.op = build_event_wait_list(inputs)
376
+ buffer
377
+ end
361
378
  end
362
379
 
363
380
  register_op :assign, noop: true do |context, tensor, inputs|
@@ -773,9 +790,9 @@ module TensorStream
773
790
  value
774
791
  elsif data_type == :string && shape.empty?
775
792
  cl_buffer_size = value[0].bytesize
776
- allocate_narray_for_type(data_type, value[0].bytesize)
793
+ OpenCLBuffer.allocate_narray_for_type(data_type, value[0].bytesize)
777
794
  else
778
- allocate_narray_for_type(data_type, narray_size)
795
+ OpenCLBuffer.allocate_narray_for_type(data_type, narray_size)
779
796
  end
780
797
 
781
798
  return nil if buffer.nil?
@@ -818,39 +835,17 @@ module TensorStream
818
835
  cl_object
819
836
  end
820
837
 
821
- def allocate_narray_for_type(data_type, narray_size)
822
- case data_type
823
- when :float, :float32, :float16
824
- NArray.sfloat(narray_size)
825
- when :float64
826
- NArray.float(narray_size)
827
- when :int, :int32, :int64, :uint64, :uint32 # NArray does not have 64 bit int types
828
- NArray.int(narray_size)
829
- when :int16, :uint16
830
- NArray.sint(narray_size)
831
- when :uint8, :int8
832
- NArray.byte(narray_size)
833
- when :boolean
834
- NArray.byte(narray_size)
835
- when :string
836
- NArray.byte(narray_size)
837
- when :unknown
838
- nil
839
- else
840
- raise "unsupported type #{data_type}"
841
- end
842
- end
843
-
844
- def _create_result_buffer(data_type, shape, name)
838
+ def _create_result_buffer(data_type, shape, name, allocate_host: false)
845
839
  return OpenCLBuffer.nil_buffer(self, name, data_type) if shape == [0]
846
840
 
847
841
  cache_key = "_result_#{name}_#{shape.join('_')}:#{object_id}"
848
842
  @context[:_cache][:_cl_buffers][cache_key] ||= begin
849
843
  # puts "create result buffer #{cache_key}"
850
844
  size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
851
- buffer = allocate_narray_for_type(data_type, size)
852
- cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
853
- OpenCLBuffer.new(self, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
845
+ lazy_buffer = !allocate_host ? OpenCLBuffer::LazyBuffer.new(data_type, size) : OpenCLBuffer.allocate_narray_for_type(data_type, size)
846
+ cl_buffer = _opencl_context.create_buffer(size * lazy_buffer.element_size)
847
+
848
+ OpenCLBuffer.new(self, data_type: data_type, shape: shape, buffer: lazy_buffer, cl_buffer: cl_buffer, name: name)
854
849
  end
855
850
  end
856
851
 
@@ -859,7 +854,7 @@ module TensorStream
859
854
  cache_key ="_sub_result_#{parent_buffer.object_id}_#{name}_#{index}:#{object_id}"
860
855
  @context[:_cache][:_cl_buffers][cache_key] ||= begin
861
856
  size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
862
- buffer = allocate_narray_for_type(data_type, size)
857
+ buffer = OpenCLBuffer.allocate_narray_for_type(data_type, size)
863
858
 
864
859
  if parent_buffer.cl_buffer.associated_memobject.nil?
865
860
  start = index * buffer.size * buffer.element_size
@@ -890,7 +885,7 @@ module TensorStream
890
885
  cache_key = "_sub_result_#{parent_buffer.object_id}_#{name}_#{index}:#{object_id}"
891
886
  @context[:_cache][:_cl_buffers][cache_key] ||= begin
892
887
  size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
893
- buffer = allocate_narray_for_type(data_type, size)
888
+ buffer = OpenCLBuffer.allocate_narray_for_type(data_type, size)
894
889
 
895
890
  if parent_buffer.cl_buffer.associated_memobject.nil?
896
891
  region = OpenCL::BufferRegion::new(start, region_size_in_bytes)
@@ -980,14 +975,6 @@ module TensorStream
980
975
  shape.is_a?(Array) ? shape.size : 0
981
976
  end
982
977
 
983
- def build_event_wait_list(inputs)
984
- if inputs.is_a?(Array)
985
- inputs.flatten.compact.map(&:op).compact.uniq
986
- else
987
- inputs.op ? [inputs.op] : []
988
- end
989
- end
990
-
991
978
  def resolve_placeholder(placeholder, _execution_context = {})
992
979
  return nil if placeholder.nil?
993
980
  return placeholder unless placeholder.is_a?(Placeholder)
@@ -0,0 +1,54 @@
1
module TensorStream
  module OpenCLHelpers
    # Random number generation ops backed by OpenCL kernels.
    #
    # Including this module registers the +random_uniform+ op on the including
    # class (which must provide +register_op+) and defines the
    # +_get_randomizer+ helper on it.
    module RandomOps
      # Number of host-generated random values uploaded to the device as a
      # lookup table.
      RAND_TABLE_SIZE = 1024 * 1024

      def RandomOps.included(klass)
        klass.class_eval do
          # Fills a result buffer of the requested shape by running the
          # "random_uniform" kernel against a cached table of random values.
          register_op :random_uniform do |context, tensor, inputs|
            maxval = tensor.options.fetch(:maxval, 1)
            minval = tensor.options.fetch(:minval, 0)
            seed = tensor.options[:seed]
            cl_buffers = @context[:_cache][:_cl_buffers]

            # The table is converted using the op's data type, so it is cached
            # per data type; a single shared entry would hand a table built for
            # the first caller's type to kernels compiled for another type.
            rand_buffer = cl_buffers["_rand_#{tensor.data_type}"] ||= begin
              cl_buffers["_rand_seed_ptr"] = 0
              random = _get_randomizer(tensor, seed)
              rand_table = Array.new(RAND_TABLE_SIZE) { random.rand }
              convert_to_opencl(rand_table, [RAND_TABLE_SIZE], data_type: tensor.data_type, name: "rand_#{tensor.data_type}")
            end
            cl_buffers["_rand_seed_ptr"] ||= 0

            # Cursor passed to the kernel; presumably the offset into the
            # table where this op starts reading -- TODO confirm against
            # random_uniform.cl.
            seed_ptr = cl_buffers["_rand_seed_ptr"]

            shape = read_final_result(complete_eval(inputs[0], context)) || tensor.shape.shape
            # An empty shape (scalar) reduces to nil and still yields one element.
            element_count = shape.reduce(:*) || 1
            workgroup = [element_count]
            cl_seed_ptr = OpenCL::Int1.new(seed_ptr)
            cl_min = OpenCL::Float1.new(minval)
            cl_max = OpenCL::Float1.new(maxval)

            # Advance by the number of elements actually generated so that
            # consecutive scalar draws do not start from the same cursor.
            cl_buffers["_rand_seed_ptr"] = (seed_ptr + element_count) % RAND_TABLE_SIZE
            buffer = _create_result_buffer(tensor.data_type, shape, tensor.name)
            buffer.op = _cl_program("random_uniform", dtype: tensor.data_type, tsize: RAND_TABLE_SIZE).send(:"random_uniform_#{tensor.data_type}", _opencl_queue, workgroup, cl_seed_ptr, cl_min, cl_max, rand_buffer.cl_buffer, buffer.cl_buffer)
            buffer
          end

          # Picks the Random instance used to fill the random table.
          #
          # * graph seed and op seed: a fresh generator seeded with their XOR
          # * graph seed only: one generator per graph, memoized on the session
          # * op seed only: one generator per operation, memoized on the session
          # * neither: a fresh, unseeded generator
          def _get_randomizer(tensor, seed)
            graph_seed = tensor.graph.random_seed

            if graph_seed && seed
              Random.new(graph_seed ^ seed)
            elsif graph_seed
              @session.randomizer[tensor.graph.object_id] ||= Random.new(graph_seed)
            elsif seed
              @session.randomizer[tensor.operation] ||= Random.new(seed)
            else
              Random.new
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
require 'set'

module TensorStream
  ##
  # Session-level helper for inspecting OpenCL state.
  class OpenCLUtil
    ##
    # Initializes an OpenCL helper bound to the given session.
    def initialize(session)
      @session = session
    end

    ##
    # Retrieves OpenCL memory usage.
    #
    # Sums +cl_buffer.size+ over the buffers cached in the last session
    # context, counting each distinct cl_buffer object only once. Returns 0
    # when the session has no cached buffers yet.
    def get_memory_usage
      context = @session.last_session_context
      cache = context && context[:_cache]
      buffers = cache && cache[:_cl_buffers]
      return 0 unless buffers

      seen = Set.new
      buffers.each_value.sum do |entry|
        # The buffer cache also holds plain bookkeeping values (for example an
        # Integer cursor), which have no device memory attached.
        next 0 unless entry.respond_to?(:cl_buffer)

        cl_buffer = entry.cl_buffer
        next 0 if cl_buffer.nil?

        # Several cache entries may wrap the same cl_buffer; count it once.
        seen.add?(cl_buffer.object_id) ? cl_buffer.size : 0
      end
    end
  end

  # Helpers for assembling OpenCL event dependencies.
  module CLEventHelpers
    # Collects the pending +op+ values of the given input(s).
    #
    # Accepts a single object responding to +op+, a (possibly nested) array of
    # such objects, or nil. Returns an array of unique, non-nil ops.
    def build_event_wait_list(inputs)
      return [] if inputs.nil?
      return inputs.flatten.compact.map(&:op).compact.uniq if inputs.is_a?(Array)

      inputs.op ? [inputs.op] : []
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  module TensorStream
2
2
  module Opencl
3
- VERSION = "0.3.0"
3
+ VERSION = "0.3.1"
4
4
  end
5
5
  end
@@ -53,11 +53,16 @@ b5 = tf.variable(tf.zeros([10]))
53
53
 
54
54
  x_ = tf.reshape(x, [-1, 784])
55
55
 
56
+
56
57
  y1 = tf.nn.relu(tf.matmul(x_, w1) + b1)
57
- y2 = tf.nn.relu(tf.matmul(y1, w2) + b2)
58
- y3 = tf.nn.relu(tf.matmul(y2, w3) + b3)
59
- y4 = tf.nn.relu(tf.matmul(y3, w4) + b4)
60
- ylogits = tf.matmul(y4, w5) + b5
58
+ Y1d = tf.nn.dropout(y1, pkeep)
59
+ y2 = tf.nn.relu(tf.matmul(Y1d, w2) + b2)
60
+ Y2d = tf.nn.dropout(y2, pkeep)
61
+ y3 = tf.nn.relu(tf.matmul(Y2d, w3) + b3)
62
+ Y3d = tf.nn.dropout(y3, pkeep)
63
+ y4 = tf.nn.relu(tf.matmul(Y3d, w4) + b4)
64
+ Y4d = tf.nn.dropout(y4, pkeep)
65
+ ylogits = tf.matmul(Y4d, w5) + b5
61
66
 
62
67
  # model
63
68
  y = tf.nn.softmax(ylogits)
@@ -85,8 +85,8 @@ y3 = tf.nn.relu(tf.nn.conv2d(y2, w3, [1, stride, stride, 1], 'SAME') + b3)
85
85
  # reshape the output from the third convolution for the fully connected layer
86
86
  yy = tf.reshape(y3, [-1, 7 * 7 * M])
87
87
  y4 = tf.nn.relu(tf.matmul(yy, w4) + b4)
88
-
89
- ylogits = tf.matmul(y4, w5) + b5
88
+ YY4 = tf.nn.dropout(y4, pkeep)
89
+ ylogits = tf.matmul(YY4, w5) + b5
90
90
 
91
91
  # model
92
92
  y = tf.nn.softmax(ylogits, name: 'out')
@@ -39,7 +39,7 @@ Gem::Specification.new do |spec|
39
39
  spec.add_development_dependency "awesome_print"
40
40
  spec.add_development_dependency "mnist-learn"
41
41
  spec.add_development_dependency "simplecov"
42
- spec.add_dependency "tensor_stream", "1.0.6"
42
+ spec.add_dependency "tensor_stream", "1.0.7"
43
43
  spec.add_dependency "opencl_ruby_ffi"
44
44
  spec.add_dependency "oily_png"
45
45
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tensor_stream-opencl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Joseph Dayo
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-03-23 00:00:00.000000000 Z
11
+ date: 2019-04-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -114,14 +114,14 @@ dependencies:
114
114
  requirements:
115
115
  - - '='
116
116
  - !ruby/object:Gem::Version
117
- version: 1.0.6
117
+ version: 1.0.7
118
118
  type: :runtime
119
119
  prerelease: false
120
120
  version_requirements: !ruby/object:Gem::Requirement
121
121
  requirements:
122
122
  - - '='
123
123
  - !ruby/object:Gem::Version
124
- version: 1.0.6
124
+ version: 1.0.7
125
125
  - !ruby/object:Gem::Dependency
126
126
  name: opencl_ruby_ffi
127
127
  requirement: !ruby/object:Gem::Requirement
@@ -167,6 +167,7 @@ files:
167
167
  - README.md
168
168
  - Rakefile
169
169
  - benchmark/benchmark.rb
170
+ - benchmark_imac2015_iris.txt
170
171
  - benchmark_intel.txt
171
172
  - benchmark_ryzen.txt
172
173
  - benchmark_ryzen_nvidia.txt
@@ -219,6 +220,7 @@ files:
219
220
  - lib/tensor_stream/opencl/kernels/pack.cl
220
221
  - lib/tensor_stream/opencl/kernels/pow.cl
221
222
  - lib/tensor_stream/opencl/kernels/prod.cl
223
+ - lib/tensor_stream/opencl/kernels/random_uniform.cl
222
224
  - lib/tensor_stream/opencl/kernels/real_div.cl
223
225
  - lib/tensor_stream/opencl/kernels/reciprocal.cl
224
226
  - lib/tensor_stream/opencl/kernels/reduce_axis.cl
@@ -250,6 +252,8 @@ files:
250
252
  - lib/tensor_stream/opencl/opencl_device.rb
251
253
  - lib/tensor_stream/opencl/opencl_evaluator.rb
252
254
  - lib/tensor_stream/opencl/opencl_template_helper.rb
255
+ - lib/tensor_stream/opencl/random_ops.rb
256
+ - lib/tensor_stream/opencl/utils.rb
253
257
  - lib/tensor_stream/opencl/version.rb
254
258
  - samples/build_mnist_model.rb
255
259
  - samples/classify.rb