tensor_stream-opencl 0.2.1 → 0.2.2
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.rubocop.yml +89 -0
- data/lib/tensor_stream/opencl/array_ops.rb +30 -6
- data/lib/tensor_stream/opencl/kernels/apply_adam.cl +2 -2
- data/lib/tensor_stream/opencl/math_ops.rb +3 -1
- data/lib/tensor_stream/opencl/opencl_buffer.rb +3 -2
- data/lib/tensor_stream/opencl/opencl_evaluator.rb +112 -61
- data/lib/tensor_stream/opencl/opencl_template_helper.rb +12 -2
- data/lib/tensor_stream/opencl/version.rb +1 -1
- data/samples/iris.rb +0 -2
- data/samples/mnist_data_2.1.rb +99 -0
- data/samples/mnist_data_2.2.rb +98 -0
- data/samples/multigpu.rb +27 -13
- data/tensor_stream-opencl.gemspec +1 -1
- metadata +7 -6
- data/Gemfile.lock +0 -70
- data/samples/mnist_data.rb +0 -65
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2f7c2e06a5711e3efc8503de82f4c836af70c3b0dfd6ce0f4790f0bb6d3abcb9
+  data.tar.gz: c103f23ba5d27f3a6356ed28b10966b8333f9fb3fabc203924ce357c4c0523c8
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 637ede65bf27b9ce06a755e344e58567c4d1e83e4831115e872d6f2ca0ff778f49f4d4e60af643a920fcf3a1b9033078b0c81f6e6e0f62f2e31f8f9ac4fee89b
+  data.tar.gz: af8482a75b98db484c074c2862d455709ed5563e596819ad445a0c502467c0b5189eef04abbf55060dcbc0640289ce839715b19b3332aa9c21217345726ac3f3
data/.gitignore
CHANGED
data/.rubocop.yml
ADDED
@@ -0,0 +1,89 @@
+AllCops:
+  Exclude:
+    - samples/*
+    - bin/*
+    - spec/**/*
+    - tensor_stream.gemspec
+    - Rakefile
+
+Naming/AccessorMethodName:
+  Exclude:
+    - lib/tensor_stream/utils.rb
+
+Style/StringLiterals:
+  Enabled: false
+
+Layout/TrailingBlankLines:
+  Enabled: false
+
+Metrics/LineLength:
+  Max: 200
+
+Metrics/AbcSize:
+  Enabled: false
+
+Metrics/PerceivedComplexity:
+  Enabled: false
+
+Metrics/MethodLength:
+  Enabled: false
+
+Metrics/CyclomaticComplexity:
+  Enabled: false
+
+Metrics/BlockLength:
+  Exclude:
+    - lib/tensor_stream/math_gradients.rb
+
+Naming/AccessorMethodName:
+  Exclude:
+    - lib/tensor_stream.rb
+    - lib/tensor_stream/control_flow.rb
+    - lib/tensor_stream/graph.rb
+    - lib/tensor_stream/operation.rb
+
+Style/Documentation:
+  Exclude:
+    - lib/tensor_stream/version.rb
+    - lib/tensor_stream/trainer.rb
+    - lib/tensor_stream/nn/nn_ops.rb
+    - lib/tensor_stream/evaluator/evaluator.rb
+
+Lint/UnusedMethodArgument:
+  Exclude:
+    - lib/tensor_stream/train/saver.rb
+    - lib/tensor_stream/ops.rb
+
+Metrics/ParameterLists:
+  Max: 8
+
+Style/PerlBackrefs:
+  Enabled: false
+
+Style/RegexpLiteral:
+  Enabled: false
+
+Naming/MemoizedInstanceVariableName:
+  Enabled: false
+
+Metrics/ModuleLength:
+  Max: 200
+
+Metrics/ClassLength:
+  Max: 250
+  Exclude:
+    - lib/tensor_stream/evaluator/ruby_evaluator.rb
+
+Naming/VariableNumber:
+  Enabled: false
+
+Style/DoubleNegation:
+  Enabled: false
+
+Style/TrailingCommaInHashLiteral:
+  Enabled: false
+
+Naming/UncommunicativeMethodParamName:
+  Exclude:
+    - lib/tensor_stream/evaluator/ruby_evaluator.rb
+    - lib/tensor_stream/ops.rb
data/lib/tensor_stream/opencl/array_ops.rb
CHANGED
@@ -4,6 +4,28 @@ module TensorStream
   module ArrayOps
     def ArrayOps.included(klass)
      klass.class_eval do
+
+        # fast cached 0/1 constant fill
+        register_op %i[zeros ones zeros_like ones_like] do |context, tensor, inputs|
+          shape = if %i[zeros_like ones_like].include?(tensor.operation)
+                    inputs[0].shape
+                  elsif !inputs[0].nil?
+                    read_final_result(complete_eval(inputs[0], context))
+                  else
+                    tensor.shape.shape
+                  end
+          cache_key = "cons_#{tensor.name}_#{tensor.data_type}_#{shape}"
+          @context[:_cache][:_cl_buffers][cache_key] ||= begin
+            buffer = allocate_narray_for_type(tensor.data_type, shape.reduce(:*) || 1)
+            if %i[zeros zeros_like].include?(tensor.operation)
+              buffer.fill!(0)
+            else
+              buffer.fill!(1)
+            end
+            convert_to_opencl(buffer, shape, data_type: tensor.data_type, name: tensor.name)
+          end
+        end
+
        register_op :expand_dims, buffer: true do |_context, tensor, inputs|
          axis = inputs[1].buffer[0]
          shape = inputs[0].shape.dup
@@ -17,8 +39,10 @@ module TensorStream
          shape = inputs[0]
          value = inputs[1]

-
-
+          fill_shape = shape.nil? ? tensor.shape.shape : shape.buffer.to_a
+          narray_size = fill_shape.reduce(:*) || 1
+
+          cl_buffer = get_cached_buffer(tensor.name, fill_shape)

          buffer = if cl_buffer
                     cl_buffer.buffer
@@ -27,7 +51,7 @@ module TensorStream
          end

          buffer.fill!(value.buffer[0])
-          convert_to_opencl(buffer,
+          convert_to_opencl(buffer, fill_shape, data_type: tensor.data_type, name: tensor.name)
        end

        register_op :split do |context, tensor, inputs|
@@ -119,7 +143,7 @@ module TensorStream
          piece_size = shape.reduce(:*) || 1
          work_group = [piece_size]
          cl_offset = OpenCL::Int1.new(offset)
-
+
          _cl_program('split_n', axis: axis,
                                 div: divisors,
                                 mul: multipliers,
@@ -218,7 +242,7 @@ module TensorStream
            shape = shape.map { |s| s == 1 ? nil : s }
          end

-          OpenCLBuffer.new(name: tensor.name, data_type: tensor.data_type,
+          OpenCLBuffer.new(self, name: tensor.name, data_type: tensor.data_type,
                           shape: shape.compact, buffer: arr.buffer,
                           cl_buffer: arr.cl_buffer,
                           op: arr.op)
@@ -350,7 +374,7 @@ module TensorStream
            TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
          end

-          OpenCLBuffer.new(name: tensor.name, data_type: tensor.data_type,
+          OpenCLBuffer.new(self, name: tensor.name, data_type: tensor.data_type,
                           shape: shape, buffer: arr.buffer,
                           cl_buffer: arr.cl_buffer,
                           op: arr.op)
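Note: the new zeros/ones handler above fills an NArray once and memoizes the resulting OpenCL buffer under a name/dtype/shape cache key, so repeated evaluations of the same constant skip both the fill and the host-to-device upload. A minimal sketch of the effect, assuming the OpenCL evaluator is active and using the API as it appears in the samples later in this diff:

    require 'tensor_stream'
    require 'tensor_stream/opencl'

    ts = TensorStream
    zeros = ts.zeros([512, 512])
    sess = ts.session
    sess.run(zeros) # first run allocates, fills and uploads the buffer
    sess.run(zeros) # later runs reuse the cached buffer for this name/dtype/shape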
data/lib/tensor_stream/opencl/kernels/apply_adam.cl
CHANGED
@@ -12,9 +12,9 @@
       __global <%= c_dtype %> *output, __global <%= c_dtype %> *v) {
     // Get the index of the current element to be processed
     const int index = get_global_id(0);
-    <%= c_dtype %> alpha = learning_rate[0] * sqrt(1.0 - beta2_power[0]) / (1.0 - beta1_power[0]);
+    <%= c_dtype %> alpha = learning_rate[0] * sqrt((<%= c_dtype %>)1.0 - beta2_power[0]) / (1.0 - beta1_power[0]);

     momentum[index] += (grad[index] - momentum[index]) * (1.0 - beta1[0]);
     v[index] += (grad[index] * grad[index] - v[index]) * (1.0 - beta2[0]);
-    output[index] -= (momentum[index] * alpha) / ( sqrt(v[index]) + epsilon[0] );
+    output[index] -= (momentum[index] * alpha) / ( sqrt((<%= c_dtype %>)v[index]) + epsilon[0] );
 }
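For reference, the kernel implements the standard Adam update; the two casts above only pin the sqrt overload to the kernel's element type so single-precision builds do not promote to double. In the kernel's own notation (g = grad, m = momentum, v = second moment):

    alpha = lr * sqrt(1 - beta2_power) / (1 - beta1_power)
    m     = m + (g - m) * (1 - beta1)
    v     = v + (g * g - v) * (1 - beta2)
    out   = out - alpha * m / (sqrt(v) + epsilon)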
data/lib/tensor_stream/opencl/math_ops.rb
CHANGED
@@ -80,8 +80,10 @@ module TensorStream

       transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
       transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
-      event_wait_list = build_event_wait_list(
+      event_wait_list = build_event_wait_list([a, b])
+
       output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+
       output_buffer
     end
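The completed build_event_wait_list([a, b]) call matters under the out-of-order command queue created later in this diff: the gemm kernel must wait on the events that produced both input buffers before launching. A sketch of a matmul that takes this path; ts.constant is assumed from tensor_stream core, while the transpose options are the ones read from tensor.options above:

    ts = TensorStream
    a = ts.constant([[1.0, 2.0], [3.0, 4.0]])
    b = ts.constant([[5.0, 6.0], [7.0, 8.0]])
    product = ts.matmul(a, b, transpose_a: false, transpose_b: true)
    ts.session.run(product) # gemm waits on the events that filled a and b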
data/lib/tensor_stream/opencl/opencl_buffer.rb
CHANGED
@@ -3,15 +3,16 @@ module TensorStream
   class OpenCLBuffer < Buffer
     include ArrayOpsHelper

-    attr_accessor :shape, :buffer, :cl_buffer, :op
+    attr_accessor :shape, :buffer, :cl_buffer, :op, :owner

-    def initialize(data_type:, shape:, buffer:, cl_buffer:, op: nil, name: nil)
+    def initialize(owner, data_type:, shape:, buffer:, cl_buffer:, op: nil, name: nil)
       @data_type = data_type
       @shape = shape
       @buffer = buffer
       @cl_buffer = cl_buffer
       @name = name
       @op = op
+      @owner = owner
     end

     def total_elements
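Every OpenCLBuffer construction site in this release now passes the evaluator as the leading owner argument (see the OpenCLBuffer.new(self, ...) changes throughout). The diff does not state the motivation, but a buffer that records its owner can presumably be traced back to the evaluator and command queue that created it, which the new cross-device transition code needs. Constructor shape after the change, a sketch with hypothetical values:

    buffer = OpenCLBuffer.new(evaluator, # owner: a hypothetical evaluator instance
                              name: 'w1', data_type: :float32,
                              shape: [2, 2], buffer: narray, cl_buffer: cl_mem)
    buffer.owner # => evaluator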
data/lib/tensor_stream/opencl/opencl_evaluator.rb
CHANGED
@@ -38,7 +38,8 @@ module TensorStream
   # PURE ruby evaluator used for testing and development
   class OpenclEvaluator < BaseEvaluator
     attr_accessor :retain
-    attr_reader :opencl_device
+    attr_reader :opencl_device, :opencl_context
+    attr_writer :context

     include TensorStream::OpHelper
     include TensorStream::ArrayOpsHelper
@@ -50,14 +51,14 @@ module TensorStream

     def initialize(session, device, thread_pool: nil, log_intermediates: false)
       super
-      _create_opencl_context
+      _create_opencl_context
       @opencl_device = device.native_device
       create_command_queue
     end

     def self.query_supported_devices
       devices = query_devices_with_score
-      devices.sort { |a| a[1] }.
+      devices.sort { |a, b| a[1] <=> b[1] }.map do |d|
         opencl_to_device(d)
       end
     end
@@ -68,16 +69,16 @@ module TensorStream
       opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
     end

-    def self.opencl_to_device(
-      device =
-      index =
+    def self.opencl_to_device(dev)
+      device = dev[0]
+      index = dev[3]
       platform_name = device.platform.name.tr(' ', '_').downcase
       uri = [platform_name, index].join(':')

       device_type = device.type.to_s == 'GPU' ? :gpu : :cpu

-      OpenclDevice.new(uri, device_type, self).tap do |
-
+      OpenclDevice.new(uri, device_type, self).tap do |d|
+        d.native_device = device
       end
     end

@@ -85,14 +86,14 @@ module TensorStream
     # Select the best device available in the system for this evaluator
     def self.default_device
       devices = OpenclEvaluator.query_devices_with_score
-      device = devices.
+      device = devices.max { |a, b| a[1] <=> b[1] }
       opencl_to_device(device)
     end

     # opencl evaluator main entrypoint
     def run(tensor, execution_context)
-
-      #
+      result = complete_eval(tensor, execution_context)
+      # puts "-------------------wait finish------------------------"
       _opencl_queue.finish
       read_final_result(result)
     end
@@ -115,18 +116,22 @@ module TensorStream
     # buffer comes from non-opencl evaluator
     def convert_from_buffer(tensor, result)
       if result.buffer.is_a?(TensorStream::Evaluator::OutputGroup)
-        converted_outputs = result.buffer.outputs.zip(result.buffer.data_types).map
+        converted_outputs = result.buffer.outputs.zip(result.buffer.data_types).map do |output, data_type|
+          convert_to_opencl([output].flatten, shape_eval(output), data_type: data_type, name: tensor.name)
+        end
         TensorStream::Evaluator::OutputGroup.new(converted_outputs, result.buffer.data_types)
       else
         convert_to_opencl([result.buffer].flatten, shape_eval(result.buffer), data_type: result.data_type, name: tensor.name)
       end
     end

+    # Generate OpenCL instruction to read back from GPU memory to Host memory for a tensor
     def enqueue_buffer_read(tensor, context)
       buffer = _run(tensor, context)
       if buffer.is_a?(Array)
         buffer.collect do |b|
           next b if b.buffer.size.zero?
+
           b.op = _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: build_event_wait_list([b]))
           b
         end
@@ -135,6 +140,7 @@ module TensorStream
       return buffer if buffer.nil?
       return [] if buffer.buffer.nil?
       return buffer if buffer.buffer.size.zero?
+
       buffer.op = _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
       buffer
     end
@@ -145,7 +151,7 @@ module TensorStream

       buffer = enqueue_buffer_read(tensor, context)
       events = build_event_wait_list([buffer])
-      # puts "wait #{tensor.name}"
+      # puts "** wait #{tensor.name} **"
       OpenCL.wait_for_events(events) unless events.empty?
       buffer
     end
@@ -154,6 +160,7 @@ module TensorStream
       OpenCL.platforms.flat_map do |p|
         p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
           score = 0
+
           if d.type.to_s == 'CPU'
             score += 1
           elsif d.type.to_s == 'GPU'
@@ -162,8 +169,7 @@ module TensorStream

           score += 1000 if d.platform.name == 'NVIDIA CUDA'

-          score += d.max_compute_units
-          score += d.max_clock_frequency
+          score += d.max_compute_units * d.max_clock_frequency

           [d, score, p.name, index]
         end
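The scoring change replaces a sum with a product, which tracks raw throughput far better. Worked example with a hypothetical 20-compute-unit GPU at 1,000 MHz: the old formula contributed 20 + 1000 = 1,020 on top of the type and platform bonuses, so a 4-unit part at 1,016 MHz (4 + 1016 = 1,020) could tie it; the new formula contributes 20 * 1000 = 20,000 versus 4 * 1016 = 4,064, ranking the wider device decisively first.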
@@ -172,6 +178,31 @@ module TensorStream

     protected

+    ##
+    # called when passing control to another evaluator
+    def perform_transition(tensor, input, next_evaluator, execution_context)
+      if next_evaluator.is_a?(OpenclEvaluator) # OpenCL but different device?
+        # create opencl buffer for this tensor
+        next_evaluator.context = @context
+
+        foreign_buffer = next_evaluator._run(input, execution_context)
+        event_list = build_event_wait_list([foreign_buffer])
+
+        output_buffer = _create_result_buffer(input.data_type, foreign_buffer.shape, "t_#{tensor.name}_#{input.name}")
+        output_buffer.op = if next_evaluator.opencl_context == @opencl_context
+                             _opencl_queue.enqueue_copy_buffer(foreign_buffer.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_list)
+                           else
+                             puts "wait finish transition ** #{input.name} **"
+                             read_event = next_evaluator._opencl_queue.enqueue_read_buffer(foreign_buffer.cl_buffer, output_buffer.buffer, event_wait_list: event_list)
+                             OpenCL.wait_for_events(read_event)
+                             _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
+                           end
+        output_buffer
+      else
+        super
+      end
+    end
+
     def prepare_input(tensor, context, options = {})
       return nil unless tensor

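In short, the new transition path keeps a tensor on the GPU whenever both evaluators share one OpenCL context (a device-side enqueue_copy_buffer); only when contexts differ does it round-trip through host memory with a blocking read on the foreign queue followed by a write on the local one. A minimal graph that exercises this path, modeled on the multigpu.rb sample later in this diff (device strings are illustrative):

    ts = TensorStream
    a = ts.placeholder(:float32, shape: [64, 64])
    prod = ts.device('/device:GPU:0') { ts.matmul(a, a) }
    total = ts.device('/device:GPU:1') { ts.reduce_sum(prod) } # prod transitions GPU:0 -> GPU:1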
@@ -195,8 +226,19 @@ module TensorStream
       buffer.to_ruby
     end

-    def _create_opencl_context(
-
+    def _create_opencl_context(device = nil)
+      if device.nil?
+        @@global_opencl_context ||= begin
+          all_devices = OpenclEvaluator.query_supported_devices.map(&:native_device)
+          puts "global context created for #{all_devices}"
+          OpenCL.create_context(all_devices)
+        end
+
+        @opencl_context = @@global_opencl_context
+      else
+        puts "context created for #{device.native_device}"
+        @opencl_context = OpenCL.create_context(device.native_device)
+      end
     end

     def create_command_queue
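Because the class-level @@global_opencl_context spans every supported device, evaluators built without an explicit device all share one context, which is what makes the cheap copy branch in perform_transition available. A sketch of the expectation, assuming the constant path TensorStream::Evaluator::OpenclEvaluator and an existing session object (both assumptions; the diff shows only the class body):

    gpu0, gpu1 = TensorStream::Evaluator::OpenclEvaluator.query_supported_devices[0, 2]
    e0 = TensorStream::Evaluator::OpenclEvaluator.new(session, gpu0)
    e1 = TensorStream::Evaluator::OpenclEvaluator.new(session, gpu1)
    e0.opencl_context.equal?(e1.opencl_context) # => true, both use the shared global context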
@@ -205,6 +247,7 @@ module TensorStream
       properties = []
       properties << OpenCL::CommandQueue::PROFILING_ENABLE if supported_proprties.include?('PROFILING_ENABLE')
       properties << OpenCL::CommandQueue::OUT_OF_ORDER_EXEC_MODE_ENABLE if supported_proprties.include?('OUT_OF_ORDER_EXEC_MODE_ENABLE')
+      # puts "creating queue with properties #{supported_proprties}"
       @command_queue = _opencl_context.create_command_queue(opencl_device, properties: properties)
     end

@@ -222,28 +265,32 @@ module TensorStream

     def _cl_program(kernel, args = {})
       suffix = args.collect { |k, v| "#{k}.#{escape_arg_content(v)}" }.join('.')
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+      kernel_cache_key = "_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"
+      @context[:_cache][kernel_cache_key] ||=
+        begin
+          # puts "building #{kernel_cache_key}"
+          file_path = File.join('/tmp', "#{kernel}.#{suffix}.cl")
+          source = if File.exist?(file_path) && ENV['TS_OPENCL_FILE_CACHE']
+                     File.read(file_path)
+                   else
+                     filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
+                     raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
+
+                     source = File.read(filename)
+                     source = OpenclTemplateHelper.new(source).generate(args)
+                     File.write(file_path, source) if ENV['TS_OPENCL_FILE_CACHE']
+                     source
+                   end
+          program = _opencl_context.create_program_with_source(source)
+          program.build
+        rescue OpenCL::Error::BUILD_PROGRAM_FAILURE => e
+          puts "OpenCL Compile error: #{program.build_log}"
+          raise e
+        end
     end

     def escape_arg_content(value)
-      return value.tr(' ','_') if value.is_a?(String)
+      return value.tr(' ', '_') if value.is_a?(String)
       return value.join('-') if value.is_a?(Array)

       value
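The rebuilt _cl_program memoizes compiled programs per kernel-plus-arguments signature and, when TS_OPENCL_FILE_CACHE is set, also persists the ERB-expanded sources under /tmp so later runs skip template generation. A usage sketch:

    ENV['TS_OPENCL_FILE_CACHE'] = '1' # set before any kernel is built
    require 'tensor_stream'
    require 'tensor_stream/opencl'
    # generated sources land in /tmp/<kernel>.<arg suffix>.cl and are reread while present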
@@ -257,9 +304,8 @@ module TensorStream

       child_context = execution_context.dup
       res = if tensor.is_a?(Operation)
-              if !
-
-                convert_from_buffer(tensor, result)
+              if !on_same_device?(tensor) # tensor is on another device or evaluator
+                perform_transition(tensor, tensor, @context[:_cache][:placement][tensor.name][1], execution_context)
               else
                 eval_operation(tensor, child_context)
               end
@@ -295,7 +341,7 @@ module TensorStream

     register_op :identity do |context, tensor, inputs|
       value = inputs[0]
-      buffer = OpenCLBuffer.new(name: tensor.name, data_type: tensor.data_type, shape: value.shape, buffer: value.buffer, cl_buffer: value.cl_buffer)
+      buffer = OpenCLBuffer.new(self, name: tensor.name, data_type: tensor.data_type, shape: value.shape, buffer: value.buffer, cl_buffer: value.cl_buffer)
       buffer.op = build_event_wait_list(inputs)
       buffer
     end
@@ -375,6 +421,7 @@ module TensorStream

     register_op :flow_group do |_context, _tensor, inputs|
       events = build_event_wait_list(inputs)
+      # puts "** wait for event flow_group**"
       OpenCL.wait_for_events(events) unless events.empty?
       nil
     end
@@ -387,8 +434,10 @@ module TensorStream
     cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
     return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
     return @context[cache_key] if @context.key?(cache_key)
-
+
+    # puts "opencl eval #{object_id} #{tensor.name}"
     invoke(tensor, child_context).tap do |result|
+      # puts "result done opencl #{object_id}: #{tensor.name}"
       if tensor.breakpoint
         a = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
         b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
@@ -603,6 +652,7 @@ module TensorStream
     end

     def convert_to_opencl(value, shape, data_type: nil, name: nil)
+      # puts "convert_to_opencl called for #{name}"
       value = [value] if !value.is_a?(Array) && !value.is_a?(NArray)

       cache_key = "_cl_object_#{name}:#{shape.join('_')}:#{object_id}"
@@ -630,7 +680,7 @@ module TensorStream
         _opencl_context.create_buffer(cl_buffer_size * buffer.element_size)
       end

-      @context[:_cache][cache_key] = OpenCLBuffer.new(name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
+      @context[:_cache][cache_key] = OpenCLBuffer.new(self, name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
     end
     if data_type == :string
       value[0].each_byte.with_index do |c, index|
@@ -664,15 +714,15 @@ module TensorStream

     def allocate_narray_for_type(data_type, narray_size)
       case data_type
-      when :float, :float32
+      when :float, :float32, :float16
        NArray.sfloat(narray_size)
       when :float64
        NArray.float(narray_size)
-      when :int, :int32, :int64
+      when :int, :int32, :int64, :uint64, :uint32 # NArray does not have 64 bit int types
        NArray.int(narray_size)
-      when :int16
+      when :int16, :uint16
        NArray.sint(narray_size)
-      when :uint8
+      when :uint8, :int8
        NArray.byte(narray_size)
       when :boolean
        NArray.byte(narray_size)
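The widened case folds unsigned and 64-bit integer dtypes onto the nearest NArray storage class, since NArray offers no unsigned or 64-bit integer types; values outside the storage range will silently wrap. The resulting widths, using the constructors from the case above:

    NArray.sfloat(n) # 32-bit float:  :float, :float32, :float16 (stored widened)
    NArray.float(n)  # 64-bit float:  :float64
    NArray.int(n)    # 32-bit signed: :int, :int32, :int64, :uint32, :uint64
    NArray.sint(n)   # 16-bit signed: :int16, :uint16
    NArray.byte(n)   # 8-bit:         :uint8, :int8, :boolean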
@@ -686,12 +736,14 @@ module TensorStream
     end

     def _create_result_buffer(data_type, shape, name)
-      return OpenCLBuffer.new(name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]
-
+      return OpenCLBuffer.new(self, name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]
+      cache_key = "_result_#{name}_#{shape.join('_')}:#{object_id}"
+      @context[:_cache][:_cl_buffers][cache_key] ||= begin
+        # puts "create result buffer #{cache_key}"
        size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
        buffer = allocate_narray_for_type(data_type, size)
        cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
-        OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
+        OpenCLBuffer.new(self, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
       end
     end

@@ -706,7 +758,7 @@ module TensorStream
       start = index * buffer.size * buffer.element_size
       region = OpenCL::BufferRegion::new(start, buffer.size * buffer.element_size)
       cl_buffer = parent_buffer.cl_buffer.create_sub_buffer(OpenCL::BUFFER_CREATE_TYPE_REGION, region)
-      OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
+      OpenCLBuffer.new(self, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
     else
       _create_result_buffer(tensor.data_type, shape, name)
     end
@@ -728,7 +780,7 @@ module TensorStream

     # create sub buffers of different sizes
     def _create_variable_result_sub_buffer(parent_buffer, index, start, region_size_in_bytes, data_type, shape, name)
-      cache_key ="_sub_result_#{parent_buffer.object_id}_#{name}_#{index}:#{object_id}"
+      cache_key = "_sub_result_#{parent_buffer.object_id}_#{name}_#{index}:#{object_id}"
       @context[:_cache][:_cl_buffers][cache_key] ||= begin
         size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
         buffer = allocate_narray_for_type(data_type, size)
@@ -736,7 +788,7 @@ module TensorStream
       if parent_buffer.cl_buffer.associated_memobject.nil?
         region = OpenCL::BufferRegion::new(start, region_size_in_bytes)
         cl_buffer = parent_buffer.cl_buffer.create_sub_buffer(OpenCL::BUFFER_CREATE_TYPE_REGION, region)
-        OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: "#{name}/sub")
+        OpenCLBuffer.new(self, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: "#{name}/sub")
       else
         _create_result_buffer(tensor.data_type, shape, name)
       end
@@ -806,6 +858,7 @@ module TensorStream
       convert_to_opencl(red, [], data_type: tensor.data_type, name: tensor.name)
     else
       return input if input.shape.empty?
+
       value = input.buffer.reshape(*input.shape.reverse)
       rank = input.shape.size - 1

@@ -862,17 +915,15 @@ module TensorStream

     def resolve_placeholder(placeholder, _execution_context = {})
       return nil if placeholder.nil?
+      return placeholder unless placeholder.is_a?(Placeholder)

-      var =
-
-        raise "missing placeholder #{placeholder.name}" if c.nil?
-      end
-      else
-        placeholder
-      end
+      var = @context[placeholder.name.to_sym]
+      raise "missing placeholder #{placeholder.name}" if var.nil?

-
-
+      cache_key = "#{placeholder.graph.object_id}_opencl_#{placeholder.name}_p:#{object_id}"
+      @context[cache_key] ||= begin
+        convert_to_opencl(var, shape_eval(var), data_type: placeholder.data_type, name: placeholder.name) unless var.is_a?(Tensor)
+      end
     end

     def all_true?(arr)
data/lib/tensor_stream/opencl/opencl_template_helper.rb
CHANGED
@@ -32,10 +32,18 @@ class OpenclTemplateHelper
     case dtype.to_s
     when 'float64'
       'double'
-    when 'float32', 'float'
+    when 'float32', 'float', 'float16'
       'float'
+    when 'uint32'
+      'uint'
+    when 'int64'
+      'int' # 'long' - NArray does not support 64bit int types
+    when 'uint64'
+      'uint' # 'ulong' - NArray does not support 64bit int types
     when 'int32', 'int'
       'int'
+    when 'uint16'
+      'ushort'
     when 'int16'
       'short'
     when 'uint8'
@@ -51,10 +59,12 @@ class OpenclTemplateHelper
     case dtype.to_s
     when 'float64'
       'DBL_MIN'
-    when 'float32', 'float'
+    when 'float32', 'float', 'float16'
       'FLT_MIN'
     when 'int32', 'int'
       'INT_MIN'
+    when 'uint32', 'uint16'
+      '0'
     when 'int16'
       'SHRT_MIN'
     when 'int8'
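These two tables keep the ERB kernel templates consistent with the NArray storage choices in the evaluator: the first picks the OpenCL C type substituted for <%= c_dtype %>, and the second supplies the matching minimum-value sentinel (plain 0 for the unsigned types, which have no negative range). A sketch of the substitution, assuming generate accepts a dtype argument the way _cl_program passes one:

    source = "__kernel void f(__global <%= c_dtype %> *out) { out[0] = 0; }"
    OpenclTemplateHelper.new(source).generate(dtype: 'uint16')
    # => "__kernel void f(__global ushort *out) { out[0] = 0; }"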
data/samples/iris.rb
CHANGED
data/samples/mnist_data_2.1.rb
ADDED
@@ -0,0 +1,99 @@
+# A ruby port of the example code discussed by Martin Gorner in
+# "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)""
+#
+# https://www.youtube.com/watch?v=u4alGiomYP4
+#
+# Requirements:
+# mnist-learn gem
+# opencl_ruby_ffi gem
+require "bundler/setup"
+require 'tensor_stream'
+require 'mnist-learn'
+require 'pry-byebug'
+
+# Enable OpenCL hardware accelerated computation, not using OpenCL can be very slow
+require 'tensor_stream/opencl'
+
+tf = TensorStream
+
+puts "Tensorstream version #{tf.__version__} with OpenCL lib #{TensorStream::Opencl::VERSION}"
+tf.set_random_seed(0)
+
+# Import MNIST data
+puts "downloading minst data"
+mnist = Mnist.read_data_sets('/tmp/data', one_hot: true)
+puts "downloading finished"
+
+x = tf.placeholder(:float32, shape: [nil, 784])
+
+K = 200
+L = 100
+M = 60
+N = 30
+
+
+w1 = tf.variable(tf.random_normal([784, K]))
+b1 = tf.variable(tf.zeros([K]))
+
+w2 = tf.variable(tf.random_normal([K, L]))
+b2 = tf.variable(tf.zeros([L]))
+
+w3 = tf.variable(tf.random_normal([L, M]))
+b3 = tf.variable(tf.zeros([M]))
+
+w4 = tf.variable(tf.random_normal([M, N]))
+b4 = tf.variable(tf.zeros([N]))
+
+w5 = tf.variable(tf.random_normal([N, 10]))
+b5 = tf.variable(tf.zeros([10]))
+
+x_ = tf.reshape(x, [-1, 784])
+
+y1 = tf.sigmoid(tf.matmul(x_, w1) + b1)
+y2 = tf.sigmoid(tf.matmul(y1, w2) + b2)
+y3 = tf.sigmoid(tf.matmul(y2, w3) + b3)
+y4 = tf.sigmoid(tf.matmul(y3, w4) + b4)
+ylogits = tf.matmul(y4, w5) + b5
+
+# model
+y = tf.nn.softmax(ylogits)
+
+y_ = tf.placeholder(:float32, shape: [nil, 10])
+
+# cross-entropy loss function (= -sum(Y_i * log(Yi)) ), normalised for batches of 100 images
+# TensorFlow provides the softmax_cross_entropy_with_logits function to avoid numerical stability
+# problems with log(0) which is NaN
+cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits: ylogits, labels: y_)
+cross_entropy = tf.reduce_mean(cross_entropy)*100
+
+is_correct = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+accuracy = tf.reduce_mean(tf.cast(is_correct, :float32))
+
+# training step, learning rate = 0.003
+learning_rate = 0.003
+train_step = TensorStream::Train::AdamOptimizer.new(learning_rate).minimize(cross_entropy)
+
+sess = tf.session
+init = tf.global_variables_initializer
+sess.run(init)
+
+mnist_train = mnist.train
+test_data = { x => mnist.test.images, y_ => mnist.test.labels }
+
+(0..10000).each do |i|
+  # load batch of images and correct answers
+  batch_x, batch_y = mnist_train.next_batch(100)
+  train_data = { x => batch_x, y_ => batch_y }
+
+  # train
+  sess.run(train_step, feed_dict: train_data)
+  if (i % 50 == 0)
+    # success? add code to print it
+    a_train, c_train = sess.run([accuracy, cross_entropy], feed_dict: train_data)
+
+    # success on test data?
+    a_test, c_test = sess.run([accuracy, cross_entropy], feed_dict: test_data)
+    puts "#{i} train accuracy #{a_train}, error #{c_train} test accuracy #{a_test}, error #{c_test}"
+  end
+end
+
data/samples/mnist_data_2.2.rb
ADDED
@@ -0,0 +1,98 @@
+# A ruby port of the example code discussed by Martin Gorner in
+# "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)""
+#
+# https://www.youtube.com/watch?v=u4alGiomYP4
+#
+# Requirements:
+# mnist-learn gem
+# opencl_ruby_ffi gem
+require "bundler/setup"
+require 'tensor_stream'
+require 'mnist-learn'
+require 'pry-byebug'
+
+# Enable OpenCL hardware accelerated computation, not using OpenCL can be very slow
+require 'tensor_stream/opencl'
+
+tf = TensorStream
+
+# Import MNIST data
+puts "downloading minst data"
+mnist = Mnist.read_data_sets('/tmp/data', one_hot: true)
+puts "downloading finished"
+
+x = tf.placeholder(:float32, shape: [nil, 784])
+
+K = 200
+L = 100
+M = 60
+N = 30
+
+
+w1 = tf.variable(tf.random_normal([784, K]))
+b1 = tf.variable(tf.zeros([K]))
+
+w2 = tf.variable(tf.random_normal([K, L]))
+b2 = tf.variable(tf.zeros([L]))
+
+w3 = tf.variable(tf.random_normal([L, M]))
+b3 = tf.variable(tf.zeros([M]))
+
+w4 = tf.variable(tf.random_normal([M, N]))
+b4 = tf.variable(tf.zeros([N]))
+
+w5 = tf.variable(tf.random_normal([N, 10]))
+b5 = tf.variable(tf.zeros([10]))
+
+x_ = tf.reshape(x, [-1, 784])
+
+y1 = tf.nn.relu(tf.matmul(x_, w1) + b1)
+y2 = tf.nn.relu(tf.matmul(y1, w2) + b2)
+y3 = tf.nn.relu(tf.matmul(y2, w3) + b3)
+y4 = tf.nn.relu(tf.matmul(y3, w4) + b4)
+ylogits = tf.matmul(y4, w5) + b5
+
+# model
+y = tf.nn.softmax(ylogits)
+
+y_ = tf.placeholder(:float32, shape: [nil, 10])
+
+# cross-entropy loss function (= -sum(Y_i * log(Yi)) ), normalised for batches of 100 images
+# TensorFlow provides the softmax_cross_entropy_with_logits function to avoid numerical stability
+# problems with log(0) which is NaN
+cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits: ylogits, labels: y_)
+cross_entropy = tf.reduce_mean(cross_entropy)*100
+
+is_correct = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+accuracy = tf.reduce_mean(tf.cast(is_correct, :float32))
+
+# training step, learning rate = 0.003
+learning_rate = 0.003
+train_step = TensorStream::Train::AdamOptimizer.new(learning_rate).minimize(cross_entropy)
+
+sess = tf.session
+# Add ops to save and restore all the variables.
+saver = tf::Train::Saver.new
+init = tf.global_variables_initializer
+
+sess.run(init)
+mnist_train = mnist.train
+test_data = { x => mnist.test.images, y_ => mnist.test.labels }
+
+(0..1000).each do |i|
+  # load batch of images and correct answers
+  batch_x, batch_y = mnist_train.next_batch(100)
+  train_data = { x => batch_x, y_ => batch_y }
+
+  # train
+  sess.run(train_step, feed_dict: train_data)
+  if (i % 50 == 0)
+    # success? add code to print it
+    a_train, c_train = sess.run([accuracy, cross_entropy], feed_dict: train_data)
+
+    # success on test data?
+    a_test, c_test = sess.run([accuracy, cross_entropy], feed_dict: test_data)
+    puts "#{i} train accuracy #{a_train}, error #{c_train} test accuracy #{a_test}, error #{c_test}"
+  end
+end
+
data/samples/multigpu.rb
CHANGED
@@ -11,7 +11,6 @@ DIMEN = 1024
 A = ts.random_uniform([DIMEN, DIMEN]).eval
 B = ts.random_uniform([DIMEN, DIMEN]).eval

-
 # Create a graph to store results
 c1 = []
 c2 = []
@@ -35,17 +34,24 @@ sum = ts.device('/device:GPU:0') do
   ts.add_n(c1)
 end

-t1_1 =
+t1_1 = nil
 t2_1 = nil
-
-ts.session(log_device_placement: true) do |sess|
+puts "===================== starting single GPU test ================"
+ts.session(log_device_placement: true, profile_enabled: true) do |sess|
+  puts "-- warmup ---"
+  sess.run(sum, feed_dict: { a => A, b => B}) # warmup
+  puts "-- warmup ---"
+  time = Time.now
+  t1_1 = time.to_i * (10 ** 9) + time.nsec
   sess.run(sum, feed_dict: { a => A, b => B})
-
+  time = Time.now
+  t2_1 = time.to_i * (10 ** 9) + time.nsec
 end
-
+puts "===================== end single GPU test ================"
+puts "===================== MULTI GPU text ================"
 # Multi GPU computing
 # GPU:0 computes A^n
-ts.device('/device:GPU:
+ts.device('/device:GPU:0') do
   a = ts.placeholder(:float32, shape: [DIMEN, DIMEN])
   c2 << matpow(a, n)
 end
@@ -56,18 +62,26 @@ ts.device('/device:GPU:1') do
   c2 << matpow(b, n)
 end

-ts.device('/device:GPU:
+ts.device('/device:GPU:0') do
   sum = ts.add_n(c2) #Addition of all elements in c2, i.e. A^n + B^n
 end

-t1_2 =
+t1_2 = nil
 t2_2 = nil
-
+
+ts.session(log_device_placement: true, profile_enabled: true) do |sess|
   # Run the op.
+  puts "-- warmup ---"
+  sess.run(sum, feed_dict: {a => A, b => B}) # warm up
+  puts "-- warmup ---"
+  time = Time.now
+  t1_2 = time.to_i * (10 ** 9) + time.nsec
+  puts "================ starting multiGPU test ==============="
   sess.run(sum, feed_dict: {a => A, b => B})
-
+  time = Time.now
+  t2_2 = time.to_i * (10 ** 9) + time.nsec
 end

-
-
+puts("Single GPU computation time: " + ((t2_1-t1_1)/ 1000000.to_f).to_s)
+puts("Multi GPU computation time: " + ((t2_2-t1_2)/ 1000000.to_f).to_s)
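The warmup-then-measure pattern above builds its nanosecond timestamps from Time.now via time.to_i * (10 ** 9) + time.nsec, a wall-clock source that is subject to clock adjustments mid-benchmark. A sketch of the equivalent measurement on Ruby's monotonic clock, which is the safer choice for intervals:

    t0 = Process.clock_gettime(Process::CLOCK_MONOTONIC, :nanosecond)
    # ... sess.run(sum, feed_dict: { a => A, b => B }) ...
    elapsed_ms = (Process.clock_gettime(Process::CLOCK_MONOTONIC, :nanosecond) - t0) / 1_000_000.0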
data/tensor_stream-opencl.gemspec
CHANGED
@@ -38,7 +38,7 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "pry-byebug"
   spec.add_development_dependency "awesome_print"
   spec.add_development_dependency "mnist-learn"
-  spec.add_dependency "tensor_stream", "~> 0.9.
+  spec.add_dependency "tensor_stream", "~> 0.9.2"
   spec.add_dependency "opencl_ruby_ffi"
   spec.add_dependency "oily_png"
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tensor_stream-opencl
 version: !ruby/object:Gem::Version
-  version: 0.2.
+  version: 0.2.2
 platform: ruby
 authors:
 - Joseph Dayo
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-10-
+date: 2018-10-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -100,14 +100,14 @@ dependencies:
   requirements:
   - - "~>"
     - !ruby/object:Gem::Version
-      version: 0.9.
+      version: 0.9.2
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: 0.9.
+        version: 0.9.2
 - !ruby/object:Gem::Dependency
   name: opencl_ruby_ffi
   requirement: !ruby/object:Gem::Requirement
@@ -145,10 +145,10 @@ extra_rdoc_files: []
 files:
 - ".gitignore"
 - ".rspec"
+- ".rubocop.yml"
 - ".travis.yml"
 - CODE_OF_CONDUCT.md
 - Gemfile
-- Gemfile.lock
 - LICENSE.txt
 - README.md
 - Rakefile
@@ -226,7 +226,8 @@ files:
 - lib/tensor_stream/opencl/version.rb
 - samples/iris.data
 - samples/iris.rb
-- samples/
+- samples/mnist_data_2.1.rb
+- samples/mnist_data_2.2.rb
 - samples/multigpu.rb
 - samples/nearest_neighbor.rb
 - samples/rnn.rb
data/Gemfile.lock
DELETED
@@ -1,70 +0,0 @@
-PATH
-  remote: .
-  specs:
-    tensor_stream-opencl (0.2.1)
-      oily_png
-      opencl_ruby_ffi
-      tensor_stream (~> 0.9.0)
-
-GEM
-  remote: https://rubygems.org/
-  specs:
-    awesome_print (1.8.0)
-    byebug (10.0.2)
-    chunky_png (1.3.10)
-    coderay (1.1.2)
-    concurrent-ruby (1.0.5)
-    deep_merge (1.2.1)
-    diff-lcs (1.3)
-    ffi (1.9.25)
-    method_source (0.9.0)
-    mnist-learn (0.1.1)
-    narray (0.6.1.2)
-    narray_ffi (1.4.4)
-      ffi (~> 1.9, >= 1.9.3)
-      narray (~> 0.6, >= 0.6.0.8)
-    oily_png (1.2.1)
-      chunky_png (~> 1.3.7)
-    opencl_ruby_ffi (1.3.4)
-      ffi (~> 1.9, >= 1.9.3)
-      narray (~> 0.6, >= 0.6.0.8)
-      narray_ffi (~> 1.0, >= 1.0.0)
-    pry (0.11.3)
-      coderay (~> 1.1.0)
-      method_source (~> 0.9.0)
-    pry-byebug (3.6.0)
-      byebug (~> 10.0)
-      pry (~> 0.10)
-    rake (10.5.0)
-    rspec (3.8.0)
-      rspec-core (~> 3.8.0)
-      rspec-expectations (~> 3.8.0)
-      rspec-mocks (~> 3.8.0)
-    rspec-core (3.8.0)
-      rspec-support (~> 3.8.0)
-    rspec-expectations (3.8.1)
-      diff-lcs (>= 1.2.0, < 2.0)
-      rspec-support (~> 3.8.0)
-    rspec-mocks (3.8.0)
-      diff-lcs (>= 1.2.0, < 2.0)
-      rspec-support (~> 3.8.0)
-    rspec-support (3.8.0)
-    tensor_stream (0.9.0)
-      chunky_png
-      concurrent-ruby
-      deep_merge
-
-PLATFORMS
-  ruby
-
-DEPENDENCIES
-  awesome_print
-  bundler (~> 1.16)
-  mnist-learn
-  pry-byebug
-  rake (~> 10.0)
-  rspec (~> 3.0)
-  tensor_stream-opencl!
-
-BUNDLED WITH
-   1.16.2
data/samples/mnist_data.rb
DELETED
@@ -1,65 +0,0 @@
-# A ruby port of the example code discussed by Martin Gorner in
-# "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)""
-#
-# https://www.youtube.com/watch?v=u4alGiomYP4
-#
-# Requirements:
-# mnist-learn gem
-# opencl_ruby_ffi gem
-require "bundler/setup"
-require 'tensor_stream'
-require 'mnist-learn'
-
-# Enable OpenCL hardware accelerated computation, not using OpenCL can be very slow
-# require 'tensor_stream/opencl'
-
-tf = TensorStream
-
-# Import MNIST data
-puts "downloading minst data"
-mnist = Mnist.read_data_sets('/tmp/data', one_hot: true)
-puts "downloading finished"
-
-x = tf.placeholder(:float32, shape: [nil, 784])
-w = tf.variable(tf.zeros([784, 10]))
-b = tf.variable(tf.zeros([10]))
-
-
-
-# model
-y = tf.nn.softmax(tf.matmul(tf.reshape(x, [-1, 784]), w) + b)
-
-y_ = tf.placeholder(:float32, shape: [nil, 10])
-
-# loss function
-cross_entropy = -tf.reduce_sum(y_ * tf.log(y))
-
-is_correct = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
-accuracy = tf.reduce_mean(tf.cast(is_correct, :float32))
-
-optimizer = TensorStream::Train::AdamOptimizer.new
-train_step = optimizer.minimize(cross_entropy)
-
-sess = tf.session
-init = tf.global_variables_initializer
-sess.run(init)
-
-(0...1000).each do |i|
-  # load batch of images and correct answers
-  batch_x, batch_y = mnist.train.next_batch(100)
-  train_data = { x => batch_x, y_ => batch_y }
-
-  # train
-  sess.run(train_step, feed_dict: train_data)
-  if (i % 10 == 0)
-    # success? add code to print it
-    a, c = sess.run([accuracy, cross_entropy], feed_dict: train_data)
-    puts "#{i} train accuracy #{a}, error #{c}"
-
-    # success on test data?
-    test_data = { x => mnist.test.images, y_ => mnist.test.labels }
-    a, c = sess.run([accuracy, cross_entropy], feed_dict: test_data)
-    puts " test accuracy #{a}, error #{c}"
-  end
-end
-