tensor_stream-opencl 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.rubocop.yml +89 -0
- data/lib/tensor_stream/opencl/array_ops.rb +30 -6
- data/lib/tensor_stream/opencl/kernels/apply_adam.cl +2 -2
- data/lib/tensor_stream/opencl/math_ops.rb +3 -1
- data/lib/tensor_stream/opencl/opencl_buffer.rb +3 -2
- data/lib/tensor_stream/opencl/opencl_evaluator.rb +112 -61
- data/lib/tensor_stream/opencl/opencl_template_helper.rb +12 -2
- data/lib/tensor_stream/opencl/version.rb +1 -1
- data/samples/iris.rb +0 -2
- data/samples/mnist_data_2.1.rb +99 -0
- data/samples/mnist_data_2.2.rb +98 -0
- data/samples/multigpu.rb +27 -13
- data/tensor_stream-opencl.gemspec +1 -1
- metadata +7 -6
- data/Gemfile.lock +0 -70
- data/samples/mnist_data.rb +0 -65
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2f7c2e06a5711e3efc8503de82f4c836af70c3b0dfd6ce0f4790f0bb6d3abcb9
+  data.tar.gz: c103f23ba5d27f3a6356ed28b10966b8333f9fb3fabc203924ce357c4c0523c8
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 637ede65bf27b9ce06a755e344e58567c4d1e83e4831115e872d6f2ca0ff778f49f4d4e60af643a920fcf3a1b9033078b0c81f6e6e0f62f2e31f8f9ac4fee89b
+  data.tar.gz: af8482a75b98db484c074c2862d455709ed5563e596819ad445a0c502467c0b5189eef04abbf55060dcbc0640289ce839715b19b3332aa9c21217345726ac3f3
data/.gitignore
CHANGED
data/.rubocop.yml
ADDED
@@ -0,0 +1,89 @@
+AllCops:
+  Exclude:
+    - samples/*
+    - bin/*
+    - spec/**/*
+    - tensor_stream.gemspec
+    - Rakefile
+
+Naming/AccessorMethodName:
+  Exclude:
+    - lib/tensor_stream/utils.rb
+
+Style/StringLiterals:
+  Enabled: false
+
+Layout/TrailingBlankLines:
+  Enabled: false
+
+Metrics/LineLength:
+  Max: 200
+
+Metrics/AbcSize:
+  Enabled: false
+
+Metrics/PerceivedComplexity:
+  Enabled: false
+
+Metrics/MethodLength:
+  Enabled: false
+
+Metrics/CyclomaticComplexity:
+  Enabled: false
+
+Metrics/BlockLength:
+  Exclude:
+    - lib/tensor_stream/math_gradients.rb
+
+Naming/AccessorMethodName:
+  Exclude:
+    - lib/tensor_stream.rb
+    - lib/tensor_stream/control_flow.rb
+    - lib/tensor_stream/graph.rb
+    - lib/tensor_stream/operation.rb
+
+Style/Documentation:
+  Exclude:
+    - lib/tensor_stream/version.rb
+    - lib/tensor_stream/trainer.rb
+    - lib/tensor_stream/nn/nn_ops.rb
+    - lib/tensor_stream/evaluator/evaluator.rb
+
+Lint/UnusedMethodArgument:
+  Exclude:
+    - lib/tensor_stream/train/saver.rb
+    - lib/tensor_stream/ops.rb
+
+Metrics/ParameterLists:
+  Max: 8
+
+Style/PerlBackrefs:
+  Enabled: false
+
+Style/RegexpLiteral:
+  Enabled: false
+
+Naming/MemoizedInstanceVariableName:
+  Enabled: false
+
+Metrics/ModuleLength:
+  Max: 200
+
+Metrics/ClassLength:
+  Max: 250
+  Exclude:
+    - lib/tensor_stream/evaluator/ruby_evaluator.rb
+
+Naming/VariableNumber:
+  Enabled: false
+
+Style/DoubleNegation:
+  Enabled: false
+
+Style/TrailingCommaInHashLiteral:
+  Enabled: false
+
+Naming/UncommunicativeMethodParamName:
+  Exclude:
+    - lib/tensor_stream/evaluator/ruby_evaluator.rb
+    - lib/tensor_stream/ops.rb
data/lib/tensor_stream/opencl/array_ops.rb
CHANGED
@@ -4,6 +4,28 @@ module TensorStream
   module ArrayOps
     def ArrayOps.included(klass)
      klass.class_eval do
+
+        # fast cached 0/1 constant fill
+        register_op %i[zeros ones zeros_like ones_like] do |context, tensor, inputs|
+          shape = if %i[zeros_like ones_like].include?(tensor.operation)
+                    inputs[0].shape
+                  elsif !inputs[0].nil?
+                    read_final_result(complete_eval(inputs[0], context))
+                  else
+                    tensor.shape.shape
+                  end
+          cache_key = "cons_#{tensor.name}_#{tensor.data_type}_#{shape}"
+          @context[:_cache][:_cl_buffers][cache_key] ||= begin
+            buffer = allocate_narray_for_type(tensor.data_type, shape.reduce(:*) || 1)
+            if %i[zeros zeros_like].include?(tensor.operation)
+              buffer.fill!(0)
+            else
+              buffer.fill!(1)
+            end
+            convert_to_opencl(buffer, shape, data_type: tensor.data_type, name: tensor.name)
+          end
+        end
+
        register_op :expand_dims, buffer: true do |_context, tensor, inputs|
          axis = inputs[1].buffer[0]
          shape = inputs[0].shape.dup
@@ -17,8 +39,10 @@ module TensorStream
          shape = inputs[0]
          value = inputs[1]

-
-
+          fill_shape = shape.nil? ? tensor.shape.shape : shape.buffer.to_a
+          narray_size = fill_shape.reduce(:*) || 1
+
+          cl_buffer = get_cached_buffer(tensor.name, fill_shape)

          buffer = if cl_buffer
                     cl_buffer.buffer
@@ -27,7 +51,7 @@ module TensorStream
                   end

          buffer.fill!(value.buffer[0])
-          convert_to_opencl(buffer,
+          convert_to_opencl(buffer, fill_shape, data_type: tensor.data_type, name: tensor.name)
        end

        register_op :split do |context, tensor, inputs|
@@ -119,7 +143,7 @@ module TensorStream
          piece_size = shape.reduce(:*) || 1
          work_group = [piece_size]
          cl_offset = OpenCL::Int1.new(offset)
-
+
          _cl_program('split_n', axis: axis,
                                 div: divisors,
                                 mul: multipliers,
@@ -218,7 +242,7 @@ module TensorStream
            shape = shape.map { |s| s == 1 ? nil : s }
          end

-          OpenCLBuffer.new(name: tensor.name, data_type: tensor.data_type,
+          OpenCLBuffer.new(self, name: tensor.name, data_type: tensor.data_type,
                           shape: shape.compact, buffer: arr.buffer,
                           cl_buffer: arr.cl_buffer,
                           op: arr.op)
@@ -350,7 +374,7 @@ module TensorStream
            TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
          end

-          OpenCLBuffer.new(name: tensor.name, data_type: tensor.data_type,
+          OpenCLBuffer.new(self, name: tensor.name, data_type: tensor.data_type,
                           shape: shape, buffer: arr.buffer,
                           cl_buffer: arr.cl_buffer,
                           op: arr.op)
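Note: the new zeros/ones handler memoizes one constant buffer per (name, dtype, shape) in the evaluator's buffer cache, so repeated evaluations reuse the same device memory. A minimal pure-Ruby sketch of the memoization pattern — the `constant_buffer` helper and plain-Array backing are illustrative, not part of the gem:

    cache = {}

    def constant_buffer(cache, name, data_type, shape, value)
      key = "cons_#{name}_#{data_type}_#{shape}"             # same key scheme as above
      cache[key] ||= Array.new(shape.reduce(:*) || 1, value) # stands in for NArray + fill!
    end

    a = constant_buffer(cache, 'zeros_1', :float32, [2, 2], 0) # allocates and fills
    b = constant_buffer(cache, 'zeros_1', :float32, [2, 2], 0) # cache hit
    a.equal?(b) # => true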
data/lib/tensor_stream/opencl/kernels/apply_adam.cl
CHANGED
@@ -12,9 +12,9 @@
                         __global <%= c_dtype %> *output, __global <%= c_dtype %> *v) {
     // Get the index of the current element to be processed
     const int index = get_global_id(0);
-    <%= c_dtype %> alpha = learning_rate[0] * sqrt(1.0 - beta2_power[0]) / (1.0 - beta1_power[0]);
+    <%= c_dtype %> alpha = learning_rate[0] * sqrt((<%= c_dtype %>)1.0 - beta2_power[0]) / (1.0 - beta1_power[0]);

     momentum[index] += (grad[index] - momentum[index]) * (1.0 - beta1[0]);
     v[index] += (grad[index] * grad[index] - v[index]) * (1.0 - beta2[0]);
-    output[index] -= (momentum[index] * alpha) / ( sqrt(v[index]) + epsilon[0] );
+    output[index] -= (momentum[index] * alpha) / ( sqrt((<%= c_dtype %>)v[index]) + epsilon[0] );
 }
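Note on the two sqrt fixes: in OpenCL C, `sqrt(1.0 - x)` contains a double literal, so the expression is promoted to double and the kernel can fail to build on devices without 64-bit float support when `<%= c_dtype %>` is `float`; the cast keeps the math in the kernel's element type (this rationale is inferred, not stated in the release). The `.cl` kernels are ERB templates, so the rendered line looks like this — a sketch with made-up variable names:

    require 'erb'

    c_dtype = 'float' # what the dtype mapping later in this diff emits for :float32
    template = '<%= c_dtype %> alpha = lr[0] * sqrt((<%= c_dtype %>)1.0 - b2p[0]);'
    puts ERB.new(template).result(binding)
    # => float alpha = lr[0] * sqrt((float)1.0 - b2p[0]);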
data/lib/tensor_stream/opencl/math_ops.rb
CHANGED
@@ -80,8 +80,10 @@ module TensorStream

        transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
        transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
-        event_wait_list = build_event_wait_list(
+        event_wait_list = build_event_wait_list([a, b])
+
        output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+
        output_buffer
      end

data/lib/tensor_stream/opencl/opencl_buffer.rb
CHANGED
@@ -3,15 +3,16 @@ module TensorStream
  class OpenCLBuffer < Buffer
    include ArrayOpsHelper

-    attr_accessor :shape, :buffer, :cl_buffer, :op
+    attr_accessor :shape, :buffer, :cl_buffer, :op, :owner

-    def initialize(data_type:, shape:, buffer:, cl_buffer:, op: nil, name: nil)
+    def initialize(owner, data_type:, shape:, buffer:, cl_buffer:, op: nil, name: nil)
      @data_type = data_type
      @shape = shape
      @buffer = buffer
      @cl_buffer = cl_buffer
      @name = name
      @op = op
+      @owner = owner
    end

    def total_elements
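Note: every `OpenCLBuffer.new` call site in this diff now passes the evaluator as the first positional argument, so a buffer can be traced back to the evaluator (and thus the device/queue) that allocated it — presumably in support of the cross-device transitions added below. A construction sketch; the `Object.new` stand-in and nil buffers are placeholders, not realistic values:

    require 'tensor_stream'
    require 'tensor_stream/opencl'

    evaluator = Object.new # stands in for the OpenclEvaluator passed as `self`
    buf = TensorStream::OpenCLBuffer.new(evaluator,
                                         name: 'const_1', data_type: :float32,
                                         shape: [2, 2], buffer: nil, cl_buffer: nil)
    buf.owner.equal?(evaluator) # => true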
data/lib/tensor_stream/opencl/opencl_evaluator.rb
CHANGED
@@ -38,7 +38,8 @@ module TensorStream
    # PURE ruby evaluator used for testing and development
    class OpenclEvaluator < BaseEvaluator
      attr_accessor :retain
-      attr_reader :opencl_device
+      attr_reader :opencl_device, :opencl_context
+      attr_writer :context

      include TensorStream::OpHelper
      include TensorStream::ArrayOpsHelper
@@ -50,14 +51,14 @@ module TensorStream

      def initialize(session, device, thread_pool: nil, log_intermediates: false)
        super
-        _create_opencl_context
+        _create_opencl_context
        @opencl_device = device.native_device
        create_command_queue
      end

      def self.query_supported_devices
        devices = query_devices_with_score
-        devices.sort { |a| a[1] }.
+        devices.sort { |a, b| a[1] <=> b[1] }.map do |d|
          opencl_to_device(d)
        end
      end
@@ -68,16 +69,16 @@ module TensorStream
        opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
      end

-      def self.opencl_to_device(
-        device =
-        index =
+      def self.opencl_to_device(dev)
+        device = dev[0]
+        index = dev[3]
        platform_name = device.platform.name.tr(' ', '_').downcase
        uri = [platform_name, index].join(':')

        device_type = device.type.to_s == 'GPU' ? :gpu : :cpu

-        OpenclDevice.new(uri, device_type, self).tap do |
-
+        OpenclDevice.new(uri, device_type, self).tap do |d|
+          d.native_device = device
        end
      end

@@ -85,14 +86,14 @@ module TensorStream
      # Select the best device available in the system for this evaluator
      def self.default_device
        devices = OpenclEvaluator.query_devices_with_score
-        device = devices.
+        device = devices.max { |a, b| a[1] <=> b[1] }
        opencl_to_device(device)
      end

      # opencl evaluator main entrypoint
      def run(tensor, execution_context)
-
-        #
+        result = complete_eval(tensor, execution_context)
+        # puts "-------------------wait finish------------------------"
        _opencl_queue.finish
        read_final_result(result)
      end
@@ -115,18 +116,22 @@ module TensorStream
      # buffer comes from non-opencl evaluator
      def convert_from_buffer(tensor, result)
        if result.buffer.is_a?(TensorStream::Evaluator::OutputGroup)
-          converted_outputs = result.buffer.outputs.zip(result.buffer.data_types).map
+          converted_outputs = result.buffer.outputs.zip(result.buffer.data_types).map do |output, data_type|
+            convert_to_opencl([output].flatten, shape_eval(output), data_type: data_type, name: tensor.name)
+          end
          TensorStream::Evaluator::OutputGroup.new(converted_outputs, result.buffer.data_types)
        else
          convert_to_opencl([result.buffer].flatten, shape_eval(result.buffer), data_type: result.data_type, name: tensor.name)
        end
      end

+      # Generate OpenCL instruction to read back from GPU memory to Host memory for a tensor
      def enqueue_buffer_read(tensor, context)
        buffer = _run(tensor, context)
        if buffer.is_a?(Array)
          buffer.collect do |b|
            next b if b.buffer.size.zero?
+
            b.op = _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: build_event_wait_list([b]))
            b
          end
@@ -135,6 +140,7 @@ module TensorStream
        return buffer if buffer.nil?
        return [] if buffer.buffer.nil?
        return buffer if buffer.buffer.size.zero?
+
        buffer.op = _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
        buffer
      end
@@ -145,7 +151,7 @@ module TensorStream

        buffer = enqueue_buffer_read(tensor, context)
        events = build_event_wait_list([buffer])
-        # puts "wait #{tensor.name}"
+        # puts "** wait #{tensor.name} **"
        OpenCL.wait_for_events(events) unless events.empty?
        buffer
      end
@@ -154,6 +160,7 @@ module TensorStream
        OpenCL.platforms.flat_map do |p|
          p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
            score = 0
+
            if d.type.to_s == 'CPU'
              score += 1
            elsif d.type.to_s == 'GPU'
@@ -162,8 +169,7 @@ module TensorStream

            score += 1000 if d.platform.name == 'NVIDIA CUDA'

-            score += d.max_compute_units
-            score += d.max_clock_frequency
+            score += d.max_compute_units * d.max_clock_frequency

            [d, score, p.name, index]
          end
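Note: device ranking now multiplies compute units by clock frequency instead of summing them, which stops a high-clock CPU from outranking a wide GPU. An illustrative re-computation — the Struct and the GPU base bonus of 4 are assumptions; the elided branch above sets the actual base scores:

    Device = Struct.new(:type, :platform_name, :max_compute_units, :max_clock_frequency)

    def device_score(d)
      score = d.type == 'GPU' ? 4 : 1 # base bonus assumed; see the elided branch above
      score += 1000 if d.platform_name == 'NVIDIA CUDA'
      score + d.max_compute_units * d.max_clock_frequency
    end

    device_score(Device.new('GPU', 'NVIDIA CUDA', 20, 1500)) # => 31004
    device_score(Device.new('CPU', 'Intel', 8, 3500))        # => 28001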
@@ -172,6 +178,31 @@ module TensorStream

      protected

+      ##
+      # called when passing control to another evaluator
+      def perform_transition(tensor, input, next_evaluator, execution_context)
+        if next_evaluator.is_a?(OpenclEvaluator) # OpenCL but different device?
+          # create opencl buffer for this tensor
+          next_evaluator.context = @context
+
+          foreign_buffer = next_evaluator._run(input, execution_context)
+          event_list = build_event_wait_list([foreign_buffer])
+
+          output_buffer = _create_result_buffer(input.data_type, foreign_buffer.shape, "t_#{tensor.name}_#{input.name}")
+          output_buffer.op = if next_evaluator.opencl_context == @opencl_context
+                               _opencl_queue.enqueue_copy_buffer(foreign_buffer.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_list)
+                             else
+                               puts "wait finish transition ** #{input.name} **"
+                               read_event = next_evaluator._opencl_queue.enqueue_read_buffer(foreign_buffer.cl_buffer, output_buffer.buffer, event_wait_list: event_list)
+                               OpenCL.wait_for_events(read_event)
+                               _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
+                             end
+          output_buffer
+        else
+          super
+        end
+      end
+
      def prepare_input(tensor, context, options = {})
        return nil unless tensor

@@ -195,8 +226,19 @@ module TensorStream
        buffer.to_ruby
      end

-      def _create_opencl_context(
-
+      def _create_opencl_context(device = nil)
+        if device.nil?
+          @@global_opencl_context ||= begin
+            all_devices = OpenclEvaluator.query_supported_devices.map(&:native_device)
+            puts "global context created for #{all_devices}"
+            OpenCL.create_context(all_devices)
+          end
+
+          @opencl_context = @@global_opencl_context
+        else
+          puts "context created for #{device.native_device}"
+          @opencl_context = OpenCL.create_context(device.native_device)
+        end
      end

      def create_command_queue
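Note: evaluators created without an explicit device now share a single class-level OpenCL context spanning every supported device; this is what lets `perform_transition` above use `enqueue_copy_buffer` between two evaluators instead of a host round-trip. A pure-Ruby sketch of the sharing rule — `create_context` here is a stand-in for `OpenCL.create_context`:

    class ContextRegistry
      @@global_context = nil

      def self.create_context(devices)
        "context(#{Array(devices).join(',')})" # stand-in for OpenCL.create_context
      end

      def self.for(device, all_devices)
        return create_context(device) if device          # explicit device: private context
        @@global_context ||= create_context(all_devices) # default: one shared context
      end
    end

    shared = ContextRegistry.for(nil, %w[gpu0 gpu1]) # created once
    shared.equal?(ContextRegistry.for(nil, %w[gpu0 gpu1])) # => true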
@@ -205,6 +247,7 @@ module TensorStream
        properties = []
        properties << OpenCL::CommandQueue::PROFILING_ENABLE if supported_proprties.include?('PROFILING_ENABLE')
        properties << OpenCL::CommandQueue::OUT_OF_ORDER_EXEC_MODE_ENABLE if supported_proprties.include?('OUT_OF_ORDER_EXEC_MODE_ENABLE')
+        # puts "creating queue with properties #{supported_proprties}"
        @command_queue = _opencl_context.create_command_queue(opencl_device, properties: properties)
      end

@@ -222,28 +265,32 @@ module TensorStream

      def _cl_program(kernel, args = {})
        suffix = args.collect { |k, v| "#{k}.#{escape_arg_content(v)}" }.join('.')
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        kernel_cache_key = "_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"
+        @context[:_cache][kernel_cache_key] ||=
+          begin
+            # puts "building #{kernel_cache_key}"
+            file_path = File.join('/tmp', "#{kernel}.#{suffix}.cl")
+            source = if File.exist?(file_path) && ENV['TS_OPENCL_FILE_CACHE']
+                       File.read(file_path)
+                     else
+                       filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
+                       raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
+
+                       source = File.read(filename)
+                       source = OpenclTemplateHelper.new(source).generate(args)
+                       File.write(file_path, source) if ENV['TS_OPENCL_FILE_CACHE']
+                       source
+                     end
+            program = _opencl_context.create_program_with_source(source)
+            program.build
+          rescue OpenCL::Error::BUILD_PROGRAM_FAILURE => e
+            puts "OpenCL Compile error: #{program.build_log}"
+            raise e
+          end
      end

      def escape_arg_content(value)
-        return value.tr(' ','_') if value.is_a?(String)
+        return value.tr(' ', '_') if value.is_a?(String)
        return value.join('-') if value.is_a?(Array)

        value
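Note: compiled programs are now memoized per process under `_opencl_kernel_<name>.<args>:<object_id>`, and the new `TS_OPENCL_FILE_CACHE` switch additionally persists rendered kernels to `/tmp/*.cl` across runs. A usage sketch, opting in before the evaluator compiles anything:

    ENV['TS_OPENCL_FILE_CACHE'] = '1' # any non-nil value enables the on-disk cache

    require 'tensor_stream'
    require 'tensor_stream/opencl'
    # First run renders the ERB templates and writes /tmp/<kernel>.<args>.cl;
    # later runs read the pre-rendered source instead of regenerating it.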
@@ -257,9 +304,8 @@ module TensorStream

        child_context = execution_context.dup
        res = if tensor.is_a?(Operation)
-                if !
-
-                  convert_from_buffer(tensor, result)
+                if !on_same_device?(tensor) # tensor is on another device or evaluator
+                  perform_transition(tensor, tensor, @context[:_cache][:placement][tensor.name][1], execution_context)
                else
                  eval_operation(tensor, child_context)
                end
@@ -295,7 +341,7 @@ module TensorStream

      register_op :identity do |context, tensor, inputs|
        value = inputs[0]
-        buffer = OpenCLBuffer.new(name: tensor.name, data_type: tensor.data_type, shape: value.shape, buffer: value.buffer, cl_buffer: value.cl_buffer)
+        buffer = OpenCLBuffer.new(self, name: tensor.name, data_type: tensor.data_type, shape: value.shape, buffer: value.buffer, cl_buffer: value.cl_buffer)
        buffer.op = build_event_wait_list(inputs)
        buffer
      end
@@ -375,6 +421,7 @@ module TensorStream

      register_op :flow_group do |_context, _tensor, inputs|
        events = build_event_wait_list(inputs)
+        # puts "** wait for event flow_group**"
        OpenCL.wait_for_events(events) unless events.empty?
        nil
      end
@@ -387,8 +434,10 @@ module TensorStream
        cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
        return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
        return @context[cache_key] if @context.key?(cache_key)
-
+
+        # puts "opencl eval #{object_id} #{tensor.name}"
        invoke(tensor, child_context).tap do |result|
+          # puts "result done opencl #{object_id}: #{tensor.name}"
          if tensor.breakpoint
            a = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
            b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
@@ -603,6 +652,7 @@ module TensorStream
      end

      def convert_to_opencl(value, shape, data_type: nil, name: nil)
+        # puts "convert_to_opencl called for #{name}"
        value = [value] if !value.is_a?(Array) && !value.is_a?(NArray)

        cache_key = "_cl_object_#{name}:#{shape.join('_')}:#{object_id}"
@@ -630,7 +680,7 @@ module TensorStream
                        _opencl_context.create_buffer(cl_buffer_size * buffer.element_size)
                      end

-          @context[:_cache][cache_key] = OpenCLBuffer.new(name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
+          @context[:_cache][cache_key] = OpenCLBuffer.new(self, name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
        end
        if data_type == :string
          value[0].each_byte.with_index do |c, index|
@@ -664,15 +714,15 @@ module TensorStream

      def allocate_narray_for_type(data_type, narray_size)
        case data_type
-        when :float, :float32
+        when :float, :float32, :float16
          NArray.sfloat(narray_size)
        when :float64
          NArray.float(narray_size)
-        when :int, :int32, :int64
+        when :int, :int32, :int64, :uint64, :uint32 # NArray does not have 64 bit int types
          NArray.int(narray_size)
-        when :int16
+        when :int16, :uint16
          NArray.sint(narray_size)
-        when :uint8
+        when :uint8, :int8
          NArray.byte(narray_size)
        when :boolean
          NArray.byte(narray_size)
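Note: the widened `case` maps the new dtypes onto existing NArray storage; as the inline comment says, NArray has no 64-bit integer type, so `:int64`/`:uint64` values are backed by 32-bit signed ints and out-of-range values will not round-trip. The effective mapping, summarized (illustrative constant, not part of the gem):

    NARRAY_STORAGE = {
      sfloat: %i[float float32 float16],         # 32-bit float (float16 promoted)
      float:  %i[float64],                       # 64-bit float
      int:    %i[int int32 int64 uint64 uint32], # 32-bit signed backing store
      sint:   %i[int16 uint16],                  # 16-bit
      byte:   %i[uint8 int8 boolean]             # 8-bit
    }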
@@ -686,12 +736,14 @@ module TensorStream
      end

      def _create_result_buffer(data_type, shape, name)
-        return OpenCLBuffer.new(name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]
-
+        return OpenCLBuffer.new(self, name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]
+        cache_key = "_result_#{name}_#{shape.join('_')}:#{object_id}"
+        @context[:_cache][:_cl_buffers][cache_key] ||= begin
+          # puts "create result buffer #{cache_key}"
          size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
          buffer = allocate_narray_for_type(data_type, size)
          cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
-        OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
+          OpenCLBuffer.new(self, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
        end
      end

@@ -706,7 +758,7 @@ module TensorStream
          start = index * buffer.size * buffer.element_size
          region = OpenCL::BufferRegion::new(start, buffer.size * buffer.element_size)
          cl_buffer = parent_buffer.cl_buffer.create_sub_buffer(OpenCL::BUFFER_CREATE_TYPE_REGION, region)
-          OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
+          OpenCLBuffer.new(self, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
        else
          _create_result_buffer(tensor.data_type, shape, name)
        end
@@ -728,7 +780,7 @@ module TensorStream

      # create sub buffers of different sizes
      def _create_variable_result_sub_buffer(parent_buffer, index, start, region_size_in_bytes, data_type, shape, name)
-        cache_key ="_sub_result_#{parent_buffer.object_id}_#{name}_#{index}:#{object_id}"
+        cache_key = "_sub_result_#{parent_buffer.object_id}_#{name}_#{index}:#{object_id}"
        @context[:_cache][:_cl_buffers][cache_key] ||= begin
          size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
          buffer = allocate_narray_for_type(data_type, size)
@@ -736,7 +788,7 @@ module TensorStream
          if parent_buffer.cl_buffer.associated_memobject.nil?
            region = OpenCL::BufferRegion::new(start, region_size_in_bytes)
            cl_buffer = parent_buffer.cl_buffer.create_sub_buffer(OpenCL::BUFFER_CREATE_TYPE_REGION, region)
-            OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: "#{name}/sub")
+            OpenCLBuffer.new(self, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: "#{name}/sub")
          else
            _create_result_buffer(tensor.data_type, shape, name)
          end
@@ -806,6 +858,7 @@ module TensorStream
          convert_to_opencl(red, [], data_type: tensor.data_type, name: tensor.name)
        else
          return input if input.shape.empty?
+
          value = input.buffer.reshape(*input.shape.reverse)
          rank = input.shape.size - 1

@@ -862,17 +915,15 @@ module TensorStream

      def resolve_placeholder(placeholder, _execution_context = {})
        return nil if placeholder.nil?
+        return placeholder unless placeholder.is_a?(Placeholder)

-        var =
-
-          raise "missing placeholder #{placeholder.name}" if c.nil?
-        end
-        else
-          placeholder
-        end
+        var = @context[placeholder.name.to_sym]
+        raise "missing placeholder #{placeholder.name}" if var.nil?

-
-
+        cache_key = "#{placeholder.graph.object_id}_opencl_#{placeholder.name}_p:#{object_id}"
+        @context[cache_key] ||= begin
+          convert_to_opencl(var, shape_eval(var), data_type: placeholder.data_type, name: placeholder.name) unless var.is_a?(Tensor)
+        end
      end

      def all_true?(arr)
data/lib/tensor_stream/opencl/opencl_template_helper.rb
CHANGED
@@ -32,10 +32,18 @@ class OpenclTemplateHelper
    case dtype.to_s
    when 'float64'
      'double'
-    when 'float32', 'float'
+    when 'float32', 'float', 'float16'
      'float'
+    when 'uint32'
+      'uint'
+    when 'int64'
+      'int' # 'long' - NArray does not support 64bit int types
+    when 'uint64'
+      'uint' # 'ulong' - NArray does not support 64bit int types
    when 'int32', 'int'
      'int'
+    when 'uint16'
+      'ushort'
    when 'int16'
      'short'
    when 'uint8'
@@ -51,10 +59,12 @@ class OpenclTemplateHelper
    case dtype.to_s
    when 'float64'
      'DBL_MIN'
-    when 'float32', 'float'
+    when 'float32', 'float', 'float16'
      'FLT_MIN'
    when 'int32', 'int'
      'INT_MIN'
+    when 'uint32', 'uint16'
+      '0'
    when 'int16'
      'SHRT_MIN'
    when 'int8'
data/samples/iris.rb
CHANGED
data/samples/mnist_data_2.1.rb
ADDED
@@ -0,0 +1,99 @@
+# A ruby port of the example code discussed by Martin Gorner in
+# "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)""
+#
+# https://www.youtube.com/watch?v=u4alGiomYP4
+#
+# Requirements:
+#   mnist-learn gem
+#   opencl_ruby_ffi gem
+require "bundler/setup"
+require 'tensor_stream'
+require 'mnist-learn'
+require 'pry-byebug'
+
+# Enable OpenCL hardware accelerated computation, not using OpenCL can be very slow
+require 'tensor_stream/opencl'
+
+tf = TensorStream
+
+puts "Tensorstream version #{tf.__version__} with OpenCL lib #{TensorStream::Opencl::VERSION}"
+tf.set_random_seed(0)
+
+# Import MNIST data
+puts "downloading minst data"
+mnist = Mnist.read_data_sets('/tmp/data', one_hot: true)
+puts "downloading finished"
+
+x = tf.placeholder(:float32, shape: [nil, 784])
+
+K = 200
+L = 100
+M = 60
+N = 30
+
+
+w1 = tf.variable(tf.random_normal([784, K]))
+b1 = tf.variable(tf.zeros([K]))
+
+w2 = tf.variable(tf.random_normal([K, L]))
+b2 = tf.variable(tf.zeros([L]))
+
+w3 = tf.variable(tf.random_normal([L, M]))
+b3 = tf.variable(tf.zeros([M]))
+
+w4 = tf.variable(tf.random_normal([M, N]))
+b4 = tf.variable(tf.zeros([N]))
+
+w5 = tf.variable(tf.random_normal([N, 10]))
+b5 = tf.variable(tf.zeros([10]))
+
+x_ = tf.reshape(x, [-1, 784])
+
+y1 = tf.sigmoid(tf.matmul(x_, w1) + b1)
+y2 = tf.sigmoid(tf.matmul(y1, w2) + b2)
+y3 = tf.sigmoid(tf.matmul(y2, w3) + b3)
+y4 = tf.sigmoid(tf.matmul(y3, w4) + b4)
+ylogits = tf.matmul(y4, w5) + b5
+
+# model
+y = tf.nn.softmax(ylogits)
+
+y_ = tf.placeholder(:float32, shape: [nil, 10])
+
+# cross-entropy loss function (= -sum(Y_i * log(Yi)) ), normalised for batches of 100 images
+# TensorFlow provides the softmax_cross_entropy_with_logits function to avoid numerical stability
+# problems with log(0) which is NaN
+cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits: ylogits, labels: y_)
+cross_entropy = tf.reduce_mean(cross_entropy)*100
+
+is_correct = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+accuracy = tf.reduce_mean(tf.cast(is_correct, :float32))
+
+# training step, learning rate = 0.003
+learning_rate = 0.003
+train_step = TensorStream::Train::AdamOptimizer.new(learning_rate).minimize(cross_entropy)
+
+sess = tf.session
+init = tf.global_variables_initializer
+sess.run(init)
+
+mnist_train = mnist.train
+test_data = { x => mnist.test.images, y_ => mnist.test.labels }
+
+(0..10000).each do |i|
+  # load batch of images and correct answers
+  batch_x, batch_y = mnist_train.next_batch(100)
+  train_data = { x => batch_x, y_ => batch_y }
+
+  # train
+  sess.run(train_step, feed_dict: train_data)
+  if (i % 50 == 0)
+    # success? add code to print it
+    a_train, c_train = sess.run([accuracy, cross_entropy], feed_dict: train_data)
+
+    # success on test data?
+    a_test, c_test = sess.run([accuracy, cross_entropy], feed_dict: test_data)
+    puts "#{i} train accuracy #{a_train}, error #{c_train} test accuracy #{a_test}, error #{c_test}"
+  end
+end
+
data/samples/mnist_data_2.2.rb
ADDED
@@ -0,0 +1,98 @@
+# A ruby port of the example code discussed by Martin Gorner in
+# "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)""
+#
+# https://www.youtube.com/watch?v=u4alGiomYP4
+#
+# Requirements:
+#   mnist-learn gem
+#   opencl_ruby_ffi gem
+require "bundler/setup"
+require 'tensor_stream'
+require 'mnist-learn'
+require 'pry-byebug'
+
+# Enable OpenCL hardware accelerated computation, not using OpenCL can be very slow
+require 'tensor_stream/opencl'
+
+tf = TensorStream
+
+# Import MNIST data
+puts "downloading minst data"
+mnist = Mnist.read_data_sets('/tmp/data', one_hot: true)
+puts "downloading finished"
+
+x = tf.placeholder(:float32, shape: [nil, 784])
+
+K = 200
+L = 100
+M = 60
+N = 30
+
+
+w1 = tf.variable(tf.random_normal([784, K]))
+b1 = tf.variable(tf.zeros([K]))
+
+w2 = tf.variable(tf.random_normal([K, L]))
+b2 = tf.variable(tf.zeros([L]))
+
+w3 = tf.variable(tf.random_normal([L, M]))
+b3 = tf.variable(tf.zeros([M]))
+
+w4 = tf.variable(tf.random_normal([M, N]))
+b4 = tf.variable(tf.zeros([N]))
+
+w5 = tf.variable(tf.random_normal([N, 10]))
+b5 = tf.variable(tf.zeros([10]))
+
+x_ = tf.reshape(x, [-1, 784])
+
+y1 = tf.nn.relu(tf.matmul(x_, w1) + b1)
+y2 = tf.nn.relu(tf.matmul(y1, w2) + b2)
+y3 = tf.nn.relu(tf.matmul(y2, w3) + b3)
+y4 = tf.nn.relu(tf.matmul(y3, w4) + b4)
+ylogits = tf.matmul(y4, w5) + b5
+
+# model
+y = tf.nn.softmax(ylogits)
+
+y_ = tf.placeholder(:float32, shape: [nil, 10])
+
+# cross-entropy loss function (= -sum(Y_i * log(Yi)) ), normalised for batches of 100 images
+# TensorFlow provides the softmax_cross_entropy_with_logits function to avoid numerical stability
+# problems with log(0) which is NaN
+cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits: ylogits, labels: y_)
+cross_entropy = tf.reduce_mean(cross_entropy)*100
+
+is_correct = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+accuracy = tf.reduce_mean(tf.cast(is_correct, :float32))
+
+# training step, learning rate = 0.003
+learning_rate = 0.003
+train_step = TensorStream::Train::AdamOptimizer.new(learning_rate).minimize(cross_entropy)
+
+sess = tf.session
+# Add ops to save and restore all the variables.
+saver = tf::Train::Saver.new
+init = tf.global_variables_initializer
+
+sess.run(init)
+mnist_train = mnist.train
+test_data = { x => mnist.test.images, y_ => mnist.test.labels }
+
+(0..1000).each do |i|
+  # load batch of images and correct answers
+  batch_x, batch_y = mnist_train.next_batch(100)
+  train_data = { x => batch_x, y_ => batch_y }
+
+  # train
+  sess.run(train_step, feed_dict: train_data)
+  if (i % 50 == 0)
+    # success? add code to print it
+    a_train, c_train = sess.run([accuracy, cross_entropy], feed_dict: train_data)
+
+    # success on test data?
+    a_test, c_test = sess.run([accuracy, cross_entropy], feed_dict: test_data)
+    puts "#{i} train accuracy #{a_train}, error #{c_train} test accuracy #{a_test}, error #{c_test}"
+  end
+end
+
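Note: the two added samples are near-identical five-layer MNIST classifiers; mnist_data_2.1.rb uses sigmoid activations over 10000 steps, while mnist_data_2.2.rb switches to relu, runs 1000 steps, and instantiates a `tf::Train::Saver`. The structural difference in one line — a graph-construction-only sketch, runnable against tensor_stream:

    require 'tensor_stream'
    tf = TensorStream

    x  = tf.placeholder(:float32, shape: [nil, 784])
    w1 = tf.variable(tf.random_normal([784, 200]))
    b1 = tf.variable(tf.zeros([200]))
    x_ = tf.reshape(x, [-1, 784])

    y1_sigmoid = tf.sigmoid(tf.matmul(x_, w1) + b1) # mnist_data_2.1.rb, layer 1
    y1_relu    = tf.nn.relu(tf.matmul(x_, w1) + b1) # mnist_data_2.2.rb, layer 1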
data/samples/multigpu.rb
CHANGED
@@ -11,7 +11,6 @@ DIMEN = 1024
 A = ts.random_uniform([DIMEN, DIMEN]).eval
 B = ts.random_uniform([DIMEN, DIMEN]).eval

-
 # Create a graph to store results
 c1 = []
 c2 = []
@@ -35,17 +34,24 @@ sum = ts.device('/device:GPU:0') do
   ts.add_n(c1)
 end

-t1_1 =
+t1_1 = nil
 t2_1 = nil
-
-ts.session(log_device_placement: true) do |sess|
+puts "===================== starting single GPU test ================"
+ts.session(log_device_placement: true, profile_enabled: true) do |sess|
+  puts "-- warmup ---"
+  sess.run(sum, feed_dict: { a => A, b => B}) # warmup
+  puts "-- warmup ---"
+  time = Time.now
+  t1_1 = time.to_i * (10 ** 9) + time.nsec
   sess.run(sum, feed_dict: { a => A, b => B})
-
+  time = Time.now
+  t2_1 = time.to_i * (10 ** 9) + time.nsec
 end
-
+puts "===================== end single GPU test ================"
+puts "===================== MULTI GPU text ================"
 # Multi GPU computing
 # GPU:0 computes A^n
-ts.device('/device:GPU:
+ts.device('/device:GPU:0') do
   a = ts.placeholder(:float32, shape: [DIMEN, DIMEN])
   c2 << matpow(a, n)
 end
@@ -56,18 +62,26 @@ ts.device('/device:GPU:1') do
   c2 << matpow(b, n)
 end

-ts.device('/device:GPU:
+ts.device('/device:GPU:0') do
   sum = ts.add_n(c2) #Addition of all elements in c2, i.e. A^n + B^n
 end

-t1_2 =
+t1_2 = nil
 t2_2 = nil
-
+
+ts.session(log_device_placement: true, profile_enabled: true) do |sess|
   # Run the op.
+  puts "-- warmup ---"
+  sess.run(sum, feed_dict: {a => A, b => B}) # warm up
+  puts "-- warmup ---"
+  time = Time.now
+  t1_2 = time.to_i * (10 ** 9) + time.nsec
+  puts "================ starting multiGPU test ==============="
   sess.run(sum, feed_dict: {a => A, b => B})
-
+  time = Time.now
+  t2_2 = time.to_i * (10 ** 9) + time.nsec
 end


-
-
+puts("Single GPU computation time: " + ((t2_1-t1_1)/ 1000000.to_f).to_s)
+puts("Multi GPU computation time: " + ((t2_2-t1_2)/ 1000000.to_f).to_s)
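Note: the timing added above is a plain wall-clock nanosecond stamp, and the final division by 1000000.0 reports milliseconds. An equivalent helper for the repeated pattern — the `now_ns` name is hypothetical, not part of the sample:

    def now_ns
      t = Time.now
      t.to_i * (10**9) + t.nsec
    end

    t1 = now_ns
    # sess.run(sum, feed_dict: { a => A, b => B }) would go here
    t2 = now_ns
    puts "computation time: #{(t2 - t1) / 1_000_000.0} ms"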
data/tensor_stream-opencl.gemspec
CHANGED
@@ -38,7 +38,7 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "pry-byebug"
   spec.add_development_dependency "awesome_print"
   spec.add_development_dependency "mnist-learn"
-  spec.add_dependency "tensor_stream", "~> 0.9.
+  spec.add_dependency "tensor_stream", "~> 0.9.2"
   spec.add_dependency "opencl_ruby_ffi"
   spec.add_dependency "oily_png"
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tensor_stream-opencl
 version: !ruby/object:Gem::Version
-  version: 0.2.
+  version: 0.2.2
 platform: ruby
 authors:
 - Joseph Dayo
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-10-
+date: 2018-10-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -100,14 +100,14 @@ dependencies:
   requirements:
   - - "~>"
     - !ruby/object:Gem::Version
-      version: 0.9.
+      version: 0.9.2
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-      version: 0.9.
+      version: 0.9.2
 - !ruby/object:Gem::Dependency
   name: opencl_ruby_ffi
   requirement: !ruby/object:Gem::Requirement
@@ -145,10 +145,10 @@ extra_rdoc_files: []
 files:
 - ".gitignore"
 - ".rspec"
+- ".rubocop.yml"
 - ".travis.yml"
 - CODE_OF_CONDUCT.md
 - Gemfile
-- Gemfile.lock
 - LICENSE.txt
 - README.md
 - Rakefile
@@ -226,7 +226,8 @@ files:
 - lib/tensor_stream/opencl/version.rb
 - samples/iris.data
 - samples/iris.rb
-- samples/
+- samples/mnist_data_2.1.rb
+- samples/mnist_data_2.2.rb
 - samples/multigpu.rb
 - samples/nearest_neighbor.rb
 - samples/rnn.rb
data/Gemfile.lock
DELETED
@@ -1,70 +0,0 @@
|
|
1
|
-
PATH
|
2
|
-
remote: .
|
3
|
-
specs:
|
4
|
-
tensor_stream-opencl (0.2.1)
|
5
|
-
oily_png
|
6
|
-
opencl_ruby_ffi
|
7
|
-
tensor_stream (~> 0.9.0)
|
8
|
-
|
9
|
-
GEM
|
10
|
-
remote: https://rubygems.org/
|
11
|
-
specs:
|
12
|
-
awesome_print (1.8.0)
|
13
|
-
byebug (10.0.2)
|
14
|
-
chunky_png (1.3.10)
|
15
|
-
coderay (1.1.2)
|
16
|
-
concurrent-ruby (1.0.5)
|
17
|
-
deep_merge (1.2.1)
|
18
|
-
diff-lcs (1.3)
|
19
|
-
ffi (1.9.25)
|
20
|
-
method_source (0.9.0)
|
21
|
-
mnist-learn (0.1.1)
|
22
|
-
narray (0.6.1.2)
|
23
|
-
narray_ffi (1.4.4)
|
24
|
-
ffi (~> 1.9, >= 1.9.3)
|
25
|
-
narray (~> 0.6, >= 0.6.0.8)
|
26
|
-
oily_png (1.2.1)
|
27
|
-
chunky_png (~> 1.3.7)
|
28
|
-
opencl_ruby_ffi (1.3.4)
|
29
|
-
ffi (~> 1.9, >= 1.9.3)
|
30
|
-
narray (~> 0.6, >= 0.6.0.8)
|
31
|
-
narray_ffi (~> 1.0, >= 1.0.0)
|
32
|
-
pry (0.11.3)
|
33
|
-
coderay (~> 1.1.0)
|
34
|
-
method_source (~> 0.9.0)
|
35
|
-
pry-byebug (3.6.0)
|
36
|
-
byebug (~> 10.0)
|
37
|
-
pry (~> 0.10)
|
38
|
-
rake (10.5.0)
|
39
|
-
rspec (3.8.0)
|
40
|
-
rspec-core (~> 3.8.0)
|
41
|
-
rspec-expectations (~> 3.8.0)
|
42
|
-
rspec-mocks (~> 3.8.0)
|
43
|
-
rspec-core (3.8.0)
|
44
|
-
rspec-support (~> 3.8.0)
|
45
|
-
rspec-expectations (3.8.1)
|
46
|
-
diff-lcs (>= 1.2.0, < 2.0)
|
47
|
-
rspec-support (~> 3.8.0)
|
48
|
-
rspec-mocks (3.8.0)
|
49
|
-
diff-lcs (>= 1.2.0, < 2.0)
|
50
|
-
rspec-support (~> 3.8.0)
|
51
|
-
rspec-support (3.8.0)
|
52
|
-
tensor_stream (0.9.0)
|
53
|
-
chunky_png
|
54
|
-
concurrent-ruby
|
55
|
-
deep_merge
|
56
|
-
|
57
|
-
PLATFORMS
|
58
|
-
ruby
|
59
|
-
|
60
|
-
DEPENDENCIES
|
61
|
-
awesome_print
|
62
|
-
bundler (~> 1.16)
|
63
|
-
mnist-learn
|
64
|
-
pry-byebug
|
65
|
-
rake (~> 10.0)
|
66
|
-
rspec (~> 3.0)
|
67
|
-
tensor_stream-opencl!
|
68
|
-
|
69
|
-
BUNDLED WITH
|
70
|
-
1.16.2
|
data/samples/mnist_data.rb
DELETED
@@ -1,65 +0,0 @@
-# A ruby port of the example code discussed by Martin Gorner in
-# "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)""
-#
-# https://www.youtube.com/watch?v=u4alGiomYP4
-#
-# Requirements:
-#   mnist-learn gem
-#   opencl_ruby_ffi gem
-require "bundler/setup"
-require 'tensor_stream'
-require 'mnist-learn'
-
-# Enable OpenCL hardware accelerated computation, not using OpenCL can be very slow
-# require 'tensor_stream/opencl'
-
-tf = TensorStream
-
-# Import MNIST data
-puts "downloading minst data"
-mnist = Mnist.read_data_sets('/tmp/data', one_hot: true)
-puts "downloading finished"
-
-x = tf.placeholder(:float32, shape: [nil, 784])
-w = tf.variable(tf.zeros([784, 10]))
-b = tf.variable(tf.zeros([10]))
-
-
-
-# model
-y = tf.nn.softmax(tf.matmul(tf.reshape(x, [-1, 784]), w) + b)
-
-y_ = tf.placeholder(:float32, shape: [nil, 10])
-
-# loss function
-cross_entropy = -tf.reduce_sum(y_ * tf.log(y))
-
-is_correct = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
-accuracy = tf.reduce_mean(tf.cast(is_correct, :float32))
-
-optimizer = TensorStream::Train::AdamOptimizer.new
-train_step = optimizer.minimize(cross_entropy)
-
-sess = tf.session
-init = tf.global_variables_initializer
-sess.run(init)
-
-(0...1000).each do |i|
-  # load batch of images and correct answers
-  batch_x, batch_y = mnist.train.next_batch(100)
-  train_data = { x => batch_x, y_ => batch_y }
-
-  # train
-  sess.run(train_step, feed_dict: train_data)
-  if (i % 10 == 0)
-    # success? add code to print it
-    a, c = sess.run([accuracy, cross_entropy], feed_dict: train_data)
-    puts "#{i} train accuracy #{a}, error #{c}"
-
-    # success on test data?
-    test_data = { x => mnist.test.images, y_ => mnist.test.labels }
-    a, c = sess.run([accuracy, cross_entropy], feed_dict: test_data)
-    puts " test accuracy #{a}, error #{c}"
-  end
-end
-