tensor_stream 0.6.1 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +10 -0
- data/CHANGELOG.md +8 -0
- data/README.md +40 -1
- data/benchmark/benchmark.rb +4 -1
- data/lib/tensor_stream.rb +5 -0
- data/lib/tensor_stream/debugging/debugging.rb +4 -2
- data/lib/tensor_stream/device.rb +2 -1
- data/lib/tensor_stream/evaluator/base_evaluator.rb +43 -32
- data/lib/tensor_stream/evaluator/evaluator.rb +0 -1
- data/lib/tensor_stream/evaluator/opencl/kernels/acos.cl +8 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/apply_gradient.cl +9 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/asin.cl +9 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/floor_mod.cl +3 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/log_softmax.cl +26 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/max.cl +5 -5
- data/lib/tensor_stream/evaluator/opencl/kernels/min.cl +46 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/real_div.cl +3 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl +27 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross_grad.cl +28 -0
- data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb +5 -6
- data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb +200 -265
- data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +4 -8
- data/lib/tensor_stream/evaluator/ruby_evaluator.rb +193 -122
- data/lib/tensor_stream/exceptions.rb +6 -0
- data/lib/tensor_stream/graph.rb +21 -6
- data/lib/tensor_stream/graph_builder.rb +67 -0
- data/lib/tensor_stream/graph_deserializers/protobuf.rb +271 -0
- data/lib/tensor_stream/graph_keys.rb +1 -0
- data/lib/tensor_stream/graph_serializers/pbtext.rb +11 -10
- data/lib/tensor_stream/helpers/op_helper.rb +7 -33
- data/lib/tensor_stream/helpers/string_helper.rb +16 -0
- data/lib/tensor_stream/math_gradients.rb +67 -44
- data/lib/tensor_stream/nn/nn_ops.rb +7 -1
- data/lib/tensor_stream/operation.rb +14 -27
- data/lib/tensor_stream/ops.rb +82 -29
- data/lib/tensor_stream/session.rb +4 -0
- data/lib/tensor_stream/tensor.rb +30 -12
- data/lib/tensor_stream/tensor_shape.rb +1 -1
- data/lib/tensor_stream/train/gradient_descent_optimizer.rb +37 -4
- data/lib/tensor_stream/train/saver.rb +46 -0
- data/lib/tensor_stream/train/utils.rb +37 -0
- data/lib/tensor_stream/trainer.rb +2 -0
- data/lib/tensor_stream/utils.rb +24 -14
- data/lib/tensor_stream/variable.rb +5 -11
- data/lib/tensor_stream/variable_scope.rb +15 -0
- data/lib/tensor_stream/version.rb +1 -1
- data/samples/iris.rb +8 -4
- data/samples/linear_regression.rb +1 -1
- data/samples/multigpu.rb +73 -0
- data/samples/nearest_neighbor.rb +3 -3
- data/tensor_stream.gemspec +1 -1
- data/test_samples/raw_neural_net_sample.rb +4 -1
- metadata +21 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d42a81e850271f080d408c52f2bea15a07c6d41ee3c6790dc04e48f2ab485364
|
4
|
+
data.tar.gz: a4aedfd3c9a532f31ea195c58124644fcae143726d37daa7a4a6afc6b39f439b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7d9fff1a8af14878c50469cfcee7942d2800906fe388504261290958628861b29d973ea65a3cf986cd1657acf490d2bc7164ec9ec2c14dc7c5d6c25121c6737f
|
7
|
+
data.tar.gz: 42ef6af8fafd1a7f7f069e03f8c344bec87bd15217def07d859d4b33374a038e1b7ed54ac685b901d3ed9b51fd351300cc61553ca45f0591fc4a7c5e50bcee53
|
data/.rubocop.yml
CHANGED
@@ -6,6 +6,12 @@ AllCops:
|
|
6
6
|
- tensor_stream.gemspec
|
7
7
|
- Rakefile
|
8
8
|
|
9
|
+
Style/StringLiterals:
|
10
|
+
Enabled: false
|
11
|
+
|
12
|
+
Layout/TrailingBlankLines:
|
13
|
+
Enabled: false
|
14
|
+
|
9
15
|
Metrics/LineLength:
|
10
16
|
Max: 200
|
11
17
|
|
@@ -21,6 +27,10 @@ Metrics/MethodLength:
|
|
21
27
|
Metrics/CyclomaticComplexity:
|
22
28
|
Enabled: false
|
23
29
|
|
30
|
+
Metrics/BlockLength:
|
31
|
+
Exclude:
|
32
|
+
- lib/tensor_stream/math_gradients.rb
|
33
|
+
|
24
34
|
Naming/AccessorMethodName:
|
25
35
|
Exclude:
|
26
36
|
- lib/tensor_stream.rb
|
data/CHANGELOG.md
CHANGED
@@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file.
|
|
4
4
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
|
5
5
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
6
6
|
|
7
|
+
## [0.7.0] - 2018-08-08
|
8
|
+
### Added
|
9
|
+
- [NEW OP] expand_dims, min, acos, asin, add_n
|
10
|
+
- Add parse_from_string support. Parse tensorflow pbtext files into tensor_stream
|
11
|
+
|
12
|
+
### Fixes
|
13
|
+
- Tweaks to GradientDescentOptimizer to expose additional methods based on tensorflow
|
14
|
+
|
7
15
|
## [0.6.0] - 2018-07-21
|
8
16
|
### Added
|
9
17
|
- [NEW OP] fill, floor_div, dynamic_stitch, mod, range, size, squared_difference
|
data/README.md
CHANGED
@@ -15,6 +15,7 @@ The goal of this gem is to have a high performance machine learning and compute
|
|
15
15
|
- Provision to use your own opcode evaluator (opencl, sciruby and tensorflow backends planned)
|
16
16
|
- Goal is to be as close to TensorFlow in behavior but with some freedom to add ruby specific enhancements (with lots of test cases)
|
17
17
|
- eager execution (experimental)
|
18
|
+
- (08-08-2018) Load pbtext files from tensorflow (Graph.parse_from_string)
|
18
19
|
|
19
20
|
Since this is a pure ruby implementation for now, performance is not there yet. However it should be a good enough environment to learn about tensorflow and experiment with some models.
|
20
21
|
|
@@ -69,7 +70,7 @@ b = tf.variable(rand, name: "bias")
|
|
69
70
|
pred = X * W + b
|
70
71
|
|
71
72
|
# Mean squared error
|
72
|
-
cost =
|
73
|
+
cost = ((pred - Y) ** 2).reduce(:+) / ( 2 * n_samples)
|
73
74
|
|
74
75
|
optimizer = TensorStream::Train::GradientDescentOptimizer.new(learning_rate).minimize(cost)
|
75
76
|
|
@@ -255,6 +256,44 @@ Note that the OpenCL evaluator provides speedup if you are using large tensors,
|
|
255
256
|
|
256
257
|
samples/nearest_neighbor.rb contains a sample that uses opencl.
|
257
258
|
|
259
|
+
## Export Import Models from tensorflow
|
260
|
+
|
261
|
+
Experimental support for parsing and exporting pbtext files is available:
|
262
|
+
|
263
|
+
Exporting
|
264
|
+
|
265
|
+
```ruby
|
266
|
+
a = ts.constant([1.0, 1.0])
|
267
|
+
b = ts.constant([1.5, 1.5])
|
268
|
+
f = a + b
|
269
|
+
|
270
|
+
File.write('my_model.pbtext', f.graph.as_graph_def)
|
271
|
+
```
|
272
|
+
|
273
|
+
Importing (Experimental)
|
274
|
+
|
275
|
+
Note that not all tensorflow ops are supported; warnings will be shown
|
276
|
+
if a certain operation is not supported yet.
|
277
|
+
|
278
|
+
|
279
|
+
```ruby
|
280
|
+
pbtext = File.read(File.join('linear_regression.pbtxt'))
|
281
|
+
|
282
|
+
# create a graph from pbtext file
|
283
|
+
graph = TensorStream::Graph.parse_from_string(pbtext)
|
284
|
+
|
285
|
+
# reference a tensor by name from the created graph,
|
286
|
+
# for example you have a tensor named out
|
287
|
+
tensor = graph.get_tensor_by_name("out")
|
288
|
+
|
289
|
+
# set graph as default and do operations on it
|
290
|
+
graph.as_default do
|
291
|
+
sess = ts.session
|
292
|
+
expect(tr(sess.run(tensor))).to eq([[1.0, 1.0], [1.0, 1.0]])
|
293
|
+
end
|
294
|
+
|
295
|
+
```
|
296
|
+
|
258
297
|
# Visualization
|
259
298
|
|
260
299
|
tensorstream does not support tensorboard yet, but a graphml generator is included:
|
data/benchmark/benchmark.rb
CHANGED
@@ -8,7 +8,7 @@ require 'tensor_stream/evaluator/opencl/opencl_evaluator'
|
|
8
8
|
def tr(t, places = 1)
|
9
9
|
if t.is_a?(Array)
|
10
10
|
return t.collect do |v|
|
11
|
-
tr(v)
|
11
|
+
tr(v, places)
|
12
12
|
end
|
13
13
|
end
|
14
14
|
|
@@ -59,6 +59,7 @@ pow_i = tf.pow(a_int, 3)
|
|
59
59
|
matmul = tf.matmul(a, b)
|
60
60
|
out_of_order = tf.matmul(a, b) + tf.matmul(a, c)
|
61
61
|
softmax = tf.nn.softmax(a)
|
62
|
+
add_n = tf.add_n([a,b,c,d])
|
62
63
|
|
63
64
|
puts TensorStream::Evaluator.default_evaluators
|
64
65
|
|
@@ -68,6 +69,8 @@ puts `cat /proc/cpuinfo | grep "model name" | head -1`
|
|
68
69
|
device = TensorStream::Evaluator::OpenclEvaluator.default_device.native_device
|
69
70
|
puts "OpenCL device #{device.platform.to_s} #{device.name}"
|
70
71
|
Benchmark.bmbm do |x|
|
72
|
+
x.report("pure ruby add_n :") { 100.times do sess.run(add_n) end }
|
73
|
+
x.report("opencl ruby add_n :") { 100.times do sess2.run(add_n) end }
|
71
74
|
x.report("pure ruby ooo matmul :") { 100.times do sess.run(out_of_order) end }
|
72
75
|
x.report("opencl ooo matmul :") { 100.times do sess2.run(out_of_order) end }
|
73
76
|
x.report("pure ruby softmax :") { 100.times do sess.run(softmax) end }
|
data/lib/tensor_stream.rb
CHANGED
@@ -2,25 +2,30 @@ require 'tensor_stream/version'
|
|
2
2
|
require 'deep_merge'
|
3
3
|
require 'matrix'
|
4
4
|
require 'concurrent'
|
5
|
+
require 'tensor_stream/exceptions'
|
5
6
|
require 'tensor_stream/helpers/op_helper'
|
6
7
|
require 'tensor_stream/helpers/string_helper'
|
7
8
|
require 'tensor_stream/initializer'
|
8
9
|
require 'tensor_stream/graph_keys'
|
9
10
|
require 'tensor_stream/types'
|
11
|
+
require 'tensor_stream/graph_builder'
|
10
12
|
require 'tensor_stream/graph'
|
11
13
|
require 'tensor_stream/device'
|
12
14
|
require 'tensor_stream/session'
|
13
15
|
require 'tensor_stream/tensor_shape'
|
14
16
|
require 'tensor_stream/tensor'
|
15
17
|
require 'tensor_stream/variable'
|
18
|
+
require 'tensor_stream/variable_scope'
|
16
19
|
require 'tensor_stream/operation'
|
17
20
|
require 'tensor_stream/placeholder'
|
18
21
|
require 'tensor_stream/control_flow'
|
19
22
|
require 'tensor_stream/dynamic_stitch'
|
23
|
+
require 'tensor_stream/train/utils'
|
20
24
|
require 'tensor_stream/trainer'
|
21
25
|
require 'tensor_stream/nn/nn_ops'
|
22
26
|
require 'tensor_stream/evaluator/evaluator'
|
23
27
|
require 'tensor_stream/graph_serializers/serializer'
|
28
|
+
require 'tensor_stream/graph_deserializers/protobuf'
|
24
29
|
require 'tensor_stream/graph_serializers/pbtext'
|
25
30
|
require 'tensor_stream/graph_serializers/graphml'
|
26
31
|
require 'tensor_stream/math_gradients'
|
@@ -4,10 +4,12 @@ module TensorStream
|
|
4
4
|
|
5
5
|
def add_check_numerics_ops
|
6
6
|
graph = TensorStream.get_default_graph
|
7
|
-
nodes_to_process
|
7
|
+
nodes_to_process = graph.nodes.values.select { |node| node.is_a?(Operation) }
|
8
8
|
|
9
9
|
nodes_to_process.each do |node|
|
10
|
-
node.inputs = node.inputs.
|
10
|
+
node.inputs = node.inputs.collect do |input|
|
11
|
+
next if input.nil?
|
12
|
+
|
11
13
|
if TensorStream::Ops::FLOATING_POINT_TYPES.include?(input.data_type)
|
12
14
|
TensorStream.check_numerics(input, "#{node.name}/#{input.name}", name: "check/#{node.name}/#{input.name}" )
|
13
15
|
else
|
data/lib/tensor_stream/device.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
# A tensorstream device
|
2
2
|
module TensorStream
|
3
|
+
# Class that describes a supported device
|
3
4
|
class Device
|
4
5
|
attr_accessor :name, :type, :evaluator
|
5
6
|
def initialize(name, type, evaluator)
|
@@ -8,4 +9,4 @@ module TensorStream
|
|
8
9
|
@evaluator = evaluator
|
9
10
|
end
|
10
11
|
end
|
11
|
-
end
|
12
|
+
end
|
@@ -1,5 +1,13 @@
|
|
1
1
|
module TensorStream
|
2
|
+
# Evaluator base module
|
2
3
|
module Evaluator
|
4
|
+
class OutputGroup
|
5
|
+
attr_accessor :outputs
|
6
|
+
def initialize(outputs = [])
|
7
|
+
@outputs = outputs
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
3
11
|
class UnsupportedOp < Exception
|
4
12
|
def initialize(tensor)
|
5
13
|
@tensor = tensor
|
@@ -10,31 +18,36 @@ module TensorStream
|
|
10
18
|
end
|
11
19
|
end
|
12
20
|
|
21
|
+
# Evaluator base class
|
13
22
|
class BaseEvaluator
|
14
|
-
def initialize(session,
|
23
|
+
def initialize(session, _device, thread_pool: nil, log_intermediates: false)
|
15
24
|
@session = session
|
16
25
|
@log_intermediates = log_intermediates
|
17
26
|
@thread_pool = thread_pool || Concurrent::ImmediateExecutor.new
|
18
27
|
@context[:compute_history] = [] if log_intermediates
|
19
28
|
end
|
20
29
|
|
30
|
+
##
|
31
|
+
# Query all supported devices
|
21
32
|
def self.query_supported_devices
|
22
|
-
[Device.new(
|
33
|
+
[Device.new('cpu', :cpu, self)]
|
23
34
|
end
|
24
35
|
|
25
36
|
##
|
26
37
|
# Select the best device available in the system for this evaluator
|
27
38
|
def self.default_device
|
28
|
-
Device.new(
|
39
|
+
Device.new('cpu', :cpu, self)
|
29
40
|
end
|
30
41
|
|
31
42
|
##
|
32
43
|
# Selects the best device with the specified query, query can
|
33
44
|
# be evaluator specific
|
34
|
-
def self.fetch_device(
|
35
|
-
Device.new(
|
45
|
+
def self.fetch_device(_query = [])
|
46
|
+
Device.new('cpu', :cpu, self)
|
36
47
|
end
|
37
48
|
|
49
|
+
##
|
50
|
+
# Select device using uri
|
38
51
|
def self.query_device(query)
|
39
52
|
return default_device if query.nil? || query == :default
|
40
53
|
|
@@ -52,8 +65,8 @@ module TensorStream
|
|
52
65
|
|
53
66
|
select_index = [devices.size - 1, select_index].min
|
54
67
|
return devices[select_index]
|
55
|
-
elsif components[0]
|
56
|
-
device_type =
|
68
|
+
elsif %w[cpu gpu].include?(components[0])
|
69
|
+
device_type = components[0].to_sym
|
57
70
|
select_index = components[1].to_i
|
58
71
|
|
59
72
|
devices = all_devices.select { |d| d.type == device_type.downcase.to_sym }
|
@@ -91,34 +104,32 @@ module TensorStream
|
|
91
104
|
|
92
105
|
def invoke(tensor, execution_context)
|
93
106
|
return eval_tensor(tensor, execution_context) unless tensor.is_a?(Operation)
|
107
|
+
raise UnsupportedOp.new(tensor), "op #{tensor.operation} is not yet supported" unless self.class.ops.key?(tensor.operation.to_sym)
|
94
108
|
|
95
|
-
|
96
|
-
|
109
|
+
op = self.class.ops[tensor.operation.to_sym]
|
110
|
+
op_options = op[:options]
|
97
111
|
|
98
|
-
|
99
|
-
|
100
|
-
next if i.nil?
|
112
|
+
resolved_inputs = tensor.inputs.map do |i|
|
113
|
+
next if i.nil?
|
101
114
|
|
102
|
-
|
103
|
-
|
104
|
-
|
115
|
+
if i.is_a?(Array)
|
116
|
+
next i.collect { |sub_item| sub_item.is_a?(Tensor) ? invoke(sub_item, execution_context) : sub_item }
|
117
|
+
end
|
105
118
|
|
106
|
-
|
107
|
-
|
108
|
-
|
119
|
+
if !op_options[:noop] && @context[:_cache][:placement][tensor.name] != @context[:_cache][:placement][i.name] # tensor is on another device or evaluator
|
120
|
+
cache_key = "#{tensor.graph.object_id}_#{i.name}:#{object_id}"
|
121
|
+
next @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
|
109
122
|
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
end
|
114
|
-
else
|
115
|
-
prepare_input(i, execution_context, op_options)
|
123
|
+
result = @session.delegate_to_evaluator(i, @context, execution_context)
|
124
|
+
convert_from_buffer(i, result).tap do |buffer|
|
125
|
+
@context[:_cache][cache_key] = buffer if i.is_const
|
116
126
|
end
|
127
|
+
else
|
128
|
+
prepare_input(i, execution_context, op_options)
|
117
129
|
end
|
118
|
-
instance_exec(execution_context, tensor, resolved_inputs, &op[:block])
|
119
|
-
else
|
120
|
-
raise UnsupportedOp.new(tensor)
|
121
130
|
end
|
131
|
+
|
132
|
+
instance_exec(execution_context, tensor, resolved_inputs, &op[:block])
|
122
133
|
end
|
123
134
|
|
124
135
|
protected
|
@@ -128,13 +139,13 @@ module TensorStream
|
|
128
139
|
|
129
140
|
input_a_args = []
|
130
141
|
input_b_args = []
|
131
|
-
|
132
|
-
input_a = input_b.size
|
133
|
-
input_b = input_a.size
|
142
|
+
|
143
|
+
input_a = Array.new(input_b.size) { |i| i < input_a.size ? input_a[i] : nil }.reverse if input_a.size < input_b.size
|
144
|
+
input_b = Array.new(input_a.size) { |i| i < input_b.size ? input_b[i] : nil }.reverse if input_a.size > input_b.size
|
134
145
|
|
135
146
|
input_a.reverse.zip(input_b.reverse).each_with_index do |item, index|
|
136
147
|
a, b = item
|
137
|
-
|
148
|
+
|
138
149
|
if a.nil? || b && (a < b)
|
139
150
|
input_a_args << input_b.size - index - 1
|
140
151
|
elsif b.nil? || a && (a > b)
|
@@ -142,7 +153,7 @@ module TensorStream
|
|
142
153
|
end
|
143
154
|
end
|
144
155
|
|
145
|
-
|
156
|
+
[input_a_args.reverse, input_b_args.reverse]
|
146
157
|
end
|
147
158
|
|
148
159
|
##
|
@@ -0,0 +1,8 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
2
|
+
__kernel void acos_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
|
+
// Get the index of the current element to be processed
|
4
|
+
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
5
|
+
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
6
|
+
|
7
|
+
C[globalRow * N + globalCol] = acos(A[globalRow * N + globalCol]);
|
8
|
+
}
|
@@ -0,0 +1,9 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
2
|
+
// in-place update: C -= A * B[0] (B supplies a single scalar factor)
|
3
|
+
__kernel void apply_gradient_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
4
|
+
// Get the index of the current element to be processed
|
5
|
+
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
6
|
+
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
7
|
+
|
8
|
+
C[globalRow * N + globalCol] -= A[globalRow * N + globalCol] * B[0];
|
9
|
+
}
|
@@ -0,0 +1,9 @@
|
|
1
|
+
|
2
|
+
% c_dtype = dtype_to_c_type(dtype)
|
3
|
+
__kernel void asin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
4
|
+
// Get the index of the current element to be processed
|
5
|
+
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
6
|
+
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
7
|
+
|
8
|
+
C[globalRow * N + globalCol] = asin(A[globalRow * N + globalCol]);
|
9
|
+
}
|
@@ -0,0 +1,26 @@
|
|
1
|
+
// First naive implementation
|
2
|
+
% c_dtype = dtype_to_c_type(dtype)
|
3
|
+
__kernel void log_softmax_<%= dtype %>(const int N,
|
4
|
+
const __global <%= c_dtype %>* A,
|
5
|
+
__global <%= c_dtype %>* C) {
|
6
|
+
|
7
|
+
// Get the index of the current element to be processed
|
8
|
+
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
9
|
+
|
10
|
+
// Compute a single element (loop over K)
|
11
|
+
<%= c_dtype %> acc = 0.0f;
|
12
|
+
<%= c_dtype %> max = <%= min_value_for(dtype) %>;
|
13
|
+
|
14
|
+
for (int k=0; k<N; k++) {
|
15
|
+
max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
|
16
|
+
}
|
17
|
+
|
18
|
+
for (int k=0; k<N; k++) {
|
19
|
+
acc += exp(A[globalRow*N + k] - max);
|
20
|
+
}
|
21
|
+
|
22
|
+
// Store the result
|
23
|
+
for (int k=0; k < N; k++) {
|
24
|
+
C[globalRow*N + k] = (A[globalRow*N + k] - max) - log(acc);
|
25
|
+
}
|
26
|
+
}
|
@@ -5,7 +5,7 @@
|
|
5
5
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
6
6
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
7
7
|
|
8
|
-
C[globalRow * N + globalCol] = A[globalRow * N + globalCol]
|
8
|
+
C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
|
9
9
|
}
|
10
10
|
|
11
11
|
// 1D + Scalar floating point add op
|
@@ -15,9 +15,9 @@
|
|
15
15
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
16
16
|
|
17
17
|
if (switch_op == 0) {
|
18
|
-
C[globalRow * N + globalCol] = A[globalRow * N + globalCol]
|
18
|
+
C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[0] ? A[globalRow * N + globalCol] : B[0];
|
19
19
|
} else {
|
20
|
-
C[globalRow * N + globalCol] = B[0]
|
20
|
+
C[globalRow * N + globalCol] = B[0] >= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
|
21
21
|
}
|
22
22
|
}
|
23
23
|
|
@@ -39,8 +39,8 @@
|
|
39
39
|
}
|
40
40
|
|
41
41
|
if (switch_op == 0) {
|
42
|
-
C[globalRow * N + globalCol] = A[globalRow * N + globalCol]
|
42
|
+
C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
|
43
43
|
} else {
|
44
|
-
C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index]
|
44
|
+
C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] >= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
|
45
45
|
}
|
46
46
|
}
|