tensor_stream 0.6.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +10 -0
- data/CHANGELOG.md +8 -0
- data/README.md +40 -1
- data/benchmark/benchmark.rb +4 -1
- data/lib/tensor_stream.rb +5 -0
- data/lib/tensor_stream/debugging/debugging.rb +4 -2
- data/lib/tensor_stream/device.rb +2 -1
- data/lib/tensor_stream/evaluator/base_evaluator.rb +43 -32
- data/lib/tensor_stream/evaluator/evaluator.rb +0 -1
- data/lib/tensor_stream/evaluator/opencl/kernels/acos.cl +8 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/apply_gradient.cl +9 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/asin.cl +9 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/floor_mod.cl +3 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/log_softmax.cl +26 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/max.cl +5 -5
- data/lib/tensor_stream/evaluator/opencl/kernels/min.cl +46 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/real_div.cl +3 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl +27 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross_grad.cl +28 -0
- data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb +5 -6
- data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb +200 -265
- data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +4 -8
- data/lib/tensor_stream/evaluator/ruby_evaluator.rb +193 -122
- data/lib/tensor_stream/exceptions.rb +6 -0
- data/lib/tensor_stream/graph.rb +21 -6
- data/lib/tensor_stream/graph_builder.rb +67 -0
- data/lib/tensor_stream/graph_deserializers/protobuf.rb +271 -0
- data/lib/tensor_stream/graph_keys.rb +1 -0
- data/lib/tensor_stream/graph_serializers/pbtext.rb +11 -10
- data/lib/tensor_stream/helpers/op_helper.rb +7 -33
- data/lib/tensor_stream/helpers/string_helper.rb +16 -0
- data/lib/tensor_stream/math_gradients.rb +67 -44
- data/lib/tensor_stream/nn/nn_ops.rb +7 -1
- data/lib/tensor_stream/operation.rb +14 -27
- data/lib/tensor_stream/ops.rb +82 -29
- data/lib/tensor_stream/session.rb +4 -0
- data/lib/tensor_stream/tensor.rb +30 -12
- data/lib/tensor_stream/tensor_shape.rb +1 -1
- data/lib/tensor_stream/train/gradient_descent_optimizer.rb +37 -4
- data/lib/tensor_stream/train/saver.rb +46 -0
- data/lib/tensor_stream/train/utils.rb +37 -0
- data/lib/tensor_stream/trainer.rb +2 -0
- data/lib/tensor_stream/utils.rb +24 -14
- data/lib/tensor_stream/variable.rb +5 -11
- data/lib/tensor_stream/variable_scope.rb +15 -0
- data/lib/tensor_stream/version.rb +1 -1
- data/samples/iris.rb +8 -4
- data/samples/linear_regression.rb +1 -1
- data/samples/multigpu.rb +73 -0
- data/samples/nearest_neighbor.rb +3 -3
- data/tensor_stream.gemspec +1 -1
- data/test_samples/raw_neural_net_sample.rb +4 -1
- metadata +21 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d42a81e850271f080d408c52f2bea15a07c6d41ee3c6790dc04e48f2ab485364
|
4
|
+
data.tar.gz: a4aedfd3c9a532f31ea195c58124644fcae143726d37daa7a4a6afc6b39f439b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7d9fff1a8af14878c50469cfcee7942d2800906fe388504261290958628861b29d973ea65a3cf986cd1657acf490d2bc7164ec9ec2c14dc7c5d6c25121c6737f
|
7
|
+
data.tar.gz: 42ef6af8fafd1a7f7f069e03f8c344bec87bd15217def07d859d4b33374a038e1b7ed54ac685b901d3ed9b51fd351300cc61553ca45f0591fc4a7c5e50bcee53
|
data/.rubocop.yml
CHANGED
@@ -6,6 +6,12 @@ AllCops:
|
|
6
6
|
- tensor_stream.gemspec
|
7
7
|
- Rakefile
|
8
8
|
|
9
|
+
Style/StringLiterals:
|
10
|
+
Enabled: false
|
11
|
+
|
12
|
+
Layout/TrailingBlankLines:
|
13
|
+
Enabled: false
|
14
|
+
|
9
15
|
Metrics/LineLength:
|
10
16
|
Max: 200
|
11
17
|
|
@@ -21,6 +27,10 @@ Metrics/MethodLength:
|
|
21
27
|
Metrics/CyclomaticComplexity:
|
22
28
|
Enabled: false
|
23
29
|
|
30
|
+
Metrics/BlockLength:
|
31
|
+
Exclude:
|
32
|
+
- lib/tensor_stream/math_gradients.rb
|
33
|
+
|
24
34
|
Naming/AccessorMethodName:
|
25
35
|
Exclude:
|
26
36
|
- lib/tensor_stream.rb
|
data/CHANGELOG.md
CHANGED
@@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file.
|
|
4
4
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
|
5
5
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
6
6
|
|
7
|
+
## [0.7.0] - 2018-08-08
|
8
|
+
### Added
|
9
|
+
- [NEW OP] expand_dims, min, acos, asin, add_n
|
10
|
+
- Add parse_from_string support. Parse tensorflow pbtext files into tensor_stream
|
11
|
+
|
12
|
+
### Fixes
|
13
|
+
- Tweaks to GradientDescentOptimizer to expose additional methods based on tensorflow
|
14
|
+
|
7
15
|
## [0.6.0] - 2018-07-21
|
8
16
|
### Added
|
9
17
|
- [NEW OP] fill, floor_div, dynamic_stitch, mod, range, size, squared_difference
|
data/README.md
CHANGED
@@ -15,6 +15,7 @@ The goal of this gem is to have a high performance machine learning and compute
|
|
15
15
|
- Provision to use your own opcode evaluator (opencl, sciruby and tensorflow backends planned)
|
16
16
|
- Goal is to be as close to TensorFlow in behavior but with some freedom to add ruby specific enhancements (with lots of test cases)
|
17
17
|
- eager execution (experimental)
|
18
|
+
- (08-08-2018) Load pbtext files from tensorflow (Graph.parse_from_string)
|
18
19
|
|
19
20
|
Since this is a pure ruby implementation for now, performance is not there yet. However it should be a good enough environment to learn about tensorflow and experiment with some models.
|
20
21
|
|
@@ -69,7 +70,7 @@ b = tf.variable(rand, name: "bias")
|
|
69
70
|
pred = X * W + b
|
70
71
|
|
71
72
|
# Mean squared error
|
72
|
-
cost =
|
73
|
+
cost = ((pred - Y) ** 2).reduce(:+) / ( 2 * n_samples)
|
73
74
|
|
74
75
|
optimizer = TensorStream::Train::GradientDescentOptimizer.new(learning_rate).minimize(cost)
|
75
76
|
|
@@ -255,6 +256,44 @@ Note that the OpenCL evaluator provides speedup if you are using large tensors,
|
|
255
256
|
|
256
257
|
samples/nearest_neighbor.rb contains a sample that uses opencl.
|
257
258
|
|
259
|
+
## Exporting and Importing Models from tensorflow
|
260
|
+
|
261
|
+
Parsing and exporting pbtext files is experimentally supported:
|
262
|
+
|
263
|
+
Exporting
|
264
|
+
|
265
|
+
```ruby
|
266
|
+
a = ts.constant([1.0, 1.0])
|
267
|
+
b = ts.constant([1.5, 1.5])
|
268
|
+
f = a + b
|
269
|
+
|
270
|
+
File.write('my_model.pbtext', f.graph.as_graph_def)
|
271
|
+
```
|
272
|
+
|
273
|
+
Importing (Experimental)
|
274
|
+
|
275
|
+
Note that not all tensorflow ops are supported; warnings will be shown
|
276
|
+
if a certain operation is not supported yet.
|
277
|
+
|
278
|
+
|
279
|
+
```ruby
|
280
|
+
pbtext = File.read(File.join('linear_regression.pbtxt'))
|
281
|
+
|
282
|
+
# create a graph from pbtext file
|
283
|
+
graph = TensorStream::Graph.parse_from_string(pbtext)
|
284
|
+
|
285
|
+
# reference a tensor by name from the created graph,
|
286
|
+
# for example you have a tensor named out
|
287
|
+
tensor = graph.get_tensor_by_name("out")
|
288
|
+
|
289
|
+
# set graph as default and do operations on it
|
290
|
+
graph.as_default do
|
291
|
+
sess = ts.session
|
292
|
+
expect(tr(sess.run(tensor))).to eq([[1.0, 1.0], [1.0, 1.0]])
|
293
|
+
end
|
294
|
+
|
295
|
+
```
|
296
|
+
|
258
297
|
# Visualization
|
259
298
|
|
260
299
|
tensorstream does not support tensorboard yet, but a graphml generator is included:
|
data/benchmark/benchmark.rb
CHANGED
@@ -8,7 +8,7 @@ require 'tensor_stream/evaluator/opencl/opencl_evaluator'
|
|
8
8
|
def tr(t, places = 1)
|
9
9
|
if t.is_a?(Array)
|
10
10
|
return t.collect do |v|
|
11
|
-
tr(v)
|
11
|
+
tr(v, places)
|
12
12
|
end
|
13
13
|
end
|
14
14
|
|
@@ -59,6 +59,7 @@ pow_i = tf.pow(a_int, 3)
|
|
59
59
|
matmul = tf.matmul(a, b)
|
60
60
|
out_of_order = tf.matmul(a, b) + tf.matmul(a, c)
|
61
61
|
softmax = tf.nn.softmax(a)
|
62
|
+
add_n = tf.add_n([a,b,c,d])
|
62
63
|
|
63
64
|
puts TensorStream::Evaluator.default_evaluators
|
64
65
|
|
@@ -68,6 +69,8 @@ puts `cat /proc/cpuinfo | grep "model name" | head -1`
|
|
68
69
|
device = TensorStream::Evaluator::OpenclEvaluator.default_device.native_device
|
69
70
|
puts "OpenCL device #{device.platform.to_s} #{device.name}"
|
70
71
|
Benchmark.bmbm do |x|
|
72
|
+
x.report("pure ruby add_n :") { 100.times do sess.run(add_n) end }
|
73
|
+
x.report("opencl ruby add_n :") { 100.times do sess2.run(add_n) end }
|
71
74
|
x.report("pure ruby ooo matmul :") { 100.times do sess.run(out_of_order) end }
|
72
75
|
x.report("opencl ooo matmul :") { 100.times do sess2.run(out_of_order) end }
|
73
76
|
x.report("pure ruby softmax :") { 100.times do sess.run(softmax) end }
|
data/lib/tensor_stream.rb
CHANGED
@@ -2,25 +2,30 @@ require 'tensor_stream/version'
|
|
2
2
|
require 'deep_merge'
|
3
3
|
require 'matrix'
|
4
4
|
require 'concurrent'
|
5
|
+
require 'tensor_stream/exceptions'
|
5
6
|
require 'tensor_stream/helpers/op_helper'
|
6
7
|
require 'tensor_stream/helpers/string_helper'
|
7
8
|
require 'tensor_stream/initializer'
|
8
9
|
require 'tensor_stream/graph_keys'
|
9
10
|
require 'tensor_stream/types'
|
11
|
+
require 'tensor_stream/graph_builder'
|
10
12
|
require 'tensor_stream/graph'
|
11
13
|
require 'tensor_stream/device'
|
12
14
|
require 'tensor_stream/session'
|
13
15
|
require 'tensor_stream/tensor_shape'
|
14
16
|
require 'tensor_stream/tensor'
|
15
17
|
require 'tensor_stream/variable'
|
18
|
+
require 'tensor_stream/variable_scope'
|
16
19
|
require 'tensor_stream/operation'
|
17
20
|
require 'tensor_stream/placeholder'
|
18
21
|
require 'tensor_stream/control_flow'
|
19
22
|
require 'tensor_stream/dynamic_stitch'
|
23
|
+
require 'tensor_stream/train/utils'
|
20
24
|
require 'tensor_stream/trainer'
|
21
25
|
require 'tensor_stream/nn/nn_ops'
|
22
26
|
require 'tensor_stream/evaluator/evaluator'
|
23
27
|
require 'tensor_stream/graph_serializers/serializer'
|
28
|
+
require 'tensor_stream/graph_deserializers/protobuf'
|
24
29
|
require 'tensor_stream/graph_serializers/pbtext'
|
25
30
|
require 'tensor_stream/graph_serializers/graphml'
|
26
31
|
require 'tensor_stream/math_gradients'
|
@@ -4,10 +4,12 @@ module TensorStream
|
|
4
4
|
|
5
5
|
def add_check_numerics_ops
|
6
6
|
graph = TensorStream.get_default_graph
|
7
|
-
nodes_to_process
|
7
|
+
nodes_to_process = graph.nodes.values.select { |node| node.is_a?(Operation) }
|
8
8
|
|
9
9
|
nodes_to_process.each do |node|
|
10
|
-
node.inputs = node.inputs.
|
10
|
+
node.inputs = node.inputs.collect do |input|
|
11
|
+
next if input.nil?
|
12
|
+
|
11
13
|
if TensorStream::Ops::FLOATING_POINT_TYPES.include?(input.data_type)
|
12
14
|
TensorStream.check_numerics(input, "#{node.name}/#{input.name}", name: "check/#{node.name}/#{input.name}" )
|
13
15
|
else
|
data/lib/tensor_stream/device.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
# A tensorstream device
|
2
2
|
module TensorStream
|
3
|
+
# Class that describes a supported device
|
3
4
|
class Device
|
4
5
|
attr_accessor :name, :type, :evaluator
|
5
6
|
def initialize(name, type, evaluator)
|
@@ -8,4 +9,4 @@ module TensorStream
|
|
8
9
|
@evaluator = evaluator
|
9
10
|
end
|
10
11
|
end
|
11
|
-
end
|
12
|
+
end
|
@@ -1,5 +1,13 @@
|
|
1
1
|
module TensorStream
|
2
|
+
# Evaluator base module
|
2
3
|
module Evaluator
|
4
|
+
class OutputGroup
|
5
|
+
attr_accessor :outputs
|
6
|
+
def initialize(outputs = [])
|
7
|
+
@outputs = outputs
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
3
11
|
class UnsupportedOp < Exception
|
4
12
|
def initialize(tensor)
|
5
13
|
@tensor = tensor
|
@@ -10,31 +18,36 @@ module TensorStream
|
|
10
18
|
end
|
11
19
|
end
|
12
20
|
|
21
|
+
# Evaluator base class
|
13
22
|
class BaseEvaluator
|
14
|
-
def initialize(session,
|
23
|
+
def initialize(session, _device, thread_pool: nil, log_intermediates: false)
|
15
24
|
@session = session
|
16
25
|
@log_intermediates = log_intermediates
|
17
26
|
@thread_pool = thread_pool || Concurrent::ImmediateExecutor.new
|
18
27
|
@context[:compute_history] = [] if log_intermediates
|
19
28
|
end
|
20
29
|
|
30
|
+
##
|
31
|
+
# Query all supported devices
|
21
32
|
def self.query_supported_devices
|
22
|
-
[Device.new(
|
33
|
+
[Device.new('cpu', :cpu, self)]
|
23
34
|
end
|
24
35
|
|
25
36
|
##
|
26
37
|
# Select the best device available in the system for this evaluator
|
27
38
|
def self.default_device
|
28
|
-
Device.new(
|
39
|
+
Device.new('cpu', :cpu, self)
|
29
40
|
end
|
30
41
|
|
31
42
|
##
|
32
43
|
# Selects the best device with the specified query, query can
|
33
44
|
# be evaluator specific
|
34
|
-
def self.fetch_device(
|
35
|
-
Device.new(
|
45
|
+
def self.fetch_device(_query = [])
|
46
|
+
Device.new('cpu', :cpu, self)
|
36
47
|
end
|
37
48
|
|
49
|
+
##
|
50
|
+
# Select device using uri
|
38
51
|
def self.query_device(query)
|
39
52
|
return default_device if query.nil? || query == :default
|
40
53
|
|
@@ -52,8 +65,8 @@ module TensorStream
|
|
52
65
|
|
53
66
|
select_index = [devices.size - 1, select_index].min
|
54
67
|
return devices[select_index]
|
55
|
-
elsif components[0]
|
56
|
-
device_type =
|
68
|
+
elsif %w[cpu gpu].include?(components[0])
|
69
|
+
device_type = components[0].to_sym
|
57
70
|
select_index = components[1].to_i
|
58
71
|
|
59
72
|
devices = all_devices.select { |d| d.type == device_type.downcase.to_sym }
|
@@ -91,34 +104,32 @@ module TensorStream
|
|
91
104
|
|
92
105
|
def invoke(tensor, execution_context)
|
93
106
|
return eval_tensor(tensor, execution_context) unless tensor.is_a?(Operation)
|
107
|
+
raise UnsupportedOp.new(tensor), "op #{tensor.operation} is not yet supported" unless self.class.ops.key?(tensor.operation.to_sym)
|
94
108
|
|
95
|
-
|
96
|
-
|
109
|
+
op = self.class.ops[tensor.operation.to_sym]
|
110
|
+
op_options = op[:options]
|
97
111
|
|
98
|
-
|
99
|
-
|
100
|
-
next if i.nil?
|
112
|
+
resolved_inputs = tensor.inputs.map do |i|
|
113
|
+
next if i.nil?
|
101
114
|
|
102
|
-
|
103
|
-
|
104
|
-
|
115
|
+
if i.is_a?(Array)
|
116
|
+
next i.collect { |sub_item| sub_item.is_a?(Tensor) ? invoke(sub_item, execution_context) : sub_item }
|
117
|
+
end
|
105
118
|
|
106
|
-
|
107
|
-
|
108
|
-
|
119
|
+
if !op_options[:noop] && @context[:_cache][:placement][tensor.name] != @context[:_cache][:placement][i.name] # tensor is on another device or evaluator
|
120
|
+
cache_key = "#{tensor.graph.object_id}_#{i.name}:#{object_id}"
|
121
|
+
next @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
|
109
122
|
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
end
|
114
|
-
else
|
115
|
-
prepare_input(i, execution_context, op_options)
|
123
|
+
result = @session.delegate_to_evaluator(i, @context, execution_context)
|
124
|
+
convert_from_buffer(i, result).tap do |buffer|
|
125
|
+
@context[:_cache][cache_key] = buffer if i.is_const
|
116
126
|
end
|
127
|
+
else
|
128
|
+
prepare_input(i, execution_context, op_options)
|
117
129
|
end
|
118
|
-
instance_exec(execution_context, tensor, resolved_inputs, &op[:block])
|
119
|
-
else
|
120
|
-
raise UnsupportedOp.new(tensor)
|
121
130
|
end
|
131
|
+
|
132
|
+
instance_exec(execution_context, tensor, resolved_inputs, &op[:block])
|
122
133
|
end
|
123
134
|
|
124
135
|
protected
|
@@ -128,13 +139,13 @@ module TensorStream
|
|
128
139
|
|
129
140
|
input_a_args = []
|
130
141
|
input_b_args = []
|
131
|
-
|
132
|
-
input_a = input_b.size
|
133
|
-
input_b = input_a.size
|
142
|
+
|
143
|
+
input_a = Array.new(input_b.size) { |i| i < input_a.size ? input_a[i] : nil }.reverse if input_a.size < input_b.size
|
144
|
+
input_b = Array.new(input_a.size) { |i| i < input_b.size ? input_b[i] : nil }.reverse if input_a.size > input_b.size
|
134
145
|
|
135
146
|
input_a.reverse.zip(input_b.reverse).each_with_index do |item, index|
|
136
147
|
a, b = item
|
137
|
-
|
148
|
+
|
138
149
|
if a.nil? || b && (a < b)
|
139
150
|
input_a_args << input_b.size - index - 1
|
140
151
|
elsif b.nil? || a && (a > b)
|
@@ -142,7 +153,7 @@ module TensorStream
|
|
142
153
|
end
|
143
154
|
end
|
144
155
|
|
145
|
-
|
156
|
+
[input_a_args.reverse, input_b_args.reverse]
|
146
157
|
end
|
147
158
|
|
148
159
|
##
|
@@ -0,0 +1,8 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
2
|
+
__kernel void acos_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
|
+
// Get the index of the current element to be processed
|
4
|
+
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
5
|
+
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
6
|
+
|
7
|
+
C[globalRow * N + globalCol] = acos(A[globalRow * N + globalCol]);
|
8
|
+
}
|
@@ -0,0 +1,9 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
2
|
+
// same dimension apply gradient op: subtracts A scaled by the scalar B[0] from C in place
|
3
|
+
__kernel void apply_gradient_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
4
|
+
// Get the index of the current element to be processed
|
5
|
+
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
6
|
+
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
7
|
+
|
8
|
+
C[globalRow * N + globalCol] -= A[globalRow * N + globalCol] * B[0];
|
9
|
+
}
|
@@ -0,0 +1,9 @@
|
|
1
|
+
|
2
|
+
% c_dtype = dtype_to_c_type(dtype)
|
3
|
+
__kernel void asin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
4
|
+
// Get the index of the current element to be processed
|
5
|
+
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
6
|
+
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
7
|
+
|
8
|
+
C[globalRow * N + globalCol] = asin(A[globalRow * N + globalCol]);
|
9
|
+
}
|
@@ -0,0 +1,26 @@
|
|
1
|
+
// First naive implementation
|
2
|
+
% c_dtype = dtype_to_c_type(dtype)
|
3
|
+
__kernel void log_softmax_<%= dtype %>(const int N,
|
4
|
+
const __global <%= c_dtype %>* A,
|
5
|
+
__global <%= c_dtype %>* C) {
|
6
|
+
|
7
|
+
// Get the index of the current element to be processed
|
8
|
+
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
9
|
+
|
10
|
+
// Compute a single element (loop over K)
|
11
|
+
<%= c_dtype %> acc = 0.0f;
|
12
|
+
<%= c_dtype %> max = <%= min_value_for(dtype) %>;
|
13
|
+
|
14
|
+
for (int k=0; k<N; k++) {
|
15
|
+
max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
|
16
|
+
}
|
17
|
+
|
18
|
+
for (int k=0; k<N; k++) {
|
19
|
+
acc += exp(A[globalRow*N + k] - max);
|
20
|
+
}
|
21
|
+
|
22
|
+
// Store the result
|
23
|
+
for (int k=0; k < N; k++) {
|
24
|
+
C[globalRow*N + k] = (A[globalRow*N + k] - max) - log(acc);
|
25
|
+
}
|
26
|
+
}
|
@@ -5,7 +5,7 @@
|
|
5
5
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
6
6
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
7
7
|
|
8
|
-
C[globalRow * N + globalCol] = A[globalRow * N + globalCol]
|
8
|
+
C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
|
9
9
|
}
|
10
10
|
|
11
11
|
// 1D + Scalar floating point add op
|
@@ -15,9 +15,9 @@
|
|
15
15
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
16
16
|
|
17
17
|
if (switch_op == 0) {
|
18
|
-
C[globalRow * N + globalCol] = A[globalRow * N + globalCol]
|
18
|
+
C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[0] ? A[globalRow * N + globalCol] : B[0];
|
19
19
|
} else {
|
20
|
-
C[globalRow * N + globalCol] = B[0]
|
20
|
+
C[globalRow * N + globalCol] = B[0] >= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
|
21
21
|
}
|
22
22
|
}
|
23
23
|
|
@@ -39,8 +39,8 @@
|
|
39
39
|
}
|
40
40
|
|
41
41
|
if (switch_op == 0) {
|
42
|
-
C[globalRow * N + globalCol] = A[globalRow * N + globalCol]
|
42
|
+
C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
|
43
43
|
} else {
|
44
|
-
C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index]
|
44
|
+
C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] >= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
|
45
45
|
}
|
46
46
|
}
|