tensor_stream 0.6.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +10 -0
  3. data/CHANGELOG.md +8 -0
  4. data/README.md +40 -1
  5. data/benchmark/benchmark.rb +4 -1
  6. data/lib/tensor_stream.rb +5 -0
  7. data/lib/tensor_stream/debugging/debugging.rb +4 -2
  8. data/lib/tensor_stream/device.rb +2 -1
  9. data/lib/tensor_stream/evaluator/base_evaluator.rb +43 -32
  10. data/lib/tensor_stream/evaluator/evaluator.rb +0 -1
  11. data/lib/tensor_stream/evaluator/opencl/kernels/acos.cl +8 -0
  12. data/lib/tensor_stream/evaluator/opencl/kernels/apply_gradient.cl +9 -0
  13. data/lib/tensor_stream/evaluator/opencl/kernels/asin.cl +9 -0
  14. data/lib/tensor_stream/evaluator/opencl/kernels/floor_mod.cl +3 -0
  15. data/lib/tensor_stream/evaluator/opencl/kernels/log_softmax.cl +26 -0
  16. data/lib/tensor_stream/evaluator/opencl/kernels/max.cl +5 -5
  17. data/lib/tensor_stream/evaluator/opencl/kernels/min.cl +46 -0
  18. data/lib/tensor_stream/evaluator/opencl/kernels/real_div.cl +3 -0
  19. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl +27 -0
  20. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross_grad.cl +28 -0
  21. data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb +5 -6
  22. data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb +200 -265
  23. data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +4 -8
  24. data/lib/tensor_stream/evaluator/ruby_evaluator.rb +193 -122
  25. data/lib/tensor_stream/exceptions.rb +6 -0
  26. data/lib/tensor_stream/graph.rb +21 -6
  27. data/lib/tensor_stream/graph_builder.rb +67 -0
  28. data/lib/tensor_stream/graph_deserializers/protobuf.rb +271 -0
  29. data/lib/tensor_stream/graph_keys.rb +1 -0
  30. data/lib/tensor_stream/graph_serializers/pbtext.rb +11 -10
  31. data/lib/tensor_stream/helpers/op_helper.rb +7 -33
  32. data/lib/tensor_stream/helpers/string_helper.rb +16 -0
  33. data/lib/tensor_stream/math_gradients.rb +67 -44
  34. data/lib/tensor_stream/nn/nn_ops.rb +7 -1
  35. data/lib/tensor_stream/operation.rb +14 -27
  36. data/lib/tensor_stream/ops.rb +82 -29
  37. data/lib/tensor_stream/session.rb +4 -0
  38. data/lib/tensor_stream/tensor.rb +30 -12
  39. data/lib/tensor_stream/tensor_shape.rb +1 -1
  40. data/lib/tensor_stream/train/gradient_descent_optimizer.rb +37 -4
  41. data/lib/tensor_stream/train/saver.rb +46 -0
  42. data/lib/tensor_stream/train/utils.rb +37 -0
  43. data/lib/tensor_stream/trainer.rb +2 -0
  44. data/lib/tensor_stream/utils.rb +24 -14
  45. data/lib/tensor_stream/variable.rb +5 -11
  46. data/lib/tensor_stream/variable_scope.rb +15 -0
  47. data/lib/tensor_stream/version.rb +1 -1
  48. data/samples/iris.rb +8 -4
  49. data/samples/linear_regression.rb +1 -1
  50. data/samples/multigpu.rb +73 -0
  51. data/samples/nearest_neighbor.rb +3 -3
  52. data/tensor_stream.gemspec +1 -1
  53. data/test_samples/raw_neural_net_sample.rb +4 -1
  54. metadata +21 -6
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f4f9e6d7a1640d4b24b92d7706bcab88ab5bb294551f38dc6ad403eb1b2a761b
4
- data.tar.gz: 71c245e394be382e3976f4b2b9989c2b89538f22cc40984a5a84e07fb0bca597
3
+ metadata.gz: d42a81e850271f080d408c52f2bea15a07c6d41ee3c6790dc04e48f2ab485364
4
+ data.tar.gz: a4aedfd3c9a532f31ea195c58124644fcae143726d37daa7a4a6afc6b39f439b
5
5
  SHA512:
6
- metadata.gz: a04ca2c61064dc4fb67f2653acbde962f8f8b5f56a5e5f27760f44f6032b81c61307e3b207bbb4ec42bd1713d06315ba760a710a712fb9f124efa6c7f06a7246
7
- data.tar.gz: ae8e29f412c1b63b854604696df4858ce4d3fb078c982e53a0b85fd8ff6b546c2b9502ecb4d253cb6d0f9255aa1e16b1aaa445da769a0b06bb85ed55a0aebd55
6
+ metadata.gz: 7d9fff1a8af14878c50469cfcee7942d2800906fe388504261290958628861b29d973ea65a3cf986cd1657acf490d2bc7164ec9ec2c14dc7c5d6c25121c6737f
7
+ data.tar.gz: 42ef6af8fafd1a7f7f069e03f8c344bec87bd15217def07d859d4b33374a038e1b7ed54ac685b901d3ed9b51fd351300cc61553ca45f0591fc4a7c5e50bcee53
data/.rubocop.yml CHANGED
@@ -6,6 +6,12 @@ AllCops:
6
6
  - tensor_stream.gemspec
7
7
  - Rakefile
8
8
 
9
+ Style/StringLiterals:
10
+ Enabled: false
11
+
12
+ Layout/TrailingBlankLines:
13
+ Enabled: false
14
+
9
15
  Metrics/LineLength:
10
16
  Max: 200
11
17
 
@@ -21,6 +27,10 @@ Metrics/MethodLength:
21
27
  Metrics/CyclomaticComplexity:
22
28
  Enabled: false
23
29
 
30
+ Metrics/BlockLength:
31
+ Exclude:
32
+ - lib/tensor_stream/math_gradients.rb
33
+
24
34
  Naming/AccessorMethodName:
25
35
  Exclude:
26
36
  - lib/tensor_stream.rb
data/CHANGELOG.md CHANGED
@@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file.
4
4
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
5
5
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6
6
 
7
+ ## [0.7.0] - 2018-08-08
8
+ ### Added
9
+ - [NEW OP] expand_dims, min, acos, asin, add_n
10
+ - Add parse_from_string support. Parse tensorflow pbtext files into tensor_stream
11
+
12
+ ### Fixes
13
+ - Tweaks to GradientDescentOptimizer to expose additional methods based on tensorflow
14
+
7
15
  ## [0.6.0] - 2018-07-21
8
16
  ### Added
9
17
  - [NEW OP] fill, floor_div, dynamic_stitch, mod, range, size, squared_difference
data/README.md CHANGED
@@ -15,6 +15,7 @@ The goal of this gem is to have a high performance machine learning and compute
15
15
  - Provision to use your own opcode evaluator (opencl, sciruby and tensorflow backends planned)
16
16
  - Goal is to be as close to TensorFlow in behavior but with some freedom to add ruby specific enhancements (with lots of test cases)
17
17
  - eager execution (experimental)
18
+ - (08-08-2018) Load pbtext files from tensorflow (Graph.parse_from_string)
18
19
 
19
20
  Since this is a pure ruby implementation for now, performance is not there yet. However it should be a good enough environment to learn about tensorflow and experiment with some models.
20
21
 
@@ -69,7 +70,7 @@ b = tf.variable(rand, name: "bias")
69
70
  pred = X * W + b
70
71
 
71
72
  # Mean squared error
72
- cost = tf.reduce_sum(tf.pow(pred - Y, 2)) / ( 2 * n_samples)
73
+ cost = ((pred - Y) ** 2).reduce(:+) / ( 2 * n_samples)
73
74
 
74
75
  optimizer = TensorStream::Train::GradientDescentOptimizer.new(learning_rate).minimize(cost)
75
76
 
@@ -255,6 +256,44 @@ Note that the OpenCL evaluator provides speedup if you are using large tensors,
255
256
 
256
257
  samples/nearest_neighbor.rb contains a sample that uses opencl.
257
258
 
259
+ ## Export Import Models from tensorflow
260
+
261
+ Experimental support is provided for parsing and exporting pbtext files:
262
+
263
+ Exporting
264
+
265
+ ```ruby
266
+ a = ts.constant([1.0, 1.0])
267
+ b = ts.constant([1.5, 1.5])
268
+ f = a + b
269
+
270
+ File.write('my_model.pbtext', f.graph.as_graph_def)
271
+ ```
272
+
273
+ Importing (Experimental)
274
+
275
+ Note that not all tensorflow ops are supported; a warning will be shown
276
+ if a certain operation is not supported yet.
277
+
278
+
279
+ ```ruby
280
+ pbtext = File.read(File.join('linear_regression.pbtxt'))
281
+
282
+ # create a graph from pbtext file
283
+ graph = TensorStream::Graph.parse_from_string(pbtext)
284
+
285
+ # reference a tensor by name from the created graph,
286
+ # for example you have a tensor named out
287
+ tensor = graph.get_tensor_by_name("out")
288
+
289
+ # set graph as default and do operations on it
290
+ graph.as_default do
291
+ sess = ts.session
292
+ expect(tr(sess.run(tensor))).to eq([[1.0, 1.0], [1.0, 1.0]])
293
+ end
294
+
295
+ ```
296
+
258
297
  # Visualization
259
298
 
260
299
  tensorstream does not support tensorboard yet, but a graphml generator is included:
@@ -8,7 +8,7 @@ require 'tensor_stream/evaluator/opencl/opencl_evaluator'
8
8
  def tr(t, places = 1)
9
9
  if t.is_a?(Array)
10
10
  return t.collect do |v|
11
- tr(v)
11
+ tr(v, places)
12
12
  end
13
13
  end
14
14
 
@@ -59,6 +59,7 @@ pow_i = tf.pow(a_int, 3)
59
59
  matmul = tf.matmul(a, b)
60
60
  out_of_order = tf.matmul(a, b) + tf.matmul(a, c)
61
61
  softmax = tf.nn.softmax(a)
62
+ add_n = tf.add_n([a,b,c,d])
62
63
 
63
64
  puts TensorStream::Evaluator.default_evaluators
64
65
 
@@ -68,6 +69,8 @@ puts `cat /proc/cpuinfo | grep "model name" | head -1`
68
69
  device = TensorStream::Evaluator::OpenclEvaluator.default_device.native_device
69
70
  puts "OpenCL device #{device.platform.to_s} #{device.name}"
70
71
  Benchmark.bmbm do |x|
72
+ x.report("pure ruby add_n :") { 100.times do sess.run(add_n) end }
73
+ x.report("opencl ruby add_n :") { 100.times do sess2.run(add_n) end }
71
74
  x.report("pure ruby ooo matmul :") { 100.times do sess.run(out_of_order) end }
72
75
  x.report("opencl ooo matmul :") { 100.times do sess2.run(out_of_order) end }
73
76
  x.report("pure ruby softmax :") { 100.times do sess.run(softmax) end }
data/lib/tensor_stream.rb CHANGED
@@ -2,25 +2,30 @@ require 'tensor_stream/version'
2
2
  require 'deep_merge'
3
3
  require 'matrix'
4
4
  require 'concurrent'
5
+ require 'tensor_stream/exceptions'
5
6
  require 'tensor_stream/helpers/op_helper'
6
7
  require 'tensor_stream/helpers/string_helper'
7
8
  require 'tensor_stream/initializer'
8
9
  require 'tensor_stream/graph_keys'
9
10
  require 'tensor_stream/types'
11
+ require 'tensor_stream/graph_builder'
10
12
  require 'tensor_stream/graph'
11
13
  require 'tensor_stream/device'
12
14
  require 'tensor_stream/session'
13
15
  require 'tensor_stream/tensor_shape'
14
16
  require 'tensor_stream/tensor'
15
17
  require 'tensor_stream/variable'
18
+ require 'tensor_stream/variable_scope'
16
19
  require 'tensor_stream/operation'
17
20
  require 'tensor_stream/placeholder'
18
21
  require 'tensor_stream/control_flow'
19
22
  require 'tensor_stream/dynamic_stitch'
23
+ require 'tensor_stream/train/utils'
20
24
  require 'tensor_stream/trainer'
21
25
  require 'tensor_stream/nn/nn_ops'
22
26
  require 'tensor_stream/evaluator/evaluator'
23
27
  require 'tensor_stream/graph_serializers/serializer'
28
+ require 'tensor_stream/graph_deserializers/protobuf'
24
29
  require 'tensor_stream/graph_serializers/pbtext'
25
30
  require 'tensor_stream/graph_serializers/graphml'
26
31
  require 'tensor_stream/math_gradients'
@@ -4,10 +4,12 @@ module TensorStream
4
4
 
5
5
  def add_check_numerics_ops
6
6
  graph = TensorStream.get_default_graph
7
- nodes_to_process = graph.nodes.values.select { |node| node.is_a?(Operation) }
7
+ nodes_to_process = graph.nodes.values.select { |node| node.is_a?(Operation) }
8
8
 
9
9
  nodes_to_process.each do |node|
10
- node.inputs = node.inputs.compact.collect do |input|
10
+ node.inputs = node.inputs.collect do |input|
11
+ next if input.nil?
12
+
11
13
  if TensorStream::Ops::FLOATING_POINT_TYPES.include?(input.data_type)
12
14
  TensorStream.check_numerics(input, "#{node.name}/#{input.name}", name: "check/#{node.name}/#{input.name}" )
13
15
  else
@@ -1,5 +1,6 @@
1
1
  # A tensorstream device
2
2
  module TensorStream
3
+ # Class that describes a supported device
3
4
  class Device
4
5
  attr_accessor :name, :type, :evaluator
5
6
  def initialize(name, type, evaluator)
@@ -8,4 +9,4 @@ module TensorStream
8
9
  @evaluator = evaluator
9
10
  end
10
11
  end
11
- end
12
+ end
@@ -1,5 +1,13 @@
1
1
  module TensorStream
2
+ # Evaluator base module
2
3
  module Evaluator
4
+ class OutputGroup
5
+ attr_accessor :outputs
6
+ def initialize(outputs = [])
7
+ @outputs = outputs
8
+ end
9
+ end
10
+
3
11
  class UnsupportedOp < Exception
4
12
  def initialize(tensor)
5
13
  @tensor = tensor
@@ -10,31 +18,36 @@ module TensorStream
10
18
  end
11
19
  end
12
20
 
21
+ # Evaluator base class
13
22
  class BaseEvaluator
14
- def initialize(session, device, thread_pool: nil, log_intermediates: false)
23
+ def initialize(session, _device, thread_pool: nil, log_intermediates: false)
15
24
  @session = session
16
25
  @log_intermediates = log_intermediates
17
26
  @thread_pool = thread_pool || Concurrent::ImmediateExecutor.new
18
27
  @context[:compute_history] = [] if log_intermediates
19
28
  end
20
29
 
30
+ ##
31
+ # Query all supported devices
21
32
  def self.query_supported_devices
22
- [Device.new("cpu", :cpu, self)]
33
+ [Device.new('cpu', :cpu, self)]
23
34
  end
24
35
 
25
36
  ##
26
37
  # Select the best device available in the system for this evaluator
27
38
  def self.default_device
28
- Device.new("cpu", :cpu, self)
39
+ Device.new('cpu', :cpu, self)
29
40
  end
30
41
 
31
42
  ##
32
43
  # Selects the best device with the specified query, query can
33
44
  # be evaluator specific
34
- def self.fetch_device(query = [])
35
- Device.new("cpu", :cpu, self)
45
+ def self.fetch_device(_query = [])
46
+ Device.new('cpu', :cpu, self)
36
47
  end
37
48
 
49
+ ##
50
+ # Select device using uri
38
51
  def self.query_device(query)
39
52
  return default_device if query.nil? || query == :default
40
53
 
@@ -52,8 +65,8 @@ module TensorStream
52
65
 
53
66
  select_index = [devices.size - 1, select_index].min
54
67
  return devices[select_index]
55
- elsif components[0] == 'cpu'
56
- device_type = :cpu
68
+ elsif %w[cpu gpu].include?(components[0])
69
+ device_type = components[0].to_sym
57
70
  select_index = components[1].to_i
58
71
 
59
72
  devices = all_devices.select { |d| d.type == device_type.downcase.to_sym }
@@ -91,34 +104,32 @@ module TensorStream
91
104
 
92
105
  def invoke(tensor, execution_context)
93
106
  return eval_tensor(tensor, execution_context) unless tensor.is_a?(Operation)
107
+ raise UnsupportedOp.new(tensor), "op #{tensor.operation} is not yet supported" unless self.class.ops.key?(tensor.operation.to_sym)
94
108
 
95
- if self.class.ops.key?(tensor.operation.to_sym)
96
- op = self.class.ops[tensor.operation.to_sym]
109
+ op = self.class.ops[tensor.operation.to_sym]
110
+ op_options = op[:options]
97
111
 
98
- op_options = op[:options]
99
- resolved_inputs = tensor.inputs.map do |i|
100
- next if i.nil?
112
+ resolved_inputs = tensor.inputs.map do |i|
113
+ next if i.nil?
101
114
 
102
- if i.is_a?(Array)
103
- next i.collect { |sub_item| sub_item.is_a?(Tensor) ? invoke(sub_item, execution_context) : sub_item }
104
- end
115
+ if i.is_a?(Array)
116
+ next i.collect { |sub_item| sub_item.is_a?(Tensor) ? invoke(sub_item, execution_context) : sub_item }
117
+ end
105
118
 
106
- if !op_options[:noop] && @context[:_cache][:placement][tensor.name] != @context[:_cache][:placement][i.name] # tensor is on another device or evaluator
107
- cache_key = "#{tensor.graph.object_id}_#{i.name}:#{object_id}"
108
- next @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
119
+ if !op_options[:noop] && @context[:_cache][:placement][tensor.name] != @context[:_cache][:placement][i.name] # tensor is on another device or evaluator
120
+ cache_key = "#{tensor.graph.object_id}_#{i.name}:#{object_id}"
121
+ next @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
109
122
 
110
- result = @session.delegate_to_evaluator(i, @context, execution_context)
111
- convert_from_buffer(i, result).tap do |buffer|
112
- @context[:_cache][cache_key] = buffer if i.is_const
113
- end
114
- else
115
- prepare_input(i, execution_context, op_options)
123
+ result = @session.delegate_to_evaluator(i, @context, execution_context)
124
+ convert_from_buffer(i, result).tap do |buffer|
125
+ @context[:_cache][cache_key] = buffer if i.is_const
116
126
  end
127
+ else
128
+ prepare_input(i, execution_context, op_options)
117
129
  end
118
- instance_exec(execution_context, tensor, resolved_inputs, &op[:block])
119
- else
120
- raise UnsupportedOp.new(tensor)
121
130
  end
131
+
132
+ instance_exec(execution_context, tensor, resolved_inputs, &op[:block])
122
133
  end
123
134
 
124
135
  protected
@@ -128,13 +139,13 @@ module TensorStream
128
139
 
129
140
  input_a_args = []
130
141
  input_b_args = []
131
-
132
- input_a = input_b.size.times.map { |i| i < input_a.size ? input_a[i] : nil }.reverse if input_a.size < input_b.size
133
- input_b = input_a.size.times.map { |i| i < input_b.size ? input_b[i] : nil }.reverse if input_a.size > input_b.size
142
+
143
+ input_a = Array.new(input_b.size) { |i| i < input_a.size ? input_a[i] : nil }.reverse if input_a.size < input_b.size
144
+ input_b = Array.new(input_a.size) { |i| i < input_b.size ? input_b[i] : nil }.reverse if input_a.size > input_b.size
134
145
 
135
146
  input_a.reverse.zip(input_b.reverse).each_with_index do |item, index|
136
147
  a, b = item
137
-
148
+
138
149
  if a.nil? || b && (a < b)
139
150
  input_a_args << input_b.size - index - 1
140
151
  elsif b.nil? || a && (a > b)
@@ -142,7 +153,7 @@ module TensorStream
142
153
  end
143
154
  end
144
155
 
145
- [input_a_args.reverse, input_b_args.reverse]
156
+ [input_a_args.reverse, input_b_args.reverse]
146
157
  end
147
158
 
148
159
  ##
@@ -1,4 +1,3 @@
1
-
2
1
  require 'tensor_stream/evaluator/ruby_evaluator'
3
2
  require 'tensor_stream/evaluator/buffer'
4
3
 
@@ -0,0 +1,8 @@
1
+ % c_dtype = dtype_to_c_type(dtype)
2
+ __kernel void acos_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
+ // Get the index of the current element to be processed
4
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
6
+
7
+ C[globalRow * N + globalCol] = acos(A[globalRow * N + globalCol]);
8
+ }
@@ -0,0 +1,9 @@
1
+ % c_dtype = dtype_to_c_type(dtype)
2
+ // same dimension add floating point op
3
+ __kernel void apply_gradient_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
4
+ // Get the index of the current element to be processed
5
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
+
8
+ C[globalRow * N + globalCol] -= A[globalRow * N + globalCol] * B[0];
9
+ }
@@ -0,0 +1,9 @@
1
+
2
+ % c_dtype = dtype_to_c_type(dtype)
3
+ __kernel void asin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
+ // Get the index of the current element to be processed
5
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
+
8
+ C[globalRow * N + globalCol] = asin(A[globalRow * N + globalCol]);
9
+ }
@@ -0,0 +1,3 @@
1
+ % c_dtype = dtype_to_c_type(dtype)
2
+ % op = operator_to_c('mod')
3
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'floor_mod', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -0,0 +1,26 @@
1
+ // First naive implementation
2
+ % c_dtype = dtype_to_c_type(dtype)
3
+ __kernel void log_softmax_<%= dtype %>(const int N,
4
+ const __global <%= c_dtype %>* A,
5
+ __global <%= c_dtype %>* C) {
6
+
7
+ // Get the index of the current element to be processed
8
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
9
+
10
+ // Compute a single element (loop over K)
11
+ <%= c_dtype %> acc = 0.0f;
12
+ <%= c_dtype %> max = <%= min_value_for(dtype) %>;
13
+
14
+ for (int k=0; k<N; k++) {
15
+ max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
16
+ }
17
+
18
+ for (int k=0; k<N; k++) {
19
+ acc += exp(A[globalRow*N + k] - max);
20
+ }
21
+
22
+ // Store the result
23
+ for (int k=0; k < N; k++) {
24
+ C[globalRow*N + k] = (A[globalRow*N + k] - max) - log(acc);
25
+ }
26
+ }
@@ -5,7 +5,7 @@
5
5
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
6
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
7
 
8
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] > B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
8
+ C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
9
9
  }
10
10
 
11
11
  // 1D + Scalar floating point add op
@@ -15,9 +15,9 @@
15
15
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
16
16
 
17
17
  if (switch_op == 0) {
18
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] > B[0] ? A[globalRow * N + globalCol] : B[0];
18
+ C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[0] ? A[globalRow * N + globalCol] : B[0];
19
19
  } else {
20
- C[globalRow * N + globalCol] = B[0] > A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
20
+ C[globalRow * N + globalCol] = B[0] >= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
21
21
  }
22
22
  }
23
23
 
@@ -39,8 +39,8 @@
39
39
  }
40
40
 
41
41
  if (switch_op == 0) {
42
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] > B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
42
+ C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
43
43
  } else {
44
- C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] > A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
44
+ C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] >= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
45
45
  }
46
46
  }