tensor_stream 0.6.1 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +10 -0
  3. data/CHANGELOG.md +8 -0
  4. data/README.md +40 -1
  5. data/benchmark/benchmark.rb +4 -1
  6. data/lib/tensor_stream.rb +5 -0
  7. data/lib/tensor_stream/debugging/debugging.rb +4 -2
  8. data/lib/tensor_stream/device.rb +2 -1
  9. data/lib/tensor_stream/evaluator/base_evaluator.rb +43 -32
  10. data/lib/tensor_stream/evaluator/evaluator.rb +0 -1
  11. data/lib/tensor_stream/evaluator/opencl/kernels/acos.cl +8 -0
  12. data/lib/tensor_stream/evaluator/opencl/kernels/apply_gradient.cl +9 -0
  13. data/lib/tensor_stream/evaluator/opencl/kernels/asin.cl +9 -0
  14. data/lib/tensor_stream/evaluator/opencl/kernels/floor_mod.cl +3 -0
  15. data/lib/tensor_stream/evaluator/opencl/kernels/log_softmax.cl +26 -0
  16. data/lib/tensor_stream/evaluator/opencl/kernels/max.cl +5 -5
  17. data/lib/tensor_stream/evaluator/opencl/kernels/min.cl +46 -0
  18. data/lib/tensor_stream/evaluator/opencl/kernels/real_div.cl +3 -0
  19. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl +27 -0
  20. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross_grad.cl +28 -0
  21. data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb +5 -6
  22. data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb +200 -265
  23. data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +4 -8
  24. data/lib/tensor_stream/evaluator/ruby_evaluator.rb +193 -122
  25. data/lib/tensor_stream/exceptions.rb +6 -0
  26. data/lib/tensor_stream/graph.rb +21 -6
  27. data/lib/tensor_stream/graph_builder.rb +67 -0
  28. data/lib/tensor_stream/graph_deserializers/protobuf.rb +271 -0
  29. data/lib/tensor_stream/graph_keys.rb +1 -0
  30. data/lib/tensor_stream/graph_serializers/pbtext.rb +11 -10
  31. data/lib/tensor_stream/helpers/op_helper.rb +7 -33
  32. data/lib/tensor_stream/helpers/string_helper.rb +16 -0
  33. data/lib/tensor_stream/math_gradients.rb +67 -44
  34. data/lib/tensor_stream/nn/nn_ops.rb +7 -1
  35. data/lib/tensor_stream/operation.rb +14 -27
  36. data/lib/tensor_stream/ops.rb +82 -29
  37. data/lib/tensor_stream/session.rb +4 -0
  38. data/lib/tensor_stream/tensor.rb +30 -12
  39. data/lib/tensor_stream/tensor_shape.rb +1 -1
  40. data/lib/tensor_stream/train/gradient_descent_optimizer.rb +37 -4
  41. data/lib/tensor_stream/train/saver.rb +46 -0
  42. data/lib/tensor_stream/train/utils.rb +37 -0
  43. data/lib/tensor_stream/trainer.rb +2 -0
  44. data/lib/tensor_stream/utils.rb +24 -14
  45. data/lib/tensor_stream/variable.rb +5 -11
  46. data/lib/tensor_stream/variable_scope.rb +15 -0
  47. data/lib/tensor_stream/version.rb +1 -1
  48. data/samples/iris.rb +8 -4
  49. data/samples/linear_regression.rb +1 -1
  50. data/samples/multigpu.rb +73 -0
  51. data/samples/nearest_neighbor.rb +3 -3
  52. data/tensor_stream.gemspec +1 -1
  53. data/test_samples/raw_neural_net_sample.rb +4 -1
  54. metadata +21 -6
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f4f9e6d7a1640d4b24b92d7706bcab88ab5bb294551f38dc6ad403eb1b2a761b
4
- data.tar.gz: 71c245e394be382e3976f4b2b9989c2b89538f22cc40984a5a84e07fb0bca597
3
+ metadata.gz: d42a81e850271f080d408c52f2bea15a07c6d41ee3c6790dc04e48f2ab485364
4
+ data.tar.gz: a4aedfd3c9a532f31ea195c58124644fcae143726d37daa7a4a6afc6b39f439b
5
5
  SHA512:
6
- metadata.gz: a04ca2c61064dc4fb67f2653acbde962f8f8b5f56a5e5f27760f44f6032b81c61307e3b207bbb4ec42bd1713d06315ba760a710a712fb9f124efa6c7f06a7246
7
- data.tar.gz: ae8e29f412c1b63b854604696df4858ce4d3fb078c982e53a0b85fd8ff6b546c2b9502ecb4d253cb6d0f9255aa1e16b1aaa445da769a0b06bb85ed55a0aebd55
6
+ metadata.gz: 7d9fff1a8af14878c50469cfcee7942d2800906fe388504261290958628861b29d973ea65a3cf986cd1657acf490d2bc7164ec9ec2c14dc7c5d6c25121c6737f
7
+ data.tar.gz: 42ef6af8fafd1a7f7f069e03f8c344bec87bd15217def07d859d4b33374a038e1b7ed54ac685b901d3ed9b51fd351300cc61553ca45f0591fc4a7c5e50bcee53
data/.rubocop.yml CHANGED
@@ -6,6 +6,12 @@ AllCops:
6
6
  - tensor_stream.gemspec
7
7
  - Rakefile
8
8
 
9
+ Style/StringLiterals:
10
+ Enabled: false
11
+
12
+ Layout/TrailingBlankLines:
13
+ Enabled: false
14
+
9
15
  Metrics/LineLength:
10
16
  Max: 200
11
17
 
@@ -21,6 +27,10 @@ Metrics/MethodLength:
21
27
  Metrics/CyclomaticComplexity:
22
28
  Enabled: false
23
29
 
30
+ Metrics/BlockLength:
31
+ Exclude:
32
+ - lib/tensor_stream/math_gradients.rb
33
+
24
34
  Naming/AccessorMethodName:
25
35
  Exclude:
26
36
  - lib/tensor_stream.rb
data/CHANGELOG.md CHANGED
@@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file.
4
4
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
5
5
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6
6
 
7
+ ## [0.7.0] - 2018-08-08
8
+ ### Added
9
+ - [NEW OP] expand_dims, min, acos, asin, add_n
10
+ - Add parse_from_string support. Parse tensorflow pbtext files into tensor_stream
11
+
12
+ ### Fixes
13
+ - Tweaks to GradientDescentOptimizer to expose additional methods based on tensorflow
14
+
7
15
  ## [0.6.0] - 2018-07-21
8
16
  ### Added
9
17
  - [NEW OP] fill, floor_div, dynamic_stitch, mod, range, size, squared_difference
data/README.md CHANGED
@@ -15,6 +15,7 @@ The goal of this gem is to have a high performance machine learning and compute
15
15
  - Provision to use your own opcode evaluator (opencl, sciruby and tensorflow backends planned)
16
16
  - Goal is to be as close to TensorFlow in behavior but with some freedom to add ruby specific enhancements (with lots of test cases)
17
17
  - eager execution (experimental)
18
+ - (08-08-2018) Load pbtext files from tensorflow (Graph.parse_from_string)
18
19
 
19
20
  Since this is a pure ruby implementation for now, performance is not there yet. However it should be a good enough environment to learn about tensorflow and experiment with some models.
20
21
 
@@ -69,7 +70,7 @@ b = tf.variable(rand, name: "bias")
69
70
  pred = X * W + b
70
71
 
71
72
  # Mean squared error
72
- cost = tf.reduce_sum(tf.pow(pred - Y, 2)) / ( 2 * n_samples)
73
+ cost = ((pred - Y) ** 2).reduce(:+) / ( 2 * n_samples)
73
74
 
74
75
  optimizer = TensorStream::Train::GradientDescentOptimizer.new(learning_rate).minimize(cost)
75
76
 
@@ -255,6 +256,44 @@ Note that the OpenCL evaluator provides speedup if you are using large tensors,
255
256
 
256
257
  samples/nearest_neighbor.rb contains a sample that uses opencl.
257
258
 
259
+ ## Exporting and Importing Models from TensorFlow
260
+
261
+ Experimental support for parsing and exporting pbtext files is available:
262
+
263
+ Exporting
264
+
265
+ ```ruby
266
+ a = ts.constant([1.0, 1.0])
267
+ b = ts.constant([1.5, 1.5])
268
+ f = a + b
269
+
270
+ File.write('my_model.pbtext', f.graph.as_graph_def)
271
+ ```
272
+
273
+ Importing (Experimental)
274
+
275
+ Note that not all tensorflow ops are supported; warnings will be shown
276
+ if a certain operation is not supported yet.
277
+
278
+
279
+ ```ruby
280
+ pbtext = File.read(File.join('linear_regression.pbtxt'))
281
+
282
+ # create a graph from pbtext file
283
+ graph = TensorStream::Graph.parse_from_string(pbtext)
284
+
285
+ # reference a tensor by name from the created graph,
286
+ # for example you have a tensor named out
287
+ tensor = graph.get_tensor_by_name("out")
288
+
289
+ # set graph as default and do operations on it
290
+ graph.as_default do
291
+ sess = ts.session
292
+ expect(tr(sess.run(tensor))).to eq([[1.0, 1.0], [1.0, 1.0]])
293
+ end
294
+
295
+ ```
296
+
258
297
  # Visualization
259
298
 
260
299
  tensorstream does not support tensorboard yet, but a graphml generator is included:
@@ -8,7 +8,7 @@ require 'tensor_stream/evaluator/opencl/opencl_evaluator'
8
8
  def tr(t, places = 1)
9
9
  if t.is_a?(Array)
10
10
  return t.collect do |v|
11
- tr(v)
11
+ tr(v, places)
12
12
  end
13
13
  end
14
14
 
@@ -59,6 +59,7 @@ pow_i = tf.pow(a_int, 3)
59
59
  matmul = tf.matmul(a, b)
60
60
  out_of_order = tf.matmul(a, b) + tf.matmul(a, c)
61
61
  softmax = tf.nn.softmax(a)
62
+ add_n = tf.add_n([a,b,c,d])
62
63
 
63
64
  puts TensorStream::Evaluator.default_evaluators
64
65
 
@@ -68,6 +69,8 @@ puts `cat /proc/cpuinfo | grep "model name" | head -1`
68
69
  device = TensorStream::Evaluator::OpenclEvaluator.default_device.native_device
69
70
  puts "OpenCL device #{device.platform.to_s} #{device.name}"
70
71
  Benchmark.bmbm do |x|
72
+ x.report("pure ruby add_n :") { 100.times do sess.run(add_n) end }
73
+ x.report("opencl ruby add_n :") { 100.times do sess2.run(add_n) end }
71
74
  x.report("pure ruby ooo matmul :") { 100.times do sess.run(out_of_order) end }
72
75
  x.report("opencl ooo matmul :") { 100.times do sess2.run(out_of_order) end }
73
76
  x.report("pure ruby softmax :") { 100.times do sess.run(softmax) end }
data/lib/tensor_stream.rb CHANGED
@@ -2,25 +2,30 @@ require 'tensor_stream/version'
2
2
  require 'deep_merge'
3
3
  require 'matrix'
4
4
  require 'concurrent'
5
+ require 'tensor_stream/exceptions'
5
6
  require 'tensor_stream/helpers/op_helper'
6
7
  require 'tensor_stream/helpers/string_helper'
7
8
  require 'tensor_stream/initializer'
8
9
  require 'tensor_stream/graph_keys'
9
10
  require 'tensor_stream/types'
11
+ require 'tensor_stream/graph_builder'
10
12
  require 'tensor_stream/graph'
11
13
  require 'tensor_stream/device'
12
14
  require 'tensor_stream/session'
13
15
  require 'tensor_stream/tensor_shape'
14
16
  require 'tensor_stream/tensor'
15
17
  require 'tensor_stream/variable'
18
+ require 'tensor_stream/variable_scope'
16
19
  require 'tensor_stream/operation'
17
20
  require 'tensor_stream/placeholder'
18
21
  require 'tensor_stream/control_flow'
19
22
  require 'tensor_stream/dynamic_stitch'
23
+ require 'tensor_stream/train/utils'
20
24
  require 'tensor_stream/trainer'
21
25
  require 'tensor_stream/nn/nn_ops'
22
26
  require 'tensor_stream/evaluator/evaluator'
23
27
  require 'tensor_stream/graph_serializers/serializer'
28
+ require 'tensor_stream/graph_deserializers/protobuf'
24
29
  require 'tensor_stream/graph_serializers/pbtext'
25
30
  require 'tensor_stream/graph_serializers/graphml'
26
31
  require 'tensor_stream/math_gradients'
@@ -4,10 +4,12 @@ module TensorStream
4
4
 
5
5
  def add_check_numerics_ops
6
6
  graph = TensorStream.get_default_graph
7
- nodes_to_process = graph.nodes.values.select { |node| node.is_a?(Operation) }
7
+ nodes_to_process = graph.nodes.values.select { |node| node.is_a?(Operation) }
8
8
 
9
9
  nodes_to_process.each do |node|
10
- node.inputs = node.inputs.compact.collect do |input|
10
+ node.inputs = node.inputs.collect do |input|
11
+ next if input.nil?
12
+
11
13
  if TensorStream::Ops::FLOATING_POINT_TYPES.include?(input.data_type)
12
14
  TensorStream.check_numerics(input, "#{node.name}/#{input.name}", name: "check/#{node.name}/#{input.name}" )
13
15
  else
@@ -1,5 +1,6 @@
1
1
  # A tensorstream device
2
2
  module TensorStream
3
+ # Class that describes a supported device
3
4
  class Device
4
5
  attr_accessor :name, :type, :evaluator
5
6
  def initialize(name, type, evaluator)
@@ -8,4 +9,4 @@ module TensorStream
8
9
  @evaluator = evaluator
9
10
  end
10
11
  end
11
- end
12
+ end
@@ -1,5 +1,13 @@
1
1
  module TensorStream
2
+ # Evaluator base module
2
3
  module Evaluator
4
+ class OutputGroup
5
+ attr_accessor :outputs
6
+ def initialize(outputs = [])
7
+ @outputs = outputs
8
+ end
9
+ end
10
+
3
11
  class UnsupportedOp < Exception
4
12
  def initialize(tensor)
5
13
  @tensor = tensor
@@ -10,31 +18,36 @@ module TensorStream
10
18
  end
11
19
  end
12
20
 
21
+ # Evaluator base class
13
22
  class BaseEvaluator
14
- def initialize(session, device, thread_pool: nil, log_intermediates: false)
23
+ def initialize(session, _device, thread_pool: nil, log_intermediates: false)
15
24
  @session = session
16
25
  @log_intermediates = log_intermediates
17
26
  @thread_pool = thread_pool || Concurrent::ImmediateExecutor.new
18
27
  @context[:compute_history] = [] if log_intermediates
19
28
  end
20
29
 
30
+ ##
31
+ # Query all supported devices
21
32
  def self.query_supported_devices
22
- [Device.new("cpu", :cpu, self)]
33
+ [Device.new('cpu', :cpu, self)]
23
34
  end
24
35
 
25
36
  ##
26
37
  # Select the best device available in the system for this evaluator
27
38
  def self.default_device
28
- Device.new("cpu", :cpu, self)
39
+ Device.new('cpu', :cpu, self)
29
40
  end
30
41
 
31
42
  ##
32
43
  # Selects the best device with the specified query, query can
33
44
  # be evaluator specific
34
- def self.fetch_device(query = [])
35
- Device.new("cpu", :cpu, self)
45
+ def self.fetch_device(_query = [])
46
+ Device.new('cpu', :cpu, self)
36
47
  end
37
48
 
49
+ ##
50
+ # Select device using uri
38
51
  def self.query_device(query)
39
52
  return default_device if query.nil? || query == :default
40
53
 
@@ -52,8 +65,8 @@ module TensorStream
52
65
 
53
66
  select_index = [devices.size - 1, select_index].min
54
67
  return devices[select_index]
55
- elsif components[0] == 'cpu'
56
- device_type = :cpu
68
+ elsif %w[cpu gpu].include?(components[0])
69
+ device_type = components[0].to_sym
57
70
  select_index = components[1].to_i
58
71
 
59
72
  devices = all_devices.select { |d| d.type == device_type.downcase.to_sym }
@@ -91,34 +104,32 @@ module TensorStream
91
104
 
92
105
  def invoke(tensor, execution_context)
93
106
  return eval_tensor(tensor, execution_context) unless tensor.is_a?(Operation)
107
+ raise UnsupportedOp.new(tensor), "op #{tensor.operation} is not yet supported" unless self.class.ops.key?(tensor.operation.to_sym)
94
108
 
95
- if self.class.ops.key?(tensor.operation.to_sym)
96
- op = self.class.ops[tensor.operation.to_sym]
109
+ op = self.class.ops[tensor.operation.to_sym]
110
+ op_options = op[:options]
97
111
 
98
- op_options = op[:options]
99
- resolved_inputs = tensor.inputs.map do |i|
100
- next if i.nil?
112
+ resolved_inputs = tensor.inputs.map do |i|
113
+ next if i.nil?
101
114
 
102
- if i.is_a?(Array)
103
- next i.collect { |sub_item| sub_item.is_a?(Tensor) ? invoke(sub_item, execution_context) : sub_item }
104
- end
115
+ if i.is_a?(Array)
116
+ next i.collect { |sub_item| sub_item.is_a?(Tensor) ? invoke(sub_item, execution_context) : sub_item }
117
+ end
105
118
 
106
- if !op_options[:noop] && @context[:_cache][:placement][tensor.name] != @context[:_cache][:placement][i.name] # tensor is on another device or evaluator
107
- cache_key = "#{tensor.graph.object_id}_#{i.name}:#{object_id}"
108
- next @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
119
+ if !op_options[:noop] && @context[:_cache][:placement][tensor.name] != @context[:_cache][:placement][i.name] # tensor is on another device or evaluator
120
+ cache_key = "#{tensor.graph.object_id}_#{i.name}:#{object_id}"
121
+ next @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
109
122
 
110
- result = @session.delegate_to_evaluator(i, @context, execution_context)
111
- convert_from_buffer(i, result).tap do |buffer|
112
- @context[:_cache][cache_key] = buffer if i.is_const
113
- end
114
- else
115
- prepare_input(i, execution_context, op_options)
123
+ result = @session.delegate_to_evaluator(i, @context, execution_context)
124
+ convert_from_buffer(i, result).tap do |buffer|
125
+ @context[:_cache][cache_key] = buffer if i.is_const
116
126
  end
127
+ else
128
+ prepare_input(i, execution_context, op_options)
117
129
  end
118
- instance_exec(execution_context, tensor, resolved_inputs, &op[:block])
119
- else
120
- raise UnsupportedOp.new(tensor)
121
130
  end
131
+
132
+ instance_exec(execution_context, tensor, resolved_inputs, &op[:block])
122
133
  end
123
134
 
124
135
  protected
@@ -128,13 +139,13 @@ module TensorStream
128
139
 
129
140
  input_a_args = []
130
141
  input_b_args = []
131
-
132
- input_a = input_b.size.times.map { |i| i < input_a.size ? input_a[i] : nil }.reverse if input_a.size < input_b.size
133
- input_b = input_a.size.times.map { |i| i < input_b.size ? input_b[i] : nil }.reverse if input_a.size > input_b.size
142
+
143
+ input_a = Array.new(input_b.size) { |i| i < input_a.size ? input_a[i] : nil }.reverse if input_a.size < input_b.size
144
+ input_b = Array.new(input_a.size) { |i| i < input_b.size ? input_b[i] : nil }.reverse if input_a.size > input_b.size
134
145
 
135
146
  input_a.reverse.zip(input_b.reverse).each_with_index do |item, index|
136
147
  a, b = item
137
-
148
+
138
149
  if a.nil? || b && (a < b)
139
150
  input_a_args << input_b.size - index - 1
140
151
  elsif b.nil? || a && (a > b)
@@ -142,7 +153,7 @@ module TensorStream
142
153
  end
143
154
  end
144
155
 
145
- [input_a_args.reverse, input_b_args.reverse]
156
+ [input_a_args.reverse, input_b_args.reverse]
146
157
  end
147
158
 
148
159
  ##
@@ -1,4 +1,3 @@
1
-
2
1
  require 'tensor_stream/evaluator/ruby_evaluator'
3
2
  require 'tensor_stream/evaluator/buffer'
4
3
 
@@ -0,0 +1,8 @@
1
+ % c_dtype = dtype_to_c_type(dtype)
2
+ __kernel void acos_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
+ // Get the index of the current element to be processed
4
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
6
+
7
+ C[globalRow * N + globalCol] = acos(A[globalRow * N + globalCol]);
8
+ }
@@ -0,0 +1,9 @@
1
+ % c_dtype = dtype_to_c_type(dtype)
2
+ // same dimension add floating point op
3
+ __kernel void apply_gradient_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
4
+ // Get the index of the current element to be processed
5
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
+
8
+ C[globalRow * N + globalCol] -= A[globalRow * N + globalCol] * B[0];
9
+ }
@@ -0,0 +1,9 @@
1
+
2
+ % c_dtype = dtype_to_c_type(dtype)
3
+ __kernel void asin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
+ // Get the index of the current element to be processed
5
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
+
8
+ C[globalRow * N + globalCol] = asin(A[globalRow * N + globalCol]);
9
+ }
@@ -0,0 +1,3 @@
1
+ % c_dtype = dtype_to_c_type(dtype)
2
+ % op = operator_to_c('mod')
3
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'floor_mod', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -0,0 +1,26 @@
1
+ // First naive implementation
2
+ % c_dtype = dtype_to_c_type(dtype)
3
+ __kernel void log_softmax_<%= dtype %>(const int N,
4
+ const __global <%= c_dtype %>* A,
5
+ __global <%= c_dtype %>* C) {
6
+
7
+ // Get the index of the current element to be processed
8
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
9
+
10
+ // Compute a single element (loop over K)
11
+ <%= c_dtype %> acc = 0.0f;
12
+ <%= c_dtype %> max = <%= min_value_for(dtype) %>;
13
+
14
+ for (int k=0; k<N; k++) {
15
+ max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
16
+ }
17
+
18
+ for (int k=0; k<N; k++) {
19
+ acc += exp(A[globalRow*N + k] - max);
20
+ }
21
+
22
+ // Store the result
23
+ for (int k=0; k < N; k++) {
24
+ C[globalRow*N + k] = (A[globalRow*N + k] - max) - log(acc);
25
+ }
26
+ }
@@ -5,7 +5,7 @@
5
5
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
6
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
7
 
8
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] > B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
8
+ C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
9
9
  }
10
10
 
11
11
  // 1D + Scalar floating point add op
@@ -15,9 +15,9 @@
15
15
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
16
16
 
17
17
  if (switch_op == 0) {
18
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] > B[0] ? A[globalRow * N + globalCol] : B[0];
18
+ C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[0] ? A[globalRow * N + globalCol] : B[0];
19
19
  } else {
20
- C[globalRow * N + globalCol] = B[0] > A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
20
+ C[globalRow * N + globalCol] = B[0] >= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
21
21
  }
22
22
  }
23
23
 
@@ -39,8 +39,8 @@
39
39
  }
40
40
 
41
41
  if (switch_op == 0) {
42
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] > B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
42
+ C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
43
43
  } else {
44
- C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] > A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
44
+ C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] >= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
45
45
  }
46
46
  }