tensor_stream-opencl 0.2.2 → 0.2.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 2f7c2e06a5711e3efc8503de82f4c836af70c3b0dfd6ce0f4790f0bb6d3abcb9
-  data.tar.gz: c103f23ba5d27f3a6356ed28b10966b8333f9fb3fabc203924ce357c4c0523c8
+  metadata.gz: 7888f5385dcec4b9a747128d1b25d0cb6d7fb01ab12f1c1b70706d0df5770903
+  data.tar.gz: 1129196d93a7b194d9de888fe49b43b0839c00589fb0987c0841ad1ac1693a4c
 SHA512:
-  metadata.gz: 637ede65bf27b9ce06a755e344e58567c4d1e83e4831115e872d6f2ca0ff778f49f4d4e60af643a920fcf3a1b9033078b0c81f6e6e0f62f2e31f8f9ac4fee89b
-  data.tar.gz: af8482a75b98db484c074c2862d455709ed5563e596819ad445a0c502467c0b5189eef04abbf55060dcbc0640289ce839715b19b3332aa9c21217345726ac3f3
+  metadata.gz: 727d97b9aa1402ed9681eb71fe0c0cdcc966e8a9e2f08b7480fdefa503509e26872b187a0b184d31a5b0cee86dc96e1d97f4fce14b378fa4a7c8ce4679b06421
+  data.tar.gz: 0e2f1601543d582042d0911222c1209f2df2c08139b7acc8ee347b5d7e236cd451a1b00032fb75d6f532105fb8f4ee49e53614071d111e0fabc8ed44010c4301
data/.gitignore CHANGED
@@ -9,6 +9,7 @@
 Gemfile.lock
 *.gem
 *.ckpt
+profile.json
 
 # rspec failure tracking
 .rspec_status
@@ -26,7 +26,7 @@ tf.set_random_seed(seed)
 SHAPES = [32, 32]
 
 sess = tf.session(:ruby_evaluator)
-
+large_tensor = tf.constant(sess.run(tf.random_uniform([256, 256])))
 a = tf.constant(sess.run(tf.random_uniform(SHAPES)))
 a_int = tf.constant([
   [1, 2, 3, 4, 4, 1, 4, 8, 3, 4, 1, 1],
@@ -49,6 +49,9 @@ c = tf.constant(sess.run(tf.random_uniform(SHAPES)))
 
 d = tf.constant(sess.run(tf.random_uniform(SHAPES)))
 
+sample_image = tf.constant(sess.run(tf.random_uniform([10, 8, 8, 3])))
+sample_filter = tf.constant(sess.run(tf.random_uniform([2, 2, 3, 3])))
+
 p = tf.placeholder('float')
 q = tf.placeholder('float')
 
@@ -61,6 +64,13 @@ out_of_order = tf.matmul(a, b) + tf.matmul(c, d)
 softmax = tf.nn.softmax(a)
 add_n = tf.add_n([a,b,c,d])
 split = tf.split(a, 4)
+sum = tf.reduce_sum(large_tensor)
+sum_axis_1 = tf.reduce_sum(large_tensor, 1)
+min = tf.min(large_tensor, 1)
+index = large_tensor[0]
+
+conv2d = tf.nn.conv2d(sample_image, sample_filter, [1, 1, 1, 1], 'SAME')
+conv2d_grad = tf.gradients(conv2d, [sample_image, sample_filter])
 
 puts TensorStream::Evaluator.default_evaluators
 
@@ -70,6 +80,18 @@ puts `cat /proc/cpuinfo | grep "model name" | head -1`
 device = TensorStream::Evaluator::OpenclEvaluator.default_device.native_device
 puts "OpenCL device #{device.platform.to_s} #{device.name}"
 Benchmark.bmbm do |x|
+  x.report("pure ruby conv2d_backprop :") { 100.times do sess.run(conv2d_grad) end }
+  x.report("opencl conv2d_backprop :") { 100.times do sess2.run(conv2d_grad) end }
+  x.report("pure ruby conv2d :") { 100.times do sess.run(conv2d) end }
+  x.report("opencl conv2d :") { 100.times do sess2.run(conv2d) end }
+  x.report("pure ruby arr index :") { 100.times do sess.run(index) end }
+  x.report("opencl arr index :") { 100.times do sess2.run(index) end }
+  x.report("pure ruby min :") { 100.times do sess.run(min) end }
+  x.report("opencl min :") { 100.times do sess2.run(min) end }
+  x.report("pure ruby sum :") { 100.times do sess.run(sum) end }
+  x.report("opencl sum :") { 100.times do sess2.run(sum) end }
+  x.report("pure ruby sum axis 1 :") { 100.times do sess.run(sum_axis_1) end }
+  x.report("opencl sum axis 1 :") { 100.times do sess2.run(sum_axis_1) end }
  x.report("pure ruby split :") { 100.times do sess.run(split) end }
  x.report("opencl split :") { 100.times do sess2.run(split) end }
  x.report("pure ruby add_n :") { 100.times do sess.run(add_n) end }
@@ -0,0 +1,56 @@
+TensorStream::Evaluator::OpenclEvaluator
+TensorStream::Evaluator::RubyEvaluator
+model name : AMD Ryzen 3 1300X Quad-Core Processor
+OpenCL device NVIDIA CUDA GeForce GTX 1060 6GB
+Rehearsal --------------------------------------------------------------
+pure ruby arr index :      0.005448   0.003557   0.009005 (  0.008999)
+opencl arr index :         0.074642   0.190132   0.264774 (  0.275557)
+pure ruby min :            0.256004   0.000777   0.256781 (  0.256682)
+opencl min :               0.017543   0.004523   0.022066 (  0.018797)
+pure ruby sum :            0.313039   0.000565   0.313604 (  0.313535)
+opencl sum :               0.009037   0.004249   0.013286 (  0.011073)
+pure ruby split :          0.017223   0.000300   0.017523 (  0.017542)
+opencl split :             0.033489   0.014394   0.047883 (  0.038798)
+pure ruby add_n :          0.159864   0.000153   0.160017 (  0.159992)
+opencl add_n :             0.018535   0.000563   0.019098 (  0.016168)
+pure ruby ooo matmul :     1.390970   0.000304   1.391274 (  1.390790)
+opencl ooo matmul :        0.014119   0.000229   0.014348 (  0.011738)
+pure ruby softmax :        0.024103   0.000014   0.024117 (  0.024135)
+opencl softmax :           0.010602   0.004277   0.014879 (  0.011941)
+pure ruby matmul :         0.668126   0.000006   0.668132 (  0.667778)
+opencl matmul :            0.006672   0.007527   0.014199 (  0.011594)
+pure ruby :                2.388817   0.000005   2.388822 (  2.387870)
+opencl :                   0.152289   0.007804   0.160093 (  0.156279)
+pure ruby single function: 0.356575   0.000062   0.356637 (  0.356488)
+opencl singlefunction:     0.120073   0.000210   0.120283 (  0.116378)
+pure ruby pow float:       0.088966   0.000051   0.089017 (  0.088996)
+opencl pow float:          0.018054   0.000100   0.018154 (  0.015429)
+pure ruby pow int:         0.025430   0.000070   0.025500 (  0.025524)
+opencl pow int:            0.015652   0.003880   0.019532 (  0.017386)
+----------------------------------------------------- total: 6.429024sec
+
+                               user     system      total        real
+pure ruby arr index :      0.003564   0.000110   0.003674 (  0.003636)
+opencl arr index :         0.007966   0.003974   0.011940 (  0.009775)
+pure ruby min :            0.246153   0.000102   0.246255 (  0.246172)
+opencl min :               0.011787   0.007785   0.019572 (  0.016169)
+pure ruby sum :            0.294371   0.000000   0.294371 (  0.294335)
+opencl sum :               0.008266   0.003879   0.012145 (  0.009315)
+pure ruby split :          0.014552   0.000000   0.014552 (  0.014539)
+opencl split :             0.037984   0.004037   0.042021 (  0.033276)
+pure ruby add_n :          0.146300   0.000053   0.146353 (  0.146319)
+opencl add_n :             0.006426   0.007827   0.014253 (  0.011461)
+pure ruby ooo matmul :     1.373232   0.000096   1.373328 (  1.372788)
+opencl ooo matmul :        0.013838   0.000000   0.013838 (  0.011088)
+pure ruby softmax :        0.024478   0.000000   0.024478 (  0.024493)
+opencl softmax :           0.014117   0.000022   0.014139 (  0.011246)
+pure ruby matmul :         0.653146   0.000054   0.653200 (  0.652889)
+opencl matmul :            0.002750   0.011934   0.014684 (  0.011729)
+pure ruby :                2.392733   0.000058   2.392791 (  2.391726)
+opencl :                   0.140118   0.016001   0.156119 (  0.151788)
+pure ruby single function: 0.352515   0.000000   0.352515 (  0.352443)
+opencl singlefunction:     0.093955   0.011813   0.105768 (  0.102301)
+pure ruby pow float:       0.083659   0.000000   0.083659 (  0.083623)
+opencl pow float:          0.017433   0.000125   0.017558 (  0.014508)
+pure ruby pow int:         0.018381   0.000000   0.018381 (  0.018391)
+opencl pow int:            0.008186   0.003755   0.011941 (  0.009828)
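Rough speedups computed by hand from the real-time column of the final run above:

# OpenCL vs. pure Ruby, real-time column, final run
0.294335 / 0.009315 # sum        => ~31.6x faster on OpenCL
1.372788 / 0.011088 # ooo matmul => ~123.8x faster
0.652889 / 0.011729 # matmul     => ~55.7x faster
0.003636 / 0.009775 # arr index  => ~0.37x: tiny reads are still faster in pure Ruby

That last ratio is presumably what motivated the :index sub-buffer rework below, which avoids a device-to-host round trip per slice.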
@@ -197,6 +197,7 @@ module TensorStream
       ops = if axis.zero? # fast path
               inputs.each_with_index.map do |input, index|
                 next if input.empty_value?
+
                 start = index * input.buffer.size * input.buffer.element_size
                 region = [input.buffer.size * input.buffer.element_size, 1, 1]
                 event_wait_list = build_event_wait_list(input)
@@ -339,7 +340,7 @@ module TensorStream
 
     register_op :index, noop: true do |context, tensor, inputs|
       a = _run(inputs[0], context)
-      index = read_final_result(_run(inputs[1], context))
+      index = inputs[1].value || read_final_result(_run(inputs[1], context))
 
       if a.is_a?(TensorStream::Evaluator::OutputGroup)
        a.outputs[index]
@@ -348,8 +349,7 @@ module TensorStream
      else
        new_shape = a.shape.dup
        new_shape.shift
-       input_a = read_final_result(a)
-       convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
+       _create_result_sub_buffer(a, index, tensor.data_type, new_shape, "#{tensor.name}/out_#{index}")
      end
    end
 
@@ -7,9 +7,23 @@ module TensorStream
     register_op :decode_png do |context, tensor, inputs|
       content = _run(inputs[0], context)
       channels = tensor.options[:channels]
+      resample_new_shape = tensor.options[:new_shape]
+      resample_method = tensor.options[:resample_method] || :bilinear
       channels = 4 if channels.zero?
 
       image = ChunkyPNG::Image.from_blob(content.buffer.to_a.pack('C*'))
+
+      if resample_new_shape
+        case resample_method
+        when :bilinear
+          image.resample_bilinear!(resample_new_shape[1], resample_new_shape[0]) # width, height
+        when :nearest_neighbor
+          image.resample_nearest_neighbor!(resample_new_shape[1], resample_new_shape[0])
+        else
+          raise TensorStream::ValueError, "invalid resample method provided #{resample_method}. Available (:bilinear, :nearest_neighbor)"
+        end
+      end
+
       output_buffer = _create_result_buffer(tensor.data_type, [image.height, image.width, channels], "out_#{tensor.name}")
 
       image.grayscale! if channels == 1
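decode_png now honors two new tensor options: :new_shape ([height, width]) and :resample_method (:bilinear by default, or :nearest_neighbor), resampling through ChunkyPNG before the result buffer is built. A usage sketch follows; the option names come straight from the code above, but the public wrappers (ts.read_file, ts.image.decode_png) are assumptions borrowed from the TensorFlow-style API:

ts = TensorStream
sess = ts.session

contents = ts.read_file('input.png') # assumption: TF-style file-read op
img = ts.image.decode_png(contents,
                          channels: 3,
                          new_shape: [64, 64], # [height, width]
                          resample_method: :nearest_neighbor)
pixels = sess.run(img) # nested [height][width][channels] array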
@@ -38,6 +52,10 @@ module TensorStream
 
     register_op :encode_png do |_context, tensor, inputs|
       image_data = inputs[0]
+
+      resample_new_shape = tensor.options[:new_shape]
+      resample_method = tensor.options[:resample_method] || :bilinear
+
       height, width, channels = image_data.shape
       image_buffer = image_data.buffer.reshape(*image_data.shape.reverse).to_a
@@ -53,6 +71,18 @@ module TensorStream
          end
        end
      end
+
+      if resample_new_shape
+        case resample_method
+        when :bilinear
+          png.resample_bilinear!(resample_new_shape[1], resample_new_shape[0]) # width, height
+        when :nearest_neighbor
+          png.resample_nearest_neighbor!(resample_new_shape[1], resample_new_shape[0])
+        else
+          raise TensorStream::ValueError, "invalid resample method provided #{resample_method}. Available (:bilinear, :nearest_neighbor)"
+        end
+      end
+
      convert_to_opencl(png.to_s, [], data_type: :string, name: tensor.name)
    end
  end
@@ -0,0 +1,27 @@
+% ctype = dtype_to_c_type(dtype)
+
+__kernel void conv2d(const int height, const int width, __global const <%= ctype %> *images, __global const <%= ctype %> *filter, __global <%= ctype %> *output) {
+  // Get the index of the current element to be processed
+  const int batch_index = get_global_id(0);
+  const int h_index = get_global_id(1);
+  const int w_index = get_global_id(2);
+  const int h_index_with_stride = h_index * <%= stride[0] %>;
+  const int w_index_with_stride = w_index * <%= stride[1] %>;
+
+  const int image_index = batch_index * height * width * <%= ch %>;
+  const int image_row_width = width * <%= ch %>;
+
+  for (int out_channel_index = 0; out_channel_index < <%= out_ch %>; out_channel_index++) {
+    <%= ctype %> sum = 0;
+    for (int channel_index = 0; channel_index < <%= ch %>; channel_index++) {
+      for (int y = 0; y < <%= fh %>; y++) {
+        for (int x = 0; x < <%= fw %>; x++) {
+          if ((h_index_with_stride + y) < height && (w_index_with_stride + x) < width) {
+            sum += images[image_index + (h_index_with_stride + y)*image_row_width + (w_index_with_stride + x)*<%= ch %> + channel_index] * filter[y*<%= fw * ch * out_ch %> + x*<%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
+          }
+        }
+      }
+    }
+    output[batch_index * (height/<%= stride[0] %>) * (width/<%= stride[1] %>) * <%= out_ch %> + h_index * (width/<%= stride[1] %>) * <%= out_ch %> + w_index * <%= out_ch %> + out_channel_index] = sum;
+  }
+}
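The kernel computes one output pixel per work item over a (batch, output row, output column) global range. Below is a plain-Ruby restatement of the same indexing scheme, handy for spot-checking the kernel against nested arrays (assumptions: NHWC images, HWIO filters, and implicit zero padding past the bottom/right edges, which is what the bounds check above implies):

# Reference conv2d over nested Ruby arrays: images is [batch][h][w][ch],
# filter is [fh][fw][ch][out_ch]; stride_h/stride_w mirror stride[0]/stride[1].
def conv2d_reference(images, filter, stride_h, stride_w)
  height, width, ch = images[0].size, images[0][0].size, images[0][0][0].size
  fh, fw, out_ch = filter.size, filter[0].size, filter[0][0][0].size

  images.map do |image|                  # the kernel runs one work item per (h, w)
    Array.new(height / stride_h) do |h|
      Array.new(width / stride_w) do |w|
        Array.new(out_ch) do |oc|
          sum = 0.0
          ch.times do |c|
            fh.times do |y|
              fw.times do |x|
                iy = h * stride_h + y
                ix = w * stride_w + x
                # the bounds check doubles as implicit zero padding
                next unless iy < height && ix < width
                sum += image[iy][ix][c] * filter[y][x][c][oc]
              end
            end
          end
          sum
        end
      end
    end
  end
end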
@@ -0,0 +1,26 @@
+% ctype = dtype_to_c_type(dtype)
+
+__kernel void conv2d_backprop_filter(const int batch_size, const int height, const int width, __global const <%= ctype %> *images, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
+  // Get the index of the current element to be processed
+  const int fh_index = get_global_id(0);
+  const int fw_index = get_global_id(1);
+  const int f_out_channel = get_global_id(2);
+  const int image_size = height * width * <%= ch %>;
+  const int grad_image_row_width = width * <%= out_ch %>;
+
+  for (int channel = 0; channel < <%= ch %>; channel++) {
+    <%= ctype %> grad_sum = 0.0;
+    for (int batch = 0; batch < batch_size; batch++) {
+      const int image_index = batch * height * width * <%= out_ch %>;
+      for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x++) {
+          if (((y - fh_index) % <%= stride[0] %>) == 0 && ((x - fw_index) % <%= stride[1] %>) == 0 && fh_index <= y && fw_index <= x) {
+            const <%= ctype %> image_grad = grad[image_index + ((y - fh_index) / <%= stride[0] %>) * grad_image_row_width + ((x - fw_index) / <%= stride[1] %>) * <%= out_ch %> + f_out_channel];
+            grad_sum += images[batch * image_size + y * width * <%= ch %> + x * <%= ch %> + channel] * image_grad;
+          }
+        }
+      }
+    }
+    output[fh_index * <%= fw * ch * out_ch %> + fw_index * <%= ch * out_ch %> + channel * <%= out_ch %> + f_out_channel] = grad_sum;
+  }
+}
@@ -0,0 +1,32 @@
+% ctype = dtype_to_c_type(dtype)
+
+__kernel void conv2d_backprop_input(const int height, const int width, __global const <%= ctype %> *filter, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
+  // Get the index of the current element to be processed
+  int batch_index = get_global_id(0);
+  int h_index = get_global_id(1); // orig image y
+  int w_index = get_global_id(2); // orig image x
+
+  int h_index_with_stride = h_index / <%= stride[0] %>;
+  int w_index_with_stride = w_index / <%= stride[1] %>;
+  int grad_height = height / <%= stride[0] %>;
+  int grad_width = width / <%= stride[1] %>;
+
+  int image_index = batch_index * grad_height * grad_width * <%= out_ch %>;
+  int image_row_width = grad_width * <%= out_ch %>;
+
+  for (int channel_index = 0; channel_index < <%= ch %>; channel_index++) {
+    <%= ctype %> g = 0.0;
+    for (int out_channel_index = 0; out_channel_index < <%= out_ch %>; out_channel_index++) {
+      for (int y = 0; y < <%= fh %>; y++) {
+        for (int x = 0; x < <%= fw %>; x++) {
+          if ((y <= h_index) && (x <= w_index) && ((h_index - y) % <%= stride[0] %> == 0) && ((w_index - x) % <%= stride[1] %> == 0)) {
+            <%= ctype %> imag_grad = grad[image_index + ((h_index - y) / <%= stride[0] %>) * image_row_width + ((w_index - x) / <%= stride[1] %>) * <%= out_ch %> + out_channel_index];
+            g += imag_grad * filter[y * <%= fw * ch * out_ch %> + x * <%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
+          }
+        }
+      }
+    }
+
+    output[batch_index * height * width * <%= ch %> + h_index * width * <%= ch %> + w_index * <%= ch %> + channel_index] = g;
+  }
+}
@@ -1,8 +1,6 @@
 // First naive implementation
 % c_dtype = dtype_to_c_type(dtype)
 __kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
-                      const int A_transpose,
-                      const int B_transpose,
                       const __global <%= c_dtype %>* A,
                       const __global <%= c_dtype %>* B,
                       __global <%= c_dtype %>* C) {
@@ -16,14 +14,8 @@ __kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
   for (int k=0; k<K; k++) {
     int a_index = globalRow*K + k;
     int b_index = k*N + globalCol;
-
-    if (A_transpose) {
-      a_index = M*k + globalRow;
-    }
-
-    if (B_transpose) {
-      b_index = globalCol*K + k;
-    }
+    <% if ta %>a_index = M*k + globalRow;<% end %>
+    <% if tb %>b_index = globalCol*K + k;<% end %>
     acc += A[a_index] * B[b_index];
   }
 
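The transpose flags move from runtime kernel arguments to ERB template-time conditionals (ta/tb), so every (ta, tb) combination compiles to its own branch-free kernel. A minimal sketch of that specialization using stdlib ERB (the gem's own template renderer may differ):

require 'erb'

snippet = <<~SRC
  int a_index = globalRow*K + k;
  int b_index = k*N + globalCol;
  <% if ta %>a_index = M*k + globalRow;<% end %>
  <% if tb %>b_index = globalCol*K + k;<% end %>
SRC

ta = true
tb = false
puts ERB.new(snippet).result(binding)
# The a_index override is emitted into the kernel source; the b_index override
# is not, so the compiled program carries no transpose branch at runtime.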
@@ -5,7 +5,7 @@
   const int globalRow = get_global_id(0); // Row ID of C (0..M)
   const int globalCol = get_global_id(1); // Col ID of C (0..N)
 
-  C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
+  C[globalRow * N + globalCol] = max((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>)B[globalRow * N + globalCol]);
 }
 
 // 1D + Scalar floating point add op
@@ -13,12 +13,8 @@
   // Get the index of the current element to be processed
   const int globalRow = get_global_id(0); // Row ID of C (0..M)
   const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-  if (switch_op == 0) {
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[0] ? A[globalRow * N + globalCol] : B[0];
-  } else {
-    C[globalRow * N + globalCol] = B[0] >= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
-  }
+
+  C[globalRow * N + globalCol] = max((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>)B[0]);
 }
 
 // 1D + Scalar floating point add op broadcast
@@ -26,7 +22,7 @@
   // Get the index of the current element to be processed
   const int globalRow = get_global_id(0); // Row ID of C (0..M)
   const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
+
   int b_m_index = globalRow;
   int b_n_index = globalCol;
 
@@ -38,9 +34,5 @@
     b_n_index = b_n_index % N2;
   }
 
-  if (switch_op == 0) {
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
-  } else {
-    C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] >= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
-  }
+  C[globalRow * N + globalCol] = max((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>)B[b_m_index * N2 + b_n_index]);
 }
@@ -0,0 +1,26 @@
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void mean_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+  // Get the index of the current element to be processed
+  const int id = get_global_id(0);
+  int offset = (id + <%= index %>) * <%= w %>;
+  <%= c_dtype %> sum = 0;
+<% if n > 4 %>
+  for (int i = 0; i < <%= n / 4 %>; i++) {
+    <% sums = 4.times.map do |i|
+         "A[offset + #{i}]"
+       end %>
+    sum += <%= sums.join(' + ') %>;
+    offset += 4;
+  }
+  <% if n % 4 != 0 %>
+    <% (n % 4).times do |i| %>
+  sum += A[offset + <%= i %>];
+    <% end %>
+  <% end %>
+<% else %>
+  <% n.times do |i| %>
+  sum += A[offset + <%= i %>];
+  <% end %>
+<% end %>
+  C[id] = sum / <%= n %>;
+}
@@ -5,7 +5,7 @@
   const int globalRow = get_global_id(0); // Row ID of C (0..M)
   const int globalCol = get_global_id(1); // Col ID of C (0..N)
 
-  C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
+  C[globalRow * N + globalCol] = min((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>)B[globalRow * N + globalCol]);
 }
 
 // 1D + Scalar floating point add op
@@ -14,11 +14,7 @@
   const int globalRow = get_global_id(0); // Row ID of C (0..M)
   const int globalCol = get_global_id(1); // Col ID of C (0..N)
 
-  if (switch_op == 0) {
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[0] ? A[globalRow * N + globalCol] : B[0];
-  } else {
-    C[globalRow * N + globalCol] = B[0] <= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
-  }
+  C[globalRow * N + globalCol] = min((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>)B[0]);
 }
 
 // 1D + Scalar floating point add op broadcast
@@ -38,9 +34,5 @@
     b_n_index = b_n_index % N2;
   }
 
-  if (switch_op == 0) {
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
-  } else {
-    C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
-  }
+  C[globalRow * N + globalCol] = min((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>)B[b_m_index * N2 + b_n_index]);
 }
@@ -0,0 +1,26 @@
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void prod_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+  // Get the index of the current element to be processed
+  int id = get_global_id(0);
+  int offset = (id + <%= index %>) * <%= w %>;
+  <%= c_dtype %> prod = 1;
+<% if n > 4 %>
+  for (int i = 0; i < <%= n / 4 %>; i++) {
+    <% sums = 4.times.map do |i|
+         "A[offset + #{i}]"
+       end %>
+    prod *= <%= sums.join(' * ') %>;
+    offset += 4;
+  }
+  <% if n % 4 != 0 %>
+    <% (n % 4).times do |i| %>
+  prod *= A[offset + <%= i %>];
+    <% end %>
+  <% end %>
+<% else %>
+  <% n.times do |i| %>
+  prod *= A[offset + <%= i %>];
+  <% end %>
+<% end %>
+  C[id] = prod;
+}