tensor_stream-opencl 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2f7c2e06a5711e3efc8503de82f4c836af70c3b0dfd6ce0f4790f0bb6d3abcb9
4
- data.tar.gz: c103f23ba5d27f3a6356ed28b10966b8333f9fb3fabc203924ce357c4c0523c8
3
+ metadata.gz: 7888f5385dcec4b9a747128d1b25d0cb6d7fb01ab12f1c1b70706d0df5770903
4
+ data.tar.gz: 1129196d93a7b194d9de888fe49b43b0839c00589fb0987c0841ad1ac1693a4c
5
5
  SHA512:
6
- metadata.gz: 637ede65bf27b9ce06a755e344e58567c4d1e83e4831115e872d6f2ca0ff778f49f4d4e60af643a920fcf3a1b9033078b0c81f6e6e0f62f2e31f8f9ac4fee89b
7
- data.tar.gz: af8482a75b98db484c074c2862d455709ed5563e596819ad445a0c502467c0b5189eef04abbf55060dcbc0640289ce839715b19b3332aa9c21217345726ac3f3
6
+ metadata.gz: 727d97b9aa1402ed9681eb71fe0c0cdcc966e8a9e2f08b7480fdefa503509e26872b187a0b184d31a5b0cee86dc96e1d97f4fce14b378fa4a7c8ce4679b06421
7
+ data.tar.gz: 0e2f1601543d582042d0911222c1209f2df2c08139b7acc8ee347b5d7e236cd451a1b00032fb75d6f532105fb8f4ee49e53614071d111e0fabc8ed44010c4301
data/.gitignore CHANGED
@@ -9,6 +9,7 @@
9
9
  Gemfile.lock
10
10
  *.gem
11
11
  *.ckpt
12
+ profile.json
12
13
 
13
14
  # rspec failure tracking
14
15
  .rspec_status
@@ -26,7 +26,7 @@ tf.set_random_seed(seed)
26
26
  SHAPES = [32, 32]
27
27
 
28
28
  sess = tf.session(:ruby_evaluator)
29
-
29
+ large_tensor = tf.constant(sess.run(tf.random_uniform([256, 256])))
30
30
  a = tf.constant(sess.run(tf.random_uniform(SHAPES)))
31
31
  a_int = tf.constant([
32
32
  [1, 2, 3, 4, 4, 1, 4, 8, 3, 4, 1, 1],
@@ -49,6 +49,9 @@ c = tf.constant(sess.run(tf.random_uniform(SHAPES)))
49
49
 
50
50
  d = tf.constant(sess.run(tf.random_uniform(SHAPES)))
51
51
 
52
+ sample_image = tf.constant(sess.run(tf.random_uniform([10, 8, 8, 3])))
53
+ sample_filter = tf.constant(sess.run(tf.random_uniform([2, 2, 3, 3])))
54
+
52
55
  p = tf.placeholder('float')
53
56
  q = tf.placeholder('float')
54
57
 
@@ -61,6 +64,13 @@ out_of_order = tf.matmul(a, b) + tf.matmul(c, d)
61
64
  softmax = tf.nn.softmax(a)
62
65
  add_n = tf.add_n([a,b,c,d])
63
66
  split = tf.split(a, 4)
67
+ sum = tf.reduce_sum(large_tensor)
68
+ sum_axis_1 = tf.reduce_sum(large_tensor, 1)
69
+ min = tf.min(large_tensor, 1)
70
+ index = large_tensor[0]
71
+
72
+ conv2d = tf.nn.conv2d(sample_image, sample_filter, [1, 1, 1, 1], 'SAME')
73
+ conv2d_grad = tf.gradients(conv2d, [sample_image, sample_filter])
64
74
 
65
75
  puts TensorStream::Evaluator.default_evaluators
66
76
 
@@ -70,6 +80,18 @@ puts `cat /proc/cpuinfo | grep "model name" | head -1`
70
80
  device = TensorStream::Evaluator::OpenclEvaluator.default_device.native_device
71
81
  puts "OpenCL device #{device.platform.to_s} #{device.name}"
72
82
  Benchmark.bmbm do |x|
83
+ x.report("pure ruby conv2d_backprop :") { 100.times do sess.run(conv2d_grad) end }
84
+ x.report("opencl conv2d_backprop :") { 100.times do sess2.run(conv2d_grad) end }
85
+ x.report("pure ruby conv2d :") { 100.times do sess.run(conv2d) end }
86
+ x.report("opencl conv2d :") { 100.times do sess2.run(conv2d) end }
87
+ x.report("pure ruby arr index :") { 100.times do sess.run(index) end }
88
+ x.report("opencl arr index :") { 100.times do sess2.run(index) end }
89
+ x.report("pure ruby min :") { 100.times do sess.run(min) end }
90
+ x.report("opencl min :") { 100.times do sess2.run(min) end }
91
+ x.report("pure ruby sum :") { 100.times do sess.run(sum) end }
92
+ x.report("opencl sum :") { 100.times do sess2.run(sum) end }
93
+ x.report("pure ruby sum axis 1 :") { 100.times do sess.run(sum_axis_1) end }
94
+ x.report("opencl sum axis 1 :") { 100.times do sess2.run(sum_axis_1) end }
73
95
  x.report("pure ruby split :") { 100.times do sess.run(split) end }
74
96
  x.report("opencl split :") { 100.times do sess2.run(split) end }
75
97
  x.report("pure ruby add_n :") { 100.times do sess.run(add_n) end }
@@ -0,0 +1,56 @@
1
+ TensorStream::Evaluator::OpenclEvaluator
2
+ TensorStream::Evaluator::RubyEvaluator
3
+ model name : AMD Ryzen 3 1300X Quad-Core Processor
4
+ OpenCL device NVIDIA CUDA GeForce GTX 1060 6GB
5
+ Rehearsal --------------------------------------------------------------
6
+ pure ruby arr index : 0.005448 0.003557 0.009005 ( 0.008999)
7
+ opencl arr index : 0.074642 0.190132 0.264774 ( 0.275557)
8
+ pure ruby min : 0.256004 0.000777 0.256781 ( 0.256682)
9
+ opencl min : 0.017543 0.004523 0.022066 ( 0.018797)
10
+ pure ruby sum : 0.313039 0.000565 0.313604 ( 0.313535)
11
+ opencl sum : 0.009037 0.004249 0.013286 ( 0.011073)
12
+ pure ruby split : 0.017223 0.000300 0.017523 ( 0.017542)
13
+ opencl split : 0.033489 0.014394 0.047883 ( 0.038798)
14
+ pure ruby add_n : 0.159864 0.000153 0.160017 ( 0.159992)
15
+ opencl add_n : 0.018535 0.000563 0.019098 ( 0.016168)
16
+ pure ruby ooo matmul : 1.390970 0.000304 1.391274 ( 1.390790)
17
+ opencl ooo matmul : 0.014119 0.000229 0.014348 ( 0.011738)
18
+ pure ruby softmax : 0.024103 0.000014 0.024117 ( 0.024135)
19
+ opencl softmax : 0.010602 0.004277 0.014879 ( 0.011941)
20
+ pure ruby matmul : 0.668126 0.000006 0.668132 ( 0.667778)
21
+ opencl matmul : 0.006672 0.007527 0.014199 ( 0.011594)
22
+ pure ruby : 2.388817 0.000005 2.388822 ( 2.387870)
23
+ opencl : 0.152289 0.007804 0.160093 ( 0.156279)
24
+ pure ruby single function: 0.356575 0.000062 0.356637 ( 0.356488)
25
+ opencl singlefunction: 0.120073 0.000210 0.120283 ( 0.116378)
26
+ pure ruby pow float: 0.088966 0.000051 0.089017 ( 0.088996)
27
+ opencl pow float: 0.018054 0.000100 0.018154 ( 0.015429)
28
+ pure ruby pow int: 0.025430 0.000070 0.025500 ( 0.025524)
29
+ opencl pow int: 0.015652 0.003880 0.019532 ( 0.017386)
30
+ ----------------------------------------------------- total: 6.429024sec
31
+
32
+ user system total real
33
+ pure ruby arr index : 0.003564 0.000110 0.003674 ( 0.003636)
34
+ opencl arr index : 0.007966 0.003974 0.011940 ( 0.009775)
35
+ pure ruby min : 0.246153 0.000102 0.246255 ( 0.246172)
36
+ opencl min : 0.011787 0.007785 0.019572 ( 0.016169)
37
+ pure ruby sum : 0.294371 0.000000 0.294371 ( 0.294335)
38
+ opencl sum : 0.008266 0.003879 0.012145 ( 0.009315)
39
+ pure ruby split : 0.014552 0.000000 0.014552 ( 0.014539)
40
+ opencl split : 0.037984 0.004037 0.042021 ( 0.033276)
41
+ pure ruby add_n : 0.146300 0.000053 0.146353 ( 0.146319)
42
+ opencl add_n : 0.006426 0.007827 0.014253 ( 0.011461)
43
+ pure ruby ooo matmul : 1.373232 0.000096 1.373328 ( 1.372788)
44
+ opencl ooo matmul : 0.013838 0.000000 0.013838 ( 0.011088)
45
+ pure ruby softmax : 0.024478 0.000000 0.024478 ( 0.024493)
46
+ opencl softmax : 0.014117 0.000022 0.014139 ( 0.011246)
47
+ pure ruby matmul : 0.653146 0.000054 0.653200 ( 0.652889)
48
+ opencl matmul : 0.002750 0.011934 0.014684 ( 0.011729)
49
+ pure ruby : 2.392733 0.000058 2.392791 ( 2.391726)
50
+ opencl : 0.140118 0.016001 0.156119 ( 0.151788)
51
+ pure ruby single function: 0.352515 0.000000 0.352515 ( 0.352443)
52
+ opencl singlefunction: 0.093955 0.011813 0.105768 ( 0.102301)
53
+ pure ruby pow float: 0.083659 0.000000 0.083659 ( 0.083623)
54
+ opencl pow float: 0.017433 0.000125 0.017558 ( 0.014508)
55
+ pure ruby pow int: 0.018381 0.000000 0.018381 ( 0.018391)
56
+ opencl pow int: 0.008186 0.003755 0.011941 ( 0.009828)
@@ -197,6 +197,7 @@ module TensorStream
197
197
  ops = if axis.zero? # fast path
198
198
  inputs.each_with_index.map do |input, index|
199
199
  next if input.empty_value?
200
+
200
201
  start = index * input.buffer.size * input.buffer.element_size
201
202
  region = [input.buffer.size * input.buffer.element_size, 1, 1]
202
203
  event_wait_list = build_event_wait_list(input)
@@ -339,7 +340,7 @@ module TensorStream
339
340
 
340
341
  register_op :index, noop: true do |context, tensor, inputs|
341
342
  a = _run(inputs[0], context)
342
- index = read_final_result(_run(inputs[1], context))
343
+ index = inputs[1].value || read_final_result(_run(inputs[1], context))
343
344
 
344
345
  if a.is_a?(TensorStream::Evaluator::OutputGroup)
345
346
  a.outputs[index]
@@ -348,8 +349,7 @@ module TensorStream
348
349
  else
349
350
  new_shape = a.shape.dup
350
351
  new_shape.shift
351
- input_a = read_final_result(a)
352
- convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
352
+ _create_result_sub_buffer(a, index, tensor.data_type, new_shape, "#{tensor.name}/out_#{index}")
353
353
  end
354
354
  end
355
355
 
@@ -7,9 +7,23 @@ module TensorStream
7
7
  register_op :decode_png do |context, tensor, inputs|
8
8
  content = _run(inputs[0], context)
9
9
  channels = tensor.options[:channels]
10
+ resample_new_shape = tensor.options[:new_shape]
11
+ resample_method = tensor.options[:resample_method] || :bilinear
10
12
  channels = 4 if channels.zero?
11
13
 
12
14
  image = ChunkyPNG::Image.from_blob(content.buffer.to_a.pack('C*'))
15
+
16
+ if resample_new_shape
17
+ case resample_method
18
+ when :bilinear
19
+ image.resample_bilinear!(resample_new_shape[1], resample_new_shape[0]) # width, # height
20
+ when :nearest_neighbor
21
+ image.resample_nearest_neighbor!(resample_new_shape[1], resample_new_shape[0])
22
+ else
23
+ raise TensorStream::ValueError, "invalid resample method provided #{resample_method}. Available (:bilinear, :nearest_neighbor)"
24
+ end
25
+ end
26
+
13
27
  output_buffer = _create_result_buffer(tensor.data_type, [image.height, image.width, channels], "out_#{tensor.name}")
14
28
 
15
29
  image.grayscale! if channels == 1
@@ -38,6 +52,10 @@ module TensorStream
38
52
 
39
53
  register_op :encode_png do |_context, tensor, inputs|
40
54
  image_data = inputs[0]
55
+
56
+ resample_new_shape = tensor.options[:new_shape]
57
+ resample_method = tensor.options[:resample_method] || :bilinear
58
+
41
59
  height, width, channels = image_data.shape
42
60
  image_buffer = image_data.buffer.reshape(*image_data.shape.reverse).to_a
43
61
  \
@@ -53,6 +71,18 @@ module TensorStream
53
71
  end
54
72
  end
55
73
  end
74
+
75
+ if resample_new_shape
76
+ case resample_method
77
+ when :bilinear
78
+ png.resample_bilinear!(resample_new_shape[1], resample_new_shape[0]) # width, # height
79
+ when :nearest_neighbor
80
+ png.resample_nearest_neighbor!(resample_new_shape[1], resample_new_shape[0])
81
+ else
82
+ raise TensorStream::ValueError, "invalid resample method provided #{resample_method}. Available (:bilinear, :nearest_neighbor)"
83
+ end
84
+ end
85
+
56
86
  convert_to_opencl(png.to_s, [], data_type: :string, name: tensor.name)
57
87
  end
58
88
  end
@@ -0,0 +1,27 @@
1
+ % ctype = dtype_to_c_type(dtype)
2
+
3
+ __kernel void conv2d(const int height, const int width, __global const <%= ctype %> *images, __global const <%= ctype %> *filter, __global <%= ctype %> *output) {
4
+ // Get the index of the current element to be processed
5
+ const int batch_index = get_global_id(0);
6
+ const int h_index = get_global_id(1);
7
+ const int w_index = get_global_id(2);
8
+ const int h_index_with_stride = h_index * <%= stride[0] %>;
9
+ const int w_index_with_stride = w_index * <%= stride[1] %>;
10
+
11
+ const int image_index = batch_index * height * width * <%= ch %>;
12
+ const int image_row_width = width * <%= ch %>;
13
+
14
+ for (int out_channel_index = 0; out_channel_index < <%= out_ch %>; out_channel_index++) {
15
+ <%= ctype %> sum = 0;
16
+ for (int channel_index = 0; channel_index < <%= ch %>; channel_index++) {
17
+ for(int y = 0; y < <%= fh %>; y++) {
18
+ for (int x = 0; x < <%= fw %>; x++) {
19
+ if ( (h_index_with_stride + y) < height && (w_index_with_stride + x) < width) {
20
+ sum += images[image_index + (h_index_with_stride + y)*image_row_width + (w_index_with_stride + x)*<%= ch %> + channel_index] * filter[y*<%= fw * ch * out_ch %> + x*<%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
21
+ }
22
+ }
23
+ }
24
+ }
25
+ output[batch_index * (height/<%= stride[0] %>) * (width/<%= stride[1] %>) * <%= out_ch %> + h_index * (width/<%= stride[1] %>) * <%= out_ch %> + w_index * <%= out_ch %> + out_channel_index ] = sum;
26
+ }
27
+ }
@@ -0,0 +1,26 @@
1
+ % ctype = dtype_to_c_type(dtype)
2
+
3
+ __kernel void conv2d_backprop_filter(const int batch_size, const int height, const int width, __global const <%= ctype %> *images, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
4
+ // Get the index of the current element to be processed
5
+ const int fh_index = get_global_id(0);
6
+ const int fw_index = get_global_id(1);
7
+ const int f_out_channel = get_global_id(2);
8
+ const int image_size = height * width * <%= ch %>;
9
+ const int grad_image_row_width = width * <%= out_ch %>;
10
+
11
+ for(int channel = 0; channel < <%= ch %>; channel++) {
12
+ <%= ctype %> grad_sum = 0.0;
13
+ for(int batch = 0; batch < batch_size; batch++) {
14
+ const int image_index = batch * height * width * <%= out_ch %>;
15
+ for(int y = 0; y < height; y++) {
16
+ for (int x = 0; x < width; x++) {
17
+ if ( ((y - fh_index) % <%= stride[0]%>) == 0 && ((x - fw_index) % <%= stride[1]%>) == 0 && fh_index <= y && fw_index <= x) {
18
+ const <%= ctype %> image_grad = grad[image_index + ((y - fh_index) / <%= stride[0] %>) * grad_image_row_width + ((x - fw_index) / <%= stride[1] %>) * <%= out_ch %> + f_out_channel];
19
+ grad_sum += images[batch * image_size + y * width * <%= ch %> + x * <%= ch %> + channel] * image_grad;
20
+ }
21
+ }
22
+ }
23
+ }
24
+ output[fh_index * <%= fw * ch * out_ch %> + fw_index * <%= ch * out_ch %> + channel * <%= out_ch %> + f_out_channel] = grad_sum;
25
+ }
26
+ }
@@ -0,0 +1,32 @@
1
+ % ctype = dtype_to_c_type(dtype)
2
+
3
+ __kernel void conv2d_backprop_input(const int height, const int width, __global const <%= ctype %> *filter, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
4
+ // Get the index of the current element to be processed
5
+ int batch_index = get_global_id(0);
6
+ int h_index = get_global_id(1); // orig image y
7
+ int w_index = get_global_id(2); // orig image x
8
+
9
+ int h_index_with_stride = h_index / <%= stride[0] %>;
10
+ int w_index_with_stride = w_index / <%= stride[1] %>;
11
+ int grad_height = height / <%= stride[0] %>;
12
+ int grad_width = width / <%= stride[1] %>;
13
+
14
+ int image_index = batch_index * grad_height * grad_width * <%= out_ch %>;
15
+ int image_row_width = grad_width * <%= out_ch %>;
16
+
17
+ for (int channel_index = 0; channel_index < <%= ch %>; channel_index++) {
18
+ <%= ctype %> g = 0.0;
19
+ for (int out_channel_index = 0; out_channel_index < <%= out_ch %>; out_channel_index++) {
20
+ for(int y = 0; y < <%= fh %>; y++) {
21
+ for (int x = 0; x < <%= fw %>; x++) {
22
+ if ( (y <= h_index) && (x <= w_index) && ( (h_index - y) % <%= stride[0]%> == 0) && ( (w_index - x) % <%= stride[1]%> == 0)) {
23
+ <%= ctype %> imag_grad = grad[image_index + ( (h_index - y) / <%= stride[0] %>) * image_row_width + ( (w_index - x) / <%= stride[1] %>) * <%= out_ch %> + out_channel_index];
24
+ g += imag_grad * filter[y * <%= fw * ch * out_ch %> + x * <%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
25
+ }
26
+ }
27
+ }
28
+ }
29
+
30
+ output[batch_index * height * width * <%= ch %> + h_index * width * <%= ch %> + w_index * <%= ch %> + channel_index ] = g;
31
+ }
32
+ }
@@ -1,8 +1,6 @@
1
1
  // First naive implementation
2
2
  % c_dtype = dtype_to_c_type(dtype)
3
3
  __kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
4
- const int A_transpose,
5
- const int B_transpose,
6
4
  const __global <%= c_dtype %>* A,
7
5
  const __global <%= c_dtype %>* B,
8
6
  __global <%= c_dtype %>* C) {
@@ -16,14 +14,8 @@ __kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
16
14
  for (int k=0; k<K; k++) {
17
15
  int a_index = globalRow*K + k;
18
16
  int b_index = k*N + globalCol;
19
-
20
- if (A_transpose) {
21
- a_index = M*k + globalRow;
22
- }
23
-
24
- if (B_transpose) {
25
- b_index = globalCol*K + k;
26
- }
17
+ <% if ta %>a_index = M*k + globalRow;<% end %>
18
+ <% if tb %>b_index = globalCol*K + k;<% end %>
27
19
  acc += A[a_index] * B[b_index];
28
20
  }
29
21
 
@@ -5,7 +5,7 @@
5
5
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
6
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
7
 
8
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
8
+ C[globalRow * N + globalCol] = max((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>)B[globalRow * N + globalCol]);
9
9
  }
10
10
 
11
11
  // 1D + Scalar floating point add op
@@ -13,12 +13,8 @@
13
13
  // Get the index of the current element to be processed
14
14
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
15
15
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
16
-
17
- if (switch_op == 0) {
18
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[0] ? A[globalRow * N + globalCol] : B[0];
19
- } else {
20
- C[globalRow * N + globalCol] = B[0] >= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
21
- }
16
+
17
+ C[globalRow * N + globalCol] = max((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>)B[0]);
22
18
  }
23
19
 
24
20
  // 1D + Scalar floating point add op broadcast
@@ -26,7 +22,7 @@
26
22
  // Get the index of the current element to be processed
27
23
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
28
24
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
29
-
25
+
30
26
  int b_m_index = globalRow;
31
27
  int b_n_index = globalCol;
32
28
 
@@ -38,9 +34,5 @@
38
34
  b_n_index = b_n_index % N2;
39
35
  }
40
36
 
41
- if (switch_op == 0) {
42
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
43
- } else {
44
- C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] >= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
45
- }
37
+ C[globalRow * N + globalCol] = max((<%= c_dtype %>)A[globalRow * N + globalCol],(<%= c_dtype %>) B[b_m_index * N2 + b_n_index]);
46
38
  }
@@ -0,0 +1,26 @@
1
+ % c_dtype = dtype_to_c_type(dtype)
2
+ __kernel void mean_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
+ // Get the index of the current element to be processed
4
+ const int id = get_global_id(0);
5
+ int offset = (id + <%= index %>) * <%= w %>;
6
+ <%= c_dtype %> sum = 0;
7
+ <% if n > 4 %>
8
+ for(int i = 0; i < <%= n/4 %> ; i++) {
9
+ <% sums = 4.times.map do |i|
10
+ "A[offset + #{i}]"
11
+ end %>
12
+ sum += <%= sums.join(' + ') %>;
13
+ offset += 4;
14
+ }
15
+ <% if n%4!=0 %>
16
+ <% (n % 4).times do |i| %>
17
+ sum += A[offset + <%= i %>];
18
+ <% end %>
19
+ <% end %>
20
+ <% else %>
21
+ <% n.times do |i| %>
22
+ sum += A[offset + <%= i %>];
23
+ <% end %>
24
+ <% end %>
25
+ C[id] = sum / <%= n %>;
26
+ }
@@ -5,7 +5,7 @@
5
5
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
6
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
7
 
8
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
8
+ C[globalRow * N + globalCol] = min((<%= c_dtype %>)A[globalRow * N + globalCol],(<%= c_dtype %>) B[globalRow * N + globalCol]);
9
9
  }
10
10
 
11
11
  // 1D + Scalar floating point add op
@@ -14,11 +14,7 @@
14
14
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
15
15
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
16
16
 
17
- if (switch_op == 0) {
18
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[0] ? A[globalRow * N + globalCol] : B[0];
19
- } else {
20
- C[globalRow * N + globalCol] = B[0] <= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
21
- }
17
+ C[globalRow * N + globalCol] = min((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>) B[0]);
22
18
  }
23
19
 
24
20
  // 1D + Scalar floating point add op broadcast
@@ -38,9 +34,5 @@
38
34
  b_n_index = b_n_index % N2;
39
35
  }
40
36
 
41
- if (switch_op == 0) {
42
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
43
- } else {
44
- C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
45
- }
37
+ C[globalRow * N + globalCol] = min((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>)B[b_m_index * N2 + b_n_index]);
46
38
  }
@@ -0,0 +1,26 @@
1
+ % c_dtype = dtype_to_c_type(dtype)
2
+ __kernel void prod_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
+ // Get the index of the current element to be processed
4
+ int id = get_global_id(0);
5
+ int offset = (id + <%= index %>) * <%= w %>;
6
+ <%= c_dtype %> prod = 1;
7
+ <% if n > 4 %>
8
+ for(int i = 0; i < <%= n/4 %> ; i++) {
9
+ <% sums = 4.times.map do |i|
10
+ "A[offset + #{i}]"
11
+ end %>
12
+ prod *= <%= sums.join(' * ') %>;
13
+ offset += 4;
14
+ }
15
+ <% if n%4!=0 %>
16
+ <% (n % 4).times do |i| %>
17
+ prod *= A[offset + <%= i %>];
18
+ <% end %>
19
+ <% end %>
20
+ <% else %>
21
+ <% n.times do |i| %>
22
+ prod *= A[offset + <%= i %>];
23
+ <% end %>
24
+ <% end %>
25
+ C[id] = prod;
26
+ }