RubyGems - barracuda - Versions diffs - 1.1 → 1.2 - Mend

barracuda 1.1 → 1.2

This diff shows the changes between two publicly released versions of this package, as those versions appear in the public registry they were published to. It is provided for informational purposes only.

Files changed (7)

data/README.md CHANGED

@@ -58,16 +58,13 @@ EXAMPLE
 Consider the following example to sum a bunch of integers:
     program = Program.new <<-'eof'
-      __kernel sum(__global int *out, __global int *in, int total) {
-        int id = get_global_id(0);
-        if (id < total) atom_add(&out[0], in[id]);
+      __kernel sum(__global int *in, __global int *out) {
+        atom_add(out, in[get_global_id(0)]);
       }
     eof
-    arr    = (1..65536).to_a
-    input  = Buffer.new(arr)
     output = OutputBuffer.new(:int, 1)
-    program.sum(output, input, arr.size)
+    program.sum((1..65536).to_a, output)
     puts "The sum is: " + output.data[0].to_s
@@ -86,12 +83,6 @@ manually specify the work group size, call the kernel with an options hash:
     program.my_kernel_method(..., :times => 512)
-Note that the work group size must be a power of 2. Barracuda will increase
-the work group size to the next power of 2 if it needs to. This means your
-OpenCL program might run more iterations of your kernel method than you
-request. Because we can't rely on the work group size, we pass in the total
-data size to ensure we do not exceed the bounds of our data.
 CONVERTING TYPES
 ----------------

data/benchmarks/normalize.rb CHANGED

@@ -31,12 +31,11 @@ prog = Program.new <<-'eof'
   }
 eof
-num_vecs = 100000
+num_vecs = 1000000
 arr = []
 num_vecs.times { arr.push(rand, rand, rand, 0.0) }
 output = OutputBuffer.new(:float, arr.size)
 Benchmark.bmbm do |x|
   x.report("cpu") { norm_all(arr) }
   x.report("gpu") { prog.norm(output, arr, num_vecs) }

data/benchmarks/sort.rb CHANGED

@@ -9,7 +9,6 @@ prog = Program.new <<-'eof'
   __kernel sort(__global int *out, __global int *in, int total) {
     int i, final_index = 0, extra = 0;
     int id = get_global_id(0);
-    if (id >= total) return;
     int my_value = in[id];
     for (i = 0; i < total; i++) {
       if (in[i] < my_value) final_index++;
@@ -19,7 +18,7 @@ prog = Program.new <<-'eof'
   }
 eof
-max = 1000
+max = 10000
 arr = (1..max).map { (rand * max).to_i }
 output = OutputBuffer.new(:int, arr.size)

data/benchmarks/to_float.rb CHANGED

@@ -6,9 +6,9 @@ require 'benchmark'
 include Barracuda
 prog = Program.new <<-'eof'
-  __kernel sum(__global float *out, __global int *in, int total) {
+  __kernel sum(__global float *out, __global int *in) {
     int i = get_global_id(0);
-    if (i < total) out[i] = ((float)in[i] + 0.5) / 3.8 + 2.0;
+    out[i] = ((float)in[i] + 0.5) / 3.8 + 2.0;
   }
 eof
@@ -16,9 +16,8 @@ arr = (1..3333333).to_a
 input = Buffer.new(arr)
 output = OutputBuffer.new(:float, arr.size)
-TIMES = 1
 Benchmark.bmbm do |x|
-  x.report("regular") { TIMES.times { arr.map {|x| (x.to_f + 0.5) / 3.8 + 2.0 } } }
-  x.report("opencl") { TIMES.times { prog.sum(output, input, arr.size); output.clear } }
+  x.report("regular") { arr.map {|x| (x.to_f + 0.5) / 3.8 + 2.0 } }
+  x.report("opencl") { prog.sum(output, input) }
 end

data/ext/barracuda.c CHANGED

@@ -1,4 +1,5 @@
 #include <ruby.h>
+#include <math.h>
 #include <OpenCL/OpenCL.h>
 static VALUE rb_mBarracuda;
@@ -38,18 +39,15 @@ static VALUE buffer_data_set(VALUE self, VALUE new_value);
 static cl_device_id device_id = NULL;
 static cl_context context = NULL;
+static size_t max_work_group_size = 65535;
 static int err;
-#define VERSION_STRING "1.1"
+#define VERSION_STRING "1.2"
 struct program {
     cl_program program;
 };
-struct kernel {
-    cl_kernel kernel;
-};
 struct buffer {
     VALUE arr;
     ID type;
@@ -376,8 +374,6 @@ buffer_data_set(VALUE self, VALUE new_value)
 static VALUE
 buffer_initialize(int argc, VALUE *argv, VALUE self)
 {
-    GET_BUFFER();
     if (argc == 0) {
         rb_raise(rb_eArgError, "no buffer data given");
     }
@@ -495,21 +491,13 @@ program_compile(VALUE self, VALUE source)
     return Qtrue;
 }
-#define CLEAN() program_clean(kernel, commands);
-#define ERROR(msg) if (err != CL_SUCCESS) { CLEAN(); rb_raise(rb_eOpenCLError, msg); }
-static void
-program_clean(cl_kernel kernel, cl_command_queue commands)
-{
-    clReleaseKernel(kernel);
-    clReleaseCommandQueue(commands);
-}
+#define CLEAN() { clReleaseKernel(kernel); clReleaseCommandQueue(commands); }
 static VALUE
 program_method_missing(int argc, VALUE *argv, VALUE self)
 {
     int i;
-    size_t local = 0, global = 0;
+    size_t global[3] = {1, 1, 1}, local;
     cl_kernel kernel;
     cl_command_queue commands;
     GET_PROGRAM();
@@ -533,7 +521,7 @@ program_method_missing(int argc, VALUE *argv, VALUE self)
         if (i == argc - 1 && TYPE(item) == T_HASH) {
             VALUE worker_size = rb_hash_aref(item, ID2SYM(id_times));
             if (RTEST(worker_size) && TYPE(worker_size) == T_FIXNUM) {
-                global = FIX2UINT(worker_size);
+                global[0] = FIX2UINT(worker_size);
             }
             else {
                 CLEAN();
@@ -553,8 +541,8 @@ program_method_missing(int argc, VALUE *argv, VALUE self)
             struct buffer *buffer;
             Data_Get_Struct(item, struct buffer, buffer);
             err = clSetKernelArg(kernel, i - 1, sizeof(cl_mem), &buffer->data);
-            if (buffer->num_items > global) {
-                global = buffer->num_items;
+            if (buffer->num_items > global[0]) {
+                global[0] = buffer->num_items;
             }
         }
         else if (CLASS_OF(item) == rb_cBuffer) {
@@ -565,8 +553,8 @@ program_method_missing(int argc, VALUE *argv, VALUE self)
             clEnqueueWriteBuffer(commands, buffer->data, CL_TRUE, 0,
                 buffer->num_items * buffer->member_size, buffer->cachebuf, 0, NULL, NULL);
             err = clSetKernelArg(kernel, i - 1, sizeof(cl_mem), &buffer->data);
-            if (buffer->num_items > global) {
-                global = buffer->num_items;
+            if (buffer->num_items > global[0]) {
+                global[0] = buffer->num_items;
             }
         }
         else {
@@ -599,17 +587,16 @@ program_method_missing(int argc, VALUE *argv, VALUE self)
     }
     err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &local, NULL);
-    ERROR("failed to retrieve kernel work group info");
-    { /* global work size must be power of 2, greater than 3 and not smaller than local */
-        size_t size = 4;
-        while (size < global) size *= 2;
-        global = size;
-        if (global < local) global = local;
+    err = clEnqueueNDRangeKernel(commands, kernel, 3, NULL, global, NULL, 0, NULL, NULL);
+    if (err != CL_SUCCESS) {
+        CLEAN();
+        if (err == CL_INVALID_KERNEL_ARGS) {
+            rb_raise(rb_eArgError, "invalid arguments");
+        }
+        else {
+            rb_raise(rb_eOpenCLError, "failed to execute kernel method %d", err);
+        }
     }
-    clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
-    if (err) { CLEAN(); rb_raise(rb_eOpenCLError, "failed to execute kernel method"); }
     clFinish(commands);
@@ -620,7 +607,10 @@ program_method_missing(int argc, VALUE *argv, VALUE self)
             Data_Get_Struct(item, struct buffer, buffer);
             err = clEnqueueReadBuffer(commands, buffer->data, CL_TRUE, 0,
                 buffer->num_items * buffer->member_size, buffer->cachebuf, 0, NULL, NULL);
-            ERROR("failed to read output buffer");
+            if (err != CL_SUCCESS) {
+                CLEAN();
+                rb_raise(rb_eOpenCLError, "failed to read output buffer");
+            }
             buffer_read(item);
         }
     }
@@ -645,6 +635,10 @@ init_opencl()
             rb_raise(rb_eOpenCLError, "failed to create a program context");
         }
     }
+    clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+        sizeof(size_t), &max_work_group_size, NULL);
+    max_work_group_size = 4096;
 }
 void

data/test/test_barracuda.rb CHANGED

@@ -135,7 +135,7 @@ class TestProgram < Test::Unit::TestCase
   def test_kernel_run
     p = Program.new("__kernel x_y_z(int x) { }")
-    assert_nothing_raised { p.x_y_z }
+    assert_raise(ArgumentError) { p.x_y_z }
   end
   def test_kernel_missing
@@ -217,7 +217,7 @@ class TestProgram < Test::Unit::TestCase
     p = Program.new <<-'eof'
       __kernel sum(__global int* out, __global int* in, int total) {
         int id = get_global_id(0);
-        if (id < total) atom_add(&out[0], in[id]);
+        if (id < total) atom_add(out, in[id]);
       }
     eof
@@ -233,7 +233,7 @@ class TestProgram < Test::Unit::TestCase
     p = Program.new <<-'eof'
       __kernel sum(__global int* out, __global int* in, int total) {
         int id = get_global_id(0);
-        if (id < total) atom_add(&out[0], in[id]);
+        if (id < total) atom_add(out, in[id]);
       }
     eof
@@ -271,4 +271,21 @@ class TestProgram < Test::Unit::TestCase
     p.copy_to_out(out, [2.5, 2.5, 2.5, 2.5])
     assert_equal [3, 3, 3, 3], out.data
   end
+  def test_program_no_total
+    p = Program.new <<-'eof'
+      __kernel copy(__global int *out, __global int *in) {
+        int i = get_global_id(0);
+        out[i] = in[i] + 1;
+      }
+    eof
+    out = OutputBuffer.new(:int, 3)
+    p.copy(out, (1..3).to_a)
+    assert_equal (2..4).to_a, out.data
+    out = OutputBuffer.new(:int, 50446)
+    p.copy(out, (1..50446).to_a)
+    assert_equal (2..50447).to_a, out.data
+  end
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: barracuda
 version: !ruby/object:Gem::Version
-  version: "1.1"
+  version: "1.2"
 platform: ruby
 authors:
 - Loren Segal
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-09-02 00:00:00 -04:00
+date: 2009-09-03 00:00:00 -04:00
 default_executable:
 dependencies: []