barracuda 1.1 → 1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +3 -12
- data/benchmarks/normalize.rb +1 -2
- data/benchmarks/sort.rb +1 -2
- data/benchmarks/to_float.rb +4 -5
- data/ext/barracuda.c +27 -33
- data/test/test_barracuda.rb +20 -3
- metadata +2 -2
data/README.md
CHANGED
@@ -58,16 +58,13 @@ EXAMPLE
|
|
58
58
|
Consider the following example to sum a bunch of integers:
|
59
59
|
|
60
60
|
program = Program.new <<-'eof'
|
61
|
-
__kernel sum(__global int *
|
62
|
-
|
63
|
-
if (id < total) atom_add(&out[0], in[id]);
|
61
|
+
__kernel sum(__global int *in, __global int *out) {
|
62
|
+
atom_add(out, in[get_global_id(0)]);
|
64
63
|
}
|
65
64
|
eof
|
66
65
|
|
67
|
-
arr = (1..65536).to_a
|
68
|
-
input = Buffer.new(arr)
|
69
66
|
output = OutputBuffer.new(:int, 1)
|
70
|
-
program.sum(
|
67
|
+
program.sum((1..65536).to_a, output)
|
71
68
|
|
72
69
|
puts "The sum is: " + output.data[0].to_s
|
73
70
|
|
@@ -86,12 +83,6 @@ manually specify the work group size, call the kernel with an options hash:
|
|
86
83
|
|
87
84
|
program.my_kernel_method(..., :times => 512)
|
88
85
|
|
89
|
-
Note that the work group size must be a power of 2. Barracuda will increase
|
90
|
-
the work group size to the next power of 2 if it needs to. This means your
|
91
|
-
OpenCL program might run more iterations of your kernel method than you
|
92
|
-
request. Because we can't rely on the work group size, we pass in the total
|
93
|
-
data size to ensure we do not exceed the bounds of our data.
|
94
|
-
|
95
86
|
CONVERTING TYPES
|
96
87
|
----------------
|
97
88
|
|
data/benchmarks/normalize.rb
CHANGED
@@ -31,12 +31,11 @@ prog = Program.new <<-'eof'
|
|
31
31
|
}
|
32
32
|
eof
|
33
33
|
|
34
|
-
num_vecs =
|
34
|
+
num_vecs = 1000000
|
35
35
|
arr = []
|
36
36
|
num_vecs.times { arr.push(rand, rand, rand, 0.0) }
|
37
37
|
output = OutputBuffer.new(:float, arr.size)
|
38
38
|
|
39
|
-
|
40
39
|
Benchmark.bmbm do |x|
|
41
40
|
x.report("cpu") { norm_all(arr) }
|
42
41
|
x.report("gpu") { prog.norm(output, arr, num_vecs) }
|
data/benchmarks/sort.rb
CHANGED
@@ -9,7 +9,6 @@ prog = Program.new <<-'eof'
|
|
9
9
|
__kernel sort(__global int *out, __global int *in, int total) {
|
10
10
|
int i, final_index = 0, extra = 0;
|
11
11
|
int id = get_global_id(0);
|
12
|
-
if (id >= total) return;
|
13
12
|
int my_value = in[id];
|
14
13
|
for (i = 0; i < total; i++) {
|
15
14
|
if (in[i] < my_value) final_index++;
|
@@ -19,7 +18,7 @@ prog = Program.new <<-'eof'
|
|
19
18
|
}
|
20
19
|
eof
|
21
20
|
|
22
|
-
max =
|
21
|
+
max = 10000
|
23
22
|
arr = (1..max).map { (rand * max).to_i }
|
24
23
|
output = OutputBuffer.new(:int, arr.size)
|
25
24
|
|
data/benchmarks/to_float.rb
CHANGED
@@ -6,9 +6,9 @@ require 'benchmark'
|
|
6
6
|
include Barracuda
|
7
7
|
|
8
8
|
prog = Program.new <<-'eof'
|
9
|
-
__kernel sum(__global float *out, __global int *in
|
9
|
+
__kernel sum(__global float *out, __global int *in) {
|
10
10
|
int i = get_global_id(0);
|
11
|
-
|
11
|
+
out[i] = ((float)in[i] + 0.5) / 3.8 + 2.0;
|
12
12
|
}
|
13
13
|
eof
|
14
14
|
|
@@ -16,9 +16,8 @@ arr = (1..3333333).to_a
|
|
16
16
|
input = Buffer.new(arr)
|
17
17
|
output = OutputBuffer.new(:float, arr.size)
|
18
18
|
|
19
|
-
TIMES = 1
|
20
19
|
Benchmark.bmbm do |x|
|
21
|
-
x.report("regular") {
|
22
|
-
x.report("opencl") {
|
20
|
+
x.report("regular") { arr.map {|x| (x.to_f + 0.5) / 3.8 + 2.0 } }
|
21
|
+
x.report("opencl") { prog.sum(output, input) }
|
23
22
|
end
|
24
23
|
|
data/ext/barracuda.c
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#include <ruby.h>
|
2
|
+
#include <math.h>
|
2
3
|
#include <OpenCL/OpenCL.h>
|
3
4
|
|
4
5
|
static VALUE rb_mBarracuda;
|
@@ -38,18 +39,15 @@ static VALUE buffer_data_set(VALUE self, VALUE new_value);
|
|
38
39
|
|
39
40
|
static cl_device_id device_id = NULL;
|
40
41
|
static cl_context context = NULL;
|
42
|
+
static size_t max_work_group_size = 65535;
|
41
43
|
static int err;
|
42
44
|
|
43
|
-
#define VERSION_STRING "1.1"
|
45
|
+
#define VERSION_STRING "1.2"
|
44
46
|
|
45
47
|
struct program {
|
46
48
|
cl_program program;
|
47
49
|
};
|
48
50
|
|
49
|
-
struct kernel {
|
50
|
-
cl_kernel kernel;
|
51
|
-
};
|
52
|
-
|
53
51
|
struct buffer {
|
54
52
|
VALUE arr;
|
55
53
|
ID type;
|
@@ -376,8 +374,6 @@ buffer_data_set(VALUE self, VALUE new_value)
|
|
376
374
|
static VALUE
|
377
375
|
buffer_initialize(int argc, VALUE *argv, VALUE self)
|
378
376
|
{
|
379
|
-
GET_BUFFER();
|
380
|
-
|
381
377
|
if (argc == 0) {
|
382
378
|
rb_raise(rb_eArgError, "no buffer data given");
|
383
379
|
}
|
@@ -495,21 +491,13 @@ program_compile(VALUE self, VALUE source)
|
|
495
491
|
return Qtrue;
|
496
492
|
}
|
497
493
|
|
498
|
-
#define CLEAN()
|
499
|
-
#define ERROR(msg) if (err != CL_SUCCESS) { CLEAN(); rb_raise(rb_eOpenCLError, msg); }
|
500
|
-
|
501
|
-
static void
|
502
|
-
program_clean(cl_kernel kernel, cl_command_queue commands)
|
503
|
-
{
|
504
|
-
clReleaseKernel(kernel);
|
505
|
-
clReleaseCommandQueue(commands);
|
506
|
-
}
|
494
|
+
#define CLEAN() { clReleaseKernel(kernel); clReleaseCommandQueue(commands); }
|
507
495
|
|
508
496
|
static VALUE
|
509
497
|
program_method_missing(int argc, VALUE *argv, VALUE self)
|
510
498
|
{
|
511
499
|
int i;
|
512
|
-
size_t
|
500
|
+
size_t global[3] = {1, 1, 1}, local;
|
513
501
|
cl_kernel kernel;
|
514
502
|
cl_command_queue commands;
|
515
503
|
GET_PROGRAM();
|
@@ -533,7 +521,7 @@ program_method_missing(int argc, VALUE *argv, VALUE self)
|
|
533
521
|
if (i == argc - 1 && TYPE(item) == T_HASH) {
|
534
522
|
VALUE worker_size = rb_hash_aref(item, ID2SYM(id_times));
|
535
523
|
if (RTEST(worker_size) && TYPE(worker_size) == T_FIXNUM) {
|
536
|
-
global = FIX2UINT(worker_size);
|
524
|
+
global[0] = FIX2UINT(worker_size);
|
537
525
|
}
|
538
526
|
else {
|
539
527
|
CLEAN();
|
@@ -553,8 +541,8 @@ program_method_missing(int argc, VALUE *argv, VALUE self)
|
|
553
541
|
struct buffer *buffer;
|
554
542
|
Data_Get_Struct(item, struct buffer, buffer);
|
555
543
|
err = clSetKernelArg(kernel, i - 1, sizeof(cl_mem), &buffer->data);
|
556
|
-
if (buffer->num_items > global) {
|
557
|
-
global = buffer->num_items;
|
544
|
+
if (buffer->num_items > global[0]) {
|
545
|
+
global[0] = buffer->num_items;
|
558
546
|
}
|
559
547
|
}
|
560
548
|
else if (CLASS_OF(item) == rb_cBuffer) {
|
@@ -565,8 +553,8 @@ program_method_missing(int argc, VALUE *argv, VALUE self)
|
|
565
553
|
clEnqueueWriteBuffer(commands, buffer->data, CL_TRUE, 0,
|
566
554
|
buffer->num_items * buffer->member_size, buffer->cachebuf, 0, NULL, NULL);
|
567
555
|
err = clSetKernelArg(kernel, i - 1, sizeof(cl_mem), &buffer->data);
|
568
|
-
if (buffer->num_items > global) {
|
569
|
-
global = buffer->num_items;
|
556
|
+
if (buffer->num_items > global[0]) {
|
557
|
+
global[0] = buffer->num_items;
|
570
558
|
}
|
571
559
|
}
|
572
560
|
else {
|
@@ -599,17 +587,16 @@ program_method_missing(int argc, VALUE *argv, VALUE self)
|
|
599
587
|
}
|
600
588
|
|
601
589
|
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &local, NULL);
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
590
|
+
err = clEnqueueNDRangeKernel(commands, kernel, 3, NULL, global, NULL, 0, NULL, NULL);
|
591
|
+
if (err != CL_SUCCESS) {
|
592
|
+
CLEAN();
|
593
|
+
if (err == CL_INVALID_KERNEL_ARGS) {
|
594
|
+
rb_raise(rb_eArgError, "invalid arguments");
|
595
|
+
}
|
596
|
+
else {
|
597
|
+
rb_raise(rb_eOpenCLError, "failed to execute kernel method %d", err);
|
598
|
+
}
|
609
599
|
}
|
610
|
-
|
611
|
-
clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
|
612
|
-
if (err) { CLEAN(); rb_raise(rb_eOpenCLError, "failed to execute kernel method"); }
|
613
600
|
|
614
601
|
clFinish(commands);
|
615
602
|
|
@@ -620,7 +607,10 @@ program_method_missing(int argc, VALUE *argv, VALUE self)
|
|
620
607
|
Data_Get_Struct(item, struct buffer, buffer);
|
621
608
|
err = clEnqueueReadBuffer(commands, buffer->data, CL_TRUE, 0,
|
622
609
|
buffer->num_items * buffer->member_size, buffer->cachebuf, 0, NULL, NULL);
|
623
|
-
|
610
|
+
if (err != CL_SUCCESS) {
|
611
|
+
CLEAN();
|
612
|
+
rb_raise(rb_eOpenCLError, "failed to read output buffer");
|
613
|
+
}
|
624
614
|
buffer_read(item);
|
625
615
|
}
|
626
616
|
}
|
@@ -645,6 +635,10 @@ init_opencl()
|
|
645
635
|
rb_raise(rb_eOpenCLError, "failed to create a program context");
|
646
636
|
}
|
647
637
|
}
|
638
|
+
|
639
|
+
clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE,
|
640
|
+
sizeof(size_t), &max_work_group_size, NULL);
|
641
|
+
max_work_group_size = 4096;
|
648
642
|
}
|
649
643
|
|
650
644
|
void
|
data/test/test_barracuda.rb
CHANGED
@@ -135,7 +135,7 @@ class TestProgram < Test::Unit::TestCase
|
|
135
135
|
|
136
136
|
def test_kernel_run
|
137
137
|
p = Program.new("__kernel x_y_z(int x) { }")
|
138
|
-
|
138
|
+
assert_raise(ArgumentError) { p.x_y_z }
|
139
139
|
end
|
140
140
|
|
141
141
|
def test_kernel_missing
|
@@ -217,7 +217,7 @@ class TestProgram < Test::Unit::TestCase
|
|
217
217
|
p = Program.new <<-'eof'
|
218
218
|
__kernel sum(__global int* out, __global int* in, int total) {
|
219
219
|
int id = get_global_id(0);
|
220
|
-
if (id < total) atom_add(
|
220
|
+
if (id < total) atom_add(out, in[id]);
|
221
221
|
}
|
222
222
|
eof
|
223
223
|
|
@@ -233,7 +233,7 @@ class TestProgram < Test::Unit::TestCase
|
|
233
233
|
p = Program.new <<-'eof'
|
234
234
|
__kernel sum(__global int* out, __global int* in, int total) {
|
235
235
|
int id = get_global_id(0);
|
236
|
-
if (id < total) atom_add(
|
236
|
+
if (id < total) atom_add(out, in[id]);
|
237
237
|
}
|
238
238
|
eof
|
239
239
|
|
@@ -271,4 +271,21 @@ class TestProgram < Test::Unit::TestCase
|
|
271
271
|
p.copy_to_out(out, [2.5, 2.5, 2.5, 2.5])
|
272
272
|
assert_equal [3, 3, 3, 3], out.data
|
273
273
|
end
|
274
|
+
|
275
|
+
def test_program_no_total
|
276
|
+
p = Program.new <<-'eof'
|
277
|
+
__kernel copy(__global int *out, __global int *in) {
|
278
|
+
int i = get_global_id(0);
|
279
|
+
out[i] = in[i] + 1;
|
280
|
+
}
|
281
|
+
eof
|
282
|
+
|
283
|
+
out = OutputBuffer.new(:int, 3)
|
284
|
+
p.copy(out, (1..3).to_a)
|
285
|
+
assert_equal (2..4).to_a, out.data
|
286
|
+
|
287
|
+
out = OutputBuffer.new(:int, 50446)
|
288
|
+
p.copy(out, (1..50446).to_a)
|
289
|
+
assert_equal (2..50447).to_a, out.data
|
290
|
+
end
|
274
291
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: barracuda
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: "1.1"
|
4
|
+
version: "1.2"
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Loren Segal
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-09-
|
12
|
+
date: 2009-09-03 00:00:00 -04:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|