barracuda 1.1 → 1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +3 -12
- data/benchmarks/normalize.rb +1 -2
- data/benchmarks/sort.rb +1 -2
- data/benchmarks/to_float.rb +4 -5
- data/ext/barracuda.c +27 -33
- data/test/test_barracuda.rb +20 -3
- metadata +2 -2
data/README.md
CHANGED
@@ -58,16 +58,13 @@ EXAMPLE
|
|
58
58
|
Consider the following example to sum a bunch of integers:
|
59
59
|
|
60
60
|
program = Program.new <<-'eof'
|
61
|
-
__kernel sum(__global int *
|
62
|
-
|
63
|
-
if (id < total) atom_add(&out[0], in[id]);
|
61
|
+
__kernel sum(__global int *in, __global int *out) {
|
62
|
+
atom_add(out, in[get_global_id(0)]);
|
64
63
|
}
|
65
64
|
eof
|
66
65
|
|
67
|
-
arr = (1..65536).to_a
|
68
|
-
input = Buffer.new(arr)
|
69
66
|
output = OutputBuffer.new(:int, 1)
|
70
|
-
program.sum(
|
67
|
+
program.sum((1..65536).to_a, output)
|
71
68
|
|
72
69
|
puts "The sum is: " + output.data[0].to_s
|
73
70
|
|
@@ -86,12 +83,6 @@ manually specify the work group size, call the kernel with an options hash:
|
|
86
83
|
|
87
84
|
program.my_kernel_method(..., :times => 512)
|
88
85
|
|
89
|
-
Note that the work group size must be a power of 2. Barracuda will increase
|
90
|
-
the work group size to the next power of 2 if it needs to. This means your
|
91
|
-
OpenCL program might run more iterations of your kernel method than you
|
92
|
-
request. Because we can't rely on the work group size, we pass in the total
|
93
|
-
data size to ensure we do not exceed the bounds of our data.
|
94
|
-
|
95
86
|
CONVERTING TYPES
|
96
87
|
----------------
|
97
88
|
|
data/benchmarks/normalize.rb
CHANGED
@@ -31,12 +31,11 @@ prog = Program.new <<-'eof'
|
|
31
31
|
}
|
32
32
|
eof
|
33
33
|
|
34
|
-
num_vecs =
|
34
|
+
num_vecs = 1000000
|
35
35
|
arr = []
|
36
36
|
num_vecs.times { arr.push(rand, rand, rand, 0.0) }
|
37
37
|
output = OutputBuffer.new(:float, arr.size)
|
38
38
|
|
39
|
-
|
40
39
|
Benchmark.bmbm do |x|
|
41
40
|
x.report("cpu") { norm_all(arr) }
|
42
41
|
x.report("gpu") { prog.norm(output, arr, num_vecs) }
|
data/benchmarks/sort.rb
CHANGED
@@ -9,7 +9,6 @@ prog = Program.new <<-'eof'
|
|
9
9
|
__kernel sort(__global int *out, __global int *in, int total) {
|
10
10
|
int i, final_index = 0, extra = 0;
|
11
11
|
int id = get_global_id(0);
|
12
|
-
if (id >= total) return;
|
13
12
|
int my_value = in[id];
|
14
13
|
for (i = 0; i < total; i++) {
|
15
14
|
if (in[i] < my_value) final_index++;
|
@@ -19,7 +18,7 @@ prog = Program.new <<-'eof'
|
|
19
18
|
}
|
20
19
|
eof
|
21
20
|
|
22
|
-
max =
|
21
|
+
max = 10000
|
23
22
|
arr = (1..max).map { (rand * max).to_i }
|
24
23
|
output = OutputBuffer.new(:int, arr.size)
|
25
24
|
|
data/benchmarks/to_float.rb
CHANGED
@@ -6,9 +6,9 @@ require 'benchmark'
|
|
6
6
|
include Barracuda
|
7
7
|
|
8
8
|
prog = Program.new <<-'eof'
|
9
|
-
__kernel sum(__global float *out, __global int *in
|
9
|
+
__kernel sum(__global float *out, __global int *in) {
|
10
10
|
int i = get_global_id(0);
|
11
|
-
|
11
|
+
out[i] = ((float)in[i] + 0.5) / 3.8 + 2.0;
|
12
12
|
}
|
13
13
|
eof
|
14
14
|
|
@@ -16,9 +16,8 @@ arr = (1..3333333).to_a
|
|
16
16
|
input = Buffer.new(arr)
|
17
17
|
output = OutputBuffer.new(:float, arr.size)
|
18
18
|
|
19
|
-
TIMES = 1
|
20
19
|
Benchmark.bmbm do |x|
|
21
|
-
x.report("regular") {
|
22
|
-
x.report("opencl") {
|
20
|
+
x.report("regular") { arr.map {|x| (x.to_f + 0.5) / 3.8 + 2.0 } }
|
21
|
+
x.report("opencl") { prog.sum(output, input) }
|
23
22
|
end
|
24
23
|
|
data/ext/barracuda.c
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#include <ruby.h>
|
2
|
+
#include <math.h>
|
2
3
|
#include <OpenCL/OpenCL.h>
|
3
4
|
|
4
5
|
static VALUE rb_mBarracuda;
|
@@ -38,18 +39,15 @@ static VALUE buffer_data_set(VALUE self, VALUE new_value);
|
|
38
39
|
|
39
40
|
static cl_device_id device_id = NULL;
|
40
41
|
static cl_context context = NULL;
|
42
|
+
static size_t max_work_group_size = 65535;
|
41
43
|
static int err;
|
42
44
|
|
43
|
-
#define VERSION_STRING "1.1"
|
45
|
+
#define VERSION_STRING "1.2"
|
44
46
|
|
45
47
|
struct program {
|
46
48
|
cl_program program;
|
47
49
|
};
|
48
50
|
|
49
|
-
struct kernel {
|
50
|
-
cl_kernel kernel;
|
51
|
-
};
|
52
|
-
|
53
51
|
struct buffer {
|
54
52
|
VALUE arr;
|
55
53
|
ID type;
|
@@ -376,8 +374,6 @@ buffer_data_set(VALUE self, VALUE new_value)
|
|
376
374
|
static VALUE
|
377
375
|
buffer_initialize(int argc, VALUE *argv, VALUE self)
|
378
376
|
{
|
379
|
-
GET_BUFFER();
|
380
|
-
|
381
377
|
if (argc == 0) {
|
382
378
|
rb_raise(rb_eArgError, "no buffer data given");
|
383
379
|
}
|
@@ -495,21 +491,13 @@ program_compile(VALUE self, VALUE source)
|
|
495
491
|
return Qtrue;
|
496
492
|
}
|
497
493
|
|
498
|
-
#define CLEAN()
|
499
|
-
#define ERROR(msg) if (err != CL_SUCCESS) { CLEAN(); rb_raise(rb_eOpenCLError, msg); }
|
500
|
-
|
501
|
-
static void
|
502
|
-
program_clean(cl_kernel kernel, cl_command_queue commands)
|
503
|
-
{
|
504
|
-
clReleaseKernel(kernel);
|
505
|
-
clReleaseCommandQueue(commands);
|
506
|
-
}
|
494
|
+
#define CLEAN() { clReleaseKernel(kernel); clReleaseCommandQueue(commands); }
|
507
495
|
|
508
496
|
static VALUE
|
509
497
|
program_method_missing(int argc, VALUE *argv, VALUE self)
|
510
498
|
{
|
511
499
|
int i;
|
512
|
-
size_t
|
500
|
+
size_t global[3] = {1, 1, 1}, local;
|
513
501
|
cl_kernel kernel;
|
514
502
|
cl_command_queue commands;
|
515
503
|
GET_PROGRAM();
|
@@ -533,7 +521,7 @@ program_method_missing(int argc, VALUE *argv, VALUE self)
|
|
533
521
|
if (i == argc - 1 && TYPE(item) == T_HASH) {
|
534
522
|
VALUE worker_size = rb_hash_aref(item, ID2SYM(id_times));
|
535
523
|
if (RTEST(worker_size) && TYPE(worker_size) == T_FIXNUM) {
|
536
|
-
global = FIX2UINT(worker_size);
|
524
|
+
global[0] = FIX2UINT(worker_size);
|
537
525
|
}
|
538
526
|
else {
|
539
527
|
CLEAN();
|
@@ -553,8 +541,8 @@ program_method_missing(int argc, VALUE *argv, VALUE self)
|
|
553
541
|
struct buffer *buffer;
|
554
542
|
Data_Get_Struct(item, struct buffer, buffer);
|
555
543
|
err = clSetKernelArg(kernel, i - 1, sizeof(cl_mem), &buffer->data);
|
556
|
-
if (buffer->num_items > global) {
|
557
|
-
global = buffer->num_items;
|
544
|
+
if (buffer->num_items > global[0]) {
|
545
|
+
global[0] = buffer->num_items;
|
558
546
|
}
|
559
547
|
}
|
560
548
|
else if (CLASS_OF(item) == rb_cBuffer) {
|
@@ -565,8 +553,8 @@ program_method_missing(int argc, VALUE *argv, VALUE self)
|
|
565
553
|
clEnqueueWriteBuffer(commands, buffer->data, CL_TRUE, 0,
|
566
554
|
buffer->num_items * buffer->member_size, buffer->cachebuf, 0, NULL, NULL);
|
567
555
|
err = clSetKernelArg(kernel, i - 1, sizeof(cl_mem), &buffer->data);
|
568
|
-
if (buffer->num_items > global) {
|
569
|
-
global = buffer->num_items;
|
556
|
+
if (buffer->num_items > global[0]) {
|
557
|
+
global[0] = buffer->num_items;
|
570
558
|
}
|
571
559
|
}
|
572
560
|
else {
|
@@ -599,17 +587,16 @@ program_method_missing(int argc, VALUE *argv, VALUE self)
|
|
599
587
|
}
|
600
588
|
|
601
589
|
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &local, NULL);
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
590
|
+
err = clEnqueueNDRangeKernel(commands, kernel, 3, NULL, global, NULL, 0, NULL, NULL);
|
591
|
+
if (err != CL_SUCCESS) {
|
592
|
+
CLEAN();
|
593
|
+
if (err == CL_INVALID_KERNEL_ARGS) {
|
594
|
+
rb_raise(rb_eArgError, "invalid arguments");
|
595
|
+
}
|
596
|
+
else {
|
597
|
+
rb_raise(rb_eOpenCLError, "failed to execute kernel method %d", err);
|
598
|
+
}
|
609
599
|
}
|
610
|
-
|
611
|
-
clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
|
612
|
-
if (err) { CLEAN(); rb_raise(rb_eOpenCLError, "failed to execute kernel method"); }
|
613
600
|
|
614
601
|
clFinish(commands);
|
615
602
|
|
@@ -620,7 +607,10 @@ program_method_missing(int argc, VALUE *argv, VALUE self)
|
|
620
607
|
Data_Get_Struct(item, struct buffer, buffer);
|
621
608
|
err = clEnqueueReadBuffer(commands, buffer->data, CL_TRUE, 0,
|
622
609
|
buffer->num_items * buffer->member_size, buffer->cachebuf, 0, NULL, NULL);
|
623
|
-
|
610
|
+
if (err != CL_SUCCESS) {
|
611
|
+
CLEAN();
|
612
|
+
rb_raise(rb_eOpenCLError, "failed to read output buffer");
|
613
|
+
}
|
624
614
|
buffer_read(item);
|
625
615
|
}
|
626
616
|
}
|
@@ -645,6 +635,10 @@ init_opencl()
|
|
645
635
|
rb_raise(rb_eOpenCLError, "failed to create a program context");
|
646
636
|
}
|
647
637
|
}
|
638
|
+
|
639
|
+
clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE,
|
640
|
+
sizeof(size_t), &max_work_group_size, NULL);
|
641
|
+
max_work_group_size = 4096;
|
648
642
|
}
|
649
643
|
|
650
644
|
void
|
data/test/test_barracuda.rb
CHANGED
@@ -135,7 +135,7 @@ class TestProgram < Test::Unit::TestCase
|
|
135
135
|
|
136
136
|
def test_kernel_run
|
137
137
|
p = Program.new("__kernel x_y_z(int x) { }")
|
138
|
-
|
138
|
+
assert_raise(ArgumentError) { p.x_y_z }
|
139
139
|
end
|
140
140
|
|
141
141
|
def test_kernel_missing
|
@@ -217,7 +217,7 @@ class TestProgram < Test::Unit::TestCase
|
|
217
217
|
p = Program.new <<-'eof'
|
218
218
|
__kernel sum(__global int* out, __global int* in, int total) {
|
219
219
|
int id = get_global_id(0);
|
220
|
-
if (id < total) atom_add(
|
220
|
+
if (id < total) atom_add(out, in[id]);
|
221
221
|
}
|
222
222
|
eof
|
223
223
|
|
@@ -233,7 +233,7 @@ class TestProgram < Test::Unit::TestCase
|
|
233
233
|
p = Program.new <<-'eof'
|
234
234
|
__kernel sum(__global int* out, __global int* in, int total) {
|
235
235
|
int id = get_global_id(0);
|
236
|
-
if (id < total) atom_add(
|
236
|
+
if (id < total) atom_add(out, in[id]);
|
237
237
|
}
|
238
238
|
eof
|
239
239
|
|
@@ -271,4 +271,21 @@ class TestProgram < Test::Unit::TestCase
|
|
271
271
|
p.copy_to_out(out, [2.5, 2.5, 2.5, 2.5])
|
272
272
|
assert_equal [3, 3, 3, 3], out.data
|
273
273
|
end
|
274
|
+
|
275
|
+
def test_program_no_total
|
276
|
+
p = Program.new <<-'eof'
|
277
|
+
__kernel copy(__global int *out, __global int *in) {
|
278
|
+
int i = get_global_id(0);
|
279
|
+
out[i] = in[i] + 1;
|
280
|
+
}
|
281
|
+
eof
|
282
|
+
|
283
|
+
out = OutputBuffer.new(:int, 3)
|
284
|
+
p.copy(out, (1..3).to_a)
|
285
|
+
assert_equal (2..4).to_a, out.data
|
286
|
+
|
287
|
+
out = OutputBuffer.new(:int, 50446)
|
288
|
+
p.copy(out, (1..50446).to_a)
|
289
|
+
assert_equal (2..50447).to_a, out.data
|
290
|
+
end
|
274
291
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: barracuda
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: "1.1"
|
4
|
+
version: "1.2"
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Loren Segal
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-09-
|
12
|
+
date: 2009-09-03 00:00:00 -04:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|