barracuda 1.1 → 1.2

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -58,16 +58,13 @@ EXAMPLE
58
58
  Consider the following example to sum a bunch of integers:
59
59
 
60
60
  program = Program.new <<-'eof'
61
- __kernel sum(__global int *out, __global int *in, int total) {
62
- int id = get_global_id(0);
63
- if (id < total) atom_add(&out[0], in[id]);
61
+ __kernel sum(__global int *in, __global int *out) {
62
+ atom_add(out, in[get_global_id(0)]);
64
63
  }
65
64
  eof
66
65
 
67
- arr = (1..65536).to_a
68
- input = Buffer.new(arr)
69
66
  output = OutputBuffer.new(:int, 1)
70
- program.sum(output, input, arr.size)
67
+ program.sum((1..65536).to_a, output)
71
68
 
72
69
  puts "The sum is: " + output.data[0].to_s
73
70
 
@@ -86,12 +83,6 @@ manually specify the work group size, call the kernel with an options hash:
86
83
 
87
84
  program.my_kernel_method(..., :times => 512)
88
85
 
89
- Note that the work group size must be a power of 2. Barracuda will increase
90
- the work group size to the next power of 2 if it needs to. This means your
91
- OpenCL program might run more iterations of your kernel method than you
92
- request. Because we can't rely on the work group size, we pass in the total
93
- data size to ensure we do not exceed the bounds of our data.
94
-
95
86
  CONVERTING TYPES
96
87
  ----------------
97
88
 
@@ -31,12 +31,11 @@ prog = Program.new <<-'eof'
31
31
  }
32
32
  eof
33
33
 
34
- num_vecs = 100000
34
+ num_vecs = 1000000
35
35
  arr = []
36
36
  num_vecs.times { arr.push(rand, rand, rand, 0.0) }
37
37
  output = OutputBuffer.new(:float, arr.size)
38
38
 
39
-
40
39
  Benchmark.bmbm do |x|
41
40
  x.report("cpu") { norm_all(arr) }
42
41
  x.report("gpu") { prog.norm(output, arr, num_vecs) }
@@ -9,7 +9,6 @@ prog = Program.new <<-'eof'
9
9
  __kernel sort(__global int *out, __global int *in, int total) {
10
10
  int i, final_index = 0, extra = 0;
11
11
  int id = get_global_id(0);
12
- if (id >= total) return;
13
12
  int my_value = in[id];
14
13
  for (i = 0; i < total; i++) {
15
14
  if (in[i] < my_value) final_index++;
@@ -19,7 +18,7 @@ prog = Program.new <<-'eof'
19
18
  }
20
19
  eof
21
20
 
22
- max = 1000
21
+ max = 10000
23
22
  arr = (1..max).map { (rand * max).to_i }
24
23
  output = OutputBuffer.new(:int, arr.size)
25
24
 
@@ -6,9 +6,9 @@ require 'benchmark'
6
6
  include Barracuda
7
7
 
8
8
  prog = Program.new <<-'eof'
9
- __kernel sum(__global float *out, __global int *in, int total) {
9
+ __kernel sum(__global float *out, __global int *in) {
10
10
  int i = get_global_id(0);
11
- if (i < total) out[i] = ((float)in[i] + 0.5) / 3.8 + 2.0;
11
+ out[i] = ((float)in[i] + 0.5) / 3.8 + 2.0;
12
12
  }
13
13
  eof
14
14
 
@@ -16,9 +16,8 @@ arr = (1..3333333).to_a
16
16
  input = Buffer.new(arr)
17
17
  output = OutputBuffer.new(:float, arr.size)
18
18
 
19
- TIMES = 1
20
19
  Benchmark.bmbm do |x|
21
- x.report("regular") { TIMES.times { arr.map {|x| (x.to_f + 0.5) / 3.8 + 2.0 } } }
22
- x.report("opencl") { TIMES.times { prog.sum(output, input, arr.size); output.clear } }
20
+ x.report("regular") { arr.map {|x| (x.to_f + 0.5) / 3.8 + 2.0 } }
21
+ x.report("opencl") { prog.sum(output, input) }
23
22
  end
24
23
 
@@ -1,4 +1,5 @@
1
1
  #include <ruby.h>
2
+ #include <math.h>
2
3
  #include <OpenCL/OpenCL.h>
3
4
 
4
5
  static VALUE rb_mBarracuda;
@@ -38,18 +39,15 @@ static VALUE buffer_data_set(VALUE self, VALUE new_value);
38
39
 
39
40
  static cl_device_id device_id = NULL;
40
41
  static cl_context context = NULL;
42
+ static size_t max_work_group_size = 65535;
41
43
  static int err;
42
44
 
43
- #define VERSION_STRING "1.1"
45
+ #define VERSION_STRING "1.2"
44
46
 
45
47
  struct program {
46
48
  cl_program program;
47
49
  };
48
50
 
49
- struct kernel {
50
- cl_kernel kernel;
51
- };
52
-
53
51
  struct buffer {
54
52
  VALUE arr;
55
53
  ID type;
@@ -376,8 +374,6 @@ buffer_data_set(VALUE self, VALUE new_value)
376
374
  static VALUE
377
375
  buffer_initialize(int argc, VALUE *argv, VALUE self)
378
376
  {
379
- GET_BUFFER();
380
-
381
377
  if (argc == 0) {
382
378
  rb_raise(rb_eArgError, "no buffer data given");
383
379
  }
@@ -495,21 +491,13 @@ program_compile(VALUE self, VALUE source)
495
491
  return Qtrue;
496
492
  }
497
493
 
498
- #define CLEAN() program_clean(kernel, commands);
499
- #define ERROR(msg) if (err != CL_SUCCESS) { CLEAN(); rb_raise(rb_eOpenCLError, msg); }
500
-
501
- static void
502
- program_clean(cl_kernel kernel, cl_command_queue commands)
503
- {
504
- clReleaseKernel(kernel);
505
- clReleaseCommandQueue(commands);
506
- }
494
+ #define CLEAN() { clReleaseKernel(kernel); clReleaseCommandQueue(commands); }
507
495
 
508
496
  static VALUE
509
497
  program_method_missing(int argc, VALUE *argv, VALUE self)
510
498
  {
511
499
  int i;
512
- size_t local = 0, global = 0;
500
+ size_t global[3] = {1, 1, 1}, local;
513
501
  cl_kernel kernel;
514
502
  cl_command_queue commands;
515
503
  GET_PROGRAM();
@@ -533,7 +521,7 @@ program_method_missing(int argc, VALUE *argv, VALUE self)
533
521
  if (i == argc - 1 && TYPE(item) == T_HASH) {
534
522
  VALUE worker_size = rb_hash_aref(item, ID2SYM(id_times));
535
523
  if (RTEST(worker_size) && TYPE(worker_size) == T_FIXNUM) {
536
- global = FIX2UINT(worker_size);
524
+ global[0] = FIX2UINT(worker_size);
537
525
  }
538
526
  else {
539
527
  CLEAN();
@@ -553,8 +541,8 @@ program_method_missing(int argc, VALUE *argv, VALUE self)
553
541
  struct buffer *buffer;
554
542
  Data_Get_Struct(item, struct buffer, buffer);
555
543
  err = clSetKernelArg(kernel, i - 1, sizeof(cl_mem), &buffer->data);
556
- if (buffer->num_items > global) {
557
- global = buffer->num_items;
544
+ if (buffer->num_items > global[0]) {
545
+ global[0] = buffer->num_items;
558
546
  }
559
547
  }
560
548
  else if (CLASS_OF(item) == rb_cBuffer) {
@@ -565,8 +553,8 @@ program_method_missing(int argc, VALUE *argv, VALUE self)
565
553
  clEnqueueWriteBuffer(commands, buffer->data, CL_TRUE, 0,
566
554
  buffer->num_items * buffer->member_size, buffer->cachebuf, 0, NULL, NULL);
567
555
  err = clSetKernelArg(kernel, i - 1, sizeof(cl_mem), &buffer->data);
568
- if (buffer->num_items > global) {
569
- global = buffer->num_items;
556
+ if (buffer->num_items > global[0]) {
557
+ global[0] = buffer->num_items;
570
558
  }
571
559
  }
572
560
  else {
@@ -599,17 +587,16 @@ program_method_missing(int argc, VALUE *argv, VALUE self)
599
587
  }
600
588
 
601
589
  err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &local, NULL);
602
- ERROR("failed to retrieve kernel work group info");
603
-
604
- { /* global work size must be power of 2, greater than 3 and not smaller than local */
605
- size_t size = 4;
606
- while (size < global) size *= 2;
607
- global = size;
608
- if (global < local) global = local;
590
+ err = clEnqueueNDRangeKernel(commands, kernel, 3, NULL, global, NULL, 0, NULL, NULL);
591
+ if (err != CL_SUCCESS) {
592
+ CLEAN();
593
+ if (err == CL_INVALID_KERNEL_ARGS) {
594
+ rb_raise(rb_eArgError, "invalid arguments");
595
+ }
596
+ else {
597
+ rb_raise(rb_eOpenCLError, "failed to execute kernel method %d", err);
598
+ }
609
599
  }
610
-
611
- clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
612
- if (err) { CLEAN(); rb_raise(rb_eOpenCLError, "failed to execute kernel method"); }
613
600
 
614
601
  clFinish(commands);
615
602
 
@@ -620,7 +607,10 @@ program_method_missing(int argc, VALUE *argv, VALUE self)
620
607
  Data_Get_Struct(item, struct buffer, buffer);
621
608
  err = clEnqueueReadBuffer(commands, buffer->data, CL_TRUE, 0,
622
609
  buffer->num_items * buffer->member_size, buffer->cachebuf, 0, NULL, NULL);
623
- ERROR("failed to read output buffer");
610
+ if (err != CL_SUCCESS) {
611
+ CLEAN();
612
+ rb_raise(rb_eOpenCLError, "failed to read output buffer");
613
+ }
624
614
  buffer_read(item);
625
615
  }
626
616
  }
@@ -645,6 +635,10 @@ init_opencl()
645
635
  rb_raise(rb_eOpenCLError, "failed to create a program context");
646
636
  }
647
637
  }
638
+
639
+ clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE,
640
+ sizeof(size_t), &max_work_group_size, NULL);
641
+ max_work_group_size = 4096;
648
642
  }
649
643
 
650
644
  void
@@ -135,7 +135,7 @@ class TestProgram < Test::Unit::TestCase
135
135
 
136
136
  def test_kernel_run
137
137
  p = Program.new("__kernel x_y_z(int x) { }")
138
- assert_nothing_raised { p.x_y_z }
138
+ assert_raise(ArgumentError) { p.x_y_z }
139
139
  end
140
140
 
141
141
  def test_kernel_missing
@@ -217,7 +217,7 @@ class TestProgram < Test::Unit::TestCase
217
217
  p = Program.new <<-'eof'
218
218
  __kernel sum(__global int* out, __global int* in, int total) {
219
219
  int id = get_global_id(0);
220
- if (id < total) atom_add(&out[0], in[id]);
220
+ if (id < total) atom_add(out, in[id]);
221
221
  }
222
222
  eof
223
223
 
@@ -233,7 +233,7 @@ class TestProgram < Test::Unit::TestCase
233
233
  p = Program.new <<-'eof'
234
234
  __kernel sum(__global int* out, __global int* in, int total) {
235
235
  int id = get_global_id(0);
236
- if (id < total) atom_add(&out[0], in[id]);
236
+ if (id < total) atom_add(out, in[id]);
237
237
  }
238
238
  eof
239
239
 
@@ -271,4 +271,21 @@ class TestProgram < Test::Unit::TestCase
271
271
  p.copy_to_out(out, [2.5, 2.5, 2.5, 2.5])
272
272
  assert_equal [3, 3, 3, 3], out.data
273
273
  end
274
+
275
+ def test_program_no_total
276
+ p = Program.new <<-'eof'
277
+ __kernel copy(__global int *out, __global int *in) {
278
+ int i = get_global_id(0);
279
+ out[i] = in[i] + 1;
280
+ }
281
+ eof
282
+
283
+ out = OutputBuffer.new(:int, 3)
284
+ p.copy(out, (1..3).to_a)
285
+ assert_equal (2..4).to_a, out.data
286
+
287
+ out = OutputBuffer.new(:int, 50446)
288
+ p.copy(out, (1..50446).to_a)
289
+ assert_equal (2..50447).to_a, out.data
290
+ end
274
291
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: barracuda
3
3
  version: !ruby/object:Gem::Version
4
- version: "1.1"
4
+ version: "1.2"
5
5
  platform: ruby
6
6
  authors:
7
7
  - Loren Segal
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-09-02 00:00:00 -04:00
12
+ date: 2009-09-03 00:00:00 -04:00
13
13
  default_executable:
14
14
  dependencies: []
15
15