CooCoo 0.1.0

Files changed (105)
  1. checksums.yaml +7 -0
  2. data/.gitignore +16 -0
  3. data/CooCoo.gemspec +47 -0
  4. data/Gemfile +4 -0
  5. data/Gemfile.lock +88 -0
  6. data/README.md +123 -0
  7. data/Rakefile +81 -0
  8. data/bin/cuda-dev-info +25 -0
  9. data/bin/cuda-free +28 -0
  10. data/bin/cuda-free-trend +7 -0
  11. data/bin/ffi-gen +267 -0
  12. data/bin/spec_runner_html.sh +42 -0
  13. data/bin/trainer +198 -0
  14. data/bin/trend-cost +13 -0
  15. data/examples/char-rnn.rb +405 -0
  16. data/examples/cifar/cifar.rb +94 -0
  17. data/examples/img-similarity.rb +201 -0
  18. data/examples/math_ops.rb +57 -0
  19. data/examples/mnist.rb +365 -0
  20. data/examples/mnist_classifier.rb +293 -0
  21. data/examples/mnist_dream.rb +214 -0
  22. data/examples/seeds.rb +268 -0
  23. data/examples/seeds_dataset.txt +210 -0
  24. data/examples/t10k-images-idx3-ubyte +0 -0
  25. data/examples/t10k-labels-idx1-ubyte +0 -0
  26. data/examples/train-images-idx3-ubyte +0 -0
  27. data/examples/train-labels-idx1-ubyte +0 -0
  28. data/ext/buffer/Rakefile +50 -0
  29. data/ext/buffer/buffer.pre.cu +727 -0
  30. data/ext/buffer/matrix.pre.cu +49 -0
  31. data/lib/CooCoo.rb +1 -0
  32. data/lib/coo-coo.rb +18 -0
  33. data/lib/coo-coo/activation_functions.rb +344 -0
  34. data/lib/coo-coo/consts.rb +5 -0
  35. data/lib/coo-coo/convolution.rb +298 -0
  36. data/lib/coo-coo/core_ext.rb +75 -0
  37. data/lib/coo-coo/cost_functions.rb +91 -0
  38. data/lib/coo-coo/cuda.rb +116 -0
  39. data/lib/coo-coo/cuda/device_buffer.rb +240 -0
  40. data/lib/coo-coo/cuda/device_buffer/ffi.rb +109 -0
  41. data/lib/coo-coo/cuda/error.rb +51 -0
  42. data/lib/coo-coo/cuda/host_buffer.rb +117 -0
  43. data/lib/coo-coo/cuda/runtime.rb +157 -0
  44. data/lib/coo-coo/cuda/vector.rb +315 -0
  45. data/lib/coo-coo/data_sources.rb +2 -0
  46. data/lib/coo-coo/data_sources/xournal.rb +25 -0
  47. data/lib/coo-coo/data_sources/xournal/bitmap_stream.rb +197 -0
  48. data/lib/coo-coo/data_sources/xournal/document.rb +377 -0
  49. data/lib/coo-coo/data_sources/xournal/loader.rb +144 -0
  50. data/lib/coo-coo/data_sources/xournal/renderer.rb +101 -0
  51. data/lib/coo-coo/data_sources/xournal/saver.rb +99 -0
  52. data/lib/coo-coo/data_sources/xournal/training_document.rb +78 -0
  53. data/lib/coo-coo/data_sources/xournal/training_document/constants.rb +15 -0
  54. data/lib/coo-coo/data_sources/xournal/training_document/document_maker.rb +89 -0
  55. data/lib/coo-coo/data_sources/xournal/training_document/document_reader.rb +105 -0
  56. data/lib/coo-coo/data_sources/xournal/training_document/example.rb +37 -0
  57. data/lib/coo-coo/data_sources/xournal/training_document/sets.rb +76 -0
  58. data/lib/coo-coo/debug.rb +8 -0
  59. data/lib/coo-coo/dot.rb +129 -0
  60. data/lib/coo-coo/drawing.rb +4 -0
  61. data/lib/coo-coo/drawing/cairo_canvas.rb +100 -0
  62. data/lib/coo-coo/drawing/canvas.rb +68 -0
  63. data/lib/coo-coo/drawing/chunky_canvas.rb +101 -0
  64. data/lib/coo-coo/drawing/sixel.rb +214 -0
  65. data/lib/coo-coo/enum.rb +17 -0
  66. data/lib/coo-coo/from_name.rb +58 -0
  67. data/lib/coo-coo/fully_connected_layer.rb +205 -0
  68. data/lib/coo-coo/generation_script.rb +38 -0
  69. data/lib/coo-coo/grapher.rb +140 -0
  70. data/lib/coo-coo/image.rb +286 -0
  71. data/lib/coo-coo/layer.rb +67 -0
  72. data/lib/coo-coo/layer_factory.rb +26 -0
  73. data/lib/coo-coo/linear_layer.rb +59 -0
  74. data/lib/coo-coo/math.rb +607 -0
  75. data/lib/coo-coo/math/abstract_vector.rb +121 -0
  76. data/lib/coo-coo/math/functions.rb +39 -0
  77. data/lib/coo-coo/math/interpolation.rb +7 -0
  78. data/lib/coo-coo/network.rb +264 -0
  79. data/lib/coo-coo/neuron.rb +112 -0
  80. data/lib/coo-coo/neuron_layer.rb +168 -0
  81. data/lib/coo-coo/option_parser.rb +18 -0
  82. data/lib/coo-coo/platform.rb +17 -0
  83. data/lib/coo-coo/progress_bar.rb +11 -0
  84. data/lib/coo-coo/recurrence/backend.rb +99 -0
  85. data/lib/coo-coo/recurrence/frontend.rb +101 -0
  86. data/lib/coo-coo/sequence.rb +187 -0
  87. data/lib/coo-coo/shell.rb +2 -0
  88. data/lib/coo-coo/temporal_network.rb +291 -0
  89. data/lib/coo-coo/trainer.rb +21 -0
  90. data/lib/coo-coo/trainer/base.rb +67 -0
  91. data/lib/coo-coo/trainer/batch.rb +82 -0
  92. data/lib/coo-coo/trainer/batch_stats.rb +27 -0
  93. data/lib/coo-coo/trainer/momentum_stochastic.rb +59 -0
  94. data/lib/coo-coo/trainer/stochastic.rb +47 -0
  95. data/lib/coo-coo/transformer.rb +272 -0
  96. data/lib/coo-coo/vector_layer.rb +194 -0
  97. data/lib/coo-coo/version.rb +3 -0
  98. data/lib/coo-coo/weight_deltas.rb +23 -0
  99. data/prototypes/convolution.rb +116 -0
  100. data/prototypes/linear_drop.rb +51 -0
  101. data/prototypes/recurrent_layers.rb +79 -0
  102. data/www/images/screamer.png +0 -0
  103. data/www/images/screamer.xcf +0 -0
  104. data/www/index.html +82 -0
  105. metadata +373 -0
data/ext/buffer/Rakefile
@@ -0,0 +1,50 @@
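+ # Builds the CUDA extension: compiles the generated buffer.cu and matrix.cu
+ # into a shared library with nvcc, and emits the matching Ruby FFI bindings
+ # via bin/ffi-gen.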
+ require File.join(File.dirname(__FILE__), '..', '..', 'lib', 'coo-coo', 'platform')
+
+ NVCC = ENV.fetch("NVCC", "nvcc")
+ NVCCFLAGS = Array.new
+ NVCCFLAGS << "-g" if ENV['DEBUG']
+
+ if CooCoo::Platform.windows?
+   #NVCCFLAGS="--cl-version=2015"
+ else
+   NVCCFLAGS << "--compiler-options='-fPIC -Wall'"
+ end
+
+ DLEXT = RbConfig::CONFIG['DLEXT']
+
+ task :default => [ "buffer.#{DLEXT}", "buffer_ffi.rb", "matrix_ffi.rb" ]
+
+ task :clean do
+   sh("rm -f buffer.#{DLEXT} buffer.ext buffer.lib buffer.cu buffer.h buffer_ffi.rb matrix.cu matrix.h matrix_ffi.rb")
+ end
+
+ desc "Build buffer.#{DLEXT}"
+ file "buffer.#{DLEXT}" => [ 'buffer.cu', 'matrix.cu' ] do |t|
+   sh("#{NVCC} #{NVCCFLAGS.join(' ')} -shared -o #{t.name} #{t.sources.join(' ')}")
+ end
+
+ def ffi_gen(*args)
+   ruby(File.join(CooCoo.root, 'bin', 'ffi-gen'), *args)
+ end
+
+ def ffi_file(target)
+   file "#{target}.cu" => [ "#{target}.pre.cu", "#{target}.h" ] do |t|
+     ffi_gen('--source', '-o', t.name, t.source)
+   end
+
+   file "#{target}.h" => "#{target}.pre.cu" do |t|
+     ffi_gen('--header', '-o', t.name, *t.sources)
+   end
+
+   file "#{target}_ffi.rb" => "#{target}.pre.cu" do |t|
+     ffi_gen('--ffi',
+             '--module', "CooCoo::DeviceBuffer",
+             '--library', 'buffer',
+             '-t', 'int=cudaError_t',
+             '-o', t.name,
+             *t.sources)
+   end
+ end
+
+ ffi_file 'buffer'
+ ffi_file 'matrix'
data/ext/buffer/buffer.pre.cu
@@ -0,0 +1,727 @@
+ #include <stdio.h>
+ #include <math.h>
+
+ #include "public.h"
+ #include "buffer.h"
+
+ #ifdef IN_PUBLIC
+ typedef struct Buffer_s
+ {
+   double *data;
+   size_t length;
+ } *Buffer;
+ #endif
+
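+ /* Linearizes the calling thread's grid coordinates into a single index.
+    The 1D case is the usual blockIdx.x * blockDim.x + threadIdx.x; the
+    2D and 3D cases fold the y and z coordinates into that index. */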
+ __device__ int grid(int ndims)
+ {
+   int ret = blockIdx.x * blockDim.x + threadIdx.x;
+   if(ndims == 2) {
+     ret += threadIdx.y + blockIdx.y * blockDim.y * blockDim.x;
+   } else if(ndims == 3) {
+     ret += threadIdx.z + blockIdx.z * blockDim.z * blockDim.y * blockDim.x;
+   }
+
+   return ret;
+ }
+
+ static int _initialized = -1;
+ static int _block_size = 256;
+ static int _max_grid_size = 1024;
+
+ //static int _threads_per_block = 1;
+
+ PUBLIC int buffer_block_size()
+ {
+   return _block_size;
+ }
+
+ PUBLIC void buffer_set_block_size(int bs)
+ {
+   _block_size = bs;
+ }
+
+ PUBLIC int buffer_max_grid_size()
+ {
+   return _max_grid_size;
+ }
+
+ PUBLIC void buffer_set_max_grid_size(int gs)
+ {
+   _max_grid_size = gs;
+ }
+
+ static size_t _total_bytes_allocated = 0;
+
+ PUBLIC size_t buffer_total_bytes_allocated()
+ {
+   return _total_bytes_allocated;
+ }
+
+ static long long _num_allocated = 0;
+
+ PUBLIC long long buffer_num_allocated()
+ {
+   return _num_allocated;
+ }
+
+
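+ /* Launch helpers. Each allocates a result buffer and runs the given kernel
+    over the input, chunking the launch into multiple grids (passing an
+    element offset to the kernel) when the element count exceeds what a
+    single grid can cover. */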
+ typedef void (*kernel_func_t)(int, double *, const double *, const double *, int, void *);
+
+ Buffer launch_kerneln(kernel_func_t kernel, int length, const Buffer a, const Buffer b, void *data)
+ {
+   Buffer out;
+   int i;
+
+   if(a != NULL) {
+     int grid_size = (length + _block_size - 1) / _block_size;
+
+     out = buffer_new(length, 0.0);
+     if(out == NULL) {
+       return NULL;
+     }
+
+     if(grid_size >= _max_grid_size) {
+       for(i = 0; i < (grid_size + _max_grid_size - 1) / _max_grid_size; i++) {
+         kernel<<< _max_grid_size, _block_size >>>(length, out->data, a->data, b? b->data : NULL, i * _max_grid_size * _block_size, data);
+       }
+     } else {
+       kernel<<< grid_size, _block_size >>>(length, out->data, a->data, b? b->data : NULL, 0, data);
+     }
+     return out;
+   } else {
+     return NULL;
+   }
+ }
+
+ Buffer launch_kernel(kernel_func_t kernel, const Buffer a, const Buffer b, void *data)
+ {
+   if(a != NULL && (b == NULL || a->length == b->length)) {
+     return launch_kerneln(kernel, a->length, a, b, data);
+   } else {
+     return NULL;
+   }
+ }
+
+ typedef void (*kerneld_func_t)(int, double *, const double *, double, int);
+
+ Buffer launchd_kernel(kerneld_func_t kernel, const Buffer a, double b, size_t offset)
+ {
+   Buffer out;
+   int i;
+
+   if(a == NULL) return NULL;
+
+   size_t length = a->length - offset;
+   int grid_size = (length + _block_size - 1) / _block_size;
+
+   out = buffer_new(length, 0.0);
+   if(out == NULL) return NULL;
+
+   if(grid_size >= _max_grid_size) {
+     for(i = 0; i < (grid_size + _max_grid_size - 1) / _max_grid_size; i++) {
+       kernel<<< _max_grid_size, _block_size >>>(out->length, out->data, a->data + offset, b, i * _max_grid_size * _block_size);
+     }
+   } else {
+     kernel<<< grid_size, _block_size >>>(out->length, out->data, a->data + offset, b, 0);
+   }
+   return out;
+ }
+
+ typedef void (*modkerneld_func_t)(int, double *, double, int);
+
+ void launchd_modkernel(modkerneld_func_t kernel, const Buffer a, double b, size_t offset)
+ {
+   int i;
+
+   if(a == NULL) return;
+
+   size_t length = a->length - offset;
+   int grid_size = (length + _block_size - 1) / _block_size;
+
+   if(grid_size >= _max_grid_size) {
+     for(i = 0; i < (grid_size + _max_grid_size - 1) / _max_grid_size; i++) {
+       kernel<<< _max_grid_size, _block_size >>>(length, a->data + offset, b, i * _max_grid_size * _block_size);
+     }
+   } else {
+     kernel<<< grid_size, _block_size >>>(length, a->data + offset, b, 0);
+   }
+ }
+
+
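+ /* Lazy one-time setup: selects the device and sizes future launches from
+    its maxThreadsPerBlock and maxGridSize limits. */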
+ PUBLIC cudaError_t buffer_init(int device)
+ {
+   if(_initialized == -1) {
+     cudaDeviceProp props;
+     cudaError_t err;
+
+     err = cudaSetDevice(device);
+     if(err != cudaSuccess) {
+       return err;
+     }
+
+     err = cudaGetDeviceProperties(&props, device);
+     if(err == cudaSuccess) {
+       _block_size = props.maxThreadsPerBlock;
+       _max_grid_size = props.maxGridSize[0];
+       _initialized = device;
+
+       return cudaSuccess;
+     } else {
+       return err;
+     }
+   } else {
+     return cudaSuccess;
+   }
+ }
+
+ __global__ void buffer_setd_inner(int len, double *a, double b, int grid_offset)
+ {
+   int i = grid_offset + grid(1);
+   if(i < len) {
+     a[i] = b;
+   }
+ }
+
+ PUBLIC cudaError_t buffer_setd(Buffer b, double value, size_t offset, size_t length)
+ {
+   if(value == 0.0) {
+     if(offset < b->length) {
+       if(offset + length >= b->length) {
+         length = b->length - offset;
+       }
+       return cudaMemset(b->data + offset, 0, length * sizeof(double));
+     } else {
+       return cudaSuccess;
+     }
+   } else {
+     launchd_modkernel(buffer_setd_inner, b, value, offset);
+     return cudaSuccess;
+   }
+ }
+
+ PUBLIC Buffer buffer_new(size_t length, double initial_value)
+ {
+   Buffer ptr;
+
+   if(buffer_init(0) != 0) {
+     return NULL;
+   }
+
+   ptr = (Buffer)malloc(sizeof(Buffer_s));
+   if(ptr != NULL) {
+     if(cudaMalloc(&ptr->data, length * sizeof(double)) != cudaSuccess) {
+       ptr->data = NULL;
+       buffer_free(ptr);
+       return NULL;
+     }
+
+     ptr->length = length;
+     _total_bytes_allocated += length * sizeof(double);
+     _num_allocated++;
+
+     if(buffer_setd(ptr, initial_value, 0, length) != cudaSuccess) {
+       buffer_free(ptr);
+       return NULL;
+     }
+   }
+
+   return ptr;
+ }
+
+ PUBLIC cudaError_t buffer_free(Buffer buffer)
+ {
+   if(buffer != NULL) {
+     if(buffer->data != NULL) {
+       cudaError_t err = cudaFree(buffer->data);
+       if(err != cudaSuccess) {
+         return err;
+       }
+       _total_bytes_allocated -= buffer->length * sizeof(double);
+       _num_allocated--;
+     }
+     free(buffer);
+   }
+
+   return cudaSuccess;
+ }
+
+ PUBLIC cudaError_t buffer_set(Buffer buffer, Buffer other)
+ {
+   if(buffer == NULL || other == NULL) return cudaErrorUnknown;
+
+   size_t length = other->length;
+   if(length > buffer->length) {
+     length = buffer->length;
+   }
+   return cudaMemcpy(buffer->data, other->data, length * sizeof(double), cudaMemcpyDeviceToDevice);
+ }
+
+ PUBLIC cudaError_t buffer_setn(Buffer buffer, size_t offset, Buffer other, size_t length)
+ {
+   if(length > other->length) {
+     length = other->length;
+   }
+   if((offset + length) > buffer->length) {
+     length = buffer->length - offset;
+   }
+   return cudaMemcpy(buffer->data + offset, other->data, length * sizeof(double), cudaMemcpyDeviceToDevice);
+ }
+
+ PUBLIC cudaError_t buffer_setvn(Buffer buffer, size_t offset, void *data, size_t length)
+ {
+   if((offset + length) > buffer->length) {
+     length = buffer->length - offset;
+   }
+   return cudaMemcpy(buffer->data + offset, data, length * sizeof(double), cudaMemcpyHostToDevice);
+ }
+
+ PUBLIC cudaError_t buffer_setv(Buffer buffer, void *data, size_t length)
+ {
+   return buffer_setvn(buffer, 0, data, length);
+ }
+
+ PUBLIC cudaError_t buffer_set_element(Buffer buffer, size_t n, double v)
+ {
+   if(buffer != NULL && n < buffer->length) {
+     return cudaMemcpy(buffer->data + n, &v, sizeof(double), cudaMemcpyHostToDevice);
+   } else {
+     return cudaErrorUnknown;
+   }
+ }
+
+ PUBLIC double buffer_get_element(Buffer buffer, size_t n)
+ {
+   if(buffer != NULL && n < buffer->length) {
+     double out;
+     cudaMemcpy(&out, buffer->data + n, sizeof(double), cudaMemcpyDeviceToHost);
+     return out;
+   } else {
+     return NAN;
+   }
+ }
+
+ PUBLIC cudaError_t buffer_get(Buffer buffer, void *out, size_t max_length)
+ {
+   if(buffer == NULL) return cudaErrorUnknown;
+
+   if(max_length > buffer->length) {
+     max_length = buffer->length;
+   }
+
+   cudaError_t err = cudaMemcpy(out, buffer->data, max_length * sizeof(double), cudaMemcpyDeviceToHost);
+
+   return err;
+ }
+
+ PUBLIC Buffer buffer_slice(Buffer buffer, size_t n, size_t max_length)
+ {
+   if(buffer != NULL && n < buffer->length) {
+     Buffer out = buffer_new(max_length, 0.0);
+     if(out == NULL) return NULL;
+
+     if((n + max_length) >= buffer->length) {
+       max_length = buffer->length - n;
+     }
+
+     cudaError_t err = cudaMemcpy(out->data, buffer->data + n, max_length * sizeof(double), cudaMemcpyDeviceToDevice);
+
+     if(err == cudaSuccess) {
+       return out;
+     } else {
+       buffer_free(out);
+       return NULL;
+     }
+   } else {
+     return NULL;
+   }
+ }
+
+ PUBLIC cudaError_t buffer_host_slice(Buffer buffer, void *out, size_t n, size_t max_length)
+ {
+   if(buffer != NULL && n < buffer->length) {
+     if((n + max_length) >= buffer->length) {
+       max_length = buffer->length - n;
+     }
+
+     cudaError_t err = cudaMemcpy(out, buffer->data + n, max_length * sizeof(double), cudaMemcpyDeviceToHost);
+     return err;
+   } else {
+     return cudaErrorUnknown;
+   }
+ }
+
+ PUBLIC size_t buffer_length(const Buffer b)
+ {
+   if(b != NULL) {
+     return b->length;
+   } else {
+     return 0;
+   }
+ }
+
+ PUBLIC Buffer buffer_slice_2d(const Buffer in, int width, int height, int x, int y, int out_width, int out_height, double empty)
+ {
+   Buffer out = buffer_new(out_width * out_height, empty);
+   if(out == NULL) return NULL;
+
+   if(y >= height || x >= width) {
+     return out;
+   }
+
+   for(int i = 0; i < out_height; i++) {
+     int oi = i * out_width;
+     int w = out_width;
+     int iy = y + i;
+     int ii = iy * width + x;
+
+     if(x < 0) {
+       ii = ii - x;
+       w = w + x;
+       oi = oi - x;
+     } else if(x + w > width) {
+       w = width - x;
+     }
+
+     if(iy < 0 || w <= 0) {
+       continue;
+     } else if(iy >= height || ii >= in->length) {
+       break;
+     }
+
+     cudaError_t err = cudaMemcpy(out->data + oi, in->data + ii, w * sizeof(double), cudaMemcpyDeviceToDevice);
+     if(err != cudaSuccess) {
+       buffer_free(out);
+       return NULL;
+     }
+   }
+
+   return out;
+ }
+
+ cudaError_t buffer_set2d_inner(Buffer dest, size_t dest_width, const void *src, size_t src_width, size_t src_height, size_t x, size_t y, cudaMemcpyKind kind)
+ {
+   if(dest == NULL
+      || dest->data == NULL
+      || dest_width == 0
+      || src == NULL
+      || src_width == 0
+      || src_height == 0) {
+     return cudaErrorInvalidValue;
+   }
+
+   size_t copy_width = src_width;
+   if(x + src_width > dest_width) {
+     copy_width = dest_width - x;
+   }
+   size_t dest_height = dest->length / dest_width;
+   size_t copy_height = src_height;
+   if(y + src_height > dest_height) {
+     copy_height = dest_height - y;
+   }
+   size_t idx = y * dest_width + x;
+   if(idx >= dest->length
+      || (idx + copy_width * copy_height) > dest->length) {
+     return cudaSuccess;
+   }
+   cudaError_t err = cudaMemcpy2D(dest->data + idx,
+                                  dest_width * sizeof(double),
+                                  src, src_width * sizeof(double),
+                                  copy_width * sizeof(double),
+                                  copy_height,
+                                  kind);
+   return err;
+ }
+
+ PUBLIC cudaError_t buffer_set2d(Buffer dest, size_t dest_width, const Buffer src, size_t src_width, size_t x, size_t y)
+ {
+   if(dest->length == 0
+      || src->length == 0
+      || src_width == 0
+      || src->length % src_width > 0) {
+     return cudaErrorInvalidValue;
+   }
+   return buffer_set2d_inner(dest, dest_width, src->data, src_width, src->length / src_width, x, y, cudaMemcpyDeviceToDevice);
+ }
+
+ PUBLIC cudaError_t buffer_set2dv(Buffer dest, size_t dest_width, const void *src, size_t src_width, size_t src_height, size_t x, size_t y)
+ {
+   return buffer_set2d_inner(dest, dest_width, src, src_width, src_height, x, y, cudaMemcpyHostToDevice);
+ }
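+ /* Tree reduction: each block loads one element per thread into shared
+    memory, folds the block down to a single value, and writes it to
+    partial_sums[blockIdx.x]; the host then folds the per-block partials
+    with the matching scalar op. */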
+
+ typedef double (*ReduceOp)(double a, double b);
+ typedef void (*ReduceKernel)(const double *, double *, size_t, size_t);
+
+ #define REDUCE_KERNEL(NAME, OP, INITIAL) \
+   double NAME ## _initial_value = INITIAL; \
+   double NAME ## _op(double a, double b) \
+   { \
+     return OP; \
+   } \
+   \
+   __global__ void NAME(const double *data, double *partial_sums, size_t length, size_t offset) \
+   { \
+     extern __shared__ double sdata[]; \
+     int i = offset + grid(1); \
+     int n; \
+     \
+     sdata[threadIdx.x] = (i < length)? data[i] : INITIAL; \
+     __syncthreads(); \
+     \
+     for(n = blockDim.x / 2; n > 0; n = n / 2) { \
+       if(threadIdx.x < n) { \
+         double a = sdata[threadIdx.x]; \
+         double b = sdata[threadIdx.x + n]; \
+         sdata[threadIdx.x] = OP; \
+       } \
+       __syncthreads(); \
+     } \
+     \
+     if(threadIdx.x == 0) { \
+       partial_sums[blockIdx.x] = sdata[0]; \
+     } \
+   }
+
+ double launch_reduce_inner(ReduceOp op, ReduceKernel reduce_kernel, const Buffer b, double initial)
+ {
+   int i, j;
+   int grid_size;
+   int usable_grid;
+   Buffer partial_buffer;
+   double *partial_sums;
+   double out = initial;
+
+   if(b == NULL || b->length == 0) {
+     return NAN;
+   }
+
+   grid_size = (b->length + _block_size - 1) / _block_size;
+   usable_grid = grid_size;
+   if(grid_size >= _max_grid_size) {
+     usable_grid = _max_grid_size;
+   }
+
+   partial_buffer = buffer_new(usable_grid, initial);
+   if(partial_buffer == NULL) return NAN;
+
+   partial_sums = (double *)malloc(sizeof(double) * usable_grid);
+   if(partial_sums == NULL) {
+     buffer_free(partial_buffer);
+     return NAN;
+   }
+
+   for(i = 0; i < (grid_size + usable_grid - 1) / usable_grid; i++) {
+     reduce_kernel<<< usable_grid, _block_size, _block_size * sizeof(double) >>>(b->data, partial_buffer->data, b->length, i * usable_grid * _block_size);
+     buffer_get(partial_buffer, partial_sums, usable_grid);
+
+     for(j = 0; j < usable_grid; j++) {
+       out = op(out, partial_sums[j]);
+     }
+   }
+
+   buffer_free(partial_buffer);
+   free(partial_sums);
+
+   return out;
+ }
+
+ #define launch_reduce(kernel, buffer) launch_reduce_inner(kernel ## _op, kernel, buffer, kernel ## _initial_value)
+
+
+ REDUCE_KERNEL(buffer_sum_kernel, a + b, 0.0);
+
+ PUBLIC double buffer_sum(const Buffer b)
+ {
+   return launch_reduce(buffer_sum_kernel, b);
+ }
+
+ REDUCE_KERNEL(buffer_min_kernel, fmin(a, b), NAN);
+
+ PUBLIC double buffer_min(const Buffer b)
+ {
+   return launch_reduce(buffer_min_kernel, b);
+ }
+
+ REDUCE_KERNEL(buffer_max_kernel, fmax(a, b), NAN);
+
+ PUBLIC double buffer_max(const Buffer b)
+ {
+   return launch_reduce(buffer_max_kernel, b);
+ }
+
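+ /* BINARY_OP, SCALAR_OP, and FUNCTION_OP each expand to an element-wise
+    kernel plus the PUBLIC wrapper that launches it. */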
+ #define BINARY_OP(name, operation) \
+   __global__ void buffer_ ## name ## _inner(int len, double *out, const double *a, const double *b, int grid_offset, void *) \
+   { \
+     int i = grid_offset + grid(1); \
+     if(i < len) { \
+       operation; \
+     } \
+   } \
+   \
+   PUBLIC Buffer buffer_##name(const Buffer a, const Buffer b) \
+   { \
+     return launch_kernel(buffer_ ## name ## _inner, a, b, NULL); \
+   }
+
+ BINARY_OP(add, { out[i] = a[i] + b[i]; });
+ BINARY_OP(sub, { out[i] = a[i] - b[i]; });
+ BINARY_OP(mul, { out[i] = a[i] * b[i]; });
+ BINARY_OP(pow, { out[i] = pow(a[i], b[i]); });
+ BINARY_OP(div, { out[i] = a[i] / b[i]; });
+ BINARY_OP(collect_eq, { out[i] = a[i] == b[i]; });
+ BINARY_OP(collect_neq, { out[i] = a[i] != b[i]; });
+ BINARY_OP(collect_lt, { out[i] = a[i] < b[i]; });
+ BINARY_OP(collect_lte, { out[i] = a[i] <= b[i]; });
+ BINARY_OP(collect_gt, { out[i] = a[i] > b[i]; });
+ BINARY_OP(collect_gte, { out[i] = a[i] >= b[i]; });
+
+ #define SCALAR_OP(name, operation) \
+   __global__ void buffer_ ## name ## d_inner(int len, double *out, const double *a, const double b, int grid_offset) \
+   { \
+     int i = grid_offset + grid(1); \
+     if(i < len) { \
+       operation; \
+     } \
+   } \
+   \
+   PUBLIC Buffer buffer_##name ## d(const Buffer a, double b) \
+   { \
+     return launchd_kernel(buffer_ ## name ## d_inner, a, b, 0); \
+   }
+
+ SCALAR_OP(add, { out[i] = a[i] + b; });
+ SCALAR_OP(sub, { out[i] = a[i] - b; });
+ SCALAR_OP(mul, { out[i] = a[i] * b; });
+ SCALAR_OP(pow, { out[i] = pow(a[i], b); });
+ SCALAR_OP(div, { out[i] = a[i] / b; });
+ SCALAR_OP(collect_eq, { out[i] = a[i] == b; });
+ SCALAR_OP(collect_neq, { out[i] = a[i] != b; });
+ SCALAR_OP(collect_lt, { out[i] = a[i] < b; });
+ SCALAR_OP(collect_lte, { out[i] = a[i] <= b; });
+ SCALAR_OP(collect_gt, { out[i] = a[i] > b; });
+ SCALAR_OP(collect_gte, { out[i] = a[i] >= b; });
+
+ PUBLIC int buffer_eq(const Buffer a, const Buffer b)
+ {
+   // compare
+   Buffer results = buffer_collect_eq(a, b);
+   if(results != NULL) {
+     // reduce
+     double sum = buffer_sum(results);
+
+     // clean up
+     buffer_free(results);
+
+     return sum == a->length;
+   } else {
+     return 0;
+   }
+ }
+
+ #define FUNCTION_OP(name, operation) \
+   __global__ void buffer_ ## name ## _inner(int len, double *out, const double *a, const double b, int grid_offset) \
+   { \
+     int i = grid_offset + grid(1); \
+     if(i < len) { \
+       operation; \
+     } \
+   } \
+   \
+   PUBLIC Buffer buffer_##name(const Buffer a) \
+   { \
+     return launchd_kernel(buffer_ ## name ## _inner, a, 0.0, 0); \
+   }
+
+
+ FUNCTION_OP(abs, { out[i] = abs(a[i]); });
+ FUNCTION_OP(exp, { out[i] = exp(a[i]); });
+ FUNCTION_OP(log, { out[i] = log(a[i]); });
+ FUNCTION_OP(log10, { out[i] = log10(a[i]); });
+ FUNCTION_OP(log2, { out[i] = log2(a[i]); });
+ FUNCTION_OP(sqrt, { out[i] = sqrt(a[i]); });
+ FUNCTION_OP(floor, { out[i] = floor(a[i]); });
+ FUNCTION_OP(ceil, { out[i] = ceil(a[i]); });
+ FUNCTION_OP(round, { out[i] = round(a[i]); });
+ FUNCTION_OP(sin, { out[i] = sin(a[i]); });
+ FUNCTION_OP(cos, { out[i] = cos(a[i]); });
+ FUNCTION_OP(tan, { out[i] = tan(a[i]); });
+ FUNCTION_OP(asin, { out[i] = asin(a[i]); });
+ FUNCTION_OP(acos, { out[i] = acos(a[i]); });
+ FUNCTION_OP(atan, { out[i] = atan(a[i]); });
+ FUNCTION_OP(sinh, { out[i] = sinh(a[i]); });
+ FUNCTION_OP(cosh, { out[i] = cosh(a[i]); });
+ FUNCTION_OP(tanh, { out[i] = tanh(a[i]); });
+ FUNCTION_OP(asinh, { out[i] = asinh(a[i]); });
+ FUNCTION_OP(acosh, { out[i] = acosh(a[i]); });
+ FUNCTION_OP(atanh, { out[i] = atanh(a[i]); });
+ FUNCTION_OP(collect_nan, { out[i] = isnan(a[i]); });
+ FUNCTION_OP(collect_inf, { out[i] = isinf(a[i]); });
+
+ __global__ void buffer_dot_inner(double *out, const double *a, const double *b, size_t aw, size_t ah, size_t bw, size_t bh)
+ {
+   size_t row = blockIdx.y * blockDim.y + threadIdx.y;
+   size_t col = blockIdx.x * blockDim.x + threadIdx.x;
+
+   double sum = 0.0;
+
+   if(row < ah && col < bw) {
+     for(size_t i = 0; i < bh; i++) {
+       sum += a[row * aw + i] * b[i * bw + col];
+     }
+     out[row * bw + col] = sum;
+   }
+ }
+
+ PUBLIC Buffer buffer_dot(const Buffer a, size_t aw, size_t ah, const Buffer b, size_t bw, size_t bh)
+ {
+   if(aw * ah != a->length || bw * bh != b->length || aw != bh) {
+     return NULL;
+   } else {
+     Buffer out = buffer_new(ah * bw, 0.0);
+     if(out == NULL) return NULL;
+
+     dim3 dim(bw, ah);
+
+     buffer_dot_inner<<< dim, 1 >>>(out->data, a->data, b->data, aw, ah, bw, bh);
+
+     return out;
+   }
+ }
+
+ __global__ void buffer_identity_inner(double *data, size_t w, size_t h)
+ {
+   size_t row = blockIdx.y * blockDim.y + threadIdx.y;
+   size_t col = blockIdx.x * blockDim.x + threadIdx.x;
+
+   if(row < h && col < w) {
+     data[row * w + col] = (double)(row == col);
+   }
+ }
+
+ PUBLIC Buffer buffer_identity(size_t w, size_t h)
+ {
+   Buffer i = buffer_new(w * h, 0.0);
+   if(i == NULL) return NULL;
+
+   dim3 dim(w, h);
+   buffer_identity_inner<<< dim, 1 >>>(i->data, w, h);
+   return i;
+ }
+
+ __global__ void buffer_diagflat_inner(double *data, const double *a, size_t len)
+ {
+   size_t row = blockIdx.y * blockDim.y + threadIdx.y;
+   size_t col = blockIdx.x * blockDim.x + threadIdx.x;
+
+   if(row >= len || col >= len) {
+     return;
+   }
+
+   if(row == col) {
+     data[row * len + col] = a[row];
+   } else {
+     data[row * len + col] = 0.0;
+   }
+ }
+
+ PUBLIC Buffer buffer_diagflat(const Buffer a)
+ {
+   Buffer i = buffer_new(a->length * a->length, 0.0);
+   if(i == NULL) return NULL;
+
+   dim3 dim(a->length, a->length);
+   buffer_diagflat_inner<<< dim, 1 >>>(i->data, a->data, a->length);
+   return i;
+ }
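
To make the exported API concrete, here is a minimal host-side sketch (not part of the gem) showing how the PUBLIC functions above fit together. It assumes the header generated by ffi-gen, buffer.h, declares the Buffer type and these prototypes for C callers; buffer_muld is the wrapper SCALAR_OP(mul) expands to.

#include <stdio.h>
#include "buffer.h"  /* assumed to declare Buffer and the PUBLIC prototypes */

int main(void)
{
  double host[4] = { 1.0, 2.0, 3.0, 4.0 };

  if(buffer_init(0) != 0) return 1;     /* select device 0, size launches */

  Buffer a = buffer_new(4, 0.0);        /* 4 doubles on the device, zeroed */
  buffer_setv(a, host, 4);              /* host -> device copy */

  Buffer b = buffer_muld(a, 2.0);       /* element-wise scale by 2 */
  printf("sum = %f\n", buffer_sum(b));  /* device-side reduction: 20.0 */

  buffer_free(b);
  buffer_free(a);
  return 0;
}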