CooCoo 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +16 -0
- data/CooCoo.gemspec +47 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +88 -0
- data/README.md +123 -0
- data/Rakefile +81 -0
- data/bin/cuda-dev-info +25 -0
- data/bin/cuda-free +28 -0
- data/bin/cuda-free-trend +7 -0
- data/bin/ffi-gen +267 -0
- data/bin/spec_runner_html.sh +42 -0
- data/bin/trainer +198 -0
- data/bin/trend-cost +13 -0
- data/examples/char-rnn.rb +405 -0
- data/examples/cifar/cifar.rb +94 -0
- data/examples/img-similarity.rb +201 -0
- data/examples/math_ops.rb +57 -0
- data/examples/mnist.rb +365 -0
- data/examples/mnist_classifier.rb +293 -0
- data/examples/mnist_dream.rb +214 -0
- data/examples/seeds.rb +268 -0
- data/examples/seeds_dataset.txt +210 -0
- data/examples/t10k-images-idx3-ubyte +0 -0
- data/examples/t10k-labels-idx1-ubyte +0 -0
- data/examples/train-images-idx3-ubyte +0 -0
- data/examples/train-labels-idx1-ubyte +0 -0
- data/ext/buffer/Rakefile +50 -0
- data/ext/buffer/buffer.pre.cu +727 -0
- data/ext/buffer/matrix.pre.cu +49 -0
- data/lib/CooCoo.rb +1 -0
- data/lib/coo-coo.rb +18 -0
- data/lib/coo-coo/activation_functions.rb +344 -0
- data/lib/coo-coo/consts.rb +5 -0
- data/lib/coo-coo/convolution.rb +298 -0
- data/lib/coo-coo/core_ext.rb +75 -0
- data/lib/coo-coo/cost_functions.rb +91 -0
- data/lib/coo-coo/cuda.rb +116 -0
- data/lib/coo-coo/cuda/device_buffer.rb +240 -0
- data/lib/coo-coo/cuda/device_buffer/ffi.rb +109 -0
- data/lib/coo-coo/cuda/error.rb +51 -0
- data/lib/coo-coo/cuda/host_buffer.rb +117 -0
- data/lib/coo-coo/cuda/runtime.rb +157 -0
- data/lib/coo-coo/cuda/vector.rb +315 -0
- data/lib/coo-coo/data_sources.rb +2 -0
- data/lib/coo-coo/data_sources/xournal.rb +25 -0
- data/lib/coo-coo/data_sources/xournal/bitmap_stream.rb +197 -0
- data/lib/coo-coo/data_sources/xournal/document.rb +377 -0
- data/lib/coo-coo/data_sources/xournal/loader.rb +144 -0
- data/lib/coo-coo/data_sources/xournal/renderer.rb +101 -0
- data/lib/coo-coo/data_sources/xournal/saver.rb +99 -0
- data/lib/coo-coo/data_sources/xournal/training_document.rb +78 -0
- data/lib/coo-coo/data_sources/xournal/training_document/constants.rb +15 -0
- data/lib/coo-coo/data_sources/xournal/training_document/document_maker.rb +89 -0
- data/lib/coo-coo/data_sources/xournal/training_document/document_reader.rb +105 -0
- data/lib/coo-coo/data_sources/xournal/training_document/example.rb +37 -0
- data/lib/coo-coo/data_sources/xournal/training_document/sets.rb +76 -0
- data/lib/coo-coo/debug.rb +8 -0
- data/lib/coo-coo/dot.rb +129 -0
- data/lib/coo-coo/drawing.rb +4 -0
- data/lib/coo-coo/drawing/cairo_canvas.rb +100 -0
- data/lib/coo-coo/drawing/canvas.rb +68 -0
- data/lib/coo-coo/drawing/chunky_canvas.rb +101 -0
- data/lib/coo-coo/drawing/sixel.rb +214 -0
- data/lib/coo-coo/enum.rb +17 -0
- data/lib/coo-coo/from_name.rb +58 -0
- data/lib/coo-coo/fully_connected_layer.rb +205 -0
- data/lib/coo-coo/generation_script.rb +38 -0
- data/lib/coo-coo/grapher.rb +140 -0
- data/lib/coo-coo/image.rb +286 -0
- data/lib/coo-coo/layer.rb +67 -0
- data/lib/coo-coo/layer_factory.rb +26 -0
- data/lib/coo-coo/linear_layer.rb +59 -0
- data/lib/coo-coo/math.rb +607 -0
- data/lib/coo-coo/math/abstract_vector.rb +121 -0
- data/lib/coo-coo/math/functions.rb +39 -0
- data/lib/coo-coo/math/interpolation.rb +7 -0
- data/lib/coo-coo/network.rb +264 -0
- data/lib/coo-coo/neuron.rb +112 -0
- data/lib/coo-coo/neuron_layer.rb +168 -0
- data/lib/coo-coo/option_parser.rb +18 -0
- data/lib/coo-coo/platform.rb +17 -0
- data/lib/coo-coo/progress_bar.rb +11 -0
- data/lib/coo-coo/recurrence/backend.rb +99 -0
- data/lib/coo-coo/recurrence/frontend.rb +101 -0
- data/lib/coo-coo/sequence.rb +187 -0
- data/lib/coo-coo/shell.rb +2 -0
- data/lib/coo-coo/temporal_network.rb +291 -0
- data/lib/coo-coo/trainer.rb +21 -0
- data/lib/coo-coo/trainer/base.rb +67 -0
- data/lib/coo-coo/trainer/batch.rb +82 -0
- data/lib/coo-coo/trainer/batch_stats.rb +27 -0
- data/lib/coo-coo/trainer/momentum_stochastic.rb +59 -0
- data/lib/coo-coo/trainer/stochastic.rb +47 -0
- data/lib/coo-coo/transformer.rb +272 -0
- data/lib/coo-coo/vector_layer.rb +194 -0
- data/lib/coo-coo/version.rb +3 -0
- data/lib/coo-coo/weight_deltas.rb +23 -0
- data/prototypes/convolution.rb +116 -0
- data/prototypes/linear_drop.rb +51 -0
- data/prototypes/recurrent_layers.rb +79 -0
- data/www/images/screamer.png +0 -0
- data/www/images/screamer.xcf +0 -0
- data/www/index.html +82 -0
- metadata +373 -0
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/ext/buffer/Rakefile
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# Rake build file for the CUDA buffer extension: compiles buffer.cu and
# matrix.cu into a shared library with nvcc, and generates the C sources,
# headers, and Ruby FFI bindings from the .pre.cu files via bin/ffi-gen.
require File.join(File.dirname(__FILE__), '..', '..', 'lib', 'coo-coo', 'platform')

# nvcc binary and flags; DEBUG=1 adds debug info to the build.
NVCC = ENV.fetch("NVCC", "nvcc")
NVCCFLAGS = []
NVCCFLAGS << "-g" if ENV['DEBUG']

if CooCoo::Platform.windows?
  #NVCCFLAGS="--cl-version=2015"
else
  NVCCFLAGS << "--compiler-options='-fPIC -Wall'"
end

# Shared-library extension for the running Ruby ("so", "bundle", ...).
DLEXT = RbConfig::CONFIG['DLEXT']

task :default => ["buffer.#{DLEXT}", "buffer_ffi.rb", "matrix_ffi.rb"]

task :clean do
  sh("rm -f buffer.#{DLEXT} buffer.ext buffer.lib buffer.cu buffer.h buffer_ffi.rb matrix.cu matrix.h matrix_ffi.rb")
end

desc "Build buffer.#{DLEXT}"
file "buffer.#{DLEXT}" => ['buffer.cu', 'matrix.cu'] do |t|
  sh("#{NVCC} #{NVCCFLAGS.join(' ')} -shared -o #{t.name} #{t.sources.join(' ')}")
end

# Invoke bin/ffi-gen with the given command-line arguments.
def ffi_gen(*args)
  ruby(File.join(CooCoo.root, 'bin', 'ffi-gen'), *args)
end

# Declare the generation rules for one target: TARGET.pre.cu is expanded
# into TARGET.cu, TARGET.h, and the FFI binding TARGET_ffi.rb.
def ffi_file(target)
  file "#{target}.cu" => ["#{target}.pre.cu", "#{target}.h"] do |t|
    ffi_gen('--source', '-o', t.name, t.source)
  end

  file "#{target}.h" => "#{target}.pre.cu" do |t|
    ffi_gen('--header', '-o', t.name, *t.sources)
  end

  file "#{target}_ffi.rb" => "#{target}.pre.cu" do |t|
    ffi_gen('--ffi',
            '--module', "CooCoo::DeviceBuffer",
            '--library', 'buffer',
            '-t', 'int=cudaError_t',
            '-o', t.name,
            *t.sources)
  end
end

ffi_file 'buffer'
ffi_file 'matrix'
|
@@ -0,0 +1,727 @@
|
|
1
|
+
#include <stdio.h>
#include <math.h>

#include "public.h"
#include "buffer.h"

#ifdef IN_PUBLIC
/* A device buffer: a length-tagged array of doubles whose data pointer
 * lives in CUDA device memory (never dereference it on the host). */
typedef struct Buffer_s
{
  double *data;
  size_t length;
} *Buffer;
#endif

/* Flatten this thread's position in a 1-, 2-, or 3-D launch into one
 * linear index.  All launches in this file are 1-D (ndims == 1).
 * NOTE(review): the y/z terms do not factor in gridDim.x, so coverage for
 * ndims > 1 looks incomplete — confirm intent before using 2-D launches. */
__device__ int grid(int ndims)
{
  int ret = blockIdx.x * blockDim.x + threadIdx.x;
  if(ndims == 2) {
    ret += threadIdx.y + blockIdx.y * blockDim.y * blockDim.x;
  } else if(ndims == 3) {
    ret += threadIdx.z + blockIdx.z * blockDim.z * blockDim.y * blockDim.x;
  }

  return ret;
}
|
26
|
+
|
27
|
+
/* Launch geometry and allocation statistics.  _initialized holds the CUDA
 * device number once buffer_init() has run, or -1 before that. */
static int _initialized = -1;
static int _block_size = 256;
static int _max_grid_size = 1024;

//static int _threads_per_block = 1;

/* Threads per block used for kernel launches. */
PUBLIC int buffer_block_size()
{
  return _block_size;
}

PUBLIC void buffer_set_block_size(int bs)
{
  _block_size = bs;
}

/* Maximum number of blocks issued in a single launch. */
PUBLIC int buffer_max_grid_size()
{
  return _max_grid_size;
}

PUBLIC void buffer_set_max_grid_size(int gs)
{
  _max_grid_size = gs;
}

/* Bytes of device memory currently held by live buffers. */
static size_t _total_bytes_allocated = 0;

PUBLIC size_t buffer_total_bytes_allocated()
{
  return _total_bytes_allocated;
}

/* Count of live (allocated but not yet freed) buffers. */
static long long _num_allocated = 0;

PUBLIC long long buffer_num_allocated()
{
  return _num_allocated;
}
|
66
|
+
|
67
|
+
|
68
|
+
typedef void (*kernel_func_t)(int, double *, const double *, const double *, int, void *);

/* Run KERNEL over LENGTH elements of A (and optional B), writing into a
 * freshly allocated zeroed result buffer.  When the required grid exceeds
 * the device limit the work is issued in chunks, each chunk offset by the
 * number of elements already covered.
 *
 * Fixes over the previous revision:
 *  - the per-chunk offset advanced by _max_grid_size (a block count)
 *    instead of _max_grid_size * _block_size (an element count);
 *  - the chunk loop truncated (grid_size / _max_grid_size), leaving the
 *    buffer tail unprocessed when grid_size was not an exact multiple.
 */
Buffer launch_kerneln(kernel_func_t kernel, int length, const Buffer a, const Buffer b, void *data)
{
  if(a == NULL) {
    return NULL;
  }

  Buffer out = buffer_new(length, 0.0);
  if(out == NULL) {
    return NULL;
  }

  int grid_size = (length + _block_size - 1) / _block_size;
  long long chunk_elements = (long long)_max_grid_size * _block_size;

  if(grid_size >= _max_grid_size) {
    long long num_chunks = ((long long)length + chunk_elements - 1) / chunk_elements;
    for(long long i = 0; i < num_chunks; i++) {
      kernel<<< _max_grid_size, _block_size >>>(length, out->data, a->data, b ? b->data : NULL, (int)(i * chunk_elements), data);
    }
  } else {
    kernel<<< grid_size, _block_size >>>(length, out->data, a->data, b ? b->data : NULL, 0, data);
  }
  return out;
}

/* Element-wise launch over two equal-length buffers (B may be NULL). */
Buffer launch_kernel(kernel_func_t kernel, const Buffer a, const Buffer b, void *data)
{
  if(a != NULL && (b == NULL || a->length == b->length)) {
    return launch_kerneln(kernel, a->length, a, b, data);
  } else {
    return NULL;
  }
}

typedef void (*kerneld_func_t)(int, double *, const double *, double, int);

/* Run KERNEL over A[offset..] with scalar argument B, returning a new
 * buffer holding the remaining elements.
 * Fixes: A is NULL-checked before a->length is read (previously the
 * dereference came first), and offset > a->length no longer wraps the
 * size_t subtraction around. */
Buffer launchd_kernel(kerneld_func_t kernel, const Buffer a, double b, size_t offset)
{
  if(a == NULL || offset > a->length) return NULL;

  size_t length = a->length - offset;
  Buffer out = buffer_new(length, 0.0);
  if(out == NULL) return NULL;

  int grid_size = (length + _block_size - 1) / _block_size;
  long long chunk_elements = (long long)_max_grid_size * _block_size;

  if(grid_size >= _max_grid_size) {
    long long num_chunks = ((long long)length + chunk_elements - 1) / chunk_elements;
    for(long long i = 0; i < num_chunks; i++) {
      kernel<<< _max_grid_size, _block_size >>>(out->length, out->data, a->data + offset, b, (int)(i * chunk_elements));
    }
  } else {
    kernel<<< grid_size, _block_size >>>(out->length, out->data, a->data + offset, b, 0);
  }
  return out;
}

typedef void (*modkerneld_func_t)(int, double *, double, int);

/* In-place variant: run KERNEL over A[offset..] with scalar B, modifying A.
 * Same NULL-before-dereference and offset-underflow fixes as above. */
void launchd_modkernel(modkerneld_func_t kernel, const Buffer a, double b, size_t offset)
{
  if(a == NULL || offset > a->length) return;

  size_t length = a->length - offset;
  int grid_size = (length + _block_size - 1) / _block_size;
  long long chunk_elements = (long long)_max_grid_size * _block_size;

  if(grid_size >= _max_grid_size) {
    long long num_chunks = ((long long)length + chunk_elements - 1) / chunk_elements;
    for(long long i = 0; i < num_chunks; i++) {
      kernel<<< _max_grid_size, _block_size >>>(length, a->data + offset, b, (int)(i * chunk_elements));
    }
  } else {
    kernel<<< grid_size, _block_size >>>(length, a->data + offset, b, 0);
  }
}
|
149
|
+
|
150
|
+
|
151
|
+
/* Select the CUDA device and size future launches from its capabilities
 * (maxThreadsPerBlock, maxGridSize[0]).  Idempotent: after the first
 * successful call, later calls return cudaSuccess without touching the
 * runtime. */
PUBLIC cudaError_t buffer_init(int device)
{
  if(_initialized != -1) {
    return cudaSuccess;
  }

  cudaError_t err = cudaSetDevice(device);
  if(err != 0) {
    return err;
  }

  cudaDeviceProp props;
  err = cudaGetDeviceProperties(&props, device);
  if(err != 0) {
    return err;
  }

  _block_size = props.maxThreadsPerBlock;
  _max_grid_size = props.maxGridSize[0];
  _initialized = device;
  return cudaSuccess;
}
|
176
|
+
|
177
|
+
/* Device kernel: store B into each element of A covered by this launch. */
__global__ void buffer_setd_inner(int len, double *a, double b, int grid_offset)
{
  int i = grid_offset + grid(1);
  if(i < len) {
    a[i] = b;
  }
}

/* Fill LENGTH elements of B starting at OFFSET with VALUE.
 * Fix: the cudaMemset fast path taken when VALUE is 0.0 ignored OFFSET and
 * cleared from the start of the buffer; it now clears b->data + offset.
 * A NULL buffer is rejected instead of dereferenced.
 * NOTE(review): the non-zero path fills from OFFSET to the end of the
 * buffer and ignores LENGTH — confirm whether callers rely on that. */
PUBLIC cudaError_t buffer_setd(Buffer b, double value, size_t offset, size_t length)
{
  if(b == NULL) return cudaErrorInvalidValue;

  if(value == 0.0) {
    if(offset < b->length) {
      if(offset + length >= b->length) {
        length = b->length - offset;
      }
      return cudaMemset(b->data + offset, 0, length * sizeof(double));
    } else {
      return cudaSuccess;
    }
  } else {
    launchd_modkernel(buffer_setd_inner, b, value, offset);
    return cudaSuccess;
  }
}
|
201
|
+
|
202
|
+
/* Allocate a device buffer of LENGTH doubles filled with INITIAL_VALUE.
 * Returns NULL on any initialization or allocation failure.
 * Fixes: the CUDA runtime is initialized before the host struct is
 * allocated (previously a failed buffer_init leaked the malloc), and a
 * stray double semicolon was removed. */
PUBLIC Buffer buffer_new(size_t length, double initial_value)
{
  if(buffer_init(0) != 0) {
    return NULL;
  }

  Buffer ptr = (Buffer)malloc(sizeof(Buffer_s));
  if(ptr == NULL) {
    return NULL;
  }

  if(cudaMalloc(&ptr->data, length * sizeof(double)) != cudaSuccess) {
    ptr->data = NULL;
    buffer_free(ptr);
    return NULL;
  }

  ptr->length = length;
  if(buffer_setd(ptr, initial_value, 0, length) != cudaSuccess) {
    buffer_free(ptr);
    return NULL;
  }

  _total_bytes_allocated += length * sizeof(double);
  _num_allocated++;

  return ptr;
}
|
228
|
+
|
229
|
+
/* Release a buffer's device memory and host record.  NULL is a no-op.
 * Allocation statistics are only adjusted once cudaFree succeeds; on
 * failure the buffer is left intact so the caller can inspect or retry. */
PUBLIC cudaError_t buffer_free(Buffer buffer)
{
  if(buffer == NULL) {
    return cudaSuccess;
  }

  if(buffer->data != NULL) {
    cudaError_t err = cudaFree(buffer->data);
    if(err != cudaSuccess) {
      return err;
    }
    _total_bytes_allocated -= buffer->length * sizeof(double);
    _num_allocated--;
  }

  free(buffer);
  return cudaSuccess;
}
|
245
|
+
|
246
|
+
/* Copy OTHER into BUFFER (device to device), truncated to the shorter of
 * the two lengths. */
PUBLIC cudaError_t buffer_set(Buffer buffer, Buffer other)
{
  if(buffer == NULL || other == NULL) return cudaErrorUnknown;

  size_t length = other->length;
  if(length > buffer->length) {
    length = buffer->length;
  }
  return cudaMemcpy(buffer->data, other->data, length * sizeof(double), cudaMemcpyDeviceToDevice);
}

/* Copy up to LENGTH elements of OTHER into BUFFER starting at OFFSET.
 * Fixes: NULL arguments are rejected, and an OFFSET past the end of BUFFER
 * now fails instead of wrapping the size_t subtraction into a huge
 * length. */
PUBLIC cudaError_t buffer_setn(Buffer buffer, size_t offset, Buffer other, size_t length)
{
  if(buffer == NULL || other == NULL) return cudaErrorUnknown;
  if(offset > buffer->length) return cudaErrorInvalidValue;

  if(length > other->length) {
    length = other->length;
  }
  if((offset + length) > buffer->length) {
    length = buffer->length - offset;
  }
  return cudaMemcpy(buffer->data + offset, other->data, length * sizeof(double), cudaMemcpyDeviceToDevice);
}

/* Copy LENGTH doubles from host memory DATA into BUFFER at OFFSET.
 * Fix: same NULL and size_t-underflow guards as buffer_setn. */
PUBLIC cudaError_t buffer_setvn(Buffer buffer, size_t offset, void *data, size_t length)
{
  if(buffer == NULL || data == NULL) return cudaErrorUnknown;
  if(offset > buffer->length) return cudaErrorInvalidValue;

  if((offset + length) > buffer->length) {
    length = buffer->length - offset;
  }
  return cudaMemcpy(buffer->data + offset, data, length * sizeof(double), cudaMemcpyHostToDevice);
}

/* Copy LENGTH doubles from host memory into the start of BUFFER. */
PUBLIC cudaError_t buffer_setv(Buffer buffer, void *data, size_t length)
{
  return buffer_setvn(buffer, 0, data, length);
}
|
280
|
+
|
281
|
+
/* Store V at index N of the buffer.  NULL buffers and out-of-range indices
 * report cudaErrorUnknown. */
PUBLIC cudaError_t buffer_set_element(Buffer buffer, size_t n, double v)
{
  if(buffer == NULL || n >= buffer->length) {
    return cudaErrorUnknown;
  }
  return cudaMemcpy(buffer->data + n, &v, sizeof(double), cudaMemcpyHostToDevice);
}

/* Fetch the element at index N, or NAN when the buffer is NULL or the
 * index is out of range.
 * NOTE(review): the cudaMemcpy result is ignored, so a failed copy returns
 * an indeterminate value — confirm whether that is acceptable. */
PUBLIC double buffer_get_element(Buffer buffer, size_t n)
{
  if(buffer == NULL || n >= buffer->length) {
    return NAN;
  }
  double out;
  cudaMemcpy(&out, buffer->data + n, sizeof(double), cudaMemcpyDeviceToHost);
  return out;
}

/* Copy up to MAX_LENGTH elements of the device buffer into host memory
 * OUT, truncated to the buffer's length. */
PUBLIC cudaError_t buffer_get(Buffer buffer, void *out, size_t max_length)
{
  if(buffer == NULL) return cudaErrorUnknown;

  if(max_length > buffer->length) {
    max_length = buffer->length;
  }

  return cudaMemcpy(out, buffer->data, max_length * sizeof(double), cudaMemcpyDeviceToHost);
}
|
313
|
+
|
314
|
+
/* Return a new MAX_LENGTH-element buffer whose head is a copy of
 * buffer[n ..]; when the source has fewer than MAX_LENGTH elements past N,
 * only those are copied and the tail of the result stays zero-filled.
 * Returns NULL on bad arguments or allocation/copy failure. */
PUBLIC Buffer buffer_slice(Buffer buffer, size_t n, size_t max_length)
{
  if(buffer == NULL || n >= buffer->length) {
    return NULL;
  }

  Buffer out = buffer_new(max_length, 0.0);
  if(out == NULL) return NULL;

  if((n + max_length) >= buffer->length) {
    max_length = buffer->length - n;
  }

  cudaError_t err = cudaMemcpy(out->data, buffer->data + n, max_length * sizeof(double), cudaMemcpyDeviceToDevice);
  if(err != cudaSuccess) {
    buffer_free(out);
    return NULL;
  }
  return out;
}

/* Like buffer_slice, but copies the clipped range straight into host
 * memory OUT. */
PUBLIC cudaError_t buffer_host_slice(Buffer buffer, void *out, size_t n, size_t max_length)
{
  if(buffer == NULL || n >= buffer->length) {
    return cudaErrorUnknown;
  }

  if((n + max_length) >= buffer->length) {
    max_length = buffer->length - n;
  }
  return cudaMemcpy(out, buffer->data + n, max_length * sizeof(double), cudaMemcpyDeviceToHost);
}

/* Number of elements in B, or 0 for a NULL buffer. */
PUBLIC size_t buffer_length(const Buffer b)
{
  return (b != NULL) ? b->length : 0;
}
|
359
|
+
|
360
|
+
/* Copy an OUT_WIDTH x OUT_HEIGHT window anchored at (x, y) out of a
 * row-major WIDTH x HEIGHT matrix stored in IN.  Rows (or row parts) that
 * fall outside the source are left holding EMPTY.
 * Fixes: a NULL input buffer is rejected, and when x <= -out_width the
 * clipped row width went non-positive so the (size_t) byte count passed to
 * cudaMemcpy wrapped around — such rows are now skipped. */
PUBLIC Buffer buffer_slice_2d(const Buffer in, int width, int height, int x, int y, int out_width, int out_height, double empty)
{
  if(in == NULL) return NULL;

  Buffer out = buffer_new(out_width * out_height, empty);
  if(out == NULL) return NULL;

  if(y >= height || x >= width) {
    return out;   /* window entirely right of / below the source */
  }

  for(int i = 0; i < out_height; i++) {
    int oi = i * out_width;   /* start of this row in the output */
    int w = out_width;        /* elements to copy for this row    */
    int iy = y + i;
    int ii = iy * width + x;  /* start of this row in the input   */

    if(x < 0) {
      /* Window hangs off the left edge: shrink the copy and move both
       * cursors right by the overhang. */
      ii = ii - x;
      w = w + x;
      oi = oi - x;
    } else if(x + w > width) {
      w = width - x;
    }

    if(w <= 0) {
      continue;   /* row entirely outside the source horizontally */
    }
    if(iy < 0) {
      continue;   /* above the source: leave the EMPTY fill */
    } else if(iy >= height || ii >= in->length) {
      break;      /* past the bottom: nothing left to copy */
    }

    cudaError_t err = cudaMemcpy(out->data + oi, in->data + ii, w * sizeof(double), cudaMemcpyDeviceToDevice);
    if(err != cudaSuccess) {
      buffer_free(out);
      return NULL;
    }
  }

  return out;
}
|
398
|
+
|
399
|
+
/* Shared implementation of the 2-D copies: place a SRC_WIDTH x SRC_HEIGHT
 * block of doubles at (x, y) inside DEST, treating DEST as a row-major
 * matrix DEST_WIDTH columns wide.  KIND selects host-to-device or
 * device-to-device.  The copy is clipped against DEST's right and bottom
 * edges; a window that cannot fit at all succeeds without copying. */
cudaError_t buffer_set2d_inner(Buffer dest, size_t dest_width, const void *src, size_t src_width, size_t src_height, size_t x, size_t y, cudaMemcpyKind kind)
{
  if(dest == NULL
     || dest->data == NULL
     || dest_width == 0
     || src == NULL
     || src_width == 0
     || src_height == 0) {
    return cudaErrorInvalidValue;
  }

  /* Clip against the right edge. */
  size_t copy_width = src_width;
  if(x + src_width > dest_width) {
    copy_width = dest_width - x;
  }

  /* Clip against the bottom edge. */
  size_t dest_height = dest->length / dest_width;
  size_t copy_height = src_height;
  if(y + src_height > dest_height) {
    copy_height = dest_height - y;
  }

  size_t idx = y * dest_width + x;
  if(idx >= dest->length
     || (idx + copy_width * copy_height) > dest->length) {
    return cudaSuccess;   /* nothing that can be copied safely */
  }

  return cudaMemcpy2D(dest->data + idx,
                      dest_width * sizeof(double),
                      src, src_width * sizeof(double),
                      copy_width * sizeof(double),
                      copy_height,
                      kind);
}

/* Device-to-device 2-D copy; SRC's height is derived from its length,
 * which must be an exact multiple of SRC_WIDTH. */
PUBLIC cudaError_t buffer_set2d(Buffer dest, size_t dest_width, const Buffer src, size_t src_width, size_t x, size_t y)
{
  if(dest->length == 0
     || src->length == 0
     || src_width == 0
     || src->length % src_width > 0) {
    return cudaErrorInvalidValue;
  }
  return buffer_set2d_inner(dest, dest_width, src->data, src_width, src->length / src_width, x, y, cudaMemcpyDeviceToDevice);
}

/* Host-to-device 2-D copy with an explicit source height. */
PUBLIC cudaError_t buffer_set2dv(Buffer dest, size_t dest_width, const void *src, size_t src_width, size_t src_height, size_t x, size_t y)
{
  return buffer_set2d_inner(dest, dest_width, src, src_width, src_height, x, y, cudaMemcpyHostToDevice);
}
|
448
|
+
|
449
|
+
typedef double (*ReduceOp)(double a, double b);
typedef void (*ReduceKernel)(const double *, double *, size_t, size_t);

/* Expands to three definitions for one reduction:
 *   NAME_initial_value  - the fold's starting/identity value,
 *   NAME_op(a, b)       - the host-side combiner returning OP,
 *   NAME (__global__)   - a shared-memory tree reduction: each block loads
 *                         one element per thread (padding with INITIAL past
 *                         the end), pairwise combines with OP while halving
 *                         the stride, and thread 0 stores the block result
 *                         in partial_sums[blockIdx.x].
 * NOTE(review): the halving loop assumes blockDim.x is a power of two —
 * confirm launches always use such a block size. */
#define REDUCE_KERNEL(NAME, OP, INITIAL) \
  double NAME ## _initial_value = INITIAL; \
  double NAME ## _op(double a, double b) \
  { \
    return OP; \
  } \
  \
  __global__ void NAME(const double *data, double *partial_sums, size_t length, size_t offset) \
  { \
    extern __shared__ double sdata[]; \
    int i = offset + grid(1); \
    int n; \
    \
    sdata[threadIdx.x] = (i < length)? data[i] : INITIAL;\
    __syncthreads();\
    \
    for(n = blockDim.x / 2; n > 0; n = n / 2) {\
      if(threadIdx.x < n) {\
        double a = sdata[threadIdx.x]; \
        double b = sdata[threadIdx.x + n]; \
        sdata[threadIdx.x] = OP;\
      }\
      __syncthreads();\
    }\
    \
    if(threadIdx.x == 0) { \
      partial_sums[blockIdx.x] = sdata[0]; \
    } \
  }
|
481
|
+
|
482
|
+
/* Reduce B to a single double: run the block-level REDUCE_KERNEL over the
 * buffer in grid-sized chunks, folding each chunk's per-block partial
 * results on the host with OP, starting from INITIAL.
 *
 * Fixes over the previous revision:
 *  - b->length was read before the NULL check;
 *  - the partial buffer held _block_size entries while the kernel writes
 *    one entry per *block* (up to usable_grid), overflowing it whenever
 *    the grid was wider than a block;
 *  - per-chunk offsets advanced by _block_size elements instead of the
 *    usable_grid * _block_size elements a chunk covers, and each chunk
 *    overwrote the previous chunk's partials before they were read — the
 *    partials are now folded into the running value after every chunk.
 */
double launch_reduce_inner(ReduceOp op, ReduceKernel reduce_kernel, const Buffer b, double initial)
{
  if(b == NULL || b->length == 0) {
    return NAN;
  }

  int grid_size = (b->length + _block_size - 1) / _block_size;
  int usable_grid = (grid_size < _max_grid_size) ? grid_size : _max_grid_size;

  Buffer partial_buffer = buffer_new(usable_grid, initial);
  if(partial_buffer == NULL) return NAN;

  double *partial_sums = (double *)malloc(sizeof(double) * usable_grid);
  if(partial_sums == NULL) {
    buffer_free(partial_buffer);
    return NAN;
  }

  double out = initial;
  size_t chunk_span = (size_t)usable_grid * _block_size;  /* elements per chunk */
  size_t blocks_done = 0;

  for(size_t offset = 0; offset < b->length; offset += chunk_span) {
    /* Only the blocks that still have input participate in the last chunk. */
    int active = grid_size - (int)blocks_done;
    if(active > usable_grid) active = usable_grid;
    blocks_done += active;

    reduce_kernel<<< active, _block_size, _block_size * sizeof(double) >>>(b->data, partial_buffer->data, b->length, offset);

    /* Fold this chunk's per-block results into the running value. */
    buffer_get(partial_buffer, partial_sums, active);
    for(int i = 0; i < active; i++) {
      out = op(out, partial_sums[i]);
    }
  }

  buffer_free(partial_buffer);
  free(partial_sums);

  return out;
}
|
524
|
+
|
525
|
+
/* Pair a REDUCE_KERNEL's generated kernel with its host op and identity
 * value when calling launch_reduce_inner. */
#define launch_reduce(kernel, buffer) launch_reduce_inner(kernel ## _op, kernel, buffer, kernel ## _initial_value)


REDUCE_KERNEL(buffer_sum_kernel, a + b, 0.0);

/* Sum of every element of B (NAN for NULL/empty buffers). */
PUBLIC double buffer_sum(const Buffer b)
{
  return launch_reduce(buffer_sum_kernel, b);
}

REDUCE_KERNEL(buffer_min_kernel, fmin(a, b), NAN);

/* Smallest element of B; fmin ignores the NAN padding/identity. */
PUBLIC double buffer_min(const Buffer b)
{
  return launch_reduce(buffer_min_kernel, b);
}

REDUCE_KERNEL(buffer_max_kernel, fmax(a, b), NAN);

/* Largest element of B; fmax ignores the NAN padding/identity. */
PUBLIC double buffer_max(const Buffer b)
{
  return launch_reduce(buffer_max_kernel, b);
}
|
547
|
+
|
548
|
+
/* Expands to an element-wise kernel (buffer_NAME_inner) plus its PUBLIC
 * host wrapper buffer_NAME(a, b).  OPERATION runs once per in-range index
 * i with the inputs named a/b and the result buffer named out; the wrapper
 * allocates the result and launches via launch_kernel. */
#define BINARY_OP(name, operation) \
  __global__ void buffer_ ## name ## _inner(int len, double *out, const double *a, const double *b, int grid_offset, void *) \
  { \
    int i = grid_offset + grid(1); \
    if(i < len) { \
      operation; \
    } \
  } \
  \
  PUBLIC Buffer buffer_##name(const Buffer a, const Buffer b) \
  { \
    return launch_kernel(buffer_ ## name ## _inner, a, b, NULL); \
  }

/* Element-wise arithmetic. */
BINARY_OP(add, { out[i] = a[i] + b[i]; });
BINARY_OP(sub, { out[i] = a[i] - b[i]; });
BINARY_OP(mul, { out[i] = a[i] * b[i]; });
BINARY_OP(pow, { out[i] = pow(a[i], b[i]); });
BINARY_OP(div, { out[i] = a[i] / b[i]; });
/* Element-wise predicates: the result holds 1.0 where the comparison
 * holds, 0.0 elsewhere. */
BINARY_OP(collect_eq, { out[i] = a[i] == b[i]; });
BINARY_OP(collect_neq, { out[i] = a[i] != b[i]; });
BINARY_OP(collect_lt, { out[i] = a[i] < b[i]; });
BINARY_OP(collect_lte, { out[i] = a[i] <= b[i]; });
BINARY_OP(collect_gt, { out[i] = a[i] > b[i]; });
BINARY_OP(collect_gte, { out[i] = a[i] >= b[i]; });
|
573
|
+
|
574
|
+
/* Like BINARY_OP, but the second operand is a host scalar b broadcast over
 * every element; generates buffer_NAMEd_inner and the PUBLIC wrapper
 * buffer_NAMEd(a, b). */
#define SCALAR_OP(name, operation) \
  __global__ void buffer_ ## name ## d_inner(int len, double *out, const double *a, const double b, int grid_offset) \
  { \
    int i = grid_offset + grid(1); \
    if(i < len) { \
      operation; \
    } \
  } \
  \
  PUBLIC Buffer buffer_##name ## d(const Buffer a, double b) \
  { \
    return launchd_kernel(buffer_ ## name ## d_inner, a, b, 0); \
  }

/* Scalar arithmetic. */
SCALAR_OP(add, { out[i] = a[i] + b; });
SCALAR_OP(sub, { out[i] = a[i] - b; });
SCALAR_OP(mul, { out[i] = a[i] * b; });
SCALAR_OP(pow, { out[i] = pow(a[i], b); });
SCALAR_OP(div, { out[i] = a[i] / b; });
/* Scalar predicates: 1.0 where the comparison holds, 0.0 elsewhere. */
SCALAR_OP(collect_eq, { out[i] = a[i] == b; });
SCALAR_OP(collect_neq, { out[i] = a[i] != b; });
SCALAR_OP(collect_lt, { out[i] = a[i] < b; });
SCALAR_OP(collect_lte, { out[i] = a[i] <= b; });
SCALAR_OP(collect_gt, { out[i] = a[i] > b; });
SCALAR_OP(collect_gte, { out[i] = a[i] >= b; });
|
599
|
+
|
600
|
+
/* Whole-buffer equality: collect a 1.0/0.0 flag per element on the GPU,
 * then check that the flags sum to the full length.  Returns 0 when the
 * comparison could not be run (mismatched lengths, NULL, OOM). */
PUBLIC int buffer_eq(const Buffer a, const Buffer b)
{
  Buffer matches = buffer_collect_eq(a, b);
  if(matches == NULL) {
    return 0;
  }

  double total = buffer_sum(matches);
  buffer_free(matches);

  return total == a->length;
}
|
616
|
+
|
617
|
+
/* Expands to a unary element-wise kernel and its PUBLIC wrapper
 * buffer_NAME(a): OPERATION runs once per in-range index i, reading a[i]
 * and writing out[i].  The scalar slot of launchd_kernel is unused (0.0). */
#define FUNCTION_OP(name, operation) \
  __global__ void buffer_ ## name ## _inner(int len, double *out, const double *a, const double b, int grid_offset) \
  { \
    int i = grid_offset + grid(1); \
    if(i < len) { \
      operation; \
    } \
  } \
  \
  PUBLIC Buffer buffer_##name(const Buffer a) \
  { \
    return launchd_kernel(buffer_ ## name ## _inner, a, 0.0, 0); \
  }


/* NOTE(review): abs() on a double relies on the C++/CUDA overload — fabs
 * would be the unambiguous spelling; confirm before changing. */
FUNCTION_OP(abs, { out[i] = abs(a[i]); });
FUNCTION_OP(exp, { out[i] = exp(a[i]); });
FUNCTION_OP(log, { out[i] = log(a[i]); });
FUNCTION_OP(log10, { out[i] = log10(a[i]); });
FUNCTION_OP(log2, { out[i] = log2(a[i]); });
FUNCTION_OP(sqrt, { out[i] = sqrt(a[i]); });
FUNCTION_OP(floor, { out[i] = floor(a[i]); });
FUNCTION_OP(ceil, { out[i] = ceil(a[i]); });
FUNCTION_OP(round, { out[i] = round(a[i]); });
FUNCTION_OP(sin, { out[i] = sin(a[i]); });
FUNCTION_OP(cos, { out[i] = cos(a[i]); });
FUNCTION_OP(tan, { out[i] = tan(a[i]); });
FUNCTION_OP(asin, { out[i] = asin(a[i]); });
FUNCTION_OP(acos, { out[i] = acos(a[i]); });
FUNCTION_OP(atan, { out[i] = atan(a[i]); });
FUNCTION_OP(sinh, { out[i] = sinh(a[i]); });
FUNCTION_OP(cosh, { out[i] = cosh(a[i]); });
FUNCTION_OP(tanh, { out[i] = tanh(a[i]); });
FUNCTION_OP(asinh, { out[i] = asinh(a[i]); });
FUNCTION_OP(acosh, { out[i] = acosh(a[i]); });
FUNCTION_OP(atanh, { out[i] = atanh(a[i]); });
/* 1.0/0.0 flags for non-finite values. */
FUNCTION_OP(collect_nan, { out[i] = isnan(a[i]); });
FUNCTION_OP(collect_inf, { out[i] = isinf(a[i]); });
|
655
|
+
|
656
|
+
/* One thread per output element: out[row][col] = dot(row of A, col of B).
 * Fix: the store is now inside the bounds check; previously out-of-range
 * threads computed sum = 0.0 and wrote it to an out-of-range index. */
__global__ void buffer_dot_inner(double *out, const double *a, const double *b, size_t aw, size_t ah, size_t bw, size_t bh)
{
  size_t row = blockIdx.y * blockDim.y + threadIdx.y;
  size_t col = blockIdx.x * blockDim.x + threadIdx.x;

  if (row < ah && col < bw) {
    double sum = 0.0;
    for (size_t i = 0; i < bh; i++) {
      sum += a[row * aw + i] * b[i * bw + col];
    }
    out[row * bw + col] = sum;
  }
}

/* Matrix product of A (aw columns x ah rows) and B (bw x bh), returning a
 * new ah x bw buffer, or NULL on inconsistent arguments or allocation
 * failure.
 * Fix: the validation joined its checks with && so a call was only
 * rejected when every check failed at once; any single inconsistency (bad
 * A size, bad B size, or aw != bh) must reject, so the tests are now
 * OR-ed, and NULL operands are rejected up front. */
PUBLIC Buffer buffer_dot(const Buffer a, size_t aw, size_t ah, const Buffer b, size_t bw, size_t bh)
{
  if(a == NULL || b == NULL
     || aw * ah != a->length
     || bw * bh != b->length
     || aw != bh) {
    return NULL;
  }

  Buffer out = buffer_new(ah * bw, 0.0);
  if(out == NULL) return NULL;

  dim3 dim(bw, ah);
  buffer_dot_inner<<< dim, 1 >>>(out->data, a->data, b->data, aw, ah, bw, bh);
  return out;
}
|
686
|
+
|
687
|
+
/* One thread per cell: write 1.0 on the diagonal, 0.0 elsewhere. */
__global__ void buffer_identity_inner(double *data, size_t w, size_t h)
{
  size_t row = blockIdx.y * blockDim.y + threadIdx.y;
  size_t col = blockIdx.x * blockDim.x + threadIdx.x;

  if(row < h && col < w) {
    data[row * w + col] = (double)(row == col);
  }
}

/* Build a w x h identity matrix on the device.
 * Fix: the buffer_new result is checked before its data pointer is used
 * (previously a failed allocation was dereferenced). */
PUBLIC Buffer buffer_identity(size_t w, size_t h)
{
  Buffer ident = buffer_new(w * h, 0.0);
  if(ident == NULL) return NULL;

  dim3 dim(w, h);
  buffer_identity_inner<<< dim, 1 >>>(ident->data, w, h);
  return ident;
}
|
704
|
+
|
705
|
+
/* One thread per cell of the len x len output: copy A onto the diagonal,
 * zero everywhere else. */
__global__ void buffer_diagflat_inner(double *data, const double *a, size_t len)
{
  size_t row = blockIdx.y * blockDim.y + threadIdx.y;
  size_t col = blockIdx.x * blockDim.x + threadIdx.x;

  if(row >= len || col >= len) {
    return;
  }

  data[row * len + col] = (row == col) ? a[row] : 0.0;
}

/* Build a square matrix with A's elements on the diagonal (numpy's
 * diagflat).
 * Fix: NULL input and a failed buffer_new are rejected instead of
 * dereferenced. */
PUBLIC Buffer buffer_diagflat(const Buffer a)
{
  if(a == NULL) return NULL;

  Buffer result = buffer_new(a->length * a->length, 0.0);
  if(result == NULL) return NULL;

  dim3 dim(a->length, a->length);
  buffer_diagflat_inner<<< dim, 1 >>>(result->data, a->data, a->length);
  return result;
}
|