cumo 0.1.2 → 0.2.0
- checksums.yaml +4 -4
- data/ext/cumo/include/cumo.h +2 -2
- data/ext/cumo/include/cumo/cuda/runtime.h +3 -1
- data/ext/cumo/include/cumo/narray.h +20 -4
- data/ext/cumo/include/cumo/reduce_kernel.h +10 -7
- data/ext/cumo/narray/data.c +35 -12
- data/ext/cumo/narray/gen/tmpl/accum.c +6 -1
- data/ext/cumo/narray/gen/tmpl/accum_index.c +6 -1
- data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +2 -2
- data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +2 -2
- data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +4 -4
- data/ext/cumo/narray/index.c +55 -16
- data/ext/cumo/narray/narray.c +31 -24
- data/ext/cumo/narray/ndloop.c +6 -2
- data/ext/cumo/narray/struct.c +7 -5
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4689762e94e91b3f359190225d9841fa8af8fb7eeff2aef028a0702b00f0787f
+  data.tar.gz: 0b3bf965295354246b19c8aaee26eb454d0a54f7a3ec4b01cf893688ce7fd19b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0313c37d6a9b19ae026f3831d2d92b160213a3d8a29a8b7bc63c8ef0123e0b5581e6d935c0de2ce9c36c2850820c7afee9fac19e40d6ebdeb6c983766c639ea3
+  data.tar.gz: 7595ee6b049e4a8697840bf2502db04e93fca86c3604e3bb2ed0505ed0f5c5647148cc1ee0e0cbc71ea63ed9e1592330a9a63c444c41791d9a75c2f4de2396d7
data/ext/cumo/include/cumo.h
CHANGED
@@ -41,7 +41,9 @@ static inline bool
 cumo_cuda_runtime_is_device_memory(void* ptr)
 {
     struct cudaPointerAttributes attrs;
-    cudaError_t status = cudaPointerGetAttributes(&attrs, ptr);
+    cudaError_t status;
+    if (!ptr) { return false; }
+    status = cudaPointerGetAttributes(&attrs, ptr);
     cudaGetLastError(); // reset last error to success
     return (status != cudaErrorInvalidValue);
 }
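Note: cudaPointerGetAttributes only recognizes pointers CUDA allocated or registered; for an ordinary host pointer it returns cudaErrorInvalidValue and also sets the sticky last-error state, which is why the probe resets it with cudaGetLastError(). The new guard short-circuits on NULL, which is never device memory. A self-contained sketch of the same probe (the test pointers are illustrative, not from the gem):

#include <cuda_runtime.h>
#include <cstdio>

// Same heuristic as cumo_cuda_runtime_is_device_memory: a pointer counts as
// device memory iff cudaPointerGetAttributes recognizes it.
static bool is_device_memory(void* ptr) {
    cudaPointerAttributes attrs;
    if (!ptr) return false;                        // NULL is never device memory
    cudaError_t status = cudaPointerGetAttributes(&attrs, ptr);
    cudaGetLastError();                            // clear the sticky error state
    return status != cudaErrorInvalidValue;
}

int main(void) {
    int host_buf[4];
    void* dev_buf = nullptr;
    cudaMalloc(&dev_buf, 4 * sizeof(int));
    std::printf("host:%d device:%d null:%d\n",
                is_device_memory(host_buf),        // 0
                is_device_memory(dev_buf),         // 1
                is_device_memory(nullptr));        // 0
    cudaFree(dev_buf);
    return 0;
}

One caveat: from CUDA 11 onward, cudaPointerGetAttributes reports unregistered host pointers as cudaSuccess with type cudaMemoryTypeUnregistered, so the error-based test is specific to CUDA versions current when this release shipped.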
data/ext/cumo/include/cumo/narray.h
CHANGED
@@ -427,10 +427,26 @@ _cumo_na_get_narray_t(VALUE obj, unsigned char cumo_na_type)

 #define CUMO_DEBUG_PRINT(v) puts(StringValueCStr(rb_funcall(v,rb_intern("inspect"),0)))

-#define CUMO_NA_CumoIsNArray(obj)
-
-
-
+#define CUMO_NA_CumoIsNArray(obj) (rb_obj_is_kind_of(obj,cNArray)==Qtrue)
+#define CUMO_NA_IsArray(obj) (TYPE(obj)==T_ARRAY || rb_obj_is_kind_of(obj,cNArray)==Qtrue)
+
+static inline bool
+cumo_na_has_idx_p(VALUE obj)
+{
+    cumo_narray_t *na;
+    cumo_narray_view_t *nv;
+    int i = 0;
+    CumoGetNArray(obj, na);
+    if (CUMO_NA_TYPE(na) == CUMO_NARRAY_VIEW_T) {
+        CumoGetNArrayView(obj, nv);
+        for (; i < nv->base.ndim; ++i) {
+            if (nv->stridx[i].index) {
+                return true;
+            }
+        }
+    }
+    return false;
+}

 #define CUMO_NUM2REAL(v) NUM2DBL( rb_funcall((v),cumo_na_id_real,0) )
 #define CUMO_NUM2IMAG(v) NUM2DBL( rb_funcall((v),cumo_na_id_imag,0) )
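Note: cumo_na_has_idx_p answers one question: does any dimension of a view address its elements through an explicit index vector (the result of fancy indexing) rather than a constant stride? The reduction templates further down use it to decide whether a contiguous copy is needed first. A simplified, self-contained sketch of the stride-or-index layout it inspects; the types and the explicit tag field are simplifications, not cumo's exact definitions (cumo tags inside the union itself and tests it with CUMO_SDX_IS_INDEX):

#include <stdbool.h>
#include <stddef.h>

typedef struct {
    bool is_index;         // simplified tag; cumo encodes this in the union
    union {
        ptrdiff_t stride;  // regular view: offset = stride * k
        size_t *index;     // fancy-indexed view: offset = index[k], array on GPU
    } u;
} stridx_t;

typedef struct { int ndim; stridx_t *stridx; } view_t;

// True iff any dimension is backed by an index vector rather than a stride.
static bool has_idx(const view_t *v) {
    for (int i = 0; i < v->ndim; ++i) {
        if (v->stridx[i].is_index) return true;   // one index-backed dim suffices
    }
    return false;
}

int main(void) {
    stridx_t dims[2] = {
        { .is_index = false, .u = { .stride = 8 } },
        { .is_index = true,  .u = { .index = (size_t[]){2, 0, 1} } },
    };
    view_t v = { 2, dims };
    return has_idx(&v) ? 0 : 1;   // index-backed dim found -> 0
}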
data/ext/cumo/include/cumo/reduce_kernel.h
CHANGED
@@ -33,7 +33,7 @@ __global__ static void reduction_kernel(cumo_na_reduction_arg_t arg, int out_blo
     cumo_na_indexer_t& in_indexer = arg.in_indexer;
     cumo_na_indexer_t& out_indexer = arg.out_indexer;

-    using TypeReduce = decltype(impl.Identity());
+    using TypeReduce = decltype(impl.Identity(0));

     extern __shared__ __align__(8) char sdata_raw[];
     TypeReduce* sdata = reinterpret_cast<TypeReduce*>(sdata_raw);
@@ -48,14 +48,17 @@ __global__ static void reduction_kernel(cumo_na_reduction_arg_t arg, int out_blo

     for (int64_t i_out = out_base + out_offset; i_out < out_indexer.total_size; i_out += out_stride) {
         cumo_na_indexer_set_dim(&out_indexer, i_out);
-        TypeReduce accum = impl.Identity();
-
         int64_t i_in = i_out * reduce_indexer_total_size + reduce_offset;
+
+        // Note that spec of (min|max)_index of cumo is different with arg(min|max) of cupy.
+        // Cumo returns index of input elements, CuPy returns index of reduction axis.
+        cumo_na_indexer_set_dim(&in_indexer, i_in);
+        TypeIn* in_ptr = reinterpret_cast<TypeIn*>(cumo_na_iarray_at_dim(&in_iarray, &in_indexer));
+        TypeReduce accum = impl.Identity(in_ptr - reinterpret_cast<TypeIn*>(in_iarray.ptr));
+
         for (int64_t i_reduce = reduce_offset; i_reduce < reduce_indexer_total_size; i_reduce += reduce_block_size, i_in += reduce_block_size) {
             cumo_na_indexer_set_dim(&in_indexer, i_in);
-
-            // Note that spec of (min|max)_index of cumo is different with arg(min|max) of cupy.
-            // Cumo returns index of input elements, CuPy returns index of reduction axis.
+            in_ptr = reinterpret_cast<TypeIn*>(cumo_na_iarray_at_dim(&in_iarray, &in_indexer));
             impl.Reduce(impl.MapIn(*in_ptr, in_ptr - reinterpret_cast<TypeIn*>(in_iarray.ptr)), accum);
             //printf("threadId.x:%d blockIdx.x:%d blockDim.x:%d gridDim.x:%d accum:%d i_in:%ld i_reduce:%ld i_out:%ld in:%p(%d)\n", threadIdx.x, blockIdx.x, blockDim.x, gridDim.x, accum, i_in, i_reduce, i_out, in_ptr, *in_ptr);
         }
@@ -102,7 +105,7 @@ void cumo_reduce(cumo_na_reduction_arg_t arg, ReductionImpl&& impl) {

     int64_t block_size = cumo_detail::max_block_size;
     int64_t grid_size = std::min(cumo_detail::max_grid_size, out_block_num);
-    int64_t shared_mem_size = sizeof(decltype(impl.Identity())) * block_size;
+    int64_t shared_mem_size = sizeof(decltype(impl.Identity(0))) * block_size;

     cumo_detail::reduction_kernel<TypeIn,TypeOut,ReductionImpl><<<grid_size, block_size, shared_mem_size>>>(arg, out_block_size, reduce_block_size, impl);
 }
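Note: the signature change from Identity() to Identity(index) is what lets the argmin/argmax reductions start from a valid input position: each thread now seeds its accumulator with the flat index of the first element it will visit instead of a hard-coded placeholder. The relocated comment also pins down the index space: min_index/max_index return positions in the input array, not positions along the reduced axis as CuPy's argmin/argmax do. A self-contained CUDA sketch of the same seeding idea (this is not cumo's kernel; names and launch shape are illustrative):

#include <cuda_runtime.h>
#include <cstdio>
#include <cfloat>

struct MinAndArgMin { float min; long argmin; };

__device__ static MinAndArgMin identity(long index) { return {FLT_MAX, index}; }

__global__ void argmin_kernel(const float* in, long n, MinAndArgMin* out) {
    extern __shared__ MinAndArgMin sdata[];
    long tid = threadIdx.x;
    // Seed with this thread's first index; a thread with no elements keeps the
    // sentinel and loses every comparison below.
    MinAndArgMin accum = identity(tid < n ? tid : 0);
    for (long i = tid; i < n; i += blockDim.x) {
        if (in[i] < accum.min) { accum = {in[i], i}; }
    }
    sdata[tid] = accum;
    __syncthreads();
    for (long s = blockDim.x / 2; s > 0; s >>= 1) {   // assumes power-of-two block
        if (tid < s && sdata[tid + s].min < sdata[tid].min) {
            sdata[tid] = sdata[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0) *out = sdata[0];
}

int main(void) {
    const long n = 1000;
    float* d_in; MinAndArgMin* d_out;
    cudaMalloc(&d_in, n * sizeof(float));
    cudaMalloc(&d_out, sizeof(MinAndArgMin));
    cudaMemset(d_in, 0, n * sizeof(float));           // demo input: all zeros
    const int threads = 256;
    argmin_kernel<<<1, threads, threads * sizeof(MinAndArgMin)>>>(d_in, n, d_out);
    MinAndArgMin result;
    cudaMemcpy(&result, d_out, sizeof(result), cudaMemcpyDeviceToHost);
    std::printf("min=%f argmin=%ld\n", result.min, result.argmin);
    cudaFree(d_in); cudaFree(d_out);
    return 0;
}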
data/ext/cumo/narray/data.c
CHANGED
@@ -1,4 +1,7 @@
 #include <ruby.h>
+#include "cumo.h"
+#include "cumo/cuda/memory_pool.h"
+#include "cumo/cuda/runtime.h"
 #include "cumo/narray.h"
 #include "cumo/template.h"

@@ -56,7 +59,8 @@ iter_copy_bytes(cumo_na_loop_t *const lp)
 {
     size_t e;
     e = lp->args[0].elmsz;
-
+    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("iter_copy_bytes", "any");
+    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
     LOOP_UNARY_PTR(lp,m_memcpy);
 }

@@ -99,6 +103,8 @@ iter_swap_byte(cumo_na_loop_t *const lp)
     e = lp->args[0].elmsz;
     b1 = ALLOCA_N(char, e);
     b2 = ALLOCA_N(char, e);
+    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("iter_swap_bytes", "any");
+    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
     LOOP_UNARY_PTR(lp,m_swap_byte);
 }

@@ -489,10 +495,12 @@ cumo_na_flatten_dim(VALUE self, int sd)
     for (i=0; i<sd; i++) {
         if (CUMO_SDX_IS_INDEX(na1->stridx[i])) {
             idx1 = CUMO_SDX_GET_INDEX(na1->stridx[i]);
-            idx2 = ALLOC_N(size_t, shape[i]);
-            for (j=0; j<shape[i]; j++) {
-                idx2[j] = idx1[j];
-            }
+            // idx2 = ALLOC_N(size_t, shape[i]);
+            // for (j=0; j<shape[i]; j++) {
+            //     idx2[j] = idx1[j];
+            // }
+            idx2 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*shape[i]);
+            cumo_cuda_runtime_check_status(cudaMemcpyAsync(idx2,idx1,sizeof(size_t)*shape[i],cudaMemcpyDeviceToDevice,0));
             CUMO_SDX_SET_INDEX(na2->stridx[i],idx2);
         } else {
             na2->stridx[i] = na1->stridx[i];

@@ -505,7 +513,8 @@ cumo_na_flatten_dim(VALUE self, int sd)
         na2->stridx[sd] = na1->stridx[nd-1];
     } else {
         // set index
-        idx2 = ALLOC_N(size_t, shape[sd]);
+        // idx2 = ALLOC_N(size_t, shape[sd]);
+        idx2 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*shape[sd]);
         CUMO_SDX_SET_INDEX(na2->stridx[sd],idx2);
         // init for md-loop
         fd = nd-sd;

@@ -514,6 +523,8 @@ cumo_na_flatten_dim(VALUE self, int sd)
     pos = ALLOC_N(size_t, fd+1);
     pos[0] = 0;
     // md-loop
+    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_flatten_dim", "any");
+    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
     for (i=j=0;;) {
         for (; i<fd; i++) {
             sdx = na1->stridx[i+sd];

@@ -726,10 +737,12 @@ cumo_na_diagonal(int argc, VALUE *argv, VALUE self)
         if (i != ax[0] && i != ax[1]) {
             if (CUMO_SDX_IS_INDEX(na1->stridx[i])) {
                 idx0 = CUMO_SDX_GET_INDEX(na1->stridx[i]);
-                idx1 = ALLOC_N(size_t, na->shape[i]);
-                for (j=0; j<na->shape[i]; j++) {
-                    idx1[j] = idx0[j];
-                }
+                // idx1 = ALLOC_N(size_t, na->shape[i]);
+                // for (j=0; j<na->shape[i]; j++) {
+                //     idx1[j] = idx0[j];
+                // }
+                idx1 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*na->shape[i]);
+                cumo_cuda_runtime_check_status(cudaMemcpyAsync(idx1,idx0,sizeof(size_t)*na->shape[i],cudaMemcpyDeviceToDevice,0));
                 CUMO_SDX_SET_INDEX(na2->stridx[k],idx1);
             } else {
                 na2->stridx[k] = na1->stridx[i];

@@ -739,7 +752,12 @@ cumo_na_diagonal(int argc, VALUE *argv, VALUE self)
     }
     if (CUMO_SDX_IS_INDEX(na1->stridx[ax[0]])) {
         idx0 = CUMO_SDX_GET_INDEX(na1->stridx[ax[0]]);
-        diag_idx = ALLOC_N(size_t, diag_size);
+        // diag_idx = ALLOC_N(size_t, diag_size);
+        diag_idx = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*diag_size);
+
+        CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_diagonal", "any");
+        cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
         if (CUMO_SDX_IS_INDEX(na1->stridx[ax[1]])) {
             idx1 = CUMO_SDX_GET_INDEX(na1->stridx[ax[1]]);
             for (j=0; j<diag_size; j++) {

@@ -756,7 +774,12 @@ cumo_na_diagonal(int argc, VALUE *argv, VALUE self)
         stride0 = CUMO_SDX_GET_STRIDE(na1->stridx[ax[0]]);
         if (CUMO_SDX_IS_INDEX(na1->stridx[ax[1]])) {
             idx1 = CUMO_SDX_GET_INDEX(na1->stridx[ax[1]]);
-            diag_idx = ALLOC_N(size_t, diag_size);
+            // diag_idx = ALLOC_N(size_t, diag_size);
+            diag_idx = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*diag_size);
+
+            CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_diagonal", "any");
+            cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
             for (j=0; j<diag_size; j++) {
                 diag_idx[j] = stride0*(j+k0) + idx1[j+k1];
             }
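Note: the data.c changes follow two rules that recur through the rest of this release. Where an index vector is only duplicated, the host copy loop becomes a cudaMemcpyAsync device-to-device copy; where the host must actually read index values (the md-loop in na_flatten_dim, the diagonal builders), a cudaDeviceSynchronize() is inserted first and flagged with a FIXME warning macro. The host-side reads also suggest the pool hands back unified (managed) memory here, since the CPU dereferences the same pointer after synchronizing; a standalone sketch of both patterns under that assumption:

#include <cuda_runtime.h>
#include <cstdio>

int main(void) {
    const size_t n = 8;
    size_t *idx1, *idx2;
    cudaMallocManaged(&idx1, sizeof(size_t) * n);
    cudaMallocManaged(&idx2, sizeof(size_t) * n);
    for (size_t j = 0; j < n; ++j) idx1[j] = j * 2;

    // 1) duplicating an index vector: async device-to-device copy,
    //    no host loop and no synchronization needed by the copy itself
    cudaMemcpyAsync(idx2, idx1, sizeof(size_t) * n, cudaMemcpyDeviceToDevice, 0);

    // 2) host-side read (as in na_flatten_dim's md-loop): synchronize first,
    //    so the async copy and any kernels writing idx2 have finished
    cudaDeviceSynchronize();
    std::printf("idx2[3] = %zu\n", idx2[3]);

    cudaFree(idx1);
    cudaFree(idx2);
    return 0;
}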
data/ext/cumo/narray/gen/tmpl/accum.c
CHANGED
@@ -77,7 +77,12 @@ static VALUE
 <% else %>
     reduce = cumo_na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
 <% end %>
-    v = cumo_na_ndloop(&ndf, 2, self, reduce);
+    if (cumo_na_has_idx_p(self)) {
+        VALUE copy = cumo_na_copy(self); // reduction does not support idx, make contiguous
+        v = cumo_na_ndloop(&ndf, 2, copy, reduce);
+    } else {
+        v = cumo_na_ndloop(&ndf, 2, self, reduce);
+    }
 <% if result_class == "cT" %>
     return <%=type_name%>_extract(v);
 <% else %>
data/ext/cumo/narray/gen/tmpl/accum_index.c
CHANGED
@@ -113,7 +113,12 @@ static VALUE
 <% end %>
     }

-    return cumo_na_ndloop(&ndf, 2, self, reduce);
+    if (cumo_na_has_idx_p(self)) {
+        VALUE copy = cumo_na_copy(self); // reduction does not support idx, make conttiguous
+        return cumo_na_ndloop(&ndf, 2, copy, reduce);
+    } else {
+        return cumo_na_ndloop(&ndf, 2, self, reduce);
+    }
 }
 <% end %>
 }
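Note: both accumulation templates gate the reduction on cumo_na_has_idx_p: an index-backed view is first materialized with cumo_na_copy, so the GPU reduction only ever sees stride-addressable input. The cost is one gather copy; the benefit is that the reduction kernels need no index-vector path at all. A self-contained host-side illustration of that trade (plain C, hypothetical data):

#include <stdio.h>
#include <stdlib.h>

static double sum_contiguous(const double* a, size_t n) {
    double s = 0.0;
    for (size_t i = 0; i < n; ++i) s += a[i];
    return s;
}

int main(void) {
    double data[6] = {10, 20, 30, 40, 50, 60};
    size_t idx[3] = {4, 0, 2};               // an index-backed view: data[[4,0,2]]

    // materialize: gather through the index vector once...
    double* copy = malloc(sizeof(double) * 3);
    for (size_t k = 0; k < 3; ++k) copy[k] = data[idx[k]];

    // ...then every downstream reduction can assume contiguous input
    printf("sum = %g\n", sum_contiguous(copy, 3));   // 80
    free(copy);
    return 0;
}

On the GPU the gather is a kernel rather than a loop, which is exactly what cumo_na_copy already provides.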
data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu
CHANGED
@@ -17,7 +17,7 @@ struct cumo_<%=type_name%>_min_index_int<%=i%>_impl {
         dtype min;
         idx_t argmin;
     };
-    __device__ MinAndArgMin Identity() { return {DATA_MAX,
+    __device__ MinAndArgMin Identity(idx_t index) { return {DATA_MAX, index}; }
     __device__ MinAndArgMin MapIn(dtype in, idx_t index) { return {in, index}; }
     __device__ void Reduce(MinAndArgMin next, MinAndArgMin& accum) {
         if (accum.min > next.min) {
@@ -32,7 +32,7 @@ struct cumo_<%=type_name%>_max_index_int<%=i%>_impl {
         dtype max;
         idx_t argmax;
     };
-    __device__ MaxAndArgMax Identity() { return {DATA_MIN,
+    __device__ MaxAndArgMax Identity(idx_t index) { return {DATA_MIN, index}; }
     __device__ MaxAndArgMax MapIn(dtype in, idx_t index) { return {in, index}; }
     __device__ void Reduce(MaxAndArgMax next, MaxAndArgMax& accum) {
         if (accum.max < next.max) {
data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu
CHANGED
@@ -6,14 +6,14 @@
 #endif

 struct cumo_<%=type_name%>_sum_impl {
-    __device__ <%=dtype%> Identity() { return m_zero; }
+    __device__ <%=dtype%> Identity(int64_t /*index*/) { return m_zero; }
     __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
     __device__ void Reduce(dtype next, <%=dtype%>& accum) { accum = m_add(next, accum); }
     __device__ <%=dtype%> MapOut(<%=dtype%> accum) { return accum; }
 };

 struct cumo_<%=type_name%>_prod_impl {
-    __device__ <%=dtype%> Identity() { return m_one; }
+    __device__ <%=dtype%> Identity(int64_t /*index*/) { return m_one; }
     __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
     __device__ void Reduce(dtype next, <%=dtype%>& accum) { accum = m_mul(next, accum); }
     __device__ <%=dtype%> MapOut(<%=dtype%> accum) { return accum; }
data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu
CHANGED
@@ -6,28 +6,28 @@
 #endif

 struct cumo_<%=type_name%>_sum_impl {
-    __device__ <%=dtype%> Identity() { return m_zero; }
+    __device__ <%=dtype%> Identity(int64_t /*index*/) { return m_zero; }
     __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
     __device__ void Reduce(dtype next, <%=dtype%>& accum) { accum += next; }
     __device__ <%=dtype%> MapOut(<%=dtype%> accum) { return accum; }
 };

 struct cumo_<%=type_name%>_prod_impl {
-    __device__ <%=dtype%> Identity() { return m_one; }
+    __device__ <%=dtype%> Identity(int64_t /*index*/) { return m_one; }
     __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
     __device__ void Reduce(dtype next, <%=dtype%>& accum) { accum *= next; }
     __device__ <%=dtype%> MapOut(<%=dtype%> accum) { return accum; }
 };

 struct cumo_<%=type_name%>_min_impl {
-    __device__ dtype Identity() { return DATA_MAX; }
+    __device__ dtype Identity(int64_t /*index*/) { return DATA_MAX; }
     __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
     __device__ void Reduce(dtype next, dtype& accum) { accum = next < accum ? next : accum; }
     __device__ dtype MapOut(dtype accum) { return accum; }
 };

 struct cumo_<%=type_name%>_max_impl {
-    __device__ dtype Identity() { return DATA_MIN; }
+    __device__ dtype Identity(int64_t /*index*/) { return DATA_MIN; }
     __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
     __device__ void Reduce(dtype next, dtype& accum) { accum = next < accum ? accum : next; }
     __device__ dtype MapOut(dtype accum) { return accum; }
data/ext/cumo/narray/index.c
CHANGED
@@ -3,6 +3,7 @@
 #include "cumo.h"
 #include "cumo/narray.h"
 #include "cumo/cuda/runtime.h"
+#include "cumo/cuda/memory_pool.h"
 #include "cumo/template.h"

 #if SIZEOF_VOIDP == 8
@@ -52,7 +53,8 @@ print_index_arg(cumo_na_index_arg_t *q, int n)
         printf("  q[%d].n=%"SZF"d\n",i,q[i].n);
         printf("  q[%d].beg=%"SZF"d\n",i,q[i].beg);
         printf("  q[%d].step=%"SZF"d\n",i,q[i].step);
-        printf("  q[%d].idx=0x%"SZF"x\n",i,(size_t)q[i].idx);
+        printf("  q[%d].idx=0x%"SZF"x (cuda:%d)\n",i,(size_t)q[i].idx, cumo_cuda_runtime_is_device_memory(q[i].idx));
+        // printf("  q[%d].idx=0x%"SZF"x\n",i,(size_t)q[i].idx);
         printf("  q[%d].reduce=0x%x\n",i,q[i].reduce);
         printf("  q[%d].orig_dim=%d\n",i,q[i].orig_dim);
     }
@@ -121,15 +123,38 @@ cumo_na_range_check(ssize_t pos, ssize_t size, int dim)
     return idx;
 }

+static void CUDART_CB
+cumo_na_parse_array_callback(cudaStream_t stream, cudaError_t status, void *data)
+{
+    cudaFreeHost(data);
+}
+
+// copy ruby array to idx
 static void
 cumo_na_parse_array(VALUE ary, int orig_dim, ssize_t size, cumo_na_index_arg_t *q)
 {
     int k;
+    size_t* idx;
+    cudaError_t status;
     int n = RARRAY_LEN(ary);
-    q->idx = ALLOC_N(size_t, n);
+    //q->idx = ALLOC_N(size_t, n);
+    //for (k=0; k<n; k++) {
+    //    q->idx[k] = na_range_check(NUM2SSIZET(RARRAY_AREF(ary,k)), size, orig_dim);
+    //}
+    // make a contiguous pinned memory on host => copy to device => release pinned memory after copy finished on callback
+    q->idx = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*n);
+    cudaHostAlloc((void**)&idx, sizeof(size_t)*n, cudaHostAllocDefault);
     for (k=0; k<n; k++) {
-        q->idx[k] = cumo_na_range_check(NUM2SSIZET(RARRAY_AREF(ary,k)), size, orig_dim);
+        idx[k] = cumo_na_range_check(NUM2SSIZET(RARRAY_AREF(ary,k)), size, orig_dim);
+    }
+    status = cudaMemcpyAsync(q->idx,idx,sizeof(size_t)*n,cudaMemcpyHostToDevice,0);
+    if (status == 0) {
+        cumo_cuda_runtime_check_status(cudaStreamAddCallback(0,cumo_na_parse_array_callback,idx,0));
+    } else {
+        cudaFreeHost(idx);
     }
+    cumo_cuda_runtime_check_status(status);
+
     q->n = n;
     q->beg = 0;
     q->step = 1;
@@ -137,13 +162,14 @@ cumo_na_parse_array(VALUE ary, int orig_dim, ssize_t size, cumo_na_index_arg_t *
     q->orig_dim = orig_dim;
 }

+// copy narray to idx
 static void
 cumo_na_parse_narray_index(VALUE a, int orig_dim, ssize_t size, cumo_na_index_arg_t *q)
 {
     VALUE idx;
     cumo_narray_t *na;
     cumo_narray_data_t *nidx;
-    size_t k, n;
+    size_t n;
     ssize_t *nidxp;

     CumoGetNArray(a,na);
@@ -155,16 +181,14 @@ cumo_na_parse_narray_index(VALUE a, int orig_dim, ssize_t size, cumo_na_index_ar
     cumo_na_store(idx,a);

     CumoGetNArrayData(idx,nidx);
-    nidxp = (ssize_t*)nidx->ptr;
-    q->idx = ALLOC_N(size_t, n);
-
-    //
-
-
+    nidxp = (ssize_t*)nidx->ptr; // Cumo::NArray data resides on GPU
+    //q->idx = ALLOC_N(size_t, n);
+    //for (k=0; k<n; k++) {
+    //    q->idx[k] = na_range_check(nidxp[k], size, orig_dim);
+    //}
+    q->idx = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*n);
+    cumo_cuda_runtime_check_status(cudaMemcpyAsync(q->idx,nidxp,sizeof(size_t)*n,cudaMemcpyDeviceToDevice,0));

-    for (k=0; k<n; k++) {
-        q->idx[k] = cumo_na_range_check(nidxp[k], size, orig_dim);
-    }
     q->n = n;
     q->beg = 0;
     q->step = 1;
@@ -401,6 +425,9 @@ cumo_na_index_aref_nadata(cumo_narray_data_t *na1, cumo_narray_view_t *na2,

         // array index
         if (q[i].idx != NULL) {
+            CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_nadata", "any");
+            cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
             index = q[i].idx;
             CUMO_SDX_SET_INDEX(na2->stridx[j],index);
             q[i].idx = NULL;
@@ -456,6 +483,10 @@ cumo_na_index_aref_naview(cumo_narray_view_t *na1, cumo_narray_view_t *na2,
             // index <- index
             int k;
             size_t *index = q[i].idx;
+
+            CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_naview", "any");
+            cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
             CUMO_SDX_SET_INDEX(na2->stridx[j], index);
             q[i].idx = NULL;

@@ -467,6 +498,10 @@ cumo_na_index_aref_naview(cumo_narray_view_t *na1, cumo_narray_view_t *na2,
             // index <- step
             ssize_t stride1 = CUMO_SDX_GET_STRIDE(sdx1);
             size_t *index = q[i].idx;
+
+            CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_naview", "any");
+            cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
             CUMO_SDX_SET_INDEX(na2->stridx[j],index);
             q[i].idx = NULL;

@@ -494,8 +529,13 @@ cumo_na_index_aref_naview(cumo_narray_view_t *na1, cumo_narray_view_t *na2,
             int k;
             size_t beg = q[i].beg;
             ssize_t step = q[i].step;
-            size_t *index = ALLOC_N(size_t, size);
+            // size_t *index = ALLOC_N(size_t, size);
+            size_t *index = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*size);
             CUMO_SDX_SET_INDEX(na2->stridx[j],index);
+
+            CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_naview", "any");
+            cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
             for (k=0; k<size; k++) {
                 index[k] = CUMO_SDX_GET_INDEX(sdx1)[beg+step*k];
             }
@@ -515,7 +555,6 @@ cumo_na_index_aref_naview(cumo_narray_view_t *na1, cumo_narray_view_t *na2,
     na2->base.size = total;
 }

-
 static int
 cumo_na_ndim_new_narray(int ndim, const cumo_na_index_arg_t *q)
 {
@@ -625,7 +664,7 @@ cumo_na_aref_md_ensure(VALUE data_value)
     cumo_na_aref_md_data_t *data = (cumo_na_aref_md_data_t*)(data_value);
     int i;
     for (i=0; i<data->ndim; i++) {
-        xfree(data->q[i].idx);
+        cumo_cuda_runtime_free((char*)(data->q[i].idx));
     }
     if (data->q) xfree(data->q);
     return Qnil;
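Note: cumo_na_parse_array is the most involved change in index.c. Ruby index values are staged into pinned (page-locked) host memory, copied to the device with cudaMemcpyAsync, and the staging buffer is released by a stream callback once the copy has drained, so the call never blocks on the transfer. A self-contained sketch of that lifecycle (buffer names are illustrative):

#include <cuda_runtime.h>
#include <cstdio>

// Runs on a driver thread once all prior work on the stream has finished;
// the point at which the pinned staging buffer may be released.
static void CUDART_CB free_staging(cudaStream_t stream, cudaError_t status, void* data) {
    cudaFreeHost(data);
}

int main(void) {
    const size_t n = 1024;
    size_t *d_idx = nullptr, *h_staging = nullptr;
    cudaMalloc(&d_idx, sizeof(size_t) * n);
    // Pinned memory is what makes cudaMemcpyAsync truly asynchronous
    cudaHostAlloc((void**)&h_staging, sizeof(size_t) * n, cudaHostAllocDefault);
    for (size_t k = 0; k < n; ++k) h_staging[k] = k;   // stand-in for parsed Ruby values

    cudaError_t status = cudaMemcpyAsync(d_idx, h_staging, sizeof(size_t) * n,
                                         cudaMemcpyHostToDevice, 0);
    if (status == cudaSuccess) {
        // Defer the free until the copy has actually consumed the buffer
        cudaStreamAddCallback(0, free_staging, h_staging, 0);
    } else {
        cudaFreeHost(h_staging);                       // copy never enqueued
    }
    // ... enqueue kernels using d_idx on the same stream; no sync needed here
    cudaDeviceSynchronize();                           // only so this demo can exit
    cudaFree(d_idx);
    return status == cudaSuccess ? 0 : 1;
}

Two caveats: cudaStreamAddCallback is the legacy API (cudaLaunchHostFunc supersedes it), and CUDA's documentation forbids CUDA API calls, cudaFreeHost included, inside such callbacks, so the free-in-callback pattern recorded in this diff works in practice but is technically out of spec; a stricter design would queue the pointer and free it from the caller's thread.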
data/ext/cumo/narray/narray.c
CHANGED
@@ -168,12 +168,8 @@ cumo_na_view_free(void* ptr)
     if (na->stridx != NULL) {
         for (i=0; i<na->base.ndim; i++) {
             if (CUMO_SDX_IS_INDEX(na->stridx[i])) {
-                void *p = CUMO_SDX_GET_INDEX(na->stridx[i]);
-                if (cumo_cuda_runtime_is_device_memory(p)) {
-                    cumo_cuda_runtime_free(p);
-                } else {
-                    xfree(p);
-                }
+                void *idx = CUMO_SDX_GET_INDEX(na->stridx[i]);
+                cumo_cuda_runtime_free(idx);
             }
         }
         xfree(na->stridx);
@@ -880,7 +876,6 @@ VALUE
 cumo_na_make_view(VALUE self)
 {
     int i, nd;
-    size_t j;
     size_t *idx1, *idx2;
     ssize_t stride;
     cumo_narray_t *na;
@@ -914,10 +909,12 @@ cumo_na_make_view(VALUE self)
     for (i=0; i<nd; i++) {
         if (CUMO_SDX_IS_INDEX(na1->stridx[i])) {
             idx1 = CUMO_SDX_GET_INDEX(na1->stridx[i]);
-            idx2 = ALLOC_N(size_t,na1->base.shape[i]);
-            for (j=0; j<na1->base.shape[i]; j++) {
-                idx2[j] = idx1[j];
-            }
+            // idx2 = ALLOC_N(size_t,na1->base.shape[i]);
+            // for (j=0; j<na1->base.shape[i]; j++) {
+            //     idx2[j] = idx1[j];
+            // }
+            idx2 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*na1->base.shape[i]);
+            cumo_cuda_runtime_check_status(cudaMemcpyAsync(idx2,idx1,sizeof(size_t)*na1->base.shape[i],cudaMemcpyDeviceToDevice,0));
             CUMO_SDX_SET_INDEX(na2->stridx[i],idx2);
         } else {
             na2->stridx[i] = na1->stridx[i];
@@ -947,8 +944,8 @@ static VALUE
 cumo_na_expand_dims(VALUE self, VALUE vdim)
 {
     int i, j, nd, dim;
-    size_t *shape, *
-    cumo_stridx_t *stridx, *
+    size_t *shape, *na2_shape;
+    cumo_stridx_t *stridx, *na2_stridx;
     cumo_narray_t *na;
     cumo_narray_view_t *na2;
     VALUE view;
@@ -970,25 +967,25 @@ cumo_na_expand_dims(VALUE self, VALUE vdim)

     shape = ALLOC_N(size_t,nd+1);
     stridx = ALLOC_N(cumo_stridx_t,nd+1);
-
-
+    na2_shape = na2->base.shape;
+    na2_stridx = na2->stridx;

     for (i=j=0; i<=nd; i++) {
         if (i==dim) {
             shape[i] = 1;
             CUMO_SDX_SET_STRIDE(stridx[i],0);
         } else {
-            shape[i] =
-            stridx[i] =
+            shape[i] = na2_shape[j];
+            stridx[i] = na2_stridx[j];
             j++;
         }
     }

     na2->stridx = stridx;
-    xfree(
+    xfree(na2_stridx);
     na2->base.shape = shape;
-    if (
-    xfree(
+    if (na2_shape != &(na2->base.size)) {
+        xfree(na2_shape);
     }
     na2->base.ndim++;
     return view;
@@ -1054,15 +1051,25 @@ cumo_na_reverse(int argc, VALUE *argv, VALUE self)
         n = na1->base.shape[i];
         if (CUMO_SDX_IS_INDEX(na1->stridx[i])) {
             idx1 = CUMO_SDX_GET_INDEX(na1->stridx[i]);
-            idx2 = ALLOC_N(size_t,n);
+            // idx2 = ALLOC_N(size_t,n);
+            // if (cumo_na_test_reduce(reduce,i)) {
+            //     for (j=0; j<n; j++) {
+            //         idx2[n-1-j] = idx1[j];
+            //     }
+            // } else {
+            //     for (j=0; j<n; j++) {
+            //         idx2[j] = idx1[j];
+            //     }
+            // }
+            idx2 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*n);
             if (cumo_na_test_reduce(reduce,i)) {
+                CUMO_SHOW_SYNCHRONIZE_WARNING_ONCE("cumo_na_reverse", "any");
+                cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
                 for (j=0; j<n; j++) {
                     idx2[n-1-j] = idx1[j];
                 }
             } else {
-                for (j=0; j<n; j++) {
-                    idx2[j] = idx1[j];
-                }
+                cumo_cuda_runtime_check_status(cudaMemcpyAsync(idx2,idx1,sizeof(size_t)*n,cudaMemcpyDeviceToDevice,0));
             }
             CUMO_SDX_SET_INDEX(na2->stridx[i],idx2);
         } else {
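Note: in cumo_na_reverse only the reversed axes still take the synchronize-and-host-loop path; straight duplication became an async device-to-device copy, and the warning macro marks the remaining host loop as a known cost. A device-side reverse would remove it; a minimal sketch of that alternative (not part of the diff):

// Reverses an index vector on the device, avoiding the host round-trip.
__global__ void reverse_index(const size_t* idx1, size_t* idx2, size_t n) {
    size_t j = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (j < n) {
        idx2[n - 1 - j] = idx1[j];   // same assignment as the host loop
    }
}

// launch: reverse_index<<<(n + 255) / 256, 256>>>(idx1, idx2, n);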
data/ext/cumo/narray/ndloop.c
CHANGED
@@ -164,7 +164,8 @@ print_ndloop(cumo_na_md_loop_t *lp) {
     printf("  &user.args[%d].iter[%d] = 0x%"SZF"x\n", j,i, (size_t)&lp->user.args[j].iter[i]);
     printf("  user.args[%d].iter[%d].pos = %"SZF"u\n", j,i, lp->user.args[j].iter[i].pos);
     printf("  user.args[%d].iter[%d].step = %"SZF"u\n", j,i, lp->user.args[j].iter[i].step);
-    printf("  user.args[%d].iter[%d].idx = 0x%"SZF"x\n", j,i, (size_t)lp->user.args[j].iter[i].idx);
+    printf("  user.args[%d].iter[%d].idx = 0x%"SZF"x (cuda:%d)\n", j,i, (size_t)lp->user.args[j].iter[i].idx, cumo_cuda_runtime_is_device_memory(lp->user.args[j].iter[i].idx));
+    // printf("  user.args[%d].iter[%d].idx = 0x%"SZF"x\n", j,i, (size_t)lp->user.args[j].iter[i].idx);
     }
   }
 //
@@ -174,7 +175,8 @@ print_ndloop(cumo_na_md_loop_t *lp) {
     printf("  &xargs[%d].iter[%d] = 0x%"SZF"x\n", j,i, (size_t)&LITER(lp,i,j));
     printf("  xargs[%d].iter[%d].pos = %"SZF"u\n", j,i, LITER(lp,i,j).pos);
     printf("  xargs[%d].iter[%d].step = %"SZF"u\n", j,i, LITER(lp,i,j).step);
-    printf("  xargs[%d].iter[%d].idx = 0x%"SZF"x\n", j,i, (size_t)LITER(lp,i,j).idx);
+    printf("  xargs[%d].iter[%d].idx = 0x%"SZF"x (cuda:%d)\n", j,i, (size_t)LITER(lp,i,j).idx, cumo_cuda_runtime_is_device_memory(LITER(lp,i,j).idx));
+    // printf("  xargs[%d].iter[%d].idx = 0x%"SZF"x\n", j,i, (size_t)LITER(lp,i,j).idx);
     }
     printf("  xargs[%d].bufcp = 0x%"SZF"x\n", j, (size_t)lp->xargs[j].bufcp);
     if (lp->xargs[j].bufcp) {
@@ -1489,6 +1491,8 @@ loop_narray(cumo_ndfunc_t *nf, cumo_na_md_loop_t *lp)
         // j-th argument
         for (j=0; j<lp->narg; j++) {
             if (LITER(lp,i,j).idx) {
+                CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("loop_narrayx", "any");
+                cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
                 LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).idx[c[i]];
             } else {
                 LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).step*c[i];
data/ext/cumo/narray/struct.c
CHANGED
@@ -76,7 +76,7 @@ void cumo_na_copy_array_structure(VALUE self, VALUE view);
 static VALUE
 cumo_na_make_view_struct(VALUE self, VALUE dtype, VALUE offset)
 {
-    size_t i, n;
+    size_t n;
     int j, k, ndim;
     size_t *shape;
     size_t *idx1, *idx2;
@@ -147,10 +147,12 @@ cumo_na_make_view_struct(VALUE self, VALUE dtype, VALUE offset)
         if (CUMO_SDX_IS_INDEX(na1->stridx[j])) {
             n = na1->base.shape[j];
             idx1 = CUMO_SDX_GET_INDEX(na1->stridx[j]);
-            idx2 = ALLOC_N(size_t, na1->base.shape[j]);
-            for (i=0; i<n; i++) {
-                idx2[i] = idx1[i];
-            }
+            // idx2 = ALLOC_N(size_t, na1->base.shape[j]);
+            // for (i=0; i<n; i++) {
+            //     idx2[i] = idx1[i];
+            // }
+            idx2 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*n);
+            cumo_cuda_runtime_check_status(cudaMemcpyAsync(idx2,idx1,sizeof(size_t)*n,cudaMemcpyDeviceToDevice,0));
             CUMO_SDX_SET_INDEX(na2->stridx[j],idx2);
         } else {
             na2->stridx[j] = na1->stridx[j];
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: cumo
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.2.0
 platform: ruby
 authors:
 - Naotoshi Seo
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-11-
+date: 2018-11-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
|