cumo 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/cumo/include/cumo.h +2 -2
- data/ext/cumo/include/cumo/cuda/runtime.h +3 -1
- data/ext/cumo/include/cumo/narray.h +20 -4
- data/ext/cumo/include/cumo/reduce_kernel.h +10 -7
- data/ext/cumo/narray/data.c +35 -12
- data/ext/cumo/narray/gen/tmpl/accum.c +6 -1
- data/ext/cumo/narray/gen/tmpl/accum_index.c +6 -1
- data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +2 -2
- data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +2 -2
- data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +4 -4
- data/ext/cumo/narray/index.c +55 -16
- data/ext/cumo/narray/narray.c +31 -24
- data/ext/cumo/narray/ndloop.c +6 -2
- data/ext/cumo/narray/struct.c +7 -5
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4689762e94e91b3f359190225d9841fa8af8fb7eeff2aef028a0702b00f0787f
+  data.tar.gz: 0b3bf965295354246b19c8aaee26eb454d0a54f7a3ec4b01cf893688ce7fd19b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0313c37d6a9b19ae026f3831d2d92b160213a3d8a29a8b7bc63c8ef0123e0b5581e6d935c0de2ce9c36c2850820c7afee9fac19e40d6ebdeb6c983766c639ea3
+  data.tar.gz: 7595ee6b049e4a8697840bf2502db04e93fca86c3604e3bb2ed0505ed0f5c5647148cc1ee0e0cbc71ea63ed9e1592330a9a63c444c41791d9a75c2f4de2396d7
data/ext/cumo/include/cumo/cuda/runtime.h
CHANGED
@@ -41,7 +41,9 @@ static inline bool
 cumo_cuda_runtime_is_device_memory(void* ptr)
 {
     struct cudaPointerAttributes attrs;
-    cudaError_t status = cudaPointerGetAttributes(&attrs, ptr);
+    cudaError_t status;
+    if (!ptr) { return false; }
+    status = cudaPointerGetAttributes(&attrs, ptr);
     cudaGetLastError(); // reset last error to success
     return (status != cudaErrorInvalidValue);
 }
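The new guard makes `cumo_cuda_runtime_is_device_memory(NULL)` return false without entering the CUDA runtime at all. Below is a standalone sketch of the patched check, under the assumption (which the original code relies on) that `cudaPointerGetAttributes` reports `cudaErrorInvalidValue` for plain host pointers:

```c
/* Sketch mirroring the patched check above; is_device_memory is an
 * illustrative local name, not the cumo symbol. Build with nvcc, or a
 * C compiler linked against cudart. */
#include <cuda_runtime.h>
#include <stdbool.h>
#include <stdio.h>

static bool is_device_memory(void *ptr)
{
    struct cudaPointerAttributes attrs;
    cudaError_t status;
    if (!ptr) { return false; }   /* new: NULL short-circuits */
    status = cudaPointerGetAttributes(&attrs, ptr);
    cudaGetLastError();           /* reset sticky last-error state */
    return (status != cudaErrorInvalidValue);
}

int main(void)
{
    int host = 0;
    void *dev = NULL;
    cudaMalloc(&dev, 16);
    printf("NULL:%d host:%d device:%d\n",
           is_device_memory(NULL), is_device_memory(&host), is_device_memory(dev));
    cudaFree(dev);
    return 0;
}
```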
data/ext/cumo/include/cumo/narray.h
CHANGED
@@ -427,10 +427,26 @@ _cumo_na_get_narray_t(VALUE obj, unsigned char cumo_na_type)
 
 #define CUMO_DEBUG_PRINT(v) puts(StringValueCStr(rb_funcall(v,rb_intern("inspect"),0)))
 
-#define CUMO_NA_CumoIsNArray(obj)
-
-
-
+#define CUMO_NA_CumoIsNArray(obj) (rb_obj_is_kind_of(obj,cNArray)==Qtrue)
+#define CUMO_NA_IsArray(obj) (TYPE(obj)==T_ARRAY || rb_obj_is_kind_of(obj,cNArray)==Qtrue)
+
+static inline bool
+cumo_na_has_idx_p(VALUE obj)
+{
+    cumo_narray_t *na;
+    cumo_narray_view_t *nv;
+    int i = 0;
+    CumoGetNArray(obj, na);
+    if (CUMO_NA_TYPE(na) == CUMO_NARRAY_VIEW_T) {
+        CumoGetNArrayView(obj, nv);
+        for (; i < nv->base.ndim; ++i) {
+            if (nv->stridx[i].index) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
 
 #define CUMO_NUM2REAL(v)  NUM2DBL( rb_funcall((v),cumo_na_id_real,0) )
 #define CUMO_NUM2IMAG(v)  NUM2DBL( rb_funcall((v),cumo_na_id_imag,0) )
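`cumo_na_has_idx_p` answers whether any axis of a view is backed by an explicit index vector (the result of fancy indexing) rather than a plain stride; the accum templates below use it to fall back to a contiguous copy, since the reductions cannot walk index-backed axes. A simplified model of the per-axis stride-or-index slot follows; names are illustrative and the real `cumo_stridx_t` tag-bit handling is elided:

```c
/* Simplified model of a per-axis stride-or-index slot. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

typedef union {
    ptrdiff_t stride;   /* regular strided axis */
    size_t   *index;    /* per-element index vector (fancy indexing) */
} stridx_t;

typedef struct {
    int      ndim;
    stridx_t stridx[8];
} view_t;

/* Mirrors the shape of cumo_na_has_idx_p: any index-backed axis => true. */
static bool has_idx(const view_t *v)
{
    for (int i = 0; i < v->ndim; ++i) {
        if (v->stridx[i].index) {
            return true;
        }
    }
    return false;
}

int main(void)
{
    size_t idx[3] = {2, 0, 1};
    view_t v = { .ndim = 2 };
    v.stridx[0].stride = 0;    /* plain axis: no index vector */
    v.stridx[1].index  = idx;  /* fancy-indexed axis */
    printf("has_idx: %d\n", has_idx(&v));
    return 0;
}
```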
data/ext/cumo/include/cumo/reduce_kernel.h
CHANGED
@@ -33,7 +33,7 @@ __global__ static void reduction_kernel(cumo_na_reduction_arg_t arg, int out_blo
     cumo_na_indexer_t& in_indexer = arg.in_indexer;
     cumo_na_indexer_t& out_indexer = arg.out_indexer;
 
-    using TypeReduce = decltype(impl.Identity());
+    using TypeReduce = decltype(impl.Identity(0));
 
     extern __shared__ __align__(8) char sdata_raw[];
     TypeReduce* sdata = reinterpret_cast<TypeReduce*>(sdata_raw);
@@ -48,14 +48,17 @@ __global__ static void reduction_kernel(cumo_na_reduction_arg_t arg, int out_blo
 
     for (int64_t i_out = out_base + out_offset; i_out < out_indexer.total_size; i_out += out_stride) {
         cumo_na_indexer_set_dim(&out_indexer, i_out);
-        TypeReduce accum = impl.Identity();
-
         int64_t i_in = i_out * reduce_indexer_total_size + reduce_offset;
+
+        // Note that spec of (min|max)_index of cumo is different with arg(min|max) of cupy.
+        // Cumo returns index of input elements, CuPy returns index of reduction axis.
+        cumo_na_indexer_set_dim(&in_indexer, i_in);
+        TypeIn* in_ptr = reinterpret_cast<TypeIn*>(cumo_na_iarray_at_dim(&in_iarray, &in_indexer));
+        TypeReduce accum = impl.Identity(in_ptr - reinterpret_cast<TypeIn*>(in_iarray.ptr));
+
         for (int64_t i_reduce = reduce_offset; i_reduce < reduce_indexer_total_size; i_reduce += reduce_block_size, i_in += reduce_block_size) {
             cumo_na_indexer_set_dim(&in_indexer, i_in);
-
-            // Note that spec of (min|max)_index of cumo is different with arg(min|max) of cupy.
-            // Cumo returns index of input elements, CuPy returns index of reduction axis.
+            in_ptr = reinterpret_cast<TypeIn*>(cumo_na_iarray_at_dim(&in_iarray, &in_indexer));
             impl.Reduce(impl.MapIn(*in_ptr, in_ptr - reinterpret_cast<TypeIn*>(in_iarray.ptr)), accum);
             //printf("threadId.x:%d blockIdx.x:%d blockDim.x:%d gridDim.x:%d accum:%d i_in:%ld i_reduce:%ld i_out:%ld in:%p(%d)\n", threadIdx.x, blockIdx.x, blockDim.x, gridDim.x, accum, i_in, i_reduce, i_out, in_ptr, *in_ptr);
         }
@@ -102,7 +105,7 @@ void cumo_reduce(cumo_na_reduction_arg_t arg, ReductionImpl&& impl) {
 
     int64_t block_size = cumo_detail::max_block_size;
    int64_t grid_size = std::min(cumo_detail::max_grid_size, out_block_num);
-    int64_t shared_mem_size = sizeof(decltype(impl.Identity())) * block_size;
+    int64_t shared_mem_size = sizeof(decltype(impl.Identity(0))) * block_size;
 
     cumo_detail::reduction_kernel<TypeIn,TypeOut,ReductionImpl><<<grid_size, block_size, shared_mem_size>>>(arg, out_block_size, reduce_block_size, impl);
 }
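Giving `Identity()` an index parameter is what makes the seeding above work: the kernel computes the offset of the first input element it will visit and passes it in, so an arg-reduction accumulator always refers to a real input position even if no later `Reduce` call replaces it. A host-side sketch of that shape in plain C; the NaN input is an illustrative edge case, not cumo test data:

```c
/* Identity(index) pattern from the kernel above, reduced to plain C. */
#include <stdint.h>
#include <stdio.h>
#include <math.h>

typedef struct { double min; int64_t argmin; } min_and_argmin_t;

/* was identity(void) before this release */
static min_and_argmin_t identity(int64_t index)
{
    return (min_and_argmin_t){ INFINITY, index };
}

static void reduce(min_and_argmin_t next, min_and_argmin_t *accum)
{
    if (accum->min > next.min) { *accum = next; }   /* NaN compares false */
}

int main(void)
{
    double in[] = { NAN, NAN, NAN };
    min_and_argmin_t accum = identity(0);   /* argmin starts at a valid index */
    for (int64_t i = 0; i < 3; ++i) {
        reduce((min_and_argmin_t){ in[i], i }, &accum);
    }
    printf("argmin=%lld\n", (long long)accum.argmin);   /* 0, never garbage */
    return 0;
}
```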
data/ext/cumo/narray/data.c
CHANGED
@@ -1,4 +1,7 @@
 #include <ruby.h>
+#include "cumo.h"
+#include "cumo/cuda/memory_pool.h"
+#include "cumo/cuda/runtime.h"
 #include "cumo/narray.h"
 #include "cumo/template.h"
 
@@ -56,7 +59,8 @@ iter_copy_bytes(cumo_na_loop_t *const lp)
 {
     size_t e;
     e = lp->args[0].elmsz;
-
+    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("iter_copy_bytes", "any");
+    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
     LOOP_UNARY_PTR(lp,m_memcpy);
 }
 
@@ -99,6 +103,8 @@ iter_swap_byte(cumo_na_loop_t *const lp)
     e = lp->args[0].elmsz;
     b1 = ALLOCA_N(char, e);
     b2 = ALLOCA_N(char, e);
+    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("iter_swap_bytes", "any");
+    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
     LOOP_UNARY_PTR(lp,m_swap_byte);
 }
 
@@ -489,10 +495,12 @@ cumo_na_flatten_dim(VALUE self, int sd)
     for (i=0; i<sd; i++) {
         if (CUMO_SDX_IS_INDEX(na1->stridx[i])) {
             idx1 = CUMO_SDX_GET_INDEX(na1->stridx[i]);
-            idx2 = ALLOC_N(size_t, shape[i]);
-            for (j=0; j<shape[i]; j++) {
-                idx2[j] = idx1[j];
-            }
+            // idx2 = ALLOC_N(size_t, shape[i]);
+            // for (j=0; j<shape[i]; j++) {
+            //     idx2[j] = idx1[j];
+            // }
+            idx2 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*shape[i]);
+            cumo_cuda_runtime_check_status(cudaMemcpyAsync(idx2,idx1,sizeof(size_t)*shape[i],cudaMemcpyDeviceToDevice,0));
             CUMO_SDX_SET_INDEX(na2->stridx[i],idx2);
         } else {
             na2->stridx[i] = na1->stridx[i];
@@ -505,7 +513,8 @@ cumo_na_flatten_dim(VALUE self, int sd)
         na2->stridx[sd] = na1->stridx[nd-1];
     } else {
         // set index
-        idx2 = ALLOC_N(size_t, shape[sd]);
+        // idx2 = ALLOC_N(size_t, shape[sd]);
+        idx2 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*shape[sd]);
         CUMO_SDX_SET_INDEX(na2->stridx[sd],idx2);
         // init for md-loop
         fd = nd-sd;
@@ -514,6 +523,8 @@ cumo_na_flatten_dim(VALUE self, int sd)
     pos = ALLOC_N(size_t, fd+1);
     pos[0] = 0;
     // md-loop
+    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_flatten_dim", "any");
+    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
     for (i=j=0;;) {
         for (; i<fd; i++) {
             sdx = na1->stridx[i+sd];
@@ -726,10 +737,12 @@ cumo_na_diagonal(int argc, VALUE *argv, VALUE self)
         if (i != ax[0] && i != ax[1]) {
            if (CUMO_SDX_IS_INDEX(na1->stridx[i])) {
                 idx0 = CUMO_SDX_GET_INDEX(na1->stridx[i]);
-                idx1 = ALLOC_N(size_t, na->shape[i]);
-                for (j=0; j<na->shape[i]; j++) {
-                    idx1[j] = idx0[j];
-                }
+                // idx1 = ALLOC_N(size_t, na->shape[i]);
+                // for (j=0; j<na->shape[i]; j++) {
+                //     idx1[j] = idx0[j];
+                // }
+                idx1 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*na->shape[i]);
+                cumo_cuda_runtime_check_status(cudaMemcpyAsync(idx1,idx0,sizeof(size_t)*na->shape[i],cudaMemcpyDeviceToDevice,0));
                 CUMO_SDX_SET_INDEX(na2->stridx[k],idx1);
             } else {
                 na2->stridx[k] = na1->stridx[i];
@@ -739,7 +752,12 @@ cumo_na_diagonal(int argc, VALUE *argv, VALUE self)
     }
     if (CUMO_SDX_IS_INDEX(na1->stridx[ax[0]])) {
         idx0 = CUMO_SDX_GET_INDEX(na1->stridx[ax[0]]);
-        diag_idx = ALLOC_N(size_t, diag_size);
+        // diag_idx = ALLOC_N(size_t, diag_size);
+        diag_idx = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*diag_size);
+
+        CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_diagonal", "any");
+        cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
         if (CUMO_SDX_IS_INDEX(na1->stridx[ax[1]])) {
             idx1 = CUMO_SDX_GET_INDEX(na1->stridx[ax[1]]);
             for (j=0; j<diag_size; j++) {
@@ -756,7 +774,12 @@ cumo_na_diagonal(int argc, VALUE *argv, VALUE self)
         stride0 = CUMO_SDX_GET_STRIDE(na1->stridx[ax[0]]);
         if (CUMO_SDX_IS_INDEX(na1->stridx[ax[1]])) {
             idx1 = CUMO_SDX_GET_INDEX(na1->stridx[ax[1]]);
-            diag_idx = ALLOC_N(size_t, diag_size);
+            // diag_idx = ALLOC_N(size_t, diag_size);
+            diag_idx = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*diag_size);
+
+            CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_diagonal", "any");
+            cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
             for (j=0; j<diag_size; j++) {
                 diag_idx[j] = stride0*(j+k0) + idx1[j+k1];
             }
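The recurring change in data.c (and in narray.c and struct.c below) is the same: index vectors now live in device memory, so duplicating one is an on-device `cudaMemcpyAsync` on the null stream instead of a host-side element loop. A minimal sketch of the pattern; `cumo_cuda_runtime_malloc` is cumo's pooled allocator, so plain `cudaMalloc` stands in here to keep the example self-contained:

```c
/* Host-loop to device-to-device copy replacement, reduced to essentials. */
#include <cuda_runtime.h>
#include <stdio.h>

int main(void)
{
    size_t n = 4, bytes = n * sizeof(size_t);
    size_t host_idx[4] = {3, 1, 4, 1};
    size_t *idx1, *idx2;

    cudaMalloc((void**)&idx1, bytes);
    cudaMemcpy(idx1, host_idx, bytes, cudaMemcpyHostToDevice);

    /* before: idx2 = ALLOC_N(size_t, n); for (j...) idx2[j] = idx1[j];
     * after:  allocate on device and copy asynchronously on stream 0 */
    cudaMalloc((void**)&idx2, bytes);
    cudaMemcpyAsync(idx2, idx1, bytes, cudaMemcpyDeviceToDevice, 0);

    cudaStreamSynchronize(0);   /* demo only; cumo leaves the copy in flight */
    cudaFree(idx1); cudaFree(idx2);
    printf("copied %zu device indices\n", n);
    return 0;
}
```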
data/ext/cumo/narray/gen/tmpl/accum.c
CHANGED
@@ -77,7 +77,12 @@ static VALUE
 <% else %>
     reduce = cumo_na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
 <% end %>
-    v = cumo_na_ndloop(&ndf, 2, self, reduce);
+    if (cumo_na_has_idx_p(self)) {
+        VALUE copy = cumo_na_copy(self); // reduction does not support idx, make contiguous
+        v = cumo_na_ndloop(&ndf, 2, copy, reduce);
+    } else {
+        v = cumo_na_ndloop(&ndf, 2, self, reduce);
+    }
 <% if result_class == "cT" %>
     return <%=type_name%>_extract(v);
 <% else %>
data/ext/cumo/narray/gen/tmpl/accum_index.c
CHANGED
@@ -113,7 +113,12 @@ static VALUE
 <% end %>
     }
 
-    return cumo_na_ndloop(&ndf, 2, self, reduce);
+    if (cumo_na_has_idx_p(self)) {
+        VALUE copy = cumo_na_copy(self); // reduction does not support idx, make conttiguous
+        return cumo_na_ndloop(&ndf, 2, copy, reduce);
+    } else {
+        return cumo_na_ndloop(&ndf, 2, self, reduce);
+    }
 }
 <% end %>
 }
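Both templates guard the reduction the same way: if the input view carries index vectors, take a contiguous copy first, then run the ndloop on the copy. A toy, self-contained model of that dispatch; every type here is a local stand-in, not the Ruby VALUE machinery:

```c
/* Toy model of the accum-template dispatch above. */
#include <stdbool.h>
#include <stdio.h>

typedef struct { bool has_idx; int data[4]; } narray_t;

static narray_t make_contiguous_copy(const narray_t *a)
{
    narray_t c = *a;        /* stand-in for cumo_na_copy */
    c.has_idx = false;
    return c;
}

static int reduce_sum(const narray_t *a)   /* stand-in for cumo_na_ndloop */
{
    int s = 0;
    for (int i = 0; i < 4; ++i) s += a->data[i];
    return s;
}

static int reduce_entry(const narray_t *self)
{
    if (self->has_idx) {                   /* cumo_na_has_idx_p(self) */
        narray_t copy = make_contiguous_copy(self);
        return reduce_sum(&copy);
    }
    return reduce_sum(self);
}

int main(void)
{
    narray_t view = { .has_idx = true, .data = {1, 2, 3, 4} };
    printf("sum=%d\n", reduce_entry(&view));
    return 0;
}
```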
data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu
CHANGED
@@ -17,7 +17,7 @@ struct cumo_<%=type_name%>_min_index_int<%=i%>_impl {
         dtype min;
         idx_t argmin;
     };
-    __device__ MinAndArgMin Identity() { return {DATA_MAX,
+    __device__ MinAndArgMin Identity(idx_t index) { return {DATA_MAX, index}; }
     __device__ MinAndArgMin MapIn(dtype in, idx_t index) { return {in, index}; }
     __device__ void Reduce(MinAndArgMin next, MinAndArgMin& accum) {
         if (accum.min > next.min) {
@@ -32,7 +32,7 @@ struct cumo_<%=type_name%>_max_index_int<%=i%>_impl {
         dtype max;
         idx_t argmax;
     };
-    __device__ MaxAndArgMax Identity() { return {DATA_MIN,
+    __device__ MaxAndArgMax Identity(idx_t index) { return {DATA_MIN, index}; }
     __device__ MaxAndArgMax MapIn(dtype in, idx_t index) { return {in, index}; }
     __device__ void Reduce(MaxAndArgMax next, MaxAndArgMax& accum) {
         if (accum.max < next.max) {
data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu
CHANGED
@@ -6,14 +6,14 @@
 #endif
 
 struct cumo_<%=type_name%>_sum_impl {
-    __device__ <%=dtype%> Identity() { return m_zero; }
+    __device__ <%=dtype%> Identity(int64_t /*index*/) { return m_zero; }
     __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
     __device__ void Reduce(dtype next, <%=dtype%>& accum) { accum = m_add(next, accum); }
     __device__ <%=dtype%> MapOut(<%=dtype%> accum) { return accum; }
 };
 
 struct cumo_<%=type_name%>_prod_impl {
-    __device__ <%=dtype%> Identity() { return m_one; }
+    __device__ <%=dtype%> Identity(int64_t /*index*/) { return m_one; }
     __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
     __device__ void Reduce(dtype next, <%=dtype%>& accum) { accum = m_mul(next, accum); }
     __device__ <%=dtype%> MapOut(<%=dtype%> accum) { return accum; }
data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu
CHANGED
@@ -6,28 +6,28 @@
 #endif
 
 struct cumo_<%=type_name%>_sum_impl {
-    __device__ <%=dtype%> Identity() { return m_zero; }
+    __device__ <%=dtype%> Identity(int64_t /*index*/) { return m_zero; }
     __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
     __device__ void Reduce(dtype next, <%=dtype%>& accum) { accum += next; }
     __device__ <%=dtype%> MapOut(<%=dtype%> accum) { return accum; }
 };
 
 struct cumo_<%=type_name%>_prod_impl {
-    __device__ <%=dtype%> Identity() { return m_one; }
+    __device__ <%=dtype%> Identity(int64_t /*index*/) { return m_one; }
     __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
     __device__ void Reduce(dtype next, <%=dtype%>& accum) { accum *= next; }
     __device__ <%=dtype%> MapOut(<%=dtype%> accum) { return accum; }
 };
 
 struct cumo_<%=type_name%>_min_impl {
-    __device__ dtype Identity() { return DATA_MAX; }
+    __device__ dtype Identity(int64_t /*index*/) { return DATA_MAX; }
     __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
     __device__ void Reduce(dtype next, dtype& accum) { accum = next < accum ? next : accum; }
     __device__ dtype MapOut(dtype accum) { return accum; }
 };
 
 struct cumo_<%=type_name%>_max_impl {
-    __device__ dtype Identity() { return DATA_MIN; }
+    __device__ dtype Identity(int64_t /*index*/) { return DATA_MIN; }
     __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
     __device__ void Reduce(dtype next, dtype& accum) { accum = next < accum ? accum : next; }
     __device__ dtype MapOut(dtype accum) { return accum; }
data/ext/cumo/narray/index.c
CHANGED
@@ -3,6 +3,7 @@
 #include "cumo.h"
 #include "cumo/narray.h"
 #include "cumo/cuda/runtime.h"
+#include "cumo/cuda/memory_pool.h"
 #include "cumo/template.h"
 
 #if SIZEOF_VOIDP == 8
@@ -52,7 +53,8 @@ print_index_arg(cumo_na_index_arg_t *q, int n)
         printf("  q[%d].n=%"SZF"d\n",i,q[i].n);
         printf("  q[%d].beg=%"SZF"d\n",i,q[i].beg);
         printf("  q[%d].step=%"SZF"d\n",i,q[i].step);
-        printf("  q[%d].idx=0x%"SZF"x\n",i,(size_t)q[i].idx);
+        printf("  q[%d].idx=0x%"SZF"x (cuda:%d)\n",i,(size_t)q[i].idx, cumo_cuda_runtime_is_device_memory(q[i].idx));
+        // printf("  q[%d].idx=0x%"SZF"x\n",i,(size_t)q[i].idx);
         printf("  q[%d].reduce=0x%x\n",i,q[i].reduce);
         printf("  q[%d].orig_dim=%d\n",i,q[i].orig_dim);
     }
@@ -121,15 +123,38 @@ cumo_na_range_check(ssize_t pos, ssize_t size, int dim)
     return idx;
 }
 
+static void CUDART_CB
+cumo_na_parse_array_callback(cudaStream_t stream, cudaError_t status, void *data)
+{
+    cudaFreeHost(data);
+}
+
+// copy ruby array to idx
 static void
 cumo_na_parse_array(VALUE ary, int orig_dim, ssize_t size, cumo_na_index_arg_t *q)
 {
     int k;
+    size_t* idx;
+    cudaError_t status;
     int n = RARRAY_LEN(ary);
-    q->idx = ALLOC_N(size_t, n);
+    //q->idx = ALLOC_N(size_t, n);
+    //for (k=0; k<n; k++) {
+    //    q->idx[k] = na_range_check(NUM2SSIZET(RARRAY_AREF(ary,k)), size, orig_dim);
+    //}
+    // make a contiguous pinned memory on host => copy to device => release pinned memory after copy finished on callback
+    q->idx = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*n);
+    cudaHostAlloc((void**)&idx, sizeof(size_t)*n, cudaHostAllocDefault);
     for (k=0; k<n; k++) {
-        q->idx[k] = cumo_na_range_check(NUM2SSIZET(RARRAY_AREF(ary,k)), size, orig_dim);
+        idx[k] = cumo_na_range_check(NUM2SSIZET(RARRAY_AREF(ary,k)), size, orig_dim);
+    }
+    status = cudaMemcpyAsync(q->idx,idx,sizeof(size_t)*n,cudaMemcpyHostToDevice,0);
+    if (status == 0) {
+        cumo_cuda_runtime_check_status(cudaStreamAddCallback(0,cumo_na_parse_array_callback,idx,0));
+    } else {
+        cudaFreeHost(idx);
     }
+    cumo_cuda_runtime_check_status(status);
+
     q->n = n;
     q->beg = 0;
     q->step = 1;
@@ -137,13 +162,14 @@ cumo_na_parse_array(VALUE ary, int orig_dim, ssize_t size, cumo_na_index_arg_t *q)
     q->orig_dim = orig_dim;
 }
 
+// copy narray to idx
 static void
 cumo_na_parse_narray_index(VALUE a, int orig_dim, ssize_t size, cumo_na_index_arg_t *q)
 {
     VALUE idx;
     cumo_narray_t *na;
     cumo_narray_data_t *nidx;
-    size_t k, n;
+    size_t n;
     ssize_t *nidxp;
 
     CumoGetNArray(a,na);
@@ -155,16 +181,14 @@ cumo_na_parse_narray_index(VALUE a, int orig_dim, ssize_t size, cumo_na_index_arg_t *q)
     cumo_na_store(idx,a);
 
     CumoGetNArrayData(idx,nidx);
-    nidxp = (ssize_t*)nidx->ptr;
-    q->idx = ALLOC_N(size_t, n);
-
-    //
-
-
+    nidxp = (ssize_t*)nidx->ptr; // Cumo::NArray data resides on GPU
+    //q->idx = ALLOC_N(size_t, n);
+    //for (k=0; k<n; k++) {
+    //    q->idx[k] = na_range_check(nidxp[k], size, orig_dim);
+    //}
+    q->idx = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*n);
+    cumo_cuda_runtime_check_status(cudaMemcpyAsync(q->idx,nidxp,sizeof(size_t)*n,cudaMemcpyDeviceToDevice,0));
 
-    for (k=0; k<n; k++) {
-        q->idx[k] = cumo_na_range_check(nidxp[k], size, orig_dim);
-    }
     q->n = n;
     q->beg = 0;
     q->step = 1;
@@ -401,6 +425,9 @@ cumo_na_index_aref_nadata(cumo_narray_data_t *na1, cumo_narray_view_t *na2,
 
         // array index
         if (q[i].idx != NULL) {
+            CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_nadata", "any");
+            cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
             index = q[i].idx;
             CUMO_SDX_SET_INDEX(na2->stridx[j],index);
             q[i].idx = NULL;
@@ -456,6 +483,10 @@ cumo_na_index_aref_naview(cumo_narray_view_t *na1, cumo_narray_view_t *na2,
             // index <- index
             int k;
             size_t *index = q[i].idx;
+
+            CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_naview", "any");
+            cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
             CUMO_SDX_SET_INDEX(na2->stridx[j], index);
             q[i].idx = NULL;
 
@@ -467,6 +498,10 @@ cumo_na_index_aref_naview(cumo_narray_view_t *na1, cumo_narray_view_t *na2,
             // index <- step
             ssize_t stride1 = CUMO_SDX_GET_STRIDE(sdx1);
             size_t *index = q[i].idx;
+
+            CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_naview", "any");
+            cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
             CUMO_SDX_SET_INDEX(na2->stridx[j],index);
             q[i].idx = NULL;
 
@@ -494,8 +529,13 @@ cumo_na_index_aref_naview(cumo_narray_view_t *na1, cumo_narray_view_t *na2,
             int k;
             size_t beg = q[i].beg;
             ssize_t step = q[i].step;
-            size_t *index = ALLOC_N(size_t, size);
+            // size_t *index = ALLOC_N(size_t, size);
+            size_t *index = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*size);
             CUMO_SDX_SET_INDEX(na2->stridx[j],index);
+
+            CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_naview", "any");
+            cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
             for (k=0; k<size; k++) {
                 index[k] = CUMO_SDX_GET_INDEX(sdx1)[beg+step*k];
             }
@@ -515,7 +555,6 @@ cumo_na_index_aref_naview(cumo_narray_view_t *na1, cumo_narray_view_t *na2,
     na2->base.size = total;
 }
 
-
 static int
 cumo_na_ndim_new_narray(int ndim, const cumo_na_index_arg_t *q)
 {
@@ -625,7 +664,7 @@ cumo_na_aref_md_ensure(VALUE data_value)
     cumo_na_aref_md_data_t *data = (cumo_na_aref_md_data_t*)(data_value);
     int i;
     for (i=0; i<data->ndim; i++) {
-        xfree(data->q[i].idx);
+        cumo_cuda_runtime_free((char*)(data->q[i].idx));
     }
     if (data->q) xfree(data->q);
     return Qnil;
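`cumo_na_parse_array` stages the range-checked indices in page-locked host memory, launches an async host-to-device copy on the null stream, and releases the staging buffer from a stream callback once the copy has drained, so the Ruby thread never blocks on the GPU. The sketch below mirrors that lifetime pattern; note that the source's choice of calling `cudaFreeHost` inside the callback is something the CUDA documentation discourages, a trade-off the code accepts:

```c
/* Pinned staging buffer, async H2D copy, release via stream callback. */
#include <cuda_runtime.h>

static void CUDART_CB free_staging(cudaStream_t stream, cudaError_t status, void *data)
{
    (void)stream; (void)status;
    cudaFreeHost(data);   /* runs only after the copy queued before it drained */
}

int main(void)
{
    size_t n = 1024, bytes = n * sizeof(size_t);
    size_t *staging, *dev;

    cudaMalloc((void**)&dev, bytes);
    cudaHostAlloc((void**)&staging, bytes, cudaHostAllocDefault);
    for (size_t k = 0; k < n; ++k) staging[k] = k;   /* range-checked indices in cumo */

    cudaMemcpyAsync(dev, staging, bytes, cudaMemcpyHostToDevice, 0);
    cudaStreamAddCallback(0, free_staging, staging, 0);   /* host never blocks */

    cudaStreamSynchronize(0);   /* demo shutdown only */
    cudaFree(dev);
    return 0;
}
```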
data/ext/cumo/narray/narray.c
CHANGED
@@ -168,12 +168,8 @@ cumo_na_view_free(void* ptr)
     if (na->stridx != NULL) {
         for (i=0; i<na->base.ndim; i++) {
             if (CUMO_SDX_IS_INDEX(na->stridx[i])) {
-                void *p = CUMO_SDX_GET_INDEX(na->stridx[i]);
-                if (cumo_cuda_runtime_is_device_memory(p)) {
-                    cumo_cuda_runtime_free(p);
-                } else {
-                    xfree(p);
-                }
+                void *idx = CUMO_SDX_GET_INDEX(na->stridx[i]);
+                cumo_cuda_runtime_free(idx);
             }
         }
         xfree(na->stridx);
@@ -880,7 +876,6 @@ VALUE
 cumo_na_make_view(VALUE self)
 {
     int i, nd;
-    size_t j;
     size_t *idx1, *idx2;
     ssize_t stride;
     cumo_narray_t *na;
@@ -914,10 +909,12 @@ cumo_na_make_view(VALUE self)
     for (i=0; i<nd; i++) {
         if (CUMO_SDX_IS_INDEX(na1->stridx[i])) {
             idx1 = CUMO_SDX_GET_INDEX(na1->stridx[i]);
-            idx2 = ALLOC_N(size_t,na1->base.shape[i]);
-            for (j=0; j<na1->base.shape[i]; j++) {
-                idx2[j] = idx1[j];
-            }
+            // idx2 = ALLOC_N(size_t,na1->base.shape[i]);
+            // for (j=0; j<na1->base.shape[i]; j++) {
+            //     idx2[j] = idx1[j];
+            // }
+            idx2 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*na1->base.shape[i]);
+            cumo_cuda_runtime_check_status(cudaMemcpyAsync(idx2,idx1,sizeof(size_t)*na1->base.shape[i],cudaMemcpyDeviceToDevice,0));
             CUMO_SDX_SET_INDEX(na2->stridx[i],idx2);
         } else {
             na2->stridx[i] = na1->stridx[i];
@@ -947,8 +944,8 @@ static VALUE
 cumo_na_expand_dims(VALUE self, VALUE vdim)
 {
     int i, j, nd, dim;
-    size_t *shape, *
-    cumo_stridx_t *stridx, *
+    size_t *shape, *na2_shape;
+    cumo_stridx_t *stridx, *na2_stridx;
     cumo_narray_t *na;
     cumo_narray_view_t *na2;
     VALUE view;
@@ -970,25 +967,25 @@ cumo_na_expand_dims(VALUE self, VALUE vdim)
 
     shape = ALLOC_N(size_t,nd+1);
     stridx = ALLOC_N(cumo_stridx_t,nd+1);
-
-
+    na2_shape = na2->base.shape;
+    na2_stridx = na2->stridx;
 
     for (i=j=0; i<=nd; i++) {
         if (i==dim) {
             shape[i] = 1;
             CUMO_SDX_SET_STRIDE(stridx[i],0);
         } else {
-            shape[i] =
-            stridx[i] =
+            shape[i] = na2_shape[j];
+            stridx[i] = na2_stridx[j];
             j++;
         }
     }
 
     na2->stridx = stridx;
-    xfree(
+    xfree(na2_stridx);
     na2->base.shape = shape;
-    if (
-    xfree(
+    if (na2_shape != &(na2->base.size)) {
+        xfree(na2_shape);
     }
     na2->base.ndim++;
     return view;
@@ -1054,15 +1051,25 @@ cumo_na_reverse(int argc, VALUE *argv, VALUE self)
         n = na1->base.shape[i];
         if (CUMO_SDX_IS_INDEX(na1->stridx[i])) {
            idx1 = CUMO_SDX_GET_INDEX(na1->stridx[i]);
-            idx2 = ALLOC_N(size_t,n);
+            // idx2 = ALLOC_N(size_t,n);
+            // if (cumo_na_test_reduce(reduce,i)) {
+            //     for (j=0; j<n; j++) {
+            //         idx2[n-1-j] = idx1[j];
+            //     }
+            // } else {
+            //     for (j=0; j<n; j++) {
+            //         idx2[j] = idx1[j];
+            //     }
+            // }
+            idx2 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*n);
             if (cumo_na_test_reduce(reduce,i)) {
+                CUMO_SHOW_SYNCHRONIZE_WARNING_ONCE("cumo_na_reverse", "any");
+                cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
                 for (j=0; j<n; j++) {
                     idx2[n-1-j] = idx1[j];
                 }
             } else {
-                for (j=0; j<n; j++) {
-                    idx2[j] = idx1[j];
-                }
+                cumo_cuda_runtime_check_status(cudaMemcpyAsync(idx2,idx1,sizeof(size_t)*n,cudaMemcpyDeviceToDevice,0));
             }
             CUMO_SDX_SET_INDEX(na2->stridx[i],idx2);
         } else {
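`cumo_na_reverse` keeps a host-side loop for reversed axes, preceded by `cudaDeviceSynchronize` and a warning macro. That only works if the pool hands out host-accessible (unified/managed) memory, which this sketch assumes explicitly by using `cudaMallocManaged` in place of cumo's allocator:

```c
/* Sketch of the two reverse paths above, under the managed-memory assumption. */
#include <cuda_runtime.h>
#include <stdio.h>

int main(void)
{
    size_t n = 8, bytes = n * sizeof(size_t);
    size_t *idx1, *idx2;
    cudaMallocManaged((void**)&idx1, bytes, cudaMemAttachGlobal);
    cudaMallocManaged((void**)&idx2, bytes, cudaMemAttachGlobal);
    for (size_t j = 0; j < n; ++j) idx1[j] = 2 * j;

    /* reversed axis: drain the device, then reorder on the host */
    cudaDeviceSynchronize();
    for (size_t j = 0; j < n; ++j) idx2[n - 1 - j] = idx1[j];

    /* a non-reversed axis stays asynchronous on the device instead:
     *   cudaMemcpyAsync(idx2, idx1, bytes, cudaMemcpyDeviceToDevice, 0); */
    printf("idx2[0]=%zu idx2[%zu]=%zu\n", idx2[0], n - 1, idx2[n - 1]);
    cudaFree(idx1);
    cudaFree(idx2);
    return 0;
}
```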
data/ext/cumo/narray/ndloop.c
CHANGED
@@ -164,7 +164,8 @@ print_ndloop(cumo_na_md_loop_t *lp) {
         printf("  &user.args[%d].iter[%d] = 0x%"SZF"x\n", j,i, (size_t)&lp->user.args[j].iter[i]);
         printf("  user.args[%d].iter[%d].pos = %"SZF"u\n", j,i, lp->user.args[j].iter[i].pos);
         printf("  user.args[%d].iter[%d].step = %"SZF"u\n", j,i, lp->user.args[j].iter[i].step);
-        printf("  user.args[%d].iter[%d].idx = 0x%"SZF"x\n", j,i, (size_t)lp->user.args[j].iter[i].idx);
+        printf("  user.args[%d].iter[%d].idx = 0x%"SZF"x (cuda:%d)\n", j,i, (size_t)lp->user.args[j].iter[i].idx, cumo_cuda_runtime_is_device_memory(lp->user.args[j].iter[i].idx));
+        // printf("  user.args[%d].iter[%d].idx = 0x%"SZF"x\n", j,i, (size_t)lp->user.args[j].iter[i].idx);
     }
 }
 //
@@ -174,7 +175,8 @@ print_ndloop(cumo_na_md_loop_t *lp) {
         printf("  &xargs[%d].iter[%d] = 0x%"SZF"x\n", j,i, (size_t)&LITER(lp,i,j));
         printf("  xargs[%d].iter[%d].pos = %"SZF"u\n", j,i, LITER(lp,i,j).pos);
         printf("  xargs[%d].iter[%d].step = %"SZF"u\n", j,i, LITER(lp,i,j).step);
-        printf("  xargs[%d].iter[%d].idx = 0x%"SZF"x\n", j,i, (size_t)LITER(lp,i,j).idx);
+        printf("  xargs[%d].iter[%d].idx = 0x%"SZF"x (cuda:%d)\n", j,i, (size_t)LITER(lp,i,j).idx, cumo_cuda_runtime_is_device_memory(LITER(lp,i,j).idx));
+        // printf("  xargs[%d].iter[%d].idx = 0x%"SZF"x\n", j,i, (size_t)LITER(lp,i,j).idx);
     }
     printf("  xargs[%d].bufcp = 0x%"SZF"x\n", j, (size_t)lp->xargs[j].bufcp);
     if (lp->xargs[j].bufcp) {
@@ -1489,6 +1491,8 @@ loop_narray(cumo_ndfunc_t *nf, cumo_na_md_loop_t *lp)
         // j-th argument
         for (j=0; j<lp->narg; j++) {
            if (LITER(lp,i,j).idx) {
+                CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("loop_narrayx", "any");
+                cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
                 LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).idx[c[i]];
             } else {
                 LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).step*c[i];
data/ext/cumo/narray/struct.c
CHANGED
@@ -76,7 +76,7 @@ void cumo_na_copy_array_structure(VALUE self, VALUE view);
 static VALUE
 cumo_na_make_view_struct(VALUE self, VALUE dtype, VALUE offset)
 {
-    size_t i, n;
+    size_t n;
     int j, k, ndim;
     size_t *shape;
     size_t *idx1, *idx2;
@@ -147,10 +147,12 @@ cumo_na_make_view_struct(VALUE self, VALUE dtype, VALUE offset)
         if (CUMO_SDX_IS_INDEX(na1->stridx[j])) {
             n = na1->base.shape[j];
             idx1 = CUMO_SDX_GET_INDEX(na1->stridx[j]);
-            idx2 = ALLOC_N(size_t, na1->base.shape[j]);
-            for (i=0; i<n; i++) {
-                idx2[i] = idx1[i];
-            }
+            // idx2 = ALLOC_N(size_t, na1->base.shape[j]);
+            // for (i=0; i<n; i++) {
+            //     idx2[i] = idx1[i];
+            // }
+            idx2 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*n);
+            cumo_cuda_runtime_check_status(cudaMemcpyAsync(idx2,idx1,sizeof(size_t)*n,cudaMemcpyDeviceToDevice,0));
             CUMO_SDX_SET_INDEX(na2->stridx[j],idx2);
         } else {
             na2->stridx[j] = na1->stridx[j];
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: cumo
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.2.0
 platform: ruby
 authors:
 - Naotoshi Seo
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-11-
+date: 2018-11-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray