cumo 0.1.2 → 0.2.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 3a00113b29c4cd47082953a1327d8b5c30c29543d42d7b9c3de1fb81a06d44c3
- data.tar.gz: 78014e61144693436adc2c822b40f4520615cbeda8a7776600a3cc53cbe39474
+ metadata.gz: 4689762e94e91b3f359190225d9841fa8af8fb7eeff2aef028a0702b00f0787f
+ data.tar.gz: 0b3bf965295354246b19c8aaee26eb454d0a54f7a3ec4b01cf893688ce7fd19b
  SHA512:
- metadata.gz: e93d0fb045838fd34a047488a287dfc6c777106255b7e0783e56148967f0c773c4e14a6f9590360f2521b277cdb46cd71bdbe307bd0cf68456942e8e26449a7c
- data.tar.gz: cab27fb6523bc0362b5b5bb8e144dbed26c7636bfa7adc8ba69be3f9ad408dd00c2b2a28f59a750108856bb50986f8a398adf0844cfed8150f79475002da56c3
+ metadata.gz: 0313c37d6a9b19ae026f3831d2d92b160213a3d8a29a8b7bc63c8ef0123e0b5581e6d935c0de2ce9c36c2850820c7afee9fac19e40d6ebdeb6c983766c639ea3
+ data.tar.gz: 7595ee6b049e4a8697840bf2502db04e93fca86c3604e3bb2ed0505ed0f5c5647148cc1ee0e0cbc71ea63ed9e1592330a9a63c444c41791d9a75c2f4de2396d7
@@ -10,8 +10,8 @@ extern "C" {
  #endif
  #endif
 
- #define CUMO_VERSION "0.1.2"
- #define CUMO_VERSION_CODE 12
+ #define CUMO_VERSION "0.2.0"
+ #define CUMO_VERSION_CODE 20
 
  bool cumo_compatible_mode_enabled_p();
  bool cumo_show_warning_enabled_p();
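
Note: the version string and CUMO_VERSION_CODE move together. A hedged sketch of the apparent encoding (an inference from 0.1.2 → 12 and 0.2.0 → 20, not something the header states):

    /* Hypothetical macro; assumes code = major*100 + minor*10 + patch. */
    #define CUMO_MAKE_VERSION_CODE(major, minor, patch) ((major)*100 + (minor)*10 + (patch))
    /* CUMO_MAKE_VERSION_CODE(0, 1, 2) == 12, CUMO_MAKE_VERSION_CODE(0, 2, 0) == 20 */
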
@@ -41,7 +41,9 @@ static inline bool
  cumo_cuda_runtime_is_device_memory(void* ptr)
  {
      struct cudaPointerAttributes attrs;
-     cudaError_t status = cudaPointerGetAttributes(&attrs, ptr);
+     cudaError_t status;
+     if (!ptr) { return false; }
+     status = cudaPointerGetAttributes(&attrs, ptr);
      cudaGetLastError(); // reset last error to success
      return (status != cudaErrorInvalidValue);
  }
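
Note: the added NULL guard returns false before touching the CUDA runtime, so callers may probe any pointer, including NULL. A hedged usage sketch (the buffers and asserts are illustrative, not library code):

    #include <assert.h>
    #include <cuda_runtime.h>

    void probe_example(void)
    {
        char host_buf[16];
        void *dev_buf = NULL;
        cudaMalloc(&dev_buf, 16);
        assert(!cumo_cuda_runtime_is_device_memory(NULL));     /* early return, no API call */
        assert(!cumo_cuda_runtime_is_device_memory(host_buf)); /* host memory */
        assert(cumo_cuda_runtime_is_device_memory(dev_buf));   /* device allocation */
        cudaFree(dev_buf);
    }
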
@@ -427,10 +427,26 @@ _cumo_na_get_narray_t(VALUE obj, unsigned char cumo_na_type)
 
  #define CUMO_DEBUG_PRINT(v) puts(StringValueCStr(rb_funcall(v,rb_intern("inspect"),0)))
 
- #define CUMO_NA_CumoIsNArray(obj) \
-     (rb_obj_is_kind_of(obj,cNArray)==Qtrue)
- #define CUMO_NA_IsArray(obj) \
-     (TYPE(obj)==T_ARRAY || rb_obj_is_kind_of(obj,cNArray)==Qtrue)
+ #define CUMO_NA_CumoIsNArray(obj) (rb_obj_is_kind_of(obj,cNArray)==Qtrue)
+ #define CUMO_NA_IsArray(obj) (TYPE(obj)==T_ARRAY || rb_obj_is_kind_of(obj,cNArray)==Qtrue)
+
+ static inline bool
+ cumo_na_has_idx_p(VALUE obj)
+ {
+     cumo_narray_t *na;
+     cumo_narray_view_t *nv;
+     int i = 0;
+     CumoGetNArray(obj, na);
+     if (CUMO_NA_TYPE(na) == CUMO_NARRAY_VIEW_T) {
+         CumoGetNArrayView(obj, nv);
+         for (; i < nv->base.ndim; ++i) {
+             if (nv->stridx[i].index) {
+                 return true;
+             }
+         }
+     }
+     return false;
+ }
 
  #define CUMO_NUM2REAL(v)  NUM2DBL( rb_funcall((v),cumo_na_id_real,0) )
  #define CUMO_NUM2IMAG(v)  NUM2DBL( rb_funcall((v),cumo_na_id_imag,0) )
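
Note: cumo_na_has_idx_p answers one question: does any dimension of a view address its data through an explicit index array (the result of fancy indexing) rather than a plain stride? A simplified sketch of the idea, with the tagged stridx type reduced to a plain struct (these types are illustrative, not the narray.h definitions):

    #include <stdbool.h>
    #include <stddef.h>

    typedef struct {
        size_t    *index;  /* non-NULL: dimension is addressed through an index array */
        ptrdiff_t  stride; /* used when index == NULL */
    } stridx_sketch_t;

    static bool has_idx_sketch(const stridx_sketch_t *stridx, int ndim)
    {
        for (int i = 0; i < ndim; ++i) {
            if (stridx[i].index) return true;  /* one indexed dimension is enough */
        }
        return false;
    }
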
@@ -33,7 +33,7 @@ __global__ static void reduction_kernel(cumo_na_reduction_arg_t arg, int out_blo
      cumo_na_indexer_t& in_indexer = arg.in_indexer;
      cumo_na_indexer_t& out_indexer = arg.out_indexer;
 
-     using TypeReduce = decltype(impl.Identity());
+     using TypeReduce = decltype(impl.Identity(0));
 
      extern __shared__ __align__(8) char sdata_raw[];
      TypeReduce* sdata = reinterpret_cast<TypeReduce*>(sdata_raw);
@@ -48,14 +48,17 @@ __global__ static void reduction_kernel(cumo_na_reduction_arg_t arg, int out_blo
 
      for (int64_t i_out = out_base + out_offset; i_out < out_indexer.total_size; i_out += out_stride) {
          cumo_na_indexer_set_dim(&out_indexer, i_out);
-         TypeReduce accum = impl.Identity();
-
          int64_t i_in = i_out * reduce_indexer_total_size + reduce_offset;
+
+         // Note that the spec of (min|max)_index in Cumo differs from arg(min|max) in CuPy:
+         // Cumo returns an index into the input elements, CuPy returns an index along the reduction axis.
+         cumo_na_indexer_set_dim(&in_indexer, i_in);
+         TypeIn* in_ptr = reinterpret_cast<TypeIn*>(cumo_na_iarray_at_dim(&in_iarray, &in_indexer));
+         TypeReduce accum = impl.Identity(in_ptr - reinterpret_cast<TypeIn*>(in_iarray.ptr));
+
          for (int64_t i_reduce = reduce_offset; i_reduce < reduce_indexer_total_size; i_reduce += reduce_block_size, i_in += reduce_block_size) {
              cumo_na_indexer_set_dim(&in_indexer, i_in);
-             TypeIn* in_ptr = reinterpret_cast<TypeIn*>(cumo_na_iarray_at_dim(&in_iarray, &in_indexer));
-             // Note that the spec of (min|max)_index in Cumo differs from arg(min|max) in CuPy:
-             // Cumo returns an index into the input elements, CuPy returns an index along the reduction axis.
+             in_ptr = reinterpret_cast<TypeIn*>(cumo_na_iarray_at_dim(&in_iarray, &in_indexer));
              impl.Reduce(impl.MapIn(*in_ptr, in_ptr - reinterpret_cast<TypeIn*>(in_iarray.ptr)), accum);
              //printf("threadId.x:%d blockIdx.x:%d blockDim.x:%d gridDim.x:%d accum:%d i_in:%ld i_reduce:%ld i_out:%ld in:%p(%d)\n", threadIdx.x, blockIdx.x, blockDim.x, gridDim.x, accum, i_in, i_reduce, i_out, in_ptr, *in_ptr);
          }
@@ -102,7 +105,7 @@ void cumo_reduce(cumo_na_reduction_arg_t arg, ReductionImpl&& impl) {
 
      int64_t block_size = cumo_detail::max_block_size;
      int64_t grid_size = std::min(cumo_detail::max_grid_size, out_block_num);
-     int64_t shared_mem_size = sizeof(decltype(impl.Identity())) * block_size;
+     int64_t shared_mem_size = sizeof(decltype(impl.Identity(0))) * block_size;
 
      cumo_detail::reduction_kernel<TypeIn,TypeOut,ReductionImpl><<<grid_size, block_size, shared_mem_size>>>(arg, out_block_size, reduce_block_size, impl);
  }
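
Note: with this change a reduction functor's Identity takes the flat input index, so arg-reductions can seed their accumulator with a real element index instead of a hard-coded 0. A minimal functor conforming to the new protocol (a sketch; the kernel machinery around it is elided):

    struct sum_sketch {
        __device__ float Identity(int64_t /*index*/) { return 0.0f; }        // seed; index unused for plain sums
        __device__ float MapIn(float in, int64_t /*index*/) { return in; }   // per-element transform
        __device__ void  Reduce(float next, float& accum) { accum += next; } // binary combine
        __device__ float MapOut(float accum) { return accum; }               // final transform
    };
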
@@ -1,4 +1,7 @@
  #include <ruby.h>
+ #include "cumo.h"
+ #include "cumo/cuda/memory_pool.h"
+ #include "cumo/cuda/runtime.h"
  #include "cumo/narray.h"
  #include "cumo/template.h"
 
@@ -56,7 +59,8 @@ iter_copy_bytes(cumo_na_loop_t *const lp)
  {
      size_t e;
      e = lp->args[0].elmsz;
-     // TODO(sonots): CUDA kernelize
+     CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("iter_copy_bytes", "any");
+     cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
      LOOP_UNARY_PTR(lp,m_memcpy);
  }
 
@@ -99,6 +103,8 @@ iter_swap_byte(cumo_na_loop_t *const lp)
      e = lp->args[0].elmsz;
      b1 = ALLOCA_N(char, e);
      b2 = ALLOCA_N(char, e);
+     CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("iter_swap_bytes", "any");
+     cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
      LOOP_UNARY_PTR(lp,m_swap_byte);
  }
 
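
Note: both hunks apply the same stop-gap flagged by the FIXME warning: drain the GPU with cudaDeviceSynchronize(), then run the byte-wise loop on the CPU. In outline (a sketch of the pattern, assuming the buffers are host-accessible, e.g. managed memory):

    /* Drain all pending GPU work, then the host may touch the data safely. */
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
    /* ... host-side loop (memcpy / byte swap) over the now-quiescent buffers ... */
    /* A proper fix would replace the host loop with a CUDA kernel. */
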
@@ -489,10 +495,12 @@ cumo_na_flatten_dim(VALUE self, int sd)
      for (i=0; i<sd; i++) {
          if (CUMO_SDX_IS_INDEX(na1->stridx[i])) {
              idx1 = CUMO_SDX_GET_INDEX(na1->stridx[i]);
-             idx2 = ALLOC_N(size_t, shape[i]);
-             for (j=0; j<shape[i]; j++) {
-                 idx2[j] = idx1[j];
-             }
+             // idx2 = ALLOC_N(size_t, shape[i]);
+             // for (j=0; j<shape[i]; j++) {
+             //     idx2[j] = idx1[j];
+             // }
+             idx2 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*shape[i]);
+             cumo_cuda_runtime_check_status(cudaMemcpyAsync(idx2,idx1,sizeof(size_t)*shape[i],cudaMemcpyDeviceToDevice,0));
              CUMO_SDX_SET_INDEX(na2->stridx[i],idx2);
          } else {
              na2->stridx[i] = na1->stridx[i];
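
Note: this is the recurring rewrite of the release, repeated below in na_diagonal, na_make_view, na_reverse, and na_make_view_struct: an element-by-element host copy of an index array becomes a pool allocation plus an asynchronous device-to-device copy, so index arrays now live on the GPU. Reduced to its essentials (a sketch using the helpers visible in the diff):

    /* before: idx2 = ALLOC_N(size_t, n); for (j=0; j<n; j++) idx2[j] = idx1[j]; */
    size_t *copy_index_to_device(const size_t *idx1, size_t n)
    {
        size_t *idx2 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*n);
        cumo_cuda_runtime_check_status(
            cudaMemcpyAsync(idx2, idx1, sizeof(size_t)*n, cudaMemcpyDeviceToDevice, 0));
        return idx2;  /* ordered on stream 0; no host synchronization required */
    }
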
@@ -505,7 +513,8 @@ cumo_na_flatten_dim(VALUE self, int sd)
          na2->stridx[sd] = na1->stridx[nd-1];
      } else {
          // set index
-         idx2 = ALLOC_N(size_t, shape[sd]);
+         // idx2 = ALLOC_N(size_t, shape[sd]);
+         idx2 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*shape[sd]);
          CUMO_SDX_SET_INDEX(na2->stridx[sd],idx2);
          // init for md-loop
          fd = nd-sd;
@@ -514,6 +523,8 @@ cumo_na_flatten_dim(VALUE self, int sd)
          pos = ALLOC_N(size_t, fd+1);
          pos[0] = 0;
          // md-loop
+         CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_flatten_dim", "any");
+         cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
          for (i=j=0;;) {
              for (; i<fd; i++) {
                  sdx = na1->stridx[i+sd];
@@ -726,10 +737,12 @@ cumo_na_diagonal(int argc, VALUE *argv, VALUE self)
      if (i != ax[0] && i != ax[1]) {
          if (CUMO_SDX_IS_INDEX(na1->stridx[i])) {
              idx0 = CUMO_SDX_GET_INDEX(na1->stridx[i]);
-             idx1 = ALLOC_N(size_t, na->shape[i]);
-             for (j=0; j<na->shape[i]; j++) {
-                 idx1[j] = idx0[j];
-             }
+             // idx1 = ALLOC_N(size_t, na->shape[i]);
+             // for (j=0; j<na->shape[i]; j++) {
+             //     idx1[j] = idx0[j];
+             // }
+             idx1 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*na->shape[i]);
+             cumo_cuda_runtime_check_status(cudaMemcpyAsync(idx1,idx0,sizeof(size_t)*na->shape[i],cudaMemcpyDeviceToDevice,0));
              CUMO_SDX_SET_INDEX(na2->stridx[k],idx1);
          } else {
              na2->stridx[k] = na1->stridx[i];
@@ -739,7 +752,12 @@ cumo_na_diagonal(int argc, VALUE *argv, VALUE self)
      }
      if (CUMO_SDX_IS_INDEX(na1->stridx[ax[0]])) {
          idx0 = CUMO_SDX_GET_INDEX(na1->stridx[ax[0]]);
-         diag_idx = ALLOC_N(size_t, diag_size);
+         // diag_idx = ALLOC_N(size_t, diag_size);
+         diag_idx = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*diag_size);
+
+         CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_diagonal", "any");
+         cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
          if (CUMO_SDX_IS_INDEX(na1->stridx[ax[1]])) {
              idx1 = CUMO_SDX_GET_INDEX(na1->stridx[ax[1]]);
              for (j=0; j<diag_size; j++) {
@@ -756,7 +774,12 @@ cumo_na_diagonal(int argc, VALUE *argv, VALUE self)
      stride0 = CUMO_SDX_GET_STRIDE(na1->stridx[ax[0]]);
      if (CUMO_SDX_IS_INDEX(na1->stridx[ax[1]])) {
          idx1 = CUMO_SDX_GET_INDEX(na1->stridx[ax[1]]);
-         diag_idx = ALLOC_N(size_t, diag_size);
+         // diag_idx = ALLOC_N(size_t, diag_size);
+         diag_idx = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*diag_size);
+
+         CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_diagonal", "any");
+         cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
          for (j=0; j<diag_size; j++) {
              diag_idx[j] = stride0*(j+k0) + idx1[j+k1];
          }
@@ -77,7 +77,12 @@ static VALUE
  <% else %>
      reduce = cumo_na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
  <% end %>
-     v = cumo_na_ndloop(&ndf, 2, self, reduce);
+     if (cumo_na_has_idx_p(self)) {
+         VALUE copy = cumo_na_copy(self); // reduction does not support idx, make contiguous
+         v = cumo_na_ndloop(&ndf, 2, copy, reduce);
+     } else {
+         v = cumo_na_ndloop(&ndf, 2, self, reduce);
+     }
  <% if result_class == "cT" %>
      return <%=type_name%>_extract(v);
  <% else %>
@@ -113,7 +113,12 @@ static VALUE
  <% end %>
      }
 
-     return cumo_na_ndloop(&ndf, 2, self, reduce);
+     if (cumo_na_has_idx_p(self)) {
+         VALUE copy = cumo_na_copy(self); // reduction does not support idx, make contiguous
+         return cumo_na_ndloop(&ndf, 2, copy, reduce);
+     } else {
+         return cumo_na_ndloop(&ndf, 2, self, reduce);
+     }
  }
  <% end %>
  }
@@ -17,7 +17,7 @@ struct cumo_<%=type_name%>_min_index_int<%=i%>_impl {
          dtype min;
          idx_t argmin;
      };
-     __device__ MinAndArgMin Identity() { return {DATA_MAX, 0}; }
+     __device__ MinAndArgMin Identity(idx_t index) { return {DATA_MAX, index}; }
      __device__ MinAndArgMin MapIn(dtype in, idx_t index) { return {in, index}; }
      __device__ void Reduce(MinAndArgMin next, MinAndArgMin& accum) {
          if (accum.min > next.min) {
@@ -32,7 +32,7 @@ struct cumo_<%=type_name%>_max_index_int<%=i%>_impl {
          dtype max;
          idx_t argmax;
      };
-     __device__ MaxAndArgMax Identity() { return {DATA_MIN, 0}; }
+     __device__ MaxAndArgMax Identity(idx_t index) { return {DATA_MIN, index}; }
      __device__ MaxAndArgMax MapIn(dtype in, idx_t index) { return {in, index}; }
      __device__ void Reduce(MaxAndArgMax next, MaxAndArgMax& accum) {
          if (accum.max < next.max) {
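
Note: seeding with a real index matters here. With the old Identity() returning {DATA_MAX, 0}, an input that never improves on the seed (e.g. all elements equal to DATA_MAX) reports argmin 0 even when index 0 is not part of the reduced view; the kernel now passes in the first element's flat offset. A host-side analogue, for illustration only:

    #include <climits>
    #include <cstdint>

    struct MinAndArgMin { int min; int64_t argmin; };

    MinAndArgMin identity_old()            { return {INT_MAX, 0};   }  // can fabricate index 0
    MinAndArgMin identity_new(int64_t idx) { return {INT_MAX, idx}; }  // always a real input index
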
@@ -6,14 +6,14 @@
  #endif
 
  struct cumo_<%=type_name%>_sum_impl {
-     __device__ <%=dtype%> Identity() { return m_zero; }
+     __device__ <%=dtype%> Identity(int64_t /*index*/) { return m_zero; }
      __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
      __device__ void Reduce(dtype next, <%=dtype%>& accum) { accum = m_add(next, accum); }
      __device__ <%=dtype%> MapOut(<%=dtype%> accum) { return accum; }
  };
 
  struct cumo_<%=type_name%>_prod_impl {
-     __device__ <%=dtype%> Identity() { return m_one; }
+     __device__ <%=dtype%> Identity(int64_t /*index*/) { return m_one; }
      __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
      __device__ void Reduce(dtype next, <%=dtype%>& accum) { accum = m_mul(next, accum); }
      __device__ <%=dtype%> MapOut(<%=dtype%> accum) { return accum; }
@@ -6,28 +6,28 @@
  #endif
 
  struct cumo_<%=type_name%>_sum_impl {
-     __device__ <%=dtype%> Identity() { return m_zero; }
+     __device__ <%=dtype%> Identity(int64_t /*index*/) { return m_zero; }
      __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
      __device__ void Reduce(dtype next, <%=dtype%>& accum) { accum += next; }
      __device__ <%=dtype%> MapOut(<%=dtype%> accum) { return accum; }
  };
 
  struct cumo_<%=type_name%>_prod_impl {
-     __device__ <%=dtype%> Identity() { return m_one; }
+     __device__ <%=dtype%> Identity(int64_t /*index*/) { return m_one; }
      __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
      __device__ void Reduce(dtype next, <%=dtype%>& accum) { accum *= next; }
      __device__ <%=dtype%> MapOut(<%=dtype%> accum) { return accum; }
  };
 
  struct cumo_<%=type_name%>_min_impl {
-     __device__ dtype Identity() { return DATA_MAX; }
+     __device__ dtype Identity(int64_t /*index*/) { return DATA_MAX; }
      __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
      __device__ void Reduce(dtype next, dtype& accum) { accum = next < accum ? next : accum; }
      __device__ dtype MapOut(dtype accum) { return accum; }
  };
 
  struct cumo_<%=type_name%>_max_impl {
-     __device__ dtype Identity() { return DATA_MIN; }
+     __device__ dtype Identity(int64_t /*index*/) { return DATA_MIN; }
      __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
      __device__ void Reduce(dtype next, dtype& accum) { accum = next < accum ? accum : next; }
      __device__ dtype MapOut(dtype accum) { return accum; }
@@ -3,6 +3,7 @@
  #include "cumo.h"
  #include "cumo/narray.h"
  #include "cumo/cuda/runtime.h"
+ #include "cumo/cuda/memory_pool.h"
  #include "cumo/template.h"
 
  #if SIZEOF_VOIDP == 8
@@ -52,7 +53,8 @@ print_index_arg(cumo_na_index_arg_t *q, int n)
      printf(" q[%d].n=%"SZF"d\n",i,q[i].n);
      printf(" q[%d].beg=%"SZF"d\n",i,q[i].beg);
      printf(" q[%d].step=%"SZF"d\n",i,q[i].step);
-     printf(" q[%d].idx=0x%"SZF"x\n",i,(size_t)q[i].idx);
+     printf(" q[%d].idx=0x%"SZF"x (cuda:%d)\n",i,(size_t)q[i].idx, cumo_cuda_runtime_is_device_memory(q[i].idx));
+     // printf(" q[%d].idx=0x%"SZF"x\n",i,(size_t)q[i].idx);
      printf(" q[%d].reduce=0x%x\n",i,q[i].reduce);
      printf(" q[%d].orig_dim=%d\n",i,q[i].orig_dim);
  }
@@ -121,15 +123,38 @@ cumo_na_range_check(ssize_t pos, ssize_t size, int dim)
      return idx;
  }
 
+ static void CUDART_CB
+ cumo_na_parse_array_callback(cudaStream_t stream, cudaError_t status, void *data)
+ {
+     cudaFreeHost(data);
+ }
+
+ // copy ruby array to idx
  static void
  cumo_na_parse_array(VALUE ary, int orig_dim, ssize_t size, cumo_na_index_arg_t *q)
  {
      int k;
+     size_t* idx;
+     cudaError_t status;
      int n = RARRAY_LEN(ary);
-     q->idx = ALLOC_N(size_t, n);
+     //q->idx = ALLOC_N(size_t, n);
+     //for (k=0; k<n; k++) {
+     //    q->idx[k] = na_range_check(NUM2SSIZET(RARRAY_AREF(ary,k)), size, orig_dim);
+     //}
+     // stage in contiguous pinned host memory => async copy to device => release the pinned memory from a stream callback once the copy finishes
+     q->idx = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*n);
+     cudaHostAlloc((void**)&idx, sizeof(size_t)*n, cudaHostAllocDefault);
      for (k=0; k<n; k++) {
-         q->idx[k] = cumo_na_range_check(NUM2SSIZET(RARRAY_AREF(ary,k)), size, orig_dim);
+         idx[k] = cumo_na_range_check(NUM2SSIZET(RARRAY_AREF(ary,k)), size, orig_dim);
+     }
+     status = cudaMemcpyAsync(q->idx,idx,sizeof(size_t)*n,cudaMemcpyHostToDevice,0);
+     if (status == 0) {
+         cumo_cuda_runtime_check_status(cudaStreamAddCallback(0,cumo_na_parse_array_callback,idx,0));
+     } else {
+         cudaFreeHost(idx);
      }
+     cumo_cuda_runtime_check_status(status);
+
      q->n = n;
      q->beg = 0;
      q->step = 1;
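
Note: the new cumo_na_parse_array path stages the Ruby indices in pinned host memory, enqueues an asynchronous host-to-device copy, and frees the pinned buffer from a stream callback once the copy has drained, so the calling thread never blocks on the transfer. A standalone sketch of the pattern (names illustrative, error handling trimmed):

    #include <cuda_runtime.h>

    static void CUDART_CB free_pinned_cb(cudaStream_t stream, cudaError_t status, void *data)
    {
        cudaFreeHost(data);  /* runs after all earlier work on the stream, incl. the copy */
    }

    size_t *upload_indices(const size_t *values, size_t n)
    {
        size_t *dev, *pinned;
        cudaMalloc((void**)&dev, sizeof(size_t)*n);
        cudaHostAlloc((void**)&pinned, sizeof(size_t)*n, cudaHostAllocDefault); /* pinned => truly async copy */
        for (size_t k = 0; k < n; ++k) pinned[k] = values[k];
        cudaMemcpyAsync(dev, pinned, sizeof(size_t)*n, cudaMemcpyHostToDevice, 0);
        cudaStreamAddCallback(0, free_pinned_cb, pinned, 0);  /* deferred cudaFreeHost */
        return dev;  /* host continues without waiting for the transfer */
    }

One caveat worth noting: the CUDA documentation says a cudaStreamAddCallback callback must not make CUDA API calls, so the cudaFreeHost-in-callback above (mirroring the diff) relies on behavior outside that guarantee.
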
@@ -137,13 +162,14 @@ cumo_na_parse_array(VALUE ary, int orig_dim, ssize_t size, cumo_na_index_arg_t *
      q->orig_dim = orig_dim;
  }
 
+ // copy narray to idx
  static void
  cumo_na_parse_narray_index(VALUE a, int orig_dim, ssize_t size, cumo_na_index_arg_t *q)
  {
      VALUE idx;
      cumo_narray_t *na;
      cumo_narray_data_t *nidx;
-     size_t k, n;
+     size_t n;
      ssize_t *nidxp;
 
      CumoGetNArray(a,na);
@@ -155,16 +181,14 @@ cumo_na_parse_narray_index(VALUE a, int orig_dim, ssize_t size, cumo_na_index_ar
      cumo_na_store(idx,a);
 
      CumoGetNArrayData(idx,nidx);
-     nidxp = (ssize_t*)nidx->ptr;
-     q->idx = ALLOC_N(size_t, n);
-
-     // nidxp is cuda memory (cuda narray)
-     CUMO_SHOW_SYNCHRONIZE_WARNING_ONCE("cumo_na_parse_narray_index", "any");
-     cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+     nidxp = (ssize_t*)nidx->ptr; // Cumo::NArray data resides on GPU
+     //q->idx = ALLOC_N(size_t, n);
+     //for (k=0; k<n; k++) {
+     //    q->idx[k] = na_range_check(nidxp[k], size, orig_dim);
+     //}
+     q->idx = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*n);
+     cumo_cuda_runtime_check_status(cudaMemcpyAsync(q->idx,nidxp,sizeof(size_t)*n,cudaMemcpyDeviceToDevice,0));
 
-     for (k=0; k<n; k++) {
-         q->idx[k] = cumo_na_range_check(nidxp[k], size, orig_dim);
-     }
      q->n = n;
      q->beg = 0;
      q->step = 1;
@@ -401,6 +425,9 @@ cumo_na_index_aref_nadata(cumo_narray_data_t *na1, cumo_narray_view_t *na2,
 
      // array index
      if (q[i].idx != NULL) {
+         CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_nadata", "any");
+         cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
          index = q[i].idx;
          CUMO_SDX_SET_INDEX(na2->stridx[j],index);
          q[i].idx = NULL;
@@ -456,6 +483,10 @@ cumo_na_index_aref_naview(cumo_narray_view_t *na1, cumo_narray_view_t *na2,
      // index <- index
      int k;
      size_t *index = q[i].idx;
+
+     CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_naview", "any");
+     cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
      CUMO_SDX_SET_INDEX(na2->stridx[j], index);
      q[i].idx = NULL;
 
@@ -467,6 +498,10 @@ cumo_na_index_aref_naview(cumo_narray_view_t *na1, cumo_narray_view_t *na2,
      // index <- step
      ssize_t stride1 = CUMO_SDX_GET_STRIDE(sdx1);
      size_t *index = q[i].idx;
+
+     CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_naview", "any");
+     cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
      CUMO_SDX_SET_INDEX(na2->stridx[j],index);
      q[i].idx = NULL;
 
@@ -494,8 +529,13 @@ cumo_na_index_aref_naview(cumo_narray_view_t *na1, cumo_narray_view_t *na2,
      int k;
      size_t beg = q[i].beg;
      ssize_t step = q[i].step;
-     size_t *index = ALLOC_N(size_t, size);
+     // size_t *index = ALLOC_N(size_t, size);
+     size_t *index = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*size);
      CUMO_SDX_SET_INDEX(na2->stridx[j],index);
+
+     CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_naview", "any");
+     cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
      for (k=0; k<size; k++) {
          index[k] = CUMO_SDX_GET_INDEX(sdx1)[beg+step*k];
      }
@@ -515,7 +555,6 @@ cumo_na_index_aref_naview(cumo_narray_view_t *na1, cumo_narray_view_t *na2,
      na2->base.size = total;
  }
 
-
  static int
  cumo_na_ndim_new_narray(int ndim, const cumo_na_index_arg_t *q)
  {
@@ -625,7 +664,7 @@ cumo_na_aref_md_ensure(VALUE data_value)
      cumo_na_aref_md_data_t *data = (cumo_na_aref_md_data_t*)(data_value);
      int i;
      for (i=0; i<data->ndim; i++) {
-         xfree(data->q[i].idx);
+         cumo_cuda_runtime_free((char*)(data->q[i].idx));
      }
      if (data->q) xfree(data->q);
      return Qnil;
@@ -168,12 +168,8 @@ cumo_na_view_free(void* ptr)
      if (na->stridx != NULL) {
          for (i=0; i<na->base.ndim; i++) {
              if (CUMO_SDX_IS_INDEX(na->stridx[i])) {
-                 void *p = CUMO_SDX_GET_INDEX(na->stridx[i]);
-                 if (cumo_cuda_runtime_is_device_memory(p)) {
-                     cumo_cuda_runtime_free(p);
-                 } else {
-                     xfree(p);
-                 }
+                 void *idx = CUMO_SDX_GET_INDEX(na->stridx[i]);
+                 cumo_cuda_runtime_free(idx);
              }
          }
          xfree(na->stridx);
@@ -880,7 +876,6 @@ VALUE
  cumo_na_make_view(VALUE self)
  {
      int i, nd;
-     size_t j;
      size_t *idx1, *idx2;
      ssize_t stride;
      cumo_narray_t *na;
@@ -914,10 +909,12 @@ cumo_na_make_view(VALUE self)
      for (i=0; i<nd; i++) {
          if (CUMO_SDX_IS_INDEX(na1->stridx[i])) {
              idx1 = CUMO_SDX_GET_INDEX(na1->stridx[i]);
-             idx2 = ALLOC_N(size_t,na1->base.shape[i]);
-             for (j=0; j<na1->base.shape[i]; j++) {
-                 idx2[j] = idx1[j];
-             }
+             // idx2 = ALLOC_N(size_t,na1->base.shape[i]);
+             // for (j=0; j<na1->base.shape[i]; j++) {
+             //     idx2[j] = idx1[j];
+             // }
+             idx2 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*na1->base.shape[i]);
+             cumo_cuda_runtime_check_status(cudaMemcpyAsync(idx2,idx1,sizeof(size_t)*na1->base.shape[i],cudaMemcpyDeviceToDevice,0));
              CUMO_SDX_SET_INDEX(na2->stridx[i],idx2);
          } else {
              na2->stridx[i] = na1->stridx[i];
@@ -947,8 +944,8 @@ static VALUE
  cumo_na_expand_dims(VALUE self, VALUE vdim)
  {
      int i, j, nd, dim;
-     size_t *shape, *cumo_na_shape;
-     cumo_stridx_t *stridx, *cumo_na_stridx;
+     size_t *shape, *na2_shape;
+     cumo_stridx_t *stridx, *na2_stridx;
      cumo_narray_t *na;
      cumo_narray_view_t *na2;
      VALUE view;
@@ -970,25 +967,25 @@ cumo_na_expand_dims(VALUE self, VALUE vdim)
 
      shape = ALLOC_N(size_t,nd+1);
      stridx = ALLOC_N(cumo_stridx_t,nd+1);
-     cumo_na_shape = na2->base.shape;
-     cumo_na_stridx = na2->stridx;
+     na2_shape = na2->base.shape;
+     na2_stridx = na2->stridx;
 
      for (i=j=0; i<=nd; i++) {
          if (i==dim) {
              shape[i] = 1;
              CUMO_SDX_SET_STRIDE(stridx[i],0);
          } else {
-             shape[i] = cumo_na_shape[j];
-             stridx[i] = cumo_na_stridx[j];
+             shape[i] = na2_shape[j];
+             stridx[i] = na2_stridx[j];
              j++;
          }
      }
 
      na2->stridx = stridx;
-     xfree(cumo_na_stridx);
+     xfree(na2_stridx);
      na2->base.shape = shape;
-     if (cumo_na_shape != &(na2->base.size)) {
-         xfree(cumo_na_shape);
+     if (na2_shape != &(na2->base.size)) {
+         xfree(na2_shape);
      }
      na2->base.ndim++;
      return view;
@@ -1054,15 +1051,25 @@ cumo_na_reverse(int argc, VALUE *argv, VALUE self)
      n = na1->base.shape[i];
      if (CUMO_SDX_IS_INDEX(na1->stridx[i])) {
          idx1 = CUMO_SDX_GET_INDEX(na1->stridx[i]);
-         idx2 = ALLOC_N(size_t,n);
+         // idx2 = ALLOC_N(size_t,n);
+         // if (cumo_na_test_reduce(reduce,i)) {
+         //     for (j=0; j<n; j++) {
+         //         idx2[n-1-j] = idx1[j];
+         //     }
+         // } else {
+         //     for (j=0; j<n; j++) {
+         //         idx2[j] = idx1[j];
+         //     }
+         // }
+         idx2 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*n);
          if (cumo_na_test_reduce(reduce,i)) {
+             CUMO_SHOW_SYNCHRONIZE_WARNING_ONCE("cumo_na_reverse", "any");
+             cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
              for (j=0; j<n; j++) {
                  idx2[n-1-j] = idx1[j];
              }
          } else {
-             for (j=0; j<n; j++) {
-                 idx2[j] = idx1[j];
-             }
+             cumo_cuda_runtime_check_status(cudaMemcpyAsync(idx2,idx1,sizeof(size_t)*n,cudaMemcpyDeviceToDevice,0));
          }
          CUMO_SDX_SET_INDEX(na2->stridx[i],idx2);
      } else {
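
Note: the reduce branch still reverses the index array on the host behind a full synchronize, while the non-reduce branch stays asynchronous. A device-side reverse along these lines could remove the synchronize; this is a hedged sketch of an alternative, not code from the release:

    __global__ void reverse_index_kernel(size_t *dst, const size_t *src, size_t n)
    {
        size_t j = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
        if (j < n) dst[n - 1 - j] = src[j];  /* same assignment as the host loop above */
    }
    /* launch: reverse_index_kernel<<<(n + 255) / 256, 256>>>(idx2, idx1, n); */
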
@@ -164,7 +164,8 @@ print_ndloop(cumo_na_md_loop_t *lp) {
      printf(" &user.args[%d].iter[%d] = 0x%"SZF"x\n", j,i, (size_t)&lp->user.args[j].iter[i]);
      printf(" user.args[%d].iter[%d].pos = %"SZF"u\n", j,i, lp->user.args[j].iter[i].pos);
      printf(" user.args[%d].iter[%d].step = %"SZF"u\n", j,i, lp->user.args[j].iter[i].step);
-     printf(" user.args[%d].iter[%d].idx = 0x%"SZF"x\n", j,i, (size_t)lp->user.args[j].iter[i].idx);
+     printf(" user.args[%d].iter[%d].idx = 0x%"SZF"x (cuda:%d)\n", j,i, (size_t)lp->user.args[j].iter[i].idx, cumo_cuda_runtime_is_device_memory(lp->user.args[j].iter[i].idx));
+     // printf(" user.args[%d].iter[%d].idx = 0x%"SZF"x\n", j,i, (size_t)lp->user.args[j].iter[i].idx);
  }
  }
  //
@@ -174,7 +175,8 @@ print_ndloop(cumo_na_md_loop_t *lp) {
      printf(" &xargs[%d].iter[%d] = 0x%"SZF"x\n", j,i, (size_t)&LITER(lp,i,j));
      printf(" xargs[%d].iter[%d].pos = %"SZF"u\n", j,i, LITER(lp,i,j).pos);
      printf(" xargs[%d].iter[%d].step = %"SZF"u\n", j,i, LITER(lp,i,j).step);
-     printf(" xargs[%d].iter[%d].idx = 0x%"SZF"x\n", j,i, (size_t)LITER(lp,i,j).idx);
+     printf(" xargs[%d].iter[%d].idx = 0x%"SZF"x (cuda:%d)\n", j,i, (size_t)LITER(lp,i,j).idx, cumo_cuda_runtime_is_device_memory(LITER(lp,i,j).idx));
+     // printf(" xargs[%d].iter[%d].idx = 0x%"SZF"x\n", j,i, (size_t)LITER(lp,i,j).idx);
  }
  printf(" xargs[%d].bufcp = 0x%"SZF"x\n", j, (size_t)lp->xargs[j].bufcp);
  if (lp->xargs[j].bufcp) {
@@ -1489,6 +1491,8 @@ loop_narray(cumo_ndfunc_t *nf, cumo_na_md_loop_t *lp)
      // j-th argument
      for (j=0; j<lp->narg; j++) {
          if (LITER(lp,i,j).idx) {
+             CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("loop_narray", "any");
+             cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
              LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).idx[c[i]];
          } else {
              LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).step*c[i];
@@ -76,7 +76,7 @@ void cumo_na_copy_array_structure(VALUE self, VALUE view);
  static VALUE
  cumo_na_make_view_struct(VALUE self, VALUE dtype, VALUE offset)
  {
-     size_t i, n;
+     size_t n;
      int j, k, ndim;
      size_t *shape;
      size_t *idx1, *idx2;
@@ -147,10 +147,12 @@ cumo_na_make_view_struct(VALUE self, VALUE dtype, VALUE offset)
      if (CUMO_SDX_IS_INDEX(na1->stridx[j])) {
          n = na1->base.shape[j];
          idx1 = CUMO_SDX_GET_INDEX(na1->stridx[j]);
-         idx2 = ALLOC_N(size_t, na1->base.shape[j]);
-         for (i=0; i<n; i++) {
-             idx2[i] = idx1[i];
-         }
+         // idx2 = ALLOC_N(size_t, na1->base.shape[j]);
+         // for (i=0; i<n; i++) {
+         //     idx2[i] = idx1[i];
+         // }
+         idx2 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*n);
+         cumo_cuda_runtime_check_status(cudaMemcpyAsync(idx2,idx1,sizeof(size_t)*n,cudaMemcpyDeviceToDevice,0));
          CUMO_SDX_SET_INDEX(na2->stridx[j],idx2);
      } else {
          na2->stridx[j] = na1->stridx[j];
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: cumo
  version: !ruby/object:Gem::Version
-   version: 0.1.2
+   version: 0.2.0
  platform: ruby
  authors:
  - Naotoshi Seo
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2018-11-11 00:00:00.000000000 Z
+ date: 2018-11-12 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: numo-narray