cumo 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 3a00113b29c4cd47082953a1327d8b5c30c29543d42d7b9c3de1fb81a06d44c3
- data.tar.gz: 78014e61144693436adc2c822b40f4520615cbeda8a7776600a3cc53cbe39474
+ metadata.gz: 4689762e94e91b3f359190225d9841fa8af8fb7eeff2aef028a0702b00f0787f
+ data.tar.gz: 0b3bf965295354246b19c8aaee26eb454d0a54f7a3ec4b01cf893688ce7fd19b
  SHA512:
- metadata.gz: e93d0fb045838fd34a047488a287dfc6c777106255b7e0783e56148967f0c773c4e14a6f9590360f2521b277cdb46cd71bdbe307bd0cf68456942e8e26449a7c
- data.tar.gz: cab27fb6523bc0362b5b5bb8e144dbed26c7636bfa7adc8ba69be3f9ad408dd00c2b2a28f59a750108856bb50986f8a398adf0844cfed8150f79475002da56c3
+ metadata.gz: 0313c37d6a9b19ae026f3831d2d92b160213a3d8a29a8b7bc63c8ef0123e0b5581e6d935c0de2ce9c36c2850820c7afee9fac19e40d6ebdeb6c983766c639ea3
+ data.tar.gz: 7595ee6b049e4a8697840bf2502db04e93fca86c3604e3bb2ed0505ed0f5c5647148cc1ee0e0cbc71ea63ed9e1592330a9a63c444c41791d9a75c2f4de2396d7
@@ -10,8 +10,8 @@ extern "C" {
  #endif
  #endif

- #define CUMO_VERSION "0.1.2"
- #define CUMO_VERSION_CODE 12
+ #define CUMO_VERSION "0.2.0"
+ #define CUMO_VERSION_CODE 20

  bool cumo_compatible_mode_enabled_p();
  bool cumo_show_warning_enabled_p();
@@ -41,7 +41,9 @@ static inline bool
  cumo_cuda_runtime_is_device_memory(void* ptr)
  {
  struct cudaPointerAttributes attrs;
- cudaError_t status = cudaPointerGetAttributes(&attrs, ptr);
+ cudaError_t status;
+ if (!ptr) { return false; }
+ status = cudaPointerGetAttributes(&attrs, ptr);
  cudaGetLastError(); // reset last error to success
  return (status != cudaErrorInvalidValue);
  }
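The early return added above avoids querying the CUDA runtime for a NULL pointer. A standalone sketch of the helper's behavior, assuming only the CUDA runtime API (the function and variable names below are illustrative, not cumo API):

    #include <cuda_runtime.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Mirrors the helper above: true only when ptr is device memory. */
    static bool is_device_memory(void* ptr)
    {
        struct cudaPointerAttributes attrs;
        cudaError_t status;
        if (!ptr) { return false; }   /* the new guard: NULL is never device memory */
        status = cudaPointerGetAttributes(&attrs, ptr);
        cudaGetLastError();           /* reset last error to success */
        return (status != cudaErrorInvalidValue);
    }

    int main(void)
    {
        void* dev_ptr = NULL;
        cudaMalloc(&dev_ptr, 16);
        printf("NULL   -> %d\n", is_device_memory(NULL));    /* 0, without touching the runtime */
        printf("device -> %d\n", is_device_memory(dev_ptr)); /* 1 */
        cudaFree(dev_ptr);
        return 0;
    }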
@@ -427,10 +427,26 @@ _cumo_na_get_narray_t(VALUE obj, unsigned char cumo_na_type)

  #define CUMO_DEBUG_PRINT(v) puts(StringValueCStr(rb_funcall(v,rb_intern("inspect"),0)))

- #define CUMO_NA_CumoIsNArray(obj) \
- (rb_obj_is_kind_of(obj,cNArray)==Qtrue)
- #define CUMO_NA_IsArray(obj) \
- (TYPE(obj)==T_ARRAY || rb_obj_is_kind_of(obj,cNArray)==Qtrue)
+ #define CUMO_NA_CumoIsNArray(obj) (rb_obj_is_kind_of(obj,cNArray)==Qtrue)
+ #define CUMO_NA_IsArray(obj) (TYPE(obj)==T_ARRAY || rb_obj_is_kind_of(obj,cNArray)==Qtrue)
+
+ static inline bool
+ cumo_na_has_idx_p(VALUE obj)
+ {
+ cumo_narray_t *na;
+ cumo_narray_view_t *nv;
+ int i = 0;
+ CumoGetNArray(obj, na);
+ if (CUMO_NA_TYPE(na) == CUMO_NARRAY_VIEW_T) {
+ CumoGetNArrayView(obj, nv);
+ for (; i < nv->base.ndim; ++i) {
+ if (nv->stridx[i].index) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }

  #define CUMO_NUM2REAL(v) NUM2DBL( rb_funcall((v),cumo_na_id_real,0) )
  #define CUMO_NUM2IMAG(v) NUM2DBL( rb_funcall((v),cumo_na_id_imag,0) )
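The new cumo_na_has_idx_p helper returns true when any dimension of a view is addressed through an index array (stridx[i].index) rather than a plain stride. A hedged sketch of the intended call pattern, mirroring the generated reduction code changed later in this diff; reduce_example is a hypothetical wrapper, while cumo_na_copy and cumo_na_ndloop are the existing cumo entry points used there:

    /* Hypothetical caller: the reduction kernels do not handle idx views,
     * so make a contiguous copy first when any dimension carries an index. */
    static VALUE
    reduce_example(cumo_ndfunc_t *ndf, VALUE self, VALUE reduce)
    {
        if (cumo_na_has_idx_p(self)) {
            VALUE copy = cumo_na_copy(self);      /* contiguous copy without index arrays */
            return cumo_na_ndloop(ndf, 2, copy, reduce);
        }
        return cumo_na_ndloop(ndf, 2, self, reduce);
    }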
@@ -33,7 +33,7 @@ __global__ static void reduction_kernel(cumo_na_reduction_arg_t arg, int out_blo
  cumo_na_indexer_t& in_indexer = arg.in_indexer;
  cumo_na_indexer_t& out_indexer = arg.out_indexer;

- using TypeReduce = decltype(impl.Identity());
+ using TypeReduce = decltype(impl.Identity(0));

  extern __shared__ __align__(8) char sdata_raw[];
  TypeReduce* sdata = reinterpret_cast<TypeReduce*>(sdata_raw);
@@ -48,14 +48,17 @@ __global__ static void reduction_kernel(cumo_na_reduction_arg_t arg, int out_blo

  for (int64_t i_out = out_base + out_offset; i_out < out_indexer.total_size; i_out += out_stride) {
  cumo_na_indexer_set_dim(&out_indexer, i_out);
- TypeReduce accum = impl.Identity();
-
  int64_t i_in = i_out * reduce_indexer_total_size + reduce_offset;
+
+ // Note that spec of (min|max)_index of cumo is different with arg(min|max) of cupy.
+ // Cumo returns index of input elements, CuPy returns index of reduction axis.
+ cumo_na_indexer_set_dim(&in_indexer, i_in);
+ TypeIn* in_ptr = reinterpret_cast<TypeIn*>(cumo_na_iarray_at_dim(&in_iarray, &in_indexer));
+ TypeReduce accum = impl.Identity(in_ptr - reinterpret_cast<TypeIn*>(in_iarray.ptr));
+
  for (int64_t i_reduce = reduce_offset; i_reduce < reduce_indexer_total_size; i_reduce += reduce_block_size, i_in += reduce_block_size) {
  cumo_na_indexer_set_dim(&in_indexer, i_in);
- TypeIn* in_ptr = reinterpret_cast<TypeIn*>(cumo_na_iarray_at_dim(&in_iarray, &in_indexer));
- // Note that spec of (min|max)_index of cumo is different with arg(min|max) of cupy.
- // Cumo returns index of input elements, CuPy returns index of reduction axis.
+ in_ptr = reinterpret_cast<TypeIn*>(cumo_na_iarray_at_dim(&in_iarray, &in_indexer));
  impl.Reduce(impl.MapIn(*in_ptr, in_ptr - reinterpret_cast<TypeIn*>(in_iarray.ptr)), accum);
  //printf("threadId.x:%d blockIdx.x:%d blockDim.x:%d gridDim.x:%d accum:%d i_in:%ld i_reduce:%ld i_out:%ld in:%p(%d)\n", threadIdx.x, blockIdx.x, blockDim.x, gridDim.x, accum, i_in, i_reduce, i_out, in_ptr, *in_ptr);
  }
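The kernel now seeds the accumulator with Identity(flat index of the first element the thread reads), so argmin/argmax reductions start from an index that actually belongs to the reduced slice instead of always 0. A host-side C sketch of that seeded fold; MinAndArgMin, identity, reduce and the 1e300 stand-in for DATA_MAX are illustrative, not cumo API:

    #include <stdint.h>
    #include <stdio.h>

    /* Loosely mirrors the generated min_index impl structs shown further below. */
    typedef struct { double min; int64_t argmin; } MinAndArgMin;

    static MinAndArgMin identity(int64_t index) { MinAndArgMin v = {1e300, index}; return v; }
    static void reduce(MinAndArgMin next, MinAndArgMin *accum) {
        if (accum->min > next.min) { *accum = next; }
    }

    int main(void)
    {
        const double in[] = {3.0, 1.0, 2.0};
        int64_t first = 1;                          /* flat index of the first element this "thread" reads */
        MinAndArgMin accum = identity(first);       /* seed with a real index instead of 0 */
        for (int64_t i = first; i < 3; ++i) {
            MinAndArgMin next = {in[i], i};
            reduce(next, &accum);
        }
        printf("min=%g argmin=%lld\n", accum.min, (long long)accum.argmin);
        return 0;
    }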
@@ -102,7 +105,7 @@ void cumo_reduce(cumo_na_reduction_arg_t arg, ReductionImpl&& impl) {

  int64_t block_size = cumo_detail::max_block_size;
  int64_t grid_size = std::min(cumo_detail::max_grid_size, out_block_num);
- int64_t shared_mem_size = sizeof(decltype(impl.Identity())) * block_size;
+ int64_t shared_mem_size = sizeof(decltype(impl.Identity(0))) * block_size;

  cumo_detail::reduction_kernel<TypeIn,TypeOut,ReductionImpl><<<grid_size, block_size, shared_mem_size>>>(arg, out_block_size, reduce_block_size, impl);
  }
@@ -1,4 +1,7 @@
  #include <ruby.h>
+ #include "cumo.h"
+ #include "cumo/cuda/memory_pool.h"
+ #include "cumo/cuda/runtime.h"
  #include "cumo/narray.h"
  #include "cumo/template.h"

@@ -56,7 +59,8 @@ iter_copy_bytes(cumo_na_loop_t *const lp)
  {
  size_t e;
  e = lp->args[0].elmsz;
- // TODO(sonots): CUDA kernelize
+ CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("iter_copy_bytes", "any");
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
  LOOP_UNARY_PTR(lp,m_memcpy);
  }

@@ -99,6 +103,8 @@ iter_swap_byte(cumo_na_loop_t *const lp)
  e = lp->args[0].elmsz;
  b1 = ALLOCA_N(char, e);
  b2 = ALLOCA_N(char, e);
+ CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("iter_swap_bytes", "any");
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
  LOOP_UNARY_PTR(lp,m_swap_byte);
  }

@@ -489,10 +495,12 @@ cumo_na_flatten_dim(VALUE self, int sd)
  for (i=0; i<sd; i++) {
  if (CUMO_SDX_IS_INDEX(na1->stridx[i])) {
  idx1 = CUMO_SDX_GET_INDEX(na1->stridx[i]);
- idx2 = ALLOC_N(size_t, shape[i]);
- for (j=0; j<shape[i]; j++) {
- idx2[j] = idx1[j];
- }
+ // idx2 = ALLOC_N(size_t, shape[i]);
+ // for (j=0; j<shape[i]; j++) {
+ // idx2[j] = idx1[j];
+ // }
+ idx2 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*shape[i]);
+ cumo_cuda_runtime_check_status(cudaMemcpyAsync(idx2,idx1,sizeof(size_t)*shape[i],cudaMemcpyDeviceToDevice,0));
  CUMO_SDX_SET_INDEX(na2->stridx[i],idx2);
  } else {
  na2->stridx[i] = na1->stridx[i];
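Here and in several later hunks, the host-side ALLOC_N plus element-copy loop is replaced by a device allocation from cumo's memory pool and an asynchronous device-to-device copy on the default stream, because index arrays now live on the GPU. A minimal sketch of the same pattern; plain cudaMalloc stands in for cumo_cuda_runtime_malloc and error handling is omitted:

    #include <cuda_runtime.h>
    #include <stddef.h>

    /* Duplicate an index array that already resides on the device,
     * without synchronizing or staging through host memory. */
    static size_t* dup_device_index(const size_t* idx1, size_t n)
    {
        size_t* idx2 = NULL;
        cudaMalloc((void**)&idx2, sizeof(size_t) * n);     /* cumo uses its pooled allocator here */
        cudaMemcpyAsync(idx2, idx1, sizeof(size_t) * n,
                        cudaMemcpyDeviceToDevice, 0);      /* stream 0 keeps ordering with later kernels */
        return idx2;
    }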
@@ -505,7 +513,8 @@ cumo_na_flatten_dim(VALUE self, int sd)
  na2->stridx[sd] = na1->stridx[nd-1];
  } else {
  // set index
- idx2 = ALLOC_N(size_t, shape[sd]);
+ // idx2 = ALLOC_N(size_t, shape[sd]);
+ idx2 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*shape[sd]);
  CUMO_SDX_SET_INDEX(na2->stridx[sd],idx2);
  // init for md-loop
  fd = nd-sd;
@@ -514,6 +523,8 @@ cumo_na_flatten_dim(VALUE self, int sd)
  pos = ALLOC_N(size_t, fd+1);
  pos[0] = 0;
  // md-loop
+ CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_flatten_dim", "any");
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
  for (i=j=0;;) {
  for (; i<fd; i++) {
  sdx = na1->stridx[i+sd];
@@ -726,10 +737,12 @@ cumo_na_diagonal(int argc, VALUE *argv, VALUE self)
  if (i != ax[0] && i != ax[1]) {
  if (CUMO_SDX_IS_INDEX(na1->stridx[i])) {
  idx0 = CUMO_SDX_GET_INDEX(na1->stridx[i]);
- idx1 = ALLOC_N(size_t, na->shape[i]);
- for (j=0; j<na->shape[i]; j++) {
- idx1[j] = idx0[j];
- }
+ // idx1 = ALLOC_N(size_t, na->shape[i]);
+ // for (j=0; j<na->shape[i]; j++) {
+ // idx1[j] = idx0[j];
+ // }
+ idx1 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*na->shape[i]);
+ cumo_cuda_runtime_check_status(cudaMemcpyAsync(idx1,idx0,sizeof(size_t)*na->shape[i],cudaMemcpyDeviceToDevice,0));
  CUMO_SDX_SET_INDEX(na2->stridx[k],idx1);
  } else {
  na2->stridx[k] = na1->stridx[i];
@@ -739,7 +752,12 @@ cumo_na_diagonal(int argc, VALUE *argv, VALUE self)
  }
  if (CUMO_SDX_IS_INDEX(na1->stridx[ax[0]])) {
  idx0 = CUMO_SDX_GET_INDEX(na1->stridx[ax[0]]);
- diag_idx = ALLOC_N(size_t, diag_size);
+ // diag_idx = ALLOC_N(size_t, diag_size);
+ diag_idx = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*diag_size);
+
+ CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_diagonal", "any");
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
  if (CUMO_SDX_IS_INDEX(na1->stridx[ax[1]])) {
  idx1 = CUMO_SDX_GET_INDEX(na1->stridx[ax[1]]);
  for (j=0; j<diag_size; j++) {
@@ -756,7 +774,12 @@ cumo_na_diagonal(int argc, VALUE *argv, VALUE self)
  stride0 = CUMO_SDX_GET_STRIDE(na1->stridx[ax[0]]);
  if (CUMO_SDX_IS_INDEX(na1->stridx[ax[1]])) {
  idx1 = CUMO_SDX_GET_INDEX(na1->stridx[ax[1]]);
- diag_idx = ALLOC_N(size_t, diag_size);
+ // diag_idx = ALLOC_N(size_t, diag_size);
+ diag_idx = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*diag_size);
+
+ CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_diagonal", "any");
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
  for (j=0; j<diag_size; j++) {
  diag_idx[j] = stride0*(j+k0) + idx1[j+k1];
  }
@@ -77,7 +77,12 @@ static VALUE
  <% else %>
  reduce = cumo_na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
  <% end %>
- v = cumo_na_ndloop(&ndf, 2, self, reduce);
+ if (cumo_na_has_idx_p(self)) {
+ VALUE copy = cumo_na_copy(self); // reduction does not support idx, make contiguous
+ v = cumo_na_ndloop(&ndf, 2, copy, reduce);
+ } else {
+ v = cumo_na_ndloop(&ndf, 2, self, reduce);
+ }
  <% if result_class == "cT" %>
  return <%=type_name%>_extract(v);
  <% else %>
@@ -113,7 +113,12 @@ static VALUE
  <% end %>
  }

- return cumo_na_ndloop(&ndf, 2, self, reduce);
+ if (cumo_na_has_idx_p(self)) {
+ VALUE copy = cumo_na_copy(self); // reduction does not support idx, make conttiguous
+ return cumo_na_ndloop(&ndf, 2, copy, reduce);
+ } else {
+ return cumo_na_ndloop(&ndf, 2, self, reduce);
+ }
  }
  <% end %>
  }
@@ -17,7 +17,7 @@ struct cumo_<%=type_name%>_min_index_int<%=i%>_impl {
  dtype min;
  idx_t argmin;
  };
- __device__ MinAndArgMin Identity() { return {DATA_MAX, 0}; }
+ __device__ MinAndArgMin Identity(idx_t index) { return {DATA_MAX, index}; }
  __device__ MinAndArgMin MapIn(dtype in, idx_t index) { return {in, index}; }
  __device__ void Reduce(MinAndArgMin next, MinAndArgMin& accum) {
  if (accum.min > next.min) {
@@ -32,7 +32,7 @@ struct cumo_<%=type_name%>_max_index_int<%=i%>_impl {
  dtype max;
  idx_t argmax;
  };
- __device__ MaxAndArgMax Identity() { return {DATA_MIN, 0}; }
+ __device__ MaxAndArgMax Identity(idx_t index) { return {DATA_MIN, index}; }
  __device__ MaxAndArgMax MapIn(dtype in, idx_t index) { return {in, index}; }
  __device__ void Reduce(MaxAndArgMax next, MaxAndArgMax& accum) {
  if (accum.max < next.max) {
@@ -6,14 +6,14 @@
  #endif

  struct cumo_<%=type_name%>_sum_impl {
- __device__ <%=dtype%> Identity() { return m_zero; }
+ __device__ <%=dtype%> Identity(int64_t /*index*/) { return m_zero; }
  __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
  __device__ void Reduce(dtype next, <%=dtype%>& accum) { accum = m_add(next, accum); }
  __device__ <%=dtype%> MapOut(<%=dtype%> accum) { return accum; }
  };

  struct cumo_<%=type_name%>_prod_impl {
- __device__ <%=dtype%> Identity() { return m_one; }
+ __device__ <%=dtype%> Identity(int64_t /*index*/) { return m_one; }
  __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
  __device__ void Reduce(dtype next, <%=dtype%>& accum) { accum = m_mul(next, accum); }
  __device__ <%=dtype%> MapOut(<%=dtype%> accum) { return accum; }
@@ -6,28 +6,28 @@
  #endif

  struct cumo_<%=type_name%>_sum_impl {
- __device__ <%=dtype%> Identity() { return m_zero; }
+ __device__ <%=dtype%> Identity(int64_t /*index*/) { return m_zero; }
  __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
  __device__ void Reduce(dtype next, <%=dtype%>& accum) { accum += next; }
  __device__ <%=dtype%> MapOut(<%=dtype%> accum) { return accum; }
  };

  struct cumo_<%=type_name%>_prod_impl {
- __device__ <%=dtype%> Identity() { return m_one; }
+ __device__ <%=dtype%> Identity(int64_t /*index*/) { return m_one; }
  __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
  __device__ void Reduce(dtype next, <%=dtype%>& accum) { accum *= next; }
  __device__ <%=dtype%> MapOut(<%=dtype%> accum) { return accum; }
  };

  struct cumo_<%=type_name%>_min_impl {
- __device__ dtype Identity() { return DATA_MAX; }
+ __device__ dtype Identity(int64_t /*index*/) { return DATA_MAX; }
  __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
  __device__ void Reduce(dtype next, dtype& accum) { accum = next < accum ? next : accum; }
  __device__ dtype MapOut(dtype accum) { return accum; }
  };

  struct cumo_<%=type_name%>_max_impl {
- __device__ dtype Identity() { return DATA_MIN; }
+ __device__ dtype Identity(int64_t /*index*/) { return DATA_MIN; }
  __device__ dtype MapIn(dtype in, int64_t /*index*/) { return in; }
  __device__ void Reduce(dtype next, dtype& accum) { accum = next < accum ? accum : next; }
  __device__ dtype MapOut(dtype accum) { return accum; }
@@ -3,6 +3,7 @@
  #include "cumo.h"
  #include "cumo/narray.h"
  #include "cumo/cuda/runtime.h"
+ #include "cumo/cuda/memory_pool.h"
  #include "cumo/template.h"

  #if SIZEOF_VOIDP == 8
@@ -52,7 +53,8 @@ print_index_arg(cumo_na_index_arg_t *q, int n)
  printf(" q[%d].n=%"SZF"d\n",i,q[i].n);
  printf(" q[%d].beg=%"SZF"d\n",i,q[i].beg);
  printf(" q[%d].step=%"SZF"d\n",i,q[i].step);
- printf(" q[%d].idx=0x%"SZF"x\n",i,(size_t)q[i].idx);
+ printf(" q[%d].idx=0x%"SZF"x (cuda:%d)\n",i,(size_t)q[i].idx, cumo_cuda_runtime_is_device_memory(q[i].idx));
+ // printf(" q[%d].idx=0x%"SZF"x\n",i,(size_t)q[i].idx);
  printf(" q[%d].reduce=0x%x\n",i,q[i].reduce);
  printf(" q[%d].orig_dim=%d\n",i,q[i].orig_dim);
  }
@@ -121,15 +123,38 @@ cumo_na_range_check(ssize_t pos, ssize_t size, int dim)
  return idx;
  }

+ static void CUDART_CB
+ cumo_na_parse_array_callback(cudaStream_t stream, cudaError_t status, void *data)
+ {
+ cudaFreeHost(data);
+ }
+
+ // copy ruby array to idx
  static void
  cumo_na_parse_array(VALUE ary, int orig_dim, ssize_t size, cumo_na_index_arg_t *q)
  {
  int k;
+ size_t* idx;
+ cudaError_t status;
  int n = RARRAY_LEN(ary);
- q->idx = ALLOC_N(size_t, n);
+ //q->idx = ALLOC_N(size_t, n);
+ //for (k=0; k<n; k++) {
+ // q->idx[k] = na_range_check(NUM2SSIZET(RARRAY_AREF(ary,k)), size, orig_dim);
+ //}
+ // make a contiguous pinned memory on host => copy to device => release pinned memory after copy finished on callback
+ q->idx = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*n);
+ cudaHostAlloc((void**)&idx, sizeof(size_t)*n, cudaHostAllocDefault);
  for (k=0; k<n; k++) {
- q->idx[k] = cumo_na_range_check(NUM2SSIZET(RARRAY_AREF(ary,k)), size, orig_dim);
+ idx[k] = cumo_na_range_check(NUM2SSIZET(RARRAY_AREF(ary,k)), size, orig_dim);
+ }
+ status = cudaMemcpyAsync(q->idx,idx,sizeof(size_t)*n,cudaMemcpyHostToDevice,0);
+ if (status == 0) {
+ cumo_cuda_runtime_check_status(cudaStreamAddCallback(0,cumo_na_parse_array_callback,idx,0));
+ } else {
+ cudaFreeHost(idx);
  }
+ cumo_cuda_runtime_check_status(status);
+
  q->n = n;
  q->beg = 0;
  q->step = 1;
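cumo_na_parse_array now stages the Ruby indices in pinned host memory, issues an asynchronous host-to-device copy, and frees the pinned buffer from a stream callback once the copy has completed, so the host never blocks on the transfer. A standalone sketch of that pattern; upload_indices_async is illustrative and plain cudaMalloc stands in for cumo_cuda_runtime_malloc:

    #include <cuda_runtime.h>
    #include <stddef.h>

    static void CUDART_CB free_pinned_cb(cudaStream_t stream, cudaError_t status, void* data)
    {
        cudaFreeHost(data);                       /* release the staging buffer after the copy finished */
    }

    /* Upload n indices to a fresh device buffer without blocking the host. */
    static size_t* upload_indices_async(const size_t* src, size_t n)
    {
        size_t *d_idx = NULL, *h_idx = NULL;
        cudaMalloc((void**)&d_idx, sizeof(size_t) * n);
        cudaHostAlloc((void**)&h_idx, sizeof(size_t) * n, cudaHostAllocDefault);
        for (size_t k = 0; k < n; ++k) {
            h_idx[k] = src[k];                    /* range checking omitted in this sketch */
        }
        if (cudaMemcpyAsync(d_idx, h_idx, sizeof(size_t) * n,
                            cudaMemcpyHostToDevice, 0) == cudaSuccess) {
            cudaStreamAddCallback(0, free_pinned_cb, h_idx, 0);
        } else {
            cudaFreeHost(h_idx);
        }
        return d_idx;
    }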
@@ -137,13 +162,14 @@ cumo_na_parse_array(VALUE ary, int orig_dim, ssize_t size, cumo_na_index_arg_t *
  q->orig_dim = orig_dim;
  }

+ // copy narray to idx
  static void
  cumo_na_parse_narray_index(VALUE a, int orig_dim, ssize_t size, cumo_na_index_arg_t *q)
  {
  VALUE idx;
  cumo_narray_t *na;
  cumo_narray_data_t *nidx;
- size_t k, n;
+ size_t n;
  ssize_t *nidxp;

  CumoGetNArray(a,na);
@@ -155,16 +181,14 @@ cumo_na_parse_narray_index(VALUE a, int orig_dim, ssize_t size, cumo_na_index_ar
  cumo_na_store(idx,a);

  CumoGetNArrayData(idx,nidx);
- nidxp = (ssize_t*)nidx->ptr;
- q->idx = ALLOC_N(size_t, n);
-
- // ndixp is cuda memory (cuda narray)
- CUMO_SHOW_SYNCHRONIZE_WARNING_ONCE("cumo_na_parse_narray_index", "any");
- cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+ nidxp = (ssize_t*)nidx->ptr; // Cumo::NArray data resides on GPU
+ //q->idx = ALLOC_N(size_t, n);
+ //for (k=0; k<n; k++) {
+ // q->idx[k] = na_range_check(nidxp[k], size, orig_dim);
+ //}
+ q->idx = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*n);
+ cumo_cuda_runtime_check_status(cudaMemcpyAsync(q->idx,nidxp,sizeof(size_t)*n,cudaMemcpyDeviceToDevice,0));

- for (k=0; k<n; k++) {
- q->idx[k] = cumo_na_range_check(nidxp[k], size, orig_dim);
- }
  q->n = n;
  q->beg = 0;
  q->step = 1;
@@ -401,6 +425,9 @@ cumo_na_index_aref_nadata(cumo_narray_data_t *na1, cumo_narray_view_t *na2,

  // array index
  if (q[i].idx != NULL) {
+ CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_nadata", "any");
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
  index = q[i].idx;
  CUMO_SDX_SET_INDEX(na2->stridx[j],index);
  q[i].idx = NULL;
@@ -456,6 +483,10 @@ cumo_na_index_aref_naview(cumo_narray_view_t *na1, cumo_narray_view_t *na2,
  // index <- index
  int k;
  size_t *index = q[i].idx;
+
+ CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_naview", "any");
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
  CUMO_SDX_SET_INDEX(na2->stridx[j], index);
  q[i].idx = NULL;

@@ -467,6 +498,10 @@ cumo_na_index_aref_naview(cumo_narray_view_t *na1, cumo_narray_view_t *na2,
  // index <- step
  ssize_t stride1 = CUMO_SDX_GET_STRIDE(sdx1);
  size_t *index = q[i].idx;
+
+ CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_naview", "any");
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
  CUMO_SDX_SET_INDEX(na2->stridx[j],index);
  q[i].idx = NULL;

@@ -494,8 +529,13 @@ cumo_na_index_aref_naview(cumo_narray_view_t *na1, cumo_narray_view_t *na2,
  int k;
  size_t beg = q[i].beg;
  ssize_t step = q[i].step;
- size_t *index = ALLOC_N(size_t, size);
+ // size_t *index = ALLOC_N(size_t, size);
+ size_t *index = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*size);
  CUMO_SDX_SET_INDEX(na2->stridx[j],index);
+
+ CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_naview", "any");
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
+
  for (k=0; k<size; k++) {
  index[k] = CUMO_SDX_GET_INDEX(sdx1)[beg+step*k];
  }
@@ -515,7 +555,6 @@ cumo_na_index_aref_naview(cumo_narray_view_t *na1, cumo_narray_view_t *na2,
  na2->base.size = total;
  }

-
  static int
  cumo_na_ndim_new_narray(int ndim, const cumo_na_index_arg_t *q)
  {
@@ -625,7 +664,7 @@ cumo_na_aref_md_ensure(VALUE data_value)
  cumo_na_aref_md_data_t *data = (cumo_na_aref_md_data_t*)(data_value);
  int i;
  for (i=0; i<data->ndim; i++) {
- xfree(data->q[i].idx);
+ cumo_cuda_runtime_free((char*)(data->q[i].idx));
  }
  if (data->q) xfree(data->q);
  return Qnil;
@@ -168,12 +168,8 @@ cumo_na_view_free(void* ptr)
  if (na->stridx != NULL) {
  for (i=0; i<na->base.ndim; i++) {
  if (CUMO_SDX_IS_INDEX(na->stridx[i])) {
- void *p = CUMO_SDX_GET_INDEX(na->stridx[i]);
- if (cumo_cuda_runtime_is_device_memory(p)) {
- cumo_cuda_runtime_free(p);
- } else {
- xfree(p);
- }
+ void *idx = CUMO_SDX_GET_INDEX(na->stridx[i]);
+ cumo_cuda_runtime_free(idx);
  }
  }
  xfree(na->stridx);
@@ -880,7 +876,6 @@ VALUE
  cumo_na_make_view(VALUE self)
  {
  int i, nd;
- size_t j;
  size_t *idx1, *idx2;
  ssize_t stride;
  cumo_narray_t *na;
@@ -914,10 +909,12 @@ cumo_na_make_view(VALUE self)
  for (i=0; i<nd; i++) {
  if (CUMO_SDX_IS_INDEX(na1->stridx[i])) {
  idx1 = CUMO_SDX_GET_INDEX(na1->stridx[i]);
- idx2 = ALLOC_N(size_t,na1->base.shape[i]);
- for (j=0; j<na1->base.shape[i]; j++) {
- idx2[j] = idx1[j];
- }
+ // idx2 = ALLOC_N(size_t,na1->base.shape[i]);
+ // for (j=0; j<na1->base.shape[i]; j++) {
+ // idx2[j] = idx1[j];
+ // }
+ idx2 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*na1->base.shape[i]);
+ cumo_cuda_runtime_check_status(cudaMemcpyAsync(idx2,idx1,sizeof(size_t)*na1->base.shape[i],cudaMemcpyDeviceToDevice,0));
  CUMO_SDX_SET_INDEX(na2->stridx[i],idx2);
  } else {
  na2->stridx[i] = na1->stridx[i];
@@ -947,8 +944,8 @@ static VALUE
  cumo_na_expand_dims(VALUE self, VALUE vdim)
  {
  int i, j, nd, dim;
- size_t *shape, *cumo_na_shape;
- cumo_stridx_t *stridx, *cumo_na_stridx;
+ size_t *shape, *na2_shape;
+ cumo_stridx_t *stridx, *na2_stridx;
  cumo_narray_t *na;
  cumo_narray_view_t *na2;
  VALUE view;
@@ -970,25 +967,25 @@ cumo_na_expand_dims(VALUE self, VALUE vdim)

  shape = ALLOC_N(size_t,nd+1);
  stridx = ALLOC_N(cumo_stridx_t,nd+1);
- cumo_na_shape = na2->base.shape;
- cumo_na_stridx = na2->stridx;
+ na2_shape = na2->base.shape;
+ na2_stridx = na2->stridx;

  for (i=j=0; i<=nd; i++) {
  if (i==dim) {
  shape[i] = 1;
  CUMO_SDX_SET_STRIDE(stridx[i],0);
  } else {
- shape[i] = cumo_na_shape[j];
- stridx[i] = cumo_na_stridx[j];
+ shape[i] = na2_shape[j];
+ stridx[i] = na2_stridx[j];
  j++;
  }
  }

  na2->stridx = stridx;
- xfree(cumo_na_stridx);
+ xfree(na2_stridx);
  na2->base.shape = shape;
- if (cumo_na_shape != &(na2->base.size)) {
- xfree(cumo_na_shape);
+ if (na2_shape != &(na2->base.size)) {
+ xfree(na2_shape);
  }
  na2->base.ndim++;
  return view;
@@ -1054,15 +1051,25 @@ cumo_na_reverse(int argc, VALUE *argv, VALUE self)
  n = na1->base.shape[i];
  if (CUMO_SDX_IS_INDEX(na1->stridx[i])) {
  idx1 = CUMO_SDX_GET_INDEX(na1->stridx[i]);
- idx2 = ALLOC_N(size_t,n);
+ // idx2 = ALLOC_N(size_t,n);
+ // if (cumo_na_test_reduce(reduce,i)) {
+ // for (j=0; j<n; j++) {
+ // idx2[n-1-j] = idx1[j];
+ // }
+ // } else {
+ // for (j=0; j<n; j++) {
+ // idx2[j] = idx1[j];
+ // }
+ // }
+ idx2 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*n);
  if (cumo_na_test_reduce(reduce,i)) {
+ CUMO_SHOW_SYNCHRONIZE_WARNING_ONCE("cumo_na_reverse", "any");
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
  for (j=0; j<n; j++) {
  idx2[n-1-j] = idx1[j];
  }
  } else {
- for (j=0; j<n; j++) {
- idx2[j] = idx1[j];
- }
+ cumo_cuda_runtime_check_status(cudaMemcpyAsync(idx2,idx1,sizeof(size_t)*n,cudaMemcpyDeviceToDevice,0));
  }
  CUMO_SDX_SET_INDEX(na2->stridx[i],idx2);
  } else {
@@ -164,7 +164,8 @@ print_ndloop(cumo_na_md_loop_t *lp) {
  printf(" &user.args[%d].iter[%d] = 0x%"SZF"x\n", j,i, (size_t)&lp->user.args[j].iter[i]);
  printf(" user.args[%d].iter[%d].pos = %"SZF"u\n", j,i, lp->user.args[j].iter[i].pos);
  printf(" user.args[%d].iter[%d].step = %"SZF"u\n", j,i, lp->user.args[j].iter[i].step);
- printf(" user.args[%d].iter[%d].idx = 0x%"SZF"x\n", j,i, (size_t)lp->user.args[j].iter[i].idx);
+ printf(" user.args[%d].iter[%d].idx = 0x%"SZF"x (cuda:%d)\n", j,i, (size_t)lp->user.args[j].iter[i].idx, cumo_cuda_runtime_is_device_memory(lp->user.args[j].iter[i].idx));
+ // printf(" user.args[%d].iter[%d].idx = 0x%"SZF"x\n", j,i, (size_t)lp->user.args[j].iter[i].idx);
  }
  }
  //
@@ -174,7 +175,8 @@ print_ndloop(cumo_na_md_loop_t *lp) {
  printf(" &xargs[%d].iter[%d] = 0x%"SZF"x\n", j,i, (size_t)&LITER(lp,i,j));
  printf(" xargs[%d].iter[%d].pos = %"SZF"u\n", j,i, LITER(lp,i,j).pos);
  printf(" xargs[%d].iter[%d].step = %"SZF"u\n", j,i, LITER(lp,i,j).step);
- printf(" xargs[%d].iter[%d].idx = 0x%"SZF"x\n", j,i, (size_t)LITER(lp,i,j).idx);
+ printf(" xargs[%d].iter[%d].idx = 0x%"SZF"x (cuda:%d)\n", j,i, (size_t)LITER(lp,i,j).idx, cumo_cuda_runtime_is_device_memory(LITER(lp,i,j).idx));
+ // printf(" xargs[%d].iter[%d].idx = 0x%"SZF"x\n", j,i, (size_t)LITER(lp,i,j).idx);
  }
  printf(" xargs[%d].bufcp = 0x%"SZF"x\n", j, (size_t)lp->xargs[j].bufcp);
  if (lp->xargs[j].bufcp) {
@@ -1489,6 +1491,8 @@ loop_narray(cumo_ndfunc_t *nf, cumo_na_md_loop_t *lp)
  // j-th argument
  for (j=0; j<lp->narg; j++) {
  if (LITER(lp,i,j).idx) {
+ CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("loop_narrayx", "any");
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
  LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).idx[c[i]];
  } else {
  LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).step*c[i];
@@ -76,7 +76,7 @@ void cumo_na_copy_array_structure(VALUE self, VALUE view);
  static VALUE
  cumo_na_make_view_struct(VALUE self, VALUE dtype, VALUE offset)
  {
- size_t i, n;
+ size_t n;
  int j, k, ndim;
  size_t *shape;
  size_t *idx1, *idx2;
@@ -147,10 +147,12 @@ cumo_na_make_view_struct(VALUE self, VALUE dtype, VALUE offset)
  if (CUMO_SDX_IS_INDEX(na1->stridx[j])) {
  n = na1->base.shape[j];
  idx1 = CUMO_SDX_GET_INDEX(na1->stridx[j]);
- idx2 = ALLOC_N(size_t, na1->base.shape[j]);
- for (i=0; i<n; i++) {
- idx2[i] = idx1[i];
- }
+ // idx2 = ALLOC_N(size_t, na1->base.shape[j]);
+ // for (i=0; i<n; i++) {
+ // idx2[i] = idx1[i];
+ // }
+ idx2 = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*n);
+ cumo_cuda_runtime_check_status(cudaMemcpyAsync(idx2,idx1,sizeof(size_t)*n,cudaMemcpyDeviceToDevice,0));
  CUMO_SDX_SET_INDEX(na2->stridx[j],idx2);
  } else {
  na2->stridx[j] = na1->stridx[j];
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: cumo
  version: !ruby/object:Gem::Version
- version: 0.1.2
+ version: 0.2.0
  platform: ruby
  authors:
  - Naotoshi Seo
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2018-11-11 00:00:00.000000000 Z
+ date: 2018-11-12 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: numo-narray