cumo 0.4.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/.rubocop.yml +15 -0
  4. data/.rubocop_todo.yml +1272 -0
  5. data/3rd_party/mkmf-cu/Gemfile +2 -0
  6. data/3rd_party/mkmf-cu/Rakefile +2 -1
  7. data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +2 -0
  8. data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +36 -7
  9. data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +51 -45
  10. data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +2 -0
  11. data/3rd_party/mkmf-cu/mkmf-cu.gemspec +3 -1
  12. data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +5 -3
  13. data/CHANGELOG.md +69 -0
  14. data/Gemfile +6 -1
  15. data/README.md +2 -10
  16. data/Rakefile +8 -11
  17. data/bench/broadcast_fp32.rb +28 -26
  18. data/bench/cumo_bench.rb +18 -16
  19. data/bench/numo_bench.rb +18 -16
  20. data/bench/reduction_fp32.rb +14 -12
  21. data/bin/console +1 -0
  22. data/cumo.gemspec +5 -8
  23. data/ext/cumo/cuda/cudnn.c +2 -2
  24. data/ext/cumo/cumo.c +7 -3
  25. data/ext/cumo/depend.erb +15 -13
  26. data/ext/cumo/extconf.rb +32 -46
  27. data/ext/cumo/include/cumo/cuda/cudnn.h +3 -1
  28. data/ext/cumo/include/cumo/intern.h +1 -0
  29. data/ext/cumo/include/cumo/narray.h +13 -1
  30. data/ext/cumo/include/cumo/template.h +2 -4
  31. data/ext/cumo/include/cumo/types/complex_macro.h +1 -1
  32. data/ext/cumo/include/cumo/types/float_macro.h +2 -2
  33. data/ext/cumo/include/cumo/types/xint_macro.h +3 -2
  34. data/ext/cumo/include/cumo.h +2 -2
  35. data/ext/cumo/narray/array.c +3 -3
  36. data/ext/cumo/narray/data.c +23 -2
  37. data/ext/cumo/narray/gen/cogen.rb +8 -7
  38. data/ext/cumo/narray/gen/cogen_kernel.rb +8 -7
  39. data/ext/cumo/narray/gen/def/bit.rb +3 -1
  40. data/ext/cumo/narray/gen/def/dcomplex.rb +2 -0
  41. data/ext/cumo/narray/gen/def/dfloat.rb +2 -0
  42. data/ext/cumo/narray/gen/def/int16.rb +2 -0
  43. data/ext/cumo/narray/gen/def/int32.rb +2 -0
  44. data/ext/cumo/narray/gen/def/int64.rb +2 -0
  45. data/ext/cumo/narray/gen/def/int8.rb +2 -0
  46. data/ext/cumo/narray/gen/def/robject.rb +2 -0
  47. data/ext/cumo/narray/gen/def/scomplex.rb +2 -0
  48. data/ext/cumo/narray/gen/def/sfloat.rb +2 -0
  49. data/ext/cumo/narray/gen/def/uint16.rb +2 -0
  50. data/ext/cumo/narray/gen/def/uint32.rb +2 -0
  51. data/ext/cumo/narray/gen/def/uint64.rb +2 -0
  52. data/ext/cumo/narray/gen/def/uint8.rb +2 -0
  53. data/ext/cumo/narray/gen/erbln.rb +9 -7
  54. data/ext/cumo/narray/gen/erbpp2.rb +26 -24
  55. data/ext/cumo/narray/gen/narray_def.rb +13 -11
  56. data/ext/cumo/narray/gen/spec.rb +58 -55
  57. data/ext/cumo/narray/gen/tmpl/alloc_func.c +1 -1
  58. data/ext/cumo/narray/gen/tmpl/at.c +34 -0
  59. data/ext/cumo/narray/gen/tmpl/batch_norm.c +1 -1
  60. data/ext/cumo/narray/gen/tmpl/batch_norm_backward.c +2 -2
  61. data/ext/cumo/narray/gen/tmpl/conv.c +1 -1
  62. data/ext/cumo/narray/gen/tmpl/conv_grad_w.c +3 -1
  63. data/ext/cumo/narray/gen/tmpl/conv_transpose.c +1 -1
  64. data/ext/cumo/narray/gen/tmpl/fixed_batch_norm.c +1 -1
  65. data/ext/cumo/narray/gen/tmpl/init_class.c +1 -0
  66. data/ext/cumo/narray/gen/tmpl/pooling_backward.c +1 -1
  67. data/ext/cumo/narray/gen/tmpl/pooling_forward.c +1 -1
  68. data/ext/cumo/narray/gen/tmpl/qsort.c +1 -5
  69. data/ext/cumo/narray/gen/tmpl/sort.c +1 -1
  70. data/ext/cumo/narray/gen/tmpl_bit/binary.c +42 -14
  71. data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +5 -0
  72. data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +5 -0
  73. data/ext/cumo/narray/gen/tmpl_bit/mask.c +27 -7
  74. data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +21 -7
  75. data/ext/cumo/narray/gen/tmpl_bit/unary.c +21 -7
  76. data/ext/cumo/narray/index.c +243 -39
  77. data/ext/cumo/narray/index_kernel.cu +84 -0
  78. data/ext/cumo/narray/narray.c +38 -1
  79. data/ext/cumo/narray/ndloop.c +1 -1
  80. data/ext/cumo/narray/struct.c +1 -1
  81. data/lib/cumo/cuda/compile_error.rb +1 -1
  82. data/lib/cumo/cuda/compiler.rb +23 -22
  83. data/lib/cumo/cuda/cudnn.rb +1 -1
  84. data/lib/cumo/cuda/device.rb +1 -1
  85. data/lib/cumo/cuda/link_state.rb +2 -2
  86. data/lib/cumo/cuda/module.rb +1 -2
  87. data/lib/cumo/cuda/nvrtc_program.rb +3 -2
  88. data/lib/cumo/cuda.rb +2 -0
  89. data/lib/cumo/linalg.rb +2 -0
  90. data/lib/cumo/narray/extra.rb +137 -185
  91. data/lib/cumo/narray.rb +2 -0
  92. data/lib/cumo.rb +3 -1
  93. data/test/bit_test.rb +157 -0
  94. data/test/cuda/compiler_test.rb +69 -0
  95. data/test/cuda/device_test.rb +30 -0
  96. data/test/cuda/memory_pool_test.rb +45 -0
  97. data/test/cuda/nvrtc_test.rb +51 -0
  98. data/test/cuda/runtime_test.rb +28 -0
  99. data/test/cudnn_test.rb +498 -0
  100. data/test/cumo_test.rb +27 -0
  101. data/test/narray_test.rb +745 -0
  102. data/test/ractor_test.rb +52 -0
  103. data/test/test_helper.rb +31 -0
  104. metadata +31 -54
  105. data/.travis.yml +0 -5
  106. data/numo-narray-version +0 -1
@@ -24,8 +24,7 @@ static void
24
24
  CUMO_STORE_BIT_STEP(a3, p3, s3, idx3, y);
25
25
  }
26
26
  } else {
27
- o1 = p1 % CUMO_NB;
28
- o1 -= p3;
27
+ o1 = p1-p3;
29
28
  l1 = CUMO_NB+o1;
30
29
  r1 = CUMO_NB-o1;
31
30
  if (p3>0 || n<CUMO_NB) {
@@ -48,17 +47,32 @@ static void
48
47
  }
49
48
  } else {
50
49
  for (; n>=CUMO_NB; n-=CUMO_NB) {
51
- x = *a1>>o1;
52
- if (o1<0) x |= *(a1-1)>>l1;
53
- if (o1>0) x |= *(a1+1)<<r1;
50
+ if (o1==0) {
51
+ x = *a1;
52
+ } else if (o1>0) {
53
+ x = *a1>>o1 | *(a1+1)<<r1;
54
+ } else {
55
+ x = *a1<<-o1 | *(a1-1)>>l1;
56
+ }
54
57
  a1++;
55
58
  y = m_<%=name%>(x);
56
59
  *(a3++) = y;
57
60
  }
58
61
  }
59
62
  if (n>0) {
60
- x = *a1>>o1;
61
- if (o1<0) x |= *(a1-1)>>l1;
63
+ if (o1==0) {
64
+ x = *a1;
65
+ } else if (o1>0) {
66
+ x = *a1>>o1;
67
+ if ((int)n>r1) {
68
+ x |= *(a1+1)<<r1;
69
+ }
70
+ } else {
71
+ x = *(a1-1)>>l1;
72
+ if ((int)n>-o1) {
73
+ x |= *a1<<-o1;
74
+ }
75
+ }
62
76
  y = m_<%=name%>(x);
63
77
  *a3 = (y & CUMO_SLB(n)) | (*a3 & CUMO_BALL<<n);
64
78
  }
@@ -183,7 +183,6 @@ static void
183
183
  cumo_na_parse_range(VALUE range, ssize_t step, int orig_dim, ssize_t size, cumo_na_index_arg_t *q)
184
184
  {
185
185
  int n;
186
- VALUE excl_end;
187
186
  ssize_t beg, end, beg_orig, end_orig;
188
187
  const char *dot = "..", *edot = "...";
189
188
 
@@ -197,10 +196,15 @@ cumo_na_parse_range(VALUE range, ssize_t step, int orig_dim, ssize_t size, cumo_
197
196
  beg += size;
198
197
  }
199
198
  if (T_NIL == TYPE(x.end)) { // endless range
200
- end = size -1;
199
+ end = size - 1;
201
200
  if (RTEST(x.exclude_end)) {
202
201
  dot = edot;
203
202
  }
203
+ if (beg < 0 || beg >= size) {
204
+ rb_raise(rb_eRangeError,
205
+ "%"SZF"d%s is out of range for size=%"SZF"d",
206
+ beg_orig, dot, size);
207
+ }
204
208
  } else {
205
209
  end = end_orig = NUM2SSIZET(x.end);
206
210
  if (end < 0) {
@@ -210,19 +214,15 @@ cumo_na_parse_range(VALUE range, ssize_t step, int orig_dim, ssize_t size, cumo_
210
214
  end--;
211
215
  dot = edot;
212
216
  }
213
- }
214
- if (beg < 0 || beg >= size || end < 0 || end >= size) {
215
- if (T_NIL == TYPE(x.end)) { // endless range
216
- rb_raise(rb_eRangeError,
217
- "%"SZF"d%s is out of range for size=%"SZF"d",
218
- beg_orig, dot, size);
219
- } else {
217
+ if (beg < 0 || beg >= size || end < 0 || end >= size) {
220
218
  rb_raise(rb_eRangeError,
221
219
  "%"SZF"d%s%"SZF"d is out of range for size=%"SZF"d",
222
220
  beg_orig, dot, end_orig, size);
223
221
  }
224
222
  }
225
223
  #else
224
+ VALUE excl_end;
225
+
226
226
  beg = beg_orig = NUM2SSIZET(rb_funcall(range,cumo_id_beg,0));
227
227
  if (beg < 0) {
228
228
  beg += size;
@@ -258,7 +258,7 @@ cumo_na_parse_enumerator_step(VALUE enum_obj, VALUE *pstep)
258
258
  if (!RB_TYPE_P(enum_obj, T_DATA)) {
259
259
  rb_raise(rb_eTypeError,"wrong argument type (not T_DATA)");
260
260
  }
261
- e = (cumo_enumerator_t *)DATA_PTR(enum_obj);
261
+ e = CUMO_RENUMERATOR_PTR(enum_obj);
262
262
 
263
263
  if (!rb_obj_is_kind_of(e->obj, rb_cRange)) {
264
264
  rb_raise(rb_eTypeError,"not Range object");
@@ -292,7 +292,7 @@ cumo_na_parse_enumerator(VALUE enum_obj, int orig_dim, ssize_t size, cumo_na_ind
292
292
  rb_raise(rb_eTypeError,"wrong argument type (not T_DATA)");
293
293
  }
294
294
  cumo_na_parse_enumerator_step(enum_obj, &step);
295
- e = (cumo_enumerator_t *)DATA_PTR(enum_obj);
295
+ e = CUMO_RENUMERATOR_PTR(enum_obj);
296
296
  cumo_na_parse_range(e->obj, NUM2SSIZET(step), orig_dim, size, q); // e->obj : Range Object
297
297
  }
298
298
 
@@ -568,6 +568,188 @@ cumo_na_index_aref_naview(cumo_narray_view_t *na1, cumo_narray_view_t *na2,
568
568
  na2->base.size = total;
569
569
  }
570
570
 
571
+ void cumo_na_index_at_nadata_index_stride_add_kernel_launch(size_t *idx, size_t *idx1, ssize_t s1, uint64_t n);
572
+ void cumo_na_index_at_nadata_index_beg_step_stride_kernel_launch(size_t *idx, size_t beg, ssize_t step, ssize_t s1, uint64_t n);
573
+ void cumo_na_index_at_nadata_index_beg_step_stride_add_kernel_launch(size_t *idx, size_t beg, ssize_t step, ssize_t s1, uint64_t n);
574
+
575
+ static void
576
+ cumo_na_index_at_nadata(cumo_narray_data_t *na1, cumo_narray_view_t *na2,
577
+ cumo_na_index_arg_t *q, ssize_t elmsz, int ndim, int keep_dim)
578
+ {
579
+ int i;
580
+ size_t size = q[ndim-1].n;
581
+ ssize_t stride1;
582
+ ssize_t *strides_na1;
583
+ size_t *index;
584
+ ssize_t beg, step;
585
+ int use_cumo_cuda_runtime_malloc = 0;
586
+
587
+ strides_na1 = ALLOCA_N(ssize_t, na1->base.ndim);
588
+ cumo_na_get_strides_nadata(na1, strides_na1, elmsz);
589
+
590
+ if (q[ndim-1].idx != NULL) {
591
+ index = q[ndim-1].idx;
592
+ } else {
593
+ //index = ALLOC_N(size_t, size);
594
+ index = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*size);
595
+ use_cumo_cuda_runtime_malloc = 1;
596
+ }
597
+ CUMO_SDX_SET_INDEX(na2->stridx[0], index);
598
+
599
+ for (i=ndim-1; i>=0; i--) {
600
+ stride1 = strides_na1[q[i].orig_dim];
601
+ if (i==ndim-1) {
602
+ if (size == 0) {
603
+ rb_raise(cumo_na_eShapeError, "cannot get element of empty array");
604
+ }
605
+ } else {
606
+ if (size != q[i].n) {
607
+ rb_raise(cumo_na_eShapeError, "index array sizes mismatch");
608
+ }
609
+ }
610
+
611
+ if (q[i].idx != NULL) {
612
+ if (i==ndim-1) {
613
+ cumo_na_index_aref_nadata_index_stride_kernel_launch(index, stride1, size);
614
+ q[i].idx = NULL;
615
+ } else {
616
+ cumo_na_index_at_nadata_index_stride_add_kernel_launch(index, q[i].idx, stride1, size);
617
+ }
618
+ } else {
619
+ beg = q[i].beg;
620
+ step = q[i].step;
621
+ if (i==ndim-1) {
622
+ cumo_na_index_at_nadata_index_beg_step_stride_kernel_launch(index, beg, step, stride1, size);
623
+ } else {
624
+ cumo_na_index_at_nadata_index_beg_step_stride_add_kernel_launch(index, beg, step, stride1, size);
625
+ }
626
+ }
627
+
628
+ }
629
+ na2->base.size = size;
630
+ na2->base.shape[0] = size;
631
+ if (use_cumo_cuda_runtime_malloc) {
632
+ CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("index", "cumo_na_index_at_nadata");
633
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
634
+ }
635
+ }
636
+
637
+ void cumo_na_index_at_naview_index_index_index_add_kernel_launch(size_t *idx, size_t *idx1, size_t *idx2, uint64_t n);
638
+ void cumo_na_index_at_naview_index_index_beg_step_add_kernel_launch(size_t *idx, size_t *idx1, size_t beg, ssize_t step, uint64_t n);
639
+ void cumo_na_index_at_naview_index_stride_last_add_kernel_launch(size_t *idx, ssize_t s1, size_t last, uint64_t n);
640
+
641
+ static void
642
+ cumo_na_index_at_naview(cumo_narray_view_t *na1, cumo_narray_view_t *na2,
643
+ cumo_na_index_arg_t *q, ssize_t elmsz, int ndim, int keep_dim)
644
+ {
645
+ int i;
646
+ size_t *index;
647
+ size_t size = q[ndim-1].n;
648
+ int use_cumo_cuda_runtime_malloc = 0;
649
+
650
+ if (q[ndim-1].idx != NULL) {
651
+ index = q[ndim-1].idx;
652
+ } else {
653
+ //index = ALLOC_N(size_t, size);
654
+ index = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*size);
655
+ use_cumo_cuda_runtime_malloc = 1;
656
+ }
657
+ CUMO_SDX_SET_INDEX(na2->stridx[0], index);
658
+
659
+ for (i=ndim-1; i>=0; i--) {
660
+ cumo_stridx_t sdx1 = na1->stridx[q[i].orig_dim];
661
+ if (i==ndim-1) {
662
+ if (size == 0) {
663
+ rb_raise(cumo_na_eShapeError, "cannot get element of empty array");
664
+ }
665
+ } else {
666
+ if (size != q[i].n) {
667
+ rb_raise(cumo_na_eShapeError, "index array sizes mismatch");
668
+ }
669
+ }
670
+
671
+ if (q[i].idx != NULL && CUMO_SDX_IS_INDEX(sdx1)) {
672
+ // index <- index
673
+ size_t *index1 = CUMO_SDX_GET_INDEX(sdx1);
674
+ if (i==ndim-1) {
675
+ cumo_na_index_aref_naview_index_index_kernel_launch(index, index1, size);
676
+ q[i].idx = NULL;
677
+ } else {
678
+ cumo_na_index_at_naview_index_index_index_add_kernel_launch(index, index1, q[i].idx, size);
679
+ }
680
+ }
681
+ else if (q[i].idx == NULL && CUMO_SDX_IS_INDEX(sdx1)) {
682
+ // step <- index
683
+ size_t beg = q[i].beg;
684
+ ssize_t step = q[i].step;
685
+ size_t *index1 = CUMO_SDX_GET_INDEX(sdx1);
686
+ if (i==ndim-1) {
687
+ cumo_na_index_aref_naview_index_index_beg_step_kernel_launch(index, index1, beg, step, size);
688
+ } else {
689
+ cumo_na_index_at_naview_index_index_beg_step_add_kernel_launch(index, index1, beg, step, size);
690
+ }
691
+ }
692
+ else if (q[i].idx != NULL && CUMO_SDX_IS_STRIDE(sdx1)) {
693
+ // index <- step
694
+ ssize_t stride1 = CUMO_SDX_GET_STRIDE(sdx1);
695
+ if (stride1<0) {
696
+ size_t last;
697
+ stride1 = -stride1;
698
+ last = na1->base.shape[q[i].orig_dim] - 1;
699
+ if (na2->offset < last * stride1) {
700
+ rb_raise(rb_eStandardError,"bug: negative offset");
701
+ }
702
+ na2->offset -= last * stride1;
703
+ if (i==ndim-1) {
704
+ cumo_na_index_aref_naview_index_stride_last_kernel_launch(index, stride1, last, size);
705
+ q[i].idx = NULL;
706
+ } else {
707
+ cumo_na_index_at_naview_index_stride_last_add_kernel_launch(index, stride1, last, size);
708
+ }
709
+ } else {
710
+ if (i==ndim-1) {
711
+ cumo_na_index_aref_nadata_index_stride_kernel_launch(index, stride1, size);
712
+ q[i].idx = NULL;
713
+ } else {
714
+ cumo_na_index_at_nadata_index_stride_add_kernel_launch(index, q[i].idx, stride1, size);
715
+ }
716
+ }
717
+ }
718
+ else if (q[i].idx == NULL && CUMO_SDX_IS_STRIDE(sdx1)) {
719
+ // step <- step
720
+ size_t beg = q[i].beg;
721
+ ssize_t step = q[i].step;
722
+ ssize_t stride1 = CUMO_SDX_GET_STRIDE(sdx1);
723
+ if (stride1<0) {
724
+ size_t last;
725
+ stride1 = -stride1;
726
+ last = na1->base.shape[q[i].orig_dim] - 1;
727
+ if (na2->offset < last * stride1) {
728
+ rb_raise(rb_eStandardError,"bug: negative offset");
729
+ }
730
+ na2->offset -= last * stride1;
731
+ if (i==ndim-1) {
732
+ cumo_na_index_at_nadata_index_beg_step_stride_kernel_launch(index, last - beg, -step, stride1, size);
733
+ } else {
734
+ cumo_na_index_at_nadata_index_beg_step_stride_add_kernel_launch(index, last - beg, -step, stride1, size);
735
+ }
736
+ } else {
737
+ if (i==ndim-1) {
738
+ cumo_na_index_at_nadata_index_beg_step_stride_kernel_launch(index, beg, step, stride1, size);
739
+ } else {
740
+ cumo_na_index_at_nadata_index_beg_step_stride_add_kernel_launch(index, beg, step, stride1, size);
741
+ }
742
+ }
743
+ }
744
+ }
745
+ na2->base.size = size;
746
+ na2->base.shape[0] = size;
747
+ if (use_cumo_cuda_runtime_malloc) {
748
+ CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("index", "cumo_na_index_at_naview");
749
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
750
+ }
751
+ }
752
+
571
753
  static int
572
754
  cumo_na_ndim_new_narray(int ndim, const cumo_na_index_arg_t *q)
573
755
  {
@@ -587,6 +769,7 @@ typedef struct {
587
769
  cumo_narray_t *na1;
588
770
  int keep_dim;
589
771
  size_t pos; // offset position for 0-dimensional narray. 0-dimensional array does not use q.
772
+ int at_mode; // 0: aref, 1: at
590
773
  } cumo_na_aref_md_data_t;
591
774
 
592
775
  static cumo_na_index_arg_t*
@@ -614,6 +797,7 @@ VALUE cumo_na_aref_md_protected(VALUE data_value)
614
797
  cumo_na_index_arg_t *q = data->q;
615
798
  cumo_narray_t *na1 = data->na1;
616
799
  int keep_dim = data->keep_dim;
800
+ int at_mode = data->at_mode;
617
801
 
618
802
  int ndim_new;
619
803
  VALUE view;
@@ -624,10 +808,14 @@ VALUE cumo_na_aref_md_protected(VALUE data_value)
624
808
 
625
809
  if (cumo_na_debug_flag) print_index_arg(q,ndim);
626
810
 
627
- if (keep_dim) {
628
- ndim_new = ndim;
811
+ if (at_mode) {
812
+ ndim_new = 1;
629
813
  } else {
630
- ndim_new = cumo_na_ndim_new_narray(ndim, q);
814
+ if (keep_dim) {
815
+ ndim_new = ndim;
816
+ } else {
817
+ ndim_new = cumo_na_ndim_new_narray(ndim, q);
818
+ }
631
819
  }
632
820
  view = cumo_na_s_allocate_view(rb_obj_class(self));
633
821
 
@@ -636,7 +824,7 @@ VALUE cumo_na_aref_md_protected(VALUE data_value)
636
824
 
637
825
  cumo_na_alloc_shape((cumo_narray_t*)na2, ndim_new);
638
826
 
639
- na2->stridx = ALLOC_N(cumo_stridx_t,ndim_new);
827
+ na2->stridx = ZALLOC_N(cumo_stridx_t,ndim_new);
640
828
 
641
829
  elmsz = cumo_na_element_stride(self);
642
830
 
@@ -647,7 +835,11 @@ VALUE cumo_na_aref_md_protected(VALUE data_value)
647
835
  na2->offset = data->pos;
648
836
  na2->base.size = 1;
649
837
  } else {
650
- cumo_na_index_aref_nadata((cumo_narray_data_t *)na1,na2,q,elmsz,ndim,keep_dim);
838
+ if (at_mode) {
839
+ cumo_na_index_at_nadata((cumo_narray_data_t *)na1,na2,q,elmsz,ndim,keep_dim);
840
+ } else {
841
+ cumo_na_index_aref_nadata((cumo_narray_data_t *)na1,na2,q,elmsz,ndim,keep_dim);
842
+ }
651
843
  }
652
844
  na2->data = self;
653
845
  break;
@@ -659,7 +851,11 @@ VALUE cumo_na_aref_md_protected(VALUE data_value)
659
851
  } else {
660
852
  na2->offset = ((cumo_narray_view_t *)na1)->offset;
661
853
  na2->data = ((cumo_narray_view_t *)na1)->data;
662
- cumo_na_index_aref_naview((cumo_narray_view_t *)na1,na2,q,elmsz,ndim,keep_dim);
854
+ if (at_mode) {
855
+ cumo_na_index_at_naview((cumo_narray_view_t *)na1,na2,q,elmsz,ndim,keep_dim);
856
+ } else {
857
+ cumo_na_index_aref_naview((cumo_narray_view_t *)na1,na2,q,elmsz,ndim,keep_dim);
858
+ }
663
859
  }
664
860
  break;
665
861
  }
@@ -684,7 +880,7 @@ cumo_na_aref_md_ensure(VALUE data_value)
684
880
  }
685
881
 
686
882
  static VALUE
687
- cumo_na_aref_md(int argc, VALUE *argv, VALUE self, int keep_dim, int result_nd, size_t pos)
883
+ cumo_na_aref_md(int argc, VALUE *argv, VALUE self, int keep_dim, int result_nd, size_t pos, int at_mode)
688
884
  {
689
885
  VALUE args; // should be GC protected
690
886
  cumo_narray_t *na1;
@@ -696,6 +892,9 @@ cumo_na_aref_md(int argc, VALUE *argv, VALUE self, int keep_dim, int result_nd,
696
892
  CumoGetNArray(self,na1);
697
893
 
698
894
  args = rb_ary_new4(argc,argv);
895
+ if (at_mode && na1->ndim == 0) {
896
+ rb_raise(cumo_na_eDimensionError,"argument length does not match dimension size");
897
+ }
699
898
 
700
899
  if (argc == 1 && result_nd == 1) {
701
900
  idx = argv[0];
@@ -724,6 +923,7 @@ cumo_na_aref_md(int argc, VALUE *argv, VALUE self, int keep_dim, int result_nd,
724
923
  data.q = cumo_na_allocate_index_args(result_nd);
725
924
  data.na1 = na1;
726
925
  data.keep_dim = keep_dim;
926
+ data.at_mode = at_mode;
727
927
 
728
928
  switch(na1->type) {
729
929
  case CUMO_NARRAY_DATA_T:
@@ -760,7 +960,15 @@ cumo_na_aref_main(int nidx, VALUE *idx, VALUE self, int keep_dim, int result_nd,
760
960
  return rb_funcall(*idx,cumo_id_mask,1,self);
761
961
  }
762
962
  }
763
- return cumo_na_aref_md(nidx, idx, self, keep_dim, result_nd, pos);
963
+ return cumo_na_aref_md(nidx, idx, self, keep_dim, result_nd, pos, 0);
964
+ }
965
+
966
+ /* method: at([idx1,idx2,...,idxN], [idx1,idx2,...,idxN]) */
967
+ VALUE
968
+ cumo_na_at_main(int nidx, VALUE *idx, VALUE self, int keep_dim, int result_nd, size_t pos)
969
+ {
970
+ cumo_na_index_arg_to_internal_order(nidx, idx, self);
971
+ return cumo_na_aref_md(nidx, idx, self, keep_dim, result_nd, pos, 1);
764
972
  }
765
973
 
766
974
 
@@ -782,16 +990,18 @@ check_index_count(int argc, int cumo_na_ndim, int count_new, int count_rest)
782
990
 
783
991
  switch(count_rest) {
784
992
  case 0:
785
- if (count_new == 0 && argc == 1) return 1;
993
+ if (argc == 1 && count_new == 0) return 1;
786
994
  if (argc == result_nd) return result_nd;
787
995
  rb_raise(rb_eIndexError,"# of index(=%i) should be "
788
- "equal to ndim(=%i)",argc,cumo_na_ndim);
996
+ "equal to ndim(=%i) or 1", argc,cumo_na_ndim);
789
997
  break;
790
998
  case 1:
791
999
  if (argc-1 <= result_nd) return result_nd;
792
1000
  rb_raise(rb_eIndexError,"# of index(=%i) > ndim(=%i) with :rest",
793
1001
  argc,cumo_na_ndim);
794
1002
  break;
1003
+ default:
1004
+ rb_raise(rb_eIndexError,"multiple rest-dimension is not allowd");
795
1005
  }
796
1006
  return -1;
797
1007
  }
@@ -802,7 +1012,6 @@ cumo_na_get_result_dimension(VALUE self, int argc, VALUE *argv, ssize_t stride,
802
1012
  int i, j;
803
1013
  int count_new=0;
804
1014
  int count_rest=0;
805
- int count_else=0;
806
1015
  ssize_t x, s, m, pos, *idx;
807
1016
  cumo_narray_t *na;
808
1017
  cumo_narray_view_t *nv;
@@ -811,8 +1020,7 @@ cumo_na_get_result_dimension(VALUE self, int argc, VALUE *argv, ssize_t stride,
811
1020
 
812
1021
  CumoGetNArray(self,na);
813
1022
  if (na->size == 0) {
814
- rb_raise(rb_eRuntimeError, "cannot get index of empty array");
815
- return -1;
1023
+ rb_raise(cumo_na_eShapeError, "cannot get element of empty array");
816
1024
  }
817
1025
  idx = ALLOCA_N(ssize_t, argc);
818
1026
  for (i=j=0; i<argc; i++) {
@@ -835,16 +1043,10 @@ cumo_na_get_result_dimension(VALUE self, int argc, VALUE *argv, ssize_t stride,
835
1043
  argv[i] = cumo_sym_new;
836
1044
  count_new++;
837
1045
  }
838
- // not break
839
- default:
840
- count_else++;
841
1046
  }
842
1047
  }
843
1048
 
844
- if (count_rest > 1) {
845
- rb_raise(rb_eIndexError,"multiple rest-dimension is not allowd");
846
- }
847
- if (count_else != 0) {
1049
+ if (j != argc) {
848
1050
  return check_index_count(argc, na->ndim, count_new, count_rest);
849
1051
  }
850
1052
 
@@ -865,8 +1067,9 @@ cumo_na_get_result_dimension(VALUE self, int argc, VALUE *argv, ssize_t stride,
865
1067
  }
866
1068
  }
867
1069
  *pos_idx = pos;
1070
+ return 0;
868
1071
  }
869
- else if (argc==1 && j==1) {
1072
+ if (j == 1) {
870
1073
  x = cumo_na_range_check(idx[0], na->size, 0);
871
1074
  for (i=na->ndim-1; i>=0; i--) {
872
1075
  s = na->shape[i];
@@ -882,19 +1085,19 @@ cumo_na_get_result_dimension(VALUE self, int argc, VALUE *argv, ssize_t stride,
882
1085
  }
883
1086
  }
884
1087
  *pos_idx = pos;
885
- } else {
886
- return check_index_count(argc, na->ndim, count_new, count_rest);
1088
+ return 0;
887
1089
  }
888
1090
  break;
889
1091
  default:
890
1092
  if (!stride) {
891
1093
  stride = cumo_na_element_stride(self);
892
1094
  }
893
- if (argc==1 && j==1) {
1095
+ if (j == 1) {
894
1096
  x = cumo_na_range_check(idx[0], na->size, 0);
895
1097
  *pos_idx = stride * x;
1098
+ return 0;
896
1099
  }
897
- else if (j == na->ndim) {
1100
+ if (j == na->ndim) {
898
1101
  pos = 0;
899
1102
  for (i=j-1; i>=0; i--) {
900
1103
  x = cumo_na_range_check(idx[i], na->shape[i], i);
@@ -902,11 +1105,12 @@ cumo_na_get_result_dimension(VALUE self, int argc, VALUE *argv, ssize_t stride,
902
1105
  stride *= na->shape[i];
903
1106
  }
904
1107
  *pos_idx = pos;
905
- } else {
906
- return check_index_count(argc, na->ndim, count_new, count_rest);
1108
+ return 0;
907
1109
  }
908
1110
  }
909
- return 0;
1111
+ rb_raise(rb_eIndexError,"# of index(=%i) should be "
1112
+ "equal to ndim(=%i) or 1", argc,na->ndim);
1113
+ return -1;
910
1114
  }
911
1115
 
912
1116
 
@@ -42,6 +42,48 @@ __global__ void cumo_na_index_aref_naview_index_index_beg_step_kernel(size_t *id
42
42
  }
43
43
  }
44
44
 
45
+ __global__ void cumo_na_index_at_nadata_index_beg_step_stride_kernel(size_t *idx, size_t beg, ssize_t step, ssize_t s1, uint64_t n)
46
+ {
47
+ for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
48
+ idx[i] = (beg + step * i) * s1;
49
+ }
50
+ }
51
+
52
+ __global__ void cumo_na_index_at_nadata_index_beg_step_stride_add_kernel(size_t *idx, size_t beg, ssize_t step, ssize_t s1, uint64_t n)
53
+ {
54
+ for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
55
+ idx[i] += (beg + step * i) * s1;
56
+ }
57
+ }
58
+
59
+ __global__ void cumo_na_index_at_nadata_index_stride_add_kernel(size_t *idx, size_t *idx1, ssize_t s1, uint64_t n)
60
+ {
61
+ for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
62
+ idx[i] += idx1[i] * s1;
63
+ }
64
+ }
65
+
66
+ __global__ void cumo_na_index_at_naview_index_index_index_add_kernel(size_t *idx, size_t *idx1, size_t *idx2, uint64_t n)
67
+ {
68
+ for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
69
+ idx[i] += idx1[idx2[i]];
70
+ }
71
+ }
72
+
73
+ __global__ void cumo_na_index_at_naview_index_index_beg_step_add_kernel(size_t *idx, size_t *idx1, size_t beg, ssize_t step, uint64_t n)
74
+ {
75
+ for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
76
+ idx[i] += idx1[beg + step * i];
77
+ }
78
+ }
79
+
80
+ __global__ void cumo_na_index_at_naview_index_stride_last_add_kernel(size_t *idx, ssize_t s1, size_t last, uint64_t n)
81
+ {
82
+ for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
83
+ idx[i] += (last - idx[i]) * s1;
84
+ }
85
+ }
86
+
45
87
  void cumo_na_index_aref_nadata_index_stride_kernel_launch(size_t *idx, ssize_t s1, uint64_t n)
46
88
  {
47
89
  size_t grid_dim = cumo_get_grid_dim(n);
@@ -77,6 +119,48 @@ void cumo_na_index_aref_naview_index_index_beg_step_kernel_launch(size_t *idx, s
77
119
  cumo_na_index_aref_naview_index_index_beg_step_kernel<<<grid_dim, block_dim>>>(idx, idx1, beg, step, n);
78
120
  }
79
121
 
122
+ void cumo_na_index_at_nadata_index_stride_add_kernel_launch(size_t *idx, size_t *idx1, ssize_t s1, uint64_t n)
123
+ {
124
+ size_t grid_dim = cumo_get_grid_dim(n);
125
+ size_t block_dim = cumo_get_block_dim(n);
126
+ cumo_na_index_at_nadata_index_stride_add_kernel<<<grid_dim, block_dim>>>(idx, idx1, s1, n);
127
+ }
128
+
129
+ void cumo_na_index_at_nadata_index_beg_step_stride_kernel_launch(size_t *idx, size_t beg, ssize_t step, ssize_t s1, uint64_t n)
130
+ {
131
+ size_t grid_dim = cumo_get_grid_dim(n);
132
+ size_t block_dim = cumo_get_block_dim(n);
133
+ cumo_na_index_at_nadata_index_beg_step_stride_kernel<<<grid_dim, block_dim>>>(idx, beg, step, s1, n);
134
+ }
135
+
136
+ void cumo_na_index_at_nadata_index_beg_step_stride_add_kernel_launch(size_t *idx, size_t beg, ssize_t step, ssize_t s1, uint64_t n)
137
+ {
138
+ size_t grid_dim = cumo_get_grid_dim(n);
139
+ size_t block_dim = cumo_get_block_dim(n);
140
+ cumo_na_index_at_nadata_index_beg_step_stride_add_kernel<<<grid_dim, block_dim>>>(idx, beg, step, s1, n);
141
+ }
142
+
143
+ void cumo_na_index_at_naview_index_index_index_add_kernel_launch(size_t *idx, size_t *idx1, size_t *idx2, uint64_t n)
144
+ {
145
+ size_t grid_dim = cumo_get_grid_dim(n);
146
+ size_t block_dim = cumo_get_block_dim(n);
147
+ cumo_na_index_at_naview_index_index_index_add_kernel<<<grid_dim, block_dim>>>(idx, idx1, idx2, n);
148
+ }
149
+
150
+ void cumo_na_index_at_naview_index_index_beg_step_add_kernel_launch(size_t *idx, size_t *idx1, size_t beg, ssize_t step, uint64_t n)
151
+ {
152
+ size_t grid_dim = cumo_get_grid_dim(n);
153
+ size_t block_dim = cumo_get_block_dim(n);
154
+ cumo_na_index_at_naview_index_index_beg_step_add_kernel<<<grid_dim, block_dim>>>(idx, idx1, beg, step, n);
155
+ }
156
+
157
+ void cumo_na_index_at_naview_index_stride_last_add_kernel_launch(size_t *idx, ssize_t s1, size_t last, uint64_t n)
158
+ {
159
+ size_t grid_dim = cumo_get_grid_dim(n);
160
+ size_t block_dim = cumo_get_block_dim(n);
161
+ cumo_na_index_at_naview_index_stride_last_add_kernel<<<grid_dim, block_dim>>>(idx, s1, last, n);
162
+ }
163
+
80
164
  #if defined(__cplusplus)
81
165
  #if 0
82
166
  { /* satisfy cc-mode */
@@ -889,6 +889,39 @@ cumo_na_check_contiguous(VALUE self)
889
889
  return Qfalse;
890
890
  }
891
891
 
892
+ VALUE
893
+ cumo_na_check_fortran_contiguous(VALUE self)
894
+ {
895
+ int i;
896
+ ssize_t st0;
897
+ cumo_narray_t *na;
898
+
899
+ switch(CUMO_RNARRAY_TYPE(self)) {
900
+ case CUMO_NARRAY_DATA_T:
901
+ case CUMO_NARRAY_FILEMAP_T:
902
+ return Qfalse;
903
+ case CUMO_NARRAY_VIEW_T:
904
+ CumoGetNArray(self,na);
905
+
906
+ // not contiguous if it has index
907
+ for (i=0; i < CUMO_NA_NDIM(na); i++) {
908
+ if (CUMO_NA_IS_INDEX_AT(na,i))
909
+ return Qfalse;
910
+ }
911
+
912
+ // check f-contiguous
913
+ st0 = cumo_na_element_stride(self); // elmsz
914
+ for (i=0; i < CUMO_NA_NDIM(na); i++) {
915
+ if (CUMO_NA_SHAPE(na)[i] == 1)
916
+ continue;
917
+ if (CUMO_NA_STRIDE_AT(na, i) != st0)
918
+ return Qfalse;
919
+ st0 *= CUMO_NA_SHAPE(na)[i];
920
+ }
921
+ }
922
+ return Qtrue;
923
+ }
924
+
892
925
  VALUE
893
926
  cumo_na_as_contiguous_array(VALUE a)
894
927
  {
@@ -1388,7 +1421,7 @@ static VALUE cumo_na_inplace( VALUE self );
1388
1421
  /*
1389
1422
  Load marshal data.
1390
1423
  @overload marshal_load(data)
1391
- @params [Array] Array containing marshal data.
1424
+ @param [Array] Array containing marshal data.
1392
1425
  @return [nil]
1393
1426
  */
1394
1427
  static VALUE
@@ -1833,6 +1866,9 @@ cumo_na_equal(VALUE self, volatile VALUE other)
1833
1866
  return Qfalse;
1834
1867
  }
1835
1868
  }
1869
+ if (na1->size == 0) {
1870
+ return Qtrue;
1871
+ }
1836
1872
  vbool = rb_funcall(self, cumo_id_eq, 1, other);
1837
1873
  return (rb_funcall(vbool, cumo_id_count_false_cpu, 0)==INT2FIX(0)) ? Qtrue : Qfalse;
1838
1874
  }
@@ -1929,6 +1965,7 @@ Init_cumo_narray()
1929
1965
  rb_define_method(cNArray, "debug_info", cumo_na_debug_info, 0);
1930
1966
 
1931
1967
  rb_define_method(cNArray, "contiguous?", cumo_na_check_contiguous, 0);
1968
+ rb_define_method(cNArray, "fortran_contiguous?", cumo_na_check_fortran_contiguous, 0);
1932
1969
 
1933
1970
  rb_define_method(cNArray, "view", cumo_na_make_view, 0);
1934
1971
  rb_define_method(cNArray, "expand_dims", cumo_na_expand_dims, 1);
@@ -56,7 +56,7 @@ typedef struct CUMO_NA_MD_LOOP {
56
56
  // [2,3,4], 111b for sum(), 010b for sum(axis: 1), 110b for sum(axis: [1,2])
57
57
  VALUE loop_opt;
58
58
  cumo_ndfunc_t *ndfunc;
59
- void (*loop_func)();
59
+ void (*loop_func)(cumo_ndfunc_t *, struct CUMO_NA_MD_LOOP *);
60
60
  } cumo_na_md_loop_t;
61
61
 
62
62
  #define LARG(lp,iarg) ((lp)->user.args[iarg])