cumo 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/ext/cumo/extconf.rb +2 -0
- data/ext/cumo/include/cumo.h +2 -2
- data/ext/cumo/narray/data.c +21 -24
- data/ext/cumo/narray/data_kernel.cu +75 -0
- data/ext/cumo/narray/gen/tmpl/qsort.c +1 -0
- data/ext/cumo/narray/gen/tmpl/sort.c +1 -0
- data/ext/cumo/narray/index.c +15 -36
- data/ext/cumo/narray/index_kernel.cu +86 -0
- metadata +3 -1
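
Most of the changes below share one pattern: host-side loops over view indices, which previously forced a cudaDeviceSynchronize() behind a CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE, are replaced by grid-stride CUDA kernels collected in the two new .cu files. The following is a minimal, self-contained sketch of that grid-stride pattern, assuming a Linux/CUDA toolchain; the kernel, launcher, and constant names are illustrative and are not part of cumo.

// Illustrative grid-stride kernel and launcher (hypothetical names, not cumo API).
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void example_scale_index_kernel(size_t *idx, long stride, uint64_t n)
{
    // Each thread advances by the total thread count, so any n is covered
    // regardless of the launch configuration.
    for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
        idx[i] = idx[i] * stride;
    }
}

void example_scale_index_launch(size_t *idx, long stride, uint64_t n)
{
    const uint64_t block_dim = 128;                      // assumed block size
    const uint64_t grid_dim = (n + block_dim - 1) / block_dim;
    example_scale_index_kernel<<<grid_dim, block_dim>>>(idx, stride, n);
}

int main(void)
{
    const uint64_t n = 8;
    size_t *idx = NULL;
    cudaMallocManaged(&idx, n * sizeof(size_t));         // managed memory keeps the demo short
    for (uint64_t i = 0; i < n; ++i) idx[i] = i;
    example_scale_index_launch(idx, 4, n);               // multiply each index by a stride of 4
    cudaDeviceSynchronize();                             // the host prints next, so it must wait here
    for (uint64_t i = 0; i < n; ++i) printf("%zu ", idx[i]);
    printf("\n");
    cudaFree(idx);
    return 0;
}

The kernels added in data_kernel.cu and index_kernel.cu below have the same shape, but compute view offsets (diagonal offsets, fancy-index translations, stride scaling) instead of this toy scaling.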
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ac2b9873bc48d45afcac57ff6e45ba84cc69ed1c61430cb13236a5c1ce018d0c
+  data.tar.gz: c001063b6a66de3055f98789420d5574b5b2d53357624dc6ffbe750ce2f727f1
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f18aa1652ddd921ae91da6f75e28a5b9338091111a07b1cc97b586b19e0a755fcb195b2307626a27f95def01845629ed710786f12932888a14e7dcfc55b0d034
+  data.tar.gz: ed082b7188a9b517074eb78216fd0a15333ae84e7eeb271c6f7675d8c4bfecd0ef8ccf575017dfe450c75b85e5f275b801c11295e985be447cd9c08703601c86
data/CHANGELOG.md
CHANGED
data/ext/cumo/extconf.rb
CHANGED
data/ext/cumo/include/cumo.h
CHANGED
data/ext/cumo/narray/data.c
CHANGED
@@ -53,15 +53,23 @@ static ID cumo_id_swap_byte;
     } \
 }
 
-
+void cumo_iter_copy_bytes_kernel_launch(char *p1, char *p2, ssize_t s1, ssize_t s2, size_t *idx1, size_t *idx2, size_t n, int elmsz);
+// #define m_memcpy(src,dst) memcpy(dst,src,e)
+
 static void
 iter_copy_bytes(cumo_na_loop_t *const lp)
 {
-    size_t
-
-
-
-
+    size_t n;
+    ssize_t s1, s2;
+    char *p1, *p2;
+    size_t *idx1, *idx2;
+    CUMO_INIT_COUNTER(lp, n);
+    CUMO_INIT_PTR_IDX(lp, 0, p1, s1, idx1);
+    CUMO_INIT_PTR_IDX(lp, 1, p2, s2, idx2);
+    cumo_iter_copy_bytes_kernel_launch(p1, p2, s1, s2, idx1, idx2, n, lp->args[0].elmsz);
+    // size_t e;
+    // e = lp->args[0].elmsz;
+    // LOOP_UNARY_PTR(lp,m_memcpy);
 }
 
 VALUE
@@ -562,6 +570,10 @@ cumo_na_flatten(VALUE self)
 
 #define MIN(a,b) (((a)<(b))?(a):(b))
 
+void cumo_na_diagonal_index_index_kernel_launch(size_t *idx, size_t *idx0, size_t *idx1, size_t k0, size_t k1, uint64_t n);
+void cumo_na_diagonal_index_stride_kernel_launch(size_t *idx, size_t *idx0, ssize_t s1, size_t k0, size_t k1, uint64_t n);
+void cumo_na_diagonal_stride_index_kernel_launch(size_t *idx, ssize_t s0, size_t *idx1, size_t k0, size_t k1, uint64_t n);
+
 /*
   Returns a diagonal view of NArray
   @overload diagonal([offset,axes])
@@ -601,7 +613,6 @@ static VALUE
 cumo_na_diagonal(int argc, VALUE *argv, VALUE self)
 {
     int i, k, nd;
-    size_t j;
     size_t *idx0, *idx1, *diag_idx;
     size_t *shape;
     size_t diag_size;
@@ -754,20 +765,12 @@ cumo_na_diagonal(int argc, VALUE *argv, VALUE self)
         idx0 = CUMO_SDX_GET_INDEX(na1->stridx[ax[0]]);
         // diag_idx = ALLOC_N(size_t, diag_size);
         diag_idx = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*diag_size);
-
-        CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_diagonal", "any");
-        cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
-
         if (CUMO_SDX_IS_INDEX(na1->stridx[ax[1]])) {
             idx1 = CUMO_SDX_GET_INDEX(na1->stridx[ax[1]]);
-            for (j=0; j<diag_size; j++) {
-                diag_idx[j] = idx0[j+k0] + idx1[j+k1];
-            }
+            cumo_na_diagonal_index_index_kernel_launch(diag_idx, idx0, idx1, k0, k1, diag_size);
         } else {
             stride1 = CUMO_SDX_GET_STRIDE(na1->stridx[ax[1]]);
-            for (j=0; j<diag_size; j++) {
-                diag_idx[j] = idx0[j+k0] + stride1*(j+k1);
-            }
+            cumo_na_diagonal_index_stride_kernel_launch(diag_idx, idx0, stride1, k0, k1, diag_size);
         }
         CUMO_SDX_SET_INDEX(na2->stridx[nd-2],diag_idx);
     } else {
@@ -776,13 +779,7 @@ cumo_na_diagonal(int argc, VALUE *argv, VALUE self)
             idx1 = CUMO_SDX_GET_INDEX(na1->stridx[ax[1]]);
             // diag_idx = ALLOC_N(size_t, diag_size);
             diag_idx = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*diag_size);
-
-            CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_diagonal", "any");
-            cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
-
-            for (j=0; j<diag_size; j++) {
-                diag_idx[j] = stride0*(j+k0) + idx1[j+k1];
-            }
+            cumo_na_diagonal_stride_index_kernel_launch(diag_idx, stride0, idx1, k0, k1, diag_size);
             CUMO_SDX_SET_INDEX(na2->stridx[nd-2],diag_idx);
         } else {
             stride1 = CUMO_SDX_GET_STRIDE(na1->stridx[ax[1]]);
data/ext/cumo/narray/data_kernel.cu
ADDED
@@ -0,0 +1,75 @@
+#include "cumo/narray_kernel.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#if 0
+} /* satisfy cc-mode */
+#endif
+#endif
+
+__global__ void cumo_iter_copy_bytes_kernel(char *p1, char *p2, ssize_t s1, ssize_t s2, size_t *idx1, size_t *idx2, uint64_t n, ssize_t elmsz)
+{
+    char *p1_ = NULL;
+    char *p2_ = NULL;
+    for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
+        p1_ = p1 + (idx1 ? idx1[i] : i * s1);
+        p2_ = p2 + (idx2 ? idx2[i] : i * s2);
+        memcpy(p2_, p1_, elmsz);
+    }
+}
+
+__global__ void cumo_na_diagonal_index_index_kernel(size_t *idx, size_t *idx0, size_t *idx1, size_t k0, size_t k1, uint64_t n)
+{
+    for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
+        idx[i] = idx0[i+k0] + idx1[i+k1];
+    }
+}
+
+__global__ void cumo_na_diagonal_index_stride_kernel(size_t *idx, size_t *idx0, ssize_t s1, size_t k0, size_t k1, uint64_t n)
+{
+    for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
+        idx[i] = idx0[i+k0] + s1*(i+k1);
+    }
+}
+
+__global__ void cumo_na_diagonal_stride_index_kernel(size_t *idx, ssize_t s0, size_t *idx1, size_t k0, size_t k1, uint64_t n)
+{
+    for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
+        idx[i] = s0*(i+k0) + idx1[i+k1];
+    }
+}
+
+void cumo_iter_copy_bytes_kernel_launch(char *p1, char *p2, ssize_t s1, ssize_t s2, size_t *idx1, size_t *idx2, uint64_t n, ssize_t elmsz)
+{
+    size_t grid_dim = cumo_get_grid_dim(n);
+    size_t block_dim = cumo_get_block_dim(n);
+    cumo_iter_copy_bytes_kernel<<<grid_dim, block_dim>>>(p1, p2, s1, s2, idx1, idx2, n, elmsz);
+}
+
+void cumo_na_diagonal_index_index_kernel_launch(size_t *idx, size_t *idx0, size_t *idx1, size_t k0, size_t k1, uint64_t n)
+{
+    size_t grid_dim = cumo_get_grid_dim(n);
+    size_t block_dim = cumo_get_block_dim(n);
+    cumo_na_diagonal_index_index_kernel<<<grid_dim, block_dim>>>(idx, idx0, idx1, k0, k1, n);
+}
+
+void cumo_na_diagonal_index_stride_kernel_launch(size_t *idx, size_t *idx0, ssize_t s1, size_t k0, size_t k1, uint64_t n)
+{
+    size_t grid_dim = cumo_get_grid_dim(n);
+    size_t block_dim = cumo_get_block_dim(n);
+    cumo_na_diagonal_index_stride_kernel<<<grid_dim, block_dim>>>(idx, idx0, s1, k0, k1, n);
+}
+
+void cumo_na_diagonal_stride_index_kernel_launch(size_t *idx, ssize_t s0, size_t *idx1, size_t k0, size_t k1, uint64_t n)
+{
+    size_t grid_dim = cumo_get_grid_dim(n);
+    size_t block_dim = cumo_get_block_dim(n);
+    cumo_na_diagonal_stride_index_kernel<<<grid_dim, block_dim>>>(idx, s0, idx1, k0, k1, n);
+}
+
+#if defined(__cplusplus)
+#if 0
+{ /* satisfy cc-mode */
+#endif
+} /* extern "C" { */
+#endif
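
The launchers above size their launches with cumo_get_grid_dim() and cumo_get_block_dim(), whose definitions are not part of this diff. As a rough sketch only, such helpers typically clamp the block size and derive a grid size that covers n elements; the names and constants below are assumptions for illustration, not the gem's actual implementation.

// Hypothetical stand-ins for cumo_get_block_dim / cumo_get_grid_dim.
// The grid-stride loops above tolerate a clamped grid, so an upper bound is safe.
#include <cstddef>
#include <cstdint>

static const size_t kAssumedMaxBlockDim = 128;          // assumed threads per block
static const size_t kAssumedMaxGridDim  = 2147483647u;  // assumed grid.x limit (2^31 - 1)

size_t example_get_block_dim(uint64_t n)
{
    if (n == 0) return 1;                                // avoid a zero-sized launch
    return n < kAssumedMaxBlockDim ? (size_t)n : kAssumedMaxBlockDim;
}

size_t example_get_grid_dim(uint64_t n)
{
    size_t block_dim = example_get_block_dim(n);
    size_t grid_dim = (n + block_dim - 1) / block_dim;   // ceil(n / block_dim)
    if (grid_dim == 0) grid_dim = 1;
    return grid_dim > kAssumedMaxGridDim ? kAssumedMaxGridDim : grid_dim;
}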
data/ext/cumo/narray/gen/tmpl/qsort.c
CHANGED
data/ext/cumo/narray/gen/tmpl/sort.c
CHANGED
@@ -9,6 +9,7 @@ static void
     CUMO_INIT_COUNTER(lp, n);
     CUMO_INIT_PTR(lp, 0, ptr, step);
     CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
+    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
     <%=type_name%>_qsort<%=j%>(ptr, n, step);
 }
 <% end %>
data/ext/cumo/narray/index.c
CHANGED
@@ -391,12 +391,14 @@ cumo_na_get_strides_nadata(const cumo_narray_data_t *na, ssize_t *strides, ssize
     }
 }
 
+void cumo_na_index_aref_nadata_index_stride_kernel_launch(size_t *idx, ssize_t s1, uint64_t n);
+
 static void
 cumo_na_index_aref_nadata(cumo_narray_data_t *na1, cumo_narray_view_t *na2,
                           cumo_na_index_arg_t *q, ssize_t elmsz, int ndim, int keep_dim)
 {
     int i, j;
-    ssize_t size,
+    ssize_t size, total=1;
     ssize_t stride1;
     ssize_t *strides_na1;
     size_t *index;
@@ -425,15 +427,10 @@ cumo_na_index_aref_nadata(cumo_narray_data_t *na1, cumo_narray_view_t *na2,
 
         // array index
         if (q[i].idx != NULL) {
-            CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_nadata", "any");
-            cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
-
             index = q[i].idx;
             CUMO_SDX_SET_INDEX(na2->stridx[j],index);
             q[i].idx = NULL;
-            for (k=0; k<size; k++) {
-                index[k] = index[k] * stride1;
-            }
+            cumo_na_index_aref_nadata_index_stride_kernel_launch(index, stride1, size);
         } else {
             beg = q[i].beg;
             step = q[i].step;
@@ -447,6 +444,11 @@ cumo_na_index_aref_nadata(cumo_narray_data_t *na1, cumo_narray_view_t *na2,
 }
 
 
+void cumo_na_index_aref_naview_index_index_kernel_launch(size_t *idx, size_t *idx1, uint64_t n);
+void cumo_na_index_aref_naview_index_stride_last_kernel_launch(size_t *idx, ssize_t s1, size_t last, uint64_t n);
+void cumo_na_index_aref_naview_index_stride_kernel_launch(size_t *idx, ssize_t s1, uint64_t n);
+void cumo_na_index_aref_naview_index_index_beg_step_kernel_launch(size_t *idx, size_t *idx1, size_t beg, ssize_t step, uint64_t n);
+
 static void
 cumo_na_index_aref_naview(cumo_narray_view_t *na1, cumo_narray_view_t *na2,
                           cumo_na_index_arg_t *q, ssize_t elmsz, int ndim, int keep_dim)
@@ -481,64 +483,41 @@ cumo_na_index_aref_naview(cumo_narray_view_t *na1, cumo_narray_view_t *na2,
         }
         else if (q[i].idx != NULL && CUMO_SDX_IS_INDEX(sdx1)) {
            // index <- index
-            int k;
             size_t *index = q[i].idx;
-
-            CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_naview", "any");
-            cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
-
+            size_t *index1 = CUMO_SDX_GET_INDEX(sdx1);
             CUMO_SDX_SET_INDEX(na2->stridx[j], index);
             q[i].idx = NULL;
-
-            for (k=0; k<size; k++) {
-                index[k] = CUMO_SDX_GET_INDEX(sdx1)[index[k]];
-            }
+            cumo_na_index_aref_naview_index_index_kernel_launch(index, index1, size);
         }
         else if (q[i].idx != NULL && CUMO_SDX_IS_STRIDE(sdx1)) {
            // index <- step
             ssize_t stride1 = CUMO_SDX_GET_STRIDE(sdx1);
             size_t *index = q[i].idx;
-
-            CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_naview", "any");
-            cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
-
             CUMO_SDX_SET_INDEX(na2->stridx[j],index);
             q[i].idx = NULL;
 
             if (stride1<0) {
                 size_t last;
-                int k;
                 stride1 = -stride1;
                 last = na1->base.shape[q[i].orig_dim] - 1;
                 if (na2->offset < last * stride1) {
                     rb_raise(rb_eStandardError,"bug: negative offset");
                 }
                 na2->offset -= last * stride1;
-                for (k=0; k<size; k++) {
-                    index[k] = (last - index[k]) * stride1;
-                }
+                cumo_na_index_aref_naview_index_stride_last_kernel_launch(index, stride1, last, size);
             } else {
-
-                for (k=0; k<size; k++) {
-                    index[k] = index[k] * stride1;
-                }
+                cumo_na_index_aref_naview_index_stride_kernel_launch(index, stride1, size);
             }
         }
         else if (q[i].idx == NULL && CUMO_SDX_IS_INDEX(sdx1)) {
            // step <- index
-            int k;
             size_t beg = q[i].beg;
             ssize_t step = q[i].step;
             // size_t *index = ALLOC_N(size_t, size);
             size_t *index = (size_t*)cumo_cuda_runtime_malloc(sizeof(size_t)*size);
+            size_t *index1 = CUMO_SDX_GET_INDEX(sdx1);
             CUMO_SDX_SET_INDEX(na2->stridx[j],index);
-
-            CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("na_index_aref_naview", "any");
-            cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
-
-            for (k=0; k<size; k++) {
-                index[k] = CUMO_SDX_GET_INDEX(sdx1)[beg+step*k];
-            }
+            cumo_na_index_aref_naview_index_index_beg_step_kernel_launch(index, index1, beg, step, size);
         }
         else if (q[i].idx == NULL && CUMO_SDX_IS_STRIDE(sdx1)) {
            // step <- step
data/ext/cumo/narray/index_kernel.cu
ADDED
@@ -0,0 +1,86 @@
+#include "cumo/narray_kernel.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#if 0
+} /* satisfy cc-mode */
+#endif
+#endif
+
+__global__ void cumo_na_index_aref_nadata_index_stride_kernel(size_t *idx, ssize_t s1, uint64_t n)
+{
+    for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
+        idx[i] = idx[i] * s1;
+    }
+}
+
+__global__ void cumo_na_index_aref_naview_index_index_kernel(size_t *idx, size_t *idx1, uint64_t n)
+{
+    for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
+        idx[i] = idx1[idx[i]];
+    }
+}
+
+__global__ void cumo_na_index_aref_naview_index_stride_last_kernel(size_t *idx, ssize_t s1, size_t last, uint64_t n)
+{
+    for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
+        idx[i] = (last - idx[i]) * s1;
+    }
+}
+
+__global__ void cumo_na_index_aref_naview_index_stride_kernel(size_t *idx, ssize_t s1, uint64_t n)
+{
+    for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
+        idx[i] = idx[i] * s1;
+    }
+}
+
+__global__ void cumo_na_index_aref_naview_index_index_beg_step_kernel(size_t *idx, size_t *idx1, size_t beg, ssize_t step, uint64_t n)
+{
+    for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
+        idx[i] = idx1[beg + step * i];
+    }
+}
+
+void cumo_na_index_aref_nadata_index_stride_kernel_launch(size_t *idx, ssize_t s1, uint64_t n)
+{
+    size_t grid_dim = cumo_get_grid_dim(n);
+    size_t block_dim = cumo_get_block_dim(n);
+    cumo_na_index_aref_nadata_index_stride_kernel<<<grid_dim, block_dim>>>(idx, s1, n);
+}
+
+void cumo_na_index_aref_naview_index_index_kernel_launch(size_t *idx, size_t *idx1, uint64_t n)
+{
+    size_t grid_dim = cumo_get_grid_dim(n);
+    size_t block_dim = cumo_get_block_dim(n);
+    cumo_na_index_aref_naview_index_index_kernel<<<grid_dim, block_dim>>>(idx, idx1, n);
+}
+
+void cumo_na_index_aref_naview_index_stride_last_kernel_launch(size_t *idx, ssize_t s1, size_t last, uint64_t n)
+{
+    size_t grid_dim = cumo_get_grid_dim(n);
+    size_t block_dim = cumo_get_block_dim(n);
+    cumo_na_index_aref_naview_index_stride_last_kernel<<<grid_dim, block_dim>>>(idx, s1, last, n);
+}
+
+void cumo_na_index_aref_naview_index_stride_kernel_launch(size_t *idx, ssize_t s1, uint64_t n)
+{
+    size_t grid_dim = cumo_get_grid_dim(n);
+    size_t block_dim = cumo_get_block_dim(n);
+    cumo_na_index_aref_naview_index_stride_kernel<<<grid_dim, block_dim>>>(idx, s1, n);
+}
+
+void cumo_na_index_aref_naview_index_index_beg_step_kernel_launch(size_t *idx, size_t *idx1, size_t beg, ssize_t step, uint64_t n)
+{
+    size_t grid_dim = cumo_get_grid_dim(n);
+    size_t block_dim = cumo_get_block_dim(n);
+    cumo_na_index_aref_naview_index_index_beg_step_kernel<<<grid_dim, block_dim>>>(idx, idx1, beg, step, n);
+}
+
+#if defined(__cplusplus)
+#if 0
+{ /* satisfy cc-mode */
+#endif
+} /* extern "C" { */
+#endif
+
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cumo
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.2.2
 platform: ruby
 authors:
 - Naotoshi Seo
@@ -170,6 +170,7 @@ files:
 - ext/cumo/narray/SFMT.h
 - ext/cumo/narray/array.c
 - ext/cumo/narray/data.c
+- ext/cumo/narray/data_kernel.cu
 - ext/cumo/narray/gen/cogen.rb
 - ext/cumo/narray/gen/cogen_kernel.rb
 - ext/cumo/narray/gen/def/bit.rb
@@ -305,6 +306,7 @@ files:
 - ext/cumo/narray/gen/tmpl_bit/where.c
 - ext/cumo/narray/gen/tmpl_bit/where2.c
 - ext/cumo/narray/index.c
+- ext/cumo/narray/index_kernel.cu
 - ext/cumo/narray/kwargs.c
 - ext/cumo/narray/math.c
 - ext/cumo/narray/narray.c