cumo 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +23 -24
- data/bench/cumo_bench.rb +1 -0
- data/ext/cumo/cuda/memory_pool.cpp +9 -1
- data/ext/cumo/cuda/memory_pool_impl.cpp +2 -13
- data/ext/cumo/cumo.c +4 -4
- data/ext/cumo/depend.erb +1 -1
- data/ext/cumo/extconf.rb +2 -0
- data/ext/cumo/include/cumo.h +4 -4
- data/ext/cumo/include/cumo/indexer.h +50 -0
- data/ext/cumo/include/cumo/intern.h +1 -0
- data/ext/cumo/include/cumo/narray.h +20 -1
- data/ext/cumo/include/cumo/narray_kernel.h +10 -0
- data/ext/cumo/include/cumo/ndloop.h +1 -1
- data/ext/cumo/narray/array.c +8 -2
- data/ext/cumo/narray/gen/tmpl/store_array.c +15 -3
- data/ext/cumo/narray/gen/tmpl_bit/store_array.c +10 -2
- data/ext/cumo/narray/index.c +77 -43
- data/ext/cumo/narray/narray.c +11 -2
- data/ext/cumo/narray/ndloop.c +49 -1
- data/ext/cumo/narray/ndloop_kernel.cu +97 -0
- data/ext/cumo/narray/step.c +56 -250
- data/lib/cumo/narray/extra.rb +50 -1
- metadata +4 -4
    
        data/ext/cumo/narray/array.c
    CHANGED
    
    | @@ -117,10 +117,12 @@ static int cumo_na_mdai_object_type(int type, VALUE v) | |
| 117 117 | 
             
                if (rb_obj_is_kind_of(v, rb_cRange)) {
         | 
| 118 118 | 
             
                    MDAI_ATTR_TYPE(type,v,begin);
         | 
| 119 119 | 
             
                    MDAI_ATTR_TYPE(type,v,end);
         | 
| 120 | 
            -
             | 
| 120 | 
            +
            #ifdef HAVE_RB_ARITHMETIC_SEQUENCE_EXTRACT
         | 
| 121 | 
            +
                } else if (rb_obj_is_kind_of(v, rb_cArithSeq)) {
         | 
| 121 122 | 
             
                    MDAI_ATTR_TYPE(type,v,begin);
         | 
| 122 123 | 
             
                    MDAI_ATTR_TYPE(type,v,end);
         | 
| 123 124 | 
             
                    MDAI_ATTR_TYPE(type,v,step);
         | 
| 125 | 
            +
            #endif
         | 
| 124 126 | 
             
                } else {
         | 
| 125 127 | 
             
                    type = cumo_na_object_type(type,v);
         | 
| 126 128 | 
             
                }
         | 
| @@ -205,7 +207,11 @@ cumo_na_mdai_investigate(cumo_na_mdai_t *mdai, int ndim) | |
| 205 207 | 
             
                        }
         | 
| 206 208 | 
             
                    }
         | 
| 207 209 | 
             
                    else
         | 
| 208 | 
            -
             | 
| 210 | 
            +
            #ifdef HAVE_RB_ARITHMETIC_SEQUENCE_EXTRACT
         | 
| 211 | 
            +
                    if (rb_obj_is_kind_of(v, rb_cRange) || rb_obj_is_kind_of(v, rb_cArithSeq)) {
         | 
| 212 | 
            +
            #else
         | 
| 213 | 
            +
                    if (rb_obj_is_kind_of(v, rb_cRange) || rb_obj_is_kind_of(v, rb_cEnumerator)) {
         | 
| 214 | 
            +
            #endif
         | 
| 209 215 | 
             
                        cumo_na_step_sequence(v,&length,&dbeg,&dstep);
         | 
| 210 216 | 
             
                        len += length-1;
         | 
| 211 217 | 
             
                        mdai->type = cumo_na_mdai_object_type(mdai->type, v);
         | 
| @@ -65,7 +65,11 @@ static void | |
| 65 65 | 
             
                    if (idx1) {
         | 
| 66 66 | 
             
                        for (i=i1=0; i1<n1 && i<n; i++,i1++) {
         | 
| 67 67 | 
             
                            x = ptr[i1];
         | 
| 68 | 
            -
             | 
| 68 | 
            +
            #ifdef HAVE_RB_ARITHMETIC_SEQUENCE_EXTRACT
         | 
| 69 | 
            +
                            if (rb_obj_is_kind_of(x, rb_cRange) || rb_obj_is_kind_of(x, rb_cArithSeq)) {
         | 
| 70 | 
            +
            #else
         | 
| 71 | 
            +
                            if (rb_obj_is_kind_of(x, rb_cRange) || rb_obj_is_kind_of(x, rb_cEnumerator)) {
         | 
| 72 | 
            +
            #endif
         | 
| 69 73 | 
             
                                cumo_na_step_sequence(x,&len,&beg,&step);
         | 
| 70 74 | 
             
                                for (c=0; c<len && i<n; c++,i++) {
         | 
| 71 75 | 
             
                                    y = beg + step * c;
         | 
| @@ -81,7 +85,11 @@ static void | |
| 81 85 | 
             
                    } else {
         | 
| 82 86 | 
             
                        for (i=i1=0; i1<n1 && i<n; i++,i1++) {
         | 
| 83 87 | 
             
                            x = ptr[i1];
         | 
| 84 | 
            -
             | 
| 88 | 
            +
            #ifdef HAVE_RB_ARITHMETIC_SEQUENCE_EXTRACT
         | 
| 89 | 
            +
                            if (rb_obj_is_kind_of(x, rb_cRange) || rb_obj_is_kind_of(x, rb_cArithSeq)) {
         | 
| 90 | 
            +
            #else
         | 
| 91 | 
            +
                            if (rb_obj_is_kind_of(x, rb_cRange) || rb_obj_is_kind_of(x, rb_cEnumerator)) {
         | 
| 92 | 
            +
            #endif
         | 
| 85 93 | 
             
                                cumo_na_step_sequence(x,&len,&beg,&step);
         | 
| 86 94 | 
             
                                for (c=0; c<len && i<n; c++,i++) {
         | 
| 87 95 | 
             
                                    y = beg + step * c;
         | 
| @@ -110,7 +118,11 @@ static void | |
| 110 118 | 
             
                    dtype* host_z = ALLOC_N(dtype, n);
         | 
| 111 119 | 
             
                    for (i=i1=0; i1<n1 && i<n; i1++) {
         | 
| 112 120 | 
             
                        x = ptr[i1];
         | 
| 113 | 
            -
             | 
| 121 | 
            +
            #ifdef HAVE_RB_ARITHMETIC_SEQUENCE_EXTRACT
         | 
| 122 | 
            +
                        if (rb_obj_is_kind_of(x, rb_cRange) || rb_obj_is_kind_of(x, rb_cArithSeq)) {
         | 
| 123 | 
            +
            #else
         | 
| 124 | 
            +
                        if (rb_obj_is_kind_of(x, rb_cRange) || rb_obj_is_kind_of(x, rb_cEnumerator)) {
         | 
| 125 | 
            +
            #endif
         | 
| 114 126 | 
             
                            cumo_na_step_sequence(x,&len,&beg,&step);
         | 
| 115 127 | 
             
                            for (c=0; c<len && i<n; c++,i++) {
         | 
| 116 128 | 
             
                                y = beg + step * c;
         | 
| @@ -52,7 +52,11 @@ static void | |
| 52 52 | 
             
                if (idx1) {
         | 
| 53 53 | 
             
                    for (i=i1=0; i1<n1 && i<n; i++,i1++) {
         | 
| 54 54 | 
             
                        x = ptr[i1];
         | 
| 55 | 
            -
             | 
| 55 | 
            +
            #ifdef HAVE_RB_ARITHMETIC_SEQUENCE_EXTRACT
         | 
| 56 | 
            +
                        if (rb_obj_is_kind_of(x, rb_cRange) || rb_obj_is_kind_of(x, rb_cArithSeq)) {
         | 
| 57 | 
            +
            #else
         | 
| 58 | 
            +
                        if (rb_obj_is_kind_of(x, rb_cRange) || rb_obj_is_kind_of(x, rb_cEnumerator)) {
         | 
| 59 | 
            +
            #endif
         | 
| 56 60 | 
             
                            cumo_na_step_sequence(x,&len,&beg,&step);
         | 
| 57 61 | 
             
                            for (c=0; c<len && i<n; c++,i++) {
         | 
| 58 62 | 
             
                                y = beg + step * c;
         | 
| @@ -69,7 +73,11 @@ static void | |
| 69 73 | 
             
                } else {
         | 
| 70 74 | 
             
                    for (i=i1=0; i1<n1 && i<n; i++,i1++) {
         | 
| 71 75 | 
             
                        x = ptr[i1];
         | 
| 72 | 
            -
             | 
| 76 | 
            +
            #ifdef HAVE_RB_ARITHMETIC_SEQUENCE_EXTRACT
         | 
| 77 | 
            +
                        if (rb_obj_is_kind_of(x, rb_cRange) || rb_obj_is_kind_of(x, rb_cArithSeq)) {
         | 
| 78 | 
            +
            #else
         | 
| 79 | 
            +
                        if (rb_obj_is_kind_of(x, rb_cRange) || rb_obj_is_kind_of(x, rb_cEnumerator)) {
         | 
| 80 | 
            +
            #endif
         | 
| 73 81 | 
             
                            cumo_na_step_sequence(x,&len,&beg,&step);
         | 
| 74 82 | 
             
                            for (c=0; c<len && i<n; c++,i++) {
         | 
| 75 83 | 
             
                                y = beg + step * c;
         | 
    
        data/ext/cumo/narray/index.c
    CHANGED
    
    | @@ -12,23 +12,6 @@ | |
| 12 12 | 
             
            #define cIndex cumo_cInt32
         | 
| 13 13 | 
             
            #endif
         | 
| 14 14 |  | 
| 15 | 
            -
            // from ruby/enumerator.c
         | 
| 16 | 
            -
            struct enumerator {
         | 
| 17 | 
            -
                VALUE obj;
         | 
| 18 | 
            -
                ID    meth;
         | 
| 19 | 
            -
                VALUE args;
         | 
| 20 | 
            -
                // use only above in this source
         | 
| 21 | 
            -
                VALUE fib;
         | 
| 22 | 
            -
                VALUE dst;
         | 
| 23 | 
            -
                VALUE lookahead;
         | 
| 24 | 
            -
                VALUE feedvalue;
         | 
| 25 | 
            -
                VALUE stop_exc;
         | 
| 26 | 
            -
                VALUE size;
         | 
| 27 | 
            -
                // incompatible below depending on ruby version
         | 
| 28 | 
            -
                //VALUE procs;                      // ruby 2.4
         | 
| 29 | 
            -
                //rb_enumerator_size_func *size_fn; // ruby 2.1-2.4
         | 
| 30 | 
            -
                //VALUE (*size_fn)(ANYARGS);        // ruby 2.0
         | 
| 31 | 
            -
            };
         | 
| 32 15 |  | 
| 33 16 | 
             
            // note: the memory referenced by this pointer is not freed and causes a memory leak.
         | 
| 34 17 | 
             
            //
         | 
| @@ -204,6 +187,42 @@ cumo_na_parse_range(VALUE range, ssize_t step, int orig_dim, ssize_t size, cumo_ | |
| 204 187 | 
             
                ssize_t beg, end, beg_orig, end_orig;
         | 
| 205 188 | 
             
                const char *dot = "..", *edot = "...";
         | 
| 206 189 |  | 
| 190 | 
            +
            #ifdef HAVE_RB_ARITHMETIC_SEQUENCE_EXTRACT
         | 
| 191 | 
            +
                rb_arithmetic_sequence_components_t x;
         | 
| 192 | 
            +
                rb_arithmetic_sequence_extract(range, &x);
         | 
| 193 | 
            +
                step = NUM2SSIZET(x.step);
         | 
| 194 | 
            +
             | 
| 195 | 
            +
                beg = beg_orig = NUM2SSIZET(x.begin);
         | 
| 196 | 
            +
                if (beg < 0) {
         | 
| 197 | 
            +
                    beg += size;
         | 
| 198 | 
            +
                }
         | 
| 199 | 
            +
                if (T_NIL == TYPE(x.end)) { // endless range
         | 
| 200 | 
            +
                    end = size -1;
         | 
| 201 | 
            +
                    if (RTEST(x.exclude_end)) {
         | 
| 202 | 
            +
                        dot = edot;
         | 
| 203 | 
            +
                    }
         | 
| 204 | 
            +
                } else {
         | 
| 205 | 
            +
                    end = end_orig = NUM2SSIZET(x.end);
         | 
| 206 | 
            +
                    if (end < 0) {
         | 
| 207 | 
            +
                        end += size;
         | 
| 208 | 
            +
                    }
         | 
| 209 | 
            +
                    if (RTEST(x.exclude_end)) {
         | 
| 210 | 
            +
                        end--;
         | 
| 211 | 
            +
                        dot = edot;
         | 
| 212 | 
            +
                    }
         | 
| 213 | 
            +
                }
         | 
| 214 | 
            +
                if (beg < 0 || beg >= size || end < 0 || end >= size) {
         | 
| 215 | 
            +
                    if (T_NIL == TYPE(x.end)) { // endless range
         | 
| 216 | 
            +
                        rb_raise(rb_eRangeError,
         | 
| 217 | 
            +
                                 "%"SZF"d%s is out of range for size=%"SZF"d",
         | 
| 218 | 
            +
                                 beg_orig, dot, size);
         | 
| 219 | 
            +
                    } else {
         | 
| 220 | 
            +
                        rb_raise(rb_eRangeError,
         | 
| 221 | 
            +
                                 "%"SZF"d%s%"SZF"d is out of range for size=%"SZF"d",
         | 
| 222 | 
            +
                                 beg_orig, dot, end_orig, size);
         | 
| 223 | 
            +
                    }
         | 
| 224 | 
            +
                }
         | 
| 225 | 
            +
            #else
         | 
| 207 226 | 
             
                beg = beg_orig = NUM2SSIZET(rb_funcall(range,cumo_id_beg,0));
         | 
| 208 227 | 
             
                if (beg < 0) {
         | 
| 209 228 | 
             
                    beg += size;
         | 
| @@ -222,44 +241,59 @@ cumo_na_parse_range(VALUE range, ssize_t step, int orig_dim, ssize_t size, cumo_ | |
| 222 241 | 
             
                             "%"SZF"d%s%"SZF"d is out of range for size=%"SZF"d",
         | 
| 223 242 | 
             
                             beg_orig, dot, end_orig, size);
         | 
| 224 243 | 
             
                }
         | 
| 244 | 
            +
            #endif
         | 
| 225 245 | 
             
                n = (end-beg)/step+1;
         | 
| 226 246 | 
             
                if (n<0) n=0;
         | 
| 227 247 | 
             
                cumo_na_index_set_step(q,orig_dim,n,beg,step);
         | 
| 228 248 |  | 
| 229 249 | 
             
            }
         | 
| 230 250 |  | 
| 231 | 
            -
             | 
| 232 | 
            -
             | 
| 251 | 
            +
            void
         | 
| 252 | 
            +
            cumo_na_parse_enumerator_step(VALUE enum_obj, VALUE *pstep)
         | 
| 233 253 | 
             
            {
         | 
| 234 254 | 
             
                int len;
         | 
| 235 | 
            -
                 | 
| 236 | 
            -
                 | 
| 255 | 
            +
                VALUE step;
         | 
| 256 | 
            +
                cumo_enumerator_t *e;
         | 
| 237 257 |  | 
| 238 258 | 
             
                if (!RB_TYPE_P(enum_obj, T_DATA)) {
         | 
| 239 259 | 
             
                    rb_raise(rb_eTypeError,"wrong argument type (not T_DATA)");
         | 
| 240 260 | 
             
                }
         | 
| 241 | 
            -
                e = ( | 
| 261 | 
            +
                e = (cumo_enumerator_t *)DATA_PTR(enum_obj);
         | 
| 262 | 
            +
             | 
| 263 | 
            +
                if (!rb_obj_is_kind_of(e->obj, rb_cRange)) {
         | 
| 264 | 
            +
                    rb_raise(rb_eTypeError,"not Range object");
         | 
| 265 | 
            +
                }
         | 
| 242 266 |  | 
| 243 | 
            -
                if ( | 
| 244 | 
            -
                     | 
| 245 | 
            -
             | 
| 267 | 
            +
                if (e->meth == cumo_id_each) {
         | 
| 268 | 
            +
                    step = INT2NUM(1);
         | 
| 269 | 
            +
                }
         | 
| 270 | 
            +
                else if (e->meth == cumo_id_step) {
         | 
| 271 | 
            +
                    if (TYPE(e->args) != T_ARRAY) {
         | 
| 272 | 
            +
                        rb_raise(rb_eArgError,"no argument for step");
         | 
| 246 273 | 
             
                    }
         | 
| 247 | 
            -
                     | 
| 248 | 
            -
             | 
| 249 | 
            -
             | 
| 250 | 
            -
                        }
         | 
| 251 | 
            -
                        len = RARRAY_LEN(e->args);
         | 
| 252 | 
            -
                        if (len != 1) {
         | 
| 253 | 
            -
                            rb_raise(rb_eArgError,"invalid number of step argument (1 for %d)",len);
         | 
| 254 | 
            -
                        }
         | 
| 255 | 
            -
                        step = NUM2SSIZET(RARRAY_AREF(e->args,0));
         | 
| 256 | 
            -
                        cumo_na_parse_range(e->obj, step, orig_dim, size, q);
         | 
| 257 | 
            -
                    } else {
         | 
| 258 | 
            -
                        rb_raise(rb_eTypeError,"unknown Range method: %s",rb_id2name(e->meth));
         | 
| 274 | 
            +
                    len = RARRAY_LEN(e->args);
         | 
| 275 | 
            +
                    if (len != 1) {
         | 
| 276 | 
            +
                        rb_raise(rb_eArgError,"invalid number of step argument (1 for %d)",len);
         | 
| 259 277 | 
             
                    }
         | 
| 278 | 
            +
                    step = RARRAY_AREF(e->args,0);
         | 
| 260 279 | 
             
                } else {
         | 
| 261 | 
            -
                    rb_raise(rb_eTypeError," | 
| 280 | 
            +
                    rb_raise(rb_eTypeError,"unknown Range method: %s",rb_id2name(e->meth));
         | 
| 262 281 | 
             
                }
         | 
| 282 | 
            +
                if (pstep) *pstep = step;
         | 
| 283 | 
            +
            }
         | 
| 284 | 
            +
             | 
| 285 | 
            +
            static void
         | 
| 286 | 
            +
            cumo_na_parse_enumerator(VALUE enum_obj, int orig_dim, ssize_t size, cumo_na_index_arg_t *q)
         | 
| 287 | 
            +
            {
         | 
| 288 | 
            +
                VALUE step;
         | 
| 289 | 
            +
                cumo_enumerator_t *e;
         | 
| 290 | 
            +
             | 
| 291 | 
            +
                if (!RB_TYPE_P(enum_obj, T_DATA)) {
         | 
| 292 | 
            +
                    rb_raise(rb_eTypeError,"wrong argument type (not T_DATA)");
         | 
| 293 | 
            +
                }
         | 
| 294 | 
            +
                cumo_na_parse_enumerator_step(enum_obj, &step);
         | 
| 295 | 
            +
                e = (cumo_enumerator_t *)DATA_PTR(enum_obj);
         | 
| 296 | 
            +
                cumo_na_parse_range(e->obj, NUM2SSIZET(step), orig_dim, size, q); // e->obj : Range Object
         | 
| 263 297 | 
             
            }
         | 
| 264 298 |  | 
| 265 299 | 
             
            // Analyze *a* which is *i*-th index object and store the information to q
         | 
| @@ -316,14 +350,14 @@ cumo_na_index_parse_each(volatile VALUE a, ssize_t size, int i, cumo_na_index_ar | |
| 316 350 | 
             
                    if (rb_obj_is_kind_of(a, rb_cRange)) {
         | 
| 317 351 | 
             
                        cumo_na_parse_range(a, 1, i, size, q);
         | 
| 318 352 | 
             
                    }
         | 
| 353 | 
            +
            #ifdef HAVE_RB_ARITHMETIC_SEQUENCE_EXTRACT
         | 
| 354 | 
            +
                    else if (rb_obj_is_kind_of(a, rb_cArithSeq)) {
         | 
| 355 | 
            +
                        cumo_na_parse_range(a, 1, i, size, q);
         | 
| 356 | 
            +
                    }
         | 
| 357 | 
            +
            #endif
         | 
| 319 358 | 
             
                    else if (rb_obj_is_kind_of(a, rb_cEnumerator)) {
         | 
| 320 359 | 
             
                        cumo_na_parse_enumerator(a, i, size, q);
         | 
| 321 360 | 
             
                    }
         | 
| 322 | 
            -
                    else if (rb_obj_is_kind_of(a, cumo_na_cStep)) {
         | 
| 323 | 
            -
                        ssize_t beg, step, n;
         | 
| 324 | 
            -
                        cumo_na_step_array_index(a, size, (size_t*)(&n), &beg, &step);
         | 
| 325 | 
            -
                        cumo_na_index_set_step(q,i,n,beg,step);
         | 
| 326 | 
            -
                    }
         | 
| 327 361 | 
             
                    // NArray index
         | 
| 328 362 | 
             
                    else if (CUMO_NA_CumoIsNArray(a)) {
         | 
| 329 363 | 
             
                        cumo_na_parse_narray_index(a, i, size, q);
         | 
    
        data/ext/cumo/narray/narray.c
    CHANGED
    
    | @@ -40,10 +40,12 @@ VALUE cumo_sym_option; | |
| 40 40 | 
             
            VALUE cumo_sym_loop_opt;
         | 
| 41 41 | 
             
            VALUE cumo_sym_init;
         | 
| 42 42 |  | 
| 43 | 
            -
            VALUE cumo_na_cStep;
         | 
| 44 43 | 
             
            #ifndef HAVE_RB_CCOMPLEX
         | 
| 45 44 | 
             
            VALUE rb_cComplex;
         | 
| 46 45 | 
             
            #endif
         | 
| 46 | 
            +
            #ifdef HAVE_RB_ARITHMETIC_SEQUENCE_EXTRACT
         | 
| 47 | 
            +
            VALUE rb_cArithSeq;
         | 
| 48 | 
            +
            #endif
         | 
| 47 49 |  | 
| 48 50 | 
             
            int cumo_na_inspect_rows_=20;
         | 
| 49 51 | 
             
            int cumo_na_inspect_cols_=80;
         | 
| @@ -1512,7 +1514,11 @@ cumo_na_get_reduce_flag_from_axes(VALUE cumo_na_obj, VALUE axes) | |
| 1512 1514 | 
             
                        step = 0;
         | 
| 1513 1515 | 
             
                        //printf("beg=%d step=%d len=%d\n",beg,step,len);
         | 
| 1514 1516 | 
             
                    } else if (rb_obj_is_kind_of(v,rb_cRange) ||
         | 
| 1515 | 
            -
             | 
| 1517 | 
            +
            #ifdef HAVE_RB_ARITHMETIC_SEQUENCE_EXTRACT
         | 
| 1518 | 
            +
                               rb_obj_is_kind_of(v,rb_cArithSeq)) {
         | 
| 1519 | 
            +
            #else
         | 
| 1520 | 
            +
                               rb_obj_is_kind_of(v,rb_cEnumerator)) {
         | 
| 1521 | 
            +
            #endif
         | 
| 1516 1522 | 
             
                        cumo_na_step_array_index( v, ndim, &len, &beg, &step );
         | 
| 1517 1523 | 
             
                    } else {
         | 
| 1518 1524 | 
             
                        rb_raise(cumo_na_eDimensionError, "invalid dimension argument %s",
         | 
| @@ -1849,6 +1855,9 @@ Init_cumo_narray() | |
| 1849 1855 | 
             
                rb_require("complex");
         | 
| 1850 1856 | 
             
                rb_cComplex = rb_const_get(rb_cObject, rb_intern("Complex"));
         | 
| 1851 1857 | 
             
            #endif
         | 
| 1858 | 
            +
            #ifdef HAVE_RB_ARITHMETIC_SEQUENCE_EXTRACT
         | 
| 1859 | 
            +
                rb_cArithSeq = rb_path2class("Enumerator::ArithmeticSequence");
         | 
| 1860 | 
            +
            #endif
         | 
| 1852 1861 |  | 
| 1853 1862 | 
             
                rb_define_const(cNArray, "VERSION", rb_str_new2(CUMO_VERSION));
         | 
| 1854 1863 |  | 
    
        data/ext/cumo/narray/ndloop.c
    CHANGED
    
    | @@ -1,5 +1,6 @@ | |
| 1 1 | 
             
            #include <ruby.h>
         | 
| 2 2 | 
             
            #include "cumo.h"
         | 
| 3 | 
            +
            #include "cumo/indexer.h"
         | 
| 3 4 | 
             
            #include "cumo/narray.h"
         | 
| 4 5 | 
             
            #include "cumo/cuda/memory_pool.h"
         | 
| 5 6 | 
             
            #include "cumo/cuda/runtime.h"
         | 
| @@ -1164,11 +1165,48 @@ cumo_ndfunc_set_bufcp(cumo_ndfunc_t *nf, cumo_na_md_loop_t *lp) | |
| 1164 1165 | 
             
                }
         | 
| 1165 1166 | 
             
            }
         | 
| 1166 1167 |  | 
| 1168 | 
            +
            static cumo_na_iarray_stridx_t
         | 
| 1169 | 
            +
            cumo_na_make_iarray_buffer_copy(cumo_na_buffer_copy_t* lp)
         | 
| 1170 | 
            +
            {
         | 
| 1171 | 
            +
                cumo_na_iarray_stridx_t iarray;
         | 
| 1172 | 
            +
                int i;
         | 
| 1173 | 
            +
                int ndim = lp->ndim;
         | 
| 1174 | 
            +
                iarray.ptr = lp->src_ptr + lp->src_iter[0].pos;
         | 
| 1175 | 
            +
                for (i = 0; i < ndim; ++i) {
         | 
| 1176 | 
            +
                    if (LITER_SRC(lp,i).idx) {
         | 
| 1177 | 
            +
                        CUMO_SDX_SET_INDEX(iarray.stridx[i], LITER_SRC(lp,i).idx);
         | 
| 1178 | 
            +
                    } else {
         | 
| 1179 | 
            +
                        CUMO_SDX_SET_STRIDE(iarray.stridx[i], LITER_SRC(lp,i).step);
         | 
| 1180 | 
            +
                    }
         | 
| 1181 | 
            +
                }
         | 
| 1182 | 
            +
                return iarray;
         | 
| 1183 | 
            +
            }
         | 
| 1184 | 
            +
             | 
| 1185 | 
            +
            static cumo_na_indexer_t
         | 
| 1186 | 
            +
            cumo_na_make_indexer_buffer_copy(cumo_na_buffer_copy_t* lp)
         | 
| 1187 | 
            +
            {
         | 
| 1188 | 
            +
                cumo_na_indexer_t indexer;
         | 
| 1189 | 
            +
                int i;
         | 
| 1190 | 
            +
                indexer.ndim = lp->ndim;
         | 
| 1191 | 
            +
                indexer.total_size = 1;
         | 
| 1192 | 
            +
                for (i = 0; i< lp->ndim; ++i) {
         | 
| 1193 | 
            +
                    indexer.shape[i] = lp->n[i];
         | 
| 1194 | 
            +
                    indexer.total_size *= lp->n[i];
         | 
| 1195 | 
            +
                }
         | 
| 1196 | 
            +
                return indexer;
         | 
| 1197 | 
            +
            }
         | 
| 1198 | 
            +
             | 
| 1199 | 
            +
            void cumo_ndloop_copy_to_buffer_kernel_launch(cumo_na_iarray_stridx_t *a, cumo_na_indexer_t* indexer, char *buf, size_t elmsz);
         | 
| 1167 1200 |  | 
| 1168 1201 | 
             
            // Make contiguous memory for ops not supporting index or stride (step) loop
         | 
| 1169 1202 | 
             
            static void
         | 
| 1170 1203 | 
             
            ndloop_copy_to_buffer(cumo_na_buffer_copy_t *lp)
         | 
| 1171 1204 | 
             
            {
         | 
| 1205 | 
            +
                cumo_na_iarray_stridx_t a = cumo_na_make_iarray_buffer_copy(lp);
         | 
| 1206 | 
            +
                cumo_na_indexer_t indexer = cumo_na_make_indexer_buffer_copy(lp);
         | 
| 1207 | 
            +
                cumo_ndloop_copy_to_buffer_kernel_launch(&a, &indexer, lp->buf_ptr, lp->elmsz);
         | 
| 1208 | 
            +
             | 
| 1209 | 
            +
            #if 0
         | 
| 1172 1210 | 
             
                size_t *c;
         | 
| 1173 1211 | 
             
                char *src, *buf;
         | 
| 1174 1212 | 
             
                int  i;
         | 
| @@ -1230,11 +1268,19 @@ ndloop_copy_to_buffer(cumo_na_buffer_copy_t *lp) | |
| 1230 1268 | 
             
             loop_end:
         | 
| 1231 1269 | 
             
                ;
         | 
| 1232 1270 | 
             
                DBG(printf("]\n"));
         | 
| 1271 | 
            +
            #endif
         | 
| 1233 1272 | 
             
            }
         | 
| 1234 1273 |  | 
| 1274 | 
            +
            void cumo_ndloop_copy_from_buffer_kernel_launch(cumo_na_iarray_stridx_t *a, cumo_na_indexer_t* indexer, char *buf, size_t elmsz);
         | 
| 1275 | 
            +
             | 
| 1235 1276 | 
             
            static void
         | 
| 1236 1277 | 
             
            ndloop_copy_from_buffer(cumo_na_buffer_copy_t *lp)
         | 
| 1237 1278 | 
             
            {
         | 
| 1279 | 
            +
                cumo_na_iarray_stridx_t a = cumo_na_make_iarray_buffer_copy(lp);
         | 
| 1280 | 
            +
                cumo_na_indexer_t indexer = cumo_na_make_indexer_buffer_copy(lp);
         | 
| 1281 | 
            +
                cumo_ndloop_copy_from_buffer_kernel_launch(&a, &indexer, lp->buf_ptr, lp->elmsz);
         | 
| 1282 | 
            +
             | 
| 1283 | 
            +
            #if 0
         | 
| 1238 1284 | 
             
                size_t *c;
         | 
| 1239 1285 | 
             
                char *src, *buf;
         | 
| 1240 1286 | 
             
                int  i;
         | 
| @@ -1291,12 +1337,14 @@ ndloop_copy_from_buffer(cumo_na_buffer_copy_t *lp) | |
| 1291 1337 | 
             
                    for (;;) {
         | 
| 1292 1338 | 
             
                        if (i<=0) goto loop_end;
         | 
| 1293 1339 | 
             
                        i--;
         | 
| 1294 | 
            -
                         | 
| 1340 | 
            +
                        ++c[i];
         | 
| 1341 | 
            +
                        if (c[i] < lp->n[i]) break;
         | 
| 1295 1342 | 
             
                        c[i] = 0;
         | 
| 1296 1343 | 
             
                    }
         | 
| 1297 1344 | 
             
                }
         | 
| 1298 1345 | 
             
             loop_end:
         | 
| 1299 1346 | 
             
                DBG(printf("]\n"));
         | 
| 1347 | 
            +
            #endif
         | 
| 1300 1348 | 
             
            }
         | 
| 1301 1349 |  | 
| 1302 1350 |  | 
| @@ -0,0 +1,97 @@ | |
| 1 | 
            +
            #include "cumo/narray_kernel.h"
         | 
| 2 | 
            +
            #include "cumo/indexer.h"
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            #if defined(__cplusplus)
         | 
| 5 | 
            +
            extern "C" {
         | 
| 6 | 
            +
            #if 0
         | 
| 7 | 
            +
            } /* satisfy cc-mode */
         | 
| 8 | 
            +
            #endif
         | 
| 9 | 
            +
            #endif
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            #define CUMO_NDLOOP_COPY_FROM_BUFFER_KERNEL(NDIM) \
         | 
| 12 | 
            +
            __global__ void cumo_ndloop_copy_from_buffer_kernel_dim##NDIM( \
         | 
| 13 | 
            +
                    cumo_na_iarray_stridx_t a, cumo_na_indexer_t indexer, char *buf, size_t elmsz) { \
         | 
| 14 | 
            +
                for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < indexer.total_size; i += blockDim.x * gridDim.x) { \
         | 
| 15 | 
            +
                    cumo_na_indexer_set_dim##NDIM(&indexer, i); \
         | 
| 16 | 
            +
                    char* p = cumo_na_iarray_stridx_at_dim##NDIM(&a, &indexer); \
         | 
| 17 | 
            +
                    memcpy(p, buf + i * elmsz, elmsz); \
         | 
| 18 | 
            +
                } \
         | 
| 19 | 
            +
            }
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            #define CUMO_NDLOOP_COPY_TO_BUFFER_KERNEL(NDIM) \
         | 
| 22 | 
            +
            __global__ void cumo_ndloop_copy_to_buffer_kernel_dim##NDIM( \
         | 
| 23 | 
            +
                    cumo_na_iarray_stridx_t a, cumo_na_indexer_t indexer, char *buf, size_t elmsz) { \
         | 
| 24 | 
            +
                for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < indexer.total_size; i += blockDim.x * gridDim.x) { \
         | 
| 25 | 
            +
                    cumo_na_indexer_set_dim##NDIM(&indexer, i); \
         | 
| 26 | 
            +
                    char* p = cumo_na_iarray_stridx_at_dim##NDIM(&a, &indexer); \
         | 
| 27 | 
            +
                    memcpy(buf + i * elmsz, p, elmsz); \
         | 
| 28 | 
            +
                } \
         | 
| 29 | 
            +
            }
         | 
| 30 | 
            +
             | 
| 31 | 
            +
            CUMO_NDLOOP_COPY_FROM_BUFFER_KERNEL(1)
         | 
| 32 | 
            +
            CUMO_NDLOOP_COPY_FROM_BUFFER_KERNEL(2)
         | 
| 33 | 
            +
            CUMO_NDLOOP_COPY_FROM_BUFFER_KERNEL(3)
         | 
| 34 | 
            +
            CUMO_NDLOOP_COPY_FROM_BUFFER_KERNEL(4)
         | 
| 35 | 
            +
            CUMO_NDLOOP_COPY_FROM_BUFFER_KERNEL()
         | 
| 36 | 
            +
             | 
| 37 | 
            +
            CUMO_NDLOOP_COPY_TO_BUFFER_KERNEL(1)
         | 
| 38 | 
            +
            CUMO_NDLOOP_COPY_TO_BUFFER_KERNEL(2)
         | 
| 39 | 
            +
            CUMO_NDLOOP_COPY_TO_BUFFER_KERNEL(3)
         | 
| 40 | 
            +
            CUMO_NDLOOP_COPY_TO_BUFFER_KERNEL(4)
         | 
| 41 | 
            +
            CUMO_NDLOOP_COPY_TO_BUFFER_KERNEL()
         | 
| 42 | 
            +
             | 
| 43 | 
            +
            #undef CUMO_NDLOOP_COPY_FROM_BUFFER_KERNEL
         | 
| 44 | 
            +
            #undef CUMO_NDLOOP_COPY_TO_BUFFER_KERNEL
         | 
| 45 | 
            +
             | 
| 46 | 
            +
            void cumo_ndloop_copy_from_buffer_kernel_launch(cumo_na_iarray_stridx_t *a, cumo_na_indexer_t* indexer, char *buf, size_t elmsz)
         | 
| 47 | 
            +
            {
         | 
| 48 | 
            +
                size_t grid_dim = cumo_get_grid_dim(indexer->total_size);
         | 
| 49 | 
            +
                size_t block_dim = cumo_get_block_dim(indexer->total_size);
         | 
| 50 | 
            +
                switch (indexer->ndim) {
         | 
| 51 | 
            +
                    case 1:
         | 
| 52 | 
            +
                        cumo_ndloop_copy_from_buffer_kernel_dim1<<<grid_dim, block_dim>>>(*a,*indexer,buf,elmsz);
         | 
| 53 | 
            +
                        break;
         | 
| 54 | 
            +
                    case 2:
         | 
| 55 | 
            +
                        cumo_ndloop_copy_from_buffer_kernel_dim2<<<grid_dim, block_dim>>>(*a,*indexer,buf,elmsz);
         | 
| 56 | 
            +
                        break;
         | 
| 57 | 
            +
                    case 3:
         | 
| 58 | 
            +
                        cumo_ndloop_copy_from_buffer_kernel_dim3<<<grid_dim, block_dim>>>(*a,*indexer,buf,elmsz);
         | 
| 59 | 
            +
                        break;
         | 
| 60 | 
            +
                    case 4:
         | 
| 61 | 
            +
                        cumo_ndloop_copy_from_buffer_kernel_dim4<<<grid_dim, block_dim>>>(*a,*indexer,buf,elmsz);
         | 
| 62 | 
            +
                        break;
         | 
| 63 | 
            +
                    default:
         | 
| 64 | 
            +
                        cumo_ndloop_copy_from_buffer_kernel_dim<<<grid_dim, block_dim>>>(*a,*indexer,buf,elmsz);
         | 
| 65 | 
            +
                        break;
         | 
| 66 | 
            +
                }
         | 
| 67 | 
            +
            }
         | 
| 68 | 
            +
             | 
| 69 | 
            +
            void cumo_ndloop_copy_to_buffer_kernel_launch(cumo_na_iarray_stridx_t *a, cumo_na_indexer_t* indexer, char *buf, size_t elmsz)
         | 
| 70 | 
            +
            {
         | 
| 71 | 
            +
                size_t grid_dim = cumo_get_grid_dim(indexer->total_size);
         | 
| 72 | 
            +
                size_t block_dim = cumo_get_block_dim(indexer->total_size);
         | 
| 73 | 
            +
                switch (indexer->ndim) {
         | 
| 74 | 
            +
                    case 1:
         | 
| 75 | 
            +
                        cumo_ndloop_copy_to_buffer_kernel_dim1<<<grid_dim, block_dim>>>(*a,*indexer,buf,elmsz);
         | 
| 76 | 
            +
                        break;
         | 
| 77 | 
            +
                    case 2:
         | 
| 78 | 
            +
                        cumo_ndloop_copy_to_buffer_kernel_dim2<<<grid_dim, block_dim>>>(*a,*indexer,buf,elmsz);
         | 
| 79 | 
            +
                        break;
         | 
| 80 | 
            +
                    case 3:
         | 
| 81 | 
            +
                        cumo_ndloop_copy_to_buffer_kernel_dim3<<<grid_dim, block_dim>>>(*a,*indexer,buf,elmsz);
         | 
| 82 | 
            +
                        break;
         | 
| 83 | 
            +
                    case 4:
         | 
| 84 | 
            +
                        cumo_ndloop_copy_to_buffer_kernel_dim4<<<grid_dim, block_dim>>>(*a,*indexer,buf,elmsz);
         | 
| 85 | 
            +
                        break;
         | 
| 86 | 
            +
                    default:
         | 
| 87 | 
            +
                        cumo_ndloop_copy_to_buffer_kernel_dim<<<grid_dim, block_dim>>>(*a,*indexer,buf,elmsz);
         | 
| 88 | 
            +
                        break;
         | 
| 89 | 
            +
                }
         | 
| 90 | 
            +
            }
         | 
| 91 | 
            +
             | 
| 92 | 
            +
            #if defined(__cplusplus)
         | 
| 93 | 
            +
            #if 0
         | 
| 94 | 
            +
            { /* satisfy cc-mode */
         | 
| 95 | 
            +
            #endif
         | 
| 96 | 
            +
            }  /* extern "C" { */
         | 
| 97 | 
            +
            #endif
         |