cumo 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +23 -24
- data/bench/cumo_bench.rb +1 -0
- data/ext/cumo/cuda/memory_pool.cpp +9 -1
- data/ext/cumo/cuda/memory_pool_impl.cpp +2 -13
- data/ext/cumo/cumo.c +4 -4
- data/ext/cumo/depend.erb +1 -1
- data/ext/cumo/extconf.rb +2 -0
- data/ext/cumo/include/cumo.h +4 -4
- data/ext/cumo/include/cumo/indexer.h +50 -0
- data/ext/cumo/include/cumo/intern.h +1 -0
- data/ext/cumo/include/cumo/narray.h +20 -1
- data/ext/cumo/include/cumo/narray_kernel.h +10 -0
- data/ext/cumo/include/cumo/ndloop.h +1 -1
- data/ext/cumo/narray/array.c +8 -2
- data/ext/cumo/narray/gen/tmpl/store_array.c +15 -3
- data/ext/cumo/narray/gen/tmpl_bit/store_array.c +10 -2
- data/ext/cumo/narray/index.c +77 -43
- data/ext/cumo/narray/narray.c +11 -2
- data/ext/cumo/narray/ndloop.c +49 -1
- data/ext/cumo/narray/ndloop_kernel.cu +97 -0
- data/ext/cumo/narray/step.c +56 -250
- data/lib/cumo/narray/extra.rb +50 -1
- metadata +4 -4
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 1b28beaea182d622d304bcb3153e56aa3280993ec079aea44c00b915d1e92b77
         | 
| 4 | 
            +
              data.tar.gz: 26fc0e1942a444e5f9cb4641b3e36f9985593a10de26188f0a4142e72314d82a
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: a678cb7965fbbc9febf6b5f2f557f8be34f28c051fc0437a87506d3a067a34778a73b75dbeb56da14fd538062a8454355efd06bb686056db5b4df7cab9c04e86
         | 
| 7 | 
            +
              data.tar.gz: 30ce98cae4e84ee7e9e73eae3ad76bcaca1e636462301d1afe1aa50e1f50633ed1b16756b90aaeba1a3e0870179d7e2dbee41696b175f0a454efee93e5f89591
         | 
    
        data/CHANGELOG.md
    CHANGED
    
    
    
        data/README.md
    CHANGED
    
    | @@ -1,7 +1,6 @@ | |
| 1 1 | 
             
            # Cumo
         | 
| 2 2 |  | 
| 3 | 
            -
            Cumo (pronounced  | 
| 4 | 
            -
            This library provides the benefit of speedup using GPU by replacing Numo with only a small piece of codes.
         | 
| 3 | 
            +
            Cumo (pronounced "koomo") is a CUDA-aware, GPU-optimized numerical library that offers a significant performance boost over [Ruby Numo](https://github.com/ruby-numo), while (mostly) maintaining drop-in compatibility.
         | 
| 5 4 |  | 
| 6 5 | 
             
            <img src="https://raw.githubusercontent.com/sonots/cumo-logo/master/logo_transparent.png" alt="cumo logo" title="cumo logo" width="50%">
         | 
| 7 6 |  | 
| @@ -13,7 +12,7 @@ This library provides the benefit of speedup using GPU by replacing Numo with on | |
| 13 12 |  | 
| 14 13 | 
             
            ## Preparation
         | 
| 15 14 |  | 
| 16 | 
            -
            Install CUDA and  | 
| 15 | 
            +
            Install CUDA and set your environment variables as follows:
         | 
| 17 16 |  | 
| 18 17 | 
             
            ```bash
         | 
| 19 18 | 
             
            export CUDA_PATH="/usr/local/cuda"
         | 
| @@ -25,7 +24,7 @@ export LIBRARY_PATH="$CUDA_PATH/lib64:$CUDA_PATH/lib:$LIBRARY_PATH" | |
| 25 24 |  | 
| 26 25 | 
             
            ## Installation
         | 
| 27 26 |  | 
| 28 | 
            -
            Add  | 
| 27 | 
            +
            Add the following line to your Gemfile:
         | 
| 29 28 |  | 
| 30 29 | 
             
            ```ruby
         | 
| 31 30 | 
             
            gem 'cumo'
         | 
| @@ -63,15 +62,15 @@ An example: | |
| 63 62 | 
             
            => 15
         | 
| 64 63 | 
             
            ```
         | 
| 65 64 |  | 
| 66 | 
            -
            ###  | 
| 65 | 
            +
            ### Switching from Numo to Cumo
         | 
| 67 66 |  | 
| 68 | 
            -
             | 
| 67 | 
            +
            The following find-and-replace should just work:
         | 
| 69 68 |  | 
| 70 69 | 
             
            ```
         | 
| 71 70 | 
             
            find . -type f | xargs sed -i -e 's/Numo/Cumo/g' -e 's/numo/cumo/g'
         | 
| 72 71 | 
             
            ```
         | 
| 73 72 |  | 
| 74 | 
            -
            If you want to switch Numo and Cumo | 
| 73 | 
            +
            If you want to dynamically switch between Numo and Cumo, something like the following will work:
         | 
| 75 74 |  | 
| 76 75 | 
             
            ```ruby
         | 
| 77 76 | 
             
            if gpu
         | 
| @@ -87,17 +86,17 @@ a = xm::DFloat.new(3,5).seq | |
| 87 86 |  | 
| 88 87 | 
             
            ### Incompatibility With Numo
         | 
| 89 88 |  | 
| 90 | 
            -
             | 
| 89 | 
            +
            The following methods behave incompatibly with Numo by default for performance reasons:
         | 
| 91 90 |  | 
| 92 91 | 
             
            * `extract`
         | 
| 93 92 | 
             
            * `[]`
         | 
| 94 93 | 
             
            * `count_true`
         | 
| 95 94 | 
             
            * `count_false`
         | 
| 96 95 |  | 
| 97 | 
            -
            Numo returns a Ruby numeric object for 0-dimensional NArray,  | 
| 98 | 
            -
             | 
| 96 | 
            +
            Numo returns a Ruby numeric object for 0-dimensional NArray, while Cumo returns the 0-dimensional NArray instead of a Ruby numeric object.
         | 
| 97 | 
            +
            Cumo differs in this way to avoid synchronization and minimize CPU ⇄ GPU data transfer.
         | 
| 99 98 |  | 
| 100 | 
            -
             | 
| 99 | 
            +
            Set the `CUMO_COMPATIBLE_MODE` environment variable to `ON` to force Numo NArray compatibility (for worse performance).
         | 
| 101 100 |  | 
| 102 101 | 
             
            You may enable or disable `compatible_mode` as:
         | 
| 103 102 |  | 
| @@ -109,7 +108,7 @@ Cumo.disable_compatible_mode # disable | |
| 109 108 | 
             
            Cumo.compatible_mode_enabled? #=> false
         | 
| 110 109 | 
             
            ```
         | 
| 111 110 |  | 
| 112 | 
            -
            You can also use following methods which  | 
| 111 | 
            +
            You can also use the following methods which behave like Numo's NArray methods. The behavior of these methods does not depend on `compatible_mode`.
         | 
| 113 112 |  | 
| 114 113 | 
             
            * `extract_cpu`
         | 
| 115 114 | 
             
            * `aref_cpu(*idx)`
         | 
| @@ -118,7 +117,7 @@ You can also use following methods which behaves as Numo NArray's methods. Behav | |
| 118 117 |  | 
| 119 118 | 
             
            ### Select a GPU device ID
         | 
| 120 119 |  | 
| 121 | 
            -
            Set `CUDA_VISIBLE_DEVICES=id` environment variable, or
         | 
| 120 | 
            +
            Set the `CUDA_VISIBLE_DEVICES=id` environment variable, or
         | 
| 122 121 |  | 
| 123 122 | 
             
            ```
         | 
| 124 123 | 
             
            require 'cumo'
         | 
| @@ -129,7 +128,7 @@ where `id` is an integer. | |
| 129 128 |  | 
| 130 129 | 
             
            ### Disable GPU Memory Pool
         | 
| 131 130 |  | 
| 132 | 
            -
            GPU memory pool is enabled  | 
| 131 | 
            +
            GPU memory pool is enabled by default. To disable it, set `CUMO_MEMORY_POOL=OFF`, or:
         | 
| 133 132 |  | 
| 134 133 | 
             
            ```
         | 
| 135 134 | 
             
            require 'cumo'
         | 
| @@ -138,11 +137,11 @@ Cumo::CUDA::MemoryPool.disable | |
| 138 137 |  | 
| 139 138 | 
             
            ## Documentation
         | 
| 140 139 |  | 
| 141 | 
            -
            See https://github.com/ruby-numo/numo-narray#documentation  | 
| 140 | 
            +
            See https://github.com/ruby-numo/numo-narray#documentation, replacing Numo with Cumo.
         | 
| 142 141 |  | 
| 143 142 | 
             
            ## Contributions
         | 
| 144 143 |  | 
| 145 | 
            -
            This project is  | 
| 144 | 
            +
            This project is under active development. See [issues](https://github.com/sonots/cumo/issues) for future work.
         | 
| 146 145 |  | 
| 147 146 | 
             
            ## Development
         | 
| 148 147 |  | 
| @@ -170,12 +169,12 @@ Generate docs: | |
| 170 169 | 
             
            bundle exec rake docs
         | 
| 171 170 | 
             
            ```
         | 
| 172 171 |  | 
| 173 | 
            -
            ## Advanced Tips | 
| 172 | 
            +
            ## Advanced Development Tips
         | 
| 174 173 |  | 
| 175 174 | 
             
            ### ccache
         | 
| 176 175 |  | 
| 177 176 | 
             
            [ccache](https://ccache.samba.org/) is useful for speeding up compilation.
         | 
| 178 | 
            -
            Install ccache and  | 
| 177 | 
            +
            Install ccache and configure with:
         | 
| 179 178 |  | 
| 180 179 |  | 
| 181 180 | 
             
            ```bash
         | 
| @@ -187,7 +186,7 @@ ln -sf "$HOME/opt/ccache/bin/ccache" "$HOME/opt/ccache/bin/nvcc" | |
| 187 186 |  | 
| 188 187 | 
             
            ### Build in parallel
         | 
| 189 188 |  | 
| 190 | 
            -
             | 
| 189 | 
            +
            Set `MAKEFLAGS` to specify `make` command options. You can build in parallel as:
         | 
| 191 190 |  | 
| 192 191 | 
             
            ```
         | 
| 193 192 | 
             
            bundle exec env MAKEFLAGS=-j8 rake compile
         | 
| @@ -199,11 +198,11 @@ bundle exec env MAKEFLAG=-j8 rake compile | |
| 199 198 | 
             
            bundle exec env CUMO_NVCC_GENERATE_CODE=arch=compute_60,code=sm_60 rake compile
         | 
| 200 199 | 
             
            ```
         | 
| 201 200 |  | 
| 202 | 
            -
            This is useful even on development because it makes possible to skip JIT compilation of PTX to cubin  | 
| 201 | 
            +
            This is useful even during development because it makes it possible to skip JIT compilation of PTX to cubin at runtime.
         | 
| 203 202 |  | 
| 204 203 | 
             
            ### Run tests with gdb
         | 
| 205 204 |  | 
| 206 | 
            -
            Compile with  | 
| 205 | 
            +
            Compile with debugging enabled:
         | 
| 207 206 |  | 
| 208 207 | 
             
            ```
         | 
| 209 208 | 
             
            bundle exec DEBUG=1 rake compile
         | 
| @@ -242,7 +241,7 @@ bundle exec DTYPE=dfloat ruby test/narray_test.rb | |
| 242 241 | 
             
            bundle exec CUDA_LAUNCH_BLOCKING=1
         | 
| 243 242 | 
             
            ```
         | 
| 244 243 |  | 
| 245 | 
            -
            ### Show GPU  | 
| 244 | 
            +
            ### Show GPU synchronization warnings
         | 
| 246 245 |  | 
| 247 246 | 
             
            Cumo shows warnings when CPU and GPU synchronization occurs if you set:
         | 
| 248 247 |  | 
| @@ -250,8 +249,8 @@ Cumo shows warnings if CPU and GPU synchronization occurs if: | |
| 250 249 | 
             
            export CUMO_SHOW_WARNING=ON
         | 
| 251 250 | 
             
            ```
         | 
| 252 251 |  | 
| 253 | 
            -
             | 
| 254 | 
            -
             | 
| 252 | 
            +
            By default, Cumo shows warnings that occurred at the same place only once.
         | 
| 253 | 
            +
            To show every occurrence of a warning, set:
         | 
| 255 254 |  | 
| 256 255 | 
             
            ```
         | 
| 257 256 | 
             
            export CUMO_SHOW_WARNING=ON
         | 
    
        data/bench/cumo_bench.rb
    CHANGED
    
    
| @@ -29,7 +29,15 @@ cumo_cuda_runtime_malloc(size_t size) | |
| 29 29 | 
             
                    } catch (const cumo::internal::CUDARuntimeError& e) {
         | 
| 30 30 | 
             
                        cumo_cuda_runtime_check_status(e.status());
         | 
| 31 31 | 
             
                    } catch (const cumo::internal::OutOfMemoryError& e) {
         | 
| 32 | 
            -
                         | 
| 32 | 
            +
                        // retry after GC
         | 
| 33 | 
            +
                        rb_funcall(rb_define_module("GC"), rb_intern("start"), 0);
         | 
| 34 | 
            +
                        try {
         | 
| 35 | 
            +
                            return reinterpret_cast<char*>(pool.Malloc(size));
         | 
| 36 | 
            +
                        } catch (const cumo::internal::CUDARuntimeError& e) {
         | 
| 37 | 
            +
                            cumo_cuda_runtime_check_status(e.status());
         | 
| 38 | 
            +
                        } catch (const cumo::internal::OutOfMemoryError& e) {
         | 
| 39 | 
            +
                            rb_raise(cumo_cuda_eOutOfMemoryError, "%s", e.what());
         | 
| 40 | 
            +
                        }
         | 
| 33 41 | 
             
                    }
         | 
| 34 42 | 
             
                } else {
         | 
| 35 43 | 
             
                    void *ptr = 0;
         | 
| @@ -139,6 +139,8 @@ intptr_t SingleDeviceMemoryPool::Malloc(size_t size, cudaStream_t stream_ptr) { | |
| 139 139 | 
             
                        if (e.status() != cudaErrorMemoryAllocation) {
         | 
| 140 140 | 
             
                            throw;
         | 
| 141 141 | 
             
                        }
         | 
| 142 | 
            +
                        // Retry after freeing all free blocks.
         | 
| 143 | 
            +
                        // NOTE: Another retry after GC is done at cumo_cuda_runtime_malloc.
         | 
| 142 144 | 
             
                        FreeAllBlocks();
         | 
| 143 145 | 
             
                        try {
         | 
| 144 146 | 
             
                            mem = std::make_shared<Memory>(size);
         | 
| @@ -146,21 +148,8 @@ intptr_t SingleDeviceMemoryPool::Malloc(size_t size, cudaStream_t stream_ptr) { | |
| 146 148 | 
             
                            if (e.status() != cudaErrorMemoryAllocation) {
         | 
| 147 149 | 
             
                                throw;
         | 
| 148 150 | 
             
                            }
         | 
| 149 | 
            -
            #ifdef NO_RUBY // cpp test does not bind with libruby
         | 
| 150 151 | 
             
                            size_t total = size + GetTotalBytes();
         | 
| 151 152 | 
             
                            throw OutOfMemoryError(size, total);
         | 
| 152 | 
            -
            #else
         | 
| 153 | 
            -
                            rb_funcall(rb_define_module("GC"), rb_intern("start"), 0);
         | 
| 154 | 
            -
                            try {
         | 
| 155 | 
            -
                                mem = std::make_shared<Memory>(size);
         | 
| 156 | 
            -
                            } catch (const CUDARuntimeError& e) {
         | 
| 157 | 
            -
                                if (e.status() != cudaErrorMemoryAllocation) {
         | 
| 158 | 
            -
                                    throw;
         | 
| 159 | 
            -
                                }
         | 
| 160 | 
            -
                                size_t total = size + GetTotalBytes();
         | 
| 161 | 
            -
                                throw OutOfMemoryError(size, total);
         | 
| 162 | 
            -
                            }
         | 
| 163 | 
            -
            #endif
         | 
| 164 153 | 
             
                        }
         | 
| 165 154 | 
             
                    }
         | 
| 166 155 | 
             
                    chunk = std::make_shared<Chunk>(mem, 0, size, stream_ptr);
         | 
    
        data/ext/cumo/cumo.c
    CHANGED
    
    | @@ -54,11 +54,11 @@ bool cumo_show_warning_enabled_p() | |
| 54 54 | 
             
                return cumo_show_warning_enabled;
         | 
| 55 55 | 
             
            }
         | 
| 56 56 |  | 
| 57 | 
            -
            static bool  | 
| 57 | 
            +
            static bool cumo_show_warning_once_enabled;
         | 
| 58 58 |  | 
| 59 | 
            -
            bool  | 
| 59 | 
            +
            bool cumo_show_warning_once_enabled_p()
         | 
| 60 60 | 
             
            {
         | 
| 61 | 
            -
                return  | 
| 61 | 
            +
                return cumo_show_warning_once_enabled;
         | 
| 62 62 | 
             
            }
         | 
| 63 63 |  | 
| 64 64 | 
             
            /*
         | 
| @@ -130,7 +130,7 @@ Init_cumo() | |
| 130 130 |  | 
| 131 131 | 
             
                // default is true
         | 
| 132 132 | 
             
                env = getenv("CUMO_SHOW_WARNING_ONCE");
         | 
| 133 | 
            -
                 | 
| 133 | 
            +
                cumo_show_warning_once_enabled = env == NULL || (strcmp(env, "OFF") != 0 && strcmp(env, "0") != 0 && strcmp(env, "NO") != 0);
         | 
| 134 134 |  | 
| 135 135 | 
             
                Init_cumo_narray();
         | 
| 136 136 |  | 
    
        data/ext/cumo/depend.erb
    CHANGED
    
    | @@ -53,6 +53,6 @@ run-ctest : cuda/memory_pool_impl_test.exe | |
| 53 53 | 
             
            	./$<
         | 
| 54 54 |  | 
| 55 55 | 
             
            cuda/memory_pool_impl_test.exe: cuda/memory_pool_impl_test.cpp cuda/memory_pool_impl.cpp cuda/memory_pool_impl.hpp
         | 
| 56 | 
            -
            	nvcc - | 
| 56 | 
            +
            	nvcc -std=c++14 <%= ENV['DEBUG'] ? '-g -O0 --compiler-options -Wall' : '' %> -L. -L$(libdir) -I. $(INCFLAGS) -o $@ $< cuda/memory_pool_impl.cpp
         | 
| 57 57 |  | 
| 58 58 | 
             
            CLEANOBJS = *.o */*.o */*/*.o *.bak narray/types/*.c narray/types/*_kernel.cu *.exe */*.exe
         | 
    
        data/ext/cumo/extconf.rb
    CHANGED
    
    | @@ -68,6 +68,7 @@ narray/step | |
| 68 68 | 
             
            narray/index
         | 
| 69 69 | 
             
            narray/index_kernel
         | 
| 70 70 | 
             
            narray/ndloop
         | 
| 71 | 
            +
            narray/ndloop_kernel
         | 
| 71 72 | 
             
            narray/data
         | 
| 72 73 | 
             
            narray/data_kernel
         | 
| 73 74 | 
             
            narray/types/bit
         | 
| @@ -158,6 +159,7 @@ unless have_type("u_int64_t", stdint) | |
| 158 159 | 
             
              have_type("uint64_t", stdint)
         | 
| 159 160 | 
             
            end
         | 
| 160 161 | 
             
            have_func("exp10")
         | 
| 162 | 
            +
            have_func("rb_arithmetic_sequence_extract")
         | 
| 161 163 |  | 
| 162 164 | 
             
            have_var("rb_cComplex")
         | 
| 163 165 | 
             
            have_func("rb_thread_call_without_gvl")
         | 
    
        data/ext/cumo/include/cumo.h
    CHANGED
    
    | @@ -10,17 +10,17 @@ extern "C" { | |
| 10 10 | 
             
            #endif
         | 
| 11 11 | 
             
            #endif
         | 
| 12 12 |  | 
| 13 | 
            -
            #define CUMO_VERSION "0.2. | 
| 14 | 
            -
            #define CUMO_VERSION_CODE  | 
| 13 | 
            +
            #define CUMO_VERSION "0.2.5"
         | 
| 14 | 
            +
            #define CUMO_VERSION_CODE 25
         | 
| 15 15 |  | 
| 16 16 | 
             
            bool cumo_compatible_mode_enabled_p();
         | 
| 17 17 | 
             
            bool cumo_show_warning_enabled_p();
         | 
| 18 | 
            -
            bool  | 
| 18 | 
            +
            bool cumo_show_warning_once_enabled_p();
         | 
| 19 19 |  | 
| 20 20 | 
             
            #define CUMO_SHOW_WARNING_ONCE( c_str ) \
         | 
| 21 21 | 
             
                { \
         | 
| 22 22 | 
             
                    if (cumo_show_warning_enabled_p()) { \
         | 
| 23 | 
            -
                        if ( | 
| 23 | 
            +
                        if (cumo_show_warning_once_enabled_p()) { \
         | 
| 24 24 | 
             
                            static bool show_warning = true; \
         | 
| 25 25 | 
             
                            if (show_warning) { \
         | 
| 26 26 | 
             
                                fprintf(stderr, (c_str)); \
         | 
| @@ -30,6 +30,11 @@ typedef struct { | |
| 30 30 | 
             
                ssize_t step[CUMO_NA_MAX_DIMENSION]; // or strides
         | 
| 31 31 | 
             
            } cumo_na_iarray_t;
         | 
| 32 32 |  | 
| 33 | 
            +
            typedef struct {
         | 
| 34 | 
            +
                char* ptr;
         | 
| 35 | 
            +
                cumo_stridx_t stridx[CUMO_NA_MAX_DIMENSION];
         | 
| 36 | 
            +
            } cumo_na_iarray_stridx_t;
         | 
| 37 | 
            +
             | 
| 33 38 | 
             
            typedef struct {
         | 
| 34 39 | 
             
                cumo_na_iarray_t in;
         | 
| 35 40 | 
             
                cumo_na_iarray_t out;
         | 
| @@ -216,6 +221,51 @@ cumo_na_iarray_at_dim1(cumo_na_iarray_t* iarray, cumo_na_indexer_t* indexer) { | |
| 216 221 | 
             
                return iarray->ptr + iarray->step[0] * indexer->raw_index;
         | 
| 217 222 | 
             
            }
         | 
| 218 223 |  | 
| 224 | 
            +
            __host__ __device__
         | 
| 225 | 
            +
            static inline char*
         | 
| 226 | 
            +
            cumo_na_iarray_stridx_at_dim(cumo_na_iarray_stridx_t* iarray, cumo_na_indexer_t* indexer) {
         | 
| 227 | 
            +
                char* ptr = iarray->ptr;
         | 
| 228 | 
            +
                for (int idim = 0; idim < indexer->ndim; ++idim) {
         | 
| 229 | 
            +
                    if (CUMO_SDX_IS_INDEX(iarray->stridx[idim])) {
         | 
| 230 | 
            +
                        ptr += CUMO_SDX_GET_INDEX(iarray->stridx[idim])[indexer->index[idim]];
         | 
| 231 | 
            +
                    } else {
         | 
| 232 | 
            +
                        ptr += CUMO_SDX_GET_STRIDE(iarray->stridx[idim]) * indexer->index[idim];
         | 
| 233 | 
            +
                    }
         | 
| 234 | 
            +
                }
         | 
| 235 | 
            +
                return ptr;
         | 
| 236 | 
            +
            }
         | 
| 237 | 
            +
             | 
| 238 | 
            +
            // Let compiler optimize
         | 
| 239 | 
            +
            #define CUMO_NA_IARRAY_STRIDX_AT(NDIM) \
         | 
| 240 | 
            +
            __host__ __device__ \
         | 
| 241 | 
            +
            static inline char* \
         | 
| 242 | 
            +
            cumo_na_iarray_stridx_at_dim##NDIM(cumo_na_iarray_stridx_t* iarray, cumo_na_indexer_t* indexer) { \
         | 
| 243 | 
            +
                char* ptr = iarray->ptr; \
         | 
| 244 | 
            +
                for (int idim = 0; idim < NDIM; ++idim) { \
         | 
| 245 | 
            +
                    if (CUMO_SDX_IS_INDEX(iarray->stridx[idim])) { \
         | 
| 246 | 
            +
                        ptr += CUMO_SDX_GET_INDEX(iarray->stridx[idim])[indexer->index[idim]]; \
         | 
| 247 | 
            +
                    } else { \
         | 
| 248 | 
            +
                        ptr += CUMO_SDX_GET_STRIDE(iarray->stridx[idim]) * indexer->index[idim]; \
         | 
| 249 | 
            +
                    } \
         | 
| 250 | 
            +
                } \
         | 
| 251 | 
            +
                return ptr; \
         | 
| 252 | 
            +
            }
         | 
| 253 | 
            +
             | 
| 254 | 
            +
            CUMO_NA_IARRAY_STRIDX_AT(4)
         | 
| 255 | 
            +
            CUMO_NA_IARRAY_STRIDX_AT(3)
         | 
| 256 | 
            +
            CUMO_NA_IARRAY_STRIDX_AT(2)
         | 
| 257 | 
            +
            CUMO_NA_IARRAY_STRIDX_AT(0)
         | 
| 258 | 
            +
             | 
| 259 | 
            +
            __host__ __device__
         | 
| 260 | 
            +
            static inline char*
         | 
| 261 | 
            +
            cumo_na_iarray_stridx_at_dim1(cumo_na_iarray_stridx_t* iarray, cumo_na_indexer_t* indexer) {
         | 
| 262 | 
            +
                if (CUMO_SDX_IS_INDEX(iarray->stridx[0])) {
         | 
| 263 | 
            +
                    return iarray->ptr + CUMO_SDX_GET_INDEX(iarray->stridx[0])[indexer->raw_index];
         | 
| 264 | 
            +
                } else {
         | 
| 265 | 
            +
                    return iarray->ptr + CUMO_SDX_GET_STRIDE(iarray->stridx[0]) * indexer->raw_index;
         | 
| 266 | 
            +
                }
         | 
| 267 | 
            +
            }
         | 
| 268 | 
            +
             | 
| 219 269 | 
             
            #endif // #ifdef __CUDACC__
         | 
| 220 270 |  | 
| 221 271 | 
             
            #endif // CUMO_INDEXER_H
         | 
| @@ -69,6 +69,7 @@ bool cumo_na_test_reduce(VALUE reduce, int dim); | |
| 69 69 |  | 
| 70 70 | 
             
            void cumo_na_step_array_index(VALUE self, size_t ary_size, size_t *plen, ssize_t *pbeg, ssize_t *pstep);
         | 
| 71 71 | 
             
            void cumo_na_step_sequence(VALUE self, size_t *plen, double *pbeg, double *pstep);
         | 
| 72 | 
            +
            void cumo_na_parse_enumerator_step(VALUE enum_obj, VALUE *pstep);
         | 
| 72 73 |  | 
| 73 74 | 
             
            // used in aref, aset
         | 
| 74 75 | 
             
            int cumo_na_get_result_dimension(VALUE self, int argc, VALUE *argv, ssize_t stride, size_t *pos_idx);
         | 
| @@ -196,10 +196,12 @@ extern VALUE cumo_cUInt32; | |
| 196 196 | 
             
            extern VALUE cumo_cUInt16;
         | 
| 197 197 | 
             
            extern VALUE cumo_cUInt8;
         | 
| 198 198 | 
             
            extern VALUE cumo_cRObject;
         | 
| 199 | 
            -
            extern VALUE cumo_na_cStep;
         | 
| 200 199 | 
             
            #ifndef HAVE_RB_CCOMPLEX
         | 
| 201 200 | 
             
            extern VALUE rb_cComplex;
         | 
| 202 201 | 
             
            #endif
         | 
| 202 | 
            +
            #ifdef HAVE_RB_ARITHMETIC_SEQUENCE_EXTRACT
         | 
| 203 | 
            +
            extern VALUE rb_cArithSeq;
         | 
| 204 | 
            +
            #endif
         | 
| 203 205 |  | 
| 204 206 | 
             
            extern VALUE cumo_sym_reduce;
         | 
| 205 207 | 
             
            extern VALUE cumo_sym_option;
         | 
| @@ -265,6 +267,23 @@ typedef struct { | |
| 265 267 | 
             
                unsigned int element_stride;
         | 
| 266 268 | 
             
            } cumo_narray_type_info_t;
         | 
| 267 269 |  | 
| 270 | 
            +
            // from ruby/enumerator.c
         | 
| 271 | 
            +
            typedef struct {
         | 
| 272 | 
            +
                VALUE obj;
         | 
| 273 | 
            +
                ID    meth;
         | 
| 274 | 
            +
                VALUE args;
         | 
| 275 | 
            +
                // use only above in this source
         | 
| 276 | 
            +
                VALUE fib;
         | 
| 277 | 
            +
                VALUE dst;
         | 
| 278 | 
            +
                VALUE lookahead;
         | 
| 279 | 
            +
                VALUE feedvalue;
         | 
| 280 | 
            +
                VALUE stop_exc;
         | 
| 281 | 
            +
                VALUE size;
         | 
| 282 | 
            +
                // incompatible below depending on ruby version
         | 
| 283 | 
            +
                //VALUE procs;                      // ruby 2.4
         | 
| 284 | 
            +
                //rb_enumerator_size_func *size_fn; // ruby 2.1-2.4
         | 
| 285 | 
            +
                //VALUE (*size_fn)(ANYARGS);        // ruby 2.0
         | 
| 286 | 
            +
            } cumo_enumerator_t;
         | 
| 268 287 |  | 
| 269 288 | 
             
            static inline cumo_narray_t *
         | 
| 270 289 | 
             
            cumo_na_get_narray_t(VALUE obj)
         | 
| @@ -165,6 +165,16 @@ typedef unsigned int CUMO_BIT_DIGIT; | |
| 165 165 | 
             
            #define CUMO_BALL   (~(CUMO_BIT_DIGIT)0)
         | 
| 166 166 | 
             
            #define CUMO_SLB(n) (((n)==CUMO_NB)?~(CUMO_BIT_DIGIT)0:(~(~(CUMO_BIT_DIGIT)0<<(n))))
         | 
| 167 167 |  | 
| 168 | 
            +
            typedef union {
         | 
| 169 | 
            +
                ssize_t stride;
         | 
| 170 | 
            +
                size_t *index;
         | 
| 171 | 
            +
            } cumo_stridx_t;
         | 
| 172 | 
            +
             | 
| 173 | 
            +
            #define CUMO_SDX_IS_STRIDE(x) ((x).stride&0x1)
         | 
| 174 | 
            +
            #define CUMO_SDX_IS_INDEX(x)  (!CUMO_SDX_IS_STRIDE(x))
         | 
| 175 | 
            +
            #define CUMO_SDX_GET_STRIDE(x) ((x).stride>>1)
         | 
| 176 | 
            +
            #define CUMO_SDX_GET_INDEX(x)  ((x).index)
         | 
| 177 | 
            +
             | 
| 168 178 | 
             
            #include "cumo/indexer.h"
         | 
| 169 179 | 
             
            #include "cumo/intern_kernel.h"
         | 
| 170 180 |  |