cumo 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 981428686ca946222ba4e575d35d55cc1139ae9a057afe11bb19166d189ddd4d
- data.tar.gz: 88129353d1da98170baa4362b46032ffa49b236e7559f1c5e60a6bfa32192c7c
+ metadata.gz: 1b28beaea182d622d304bcb3153e56aa3280993ec079aea44c00b915d1e92b77
+ data.tar.gz: 26fc0e1942a444e5f9cb4641b3e36f9985593a10de26188f0a4142e72314d82a
  SHA512:
- metadata.gz: 7f2cfcf951e787490e427ad6688e8003b7a0d9132e5cf1a030b78727561ddbf603df367e71169f91e926b67660b40f0b1c66f26ab30c744e8afb680d9568bfa7
- data.tar.gz: 1c47cb15aa0676369c37cc8a77771e7b498b56fbcb7f8b6321db4c3a9c1a24e3cb8a1fdb37e9b72331441458458b943a36fe34bc2b01706998c9fa50d06413be
+ metadata.gz: a678cb7965fbbc9febf6b5f2f557f8be34f28c051fc0437a87506d3a067a34778a73b75dbeb56da14fd538062a8454355efd06bb686056db5b4df7cab9c04e86
+ data.tar.gz: 30ce98cae4e84ee7e9e73eae3ad76bcaca1e636462301d1afe1aa50e1f50633ed1b16756b90aaeba1a3e0870179d7e2dbee41696b175f0a454efee93e5f89591
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
+ # 0.2.5 (2019-03-04)
+
+ Enhancements:
+
+ * Support arithmetic sequence, which is available in ruby >= 2.6.0 (thanks to naitoh)
+
  # 0.2.4 (2018-11-21)

  Changes:
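The arithmetic-sequence enhancement above refers to Ruby 2.6's `Enumerator::ArithmeticSequence`, produced by `Range#step` and `Range#%`. A minimal illustration of the object itself, in plain Ruby with no GPU required (the Cumo indexing line is only a comment, since running it needs CUDA):

```ruby
# Ruby >= 2.6: Range#% (and Range#step) return an Enumerator::ArithmeticSequence.
seq = (0..9) % 3
puts seq.class   # Enumerator::ArithmeticSequence
puts seq.step    # 3
p seq.to_a       # [0, 3, 6, 9]

# With cumo 0.2.5 and a CUDA GPU, such a sequence can be used as an index,
# e.g. (hypothetical session, not run here):
#   Cumo::DFloat.new(10).seq[(0..9) % 3]
```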
data/README.md CHANGED
@@ -1,7 +1,6 @@
  # Cumo

- Cumo (pronounced like "koomo") is CUDA-aware numerical library whose interface is highly compatible with [Ruby Numo](https://github.com/ruby-numo).
- This library provides the benefit of speedup using GPU by replacing Numo with only a small piece of codes.
+ Cumo (pronounced "koomo") is a CUDA-aware, GPU-optimized numerical library that offers a significant performance boost over [Ruby Numo](https://github.com/ruby-numo), while (mostly) maintaining drop-in compatibility.

  <img src="https://raw.githubusercontent.com/sonots/cumo-logo/master/logo_transparent.png" alt="cumo logo" title="cumo logo" width="50%">

@@ -13,7 +12,7 @@ This library provides the benefit of speedup using GPU by replacing Numo with on

  ## Preparation

- Install CUDA and setup environment variables as follows:
+ Install CUDA and set your environment variables as follows:

  ```bash
  export CUDA_PATH="/usr/local/cuda"
@@ -25,7 +24,7 @@ export LIBRARY_PATH="$CUDA_PATH/lib64:$CUDA_PATH/lib:$LIBRARY_PATH"

  ## Installation

- Add a following line to your Gemfile:
+ Add the following line to your Gemfile:

  ```ruby
  gem 'cumo'
@@ -63,15 +62,15 @@ An example:
  => 15
  ```

- ### How to switch from Numo to Cumo
+ ### Switching from Numo to Cumo

- Basically, following command should make it work with Cumo.
+ The following find-and-replace should just work:

  ```
  find . -type f | xargs sed -i -e 's/Numo/Cumo/g' -e 's/numo/cumo/g'
  ```

- If you want to switch Numo and Cumo dynamically, following snippets should work:
+ If you want to dynamically switch between Numo and Cumo, something like the following will work:

  ```ruby
  if gpu
@@ -87,17 +86,17 @@ a = xm::DFloat.new(3,5).seq

  ### Incompatibility With Numo

- Following methods behave incompatibly with Numo as default for performance.
+ The following methods behave incompatibly with Numo by default for performance reasons:

  * `extract`
  * `[]`
  * `count_true`
  * `count_false`

- Numo returns a Ruby numeric object for 0-dimensional NArray, but Cumo returns the 0-dimensional NArray instead of a Ruby numeric object.
- This is to avoid synchnoziation between CPU and GPU for performance.
+ Numo returns a Ruby numeric object for a 0-dimensional NArray, while Cumo returns the 0-dimensional NArray itself.
+ Cumo differs in this way to avoid synchronization and minimize CPU-GPU data transfer.

- You may set `CUMO_COMPATIBLE_MODE=ON` environment variable to force Cumo NArray behave compatibly with Numo NArray.
+ Set the `CUMO_COMPATIBLE_MODE` environment variable to `ON` to force Numo NArray compatibility (at the cost of performance).

  You may enable or disable `compatible_mode` as:

@@ -109,7 +108,7 @@ Cumo.disable_compatible_mode # disable
  Cumo.compatible_mode_enabled? #=> false
  ```

- You can also use following methods which behaves as Numo NArray's methods. Behaviors of these methods do not depend on `compatible_mode`.
+ You can also use the following methods, which behave like Numo NArray's methods. The behavior of these methods does not depend on `compatible_mode`.

  * `extract_cpu`
  * `aref_cpu(*idx)`
@@ -118,7 +117,7 @@ You can also use following methods which behaves as Numo NArray's methods. Behav

  ### Select a GPU device ID

- Set `CUDA_VISIBLE_DEVICES=id` environment variable, or
+ Set the `CUDA_VISIBLE_DEVICES=id` environment variable, or

  ```
  require 'cumo'
@@ -129,7 +128,7 @@ where `id` is an integer.

  ### Disable GPU Memory Pool

- GPU memory pool is enabled as default. To disable, set `CUMO_MEMORY_POOL=OFF` environment variable , or
+ GPU memory pool is enabled by default. To disable it, set `CUMO_MEMORY_POOL=OFF`, or:

  ```
  require 'cumo'
@@ -138,11 +137,11 @@ Cumo::CUDA::MemoryPool.disable

  ## Documentation

- See https://github.com/ruby-numo/numo-narray#documentation and replace Numo to Cumo.
+ See https://github.com/ruby-numo/numo-narray#documentation, replacing Numo with Cumo.

  ## Contributions

- This project is still under development. See [issues](https://github.com/sonots/cumo/issues) for future works.
+ This project is under active development. See [issues](https://github.com/sonots/cumo/issues) for future work.

  ## Development

@@ -170,12 +169,12 @@ Generate docs:
  bundle exec rake docs
  ```

- ## Advanced Tips on Development
+ ## Advanced Development Tips

  ### ccache

  [ccache](https://ccache.samba.org/) is useful to speed up compilation.
- Install ccache and setup as:
+ Install ccache and configure it with:


  ```bash
@@ -187,7 +186,7 @@ ln -sf "$HOME/opt/ccache/bin/ccache" "$HOME/opt/ccache/bin/nvcc"

  ### Build in parallel

- Use `MAKEFLAGS` environment variable to specify `make` command options. You can build in parallel as:
+ Set the `MAKEFLAGS` environment variable to specify `make` command options. You can build in parallel as:

  ```
  bundle exec env MAKEFLAGS=-j8 rake compile
@@ -199,11 +198,11 @@ bundle exec env MAKEFLAGS=-j8 rake compile
  bundle exec env CUMO_NVCC_GENERATE_CODE=arch=compute_60,code=sm_60 rake compile
  ```

- This is useful even on development because it makes possible to skip JIT compilation of PTX to cubin occurring on runtime.
+ This is useful even in development because it makes it possible to skip the JIT compilation of PTX to cubin that otherwise happens at runtime.

  ### Run tests with gdb

- Compile with debug option:
+ Compile with debugging enabled:

  ```
  bundle exec DEBUG=1 rake compile
@@ -242,7 +241,7 @@ bundle exec DTYPE=dfloat ruby test/narray_test.rb
  bundle exec CUDA_LAUNCH_BLOCKING=1
  ```

- ### Show GPU synchnoziation warnings
+ ### Show GPU synchronization warnings

  Cumo shows warnings when CPU-GPU synchronization occurs if:

@@ -250,8 +249,8 @@ Cumo shows warnings when CPU-GPU synchronization occurs if:
  export CUMO_SHOW_WARNING=ON
  ```

- As default, it shows warnings occurred at the same place only once.
- You may want to show warnings everytime rather than once as:
+ By default, a warning raised at the same place is shown only once.
+ To show every occurrence instead, set:

  ```
  export CUMO_SHOW_WARNING=ON
@@ -3,6 +3,7 @@ require 'benchmark'

  NUM = (ARGV.first || 100).to_i

+ # warm up
  a = Cumo::Float32.new(10).seq(1)
  b = Cumo::Float32.new(10).seq(10,10)
  c = a + b
@@ -29,7 +29,15 @@ cumo_cuda_runtime_malloc(size_t size)
  } catch (const cumo::internal::CUDARuntimeError& e) {
  cumo_cuda_runtime_check_status(e.status());
  } catch (const cumo::internal::OutOfMemoryError& e) {
- rb_raise(cumo_cuda_eOutOfMemoryError, "%s", e.what());
+ // retry after GC
+ rb_funcall(rb_define_module("GC"), rb_intern("start"), 0);
+ try {
+ return reinterpret_cast<char*>(pool.Malloc(size));
+ } catch (const cumo::internal::CUDARuntimeError& e) {
+ cumo_cuda_runtime_check_status(e.status());
+ } catch (const cumo::internal::OutOfMemoryError& e) {
+ rb_raise(cumo_cuda_eOutOfMemoryError, "%s", e.what());
+ }
  }
  } else {
  void *ptr = 0;
@@ -139,6 +139,8 @@ intptr_t SingleDeviceMemoryPool::Malloc(size_t size, cudaStream_t stream_ptr) {
  if (e.status() != cudaErrorMemoryAllocation) {
  throw;
  }
+ // Retry after freeing all free blocks.
+ // NOTE: Another retry after GC is done at cumo_cuda_runtime_malloc.
  FreeAllBlocks();
  try {
  mem = std::make_shared<Memory>(size);
@@ -146,21 +148,8 @@ intptr_t SingleDeviceMemoryPool::Malloc(size_t size, cudaStream_t stream_ptr) {
  if (e.status() != cudaErrorMemoryAllocation) {
  throw;
  }
- #ifdef NO_RUBY // cpp test does not bind with libruby
  size_t total = size + GetTotalBytes();
  throw OutOfMemoryError(size, total);
- #else
- rb_funcall(rb_define_module("GC"), rb_intern("start"), 0);
- try {
- mem = std::make_shared<Memory>(size);
- } catch (const CUDARuntimeError& e) {
- if (e.status() != cudaErrorMemoryAllocation) {
- throw;
- }
- size_t total = size + GetTotalBytes();
- throw OutOfMemoryError(size, total);
- }
- #endif
  }
  }
  chunk = std::make_shared<Chunk>(mem, 0, size, stream_ptr);
@@ -54,11 +54,11 @@ bool cumo_show_warning_enabled_p()
  return cumo_show_warning_enabled;
  }

- static bool cumo_warning_once_enabled;
+ static bool cumo_show_warning_once_enabled;

- bool cumo_warning_once_enabled_p()
+ bool cumo_show_warning_once_enabled_p()
  {
- return cumo_warning_once_enabled;
+ return cumo_show_warning_once_enabled;
  }

  /*
@@ -130,7 +130,7 @@ Init_cumo()

  // default is true
  env = getenv("CUMO_SHOW_WARNING_ONCE");
- cumo_warning_once_enabled = env == NULL || (strcmp(env, "OFF") != 0 && strcmp(env, "0") != 0 && strcmp(env, "NO") != 0);
+ cumo_show_warning_once_enabled = env == NULL || (strcmp(env, "OFF") != 0 && strcmp(env, "0") != 0 && strcmp(env, "NO") != 0);

  Init_cumo_narray();

@@ -53,6 +53,6 @@ run-ctest : cuda/memory_pool_impl_test.exe
  ./$<

  cuda/memory_pool_impl_test.exe: cuda/memory_pool_impl_test.cpp cuda/memory_pool_impl.cpp cuda/memory_pool_impl.hpp
- nvcc -DNO_RUBY -std=c++14 <%= ENV['DEBUG'] ? '-g -O0 --compiler-options -Wall' : '' %> -L. -L$(libdir) -I. $(INCFLAGS) -o $@ $< cuda/memory_pool_impl.cpp
+ nvcc -std=c++14 <%= ENV['DEBUG'] ? '-g -O0 --compiler-options -Wall' : '' %> -L. -L$(libdir) -I. $(INCFLAGS) -o $@ $< cuda/memory_pool_impl.cpp

  CLEANOBJS = *.o */*.o */*/*.o *.bak narray/types/*.c narray/types/*_kernel.cu *.exe */*.exe
@@ -68,6 +68,7 @@ narray/step
  narray/index
  narray/index_kernel
  narray/ndloop
+ narray/ndloop_kernel
  narray/data
  narray/data_kernel
  narray/types/bit
@@ -158,6 +159,7 @@ unless have_type("u_int64_t", stdint)
  have_type("uint64_t", stdint)
  end
  have_func("exp10")
+ have_func("rb_arithmetic_sequence_extract")

  have_var("rb_cComplex")
  have_func("rb_thread_call_without_gvl")
@@ -10,17 +10,17 @@ extern "C" {
  #endif
  #endif

- #define CUMO_VERSION "0.2.4"
- #define CUMO_VERSION_CODE 24
+ #define CUMO_VERSION "0.2.5"
+ #define CUMO_VERSION_CODE 25

  bool cumo_compatible_mode_enabled_p();
  bool cumo_show_warning_enabled_p();
- bool cumo_warning_once_enabled_p();
+ bool cumo_show_warning_once_enabled_p();

  #define CUMO_SHOW_WARNING_ONCE( c_str ) \
  { \
  if (cumo_show_warning_enabled_p()) { \
- if (cumo_warning_once_enabled_p()) { \
+ if (cumo_show_warning_once_enabled_p()) { \
  static bool show_warning = true; \
  if (show_warning) { \
  fprintf(stderr, (c_str)); \
@@ -30,6 +30,11 @@ typedef struct {
  ssize_t step[CUMO_NA_MAX_DIMENSION]; // or strides
  } cumo_na_iarray_t;

+ typedef struct {
+ char* ptr;
+ cumo_stridx_t stridx[CUMO_NA_MAX_DIMENSION];
+ } cumo_na_iarray_stridx_t;
+
  typedef struct {
  cumo_na_iarray_t in;
  cumo_na_iarray_t out;
@@ -216,6 +221,51 @@ cumo_na_iarray_at_dim1(cumo_na_iarray_t* iarray, cumo_na_indexer_t* indexer) {
  return iarray->ptr + iarray->step[0] * indexer->raw_index;
  }

+ __host__ __device__
+ static inline char*
+ cumo_na_iarray_stridx_at_dim(cumo_na_iarray_stridx_t* iarray, cumo_na_indexer_t* indexer) {
+ char* ptr = iarray->ptr;
+ for (int idim = 0; idim < indexer->ndim; ++idim) {
+ if (CUMO_SDX_IS_INDEX(iarray->stridx[idim])) {
+ ptr += CUMO_SDX_GET_INDEX(iarray->stridx[idim])[indexer->index[idim]];
+ } else {
+ ptr += CUMO_SDX_GET_STRIDE(iarray->stridx[idim]) * indexer->index[idim];
+ }
+ }
+ return ptr;
+ }
+
+ // Let compiler optimize
+ #define CUMO_NA_IARRAY_STRIDX_AT(NDIM) \
+ __host__ __device__ \
+ static inline char* \
+ cumo_na_iarray_stridx_at_dim##NDIM(cumo_na_iarray_stridx_t* iarray, cumo_na_indexer_t* indexer) { \
+ char* ptr = iarray->ptr; \
+ for (int idim = 0; idim < NDIM; ++idim) { \
+ if (CUMO_SDX_IS_INDEX(iarray->stridx[idim])) { \
+ ptr += CUMO_SDX_GET_INDEX(iarray->stridx[idim])[indexer->index[idim]]; \
+ } else { \
+ ptr += CUMO_SDX_GET_STRIDE(iarray->stridx[idim]) * indexer->index[idim]; \
+ } \
+ } \
+ return ptr; \
+ }
+
+ CUMO_NA_IARRAY_STRIDX_AT(4)
+ CUMO_NA_IARRAY_STRIDX_AT(3)
+ CUMO_NA_IARRAY_STRIDX_AT(2)
+ CUMO_NA_IARRAY_STRIDX_AT(0)
+
+ __host__ __device__
+ static inline char*
+ cumo_na_iarray_stridx_at_dim1(cumo_na_iarray_stridx_t* iarray, cumo_na_indexer_t* indexer) {
+ if (CUMO_SDX_IS_INDEX(iarray->stridx[0])) {
+ return iarray->ptr + CUMO_SDX_GET_INDEX(iarray->stridx[0])[indexer->raw_index];
+ } else {
+ return iarray->ptr + CUMO_SDX_GET_STRIDE(iarray->stridx[0]) * indexer->raw_index;
+ }
+ }
+
  #endif // #ifdef __CUDACC__

  #endif // CUMO_INDEXER_H
@@ -69,6 +69,7 @@ bool cumo_na_test_reduce(VALUE reduce, int dim);

  void cumo_na_step_array_index(VALUE self, size_t ary_size, size_t *plen, ssize_t *pbeg, ssize_t *pstep);
  void cumo_na_step_sequence(VALUE self, size_t *plen, double *pbeg, double *pstep);
+ void cumo_na_parse_enumerator_step(VALUE enum_obj, VALUE *pstep);

  // used in aref, aset
  int cumo_na_get_result_dimension(VALUE self, int argc, VALUE *argv, ssize_t stride, size_t *pos_idx);
@@ -196,10 +196,12 @@ extern VALUE cumo_cUInt32;
  extern VALUE cumo_cUInt16;
  extern VALUE cumo_cUInt8;
  extern VALUE cumo_cRObject;
- extern VALUE cumo_na_cStep;
  #ifndef HAVE_RB_CCOMPLEX
  extern VALUE rb_cComplex;
  #endif
+ #ifdef HAVE_RB_ARITHMETIC_SEQUENCE_EXTRACT
+ extern VALUE rb_cArithSeq;
+ #endif

  extern VALUE cumo_sym_reduce;
  extern VALUE cumo_sym_option;
@@ -265,6 +267,23 @@ typedef struct {
  unsigned int element_stride;
  } cumo_narray_type_info_t;

+ // from ruby/enumerator.c
+ typedef struct {
+ VALUE obj;
+ ID meth;
+ VALUE args;
+ // only the fields above are used in this source
+ VALUE fib;
+ VALUE dst;
+ VALUE lookahead;
+ VALUE feedvalue;
+ VALUE stop_exc;
+ VALUE size;
+ // the fields below are incompatible across ruby versions
+ //VALUE procs; // ruby 2.4
+ //rb_enumerator_size_func *size_fn; // ruby 2.1-2.4
+ //VALUE (*size_fn)(ANYARGS); // ruby 2.0
+ } cumo_enumerator_t;

  static inline cumo_narray_t *
  cumo_na_get_narray_t(VALUE obj)
@@ -165,6 +165,16 @@ typedef unsigned int CUMO_BIT_DIGIT;
  #define CUMO_BALL (~(CUMO_BIT_DIGIT)0)
  #define CUMO_SLB(n) (((n)==CUMO_NB)?~(CUMO_BIT_DIGIT)0:(~(~(CUMO_BIT_DIGIT)0<<(n))))

+ typedef union {
+ ssize_t stride;
+ size_t *index;
+ } cumo_stridx_t;
+
+ #define CUMO_SDX_IS_STRIDE(x) ((x).stride&0x1)
+ #define CUMO_SDX_IS_INDEX(x) (!CUMO_SDX_IS_STRIDE(x))
+ #define CUMO_SDX_GET_STRIDE(x) ((x).stride>>1)
+ #define CUMO_SDX_GET_INDEX(x) ((x).index)
+
  #include "cumo/indexer.h"
  #include "cumo/intern_kernel.h"

@@ -2,7 +2,7 @@
  #define CUMO_NDLOOP_H

  typedef struct {
- ssize_t pos; // - required for each dimension.
+ ssize_t pos; // only iter[0].pos is used in cumo as an offset.
  ssize_t step;
  size_t *idx;
  } cumo_na_loop_iter_t;