RubyGems - cumo - Versions diffs - 0.1.0 - Mend

cumo 0.1.0

Files changed (266) hide show

checksums.yaml +7 -0
data/.gitignore +27 -0
data/.travis.yml +5 -0
data/3rd_party/mkmf-cu/.gitignore +36 -0
data/3rd_party/mkmf-cu/Gemfile +3 -0
data/3rd_party/mkmf-cu/LICENSE +21 -0
data/3rd_party/mkmf-cu/README.md +36 -0
data/3rd_party/mkmf-cu/Rakefile +11 -0
data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +4 -0
data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +32 -0
data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +80 -0
data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +157 -0
data/3rd_party/mkmf-cu/mkmf-cu.gemspec +16 -0
data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +67 -0
data/CODE_OF_CONDUCT.md +46 -0
data/Gemfile +8 -0
data/LICENSE.txt +82 -0
data/README.md +252 -0
data/Rakefile +43 -0
data/bench/broadcast_fp32.rb +138 -0
data/bench/cumo_bench.rb +193 -0
data/bench/numo_bench.rb +138 -0
data/bench/reduction_fp32.rb +117 -0
data/bin/console +14 -0
data/bin/setup +8 -0
data/cumo.gemspec +32 -0
data/ext/cumo/cuda/cublas.c +278 -0
data/ext/cumo/cuda/driver.c +421 -0
data/ext/cumo/cuda/memory_pool.cpp +185 -0
data/ext/cumo/cuda/memory_pool_impl.cpp +308 -0
data/ext/cumo/cuda/memory_pool_impl.hpp +370 -0
data/ext/cumo/cuda/memory_pool_impl_test.cpp +554 -0
data/ext/cumo/cuda/nvrtc.c +207 -0
data/ext/cumo/cuda/runtime.c +167 -0
data/ext/cumo/cumo.c +148 -0
data/ext/cumo/depend.erb +58 -0
data/ext/cumo/extconf.rb +179 -0
data/ext/cumo/include/cumo.h +25 -0
data/ext/cumo/include/cumo/compat.h +23 -0
data/ext/cumo/include/cumo/cuda/cublas.h +153 -0
data/ext/cumo/include/cumo/cuda/cumo_thrust.hpp +187 -0
data/ext/cumo/include/cumo/cuda/cumo_thrust_complex.hpp +79 -0
data/ext/cumo/include/cumo/cuda/driver.h +22 -0
data/ext/cumo/include/cumo/cuda/memory_pool.h +28 -0
data/ext/cumo/include/cumo/cuda/nvrtc.h +22 -0
data/ext/cumo/include/cumo/cuda/runtime.h +40 -0
data/ext/cumo/include/cumo/indexer.h +238 -0
data/ext/cumo/include/cumo/intern.h +142 -0
data/ext/cumo/include/cumo/intern_fwd.h +38 -0
data/ext/cumo/include/cumo/intern_kernel.h +6 -0
data/ext/cumo/include/cumo/narray.h +429 -0
data/ext/cumo/include/cumo/narray_kernel.h +149 -0
data/ext/cumo/include/cumo/ndloop.h +95 -0
data/ext/cumo/include/cumo/reduce_kernel.h +126 -0
data/ext/cumo/include/cumo/template.h +158 -0
data/ext/cumo/include/cumo/template_kernel.h +77 -0
data/ext/cumo/include/cumo/types/bit.h +40 -0
data/ext/cumo/include/cumo/types/bit_kernel.h +34 -0
data/ext/cumo/include/cumo/types/complex.h +402 -0
data/ext/cumo/include/cumo/types/complex_kernel.h +414 -0
data/ext/cumo/include/cumo/types/complex_macro.h +382 -0
data/ext/cumo/include/cumo/types/complex_macro_kernel.h +186 -0
data/ext/cumo/include/cumo/types/dcomplex.h +46 -0
data/ext/cumo/include/cumo/types/dcomplex_kernel.h +13 -0
data/ext/cumo/include/cumo/types/dfloat.h +47 -0
data/ext/cumo/include/cumo/types/dfloat_kernel.h +14 -0
data/ext/cumo/include/cumo/types/float_def.h +34 -0
data/ext/cumo/include/cumo/types/float_def_kernel.h +39 -0
data/ext/cumo/include/cumo/types/float_macro.h +191 -0
data/ext/cumo/include/cumo/types/float_macro_kernel.h +158 -0
data/ext/cumo/include/cumo/types/int16.h +24 -0
data/ext/cumo/include/cumo/types/int16_kernel.h +23 -0
data/ext/cumo/include/cumo/types/int32.h +24 -0
data/ext/cumo/include/cumo/types/int32_kernel.h +19 -0
data/ext/cumo/include/cumo/types/int64.h +24 -0
data/ext/cumo/include/cumo/types/int64_kernel.h +19 -0
data/ext/cumo/include/cumo/types/int8.h +24 -0
data/ext/cumo/include/cumo/types/int8_kernel.h +19 -0
data/ext/cumo/include/cumo/types/int_macro.h +67 -0
data/ext/cumo/include/cumo/types/int_macro_kernel.h +48 -0
data/ext/cumo/include/cumo/types/real_accum.h +486 -0
data/ext/cumo/include/cumo/types/real_accum_kernel.h +101 -0
data/ext/cumo/include/cumo/types/robj_macro.h +80 -0
data/ext/cumo/include/cumo/types/robj_macro_kernel.h +0 -0
data/ext/cumo/include/cumo/types/robject.h +27 -0
data/ext/cumo/include/cumo/types/robject_kernel.h +7 -0
data/ext/cumo/include/cumo/types/scomplex.h +46 -0
data/ext/cumo/include/cumo/types/scomplex_kernel.h +13 -0
data/ext/cumo/include/cumo/types/sfloat.h +48 -0
data/ext/cumo/include/cumo/types/sfloat_kernel.h +14 -0
data/ext/cumo/include/cumo/types/uint16.h +25 -0
data/ext/cumo/include/cumo/types/uint16_kernel.h +20 -0
data/ext/cumo/include/cumo/types/uint32.h +25 -0
data/ext/cumo/include/cumo/types/uint32_kernel.h +20 -0
data/ext/cumo/include/cumo/types/uint64.h +25 -0
data/ext/cumo/include/cumo/types/uint64_kernel.h +20 -0
data/ext/cumo/include/cumo/types/uint8.h +25 -0
data/ext/cumo/include/cumo/types/uint8_kernel.h +20 -0
data/ext/cumo/include/cumo/types/uint_macro.h +58 -0
data/ext/cumo/include/cumo/types/uint_macro_kernel.h +38 -0
data/ext/cumo/include/cumo/types/xint_macro.h +169 -0
data/ext/cumo/include/cumo/types/xint_macro_kernel.h +88 -0
data/ext/cumo/narray/SFMT-params.h +97 -0
data/ext/cumo/narray/SFMT-params19937.h +46 -0
data/ext/cumo/narray/SFMT.c +620 -0
data/ext/cumo/narray/SFMT.h +167 -0
data/ext/cumo/narray/array.c +638 -0
data/ext/cumo/narray/data.c +961 -0
data/ext/cumo/narray/gen/cogen.rb +56 -0
data/ext/cumo/narray/gen/cogen_kernel.rb +58 -0
data/ext/cumo/narray/gen/def/bit.rb +37 -0
data/ext/cumo/narray/gen/def/dcomplex.rb +39 -0
data/ext/cumo/narray/gen/def/dfloat.rb +37 -0
data/ext/cumo/narray/gen/def/int16.rb +36 -0
data/ext/cumo/narray/gen/def/int32.rb +36 -0
data/ext/cumo/narray/gen/def/int64.rb +36 -0
data/ext/cumo/narray/gen/def/int8.rb +36 -0
data/ext/cumo/narray/gen/def/robject.rb +37 -0
data/ext/cumo/narray/gen/def/scomplex.rb +39 -0
data/ext/cumo/narray/gen/def/sfloat.rb +37 -0
data/ext/cumo/narray/gen/def/uint16.rb +36 -0
data/ext/cumo/narray/gen/def/uint32.rb +36 -0
data/ext/cumo/narray/gen/def/uint64.rb +36 -0
data/ext/cumo/narray/gen/def/uint8.rb +36 -0
data/ext/cumo/narray/gen/erbpp2.rb +346 -0
data/ext/cumo/narray/gen/narray_def.rb +268 -0
data/ext/cumo/narray/gen/spec.rb +425 -0
data/ext/cumo/narray/gen/tmpl/accum.c +86 -0
data/ext/cumo/narray/gen/tmpl/accum_binary.c +121 -0
data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +61 -0
data/ext/cumo/narray/gen/tmpl/accum_index.c +119 -0
data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +66 -0
data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +12 -0
data/ext/cumo/narray/gen/tmpl/alloc_func.c +107 -0
data/ext/cumo/narray/gen/tmpl/allocate.c +37 -0
data/ext/cumo/narray/gen/tmpl/aref.c +66 -0
data/ext/cumo/narray/gen/tmpl/aref_cpu.c +50 -0
data/ext/cumo/narray/gen/tmpl/aset.c +56 -0
data/ext/cumo/narray/gen/tmpl/binary.c +162 -0
data/ext/cumo/narray/gen/tmpl/binary2.c +70 -0
data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +15 -0
data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +31 -0
data/ext/cumo/narray/gen/tmpl/binary_s.c +45 -0
data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +15 -0
data/ext/cumo/narray/gen/tmpl/bincount.c +181 -0
data/ext/cumo/narray/gen/tmpl/cast.c +44 -0
data/ext/cumo/narray/gen/tmpl/cast_array.c +13 -0
data/ext/cumo/narray/gen/tmpl/class.c +9 -0
data/ext/cumo/narray/gen/tmpl/class_kernel.cu +6 -0
data/ext/cumo/narray/gen/tmpl/clip.c +121 -0
data/ext/cumo/narray/gen/tmpl/coerce_cast.c +10 -0
data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +129 -0
data/ext/cumo/narray/gen/tmpl/cond_binary.c +68 -0
data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +18 -0
data/ext/cumo/narray/gen/tmpl/cond_unary.c +46 -0
data/ext/cumo/narray/gen/tmpl/cum.c +50 -0
data/ext/cumo/narray/gen/tmpl/each.c +47 -0
data/ext/cumo/narray/gen/tmpl/each_with_index.c +70 -0
data/ext/cumo/narray/gen/tmpl/ewcomp.c +79 -0
data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +19 -0
data/ext/cumo/narray/gen/tmpl/extract.c +22 -0
data/ext/cumo/narray/gen/tmpl/extract_cpu.c +26 -0
data/ext/cumo/narray/gen/tmpl/extract_data.c +53 -0
data/ext/cumo/narray/gen/tmpl/eye.c +105 -0
data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +19 -0
data/ext/cumo/narray/gen/tmpl/fill.c +52 -0
data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +29 -0
data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +106 -0
data/ext/cumo/narray/gen/tmpl/format.c +62 -0
data/ext/cumo/narray/gen/tmpl/format_to_a.c +49 -0
data/ext/cumo/narray/gen/tmpl/frexp.c +38 -0
data/ext/cumo/narray/gen/tmpl/gemm.c +203 -0
data/ext/cumo/narray/gen/tmpl/init_class.c +20 -0
data/ext/cumo/narray/gen/tmpl/init_module.c +12 -0
data/ext/cumo/narray/gen/tmpl/inspect.c +21 -0
data/ext/cumo/narray/gen/tmpl/lib.c +50 -0
data/ext/cumo/narray/gen/tmpl/lib_kernel.cu +24 -0
data/ext/cumo/narray/gen/tmpl/logseq.c +102 -0
data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +31 -0
data/ext/cumo/narray/gen/tmpl/map_with_index.c +98 -0
data/ext/cumo/narray/gen/tmpl/median.c +66 -0
data/ext/cumo/narray/gen/tmpl/minmax.c +47 -0
data/ext/cumo/narray/gen/tmpl/module.c +9 -0
data/ext/cumo/narray/gen/tmpl/module_kernel.cu +1 -0
data/ext/cumo/narray/gen/tmpl/new_dim0.c +15 -0
data/ext/cumo/narray/gen/tmpl/new_dim0_kernel.cu +8 -0
data/ext/cumo/narray/gen/tmpl/poly.c +50 -0
data/ext/cumo/narray/gen/tmpl/pow.c +97 -0
data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +29 -0
data/ext/cumo/narray/gen/tmpl/powint.c +17 -0
data/ext/cumo/narray/gen/tmpl/qsort.c +212 -0
data/ext/cumo/narray/gen/tmpl/rand.c +168 -0
data/ext/cumo/narray/gen/tmpl/rand_norm.c +121 -0
data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +75 -0
data/ext/cumo/narray/gen/tmpl/seq.c +112 -0
data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +43 -0
data/ext/cumo/narray/gen/tmpl/set2.c +57 -0
data/ext/cumo/narray/gen/tmpl/sort.c +48 -0
data/ext/cumo/narray/gen/tmpl/sort_index.c +111 -0
data/ext/cumo/narray/gen/tmpl/store.c +41 -0
data/ext/cumo/narray/gen/tmpl/store_array.c +187 -0
data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +58 -0
data/ext/cumo/narray/gen/tmpl/store_bit.c +86 -0
data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +66 -0
data/ext/cumo/narray/gen/tmpl/store_from.c +81 -0
data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +58 -0
data/ext/cumo/narray/gen/tmpl/store_kernel.cu +3 -0
data/ext/cumo/narray/gen/tmpl/store_numeric.c +9 -0
data/ext/cumo/narray/gen/tmpl/to_a.c +43 -0
data/ext/cumo/narray/gen/tmpl/unary.c +132 -0
data/ext/cumo/narray/gen/tmpl/unary2.c +60 -0
data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +72 -0
data/ext/cumo/narray/gen/tmpl/unary_ret2.c +34 -0
data/ext/cumo/narray/gen/tmpl/unary_s.c +86 -0
data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +58 -0
data/ext/cumo/narray/gen/tmpl_bit/allocate.c +24 -0
data/ext/cumo/narray/gen/tmpl_bit/aref.c +54 -0
data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +57 -0
data/ext/cumo/narray/gen/tmpl_bit/aset.c +56 -0
data/ext/cumo/narray/gen/tmpl_bit/binary.c +98 -0
data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +64 -0
data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +88 -0
data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +76 -0
data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +133 -0
data/ext/cumo/narray/gen/tmpl_bit/each.c +48 -0
data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +70 -0
data/ext/cumo/narray/gen/tmpl_bit/extract.c +30 -0
data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +29 -0
data/ext/cumo/narray/gen/tmpl_bit/fill.c +69 -0
data/ext/cumo/narray/gen/tmpl_bit/format.c +64 -0
data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +51 -0
data/ext/cumo/narray/gen/tmpl_bit/inspect.c +21 -0
data/ext/cumo/narray/gen/tmpl_bit/mask.c +136 -0
data/ext/cumo/narray/gen/tmpl_bit/none_p.c +14 -0
data/ext/cumo/narray/gen/tmpl_bit/store_array.c +108 -0
data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +70 -0
data/ext/cumo/narray/gen/tmpl_bit/store_from.c +60 -0
data/ext/cumo/narray/gen/tmpl_bit/to_a.c +47 -0
data/ext/cumo/narray/gen/tmpl_bit/unary.c +81 -0
data/ext/cumo/narray/gen/tmpl_bit/where.c +90 -0
data/ext/cumo/narray/gen/tmpl_bit/where2.c +95 -0
data/ext/cumo/narray/index.c +880 -0
data/ext/cumo/narray/kwargs.c +153 -0
data/ext/cumo/narray/math.c +142 -0
data/ext/cumo/narray/narray.c +1948 -0
data/ext/cumo/narray/ndloop.c +2105 -0
data/ext/cumo/narray/rand.c +45 -0
data/ext/cumo/narray/step.c +474 -0
data/ext/cumo/narray/struct.c +886 -0
data/lib/cumo.rb +3 -0
data/lib/cumo/cuda.rb +11 -0
data/lib/cumo/cuda/compile_error.rb +36 -0
data/lib/cumo/cuda/compiler.rb +161 -0
data/lib/cumo/cuda/device.rb +47 -0
data/lib/cumo/cuda/link_state.rb +31 -0
data/lib/cumo/cuda/module.rb +40 -0
data/lib/cumo/cuda/nvrtc_program.rb +27 -0
data/lib/cumo/linalg.rb +12 -0
data/lib/cumo/narray.rb +2 -0
data/lib/cumo/narray/extra.rb +1278 -0
data/lib/erbpp.rb +294 -0
data/lib/erbpp/line_number.rb +137 -0
data/lib/erbpp/narray_def.rb +381 -0
data/numo-narray-version +1 -0
data/run.gdb +7 -0
metadata +353 -0

data/lib/cumo.rb ADDED

@@ -0,0 +1,3 @@
+require_relative File.join(__dir__, '../ext/cumo/cumo')
+require_relative 'cumo/cuda'
+require_relative 'cumo/narray/extra'

data/lib/cumo/cuda.rb ADDED

@@ -0,0 +1,11 @@
+module Cumo
+  module CUDA
+  end
+end
+require_relative 'cuda/compile_error'
+require_relative 'cuda/compiler'
+require_relative 'cuda/device'
+require_relative 'cuda/module'
+require_relative 'cuda/link_state'
+require_relative 'cuda/nvrtc_program'

data/lib/cumo/cuda/compile_error.rb ADDED

@@ -0,0 +1,36 @@
+require_relative '../cuda'
+module Cumo::CUDA
+  class CompileError < StandardError
+    def initialize(msg, source, name, options)
+      @msg = msg
+      @source = source
+      @name = name
+      @options = options
+    end
+    def message
+      @msg
+    end
+    def to_s
+      @msg
+    end
+    def dump(io)
+      lines = @source.split("\n")
+      digits = Math.log10(lines.size).floor + 1
+      linum_fmt = "%0#{digits}d "
+      io.puts("NVRTC compilation error: #{@msg}")
+      io.puts("-----")
+      io.puts("Name: #{@name}")
+      io.puts("Options: #{@options.join(' ')}")
+      io.puts("CUDA source:")
+      lines.each.with_index do |line, i|
+        io.puts(linum_fmt.sprintf(i + 1) << line.rstrip)
+      end
+      io.puts("-----")
+      io.flush
+    end
+  end
+end

data/lib/cumo/cuda/compiler.rb ADDED

@@ -0,0 +1,161 @@
+require 'tmpdir'
+require 'tempfile'
+require 'fileutils'
+require 'digest/md5'
+require_relative '../cuda'
+module Cumo::CUDA
+  class Compiler
+    VALID_KERNEL_NAME = /\A[a-zA-Z_][a-zA-Z_0-9]*\z/
+    DEFAULT_CACHE_DIR = File.expand_path('~/.cumo/kernel_cache')
+    @@empty_file_preprocess_cache ||= {}
+    def self.valid_kernel_name?(name)
+      VALID_KERNEL_NAME.match?(name)
+    end
+    def compile_using_nvrtc(source, options: [], arch: nil)
+      arch ||= get_arch
+      options += ["-arch=#{arch}"]
+      Dir.mktmpdir do |root_dir|
+        path = File.join(root_dir, 'kern')
+        cu_path = "#{path}.cu"
+        File.open(cu_path, 'w') do |cu_file|
+          cu_file.write(source)
+        end
+        prog = NVRTCProgram.new(source, name: cu_path)
+        begin
+          ptx = prog.compile(options: options)
+        rescue CompileError => e
+          if get_bool_env_variable('CUMO_DUMP_CUDA_SOURCE_ON_ERROR', false)
+            e.dump($stderr)
+          end
+          raise e
+        ensure
+          prog.destroy
+        end
+        return ptx
+      end
+    end
+    def compile_with_cache(source, options: [], arch: nil, cache_dir: nil, extra_source: nil)
+      # NVRTC does not use extra_source. extra_source is used for cache key.
+      cache_dir ||= get_cache_dir
+      arch ||= get_arch
+      options += ['-ftz=true']
+      env = [arch, options, get_nvrtc_version]
+      base = @@empty_file_preprocess_cache[env]
+      if base.nil?
+        # This is checking of NVRTC compiler internal version
+        base = preprocess('', options, arch)
+        @@empty_file_preprocess_cache[env] = base
+      end
+      key_src = "#{env} #{base} #{source} #{extra_source}"
+      key_src.encode!('utf-8')
+      digest = Digest::MD5.hexdigest(key_src)
+      name = "#{digest}_2.cubin"
+      unless Dir.exist?(cache_dir)
+        FileUtils.mkdir_p(cache_dir)
+      end
+      # TODO(sonots): thread-safe?
+      path = File.join(cache_dir, name)
+      cubin = load_cache(path)
+      if cubin
+        mod = Module.new
+        mod.load(cubin)
+        return mod
+      end
+      ptx = compile_using_nvrtc(source, options: options, arch: arch)
+      cubin = nil
+      cubin_hash = nil
+      LinkState.new do |ls|
+        ls.add_ptr_data(ptx, 'cumo.ptx')
+        cubin = ls.complete()
+        cubin_hash = Digest::MD5.hexdigest(cubin)
+      end
+      save_cache(path, cubin_hash, cubin)
+      # Save .cu source file along with .cubin
+      if get_bool_env_variable('CUMO_CACHE_SAVE_CUDA_SOURCE', false)
+        File.open("#{path}.cu", 'w') do |f|
+          f.write(source)
+        end
+      end
+      mod = Module.new
+      mod.load(cubin)
+      return mod
+    end
+    private
+    def save_cache(path, cubin_hash, cubin)
+      tf = Tempfile.create
+      tf.write(cubin_hash)
+      tf.write(cubin)
+      temp_path = tf.path
+      File.rename(temp_path, path)
+    end
+    def load_cache(path)
+      return nil unless File.exist?(path)
+      File.open(path, 'rb') do |file|
+        data = file.read
+        return nil unless data.size >= 32
+        hash = data[0...32]
+        cubin = data[32..-1]
+        cubin_hash = Digest::MD5.hexdigest(cubin)
+        return nil unless hash == cubin_hash
+        return cubin
+      end
+      nil
+    end
+    def get_cache_dir
+      ENV.fetch('CUMO_CACHE_DIR', DEFAULT_CACHE_DIR)
+    end
+    def get_nvrtc_version
+      @@nvrtc_version ||= NVRTC.nvrtcVersion
+    end
+    def get_arch
+      cc = Device.new.compute_capability
+      "compute_#{cc}"
+    end
+    def get_bool_env_variable(name, default)
+      val = ENV[name]
+      return default if val.nil? or val.size == 0
+      Integer(val) == 1 rescue false
+    end
+    def preprocess(source, options, arch)
+      options += ["-arch=#{arch}"]
+      prog = NVRTCProgram.new(source, name: '')
+      begin
+        result = prog.compile(options: options)
+        return result
+      rescue CompileError => e
+        if get_bool_env_variable('CUMO_DUMP_CUDA_SOURCE_ON_ERROR', false)
+          e.dump($stderr)
+        end
+        raise e
+      ensure
+        prog.destroy
+      end
+    end
+  end
+end

data/lib/cumo/cuda/device.rb ADDED

@@ -0,0 +1,47 @@
+require_relative '../cuda'
+module Cumo::CUDA
+  class Device
+    attr_reader :id
+    def self.get_currend_id
+      Runtime.cudaGetDevice
+    end
+    def initialize(device_id = nil)
+      if device_id
+        @id = device_id
+      else
+        @id = Runtime.cudaGetDevice
+      end
+      @_device_stack = []
+    end
+    def use
+      Runtime.cudaSetDevice(@id)
+    end
+    def with
+      raise unless block_given?
+      prev_id = Runtime.cudaGetDevice
+      @_device_stack << prev_id
+      begin
+        Runtime.cudaSetDevice(@id) unless prev_id != @id
+        yield
+      ensure
+        prev_id = @_device_stack.pop
+        Runtime.cudaSetDevice(prev_id)
+      end
+    end
+    def synchronize
+      Runtime.cudaDeviceSynchronize
+    end
+    def compute_capability
+      major = Runtime.cudaDeviceGetAttributes(75, @id)
+      minor = Runtime.cudaDeviceGetAttributes(76, @id)
+      "#{major}#{minor}"
+    end
+  end
+end

data/lib/cumo/cuda/link_state.rb ADDED

@@ -0,0 +1,31 @@
+require_relative '../cuda'
+module Cumo::CUDA
+  # CUDA link state.
+  class LinkState
+    def initialize
+      @ptr = Driver.cuLinkCreate
+      if block_given?
+        begin
+          yield(self)
+        ensure
+          destroy
+        end
+      end
+    end
+    def destroy
+      return unless @ptr
+      Driver.cuLinkDestroy(@ptr)
+      @ptr = nil
+    end
+    def add_ptr_data(data, name)
+      Driver.cuLinkAddData(@ptr, Driver::CU_JIT_INPUT_PTX, data, name)
+    end
+    def complete
+      cubin = Driver.cuLinkComplete(@ptr)
+    end
+  end
+end

data/lib/cumo/cuda/module.rb ADDED

@@ -0,0 +1,40 @@
+require_relative '../cuda'
+module Cumo::CUDA
+  # CUDA kernel module.
+  class Module
+    def initialize
+      @ptr = nil
+      if block_given?
+        begin
+          yield(self)
+        ensure
+          unload
+        end
+      end
+    end
+    def unload
+      return unless @ptr
+      Driver.cuModuleUnload(@ptr)
+      @ptr = nil
+    end
+    def load_file(fname)
+      @ptr = Driver.cuModuleLoad(fname)
+    end
+    def load(cubin)
+      @ptr = Driver.cuModuleLoadData(cubin)
+    end
+    def get_global_var(name)
+      Driver.cuModuleGetGlobal(@ptr, name)
+    end
+    def get_function(name)
+      # Function(name)
+    end
+  end
+end

data/lib/cumo/cuda/nvrtc_program.rb ADDED

@@ -0,0 +1,27 @@
+require_relative '../cuda'
+require_relative 'compile_error'
+module Cumo::CUDA
+  class NVRTCProgram
+    def initialize(src, name: "default_program", headers: [], include_names: [])
+      @ptr = nil
+      @src = src # should be UTF-8
+      @name = name # should be UTF-8
+      @ptr = NVRTC.nvrtcCreateProgram(src, name, headers, include_names)
+    end
+    def destroy
+      NVRTC.nvrtcDestroyProgram(@ptr) if @ptr
+    end
+    def compile(options: [])
+      begin
+        NVRTC.nvrtcCompileProgram(@ptr, options)
+        return NVRTC.nvrtcGetPTX(@ptr)
+      rescue NVRTCError => e
+        log = NVRTC.nvrtcGetProgramLog(@ptr)
+        raise CompileError.new(log, @src, @name, options)
+      end
+    end
+  end
+end

data/lib/cumo/linalg.rb ADDED

@@ -0,0 +1,12 @@
+require 'cumo'
+# Provide compatibility layers with numo/linalg
+module Cumo
+  module Blas
+    class << self
+      def gemm(a, *args, **kwargs)
+        a.gemm(*args, **kwargs)
+      end
+    end
+  end
+end

data/lib/cumo/narray.rb ADDED

	@@ -0,0 +1,2 @@
1	+ # This file is for compatibility with require 'numo/narray'
2	+ require_relative '../cumo'

data/lib/cumo/narray/extra.rb ADDED

@@ -0,0 +1,1278 @@
+module Cumo
+  class NArray
+    # Return an unallocated array with the same shape and type as self.
+    def new_narray
+      self.class.new(*shape)
+    end
+    # Return an array of zeros with the same shape and type as self.
+    def new_zeros
+      self.class.zeros(*shape)
+    end
+    # Return an array of ones with the same shape and type as self.
+    def new_ones
+      self.class.ones(*shape)
+    end
+    # Return an array filled with value with the same shape and type as self.
+    def new_fill(value)
+      self.class.new(*shape).fill(value)
+    end
+    # Convert angles from radians to degrees.
+    def rad2deg
+      self * (180/Math::PI)
+    end
+    # Convert angles from degrees to radians.
+    def deg2rad
+      self * (Math::PI/180)
+    end
+    # Flip each row in the left/right direction.
+    # Same as `a[true, (-1..0).step(-1), ...]`.
+    def fliplr
+      reverse(1)
+    end
+    # Flip each column in the up/down direction.
+    # Same as `a[(-1..0).step(-1), ...]`.
+    def flipud
+      reverse(0)
+    end
+    # Multi-dimensional array indexing.
+    # Same as [] for one-dimensional NArray.
+    # Similar to numpy's tuple indexing, i.e., `a[[1,2,..],[3,4,..]]`
+    # (This method will be rewritten in C)
+    # @return [Cumo::NArray] one-dimensional view of self.
+    # @example
+    #   p x = Cumo::DFloat.new(3,3,3).seq
+    #   # Cumo::DFloat#shape=[3,3,3]
+    #   # [[[0, 1, 2],
+    #   #   [3, 4, 5],
+    #   #   [6, 7, 8]],
+    #   #  [[9, 10, 11],
+    #   #   [12, 13, 14],
+    #   #   [15, 16, 17]],
+    #   #  [[18, 19, 20],
+    #   #   [21, 22, 23],
+    #   #   [24, 25, 26]]]
+    #
+    #   p x.at([0,1,2],[0,1,2],[-1,-2,-3])
+    #   # Cumo::DFloat(view)#shape=[3]
+    #   # [2, 13, 24]
+    def at(*indices)
+      if indices.size != ndim
+        raise DimensionError, "argument length does not match dimension size"
+      end
+      idx = nil
+      stride = 1
+      (indices.size-1).downto(0) do |i|
+        ix = Int64.cast(indices[i])
+        if ix.ndim != 1
+          raise DimensionError, "index array is not one-dimensional"
+        end
+        ix[ix < 0] += shape[i]
+        if ((ix < 0) & (ix >= shape[i])).any?
+          raise IndexError, "index array is out of range"
+        end
+        if idx
+          if idx.size != ix.size
+            raise ShapeError, "index array sizes mismatch"
+          end
+          idx += ix * stride
+          stride *= shape[i]
+        else
+          idx = ix
+          stride = shape[i]
+        end
+      end
+      self[idx]
+    end
+    # Rotate in the plane specified by axes.
+    # @example
+    #   p a = Cumo::Int32.new(2,2).seq
+    #   # Cumo::Int32#shape=[2,2]
+    #   # [[0, 1],
+    #   #  [2, 3]]
+    #
+    #   p a.rot90
+    #   # Cumo::Int32(view)#shape=[2,2]
+    #   # [[1, 3],
+    #   #  [0, 2]]
+    #
+    #   p a.rot90(2)
+    #   # Cumo::Int32(view)#shape=[2,2]
+    #   # [[3, 2],
+    #   #  [1, 0]]
+    #
+    #   p a.rot90(3)
+    #   # Cumo::Int32(view)#shape=[2,2]
+    #   # [[2, 0],
+    #   #  [3, 1]]
+    def rot90(k=1,axes=[0,1])
+      case k % 4
+      when 0
+        view
+      when 1
+        swapaxes(*axes).reverse(axes[0])
+      when 2
+        reverse(*axes)
+      when 3
+        swapaxes(*axes).reverse(axes[1])
+      end
+    end
+    def to_i
+      if size==1
+        self.extract_cpu.to_i
+      else
+        # convert to Int?
+        raise TypeError, "can't convert #{self.class} into Integer"
+      end
+    end
+    def to_f
+      if size==1
+        self.extract_cpu.to_f
+      else
+        # convert to DFloat?
+        raise TypeError, "can't convert #{self.class} into Float"
+      end
+    end
+    def to_c
+      if size==1
+        Complex(self.extract_cpu)
+      else
+        # convert to DComplex?
+        raise TypeError, "can't convert #{self.class} into Complex"
+      end
+    end
+    # Convert the argument to an narray if not an narray.
+    def self.cast(a)
+      a.kind_of?(NArray) ? a : NArray.array_type(a).cast(a)
+    end
+    def self.asarray(a)
+      case a
+      when NArray
+        (a.ndim == 0) ? a[:new] : a
+      when Numeric,Range
+        self[a]
+      else
+        cast(a)
+      end
+    end
+    # parse matrix like matlab, octave
+    # @example
+    #   a = Cumo::DFloat.parse %[
+    #    2 -3 5
+    #    4 9 7
+    #    2 -1 6
+    #   ]
+    #   => Cumo::DFloat#shape=[3,3]
+    #   [[2, -3, 5],
+    #    [4, 9, 7],
+    #    [2, -1, 6]]
+    def self.parse(str, split1d:/\s+/, split2d:/;?$|;/,
+                   split3d:/\s*\n(\s*\n)+/m)
+      a = []
+      str.split(split3d).each do |block|
+        b = []
+        #print "b"; p block
+        block.split(split2d).each do |line|
+          #p line
+          line.strip!
+          if !line.empty?
+            c = []
+            line.split(split1d).each do |item|
+              c << eval(item.strip) if !item.empty?
+            end
+            b << c if !c.empty?
+          end
+        end
+        a << b if !b.empty?
+      end
+      if a.size==1
+        self.cast(a[0])
+      else
+        self.cast(a)
+      end
+    end
+    # Append values to the end of an narray.
+    # @example
+    #   a = Cumo::DFloat[1, 2, 3]
+    #   p a.append([[4, 5, 6], [7, 8, 9]])
+    #   # Cumo::DFloat#shape=[9]
+    #   # [1, 2, 3, 4, 5, 6, 7, 8, 9]
+    #
+    #   a = Cumo::DFloat[[1, 2, 3]]
+    #   p a.append([[4, 5, 6], [7, 8, 9]],axis:0)
+    #   # Cumo::DFloat#shape=[3,3]
+    #   # [[1, 2, 3],
+    #   #  [4, 5, 6],
+    #   #  [7, 8, 9]]
+    #
+    #   a = Cumo::DFloat[[1, 2, 3], [4, 5, 6]]
+    #   p a.append([7, 8, 9], axis:0)
+    #   # in `append': dimension mismatch (Cumo::NArray::DimensionError)
+    def append(other,axis:nil)
+      other = self.class.cast(other)
+      if axis
+        if ndim != other.ndim
+          raise DimensionError, "dimension mismatch"
+        end
+        return concatenate(other,axis:axis)
+      else
+        a = self.class.zeros(size+other.size)
+        a[0...size] = self[true]
+        a[size..-1] = other[true]
+        return a
+      end
+    end
+    # Return a new array with sub-arrays along an axis deleted.
+    # If axis is not given, obj is applied to the flattened array.
+    # @example
+    #   a = Cumo::DFloat[[1,2,3,4], [5,6,7,8], [9,10,11,12]]
+    #   p a.delete(1,0)
+    #   # Cumo::DFloat(view)#shape=[2,4]
+    #   # [[1, 2, 3, 4],
+    #   #  [9, 10, 11, 12]]
+    #
+    #   p a.delete((0..-1).step(2),1)
+    #   # Cumo::DFloat(view)#shape=[3,2]
+    #   # [[2, 4],
+    #   #  [6, 8],
+    #   #  [10, 12]]
+    #
+    #   p a.delete([1,3,5])
+    #   # Cumo::DFloat(view)#shape=[9]
+    #   # [1, 3, 5, 7, 8, 9, 10, 11, 12]
+    def delete(indice,axis=nil)
+      if axis
+        bit = Bit.ones(shape[axis])
+        bit[indice] = 0
+        idx = [true]*ndim
+        idx[axis] = bit.where
+        return self[*idx].copy
+      else
+        bit = Bit.ones(size)
+        bit[indice] = 0
+        return self[bit.where].copy
+      end
+    end
+    # Insert values along the axis before the indices.
+    # @example
+    #   p a = Cumo::DFloat[[1, 2], [3, 4]]
+    #   a = Cumo::Int32[[1, 1], [2, 2], [3, 3]]
+    #
+    #   p a.insert(1,5)
+    #   # Cumo::Int32#shape=[7]
+    #   # [1, 5, 1, 2, 2, 3, 3]
+    #
+    #   p a.insert(1, 5, axis:1)
+    #   # Cumo::Int32#shape=[3,3]
+    #   # [[1, 5, 1],
+    #   #  [2, 5, 2],
+    #   #  [3, 5, 3]]
+    #
+    #   p a.insert([1], [[11],[12],[13]], axis:1)
+    #   # Cumo::Int32#shape=[3,3]
+    #   # [[1, 11, 1],
+    #   #  [2, 12, 2],
+    #   #  [3, 13, 3]]
+    #
+    #   p a.insert(1, [11, 12, 13], axis:1)
+    #   # Cumo::Int32#shape=[3,3]
+    #   # [[1, 11, 1],
+    #   #  [2, 12, 2],
+    #   #  [3, 13, 3]]
+    #
+    #   p a.insert([1], [11, 12, 13], axis:1)
+    #   # Cumo::Int32#shape=[3,5]
+    #   # [[1, 11, 12, 13, 1],
+    #   #  [2, 11, 12, 13, 2],
+    #   #  [3, 11, 12, 13, 3]]
+    #
+    #   p b = a.flatten
+    #   # Cumo::Int32(view)#shape=[6]
+    #   # [1, 1, 2, 2, 3, 3]
+    #
+    #   p b.insert(2,[15,16])
+    #   # Cumo::Int32#shape=[8]
+    #   # [1, 1, 15, 16, 2, 2, 3, 3]
+    #
+    #   p b.insert([2,2],[15,16])
+    #   # Cumo::Int32#shape=[8]
+    #   # [1, 1, 15, 16, 2, 2, 3, 3]
+    #
+    #   p b.insert([2,1],[15,16])
+    #   # Cumo::Int32#shape=[8]
+    #   # [1, 16, 1, 15, 2, 2, 3, 3]
+    #
+    #   p b.insert([2,0,1],[15,16,17])
+    #   # Cumo::Int32#shape=[9]
+    #   # [16, 1, 17, 1, 15, 2, 2, 3, 3]
+    #
+    #   p b.insert(2..3, [15, 16])
+    #   # Cumo::Int32#shape=[8]
+    #   # [1, 1, 15, 2, 16, 2, 3, 3]
+    #
+    #   p b.insert(2, [7.13, 0.5])
+    #   # Cumo::Int32#shape=[8]
+    #   # [1, 1, 7, 0, 2, 2, 3, 3]
+    #
+    #   p x = Cumo::DFloat.new(2,4).seq
+    #   # Cumo::DFloat#shape=[2,4]
+    #   # [[0, 1, 2, 3],
+    #   #  [4, 5, 6, 7]]
+    #
+    #   p x.insert([1,3],999,axis:1)
+    #   # Cumo::DFloat#shape=[2,6]
+    #   # [[0, 999, 1, 2, 999, 3],
+    #   #  [4, 999, 5, 6, 999, 7]]
+    def insert(indice,values,axis:nil)
+      if axis
+        values = self.class.asarray(values)
+        nd = values.ndim
+        midx = [:new]*(ndim-nd) + [true]*nd
+        case indice
+        when Numeric
+          midx[-nd-1] = true
+          midx[axis] = :new
+        end
+        values = values[*midx]
+      else
+        values = self.class.asarray(values).flatten
+      end
+      idx = Int64.asarray(indice)
+      nidx = idx.size
+      if nidx == 1
+        nidx = values.shape[axis||0]
+        idx = idx + Int64.new(nidx).seq
+      else
+        sidx = idx.sort_index
+        idx[sidx] += Int64.new(nidx).seq
+      end
+      if axis
+        bit = Bit.ones(shape[axis]+nidx)
+        bit[idx] = 0
+        new_shape = shape
+        new_shape[axis] += nidx
+        a = self.class.zeros(new_shape)
+        mdidx = [true]*ndim
+        mdidx[axis] = bit.where
+        a[*mdidx] = self
+        mdidx[axis] = idx
+        a[*mdidx] = values
+      else
+        bit = Bit.ones(size+nidx)
+        bit[idx] = 0
+        a = self.class.zeros(size+nidx)
+        a[bit.where] = self.flatten
+        a[idx] = values
+      end
+      return a
+    end
+    class << self
+    # @example
+    #   p a = Cumo::DFloat[[1, 2], [3, 4]]
+    #   # Cumo::DFloat#shape=[2,2]
+    #   # [[1, 2],
+    #   #  [3, 4]]
+    #
+    #   p b = Cumo::DFloat[[5, 6]]
+    #   # Cumo::DFloat#shape=[1,2]
+    #   # [[5, 6]]
+    #
+    #   p Cumo::NArray.concatenate([a,b],axis:0)
+    #   # Cumo::DFloat#shape=[3,2]
+    #   # [[1, 2],
+    #   #  [3, 4],
+    #   #  [5, 6]]
+    #
+    #   p Cumo::NArray.concatenate([a,b.transpose], axis:1)
+    #   # Cumo::DFloat#shape=[2,3]
+    #   # [[1, 2, 5],
+    #   #  [3, 4, 6]]
+    def concatenate(arrays,axis:0)
+      klass = (self==NArray) ? NArray.array_type(arrays) : self
+      nd = 0
+      arrays = arrays.map do |a|
+        case a
+        when NArray
+          # ok
+        when Numeric
+          a = klass[a]
+        when Array
+          a = klass.cast(a)
+        else
+          raise TypeError,"not Cumo::NArray: #{a.inspect[0..48]}"
+        end
+        if a.ndim > nd
+          nd = a.ndim
+        end
+        a
+      end
+      if axis < 0
+        axis += nd
+      end
+      if axis < 0 || axis >= nd
+        raise ArgumentError,"axis is out of range"
+      end
+      new_shape = nil
+      sum_size = 0
+      arrays.each do |a|
+        a_shape = a.shape
+        if nd != a_shape.size
+          a_shape = [1]*(nd-a_shape.size) + a_shape
+        end
+        sum_size += a_shape.delete_at(axis)
+        if new_shape
+          if new_shape != a_shape
+            raise ShapeError,"shape mismatch"
+          end
+        else
+          new_shape = a_shape
+        end
+      end
+      new_shape.insert(axis,sum_size)
+      result = klass.zeros(*new_shape)
+      lst = 0
+      refs = [true] * nd
+      arrays.each do |a|
+        fst = lst
+        lst = fst + (a.shape[axis-nd]||1)
+        refs[axis] = fst...lst
+        result[*refs] = a
+      end
+      result
+    end
+    # Stack arrays vertically (row wise).
+    # @example
+    #   a = Cumo::Int32[1,2,3]
+    #   b = Cumo::Int32[2,3,4]
+    #   p Cumo::NArray.vstack([a,b])
+    #   # Cumo::Int32#shape=[2,3]
+    #   # [[1, 2, 3],
+    #   #  [2, 3, 4]]
+    #
+    #   a = Cumo::Int32[[1],[2],[3]]
+    #   b = Cumo::Int32[[2],[3],[4]]
+    #   p Cumo::NArray.vstack([a,b])
+    #   # Cumo::Int32#shape=[6,1]
+    #   # [[1],
+    #   #  [2],
+    #   #  [3],
+    #   #  [2],
+    #   #  [3],
+    #   #  [4]]
+    def vstack(arrays)
+      arys = arrays.map do |a|
+        _atleast_2d(cast(a))
+      end
+      concatenate(arys,axis:0)
+    end
+    # Stack arrays horizontally (column wise).
+    # @example
+    #   a = Cumo::Int32[1,2,3]
+    #   b = Cumo::Int32[2,3,4]
+    #   p Cumo::NArray.hstack([a,b])
+    #   # Cumo::Int32#shape=[6]
+    #   # [1, 2, 3, 2, 3, 4]
+    #
+    #   a = Cumo::Int32[[1],[2],[3]]
+    #   b = Cumo::Int32[[2],[3],[4]]
+    #   p Cumo::NArray.hstack([a,b])
+    #   # Cumo::Int32#shape=[3,2]
+    #   # [[1, 2],
+    #   #  [2, 3],
+    #   #  [3, 4]]
+    def hstack(arrays)
+      klass = (self==NArray) ? NArray.array_type(arrays) : self
+      nd = 0
+      arys = arrays.map do |a|
+        a = klass.cast(a)
+        nd = a.ndim if a.ndim > nd
+        a
+      end
+      dim = (nd >= 2) ? 1 : 0
+      concatenate(arys,axis:dim)
+    end
+    # Stack arrays in depth wise (along third axis).
+    # @example
+    #   a = Cumo::Int32[1,2,3]
+    #   b = Cumo::Int32[2,3,4]
+    #   p Cumo::NArray.dstack([a,b])
+    #   # Cumo::Int32#shape=[1,3,2]
+    #   # [[[1, 2],
+    #   #   [2, 3],
+    #   #   [3, 4]]]
+    #
+    #   a = Cumo::Int32[[1],[2],[3]]
+    #   b = Cumo::Int32[[2],[3],[4]]
+    #   p Cumo::NArray.dstack([a,b])
+    #   # Cumo::Int32#shape=[3,1,2]
+    #   # [[[1, 2]],
+    #   #  [[2, 3]],
+    #   #  [[3, 4]]]
+    def dstack(arrays)
+      arys = arrays.map do |a|
+        _atleast_3d(cast(a))
+      end
+      concatenate(arys,axis:2)
+    end
+    # Stack 1-d arrays into columns of a 2-d array.
+    # @example
+    #   x = Cumo::Int32[1,2,3]
+    #   y = Cumo::Int32[2,3,4]
+    #   p Cumo::NArray.column_stack([x,y])
+    #   # Cumo::Int32#shape=[3,2]
+    #   # [[1, 2],
+    #   #  [2, 3],
+    #   #  [3, 4]]
+    def column_stack(arrays)
+      arys = arrays.map do |a|
+        a = cast(a)
+        case a.ndim
+        when 0; a[:new,:new]
+        when 1; a[true,:new]
+        else; a
+        end
+      end
+      concatenate(arys,axis:1)
+    end
+    private
+    # Return an narray with at least two dimension.
+    def _atleast_2d(a)
+      case a.ndim
+      when 0; a[:new,:new]
+      when 1; a[:new,true]
+      else;   a
+      end
+    end
+    # Return an narray with at least three dimension.
+    def _atleast_3d(a)
+      case a.ndim
+      when 0; a[:new,:new,:new]
+      when 1; a[:new,true,:new]
+      when 2; a[true,true,:new]
+      else;   a
+      end
+    end
+    end # class << self
+    # @example
+    #   p a = Cumo::DFloat[[1, 2], [3, 4]]
+    #   # Cumo::DFloat#shape=[2,2]
+    #   # [[1, 2],
+    #   #  [3, 4]]
+    #
+    #   p b = Cumo::DFloat[[5, 6]]
+    #   # Cumo::DFloat#shape=[1,2]
+    #   # [[5, 6]]
+    #
+    #   p a.concatenate(b,axis:0)
+    #   # Cumo::DFloat#shape=[3,2]
+    #   # [[1, 2],
+    #   #  [3, 4],
+    #   #  [5, 6]]
+    #
+    #   p a.concatenate(b.transpose, axis:1)
+    #   # Cumo::DFloat#shape=[2,3]
+    #   # [[1, 2, 5],
+    #   #  [3, 4, 6]]
+    def concatenate(*arrays,axis:0)
+      axis = check_axis(axis)
+      self_shape = shape
+      self_shape.delete_at(axis)
+      sum_size = shape[axis]
+      arrays.map! do |a|
+        case a
+        when NArray
+          # ok
+        when Numeric
+          a = self.class.new(1).store(a)
+        when Array
+          a = self.class.cast(a)
+        else
+          raise TypeError,"not Cumo::NArray: #{a.inspect[0..48]}"
+        end
+        if a.ndim > ndim
+          raise ShapeError,"dimension mismatch"
+        end
+        a_shape = a.shape
+        sum_size += a_shape.delete_at(axis-ndim) || 1
+        if self_shape != a_shape
+          raise ShapeError,"shape mismatch"
+        end
+        a
+      end
+      self_shape.insert(axis,sum_size)
+      result = self.class.zeros(*self_shape)
+      lst = shape[axis]
+      refs = [true] * ndim
+      refs[axis] = 0...lst
+      result[*refs] = self
+      arrays.each do |a|
+        fst = lst
+        lst = fst + (a.shape[axis-ndim] || 1)
+        refs[axis] = fst...lst
+        result[*refs] = a
+      end
+      result
+    end
+    # @example
+    #   p x = Cumo::DFloat.new(9).seq
+    #   # Cumo::DFloat#shape=[9]
+    #   # [0, 1, 2, 3, 4, 5, 6, 7, 8]
+    #
+    #   pp x.split(3)
+    #   # [Cumo::DFloat(view)#shape=[3]
+    #   # [0, 1, 2],
+    #   #  Cumo::DFloat(view)#shape=[3]
+    #   # [3, 4, 5],
+    #   #  Cumo::DFloat(view)#shape=[3]
+    #   # [6, 7, 8]]
+    #
+    #   p x = Cumo::DFloat.new(8).seq
+    #   # Cumo::DFloat#shape=[8]
+    #   # [0, 1, 2, 3, 4, 5, 6, 7]
+    #
+    #   pp x.split([3, 5, 6, 10])
+    #   # [Cumo::DFloat(view)#shape=[3]
+    #   # [0, 1, 2],
+    #   #  Cumo::DFloat(view)#shape=[2]
+    #   # [3, 4],
+    #   #  Cumo::DFloat(view)#shape=[1]
+    #   # [5],
+    #   #  Cumo::DFloat(view)#shape=[2]
+    #   # [6, 7],
+    #   #  Cumo::DFloat(view)#shape=[0][]]
+    def split(indices_or_sections, axis:0)
+      axis = check_axis(axis)
+      size_axis = shape[axis]
+      case indices_or_sections
+      when Integer
+        div_axis, mod_axis = size_axis.divmod(indices_or_sections)
+        refs = [true]*ndim
+        beg_idx = 0
+        mod_axis.times.map do |i|
+          end_idx = beg_idx + div_axis + 1
+          refs[axis] = beg_idx ... end_idx
+          beg_idx = end_idx
+          self[*refs]
+        end +
+        (indices_or_sections-mod_axis).times.map do |i|
+          end_idx = beg_idx + div_axis
+          refs[axis] = beg_idx ... end_idx
+          beg_idx = end_idx
+          self[*refs]
+        end
+      when NArray
+        split(indices_or_sections.to_a,axis:axis)
+      when Array
+        refs = [true]*ndim
+        fst = 0
+        (indices_or_sections + [size_axis]).map do |lst|
+          lst = size_axis if lst > size_axis
+          refs[axis] = (fst < size_axis) ? fst...lst : -1...-1
+          fst = lst
+          self[*refs]
+        end
+      else
+        raise TypeError,"argument must be Integer or Array"
+      end
+    end
+    # @example
+    #   p x = Cumo::DFloat.new(4,4).seq
+    #   # Cumo::DFloat#shape=[4,4]
+    #   # [[0, 1, 2, 3],
+    #   #  [4, 5, 6, 7],
+    #   #  [8, 9, 10, 11],
+    #   #  [12, 13, 14, 15]]
+    #
+    #   pp x.hsplit(2)
+    #   # [Cumo::DFloat(view)#shape=[4,2]
+    #   # [[0, 1],
+    #   #  [4, 5],
+    #   #  [8, 9],
+    #   #  [12, 13]],
+    #   #  Cumo::DFloat(view)#shape=[4,2]
+    #   # [[2, 3],
+    #   #  [6, 7],
+    #   #  [10, 11],
+    #   #  [14, 15]]]
+    #
+    #   pp x.hsplit([3, 6])
+    #   # [Cumo::DFloat(view)#shape=[4,3]
+    #   # [[0, 1, 2],
+    #   #  [4, 5, 6],
+    #   #  [8, 9, 10],
+    #   #  [12, 13, 14]],
+    #   #  Cumo::DFloat(view)#shape=[4,1]
+    #   # [[3],
+    #   #  [7],
+    #   #  [11],
+    #   #  [15]],
+    #   #  Cumo::DFloat(view)#shape=[4,0][]]
+    def vsplit(indices_or_sections)
+      split(indices_or_sections, axis:0)
+    end
+    def hsplit(indices_or_sections)
+      split(indices_or_sections, axis:1)
+    end
+    def dsplit(indices_or_sections)
+      split(indices_or_sections, axis:2)
+    end
+    # @example
+    #   p a = Cumo::NArray[0,1,2]
+    #   # Cumo::Int32#shape=[3]
+    #   # [0, 1, 2]
+    #
+    #   p a.tile(2)
+    #   # Cumo::Int32#shape=[6]
+    #   # [0, 1, 2, 0, 1, 2]
+    #
+    #   p a.tile(2,2)
+    #   # Cumo::Int32#shape=[2,6]
+    #   # [[0, 1, 2, 0, 1, 2],
+    #   #  [0, 1, 2, 0, 1, 2]]
+    #
+    #   p a.tile(2,1,2)
+    #   # Cumo::Int32#shape=[2,1,6]
+    #   # [[[0, 1, 2, 0, 1, 2]],
+    #   #  [[0, 1, 2, 0, 1, 2]]]
+    #
+    #   p b = Cumo::NArray[[1, 2], [3, 4]]
+    #   # Cumo::Int32#shape=[2,2]
+    #   # [[1, 2],
+    #   #  [3, 4]]
+    #
+    #   p b.tile(2)
+    #   # Cumo::Int32#shape=[2,4]
+    #   # [[1, 2, 1, 2],
+    #   #  [3, 4, 3, 4]]
+    #
+    #   p b.tile(2,1)
+    #   # Cumo::Int32#shape=[4,2]
+    #   # [[1, 2],
+    #   #  [3, 4],
+    #   #  [1, 2],
+    #   #  [3, 4]]
+    #
+    #   p c = Cumo::NArray[1,2,3,4]
+    #   # Cumo::Int32#shape=[4]
+    #   # [1, 2, 3, 4]
+    #
+    #   p c.tile(4,1)
+    #   # Cumo::Int32#shape=[4,4]
+    #   # [[1, 2, 3, 4],
+    #   #  [1, 2, 3, 4],
+    #   #  [1, 2, 3, 4],
+    #   #  [1, 2, 3, 4]]
+    def tile(*arg)
+      arg.each do |i|
+        if !i.kind_of?(Integer) || i<1
+          raise ArgumentError,"argument should be positive integer"
+        end
+      end
+      ns = arg.size
+      nd = self.ndim
+      shp = self.shape
+      new_shp = []
+      src_shp = []
+      res_shp = []
+      (nd-ns).times do
+        new_shp << 1
+        new_shp << (n = shp.shift)
+        src_shp << :new
+        src_shp << true
+        res_shp << n
+      end
+      (ns-nd).times do
+        new_shp << (m = arg.shift)
+        new_shp << 1
+        src_shp << :new
+        src_shp << :new
+        res_shp << m
+      end
+      [nd,ns].min.times do
+        new_shp << (m = arg.shift)
+        new_shp << (n = shp.shift)
+        src_shp << :new
+        src_shp << true
+        res_shp << n*m
+      end
+      self.class.new(*new_shp).store(self[*src_shp]).reshape(*res_shp)
+    end
+    # @example
+    #   p Cumo::NArray[3].repeat(4)
+    #   # Cumo::Int32#shape=[4]
+    #   # [3, 3, 3, 3]
+    #
+    #   p x = Cumo::NArray[[1,2],[3,4]]
+    #   # Cumo::Int32#shape=[2,2]
+    #   # [[1, 2],
+    #   #  [3, 4]]
+    #
+    #   p x.repeat(2)
+    #   # Cumo::Int32#shape=[8]
+    #   # [1, 1, 2, 2, 3, 3, 4, 4]
+    #
+    #   p x.repeat(3,axis:1)
+    #   # Cumo::Int32#shape=[2,6]
+    #   # [[1, 1, 1, 2, 2, 2],
+    #   #  [3, 3, 3, 4, 4, 4]]
+    #
+    #   p x.repeat([1,2],axis:0)
+    #   # Cumo::Int32#shape=[3,2]
+    #   # [[1, 2],
+    #   #  [3, 4],
+    #   #  [3, 4]]
+    def repeat(arg,axis:nil)
+      case axis
+      when Integer
+        axis = check_axis(axis)
+        c = self
+      when NilClass
+        c = self.flatten
+        axis = 0
+      else
+        raise ArgumentError,"invalid axis"
+      end
+      case arg
+      when Integer
+        if !arg.kind_of?(Integer) || arg<1
+          raise ArgumentError,"argument should be positive integer"
+        end
+        idx = c.shape[axis].times.map{|i| [i]*arg}.flatten
+      else
+        arg = arg.to_a
+        if arg.size != c.shape[axis]
+          raise ArgumentError,"repeat size shoud be equal to size along axis"
+        end
+        arg.each do |i|
+          if !i.kind_of?(Integer) || i<0
+            raise ArgumentError,"argument should be non-negative integer"
+          end
+        end
+        idx = arg.each_with_index.map{|a,i| [i]*a}.flatten
+      end
+      ref = [true] * c.ndim
+      ref[axis] = idx
+      c[*ref].copy
+    end
+    # Calculate the n-th discrete difference along given axis.
+    # @example
+    #   p x = Cumo::DFloat[1, 2, 4, 7, 0]
+    #   # Cumo::DFloat#shape=[5]
+    #   # [1, 2, 4, 7, 0]
+    #
+    #   p x.diff
+    #   # Cumo::DFloat#shape=[4]
+    #   # [1, 2, 3, -7]
+    #
+    #   p x.diff(2)
+    #   # Cumo::DFloat#shape=[3]
+    #   # [1, 1, -10]
+    #
+    #   p x = Cumo::DFloat[[1, 3, 6, 10], [0, 5, 6, 8]]
+    #   # Cumo::DFloat#shape=[2,4]
+    #   # [[1, 3, 6, 10],
+    #   #  [0, 5, 6, 8]]
+    #
+    #   p x.diff
+    #   # Cumo::DFloat#shape=[2,3]
+    #   # [[2, 3, 4],
+    #   #  [5, 1, 2]]
+    #
+    #   p x.diff(axis:0)
+    #   # Cumo::DFloat#shape=[1,4]
+    #   # [[-1, 2, 0, -2]]
+    def diff(n=1,axis:-1)
+      axis = check_axis(axis)
+      if n < 0 || n >= shape[axis]
+        raise ShapeError,"n=#{n} is invalid for shape[#{axis}]=#{shape[axis]}"
+      end
+      # calculate polynomial coefficient
+      c = self.class[-1,1]
+      2.upto(n) do |i|
+        x = self.class.zeros(i+1)
+        x[0..-2] = c
+        y = self.class.zeros(i+1)
+        y[1..-1] = c
+        c = y - x
+      end
+      s = [true]*ndim
+      s[axis] = n..-1
+      result = self[*s].dup
+      sum = result.inplace
+      (n-1).downto(0) do |i|
+        s = [true]*ndim
+        s[axis] = i..-n-1+i
+        sum + self[*s] * c[i] # inplace addition
+      end
+      return result
+    end
+    # Upper triangular matrix.
+    # Return a copy with the elements below the k-th diagonal filled with zero.
+    def triu(k=0)
+      dup.triu!(k)
+    end
+    # Upper triangular matrix.
+    # Fill the self elements below the k-th diagonal with zero.
+    def triu!(k=0)
+      if ndim < 2
+        raise NArray::ShapeError, "must be >= 2-dimensional array"
+      end
+      if contiguous?
+        *shp,m,n = shape
+        idx = tril_indices(k-1)
+        reshape!(*shp,m*n)
+        self[false,idx] = 0
+        reshape!(*shp,m,n)
+      else
+        store(triu(k))
+      end
+    end
+    # Return the indices for the uppler-triangle on and above the k-th diagonal.
+    def triu_indices(k=0)
+      if ndim < 2
+        raise NArray::ShapeError, "must be >= 2-dimensional array"
+      end
+      m,n = shape[-2..-1]
+      NArray.triu_indices(m,n,k=0)
+    end
+    # Return the indices for the uppler-triangle on and above the k-th diagonal.
+    def self.triu_indices(m,n,k=0)
+      x = Cumo::Int64.new(m,1).seq + k
+      y = Cumo::Int64.new(1,n).seq
+      (x<=y).where
+    end
+    # Lower triangular matrix.
+    # Return a copy with the elements above the k-th diagonal filled with zero.
+    def tril(k=0)
+      dup.tril!(k)
+    end
+    # Lower triangular matrix.
+    # Fill the self elements above the k-th diagonal with zero.
+    def tril!(k=0)
+      if ndim < 2
+        raise NArray::ShapeError, "must be >= 2-dimensional array"
+      end
+      if contiguous?
+        idx = triu_indices(k+1)
+        *shp,m,n = shape
+        reshape!(*shp,m*n)
+        self[false,idx] = 0
+        reshape!(*shp,m,n)
+      else
+        store(tril(k))
+      end
+    end
+    # Return the indices for the lower-triangle on and below the k-th diagonal.
+    def tril_indices(k=0)
+      if ndim < 2
+        raise NArray::ShapeError, "must be >= 2-dimensional array"
+      end
+      m,n = shape[-2..-1]
+      NArray.tril_indices(m,n,k)
+    end
+    # Return the indices for the lower-triangle on and below the k-th diagonal.
+    def self.tril_indices(m,n,k=0)
+      x = Cumo::Int64.new(m,1).seq + k
+      y = Cumo::Int64.new(1,n).seq
+      (x>=y).where
+    end
+    # Return the k-th diagonal indices.
+    def diag_indices(k=0)
+      if ndim < 2
+        raise NArray::ShapeError, "must be >= 2-dimensional array"
+      end
+      m,n = shape[-2..-1]
+      NArray.diag_indices(m,n,k)
+    end
+    # Return the k-th diagonal indices.
+    def self.diag_indices(m,n,k=0)
+      x = Cumo::Int64.new(m,1).seq + k
+      y = Cumo::Int64.new(1,n).seq
+      (x.eq y).where
+    end
+    # Return a matrix whose diagonal is constructed by self along the last axis.
+    def diag(k=0)
+      *shp,n = shape
+      n += k.abs
+      a = self.class.zeros(*shp,n,n)
+      a.diagonal(k).store(self)
+      a
+    end
+    # Return the sum along diagonals of the array.
+    #
+    # If 2-D array, computes the summation along its diagonal with the
+    # given offset, i.e., sum of `a[i,i+offset]`.
+    # If more than 2-D array, the diagonal is determined from the axes
+    # specified by axis argument. The default is axis=[-2,-1].
+    # @param offset [Integer] (optional, default=0) diagonal offset
+    # @param axis [Array] (optional, default=[-2,-1]) diagonal axis
+    # @param nan [Bool] (optional, default=false) nan-aware algorithm, i.e., if true then it ignores nan.
+    def trace(offset=nil,axis=nil,nan:false)
+      diagonal(offset,axis).sum(nan:nan,axis:-1)
+    end
+    @@warn_slow_dot = false
+    # Dot product of two arrays.
+    # @param b [Cumo::NArray]
+    # @return [Cumo::NArray]  return dot product
+    def dot(b)
+      t = self.class::UPCAST[b.class]
+      if [SFloat, DFloat, SComplex, DComplex].include?(t)
+        b = self.class.asarray(b)
+        case self.ndim
+        when 1
+          case b.ndim
+          when 1
+            self.mulsum(b, axis:-1)
+          else
+            self[:new, false].gemm(b).flatten
+          end
+        else
+          case b.ndim
+          when 1
+            self.gemm(b[false, :new]).flatten
+          else
+            self.gemm(b)
+          end
+        end
+      else
+        b = self.class.asarray(b)
+        case b.ndim
+        when 1
+          mulsum(b, axis:-1)
+        else
+          case ndim
+          when 0
+            b.mulsum(self, axis:-2)
+          when 1
+            self[true,:new].mulsum(b, axis:-2)
+          else
+            unless @@warn_slow_dot
+              nx = 200
+              ns = 200000
+              am,an = shape[-2..-1]
+              bm,bn = b.shape[-2..-1]
+              if am > nx && an > nx && bm > nx && bn > nx &&
+                  size > ns && b.size > ns
+                @@warn_slow_dot = true
+                warn "\nwarning: matrix dot for #{t} is slow. Consider SFloat, DFloat, SComplex, or DComplex to use cuBLAS.\n\n"
+              end
+            end
+            self[false,:new].mulsum(b[false,:new,true,true], axis:-2)
+          end
+        end
+      end
+    end
+    # Inner product of two arrays.
+    # Same as `(a*b).sum(axis:-1)`.
+    # @param b [Cumo::NArray]
+    # @param axis [Integer] applied axis
+    # @return [Cumo::NArray]  return (a*b).sum(axis:axis)
+    def inner(b, axis:-1)
+      mulsum(b, axis:axis)
+    end
+    # Outer product of two arrays.
+    # Same as `self[false,:new] * b[false,:new,true]`.
+    #
+    # @param b [Cumo::NArray]
+    # @param axis [Integer] applied axis (default=-1)
+    # @return [Cumo::NArray]  return outer product
+    # @example
+    #   a = Cumo::DFloat.ones(5)
+    #   => Cumo::DFloat#shape=[5]
+    #   [1, 1, 1, 1, 1]
+    #   b = Cumo::DFloat.linspace(-2,2,5)
+    #   => Cumo::DFloat#shape=[5]
+    #   [-2, -1, 0, 1, 2]
+    #   a.outer(b)
+    #   => Cumo::DFloat#shape=[5,5]
+    #   [[-2, -1, 0, 1, 2],
+    #    [-2, -1, 0, 1, 2],
+    #    [-2, -1, 0, 1, 2],
+    #    [-2, -1, 0, 1, 2],
+    #    [-2, -1, 0, 1, 2]]
+    def outer(b, axis:nil)
+      b = NArray.cast(b)
+      if axis.nil?
+        self[false,:new] * ((b.ndim==0) ? b : b[false,:new,true])
+      else
+        md,nd = [ndim,b.ndim].minmax
+        axis = check_axis(axis) - nd
+        if axis < -md
+          raise ArgumentError,"axis=#{axis} is out of range"
+        end
+        adim = [true]*ndim
+        adim[axis+ndim+1,0] = :new
+        bdim = [true]*b.ndim
+        bdim[axis+b.ndim,0] = :new
+        self[*adim] * b[*bdim]
+      end
+    end
+    # Kronecker product of two arrays.
+    #
+    #     kron(a,b)[k_0, k_1, ...] = a[i_0, i_1, ...] * b[j_0, j_1, ...]
+    #        where:  k_n = i_n * b.shape[n] + j_n
+    #
+    # @param b [Cumo::NArray]
+    # @return [Cumo::NArray]  return Kronecker product
+    # @example
+    #   Cumo::DFloat[1,10,100].kron([5,6,7])
+    #   => Cumo::DFloat#shape=[9]
+    #   [5, 6, 7, 50, 60, 70, 500, 600, 700]
+    #   Cumo::DFloat[5,6,7].kron([1,10,100])
+    #   => Cumo::DFloat#shape=[9]
+    #   [5, 50, 500, 6, 60, 600, 7, 70, 700]
+    #   Cumo::DFloat.eye(2).kron(Cumo::DFloat.ones(2,2))
+    #   => Cumo::DFloat#shape=[4,4]
+    #   [[1, 1, 0, 0],
+    #    [1, 1, 0, 0],
+    #    [0, 0, 1, 1],
+    #    [0, 0, 1, 1]]
+    def kron(b)
+      b = NArray.cast(b)
+      nda = ndim
+      ndb = b.ndim
+      shpa = shape
+      shpb = b.shape
+      adim = [:new]*(2*[ndb-nda,0].max) + [true,:new]*nda
+      bdim = [:new]*(2*[nda-ndb,0].max) + [:new,true]*ndb
+      shpr = (-[nda,ndb].max..-1).map{|i| (shpa[i]||1) * (shpb[i]||1)}
+      (self[*adim] * b[*bdim]).reshape(*shpr)
+    end
+    # under construction
+    def cov(y=nil, ddof:1, fweights:nil, aweights:nil)
+      if y
+        m = NArray.vstack([self,y])
+      else
+        m = self
+      end
+      w = nil
+      if fweights
+        f = fweights
+        w = f
+      end
+      if aweights
+        a = aweights
+        w = w ? w*a : a
+      end
+      if w
+        w_sum = w.sum(axis:-1, keepdims:true)
+        if ddof == 0
+          fact = w_sum
+        elsif aweights.nil?
+          fact = w_sum - ddof
+        else
+          wa_sum = (w*a).sum(axis:-1, keepdims:true)
+          fact = w_sum - ddof * wa_sum / w_sum
+        end
+        if (fact <= 0).any?
+          raise StandardError,"Degrees of freedom <= 0 for slice"
+        end
+      else
+        fact = m.shape[-1] - ddof
+      end
+      if w
+        m -= (m*w).sum(axis:-1, keepdims:true) / w_sum
+        mw = m*w
+      else
+        m -= m.mean(axis:-1, keepdims:true)
+        mw = m
+      end
+      mt = (m.ndim < 2) ? m : m.swapaxes(-2,-1)
+      mw.dot(mt.conj) / fact
+    end
+    private
+    # @!visibility private
+    def check_axis(axis)
+      unless Integer===axis
+        raise ArgumentError,"axis=#{axis} must be Integer"
+      end
+      a = axis
+      if a < 0
+        a += ndim
+      end
+      if a < 0 || a >= ndim
+        raise ArgumentError,"axis=#{axis} is invalid"
+      end
+      a
+    end
+  end
+end