RubyGems - numo-narray - Versions diffs - 0.9.1.2 → 0.9.1.3 - Mend

numo-narray 0.9.1.2 → 0.9.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55)

checksums.yaml +4 -4
data/Rakefile +7 -1
data/ext/numo/narray/array.c +6 -6
data/ext/numo/narray/data.c +8 -8
data/ext/numo/narray/depend.erb +4 -4
data/ext/numo/narray/extconf.rb +2 -2
data/ext/numo/narray/gen/cogen.rb +13 -0
data/ext/numo/narray/gen/def/dfloat.rb +1 -0
data/ext/numo/narray/gen/def/sfloat.rb +1 -0
data/ext/numo/narray/gen/narray_def.rb +14 -2
data/ext/numo/narray/gen/spec.rb +26 -10
data/ext/numo/narray/gen/tmpl/accum_binary.c +1 -1
data/ext/numo/narray/gen/tmpl/accum_index.c +11 -1
data/ext/numo/narray/gen/tmpl/alloc_func.c +3 -3
data/ext/numo/narray/gen/tmpl/binary.c +149 -10
data/ext/numo/narray/gen/tmpl/binary2.c +1 -1
data/ext/numo/narray/gen/tmpl/bincount.c +1 -1
data/ext/numo/narray/gen/tmpl/cast.c +1 -1
data/ext/numo/narray/gen/tmpl/cond_binary.c +1 -1
data/ext/numo/narray/gen/tmpl/each.c +1 -1
data/ext/numo/narray/gen/tmpl/each_with_index.c +1 -1
data/ext/numo/narray/gen/tmpl/extract_data.c +3 -3
data/ext/numo/narray/gen/tmpl/inspect.c +1 -1
data/ext/numo/narray/gen/tmpl/lib.c +5 -0
data/ext/numo/narray/gen/tmpl/map_with_index.c +1 -1
data/ext/numo/narray/gen/tmpl/median.c +3 -2
data/ext/numo/narray/gen/tmpl/pow.c +1 -1
data/ext/numo/narray/gen/tmpl/qsort.c +118 -56
data/ext/numo/narray/gen/tmpl/store.c +4 -4
data/ext/numo/narray/gen/tmpl/store_bit.c +4 -4
data/ext/numo/narray/gen/tmpl/to_a.c +1 -1
data/ext/numo/narray/gen/tmpl/unary_s.c +55 -9
data/ext/numo/narray/gen/tmpl_bit/each.c +1 -1
data/ext/numo/narray/gen/tmpl_bit/each_with_index.c +1 -1
data/ext/numo/narray/gen/tmpl_bit/inspect.c +1 -1
data/ext/numo/narray/gen/tmpl_bit/mask.c +1 -1
data/ext/numo/narray/gen/tmpl_bit/to_a.c +1 -1
data/ext/numo/narray/index.c +64 -37
data/ext/numo/narray/math.c +4 -4
data/ext/numo/narray/narray.c +54 -29
data/ext/numo/narray/ndloop.c +7 -7
data/ext/numo/narray/numo/narray.h +9 -2
data/ext/numo/narray/numo/template.h +18 -0
data/ext/numo/narray/numo/types/bit.h +5 -0
data/ext/numo/narray/numo/types/complex_macro.h +5 -0
data/ext/numo/narray/numo/types/float_macro.h +5 -0
data/ext/numo/narray/numo/types/int_macro.h +24 -0
data/ext/numo/narray/numo/types/robj_macro.h +5 -0
data/ext/numo/narray/numo/types/uint_macro.h +24 -0
data/ext/numo/narray/numo/types/xint_macro.h +5 -25
data/ext/numo/narray/rand.c +2 -29
data/ext/numo/narray/step.c +1 -28
data/ext/numo/narray/struct.c +26 -22
data/lib/numo/narray/extra.rb +50 -1
metadata +2 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 21795de8941d7b8176772c41039e4d8a5b3ae03a51090f6ee2979f257de88d43
-  data.tar.gz: ff44506a3ce17bed9b927c23bf0c92395e1c1c54b8ccfcff9edc0aa6edb833c5
+  metadata.gz: 4081b1facf83501be82b2446c10bddefbf2e60530f1249074b880f7729968b4f
+  data.tar.gz: 19b64cf6e4778f25f60f821a6454cc1b5d2e9dc12f60ecb9f3bfb928155e45db
 SHA512:
-  metadata.gz: 1be858a3b1ba7c5028dea3a034ecc4b70ff9e41711957b8989e58a517f911bf41849c84a0f64ec1beb993416b42675384e3e035179910b724692613f765b10cc
-  data.tar.gz: 7db10683049eab739ee97f3599112ea95566ed223d4e24b020cfe400011ae0d6ad862ed01080c50fe3dd3546c508286b70abd4f0c3a7fdec6a977655e08957ba
+  metadata.gz: a7e32c3a9b208bb23d83c8d4b2f96a7e646e7e9db5bb42c2eea81598b9fdd4072f452f7318f4b9c594152bc9587c596896f1f55d4d41c3cb48ced01b0e764a28
+  data.tar.gz: b2686de4edbcbb35ff180c8a706f23f853740526249ac1f7c9729bd42fb34af5922eca9fd383a8cbea46ecc85280164a9346fbbc2f31fe4839a8bc15c7ccb8c3

data/Rakefile CHANGED

@@ -40,7 +40,13 @@ namespace :build do
       ["git", "clone", "file://#{Dir.pwd}/.git", build_dir],
       ["cd", build_dir],
       ["bundle"],
-      ["rake", "cross", "native", "gem"],
+      [
+        "rake",
+        "RUBY_CC_VERSION=2.5.0:2.4.0:2.3.0:2.2.2:2.1.6",
+        "cross",
+        "native",
+        "gem",
+      ],
     ]
     raw_commands = commands.collect do |command|
       Shellwords.join(command)

data/ext/numo/narray/array.c CHANGED

@@ -106,7 +106,7 @@ static VALUE
         return type;
     default:
-        if (CLASS_OF(v) == rb_const_get( rb_cObject, id_Complex )) {
+        if (rb_obj_class(v) == rb_const_get( rb_cObject, id_Complex )) {
             return NA_DCOMPLEX;
         }
     }
@@ -232,9 +232,9 @@ na_mdai_investigate(na_mdai_t *mdai, int ndim)
             }
             // type
             if (NIL_P(mdai->na_type)) {
-                mdai->na_type = CLASS_OF(v);
+                mdai->na_type = rb_obj_class(v);
             } else {
-                mdai->na_type = na_upcast(CLASS_OF(v), mdai->na_type);
+                mdai->na_type = na_upcast(rb_obj_class(v), mdai->na_type);
             }
         } else {
             mdai->type = na_mdai_object_type(mdai->type, v);
@@ -423,7 +423,7 @@ na_composition3(VALUE obj, VALUE *ptype, VALUE *pshape, VALUE *pnary)
         narray_t *na;
         GetNArray(obj,na);
         ndim = na->ndim;
-        dtype = update_type(ptype, CLASS_OF(obj));
+        dtype = update_type(ptype, rb_obj_class(obj));
         if (pshape) {
             dshape = rb_ary_new2(ndim);
             for (i=0; i<ndim; i++) {
@@ -436,7 +436,7 @@ na_composition3(VALUE obj, VALUE *ptype, VALUE *pshape, VALUE *pnary)
         }
     } else {
         rb_raise(rb_eTypeError,"invalid type for NArray: %s",
-                 rb_class2name(CLASS_OF(obj)));
+                 rb_class2name(rb_obj_class(obj)));
     }
 }
@@ -539,7 +539,7 @@ na_mdai_for_struct(na_mdai_t *mdai, int ndim)
     //fpintf(stderr,"val = ");    rb_p(val);
-    if (CLASS_OF(val) == mdai->na_type) {
+    if (rb_obj_class(val) == mdai->na_type) {
         GetNArray(val,na);
         if ( ndim+na->ndim > mdai->capa ) {
             abort();

data/ext/numo/narray/data.c CHANGED

@@ -57,7 +57,7 @@ static ID id_swap_byte;
 }
 #define m_memcpy(src,dst) memcpy(dst,src,e)
-void
+static void
 iter_copy_bytes(na_loop_t *const lp)
 {
     size_t e;
@@ -206,7 +206,7 @@ check_axis(int axis, int ndim)
     #  [[1, 5],
     #   [3, 7]]]
 */
-VALUE
+static VALUE
 na_swapaxes(VALUE self, VALUE a1, VALUE a2)
 {
     int  i, j, ndim;
@@ -232,7 +232,7 @@ na_swapaxes(VALUE self, VALUE a1, VALUE a2)
     return view;
 }
-VALUE
+static VALUE
 na_transpose_map(VALUE self, int *map)
 {
     int  i, ndim;
@@ -262,7 +262,7 @@ na_transpose_map(VALUE self, int *map)
 #define SWAP(a,b,tmp) {tmp=a;a=b;b=tmp;}
-VALUE
+static VALUE
 na_transpose(int argc, VALUE *argv, VALUE self)
 {
     int ndim, *map, *permute;
@@ -467,7 +467,7 @@ na_flatten_dim(VALUE self, int sd)
     shape[sd] = size;
     // new object
-    view = na_s_allocate_view(CLASS_OF(self));
+    view = na_s_allocate_view(rb_obj_class(self));
     na_copy_flags(self, view);
     GetNArrayView(view, na2);
@@ -591,7 +591,7 @@ na_flatten(VALUE self)
      [10, 11, 12, 3, 14],
      [15, 16, 17, 18, 4]]
  */
-VALUE
+static VALUE
 na_diagonal(int argc, VALUE *argv, VALUE self)
 {
     int  i, k, nd;
@@ -690,7 +690,7 @@ na_diagonal(int argc, VALUE *argv, VALUE self)
     shape[k] = diag_size;
     // new object
-    view = na_s_allocate_view(CLASS_OF(self));
+    view = na_s_allocate_view(rb_obj_class(self));
     na_copy_flags(self, view);
     GetNArrayView(view, na2);
@@ -803,7 +803,7 @@ na_new_dimension_for_dot(VALUE self, int pos, int len, bool transpose)
     GetNArray(self,na);
     nd = na->ndim;
-    view = na_s_allocate_view(CLASS_OF(self));
+    view = na_s_allocate_view(rb_obj_class(self));
     na_copy_flags(self, view);
     GetNArrayView(view, na2);

data/ext/numo/narray/depend.erb CHANGED

@@ -13,14 +13,14 @@ TAGS : $(TAGSRC)
 doc :
 	yard doc *.c types/*.c
-C_TMPL = <%=Dir.glob("gen/tmpl*/*.c").join(" ")%>
+C_TMPL = <%=Dir.glob("#{__dir__}/gen/tmpl*/*.c").join(" ")%>
-COGEN = gen/cogen.rb
-DEPENDS = $(C_TMPL) gen/*.rb
+COGEN = <%= __dir__ %>/gen/cogen.rb
+DEPENDS = $(C_TMPL) <%= __dir__ %>/gen/*.rb
 <%
    type_c = []
-   type_rb = Dir.glob("gen/def/*.rb")
+   type_rb = Dir.glob("#{__dir__}/gen/def/*.rb")
    type_rb.each do |s|
      type_c << c = "types/"+File.basename(s,".rb")+".c"
 %>

data/ext/numo/narray/extconf.rb CHANGED

@@ -2,8 +2,8 @@ require 'rbconfig.rb'
 require 'mkmf'
 require "erb"
-if RUBY_VERSION < "2.0.0"
-  puts "Numo::NArray requires Ruby version 2.0 or later."
+if RUBY_VERSION < "2.1.0"
+  puts "Numo::NArray requires Ruby version 2.1 or later."
   exit(1)
 end

data/ext/numo/narray/gen/cogen.rb CHANGED

@@ -1,5 +1,12 @@
 #! /usr/bin/env ruby
+# Build gems for Windows by using fake RbConfig::CONFIG by rake-compiler.
+fake_path  = File.join(Dir.pwd, 'fake.rb')
+if File.exist? fake_path
+  $:.unshift(Dir.pwd)
+  require 'fake'
+end
 thisdir = File.dirname(__FILE__)
 libpath = File.absolute_path(File.dirname(__FILE__))+"/../../../../lib"
 $LOAD_PATH.unshift libpath
@@ -43,6 +50,12 @@ code = DefLib.new do
   set file_name: $output||""
   set include_files: ["numo/types/#{type_name}.h"]
   set lib_name: "numo_"+type_name
+  if (::RbConfig::CONFIG['target_cpu'] == 'x86_64') or  (::RbConfig::CONFIG['target_cpu'] == 'x64')
+    set is_simd: true
+  else
+    set is_simd: false
+  end
   def_class do
     extend NArrayMethod

data/ext/numo/narray/gen/def/dfloat.rb CHANGED

@@ -5,6 +5,7 @@ set class_name:          "DFloat"
 set class_alias:         "Float64"
 set class_var:           "cT"
 set ctype:               "double"
+set simd_type:           "pd"
 set has_math:            true
 set is_bit:              false

data/ext/numo/narray/gen/def/sfloat.rb CHANGED

@@ -5,6 +5,7 @@ set class_name:          "SFloat"
 set class_alias:         "Float32"
 set class_var:           "cT"
 set ctype:               "float"
+set simd_type:           "ps"
 set has_math:            true
 set is_bit:              false

data/ext/numo/narray/gen/narray_def.rb CHANGED

@@ -228,13 +228,25 @@ class Store < DefMethod
   def extract_data(ptr,pos,x)
     case type_name
     when "Bit"
-      "{BIT_DIGIT b; LOAD_BIT(#{ptr},#{pos},b); x = m_from_real(b);}"
+      "{BIT_DIGIT b; LOAD_BIT(#{ptr},#{pos},b); x = m_from_sint(b);}"
     when "RObject"
       "#{x} = m_num_to_data(*(#{dtype}*)(#{ptr}+#{pos}))"
     when /Complex/
       "{#{dtype} *p = (#{dtype}*)(#{ptr}+#{pos}); #{x} = c_new(REAL(*p),IMAG(*p));}"
-    else
+    when /Float/
       "#{x} = m_from_real(*(#{dtype}*)(#{ptr}+#{pos}))"
+    when /UInt64/
+      "#{x} = m_from_uint64(*(#{dtype}*)(#{ptr}+#{pos}))"
+    when /UInt32/
+      "#{x} = m_from_uint32(*(#{dtype}*)(#{ptr}+#{pos}))"
+    when /Int64/
+      "#{x} = m_from_int64(*(#{dtype}*)(#{ptr}+#{pos}))"
+    when /Int32/
+      "#{x} = m_from_int32(*(#{dtype}*)(#{ptr}+#{pos}))"
+    when /Int/
+      "#{x} = m_from_sint(*(#{dtype}*)(#{ptr}+#{pos}))"
+    else
+      raise "unknown type: #{type_name}"
     end
   end
 end

data/ext/numo/narray/gen/spec.rb CHANGED

@@ -109,14 +109,14 @@ def_method "store" do
   end
   store_from "DFloat","double",   "m_from_real"
   store_from "SFloat","float",    "m_from_real"
-  store_from "Int64", "int64_t",  "m_from_real"
-  store_from "Int32", "int32_t",  "m_from_real"
-  store_from "Int16", "int16_t",  "m_from_real"
-  store_from "Int8",  "int8_t",   "m_from_real"
-  store_from "UInt64","u_int64_t","m_from_real"
-  store_from "UInt32","u_int32_t","m_from_real"
-  store_from "UInt16","u_int16_t","m_from_real"
-  store_from "UInt8", "u_int8_t", "m_from_real"
+  store_from "Int64", "int64_t",  "m_from_int64"
+  store_from "Int32", "int32_t",  "m_from_int32"
+  store_from "Int16", "int16_t",  "m_from_sint"
+  store_from "Int8",  "int8_t",   "m_from_sint"
+  store_from "UInt64","u_int64_t","m_from_uint64"
+  store_from "UInt32","u_int32_t","m_from_uint32"
+  store_from "UInt16","u_int16_t","m_from_sint"
+  store_from "UInt8", "u_int8_t", "m_from_sint"
   store_from "RObject", "VALUE",  "m_num_to_data"
   store_array
 end
@@ -273,8 +273,19 @@ if is_float
   cond_unary "isfinite"
 end
-accum "sum","dtype","cT"
-accum "prod","dtype","cT"
+if is_int
+  if is_unsigned
+    accum "sum","u_int64_t","numo_cUInt64"
+    accum "prod","u_int64_t","numo_cUInt64"
+  else
+    accum "sum","int64_t","numo_cInt64"
+    accum "prod","int64_t","numo_cInt64"
+  end
+else
+  accum "sum","dtype","cT"
+  accum "prod","dtype","cT"
+end
 if is_double_precision
   accum "kahan_sum","dtype","cT"
 end
@@ -353,6 +364,8 @@ if has_math
 fn = get(:full_class_name)
 cn = get(:class_name)
 nm = get(:name)
+st = get(:simd_type)
+dp = get(:is_double_precision)
 is_c = is_complex
 def_module do
@@ -363,6 +376,9 @@ def_module do
   set full_module_name: fn+"::NMath"
   set module_name: "Math"
   set module_var: "mTM"
+  set simd_type: st
+  set is_double_precision: dp
+  set is_complex: is_c
   math "sqrt"
   math "cbrt"

data/ext/numo/narray/gen/tmpl/accum_binary.c CHANGED

@@ -89,7 +89,7 @@ static VALUE
     //<% if is_object %>
     return <%=c_func%>_self(argc, argv, self);
     //<% else %>
-    klass = na_upcast(CLASS_OF(self),CLASS_OF(argv[0]));
+    klass = na_upcast(rb_obj_class(self),rb_obj_class(argv[0]));
     if (klass==cT) {
         return <%=c_func%>_self(argc, argv, self);
     } else {

data/ext/numo/narray/gen/tmpl/accum_index.c CHANGED

@@ -31,7 +31,17 @@ static void
   @param [Numeric,Array,Range] axis  Affected dimensions.
   @return [Integer,Numo::Int] returns result index of <%=name%>.
   @example
-      Numo::NArray[3,4,1,2].min_index => 3
+  <% if name == 'min_index' %>
+      Numo::NArray[3,4,1,2].min_index => 2
+      Numo::NArray[[3,4,1],[2,0,5]].min_index => 4
+      Numo::NArray[[3,4,1],[2,0,5]].min_index(axis: 1) => [2, 4]
+      Numo::NArray[[3,4,1],[2,0,5]].min_index(axis: 0) => [3, 4, 2]
+  <% else %>
+      Numo::NArray[3,4,1,2].max_index => 1
+      Numo::NArray[[3,4,1],[2,0,5]].max_index => 5
+      Numo::NArray[[3,4,1],[2,0,5]].max_index(axis: 1) => [1, 5]
+      Numo::NArray[[3,4,1],[2,0,5]].max_index(axis: 0) => [0, 1, 5]
+  <% end %>
  */
 static VALUE
 <%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)

data/ext/numo/narray/gen/tmpl/alloc_func.c CHANGED

@@ -70,7 +70,7 @@ static void
     }
 }
-const rb_data_type_t <%=type_name%>_data_type = {
+static const rb_data_type_t <%=type_name%>_data_type = {
     "<%=full_class_name%>",
     {<%=type_name%>_gc_mark, <%=type_name%>_free, <%=type_name%>_memsize,},
     &na_data_type,
@@ -80,7 +80,7 @@ const rb_data_type_t <%=type_name%>_data_type = {
 <% else %>
-const rb_data_type_t <%=type_name%>_data_type = {
+static const rb_data_type_t <%=type_name%>_data_type = {
     "<%=full_class_name%>",
     {0, <%=type_name%>_free, <%=type_name%>_memsize,},
     &na_data_type,
@@ -90,7 +90,7 @@ const rb_data_type_t <%=type_name%>_data_type = {
 <% end %>
-VALUE
+static VALUE
 <%=c_func(0)%>(VALUE klass)
 {
     narray_data_t *na = ALLOC(narray_data_t);

data/ext/numo/narray/gen/tmpl/binary.c CHANGED

@@ -11,10 +11,24 @@
 static void
 <%=c_iter%>(na_loop_t *const lp)
 {
-    size_t   i, n;
+    size_t   i=0;
+    size_t   n;
     char    *p1, *p2, *p3;
     ssize_t  s1, s2, s3;
+<% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
+    size_t cnt;
+    size_t cnt_simd_loop = -1;
+  <% if is_double_precision %>
+    __m128d a;
+    __m128d b;
+  <% else %>
+    __m128 a;
+    __m128 b;
+  <% end %>
+    size_t num_pack; // Number of elements packed for SIMD.
+    num_pack = SIMD_ALIGNMENT_SIZE / sizeof(dtype);
+<% end %>
     INIT_COUNTER(lp, n);
     INIT_PTR(lp, 0, p1, s1);
     INIT_PTR(lp, 1, p2, s2);
@@ -28,24 +42,149 @@ static void
         if (s1 == sizeof(dtype) &&
             s2 == sizeof(dtype) &&
             s3 == sizeof(dtype) ) {
+<% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
+            // Check number of elements. & Check same alignment.
+            if ((n >= num_pack) && is_same_aligned3(&((dtype*)p1)[i], &((dtype*)p2)[i], &((dtype*)p3)[i], SIMD_ALIGNMENT_SIZE)){
+                // Calculate up to the position just before the start of SIMD computation.
+                cnt = get_count_of_elements_not_aligned_to_simd_size(&((dtype*)p1)[i], SIMD_ALIGNMENT_SIZE, sizeof(dtype));
+                if (p1 == p3) { // inplace case
+                    for (; i < cnt; i++) {
+                        ((dtype*)p1)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
+                    }
+                } else {
+                    for (; i < cnt; i++) {
+                        ((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
+                    }
+                }
-            for (i=0; i<n; i++) {
-                check_intdivzero(*(dtype*)p2);
-                ((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
+                // Get the count of SIMD computation loops.
+                cnt_simd_loop = (n - i) % num_pack;
+                // SIMD computation.
+                if (p1 == p3) { // inplace case
+                    for(; i < n - cnt_simd_loop; i += num_pack){
+                        a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]);
+                        b = _mm_load_<%=simd_type%>(&((dtype*)p2)[i]);
+                        a = _mm_<%=name%>_<%=simd_type%>(a, b);
+                        _mm_store_<%=simd_type%>(&((dtype*)p1)[i], a);
+                    }
+                } else {
+                    for(; i < n - cnt_simd_loop; i += num_pack){
+                        a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]);
+                        b = _mm_load_<%=simd_type%>(&((dtype*)p2)[i]);
+                        a = _mm_<%=name%>_<%=simd_type%>(a, b);
+                        _mm_stream_<%=simd_type%>(&((dtype*)p3)[i], a);
+                    }
+                }
+            }
+            // Compute the remainder of the SIMD operation.
+            if (cnt_simd_loop != 0){
+<% end %>
+                if (p1 == p3) { // inplace case
+                    for (; i<n; i++) {
+                        check_intdivzero(((dtype*)p2)[i]);
+                        ((dtype*)p1)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
+                    }
+                } else {
+                    for (; i<n; i++) {
+                        check_intdivzero(((dtype*)p2)[i]);
+                        ((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
+                    }
+                }
+<% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
             }
+<% end %>
             return;
         }
         if (is_aligned_step(s1,sizeof(dtype)) &&
             is_aligned_step(s2,sizeof(dtype)) &&
             is_aligned_step(s3,sizeof(dtype)) ) {
             //<% end %>
-            for (i=0; i<n; i++) {
+            if (s2 == 0){ // Broadcasting from scalar value.
                 check_intdivzero(*(dtype*)p2);
-                *(dtype*)p3 = m_<%=name%>(*(dtype*)p1,*(dtype*)p2);
-                p1 += s1;
-                p2 += s2;
-                p3 += s3;
+                if (s1 == sizeof(dtype) &&
+                    s3 == sizeof(dtype) ) {
+<% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
+                    // Broadcast a scalar value and use it for SIMD computation.
+                    b = _mm_load1_<%=simd_type%>(&((dtype*)p2)[0]);
+                    // Check number of elements. & Check same alignment.
+                    if ((n >= num_pack) && is_same_aligned2(&((dtype*)p1)[i], &((dtype*)p3)[i], SIMD_ALIGNMENT_SIZE)){
+                        // Calculate up to the position just before the start of SIMD computation.
+                        cnt = get_count_of_elements_not_aligned_to_simd_size(&((dtype*)p1)[i], SIMD_ALIGNMENT_SIZE, sizeof(dtype));
+                        if (p1 == p3) { // inplace case
+                            for (; i < cnt; i++) {
+                                ((dtype*)p1)[i] = m_<%=name%>(((dtype*)p1)[i],*(dtype*)p2);
+                            }
+                        } else {
+                            for (; i < cnt; i++) {
+                                ((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],*(dtype*)p2);
+                            }
+                        }
+                        // Get the count of SIMD computation loops.
+                        cnt_simd_loop = (n - i) % num_pack;
+                        // SIMD computation.
+                        if (p1 == p3) { // inplace case
+                            for(; i < n - cnt_simd_loop; i += num_pack){
+                                a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]);
+                                a = _mm_<%=name%>_<%=simd_type%>(a, b);
+                                _mm_store_<%=simd_type%>(&((dtype*)p1)[i], a);
+                            }
+                        } else {
+                            for(; i < n - cnt_simd_loop; i += num_pack){
+                                a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]);
+                                a = _mm_<%=name%>_<%=simd_type%>(a, b);
+                                _mm_stream_<%=simd_type%>(&((dtype*)p3)[i], a);
+                            }
+                        }
+                    }
+                    // Compute the remainder of the SIMD operation.
+                    if (cnt_simd_loop != 0){
+<% end %>
+                        if (p1 == p3) { // inplace case
+                            for (; i<n; i++) {
+                                ((dtype*)p1)[i] = m_<%=name%>(((dtype*)p1)[i],*(dtype*)p2);
+                            }
+                        } else {
+                            for (; i<n; i++) {
+                                ((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],*(dtype*)p2);
+                            }
+                        }
+<% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
+                    }
+<% end %>
+                } else {
+                    for (i=0; i<n; i++) {
+                        *(dtype*)p3 = m_<%=name%>(*(dtype*)p1,*(dtype*)p2);
+                        p1 += s1;
+                        p3 += s3;
+                    }
+                }
+            } else {
+                if (p1 == p3) { // inplace case
+                    for (i=0; i<n; i++) {
+                        check_intdivzero(*(dtype*)p2);
+                        *(dtype*)p1 = m_<%=name%>(*(dtype*)p1,*(dtype*)p2);
+                        p1 += s1;
+                        p2 += s2;
+                    }
+                } else {
+                    for (i=0; i<n; i++) {
+                        check_intdivzero(*(dtype*)p2);
+                        *(dtype*)p3 = m_<%=name%>(*(dtype*)p1,*(dtype*)p2);
+                        p1 += s1;
+                        p2 += s2;
+                        p3 += s3;
+                    }
+                }
             }
             return;
             //<% if need_align %>
         }
@@ -86,7 +225,7 @@ static VALUE
     <% else %>
     VALUE klass, v;
-    klass = na_upcast(CLASS_OF(self),CLASS_OF(other));
+    klass = na_upcast(rb_obj_class(self),rb_obj_class(other));
     if (klass==cT) {
         return <%=c_func%>_self(self, other);
     } else {