numo-narray 0.9.1.2 → 0.9.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. checksums.yaml +4 -4
  2. data/Rakefile +7 -1
  3. data/ext/numo/narray/array.c +6 -6
  4. data/ext/numo/narray/data.c +8 -8
  5. data/ext/numo/narray/depend.erb +4 -4
  6. data/ext/numo/narray/extconf.rb +2 -2
  7. data/ext/numo/narray/gen/cogen.rb +13 -0
  8. data/ext/numo/narray/gen/def/dfloat.rb +1 -0
  9. data/ext/numo/narray/gen/def/sfloat.rb +1 -0
  10. data/ext/numo/narray/gen/narray_def.rb +14 -2
  11. data/ext/numo/narray/gen/spec.rb +26 -10
  12. data/ext/numo/narray/gen/tmpl/accum_binary.c +1 -1
  13. data/ext/numo/narray/gen/tmpl/accum_index.c +11 -1
  14. data/ext/numo/narray/gen/tmpl/alloc_func.c +3 -3
  15. data/ext/numo/narray/gen/tmpl/binary.c +149 -10
  16. data/ext/numo/narray/gen/tmpl/binary2.c +1 -1
  17. data/ext/numo/narray/gen/tmpl/bincount.c +1 -1
  18. data/ext/numo/narray/gen/tmpl/cast.c +1 -1
  19. data/ext/numo/narray/gen/tmpl/cond_binary.c +1 -1
  20. data/ext/numo/narray/gen/tmpl/each.c +1 -1
  21. data/ext/numo/narray/gen/tmpl/each_with_index.c +1 -1
  22. data/ext/numo/narray/gen/tmpl/extract_data.c +3 -3
  23. data/ext/numo/narray/gen/tmpl/inspect.c +1 -1
  24. data/ext/numo/narray/gen/tmpl/lib.c +5 -0
  25. data/ext/numo/narray/gen/tmpl/map_with_index.c +1 -1
  26. data/ext/numo/narray/gen/tmpl/median.c +3 -2
  27. data/ext/numo/narray/gen/tmpl/pow.c +1 -1
  28. data/ext/numo/narray/gen/tmpl/qsort.c +118 -56
  29. data/ext/numo/narray/gen/tmpl/store.c +4 -4
  30. data/ext/numo/narray/gen/tmpl/store_bit.c +4 -4
  31. data/ext/numo/narray/gen/tmpl/to_a.c +1 -1
  32. data/ext/numo/narray/gen/tmpl/unary_s.c +55 -9
  33. data/ext/numo/narray/gen/tmpl_bit/each.c +1 -1
  34. data/ext/numo/narray/gen/tmpl_bit/each_with_index.c +1 -1
  35. data/ext/numo/narray/gen/tmpl_bit/inspect.c +1 -1
  36. data/ext/numo/narray/gen/tmpl_bit/mask.c +1 -1
  37. data/ext/numo/narray/gen/tmpl_bit/to_a.c +1 -1
  38. data/ext/numo/narray/index.c +64 -37
  39. data/ext/numo/narray/math.c +4 -4
  40. data/ext/numo/narray/narray.c +54 -29
  41. data/ext/numo/narray/ndloop.c +7 -7
  42. data/ext/numo/narray/numo/narray.h +9 -2
  43. data/ext/numo/narray/numo/template.h +18 -0
  44. data/ext/numo/narray/numo/types/bit.h +5 -0
  45. data/ext/numo/narray/numo/types/complex_macro.h +5 -0
  46. data/ext/numo/narray/numo/types/float_macro.h +5 -0
  47. data/ext/numo/narray/numo/types/int_macro.h +24 -0
  48. data/ext/numo/narray/numo/types/robj_macro.h +5 -0
  49. data/ext/numo/narray/numo/types/uint_macro.h +24 -0
  50. data/ext/numo/narray/numo/types/xint_macro.h +5 -25
  51. data/ext/numo/narray/rand.c +2 -29
  52. data/ext/numo/narray/step.c +1 -28
  53. data/ext/numo/narray/struct.c +26 -22
  54. data/lib/numo/narray/extra.rb +50 -1
  55. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 21795de8941d7b8176772c41039e4d8a5b3ae03a51090f6ee2979f257de88d43
4
- data.tar.gz: ff44506a3ce17bed9b927c23bf0c92395e1c1c54b8ccfcff9edc0aa6edb833c5
3
+ metadata.gz: 4081b1facf83501be82b2446c10bddefbf2e60530f1249074b880f7729968b4f
4
+ data.tar.gz: 19b64cf6e4778f25f60f821a6454cc1b5d2e9dc12f60ecb9f3bfb928155e45db
5
5
  SHA512:
6
- metadata.gz: 1be858a3b1ba7c5028dea3a034ecc4b70ff9e41711957b8989e58a517f911bf41849c84a0f64ec1beb993416b42675384e3e035179910b724692613f765b10cc
7
- data.tar.gz: 7db10683049eab739ee97f3599112ea95566ed223d4e24b020cfe400011ae0d6ad862ed01080c50fe3dd3546c508286b70abd4f0c3a7fdec6a977655e08957ba
6
+ metadata.gz: a7e32c3a9b208bb23d83c8d4b2f96a7e646e7e9db5bb42c2eea81598b9fdd4072f452f7318f4b9c594152bc9587c596896f1f55d4d41c3cb48ced01b0e764a28
7
+ data.tar.gz: b2686de4edbcbb35ff180c8a706f23f853740526249ac1f7c9729bd42fb34af5922eca9fd383a8cbea46ecc85280164a9346fbbc2f31fe4839a8bc15c7ccb8c3
data/Rakefile CHANGED
@@ -40,7 +40,13 @@ namespace :build do
40
40
  ["git", "clone", "file://#{Dir.pwd}/.git", build_dir],
41
41
  ["cd", build_dir],
42
42
  ["bundle"],
43
- ["rake", "cross", "native", "gem"],
43
+ [
44
+ "rake",
45
+ "RUBY_CC_VERSION=2.5.0:2.4.0:2.3.0:2.2.2:2.1.6",
46
+ "cross",
47
+ "native",
48
+ "gem",
49
+ ],
44
50
  ]
45
51
  raw_commands = commands.collect do |command|
46
52
  Shellwords.join(command)
data/ext/numo/narray/array.c CHANGED
@@ -106,7 +106,7 @@ static VALUE
106
106
  return type;
107
107
 
108
108
  default:
109
- if (CLASS_OF(v) == rb_const_get( rb_cObject, id_Complex )) {
109
+ if (rb_obj_class(v) == rb_const_get( rb_cObject, id_Complex )) {
110
110
  return NA_DCOMPLEX;
111
111
  }
112
112
  }
@@ -232,9 +232,9 @@ na_mdai_investigate(na_mdai_t *mdai, int ndim)
232
232
  }
233
233
  // type
234
234
  if (NIL_P(mdai->na_type)) {
235
- mdai->na_type = CLASS_OF(v);
235
+ mdai->na_type = rb_obj_class(v);
236
236
  } else {
237
- mdai->na_type = na_upcast(CLASS_OF(v), mdai->na_type);
237
+ mdai->na_type = na_upcast(rb_obj_class(v), mdai->na_type);
238
238
  }
239
239
  } else {
240
240
  mdai->type = na_mdai_object_type(mdai->type, v);
@@ -423,7 +423,7 @@ na_composition3(VALUE obj, VALUE *ptype, VALUE *pshape, VALUE *pnary)
423
423
  narray_t *na;
424
424
  GetNArray(obj,na);
425
425
  ndim = na->ndim;
426
- dtype = update_type(ptype, CLASS_OF(obj));
426
+ dtype = update_type(ptype, rb_obj_class(obj));
427
427
  if (pshape) {
428
428
  dshape = rb_ary_new2(ndim);
429
429
  for (i=0; i<ndim; i++) {
@@ -436,7 +436,7 @@ na_composition3(VALUE obj, VALUE *ptype, VALUE *pshape, VALUE *pnary)
436
436
  }
437
437
  } else {
438
438
  rb_raise(rb_eTypeError,"invalid type for NArray: %s",
439
- rb_class2name(CLASS_OF(obj)));
439
+ rb_class2name(rb_obj_class(obj)));
440
440
  }
441
441
  }
442
442
 
@@ -539,7 +539,7 @@ na_mdai_for_struct(na_mdai_t *mdai, int ndim)
539
539
 
540
540
  //fpintf(stderr,"val = "); rb_p(val);
541
541
 
542
- if (CLASS_OF(val) == mdai->na_type) {
542
+ if (rb_obj_class(val) == mdai->na_type) {
543
543
  GetNArray(val,na);
544
544
  if ( ndim+na->ndim > mdai->capa ) {
545
545
  abort();
data/ext/numo/narray/data.c CHANGED
@@ -57,7 +57,7 @@ static ID id_swap_byte;
57
57
  }
58
58
 
59
59
  #define m_memcpy(src,dst) memcpy(dst,src,e)
60
- void
60
+ static void
61
61
  iter_copy_bytes(na_loop_t *const lp)
62
62
  {
63
63
  size_t e;
@@ -206,7 +206,7 @@ check_axis(int axis, int ndim)
206
206
  # [[1, 5],
207
207
  # [3, 7]]]
208
208
  */
209
- VALUE
209
+ static VALUE
210
210
  na_swapaxes(VALUE self, VALUE a1, VALUE a2)
211
211
  {
212
212
  int i, j, ndim;
@@ -232,7 +232,7 @@ na_swapaxes(VALUE self, VALUE a1, VALUE a2)
232
232
  return view;
233
233
  }
234
234
 
235
- VALUE
235
+ static VALUE
236
236
  na_transpose_map(VALUE self, int *map)
237
237
  {
238
238
  int i, ndim;
@@ -262,7 +262,7 @@ na_transpose_map(VALUE self, int *map)
262
262
 
263
263
  #define SWAP(a,b,tmp) {tmp=a;a=b;b=tmp;}
264
264
 
265
- VALUE
265
+ static VALUE
266
266
  na_transpose(int argc, VALUE *argv, VALUE self)
267
267
  {
268
268
  int ndim, *map, *permute;
@@ -467,7 +467,7 @@ na_flatten_dim(VALUE self, int sd)
467
467
  shape[sd] = size;
468
468
 
469
469
  // new object
470
- view = na_s_allocate_view(CLASS_OF(self));
470
+ view = na_s_allocate_view(rb_obj_class(self));
471
471
  na_copy_flags(self, view);
472
472
  GetNArrayView(view, na2);
473
473
 
@@ -591,7 +591,7 @@ na_flatten(VALUE self)
591
591
  [10, 11, 12, 3, 14],
592
592
  [15, 16, 17, 18, 4]]
593
593
  */
594
- VALUE
594
+ static VALUE
595
595
  na_diagonal(int argc, VALUE *argv, VALUE self)
596
596
  {
597
597
  int i, k, nd;
@@ -690,7 +690,7 @@ na_diagonal(int argc, VALUE *argv, VALUE self)
690
690
  shape[k] = diag_size;
691
691
 
692
692
  // new object
693
- view = na_s_allocate_view(CLASS_OF(self));
693
+ view = na_s_allocate_view(rb_obj_class(self));
694
694
  na_copy_flags(self, view);
695
695
  GetNArrayView(view, na2);
696
696
 
@@ -803,7 +803,7 @@ na_new_dimension_for_dot(VALUE self, int pos, int len, bool transpose)
803
803
  GetNArray(self,na);
804
804
  nd = na->ndim;
805
805
 
806
- view = na_s_allocate_view(CLASS_OF(self));
806
+ view = na_s_allocate_view(rb_obj_class(self));
807
807
 
808
808
  na_copy_flags(self, view);
809
809
  GetNArrayView(view, na2);
data/ext/numo/narray/depend.erb CHANGED
@@ -13,14 +13,14 @@ TAGS : $(TAGSRC)
13
13
  doc :
14
14
  yard doc *.c types/*.c
15
15
 
16
- C_TMPL = <%=Dir.glob("gen/tmpl*/*.c").join(" ")%>
16
+ C_TMPL = <%=Dir.glob("#{__dir__}/gen/tmpl*/*.c").join(" ")%>
17
17
 
18
- COGEN = gen/cogen.rb
19
- DEPENDS = $(C_TMPL) gen/*.rb
18
+ COGEN = <%= __dir__ %>/gen/cogen.rb
19
+ DEPENDS = $(C_TMPL) <%= __dir__ %>/gen/*.rb
20
20
 
21
21
  <%
22
22
  type_c = []
23
- type_rb = Dir.glob("gen/def/*.rb")
23
+ type_rb = Dir.glob("#{__dir__}/gen/def/*.rb")
24
24
  type_rb.each do |s|
25
25
  type_c << c = "types/"+File.basename(s,".rb")+".c"
26
26
  %>
data/ext/numo/narray/extconf.rb CHANGED
@@ -2,8 +2,8 @@ require 'rbconfig.rb'
2
2
  require 'mkmf'
3
3
  require "erb"
4
4
 
5
- if RUBY_VERSION < "2.0.0"
6
- puts "Numo::NArray requires Ruby version 2.0 or later."
5
+ if RUBY_VERSION < "2.1.0"
6
+ puts "Numo::NArray requires Ruby version 2.1 or later."
7
7
  exit(1)
8
8
  end
9
9
 
data/ext/numo/narray/gen/cogen.rb CHANGED
@@ -1,5 +1,12 @@
1
1
  #! /usr/bin/env ruby
2
2
 
3
+ # Build gems for Windows by using fake RbConfig::CONFIG by rake-compiler.
4
+ fake_path = File.join(Dir.pwd, 'fake.rb')
5
+ if File.exist? fake_path
6
+ $:.unshift(Dir.pwd)
7
+ require 'fake'
8
+ end
9
+
3
10
  thisdir = File.dirname(__FILE__)
4
11
  libpath = File.absolute_path(File.dirname(__FILE__))+"/../../../../lib"
5
12
  $LOAD_PATH.unshift libpath
@@ -43,6 +50,12 @@ code = DefLib.new do
43
50
  set file_name: $output||""
44
51
  set include_files: ["numo/types/#{type_name}.h"]
45
52
  set lib_name: "numo_"+type_name
53
+
54
+ if (::RbConfig::CONFIG['target_cpu'] == 'x86_64') or (::RbConfig::CONFIG['target_cpu'] == 'x64')
55
+ set is_simd: true
56
+ else
57
+ set is_simd: false
58
+ end
46
59
 
47
60
  def_class do
48
61
  extend NArrayMethod
data/ext/numo/narray/gen/def/dfloat.rb CHANGED
@@ -5,6 +5,7 @@ set class_name: "DFloat"
5
5
  set class_alias: "Float64"
6
6
  set class_var: "cT"
7
7
  set ctype: "double"
8
+ set simd_type: "pd"
8
9
 
9
10
  set has_math: true
10
11
  set is_bit: false
data/ext/numo/narray/gen/def/sfloat.rb CHANGED
@@ -5,6 +5,7 @@ set class_name: "SFloat"
5
5
  set class_alias: "Float32"
6
6
  set class_var: "cT"
7
7
  set ctype: "float"
8
+ set simd_type: "ps"
8
9
 
9
10
  set has_math: true
10
11
  set is_bit: false
data/ext/numo/narray/gen/narray_def.rb CHANGED
@@ -228,13 +228,25 @@ class Store < DefMethod
228
228
  def extract_data(ptr,pos,x)
229
229
  case type_name
230
230
  when "Bit"
231
- "{BIT_DIGIT b; LOAD_BIT(#{ptr},#{pos},b); x = m_from_real(b);}"
231
+ "{BIT_DIGIT b; LOAD_BIT(#{ptr},#{pos},b); x = m_from_sint(b);}"
232
232
  when "RObject"
233
233
  "#{x} = m_num_to_data(*(#{dtype}*)(#{ptr}+#{pos}))"
234
234
  when /Complex/
235
235
  "{#{dtype} *p = (#{dtype}*)(#{ptr}+#{pos}); #{x} = c_new(REAL(*p),IMAG(*p));}"
236
- else
236
+ when /Float/
237
237
  "#{x} = m_from_real(*(#{dtype}*)(#{ptr}+#{pos}))"
238
+ when /UInt64/
239
+ "#{x} = m_from_uint64(*(#{dtype}*)(#{ptr}+#{pos}))"
240
+ when /UInt32/
241
+ "#{x} = m_from_uint32(*(#{dtype}*)(#{ptr}+#{pos}))"
242
+ when /Int64/
243
+ "#{x} = m_from_int64(*(#{dtype}*)(#{ptr}+#{pos}))"
244
+ when /Int32/
245
+ "#{x} = m_from_int32(*(#{dtype}*)(#{ptr}+#{pos}))"
246
+ when /Int/
247
+ "#{x} = m_from_sint(*(#{dtype}*)(#{ptr}+#{pos}))"
248
+ else
249
+ raise "unknown type: #{type_name}"
238
250
  end
239
251
  end
240
252
  end
data/ext/numo/narray/gen/spec.rb CHANGED
@@ -109,14 +109,14 @@ def_method "store" do
109
109
  end
110
110
  store_from "DFloat","double", "m_from_real"
111
111
  store_from "SFloat","float", "m_from_real"
112
- store_from "Int64", "int64_t", "m_from_real"
113
- store_from "Int32", "int32_t", "m_from_real"
114
- store_from "Int16", "int16_t", "m_from_real"
115
- store_from "Int8", "int8_t", "m_from_real"
116
- store_from "UInt64","u_int64_t","m_from_real"
117
- store_from "UInt32","u_int32_t","m_from_real"
118
- store_from "UInt16","u_int16_t","m_from_real"
119
- store_from "UInt8", "u_int8_t", "m_from_real"
112
+ store_from "Int64", "int64_t", "m_from_int64"
113
+ store_from "Int32", "int32_t", "m_from_int32"
114
+ store_from "Int16", "int16_t", "m_from_sint"
115
+ store_from "Int8", "int8_t", "m_from_sint"
116
+ store_from "UInt64","u_int64_t","m_from_uint64"
117
+ store_from "UInt32","u_int32_t","m_from_uint32"
118
+ store_from "UInt16","u_int16_t","m_from_sint"
119
+ store_from "UInt8", "u_int8_t", "m_from_sint"
120
120
  store_from "RObject", "VALUE", "m_num_to_data"
121
121
  store_array
122
122
  end
@@ -273,8 +273,19 @@ if is_float
273
273
  cond_unary "isfinite"
274
274
  end
275
275
 
276
- accum "sum","dtype","cT"
277
- accum "prod","dtype","cT"
276
+ if is_int
277
+ if is_unsigned
278
+ accum "sum","u_int64_t","numo_cUInt64"
279
+ accum "prod","u_int64_t","numo_cUInt64"
280
+ else
281
+ accum "sum","int64_t","numo_cInt64"
282
+ accum "prod","int64_t","numo_cInt64"
283
+ end
284
+ else
285
+ accum "sum","dtype","cT"
286
+ accum "prod","dtype","cT"
287
+ end
288
+
278
289
  if is_double_precision
279
290
  accum "kahan_sum","dtype","cT"
280
291
  end
@@ -353,6 +364,8 @@ if has_math
353
364
  fn = get(:full_class_name)
354
365
  cn = get(:class_name)
355
366
  nm = get(:name)
367
+ st = get(:simd_type)
368
+ dp = get(:is_double_precision)
356
369
  is_c = is_complex
357
370
 
358
371
  def_module do
@@ -363,6 +376,9 @@ def_module do
363
376
  set full_module_name: fn+"::NMath"
364
377
  set module_name: "Math"
365
378
  set module_var: "mTM"
379
+ set simd_type: st
380
+ set is_double_precision: dp
381
+ set is_complex: is_c
366
382
 
367
383
  math "sqrt"
368
384
  math "cbrt"
data/ext/numo/narray/gen/tmpl/accum_binary.c CHANGED
@@ -89,7 +89,7 @@ static VALUE
89
89
  //<% if is_object %>
90
90
  return <%=c_func%>_self(argc, argv, self);
91
91
  //<% else %>
92
- klass = na_upcast(CLASS_OF(self),CLASS_OF(argv[0]));
92
+ klass = na_upcast(rb_obj_class(self),rb_obj_class(argv[0]));
93
93
  if (klass==cT) {
94
94
  return <%=c_func%>_self(argc, argv, self);
95
95
  } else {
data/ext/numo/narray/gen/tmpl/accum_index.c CHANGED
@@ -31,7 +31,17 @@ static void
31
31
  @param [Numeric,Array,Range] axis Affected dimensions.
32
32
  @return [Integer,Numo::Int] returns result index of <%=name%>.
33
33
  @example
34
- Numo::NArray[3,4,1,2].min_index => 3
34
+ <% if name == 'min_index' %>
35
+ Numo::NArray[3,4,1,2].min_index => 2
36
+ Numo::NArray[[3,4,1],[2,0,5]].min_index => 4
37
+ Numo::NArray[[3,4,1],[2,0,5]].min_index(axis: 1) => [2, 4]
38
+ Numo::NArray[[3,4,1],[2,0,5]].min_index(axis: 0) => [3, 4, 2]
39
+ <% else %>
40
+ Numo::NArray[3,4,1,2].max_index => 1
41
+ Numo::NArray[[3,4,1],[2,0,5]].max_index => 5
42
+ Numo::NArray[[3,4,1],[2,0,5]].max_index(axis: 1) => [1, 5]
43
+ Numo::NArray[[3,4,1],[2,0,5]].max_index(axis: 0) => [0, 1, 5]
44
+ <% end %>
35
45
  */
36
46
  static VALUE
37
47
  <%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
data/ext/numo/narray/gen/tmpl/alloc_func.c CHANGED
@@ -70,7 +70,7 @@ static void
70
70
  }
71
71
  }
72
72
 
73
- const rb_data_type_t <%=type_name%>_data_type = {
73
+ static const rb_data_type_t <%=type_name%>_data_type = {
74
74
  "<%=full_class_name%>",
75
75
  {<%=type_name%>_gc_mark, <%=type_name%>_free, <%=type_name%>_memsize,},
76
76
  &na_data_type,
@@ -80,7 +80,7 @@ const rb_data_type_t <%=type_name%>_data_type = {
80
80
 
81
81
  <% else %>
82
82
 
83
- const rb_data_type_t <%=type_name%>_data_type = {
83
+ static const rb_data_type_t <%=type_name%>_data_type = {
84
84
  "<%=full_class_name%>",
85
85
  {0, <%=type_name%>_free, <%=type_name%>_memsize,},
86
86
  &na_data_type,
@@ -90,7 +90,7 @@ const rb_data_type_t <%=type_name%>_data_type = {
90
90
 
91
91
  <% end %>
92
92
 
93
- VALUE
93
+ static VALUE
94
94
  <%=c_func(0)%>(VALUE klass)
95
95
  {
96
96
  narray_data_t *na = ALLOC(narray_data_t);
data/ext/numo/narray/gen/tmpl/binary.c CHANGED
@@ -11,10 +11,24 @@
11
11
  static void
12
12
  <%=c_iter%>(na_loop_t *const lp)
13
13
  {
14
- size_t i, n;
14
+ size_t i=0;
15
+ size_t n;
15
16
  char *p1, *p2, *p3;
16
17
  ssize_t s1, s2, s3;
17
18
 
19
+ <% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
20
+ size_t cnt;
21
+ size_t cnt_simd_loop = -1;
22
+ <% if is_double_precision %>
23
+ __m128d a;
24
+ __m128d b;
25
+ <% else %>
26
+ __m128 a;
27
+ __m128 b;
28
+ <% end %>
29
+ size_t num_pack; // Number of elements packed for SIMD.
30
+ num_pack = SIMD_ALIGNMENT_SIZE / sizeof(dtype);
31
+ <% end %>
18
32
  INIT_COUNTER(lp, n);
19
33
  INIT_PTR(lp, 0, p1, s1);
20
34
  INIT_PTR(lp, 1, p2, s2);
@@ -28,24 +42,149 @@ static void
28
42
  if (s1 == sizeof(dtype) &&
29
43
  s2 == sizeof(dtype) &&
30
44
  s3 == sizeof(dtype) ) {
45
+ <% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
46
+ // Check number of elements. & Check same alignment.
47
+ if ((n >= num_pack) && is_same_aligned3(&((dtype*)p1)[i], &((dtype*)p2)[i], &((dtype*)p3)[i], SIMD_ALIGNMENT_SIZE)){
48
+ // Calculate up to the position just before the start of SIMD computation.
49
+ cnt = get_count_of_elements_not_aligned_to_simd_size(&((dtype*)p1)[i], SIMD_ALIGNMENT_SIZE, sizeof(dtype));
50
+ if (p1 == p3) { // inplace case
51
+ for (; i < cnt; i++) {
52
+ ((dtype*)p1)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
53
+ }
54
+ } else {
55
+ for (; i < cnt; i++) {
56
+ ((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
57
+ }
58
+ }
31
59
 
32
- for (i=0; i<n; i++) {
33
- check_intdivzero(*(dtype*)p2);
34
- ((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
60
+ // Get the count of SIMD computation loops.
61
+ cnt_simd_loop = (n - i) % num_pack;
62
+
63
+ // SIMD computation.
64
+ if (p1 == p3) { // inplace case
65
+ for(; i < n - cnt_simd_loop; i += num_pack){
66
+ a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]);
67
+ b = _mm_load_<%=simd_type%>(&((dtype*)p2)[i]);
68
+ a = _mm_<%=name%>_<%=simd_type%>(a, b);
69
+ _mm_store_<%=simd_type%>(&((dtype*)p1)[i], a);
70
+ }
71
+ } else {
72
+ for(; i < n - cnt_simd_loop; i += num_pack){
73
+ a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]);
74
+ b = _mm_load_<%=simd_type%>(&((dtype*)p2)[i]);
75
+ a = _mm_<%=name%>_<%=simd_type%>(a, b);
76
+ _mm_stream_<%=simd_type%>(&((dtype*)p3)[i], a);
77
+ }
78
+ }
79
+ }
80
+
81
+ // Compute the remainder of the SIMD operation.
82
+ if (cnt_simd_loop != 0){
83
+ <% end %>
84
+ if (p1 == p3) { // inplace case
85
+ for (; i<n; i++) {
86
+ check_intdivzero(((dtype*)p2)[i]);
87
+ ((dtype*)p1)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
88
+ }
89
+ } else {
90
+ for (; i<n; i++) {
91
+ check_intdivzero(((dtype*)p2)[i]);
92
+ ((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
93
+ }
94
+ }
95
+ <% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
35
96
  }
97
+ <% end %>
36
98
  return;
37
99
  }
100
+
38
101
  if (is_aligned_step(s1,sizeof(dtype)) &&
39
102
  is_aligned_step(s2,sizeof(dtype)) &&
40
103
  is_aligned_step(s3,sizeof(dtype)) ) {
41
104
  //<% end %>
42
- for (i=0; i<n; i++) {
105
+
106
+ if (s2 == 0){ // Broadcasting from scalar value.
43
107
  check_intdivzero(*(dtype*)p2);
44
- *(dtype*)p3 = m_<%=name%>(*(dtype*)p1,*(dtype*)p2);
45
- p1 += s1;
46
- p2 += s2;
47
- p3 += s3;
108
+ if (s1 == sizeof(dtype) &&
109
+ s3 == sizeof(dtype) ) {
110
+ <% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
111
+ // Broadcast a scalar value and use it for SIMD computation.
112
+ b = _mm_load1_<%=simd_type%>(&((dtype*)p2)[0]);
113
+
114
+ // Check number of elements. & Check same alignment.
115
+ if ((n >= num_pack) && is_same_aligned2(&((dtype*)p1)[i], &((dtype*)p3)[i], SIMD_ALIGNMENT_SIZE)){
116
+ // Calculate up to the position just before the start of SIMD computation.
117
+ cnt = get_count_of_elements_not_aligned_to_simd_size(&((dtype*)p1)[i], SIMD_ALIGNMENT_SIZE, sizeof(dtype));
118
+ if (p1 == p3) { // inplace case
119
+ for (; i < cnt; i++) {
120
+ ((dtype*)p1)[i] = m_<%=name%>(((dtype*)p1)[i],*(dtype*)p2);
121
+ }
122
+ } else {
123
+ for (; i < cnt; i++) {
124
+ ((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],*(dtype*)p2);
125
+ }
126
+ }
127
+
128
+ // Get the count of SIMD computation loops.
129
+ cnt_simd_loop = (n - i) % num_pack;
130
+
131
+ // SIMD computation.
132
+ if (p1 == p3) { // inplace case
133
+ for(; i < n - cnt_simd_loop; i += num_pack){
134
+ a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]);
135
+ a = _mm_<%=name%>_<%=simd_type%>(a, b);
136
+ _mm_store_<%=simd_type%>(&((dtype*)p1)[i], a);
137
+ }
138
+ } else {
139
+ for(; i < n - cnt_simd_loop; i += num_pack){
140
+ a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]);
141
+ a = _mm_<%=name%>_<%=simd_type%>(a, b);
142
+ _mm_stream_<%=simd_type%>(&((dtype*)p3)[i], a);
143
+ }
144
+ }
145
+ }
146
+
147
+ // Compute the remainder of the SIMD operation.
148
+ if (cnt_simd_loop != 0){
149
+ <% end %>
150
+ if (p1 == p3) { // inplace case
151
+ for (; i<n; i++) {
152
+ ((dtype*)p1)[i] = m_<%=name%>(((dtype*)p1)[i],*(dtype*)p2);
153
+ }
154
+ } else {
155
+ for (; i<n; i++) {
156
+ ((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],*(dtype*)p2);
157
+ }
158
+ }
159
+ <% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
160
+ }
161
+ <% end %>
162
+ } else {
163
+ for (i=0; i<n; i++) {
164
+ *(dtype*)p3 = m_<%=name%>(*(dtype*)p1,*(dtype*)p2);
165
+ p1 += s1;
166
+ p3 += s3;
167
+ }
168
+ }
169
+ } else {
170
+ if (p1 == p3) { // inplace case
171
+ for (i=0; i<n; i++) {
172
+ check_intdivzero(*(dtype*)p2);
173
+ *(dtype*)p1 = m_<%=name%>(*(dtype*)p1,*(dtype*)p2);
174
+ p1 += s1;
175
+ p2 += s2;
176
+ }
177
+ } else {
178
+ for (i=0; i<n; i++) {
179
+ check_intdivzero(*(dtype*)p2);
180
+ *(dtype*)p3 = m_<%=name%>(*(dtype*)p1,*(dtype*)p2);
181
+ p1 += s1;
182
+ p2 += s2;
183
+ p3 += s3;
184
+ }
185
+ }
48
186
  }
187
+
49
188
  return;
50
189
  //<% if need_align %>
51
190
  }
@@ -86,7 +225,7 @@ static VALUE
86
225
  <% else %>
87
226
  VALUE klass, v;
88
227
 
89
- klass = na_upcast(CLASS_OF(self),CLASS_OF(other));
228
+ klass = na_upcast(rb_obj_class(self),rb_obj_class(other));
90
229
  if (klass==cT) {
91
230
  return <%=c_func%>_self(self, other);
92
231
  } else {