numo-narray 0.9.1.2 → 0.9.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +4 -4
  2. data/Rakefile +7 -1
  3. data/ext/numo/narray/array.c +6 -6
  4. data/ext/numo/narray/data.c +8 -8
  5. data/ext/numo/narray/depend.erb +4 -4
  6. data/ext/numo/narray/extconf.rb +2 -2
  7. data/ext/numo/narray/gen/cogen.rb +13 -0
  8. data/ext/numo/narray/gen/def/dfloat.rb +1 -0
  9. data/ext/numo/narray/gen/def/sfloat.rb +1 -0
  10. data/ext/numo/narray/gen/narray_def.rb +14 -2
  11. data/ext/numo/narray/gen/spec.rb +26 -10
  12. data/ext/numo/narray/gen/tmpl/accum_binary.c +1 -1
  13. data/ext/numo/narray/gen/tmpl/accum_index.c +11 -1
  14. data/ext/numo/narray/gen/tmpl/alloc_func.c +3 -3
  15. data/ext/numo/narray/gen/tmpl/binary.c +149 -10
  16. data/ext/numo/narray/gen/tmpl/binary2.c +1 -1
  17. data/ext/numo/narray/gen/tmpl/bincount.c +1 -1
  18. data/ext/numo/narray/gen/tmpl/cast.c +1 -1
  19. data/ext/numo/narray/gen/tmpl/cond_binary.c +1 -1
  20. data/ext/numo/narray/gen/tmpl/each.c +1 -1
  21. data/ext/numo/narray/gen/tmpl/each_with_index.c +1 -1
  22. data/ext/numo/narray/gen/tmpl/extract_data.c +3 -3
  23. data/ext/numo/narray/gen/tmpl/inspect.c +1 -1
  24. data/ext/numo/narray/gen/tmpl/lib.c +5 -0
  25. data/ext/numo/narray/gen/tmpl/map_with_index.c +1 -1
  26. data/ext/numo/narray/gen/tmpl/median.c +3 -2
  27. data/ext/numo/narray/gen/tmpl/pow.c +1 -1
  28. data/ext/numo/narray/gen/tmpl/qsort.c +118 -56
  29. data/ext/numo/narray/gen/tmpl/store.c +4 -4
  30. data/ext/numo/narray/gen/tmpl/store_bit.c +4 -4
  31. data/ext/numo/narray/gen/tmpl/to_a.c +1 -1
  32. data/ext/numo/narray/gen/tmpl/unary_s.c +55 -9
  33. data/ext/numo/narray/gen/tmpl_bit/each.c +1 -1
  34. data/ext/numo/narray/gen/tmpl_bit/each_with_index.c +1 -1
  35. data/ext/numo/narray/gen/tmpl_bit/inspect.c +1 -1
  36. data/ext/numo/narray/gen/tmpl_bit/mask.c +1 -1
  37. data/ext/numo/narray/gen/tmpl_bit/to_a.c +1 -1
  38. data/ext/numo/narray/index.c +64 -37
  39. data/ext/numo/narray/math.c +4 -4
  40. data/ext/numo/narray/narray.c +54 -29
  41. data/ext/numo/narray/ndloop.c +7 -7
  42. data/ext/numo/narray/numo/narray.h +9 -2
  43. data/ext/numo/narray/numo/template.h +18 -0
  44. data/ext/numo/narray/numo/types/bit.h +5 -0
  45. data/ext/numo/narray/numo/types/complex_macro.h +5 -0
  46. data/ext/numo/narray/numo/types/float_macro.h +5 -0
  47. data/ext/numo/narray/numo/types/int_macro.h +24 -0
  48. data/ext/numo/narray/numo/types/robj_macro.h +5 -0
  49. data/ext/numo/narray/numo/types/uint_macro.h +24 -0
  50. data/ext/numo/narray/numo/types/xint_macro.h +5 -25
  51. data/ext/numo/narray/rand.c +2 -29
  52. data/ext/numo/narray/step.c +1 -28
  53. data/ext/numo/narray/struct.c +26 -22
  54. data/lib/numo/narray/extra.rb +50 -1
  55. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 21795de8941d7b8176772c41039e4d8a5b3ae03a51090f6ee2979f257de88d43
4
- data.tar.gz: ff44506a3ce17bed9b927c23bf0c92395e1c1c54b8ccfcff9edc0aa6edb833c5
3
+ metadata.gz: 4081b1facf83501be82b2446c10bddefbf2e60530f1249074b880f7729968b4f
4
+ data.tar.gz: 19b64cf6e4778f25f60f821a6454cc1b5d2e9dc12f60ecb9f3bfb928155e45db
5
5
  SHA512:
6
- metadata.gz: 1be858a3b1ba7c5028dea3a034ecc4b70ff9e41711957b8989e58a517f911bf41849c84a0f64ec1beb993416b42675384e3e035179910b724692613f765b10cc
7
- data.tar.gz: 7db10683049eab739ee97f3599112ea95566ed223d4e24b020cfe400011ae0d6ad862ed01080c50fe3dd3546c508286b70abd4f0c3a7fdec6a977655e08957ba
6
+ metadata.gz: a7e32c3a9b208bb23d83c8d4b2f96a7e646e7e9db5bb42c2eea81598b9fdd4072f452f7318f4b9c594152bc9587c596896f1f55d4d41c3cb48ced01b0e764a28
7
+ data.tar.gz: b2686de4edbcbb35ff180c8a706f23f853740526249ac1f7c9729bd42fb34af5922eca9fd383a8cbea46ecc85280164a9346fbbc2f31fe4839a8bc15c7ccb8c3
data/Rakefile CHANGED
@@ -40,7 +40,13 @@ namespace :build do
40
40
  ["git", "clone", "file://#{Dir.pwd}/.git", build_dir],
41
41
  ["cd", build_dir],
42
42
  ["bundle"],
43
- ["rake", "cross", "native", "gem"],
43
+ [
44
+ "rake",
45
+ "RUBY_CC_VERSION=2.5.0:2.4.0:2.3.0:2.2.2:2.1.6",
46
+ "cross",
47
+ "native",
48
+ "gem",
49
+ ],
44
50
  ]
45
51
  raw_commands = commands.collect do |command|
46
52
  Shellwords.join(command)
data/ext/numo/narray/array.c CHANGED
@@ -106,7 +106,7 @@ static VALUE
106
106
  return type;
107
107
 
108
108
  default:
109
- if (CLASS_OF(v) == rb_const_get( rb_cObject, id_Complex )) {
109
+ if (rb_obj_class(v) == rb_const_get( rb_cObject, id_Complex )) {
110
110
  return NA_DCOMPLEX;
111
111
  }
112
112
  }
@@ -232,9 +232,9 @@ na_mdai_investigate(na_mdai_t *mdai, int ndim)
232
232
  }
233
233
  // type
234
234
  if (NIL_P(mdai->na_type)) {
235
- mdai->na_type = CLASS_OF(v);
235
+ mdai->na_type = rb_obj_class(v);
236
236
  } else {
237
- mdai->na_type = na_upcast(CLASS_OF(v), mdai->na_type);
237
+ mdai->na_type = na_upcast(rb_obj_class(v), mdai->na_type);
238
238
  }
239
239
  } else {
240
240
  mdai->type = na_mdai_object_type(mdai->type, v);
@@ -423,7 +423,7 @@ na_composition3(VALUE obj, VALUE *ptype, VALUE *pshape, VALUE *pnary)
423
423
  narray_t *na;
424
424
  GetNArray(obj,na);
425
425
  ndim = na->ndim;
426
- dtype = update_type(ptype, CLASS_OF(obj));
426
+ dtype = update_type(ptype, rb_obj_class(obj));
427
427
  if (pshape) {
428
428
  dshape = rb_ary_new2(ndim);
429
429
  for (i=0; i<ndim; i++) {
@@ -436,7 +436,7 @@ na_composition3(VALUE obj, VALUE *ptype, VALUE *pshape, VALUE *pnary)
436
436
  }
437
437
  } else {
438
438
  rb_raise(rb_eTypeError,"invalid type for NArray: %s",
439
- rb_class2name(CLASS_OF(obj)));
439
+ rb_class2name(rb_obj_class(obj)));
440
440
  }
441
441
  }
442
442
 
@@ -539,7 +539,7 @@ na_mdai_for_struct(na_mdai_t *mdai, int ndim)
539
539
 
540
540
  //fpintf(stderr,"val = "); rb_p(val);
541
541
 
542
- if (CLASS_OF(val) == mdai->na_type) {
542
+ if (rb_obj_class(val) == mdai->na_type) {
543
543
  GetNArray(val,na);
544
544
  if ( ndim+na->ndim > mdai->capa ) {
545
545
  abort();
data/ext/numo/narray/data.c CHANGED
@@ -57,7 +57,7 @@ static ID id_swap_byte;
57
57
  }
58
58
 
59
59
  #define m_memcpy(src,dst) memcpy(dst,src,e)
60
- void
60
+ static void
61
61
  iter_copy_bytes(na_loop_t *const lp)
62
62
  {
63
63
  size_t e;
@@ -206,7 +206,7 @@ check_axis(int axis, int ndim)
206
206
  # [[1, 5],
207
207
  # [3, 7]]]
208
208
  */
209
- VALUE
209
+ static VALUE
210
210
  na_swapaxes(VALUE self, VALUE a1, VALUE a2)
211
211
  {
212
212
  int i, j, ndim;
@@ -232,7 +232,7 @@ na_swapaxes(VALUE self, VALUE a1, VALUE a2)
232
232
  return view;
233
233
  }
234
234
 
235
- VALUE
235
+ static VALUE
236
236
  na_transpose_map(VALUE self, int *map)
237
237
  {
238
238
  int i, ndim;
@@ -262,7 +262,7 @@ na_transpose_map(VALUE self, int *map)
262
262
 
263
263
  #define SWAP(a,b,tmp) {tmp=a;a=b;b=tmp;}
264
264
 
265
- VALUE
265
+ static VALUE
266
266
  na_transpose(int argc, VALUE *argv, VALUE self)
267
267
  {
268
268
  int ndim, *map, *permute;
@@ -467,7 +467,7 @@ na_flatten_dim(VALUE self, int sd)
467
467
  shape[sd] = size;
468
468
 
469
469
  // new object
470
- view = na_s_allocate_view(CLASS_OF(self));
470
+ view = na_s_allocate_view(rb_obj_class(self));
471
471
  na_copy_flags(self, view);
472
472
  GetNArrayView(view, na2);
473
473
 
@@ -591,7 +591,7 @@ na_flatten(VALUE self)
591
591
  [10, 11, 12, 3, 14],
592
592
  [15, 16, 17, 18, 4]]
593
593
  */
594
- VALUE
594
+ static VALUE
595
595
  na_diagonal(int argc, VALUE *argv, VALUE self)
596
596
  {
597
597
  int i, k, nd;
@@ -690,7 +690,7 @@ na_diagonal(int argc, VALUE *argv, VALUE self)
690
690
  shape[k] = diag_size;
691
691
 
692
692
  // new object
693
- view = na_s_allocate_view(CLASS_OF(self));
693
+ view = na_s_allocate_view(rb_obj_class(self));
694
694
  na_copy_flags(self, view);
695
695
  GetNArrayView(view, na2);
696
696
 
@@ -803,7 +803,7 @@ na_new_dimension_for_dot(VALUE self, int pos, int len, bool transpose)
803
803
  GetNArray(self,na);
804
804
  nd = na->ndim;
805
805
 
806
- view = na_s_allocate_view(CLASS_OF(self));
806
+ view = na_s_allocate_view(rb_obj_class(self));
807
807
 
808
808
  na_copy_flags(self, view);
809
809
  GetNArrayView(view, na2);
data/ext/numo/narray/depend.erb CHANGED
@@ -13,14 +13,14 @@ TAGS : $(TAGSRC)
13
13
  doc :
14
14
  yard doc *.c types/*.c
15
15
 
16
- C_TMPL = <%=Dir.glob("gen/tmpl*/*.c").join(" ")%>
16
+ C_TMPL = <%=Dir.glob("#{__dir__}/gen/tmpl*/*.c").join(" ")%>
17
17
 
18
- COGEN = gen/cogen.rb
19
- DEPENDS = $(C_TMPL) gen/*.rb
18
+ COGEN = <%= __dir__ %>/gen/cogen.rb
19
+ DEPENDS = $(C_TMPL) <%= __dir__ %>/gen/*.rb
20
20
 
21
21
  <%
22
22
  type_c = []
23
- type_rb = Dir.glob("gen/def/*.rb")
23
+ type_rb = Dir.glob("#{__dir__}/gen/def/*.rb")
24
24
  type_rb.each do |s|
25
25
  type_c << c = "types/"+File.basename(s,".rb")+".c"
26
26
  %>
data/ext/numo/narray/extconf.rb CHANGED
@@ -2,8 +2,8 @@ require 'rbconfig.rb'
2
2
  require 'mkmf'
3
3
  require "erb"
4
4
 
5
- if RUBY_VERSION < "2.0.0"
6
- puts "Numo::NArray requires Ruby version 2.0 or later."
5
+ if RUBY_VERSION < "2.1.0"
6
+ puts "Numo::NArray requires Ruby version 2.1 or later."
7
7
  exit(1)
8
8
  end
9
9
 
data/ext/numo/narray/gen/cogen.rb CHANGED
@@ -1,5 +1,12 @@
1
1
  #! /usr/bin/env ruby
2
2
 
3
+ # Build gems for Windows by using fake RbConfig::CONFIG by rake-compiler.
4
+ fake_path = File.join(Dir.pwd, 'fake.rb')
5
+ if File.exist? fake_path
6
+ $:.unshift(Dir.pwd)
7
+ require 'fake'
8
+ end
9
+
3
10
  thisdir = File.dirname(__FILE__)
4
11
  libpath = File.absolute_path(File.dirname(__FILE__))+"/../../../../lib"
5
12
  $LOAD_PATH.unshift libpath
@@ -43,6 +50,12 @@ code = DefLib.new do
43
50
  set file_name: $output||""
44
51
  set include_files: ["numo/types/#{type_name}.h"]
45
52
  set lib_name: "numo_"+type_name
53
+
54
+ if (::RbConfig::CONFIG['target_cpu'] == 'x86_64') or (::RbConfig::CONFIG['target_cpu'] == 'x64')
55
+ set is_simd: true
56
+ else
57
+ set is_simd: false
58
+ end
46
59
 
47
60
  def_class do
48
61
  extend NArrayMethod
data/ext/numo/narray/gen/def/dfloat.rb CHANGED
@@ -5,6 +5,7 @@ set class_name: "DFloat"
5
5
  set class_alias: "Float64"
6
6
  set class_var: "cT"
7
7
  set ctype: "double"
8
+ set simd_type: "pd"
8
9
 
9
10
  set has_math: true
10
11
  set is_bit: false
data/ext/numo/narray/gen/def/sfloat.rb CHANGED
@@ -5,6 +5,7 @@ set class_name: "SFloat"
5
5
  set class_alias: "Float32"
6
6
  set class_var: "cT"
7
7
  set ctype: "float"
8
+ set simd_type: "ps"
8
9
 
9
10
  set has_math: true
10
11
  set is_bit: false
data/ext/numo/narray/gen/narray_def.rb CHANGED
@@ -228,13 +228,25 @@ class Store < DefMethod
228
228
  def extract_data(ptr,pos,x)
229
229
  case type_name
230
230
  when "Bit"
231
- "{BIT_DIGIT b; LOAD_BIT(#{ptr},#{pos},b); x = m_from_real(b);}"
231
+ "{BIT_DIGIT b; LOAD_BIT(#{ptr},#{pos},b); x = m_from_sint(b);}"
232
232
  when "RObject"
233
233
  "#{x} = m_num_to_data(*(#{dtype}*)(#{ptr}+#{pos}))"
234
234
  when /Complex/
235
235
  "{#{dtype} *p = (#{dtype}*)(#{ptr}+#{pos}); #{x} = c_new(REAL(*p),IMAG(*p));}"
236
- else
236
+ when /Float/
237
237
  "#{x} = m_from_real(*(#{dtype}*)(#{ptr}+#{pos}))"
238
+ when /UInt64/
239
+ "#{x} = m_from_uint64(*(#{dtype}*)(#{ptr}+#{pos}))"
240
+ when /UInt32/
241
+ "#{x} = m_from_uint32(*(#{dtype}*)(#{ptr}+#{pos}))"
242
+ when /Int64/
243
+ "#{x} = m_from_int64(*(#{dtype}*)(#{ptr}+#{pos}))"
244
+ when /Int32/
245
+ "#{x} = m_from_int32(*(#{dtype}*)(#{ptr}+#{pos}))"
246
+ when /Int/
247
+ "#{x} = m_from_sint(*(#{dtype}*)(#{ptr}+#{pos}))"
248
+ else
249
+ raise "unknown type: #{type_name}"
238
250
  end
239
251
  end
240
252
  end
data/ext/numo/narray/gen/spec.rb CHANGED
@@ -109,14 +109,14 @@ def_method "store" do
109
109
  end
110
110
  store_from "DFloat","double", "m_from_real"
111
111
  store_from "SFloat","float", "m_from_real"
112
- store_from "Int64", "int64_t", "m_from_real"
113
- store_from "Int32", "int32_t", "m_from_real"
114
- store_from "Int16", "int16_t", "m_from_real"
115
- store_from "Int8", "int8_t", "m_from_real"
116
- store_from "UInt64","u_int64_t","m_from_real"
117
- store_from "UInt32","u_int32_t","m_from_real"
118
- store_from "UInt16","u_int16_t","m_from_real"
119
- store_from "UInt8", "u_int8_t", "m_from_real"
112
+ store_from "Int64", "int64_t", "m_from_int64"
113
+ store_from "Int32", "int32_t", "m_from_int32"
114
+ store_from "Int16", "int16_t", "m_from_sint"
115
+ store_from "Int8", "int8_t", "m_from_sint"
116
+ store_from "UInt64","u_int64_t","m_from_uint64"
117
+ store_from "UInt32","u_int32_t","m_from_uint32"
118
+ store_from "UInt16","u_int16_t","m_from_sint"
119
+ store_from "UInt8", "u_int8_t", "m_from_sint"
120
120
  store_from "RObject", "VALUE", "m_num_to_data"
121
121
  store_array
122
122
  end
@@ -273,8 +273,19 @@ if is_float
273
273
  cond_unary "isfinite"
274
274
  end
275
275
 
276
- accum "sum","dtype","cT"
277
- accum "prod","dtype","cT"
276
+ if is_int
277
+ if is_unsigned
278
+ accum "sum","u_int64_t","numo_cUInt64"
279
+ accum "prod","u_int64_t","numo_cUInt64"
280
+ else
281
+ accum "sum","int64_t","numo_cInt64"
282
+ accum "prod","int64_t","numo_cInt64"
283
+ end
284
+ else
285
+ accum "sum","dtype","cT"
286
+ accum "prod","dtype","cT"
287
+ end
288
+
278
289
  if is_double_precision
279
290
  accum "kahan_sum","dtype","cT"
280
291
  end
@@ -353,6 +364,8 @@ if has_math
353
364
  fn = get(:full_class_name)
354
365
  cn = get(:class_name)
355
366
  nm = get(:name)
367
+ st = get(:simd_type)
368
+ dp = get(:is_double_precision)
356
369
  is_c = is_complex
357
370
 
358
371
  def_module do
@@ -363,6 +376,9 @@ def_module do
363
376
  set full_module_name: fn+"::NMath"
364
377
  set module_name: "Math"
365
378
  set module_var: "mTM"
379
+ set simd_type: st
380
+ set is_double_precision: dp
381
+ set is_complex: is_c
366
382
 
367
383
  math "sqrt"
368
384
  math "cbrt"
data/ext/numo/narray/gen/tmpl/accum_binary.c CHANGED
@@ -89,7 +89,7 @@ static VALUE
89
89
  //<% if is_object %>
90
90
  return <%=c_func%>_self(argc, argv, self);
91
91
  //<% else %>
92
- klass = na_upcast(CLASS_OF(self),CLASS_OF(argv[0]));
92
+ klass = na_upcast(rb_obj_class(self),rb_obj_class(argv[0]));
93
93
  if (klass==cT) {
94
94
  return <%=c_func%>_self(argc, argv, self);
95
95
  } else {
data/ext/numo/narray/gen/tmpl/accum_index.c CHANGED
@@ -31,7 +31,17 @@ static void
31
31
  @param [Numeric,Array,Range] axis Affected dimensions.
32
32
  @return [Integer,Numo::Int] returns result index of <%=name%>.
33
33
  @example
34
- Numo::NArray[3,4,1,2].min_index => 3
34
+ <% if name == 'min_index' %>
35
+ Numo::NArray[3,4,1,2].min_index => 2
36
+ Numo::NArray[[3,4,1],[2,0,5]].min_index => 4
37
+ Numo::NArray[[3,4,1],[2,0,5]].min_index(axis: 1) => [2, 4]
38
+ Numo::NArray[[3,4,1],[2,0,5]].min_index(axis: 0) => [3, 4, 2]
39
+ <% else %>
40
+ Numo::NArray[3,4,1,2].max_index => 1
41
+ Numo::NArray[[3,4,1],[2,0,5]].max_index => 5
42
+ Numo::NArray[[3,4,1],[2,0,5]].max_index(axis: 1) => [1, 5]
43
+ Numo::NArray[[3,4,1],[2,0,5]].max_index(axis: 0) => [0, 1, 5]
44
+ <% end %>
35
45
  */
36
46
  static VALUE
37
47
  <%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
data/ext/numo/narray/gen/tmpl/alloc_func.c CHANGED
@@ -70,7 +70,7 @@ static void
70
70
  }
71
71
  }
72
72
 
73
- const rb_data_type_t <%=type_name%>_data_type = {
73
+ static const rb_data_type_t <%=type_name%>_data_type = {
74
74
  "<%=full_class_name%>",
75
75
  {<%=type_name%>_gc_mark, <%=type_name%>_free, <%=type_name%>_memsize,},
76
76
  &na_data_type,
@@ -80,7 +80,7 @@ const rb_data_type_t <%=type_name%>_data_type = {
80
80
 
81
81
  <% else %>
82
82
 
83
- const rb_data_type_t <%=type_name%>_data_type = {
83
+ static const rb_data_type_t <%=type_name%>_data_type = {
84
84
  "<%=full_class_name%>",
85
85
  {0, <%=type_name%>_free, <%=type_name%>_memsize,},
86
86
  &na_data_type,
@@ -90,7 +90,7 @@ const rb_data_type_t <%=type_name%>_data_type = {
90
90
 
91
91
  <% end %>
92
92
 
93
- VALUE
93
+ static VALUE
94
94
  <%=c_func(0)%>(VALUE klass)
95
95
  {
96
96
  narray_data_t *na = ALLOC(narray_data_t);
data/ext/numo/narray/gen/tmpl/binary.c CHANGED
@@ -11,10 +11,24 @@
11
11
  static void
12
12
  <%=c_iter%>(na_loop_t *const lp)
13
13
  {
14
- size_t i, n;
14
+ size_t i=0;
15
+ size_t n;
15
16
  char *p1, *p2, *p3;
16
17
  ssize_t s1, s2, s3;
17
18
 
19
+ <% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
20
+ size_t cnt;
21
+ size_t cnt_simd_loop = -1;
22
+ <% if is_double_precision %>
23
+ __m128d a;
24
+ __m128d b;
25
+ <% else %>
26
+ __m128 a;
27
+ __m128 b;
28
+ <% end %>
29
+ size_t num_pack; // Number of elements packed for SIMD.
30
+ num_pack = SIMD_ALIGNMENT_SIZE / sizeof(dtype);
31
+ <% end %>
18
32
  INIT_COUNTER(lp, n);
19
33
  INIT_PTR(lp, 0, p1, s1);
20
34
  INIT_PTR(lp, 1, p2, s2);
@@ -28,24 +42,149 @@ static void
28
42
  if (s1 == sizeof(dtype) &&
29
43
  s2 == sizeof(dtype) &&
30
44
  s3 == sizeof(dtype) ) {
45
+ <% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
46
+ // Check number of elements. & Check same alignment.
47
+ if ((n >= num_pack) && is_same_aligned3(&((dtype*)p1)[i], &((dtype*)p2)[i], &((dtype*)p3)[i], SIMD_ALIGNMENT_SIZE)){
48
+ // Calculate up to the position just before the start of SIMD computation.
49
+ cnt = get_count_of_elements_not_aligned_to_simd_size(&((dtype*)p1)[i], SIMD_ALIGNMENT_SIZE, sizeof(dtype));
50
+ if (p1 == p3) { // inplace case
51
+ for (; i < cnt; i++) {
52
+ ((dtype*)p1)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
53
+ }
54
+ } else {
55
+ for (; i < cnt; i++) {
56
+ ((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
57
+ }
58
+ }
31
59
 
32
- for (i=0; i<n; i++) {
33
- check_intdivzero(*(dtype*)p2);
34
- ((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
60
+ // Get the count of SIMD computation loops.
61
+ cnt_simd_loop = (n - i) % num_pack;
62
+
63
+ // SIMD computation.
64
+ if (p1 == p3) { // inplace case
65
+ for(; i < n - cnt_simd_loop; i += num_pack){
66
+ a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]);
67
+ b = _mm_load_<%=simd_type%>(&((dtype*)p2)[i]);
68
+ a = _mm_<%=name%>_<%=simd_type%>(a, b);
69
+ _mm_store_<%=simd_type%>(&((dtype*)p1)[i], a);
70
+ }
71
+ } else {
72
+ for(; i < n - cnt_simd_loop; i += num_pack){
73
+ a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]);
74
+ b = _mm_load_<%=simd_type%>(&((dtype*)p2)[i]);
75
+ a = _mm_<%=name%>_<%=simd_type%>(a, b);
76
+ _mm_stream_<%=simd_type%>(&((dtype*)p3)[i], a);
77
+ }
78
+ }
79
+ }
80
+
81
+ // Compute the remainder of the SIMD operation.
82
+ if (cnt_simd_loop != 0){
83
+ <% end %>
84
+ if (p1 == p3) { // inplace case
85
+ for (; i<n; i++) {
86
+ check_intdivzero(((dtype*)p2)[i]);
87
+ ((dtype*)p1)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
88
+ }
89
+ } else {
90
+ for (; i<n; i++) {
91
+ check_intdivzero(((dtype*)p2)[i]);
92
+ ((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
93
+ }
94
+ }
95
+ <% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
35
96
  }
97
+ <% end %>
36
98
  return;
37
99
  }
100
+
38
101
  if (is_aligned_step(s1,sizeof(dtype)) &&
39
102
  is_aligned_step(s2,sizeof(dtype)) &&
40
103
  is_aligned_step(s3,sizeof(dtype)) ) {
41
104
  //<% end %>
42
- for (i=0; i<n; i++) {
105
+
106
+ if (s2 == 0){ // Broadcasting from scalar value.
43
107
  check_intdivzero(*(dtype*)p2);
44
- *(dtype*)p3 = m_<%=name%>(*(dtype*)p1,*(dtype*)p2);
45
- p1 += s1;
46
- p2 += s2;
47
- p3 += s3;
108
+ if (s1 == sizeof(dtype) &&
109
+ s3 == sizeof(dtype) ) {
110
+ <% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
111
+ // Broadcast a scalar value and use it for SIMD computation.
112
+ b = _mm_load1_<%=simd_type%>(&((dtype*)p2)[0]);
113
+
114
+ // Check number of elements. & Check same alignment.
115
+ if ((n >= num_pack) && is_same_aligned2(&((dtype*)p1)[i], &((dtype*)p3)[i], SIMD_ALIGNMENT_SIZE)){
116
+ // Calculate up to the position just before the start of SIMD computation.
117
+ cnt = get_count_of_elements_not_aligned_to_simd_size(&((dtype*)p1)[i], SIMD_ALIGNMENT_SIZE, sizeof(dtype));
118
+ if (p1 == p3) { // inplace case
119
+ for (; i < cnt; i++) {
120
+ ((dtype*)p1)[i] = m_<%=name%>(((dtype*)p1)[i],*(dtype*)p2);
121
+ }
122
+ } else {
123
+ for (; i < cnt; i++) {
124
+ ((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],*(dtype*)p2);
125
+ }
126
+ }
127
+
128
+ // Get the count of SIMD computation loops.
129
+ cnt_simd_loop = (n - i) % num_pack;
130
+
131
+ // SIMD computation.
132
+ if (p1 == p3) { // inplace case
133
+ for(; i < n - cnt_simd_loop; i += num_pack){
134
+ a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]);
135
+ a = _mm_<%=name%>_<%=simd_type%>(a, b);
136
+ _mm_store_<%=simd_type%>(&((dtype*)p1)[i], a);
137
+ }
138
+ } else {
139
+ for(; i < n - cnt_simd_loop; i += num_pack){
140
+ a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]);
141
+ a = _mm_<%=name%>_<%=simd_type%>(a, b);
142
+ _mm_stream_<%=simd_type%>(&((dtype*)p3)[i], a);
143
+ }
144
+ }
145
+ }
146
+
147
+ // Compute the remainder of the SIMD operation.
148
+ if (cnt_simd_loop != 0){
149
+ <% end %>
150
+ if (p1 == p3) { // inplace case
151
+ for (; i<n; i++) {
152
+ ((dtype*)p1)[i] = m_<%=name%>(((dtype*)p1)[i],*(dtype*)p2);
153
+ }
154
+ } else {
155
+ for (; i<n; i++) {
156
+ ((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],*(dtype*)p2);
157
+ }
158
+ }
159
+ <% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
160
+ }
161
+ <% end %>
162
+ } else {
163
+ for (i=0; i<n; i++) {
164
+ *(dtype*)p3 = m_<%=name%>(*(dtype*)p1,*(dtype*)p2);
165
+ p1 += s1;
166
+ p3 += s3;
167
+ }
168
+ }
169
+ } else {
170
+ if (p1 == p3) { // inplace case
171
+ for (i=0; i<n; i++) {
172
+ check_intdivzero(*(dtype*)p2);
173
+ *(dtype*)p1 = m_<%=name%>(*(dtype*)p1,*(dtype*)p2);
174
+ p1 += s1;
175
+ p2 += s2;
176
+ }
177
+ } else {
178
+ for (i=0; i<n; i++) {
179
+ check_intdivzero(*(dtype*)p2);
180
+ *(dtype*)p3 = m_<%=name%>(*(dtype*)p1,*(dtype*)p2);
181
+ p1 += s1;
182
+ p2 += s2;
183
+ p3 += s3;
184
+ }
185
+ }
48
186
  }
187
+
49
188
  return;
50
189
  //<% if need_align %>
51
190
  }
@@ -86,7 +225,7 @@ static VALUE
86
225
  <% else %>
87
226
  VALUE klass, v;
88
227
 
89
- klass = na_upcast(CLASS_OF(self),CLASS_OF(other));
228
+ klass = na_upcast(rb_obj_class(self),rb_obj_class(other));
90
229
  if (klass==cT) {
91
230
  return <%=c_func%>_self(self, other);
92
231
  } else {