numo-narray 0.9.1.2 → 0.9.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +7 -1
- data/ext/numo/narray/array.c +6 -6
- data/ext/numo/narray/data.c +8 -8
- data/ext/numo/narray/depend.erb +4 -4
- data/ext/numo/narray/extconf.rb +2 -2
- data/ext/numo/narray/gen/cogen.rb +13 -0
- data/ext/numo/narray/gen/def/dfloat.rb +1 -0
- data/ext/numo/narray/gen/def/sfloat.rb +1 -0
- data/ext/numo/narray/gen/narray_def.rb +14 -2
- data/ext/numo/narray/gen/spec.rb +26 -10
- data/ext/numo/narray/gen/tmpl/accum_binary.c +1 -1
- data/ext/numo/narray/gen/tmpl/accum_index.c +11 -1
- data/ext/numo/narray/gen/tmpl/alloc_func.c +3 -3
- data/ext/numo/narray/gen/tmpl/binary.c +149 -10
- data/ext/numo/narray/gen/tmpl/binary2.c +1 -1
- data/ext/numo/narray/gen/tmpl/bincount.c +1 -1
- data/ext/numo/narray/gen/tmpl/cast.c +1 -1
- data/ext/numo/narray/gen/tmpl/cond_binary.c +1 -1
- data/ext/numo/narray/gen/tmpl/each.c +1 -1
- data/ext/numo/narray/gen/tmpl/each_with_index.c +1 -1
- data/ext/numo/narray/gen/tmpl/extract_data.c +3 -3
- data/ext/numo/narray/gen/tmpl/inspect.c +1 -1
- data/ext/numo/narray/gen/tmpl/lib.c +5 -0
- data/ext/numo/narray/gen/tmpl/map_with_index.c +1 -1
- data/ext/numo/narray/gen/tmpl/median.c +3 -2
- data/ext/numo/narray/gen/tmpl/pow.c +1 -1
- data/ext/numo/narray/gen/tmpl/qsort.c +118 -56
- data/ext/numo/narray/gen/tmpl/store.c +4 -4
- data/ext/numo/narray/gen/tmpl/store_bit.c +4 -4
- data/ext/numo/narray/gen/tmpl/to_a.c +1 -1
- data/ext/numo/narray/gen/tmpl/unary_s.c +55 -9
- data/ext/numo/narray/gen/tmpl_bit/each.c +1 -1
- data/ext/numo/narray/gen/tmpl_bit/each_with_index.c +1 -1
- data/ext/numo/narray/gen/tmpl_bit/inspect.c +1 -1
- data/ext/numo/narray/gen/tmpl_bit/mask.c +1 -1
- data/ext/numo/narray/gen/tmpl_bit/to_a.c +1 -1
- data/ext/numo/narray/index.c +64 -37
- data/ext/numo/narray/math.c +4 -4
- data/ext/numo/narray/narray.c +54 -29
- data/ext/numo/narray/ndloop.c +7 -7
- data/ext/numo/narray/numo/narray.h +9 -2
- data/ext/numo/narray/numo/template.h +18 -0
- data/ext/numo/narray/numo/types/bit.h +5 -0
- data/ext/numo/narray/numo/types/complex_macro.h +5 -0
- data/ext/numo/narray/numo/types/float_macro.h +5 -0
- data/ext/numo/narray/numo/types/int_macro.h +24 -0
- data/ext/numo/narray/numo/types/robj_macro.h +5 -0
- data/ext/numo/narray/numo/types/uint_macro.h +24 -0
- data/ext/numo/narray/numo/types/xint_macro.h +5 -25
- data/ext/numo/narray/rand.c +2 -29
- data/ext/numo/narray/step.c +1 -28
- data/ext/numo/narray/struct.c +26 -22
- data/lib/numo/narray/extra.rb +50 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4081b1facf83501be82b2446c10bddefbf2e60530f1249074b880f7729968b4f
|
4
|
+
data.tar.gz: 19b64cf6e4778f25f60f821a6454cc1b5d2e9dc12f60ecb9f3bfb928155e45db
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a7e32c3a9b208bb23d83c8d4b2f96a7e646e7e9db5bb42c2eea81598b9fdd4072f452f7318f4b9c594152bc9587c596896f1f55d4d41c3cb48ced01b0e764a28
|
7
|
+
data.tar.gz: b2686de4edbcbb35ff180c8a706f23f853740526249ac1f7c9729bd42fb34af5922eca9fd383a8cbea46ecc85280164a9346fbbc2f31fe4839a8bc15c7ccb8c3
|
data/Rakefile
CHANGED
@@ -40,7 +40,13 @@ namespace :build do
|
|
40
40
|
["git", "clone", "file://#{Dir.pwd}/.git", build_dir],
|
41
41
|
["cd", build_dir],
|
42
42
|
["bundle"],
|
43
|
-
[
|
43
|
+
[
|
44
|
+
"rake",
|
45
|
+
"RUBY_CC_VERSION=2.5.0:2.4.0:2.3.0:2.2.2:2.1.6",
|
46
|
+
"cross",
|
47
|
+
"native",
|
48
|
+
"gem",
|
49
|
+
],
|
44
50
|
]
|
45
51
|
raw_commands = commands.collect do |command|
|
46
52
|
Shellwords.join(command)
|
data/ext/numo/narray/array.c
CHANGED
@@ -106,7 +106,7 @@ static VALUE
|
|
106
106
|
return type;
|
107
107
|
|
108
108
|
default:
|
109
|
-
if (
|
109
|
+
if (rb_obj_class(v) == rb_const_get( rb_cObject, id_Complex )) {
|
110
110
|
return NA_DCOMPLEX;
|
111
111
|
}
|
112
112
|
}
|
@@ -232,9 +232,9 @@ na_mdai_investigate(na_mdai_t *mdai, int ndim)
|
|
232
232
|
}
|
233
233
|
// type
|
234
234
|
if (NIL_P(mdai->na_type)) {
|
235
|
-
mdai->na_type =
|
235
|
+
mdai->na_type = rb_obj_class(v);
|
236
236
|
} else {
|
237
|
-
mdai->na_type = na_upcast(
|
237
|
+
mdai->na_type = na_upcast(rb_obj_class(v), mdai->na_type);
|
238
238
|
}
|
239
239
|
} else {
|
240
240
|
mdai->type = na_mdai_object_type(mdai->type, v);
|
@@ -423,7 +423,7 @@ na_composition3(VALUE obj, VALUE *ptype, VALUE *pshape, VALUE *pnary)
|
|
423
423
|
narray_t *na;
|
424
424
|
GetNArray(obj,na);
|
425
425
|
ndim = na->ndim;
|
426
|
-
dtype = update_type(ptype,
|
426
|
+
dtype = update_type(ptype, rb_obj_class(obj));
|
427
427
|
if (pshape) {
|
428
428
|
dshape = rb_ary_new2(ndim);
|
429
429
|
for (i=0; i<ndim; i++) {
|
@@ -436,7 +436,7 @@ na_composition3(VALUE obj, VALUE *ptype, VALUE *pshape, VALUE *pnary)
|
|
436
436
|
}
|
437
437
|
} else {
|
438
438
|
rb_raise(rb_eTypeError,"invalid type for NArray: %s",
|
439
|
-
rb_class2name(
|
439
|
+
rb_class2name(rb_obj_class(obj)));
|
440
440
|
}
|
441
441
|
}
|
442
442
|
|
@@ -539,7 +539,7 @@ na_mdai_for_struct(na_mdai_t *mdai, int ndim)
|
|
539
539
|
|
540
540
|
//fpintf(stderr,"val = "); rb_p(val);
|
541
541
|
|
542
|
-
if (
|
542
|
+
if (rb_obj_class(val) == mdai->na_type) {
|
543
543
|
GetNArray(val,na);
|
544
544
|
if ( ndim+na->ndim > mdai->capa ) {
|
545
545
|
abort();
|
data/ext/numo/narray/data.c
CHANGED
@@ -57,7 +57,7 @@ static ID id_swap_byte;
|
|
57
57
|
}
|
58
58
|
|
59
59
|
#define m_memcpy(src,dst) memcpy(dst,src,e)
|
60
|
-
void
|
60
|
+
static void
|
61
61
|
iter_copy_bytes(na_loop_t *const lp)
|
62
62
|
{
|
63
63
|
size_t e;
|
@@ -206,7 +206,7 @@ check_axis(int axis, int ndim)
|
|
206
206
|
# [[1, 5],
|
207
207
|
# [3, 7]]]
|
208
208
|
*/
|
209
|
-
VALUE
|
209
|
+
static VALUE
|
210
210
|
na_swapaxes(VALUE self, VALUE a1, VALUE a2)
|
211
211
|
{
|
212
212
|
int i, j, ndim;
|
@@ -232,7 +232,7 @@ na_swapaxes(VALUE self, VALUE a1, VALUE a2)
|
|
232
232
|
return view;
|
233
233
|
}
|
234
234
|
|
235
|
-
VALUE
|
235
|
+
static VALUE
|
236
236
|
na_transpose_map(VALUE self, int *map)
|
237
237
|
{
|
238
238
|
int i, ndim;
|
@@ -262,7 +262,7 @@ na_transpose_map(VALUE self, int *map)
|
|
262
262
|
|
263
263
|
#define SWAP(a,b,tmp) {tmp=a;a=b;b=tmp;}
|
264
264
|
|
265
|
-
VALUE
|
265
|
+
static VALUE
|
266
266
|
na_transpose(int argc, VALUE *argv, VALUE self)
|
267
267
|
{
|
268
268
|
int ndim, *map, *permute;
|
@@ -467,7 +467,7 @@ na_flatten_dim(VALUE self, int sd)
|
|
467
467
|
shape[sd] = size;
|
468
468
|
|
469
469
|
// new object
|
470
|
-
view = na_s_allocate_view(
|
470
|
+
view = na_s_allocate_view(rb_obj_class(self));
|
471
471
|
na_copy_flags(self, view);
|
472
472
|
GetNArrayView(view, na2);
|
473
473
|
|
@@ -591,7 +591,7 @@ na_flatten(VALUE self)
|
|
591
591
|
[10, 11, 12, 3, 14],
|
592
592
|
[15, 16, 17, 18, 4]]
|
593
593
|
*/
|
594
|
-
VALUE
|
594
|
+
static VALUE
|
595
595
|
na_diagonal(int argc, VALUE *argv, VALUE self)
|
596
596
|
{
|
597
597
|
int i, k, nd;
|
@@ -690,7 +690,7 @@ na_diagonal(int argc, VALUE *argv, VALUE self)
|
|
690
690
|
shape[k] = diag_size;
|
691
691
|
|
692
692
|
// new object
|
693
|
-
view = na_s_allocate_view(
|
693
|
+
view = na_s_allocate_view(rb_obj_class(self));
|
694
694
|
na_copy_flags(self, view);
|
695
695
|
GetNArrayView(view, na2);
|
696
696
|
|
@@ -803,7 +803,7 @@ na_new_dimension_for_dot(VALUE self, int pos, int len, bool transpose)
|
|
803
803
|
GetNArray(self,na);
|
804
804
|
nd = na->ndim;
|
805
805
|
|
806
|
-
view = na_s_allocate_view(
|
806
|
+
view = na_s_allocate_view(rb_obj_class(self));
|
807
807
|
|
808
808
|
na_copy_flags(self, view);
|
809
809
|
GetNArrayView(view, na2);
|
data/ext/numo/narray/depend.erb
CHANGED
@@ -13,14 +13,14 @@ TAGS : $(TAGSRC)
|
|
13
13
|
doc :
|
14
14
|
yard doc *.c types/*.c
|
15
15
|
|
16
|
-
C_TMPL = <%=Dir.glob("gen/tmpl*/*.c").join(" ")%>
|
16
|
+
C_TMPL = <%=Dir.glob("#{__dir__}/gen/tmpl*/*.c").join(" ")%>
|
17
17
|
|
18
|
-
COGEN = gen/cogen.rb
|
19
|
-
DEPENDS = $(C_TMPL) gen/*.rb
|
18
|
+
COGEN = <%= __dir__ %>/gen/cogen.rb
|
19
|
+
DEPENDS = $(C_TMPL) <%= __dir__ %>/gen/*.rb
|
20
20
|
|
21
21
|
<%
|
22
22
|
type_c = []
|
23
|
-
type_rb = Dir.glob("gen/def/*.rb")
|
23
|
+
type_rb = Dir.glob("#{__dir__}/gen/def/*.rb")
|
24
24
|
type_rb.each do |s|
|
25
25
|
type_c << c = "types/"+File.basename(s,".rb")+".c"
|
26
26
|
%>
|
data/ext/numo/narray/extconf.rb
CHANGED
@@ -1,5 +1,12 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
2
|
|
3
|
+
# Build gems for Windows by using fake RbConfig::CONFIG by rake-compiler.
|
4
|
+
fake_path = File.join(Dir.pwd, 'fake.rb')
|
5
|
+
if File.exist? fake_path
|
6
|
+
$:.unshift(Dir.pwd)
|
7
|
+
require 'fake'
|
8
|
+
end
|
9
|
+
|
3
10
|
thisdir = File.dirname(__FILE__)
|
4
11
|
libpath = File.absolute_path(File.dirname(__FILE__))+"/../../../../lib"
|
5
12
|
$LOAD_PATH.unshift libpath
|
@@ -43,6 +50,12 @@ code = DefLib.new do
|
|
43
50
|
set file_name: $output||""
|
44
51
|
set include_files: ["numo/types/#{type_name}.h"]
|
45
52
|
set lib_name: "numo_"+type_name
|
53
|
+
|
54
|
+
if (::RbConfig::CONFIG['target_cpu'] == 'x86_64') or (::RbConfig::CONFIG['target_cpu'] == 'x64')
|
55
|
+
set is_simd: true
|
56
|
+
else
|
57
|
+
set is_simd: false
|
58
|
+
end
|
46
59
|
|
47
60
|
def_class do
|
48
61
|
extend NArrayMethod
|
@@ -228,13 +228,25 @@ class Store < DefMethod
|
|
228
228
|
def extract_data(ptr,pos,x)
|
229
229
|
case type_name
|
230
230
|
when "Bit"
|
231
|
-
"{BIT_DIGIT b; LOAD_BIT(#{ptr},#{pos},b); x =
|
231
|
+
"{BIT_DIGIT b; LOAD_BIT(#{ptr},#{pos},b); x = m_from_sint(b);}"
|
232
232
|
when "RObject"
|
233
233
|
"#{x} = m_num_to_data(*(#{dtype}*)(#{ptr}+#{pos}))"
|
234
234
|
when /Complex/
|
235
235
|
"{#{dtype} *p = (#{dtype}*)(#{ptr}+#{pos}); #{x} = c_new(REAL(*p),IMAG(*p));}"
|
236
|
-
|
236
|
+
when /Float/
|
237
237
|
"#{x} = m_from_real(*(#{dtype}*)(#{ptr}+#{pos}))"
|
238
|
+
when /UInt64/
|
239
|
+
"#{x} = m_from_uint64(*(#{dtype}*)(#{ptr}+#{pos}))"
|
240
|
+
when /UInt32/
|
241
|
+
"#{x} = m_from_uint32(*(#{dtype}*)(#{ptr}+#{pos}))"
|
242
|
+
when /Int64/
|
243
|
+
"#{x} = m_from_int64(*(#{dtype}*)(#{ptr}+#{pos}))"
|
244
|
+
when /Int32/
|
245
|
+
"#{x} = m_from_int32(*(#{dtype}*)(#{ptr}+#{pos}))"
|
246
|
+
when /Int/
|
247
|
+
"#{x} = m_from_sint(*(#{dtype}*)(#{ptr}+#{pos}))"
|
248
|
+
else
|
249
|
+
raise "unknown type: #{type_name}"
|
238
250
|
end
|
239
251
|
end
|
240
252
|
end
|
data/ext/numo/narray/gen/spec.rb
CHANGED
@@ -109,14 +109,14 @@ def_method "store" do
|
|
109
109
|
end
|
110
110
|
store_from "DFloat","double", "m_from_real"
|
111
111
|
store_from "SFloat","float", "m_from_real"
|
112
|
-
store_from "Int64", "int64_t", "
|
113
|
-
store_from "Int32", "int32_t", "
|
114
|
-
store_from "Int16", "int16_t", "
|
115
|
-
store_from "Int8", "int8_t", "
|
116
|
-
store_from "UInt64","u_int64_t","
|
117
|
-
store_from "UInt32","u_int32_t","
|
118
|
-
store_from "UInt16","u_int16_t","
|
119
|
-
store_from "UInt8", "u_int8_t", "
|
112
|
+
store_from "Int64", "int64_t", "m_from_int64"
|
113
|
+
store_from "Int32", "int32_t", "m_from_int32"
|
114
|
+
store_from "Int16", "int16_t", "m_from_sint"
|
115
|
+
store_from "Int8", "int8_t", "m_from_sint"
|
116
|
+
store_from "UInt64","u_int64_t","m_from_uint64"
|
117
|
+
store_from "UInt32","u_int32_t","m_from_uint32"
|
118
|
+
store_from "UInt16","u_int16_t","m_from_sint"
|
119
|
+
store_from "UInt8", "u_int8_t", "m_from_sint"
|
120
120
|
store_from "RObject", "VALUE", "m_num_to_data"
|
121
121
|
store_array
|
122
122
|
end
|
@@ -273,8 +273,19 @@ if is_float
|
|
273
273
|
cond_unary "isfinite"
|
274
274
|
end
|
275
275
|
|
276
|
-
|
277
|
-
|
276
|
+
if is_int
|
277
|
+
if is_unsigned
|
278
|
+
accum "sum","u_int64_t","numo_cUInt64"
|
279
|
+
accum "prod","u_int64_t","numo_cUInt64"
|
280
|
+
else
|
281
|
+
accum "sum","int64_t","numo_cInt64"
|
282
|
+
accum "prod","int64_t","numo_cInt64"
|
283
|
+
end
|
284
|
+
else
|
285
|
+
accum "sum","dtype","cT"
|
286
|
+
accum "prod","dtype","cT"
|
287
|
+
end
|
288
|
+
|
278
289
|
if is_double_precision
|
279
290
|
accum "kahan_sum","dtype","cT"
|
280
291
|
end
|
@@ -353,6 +364,8 @@ if has_math
|
|
353
364
|
fn = get(:full_class_name)
|
354
365
|
cn = get(:class_name)
|
355
366
|
nm = get(:name)
|
367
|
+
st = get(:simd_type)
|
368
|
+
dp = get(:is_double_precision)
|
356
369
|
is_c = is_complex
|
357
370
|
|
358
371
|
def_module do
|
@@ -363,6 +376,9 @@ def_module do
|
|
363
376
|
set full_module_name: fn+"::NMath"
|
364
377
|
set module_name: "Math"
|
365
378
|
set module_var: "mTM"
|
379
|
+
set simd_type: st
|
380
|
+
set is_double_precision: dp
|
381
|
+
set is_complex: is_c
|
366
382
|
|
367
383
|
math "sqrt"
|
368
384
|
math "cbrt"
|
@@ -89,7 +89,7 @@ static VALUE
|
|
89
89
|
//<% if is_object %>
|
90
90
|
return <%=c_func%>_self(argc, argv, self);
|
91
91
|
//<% else %>
|
92
|
-
klass = na_upcast(
|
92
|
+
klass = na_upcast(rb_obj_class(self),rb_obj_class(argv[0]));
|
93
93
|
if (klass==cT) {
|
94
94
|
return <%=c_func%>_self(argc, argv, self);
|
95
95
|
} else {
|
@@ -31,7 +31,17 @@ static void
|
|
31
31
|
@param [Numeric,Array,Range] axis Affected dimensions.
|
32
32
|
@return [Integer,Numo::Int] returns result index of <%=name%>.
|
33
33
|
@example
|
34
|
-
|
34
|
+
<% if name == 'min_index' %>
|
35
|
+
Numo::NArray[3,4,1,2].min_index => 2
|
36
|
+
Numo::NArray[[3,4,1],[2,0,5]].min_index => 4
|
37
|
+
Numo::NArray[[3,4,1],[2,0,5]].min_index(axis: 1) => [2, 4]
|
38
|
+
Numo::NArray[[3,4,1],[2,0,5]].min_index(axis: 0) => [3, 4, 2]
|
39
|
+
<% else %>
|
40
|
+
Numo::NArray[3,4,1,2].max_index => 1
|
41
|
+
Numo::NArray[[3,4,1],[2,0,5]].max_index => 5
|
42
|
+
Numo::NArray[[3,4,1],[2,0,5]].max_index(axis: 1) => [1, 5]
|
43
|
+
Numo::NArray[[3,4,1],[2,0,5]].max_index(axis: 0) => [0, 1, 5]
|
44
|
+
<% end %>
|
35
45
|
*/
|
36
46
|
static VALUE
|
37
47
|
<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
|
@@ -70,7 +70,7 @@ static void
|
|
70
70
|
}
|
71
71
|
}
|
72
72
|
|
73
|
-
const rb_data_type_t <%=type_name%>_data_type = {
|
73
|
+
static const rb_data_type_t <%=type_name%>_data_type = {
|
74
74
|
"<%=full_class_name%>",
|
75
75
|
{<%=type_name%>_gc_mark, <%=type_name%>_free, <%=type_name%>_memsize,},
|
76
76
|
&na_data_type,
|
@@ -80,7 +80,7 @@ const rb_data_type_t <%=type_name%>_data_type = {
|
|
80
80
|
|
81
81
|
<% else %>
|
82
82
|
|
83
|
-
const rb_data_type_t <%=type_name%>_data_type = {
|
83
|
+
static const rb_data_type_t <%=type_name%>_data_type = {
|
84
84
|
"<%=full_class_name%>",
|
85
85
|
{0, <%=type_name%>_free, <%=type_name%>_memsize,},
|
86
86
|
&na_data_type,
|
@@ -90,7 +90,7 @@ const rb_data_type_t <%=type_name%>_data_type = {
|
|
90
90
|
|
91
91
|
<% end %>
|
92
92
|
|
93
|
-
VALUE
|
93
|
+
static VALUE
|
94
94
|
<%=c_func(0)%>(VALUE klass)
|
95
95
|
{
|
96
96
|
narray_data_t *na = ALLOC(narray_data_t);
|
@@ -11,10 +11,24 @@
|
|
11
11
|
static void
|
12
12
|
<%=c_iter%>(na_loop_t *const lp)
|
13
13
|
{
|
14
|
-
size_t i
|
14
|
+
size_t i=0;
|
15
|
+
size_t n;
|
15
16
|
char *p1, *p2, *p3;
|
16
17
|
ssize_t s1, s2, s3;
|
17
18
|
|
19
|
+
<% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
|
20
|
+
size_t cnt;
|
21
|
+
size_t cnt_simd_loop = -1;
|
22
|
+
<% if is_double_precision %>
|
23
|
+
__m128d a;
|
24
|
+
__m128d b;
|
25
|
+
<% else %>
|
26
|
+
__m128 a;
|
27
|
+
__m128 b;
|
28
|
+
<% end %>
|
29
|
+
size_t num_pack; // Number of elements packed for SIMD.
|
30
|
+
num_pack = SIMD_ALIGNMENT_SIZE / sizeof(dtype);
|
31
|
+
<% end %>
|
18
32
|
INIT_COUNTER(lp, n);
|
19
33
|
INIT_PTR(lp, 0, p1, s1);
|
20
34
|
INIT_PTR(lp, 1, p2, s2);
|
@@ -28,24 +42,149 @@ static void
|
|
28
42
|
if (s1 == sizeof(dtype) &&
|
29
43
|
s2 == sizeof(dtype) &&
|
30
44
|
s3 == sizeof(dtype) ) {
|
45
|
+
<% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
|
46
|
+
// Check number of elements. & Check same alignment.
|
47
|
+
if ((n >= num_pack) && is_same_aligned3(&((dtype*)p1)[i], &((dtype*)p2)[i], &((dtype*)p3)[i], SIMD_ALIGNMENT_SIZE)){
|
48
|
+
// Calculate up to the position just before the start of SIMD computation.
|
49
|
+
cnt = get_count_of_elements_not_aligned_to_simd_size(&((dtype*)p1)[i], SIMD_ALIGNMENT_SIZE, sizeof(dtype));
|
50
|
+
if (p1 == p3) { // inplace case
|
51
|
+
for (; i < cnt; i++) {
|
52
|
+
((dtype*)p1)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
|
53
|
+
}
|
54
|
+
} else {
|
55
|
+
for (; i < cnt; i++) {
|
56
|
+
((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
|
57
|
+
}
|
58
|
+
}
|
31
59
|
|
32
|
-
|
33
|
-
|
34
|
-
|
60
|
+
// Get the count of SIMD computation loops.
|
61
|
+
cnt_simd_loop = (n - i) % num_pack;
|
62
|
+
|
63
|
+
// SIMD computation.
|
64
|
+
if (p1 == p3) { // inplace case
|
65
|
+
for(; i < n - cnt_simd_loop; i += num_pack){
|
66
|
+
a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]);
|
67
|
+
b = _mm_load_<%=simd_type%>(&((dtype*)p2)[i]);
|
68
|
+
a = _mm_<%=name%>_<%=simd_type%>(a, b);
|
69
|
+
_mm_store_<%=simd_type%>(&((dtype*)p1)[i], a);
|
70
|
+
}
|
71
|
+
} else {
|
72
|
+
for(; i < n - cnt_simd_loop; i += num_pack){
|
73
|
+
a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]);
|
74
|
+
b = _mm_load_<%=simd_type%>(&((dtype*)p2)[i]);
|
75
|
+
a = _mm_<%=name%>_<%=simd_type%>(a, b);
|
76
|
+
_mm_stream_<%=simd_type%>(&((dtype*)p3)[i], a);
|
77
|
+
}
|
78
|
+
}
|
79
|
+
}
|
80
|
+
|
81
|
+
// Compute the remainder of the SIMD operation.
|
82
|
+
if (cnt_simd_loop != 0){
|
83
|
+
<% end %>
|
84
|
+
if (p1 == p3) { // inplace case
|
85
|
+
for (; i<n; i++) {
|
86
|
+
check_intdivzero(((dtype*)p2)[i]);
|
87
|
+
((dtype*)p1)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
|
88
|
+
}
|
89
|
+
} else {
|
90
|
+
for (; i<n; i++) {
|
91
|
+
check_intdivzero(((dtype*)p2)[i]);
|
92
|
+
((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
|
93
|
+
}
|
94
|
+
}
|
95
|
+
<% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
|
35
96
|
}
|
97
|
+
<% end %>
|
36
98
|
return;
|
37
99
|
}
|
100
|
+
|
38
101
|
if (is_aligned_step(s1,sizeof(dtype)) &&
|
39
102
|
is_aligned_step(s2,sizeof(dtype)) &&
|
40
103
|
is_aligned_step(s3,sizeof(dtype)) ) {
|
41
104
|
//<% end %>
|
42
|
-
|
105
|
+
|
106
|
+
if (s2 == 0){ // Broadcasting from scalar value.
|
43
107
|
check_intdivzero(*(dtype*)p2);
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
108
|
+
if (s1 == sizeof(dtype) &&
|
109
|
+
s3 == sizeof(dtype) ) {
|
110
|
+
<% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
|
111
|
+
// Broadcast a scalar value and use it for SIMD computation.
|
112
|
+
b = _mm_load1_<%=simd_type%>(&((dtype*)p2)[0]);
|
113
|
+
|
114
|
+
// Check number of elements. & Check same alignment.
|
115
|
+
if ((n >= num_pack) && is_same_aligned2(&((dtype*)p1)[i], &((dtype*)p3)[i], SIMD_ALIGNMENT_SIZE)){
|
116
|
+
// Calculate up to the position just before the start of SIMD computation.
|
117
|
+
cnt = get_count_of_elements_not_aligned_to_simd_size(&((dtype*)p1)[i], SIMD_ALIGNMENT_SIZE, sizeof(dtype));
|
118
|
+
if (p1 == p3) { // inplace case
|
119
|
+
for (; i < cnt; i++) {
|
120
|
+
((dtype*)p1)[i] = m_<%=name%>(((dtype*)p1)[i],*(dtype*)p2);
|
121
|
+
}
|
122
|
+
} else {
|
123
|
+
for (; i < cnt; i++) {
|
124
|
+
((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],*(dtype*)p2);
|
125
|
+
}
|
126
|
+
}
|
127
|
+
|
128
|
+
// Get the count of SIMD computation loops.
|
129
|
+
cnt_simd_loop = (n - i) % num_pack;
|
130
|
+
|
131
|
+
// SIMD computation.
|
132
|
+
if (p1 == p3) { // inplace case
|
133
|
+
for(; i < n - cnt_simd_loop; i += num_pack){
|
134
|
+
a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]);
|
135
|
+
a = _mm_<%=name%>_<%=simd_type%>(a, b);
|
136
|
+
_mm_store_<%=simd_type%>(&((dtype*)p1)[i], a);
|
137
|
+
}
|
138
|
+
} else {
|
139
|
+
for(; i < n - cnt_simd_loop; i += num_pack){
|
140
|
+
a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]);
|
141
|
+
a = _mm_<%=name%>_<%=simd_type%>(a, b);
|
142
|
+
_mm_stream_<%=simd_type%>(&((dtype*)p3)[i], a);
|
143
|
+
}
|
144
|
+
}
|
145
|
+
}
|
146
|
+
|
147
|
+
// Compute the remainder of the SIMD operation.
|
148
|
+
if (cnt_simd_loop != 0){
|
149
|
+
<% end %>
|
150
|
+
if (p1 == p3) { // inplace case
|
151
|
+
for (; i<n; i++) {
|
152
|
+
((dtype*)p1)[i] = m_<%=name%>(((dtype*)p1)[i],*(dtype*)p2);
|
153
|
+
}
|
154
|
+
} else {
|
155
|
+
for (; i<n; i++) {
|
156
|
+
((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],*(dtype*)p2);
|
157
|
+
}
|
158
|
+
}
|
159
|
+
<% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
|
160
|
+
}
|
161
|
+
<% end %>
|
162
|
+
} else {
|
163
|
+
for (i=0; i<n; i++) {
|
164
|
+
*(dtype*)p3 = m_<%=name%>(*(dtype*)p1,*(dtype*)p2);
|
165
|
+
p1 += s1;
|
166
|
+
p3 += s3;
|
167
|
+
}
|
168
|
+
}
|
169
|
+
} else {
|
170
|
+
if (p1 == p3) { // inplace case
|
171
|
+
for (i=0; i<n; i++) {
|
172
|
+
check_intdivzero(*(dtype*)p2);
|
173
|
+
*(dtype*)p1 = m_<%=name%>(*(dtype*)p1,*(dtype*)p2);
|
174
|
+
p1 += s1;
|
175
|
+
p2 += s2;
|
176
|
+
}
|
177
|
+
} else {
|
178
|
+
for (i=0; i<n; i++) {
|
179
|
+
check_intdivzero(*(dtype*)p2);
|
180
|
+
*(dtype*)p3 = m_<%=name%>(*(dtype*)p1,*(dtype*)p2);
|
181
|
+
p1 += s1;
|
182
|
+
p2 += s2;
|
183
|
+
p3 += s3;
|
184
|
+
}
|
185
|
+
}
|
48
186
|
}
|
187
|
+
|
49
188
|
return;
|
50
189
|
//<% if need_align %>
|
51
190
|
}
|
@@ -86,7 +225,7 @@ static VALUE
|
|
86
225
|
<% else %>
|
87
226
|
VALUE klass, v;
|
88
227
|
|
89
|
-
klass = na_upcast(
|
228
|
+
klass = na_upcast(rb_obj_class(self),rb_obj_class(other));
|
90
229
|
if (klass==cT) {
|
91
230
|
return <%=c_func%>_self(self, other);
|
92
231
|
} else {
|