numo-narray 0.9.1.2 → 0.9.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Rakefile +7 -1
- data/ext/numo/narray/array.c +6 -6
- data/ext/numo/narray/data.c +8 -8
- data/ext/numo/narray/depend.erb +4 -4
- data/ext/numo/narray/extconf.rb +2 -2
- data/ext/numo/narray/gen/cogen.rb +13 -0
- data/ext/numo/narray/gen/def/dfloat.rb +1 -0
- data/ext/numo/narray/gen/def/sfloat.rb +1 -0
- data/ext/numo/narray/gen/narray_def.rb +14 -2
- data/ext/numo/narray/gen/spec.rb +26 -10
- data/ext/numo/narray/gen/tmpl/accum_binary.c +1 -1
- data/ext/numo/narray/gen/tmpl/accum_index.c +11 -1
- data/ext/numo/narray/gen/tmpl/alloc_func.c +3 -3
- data/ext/numo/narray/gen/tmpl/binary.c +149 -10
- data/ext/numo/narray/gen/tmpl/binary2.c +1 -1
- data/ext/numo/narray/gen/tmpl/bincount.c +1 -1
- data/ext/numo/narray/gen/tmpl/cast.c +1 -1
- data/ext/numo/narray/gen/tmpl/cond_binary.c +1 -1
- data/ext/numo/narray/gen/tmpl/each.c +1 -1
- data/ext/numo/narray/gen/tmpl/each_with_index.c +1 -1
- data/ext/numo/narray/gen/tmpl/extract_data.c +3 -3
- data/ext/numo/narray/gen/tmpl/inspect.c +1 -1
- data/ext/numo/narray/gen/tmpl/lib.c +5 -0
- data/ext/numo/narray/gen/tmpl/map_with_index.c +1 -1
- data/ext/numo/narray/gen/tmpl/median.c +3 -2
- data/ext/numo/narray/gen/tmpl/pow.c +1 -1
- data/ext/numo/narray/gen/tmpl/qsort.c +118 -56
- data/ext/numo/narray/gen/tmpl/store.c +4 -4
- data/ext/numo/narray/gen/tmpl/store_bit.c +4 -4
- data/ext/numo/narray/gen/tmpl/to_a.c +1 -1
- data/ext/numo/narray/gen/tmpl/unary_s.c +55 -9
- data/ext/numo/narray/gen/tmpl_bit/each.c +1 -1
- data/ext/numo/narray/gen/tmpl_bit/each_with_index.c +1 -1
- data/ext/numo/narray/gen/tmpl_bit/inspect.c +1 -1
- data/ext/numo/narray/gen/tmpl_bit/mask.c +1 -1
- data/ext/numo/narray/gen/tmpl_bit/to_a.c +1 -1
- data/ext/numo/narray/index.c +64 -37
- data/ext/numo/narray/math.c +4 -4
- data/ext/numo/narray/narray.c +54 -29
- data/ext/numo/narray/ndloop.c +7 -7
- data/ext/numo/narray/numo/narray.h +9 -2
- data/ext/numo/narray/numo/template.h +18 -0
- data/ext/numo/narray/numo/types/bit.h +5 -0
- data/ext/numo/narray/numo/types/complex_macro.h +5 -0
- data/ext/numo/narray/numo/types/float_macro.h +5 -0
- data/ext/numo/narray/numo/types/int_macro.h +24 -0
- data/ext/numo/narray/numo/types/robj_macro.h +5 -0
- data/ext/numo/narray/numo/types/uint_macro.h +24 -0
- data/ext/numo/narray/numo/types/xint_macro.h +5 -25
- data/ext/numo/narray/rand.c +2 -29
- data/ext/numo/narray/step.c +1 -28
- data/ext/numo/narray/struct.c +26 -22
- data/lib/numo/narray/extra.rb +50 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4081b1facf83501be82b2446c10bddefbf2e60530f1249074b880f7729968b4f
|
4
|
+
data.tar.gz: 19b64cf6e4778f25f60f821a6454cc1b5d2e9dc12f60ecb9f3bfb928155e45db
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a7e32c3a9b208bb23d83c8d4b2f96a7e646e7e9db5bb42c2eea81598b9fdd4072f452f7318f4b9c594152bc9587c596896f1f55d4d41c3cb48ced01b0e764a28
|
7
|
+
data.tar.gz: b2686de4edbcbb35ff180c8a706f23f853740526249ac1f7c9729bd42fb34af5922eca9fd383a8cbea46ecc85280164a9346fbbc2f31fe4839a8bc15c7ccb8c3
|
data/Rakefile
CHANGED
@@ -40,7 +40,13 @@ namespace :build do
|
|
40
40
|
["git", "clone", "file://#{Dir.pwd}/.git", build_dir],
|
41
41
|
["cd", build_dir],
|
42
42
|
["bundle"],
|
43
|
-
[
|
43
|
+
[
|
44
|
+
"rake",
|
45
|
+
"RUBY_CC_VERSION=2.5.0:2.4.0:2.3.0:2.2.2:2.1.6",
|
46
|
+
"cross",
|
47
|
+
"native",
|
48
|
+
"gem",
|
49
|
+
],
|
44
50
|
]
|
45
51
|
raw_commands = commands.collect do |command|
|
46
52
|
Shellwords.join(command)
|
data/ext/numo/narray/array.c
CHANGED
@@ -106,7 +106,7 @@ static VALUE
|
|
106
106
|
return type;
|
107
107
|
|
108
108
|
default:
|
109
|
-
if (
|
109
|
+
if (rb_obj_class(v) == rb_const_get( rb_cObject, id_Complex )) {
|
110
110
|
return NA_DCOMPLEX;
|
111
111
|
}
|
112
112
|
}
|
@@ -232,9 +232,9 @@ na_mdai_investigate(na_mdai_t *mdai, int ndim)
|
|
232
232
|
}
|
233
233
|
// type
|
234
234
|
if (NIL_P(mdai->na_type)) {
|
235
|
-
mdai->na_type =
|
235
|
+
mdai->na_type = rb_obj_class(v);
|
236
236
|
} else {
|
237
|
-
mdai->na_type = na_upcast(
|
237
|
+
mdai->na_type = na_upcast(rb_obj_class(v), mdai->na_type);
|
238
238
|
}
|
239
239
|
} else {
|
240
240
|
mdai->type = na_mdai_object_type(mdai->type, v);
|
@@ -423,7 +423,7 @@ na_composition3(VALUE obj, VALUE *ptype, VALUE *pshape, VALUE *pnary)
|
|
423
423
|
narray_t *na;
|
424
424
|
GetNArray(obj,na);
|
425
425
|
ndim = na->ndim;
|
426
|
-
dtype = update_type(ptype,
|
426
|
+
dtype = update_type(ptype, rb_obj_class(obj));
|
427
427
|
if (pshape) {
|
428
428
|
dshape = rb_ary_new2(ndim);
|
429
429
|
for (i=0; i<ndim; i++) {
|
@@ -436,7 +436,7 @@ na_composition3(VALUE obj, VALUE *ptype, VALUE *pshape, VALUE *pnary)
|
|
436
436
|
}
|
437
437
|
} else {
|
438
438
|
rb_raise(rb_eTypeError,"invalid type for NArray: %s",
|
439
|
-
rb_class2name(
|
439
|
+
rb_class2name(rb_obj_class(obj)));
|
440
440
|
}
|
441
441
|
}
|
442
442
|
|
@@ -539,7 +539,7 @@ na_mdai_for_struct(na_mdai_t *mdai, int ndim)
|
|
539
539
|
|
540
540
|
//fpintf(stderr,"val = "); rb_p(val);
|
541
541
|
|
542
|
-
if (
|
542
|
+
if (rb_obj_class(val) == mdai->na_type) {
|
543
543
|
GetNArray(val,na);
|
544
544
|
if ( ndim+na->ndim > mdai->capa ) {
|
545
545
|
abort();
|
data/ext/numo/narray/data.c
CHANGED
@@ -57,7 +57,7 @@ static ID id_swap_byte;
|
|
57
57
|
}
|
58
58
|
|
59
59
|
#define m_memcpy(src,dst) memcpy(dst,src,e)
|
60
|
-
void
|
60
|
+
static void
|
61
61
|
iter_copy_bytes(na_loop_t *const lp)
|
62
62
|
{
|
63
63
|
size_t e;
|
@@ -206,7 +206,7 @@ check_axis(int axis, int ndim)
|
|
206
206
|
# [[1, 5],
|
207
207
|
# [3, 7]]]
|
208
208
|
*/
|
209
|
-
VALUE
|
209
|
+
static VALUE
|
210
210
|
na_swapaxes(VALUE self, VALUE a1, VALUE a2)
|
211
211
|
{
|
212
212
|
int i, j, ndim;
|
@@ -232,7 +232,7 @@ na_swapaxes(VALUE self, VALUE a1, VALUE a2)
|
|
232
232
|
return view;
|
233
233
|
}
|
234
234
|
|
235
|
-
VALUE
|
235
|
+
static VALUE
|
236
236
|
na_transpose_map(VALUE self, int *map)
|
237
237
|
{
|
238
238
|
int i, ndim;
|
@@ -262,7 +262,7 @@ na_transpose_map(VALUE self, int *map)
|
|
262
262
|
|
263
263
|
#define SWAP(a,b,tmp) {tmp=a;a=b;b=tmp;}
|
264
264
|
|
265
|
-
VALUE
|
265
|
+
static VALUE
|
266
266
|
na_transpose(int argc, VALUE *argv, VALUE self)
|
267
267
|
{
|
268
268
|
int ndim, *map, *permute;
|
@@ -467,7 +467,7 @@ na_flatten_dim(VALUE self, int sd)
|
|
467
467
|
shape[sd] = size;
|
468
468
|
|
469
469
|
// new object
|
470
|
-
view = na_s_allocate_view(
|
470
|
+
view = na_s_allocate_view(rb_obj_class(self));
|
471
471
|
na_copy_flags(self, view);
|
472
472
|
GetNArrayView(view, na2);
|
473
473
|
|
@@ -591,7 +591,7 @@ na_flatten(VALUE self)
|
|
591
591
|
[10, 11, 12, 3, 14],
|
592
592
|
[15, 16, 17, 18, 4]]
|
593
593
|
*/
|
594
|
-
VALUE
|
594
|
+
static VALUE
|
595
595
|
na_diagonal(int argc, VALUE *argv, VALUE self)
|
596
596
|
{
|
597
597
|
int i, k, nd;
|
@@ -690,7 +690,7 @@ na_diagonal(int argc, VALUE *argv, VALUE self)
|
|
690
690
|
shape[k] = diag_size;
|
691
691
|
|
692
692
|
// new object
|
693
|
-
view = na_s_allocate_view(
|
693
|
+
view = na_s_allocate_view(rb_obj_class(self));
|
694
694
|
na_copy_flags(self, view);
|
695
695
|
GetNArrayView(view, na2);
|
696
696
|
|
@@ -803,7 +803,7 @@ na_new_dimension_for_dot(VALUE self, int pos, int len, bool transpose)
|
|
803
803
|
GetNArray(self,na);
|
804
804
|
nd = na->ndim;
|
805
805
|
|
806
|
-
view = na_s_allocate_view(
|
806
|
+
view = na_s_allocate_view(rb_obj_class(self));
|
807
807
|
|
808
808
|
na_copy_flags(self, view);
|
809
809
|
GetNArrayView(view, na2);
|
data/ext/numo/narray/depend.erb
CHANGED
@@ -13,14 +13,14 @@ TAGS : $(TAGSRC)
|
|
13
13
|
doc :
|
14
14
|
yard doc *.c types/*.c
|
15
15
|
|
16
|
-
C_TMPL = <%=Dir.glob("gen/tmpl*/*.c").join(" ")%>
|
16
|
+
C_TMPL = <%=Dir.glob("#{__dir__}/gen/tmpl*/*.c").join(" ")%>
|
17
17
|
|
18
|
-
COGEN = gen/cogen.rb
|
19
|
-
DEPENDS = $(C_TMPL) gen/*.rb
|
18
|
+
COGEN = <%= __dir__ %>/gen/cogen.rb
|
19
|
+
DEPENDS = $(C_TMPL) <%= __dir__ %>/gen/*.rb
|
20
20
|
|
21
21
|
<%
|
22
22
|
type_c = []
|
23
|
-
type_rb = Dir.glob("gen/def/*.rb")
|
23
|
+
type_rb = Dir.glob("#{__dir__}/gen/def/*.rb")
|
24
24
|
type_rb.each do |s|
|
25
25
|
type_c << c = "types/"+File.basename(s,".rb")+".c"
|
26
26
|
%>
|
data/ext/numo/narray/extconf.rb
CHANGED
@@ -1,5 +1,12 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
2
|
|
3
|
+
# Build gems for Windows by using fake RbConfig::CONFIG by rake-compiler.
|
4
|
+
fake_path = File.join(Dir.pwd, 'fake.rb')
|
5
|
+
if File.exist? fake_path
|
6
|
+
$:.unshift(Dir.pwd)
|
7
|
+
require 'fake'
|
8
|
+
end
|
9
|
+
|
3
10
|
thisdir = File.dirname(__FILE__)
|
4
11
|
libpath = File.absolute_path(File.dirname(__FILE__))+"/../../../../lib"
|
5
12
|
$LOAD_PATH.unshift libpath
|
@@ -43,6 +50,12 @@ code = DefLib.new do
|
|
43
50
|
set file_name: $output||""
|
44
51
|
set include_files: ["numo/types/#{type_name}.h"]
|
45
52
|
set lib_name: "numo_"+type_name
|
53
|
+
|
54
|
+
if (::RbConfig::CONFIG['target_cpu'] == 'x86_64') or (::RbConfig::CONFIG['target_cpu'] == 'x64')
|
55
|
+
set is_simd: true
|
56
|
+
else
|
57
|
+
set is_simd: false
|
58
|
+
end
|
46
59
|
|
47
60
|
def_class do
|
48
61
|
extend NArrayMethod
|
@@ -228,13 +228,25 @@ class Store < DefMethod
|
|
228
228
|
def extract_data(ptr,pos,x)
|
229
229
|
case type_name
|
230
230
|
when "Bit"
|
231
|
-
"{BIT_DIGIT b; LOAD_BIT(#{ptr},#{pos},b); x =
|
231
|
+
"{BIT_DIGIT b; LOAD_BIT(#{ptr},#{pos},b); x = m_from_sint(b);}"
|
232
232
|
when "RObject"
|
233
233
|
"#{x} = m_num_to_data(*(#{dtype}*)(#{ptr}+#{pos}))"
|
234
234
|
when /Complex/
|
235
235
|
"{#{dtype} *p = (#{dtype}*)(#{ptr}+#{pos}); #{x} = c_new(REAL(*p),IMAG(*p));}"
|
236
|
-
|
236
|
+
when /Float/
|
237
237
|
"#{x} = m_from_real(*(#{dtype}*)(#{ptr}+#{pos}))"
|
238
|
+
when /UInt64/
|
239
|
+
"#{x} = m_from_uint64(*(#{dtype}*)(#{ptr}+#{pos}))"
|
240
|
+
when /UInt32/
|
241
|
+
"#{x} = m_from_uint32(*(#{dtype}*)(#{ptr}+#{pos}))"
|
242
|
+
when /Int64/
|
243
|
+
"#{x} = m_from_int64(*(#{dtype}*)(#{ptr}+#{pos}))"
|
244
|
+
when /Int32/
|
245
|
+
"#{x} = m_from_int32(*(#{dtype}*)(#{ptr}+#{pos}))"
|
246
|
+
when /Int/
|
247
|
+
"#{x} = m_from_sint(*(#{dtype}*)(#{ptr}+#{pos}))"
|
248
|
+
else
|
249
|
+
raise "unknown type: #{type_name}"
|
238
250
|
end
|
239
251
|
end
|
240
252
|
end
|
data/ext/numo/narray/gen/spec.rb
CHANGED
@@ -109,14 +109,14 @@ def_method "store" do
|
|
109
109
|
end
|
110
110
|
store_from "DFloat","double", "m_from_real"
|
111
111
|
store_from "SFloat","float", "m_from_real"
|
112
|
-
store_from "Int64", "int64_t", "
|
113
|
-
store_from "Int32", "int32_t", "
|
114
|
-
store_from "Int16", "int16_t", "
|
115
|
-
store_from "Int8", "int8_t", "
|
116
|
-
store_from "UInt64","u_int64_t","
|
117
|
-
store_from "UInt32","u_int32_t","
|
118
|
-
store_from "UInt16","u_int16_t","
|
119
|
-
store_from "UInt8", "u_int8_t", "
|
112
|
+
store_from "Int64", "int64_t", "m_from_int64"
|
113
|
+
store_from "Int32", "int32_t", "m_from_int32"
|
114
|
+
store_from "Int16", "int16_t", "m_from_sint"
|
115
|
+
store_from "Int8", "int8_t", "m_from_sint"
|
116
|
+
store_from "UInt64","u_int64_t","m_from_uint64"
|
117
|
+
store_from "UInt32","u_int32_t","m_from_uint32"
|
118
|
+
store_from "UInt16","u_int16_t","m_from_sint"
|
119
|
+
store_from "UInt8", "u_int8_t", "m_from_sint"
|
120
120
|
store_from "RObject", "VALUE", "m_num_to_data"
|
121
121
|
store_array
|
122
122
|
end
|
@@ -273,8 +273,19 @@ if is_float
|
|
273
273
|
cond_unary "isfinite"
|
274
274
|
end
|
275
275
|
|
276
|
-
|
277
|
-
|
276
|
+
if is_int
|
277
|
+
if is_unsigned
|
278
|
+
accum "sum","u_int64_t","numo_cUInt64"
|
279
|
+
accum "prod","u_int64_t","numo_cUInt64"
|
280
|
+
else
|
281
|
+
accum "sum","int64_t","numo_cInt64"
|
282
|
+
accum "prod","int64_t","numo_cInt64"
|
283
|
+
end
|
284
|
+
else
|
285
|
+
accum "sum","dtype","cT"
|
286
|
+
accum "prod","dtype","cT"
|
287
|
+
end
|
288
|
+
|
278
289
|
if is_double_precision
|
279
290
|
accum "kahan_sum","dtype","cT"
|
280
291
|
end
|
@@ -353,6 +364,8 @@ if has_math
|
|
353
364
|
fn = get(:full_class_name)
|
354
365
|
cn = get(:class_name)
|
355
366
|
nm = get(:name)
|
367
|
+
st = get(:simd_type)
|
368
|
+
dp = get(:is_double_precision)
|
356
369
|
is_c = is_complex
|
357
370
|
|
358
371
|
def_module do
|
@@ -363,6 +376,9 @@ def_module do
|
|
363
376
|
set full_module_name: fn+"::NMath"
|
364
377
|
set module_name: "Math"
|
365
378
|
set module_var: "mTM"
|
379
|
+
set simd_type: st
|
380
|
+
set is_double_precision: dp
|
381
|
+
set is_complex: is_c
|
366
382
|
|
367
383
|
math "sqrt"
|
368
384
|
math "cbrt"
|
@@ -89,7 +89,7 @@ static VALUE
|
|
89
89
|
//<% if is_object %>
|
90
90
|
return <%=c_func%>_self(argc, argv, self);
|
91
91
|
//<% else %>
|
92
|
-
klass = na_upcast(
|
92
|
+
klass = na_upcast(rb_obj_class(self),rb_obj_class(argv[0]));
|
93
93
|
if (klass==cT) {
|
94
94
|
return <%=c_func%>_self(argc, argv, self);
|
95
95
|
} else {
|
@@ -31,7 +31,17 @@ static void
|
|
31
31
|
@param [Numeric,Array,Range] axis Affected dimensions.
|
32
32
|
@return [Integer,Numo::Int] returns result index of <%=name%>.
|
33
33
|
@example
|
34
|
-
|
34
|
+
<% if name == 'min_index' %>
|
35
|
+
Numo::NArray[3,4,1,2].min_index => 2
|
36
|
+
Numo::NArray[[3,4,1],[2,0,5]].min_index => 4
|
37
|
+
Numo::NArray[[3,4,1],[2,0,5]].min_index(axis: 1) => [2, 4]
|
38
|
+
Numo::NArray[[3,4,1],[2,0,5]].min_index(axis: 0) => [3, 4, 2]
|
39
|
+
<% else %>
|
40
|
+
Numo::NArray[3,4,1,2].max_index => 1
|
41
|
+
Numo::NArray[[3,4,1],[2,0,5]].max_index => 5
|
42
|
+
Numo::NArray[[3,4,1],[2,0,5]].max_index(axis: 1) => [1, 5]
|
43
|
+
Numo::NArray[[3,4,1],[2,0,5]].max_index(axis: 0) => [0, 1, 5]
|
44
|
+
<% end %>
|
35
45
|
*/
|
36
46
|
static VALUE
|
37
47
|
<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
|
@@ -70,7 +70,7 @@ static void
|
|
70
70
|
}
|
71
71
|
}
|
72
72
|
|
73
|
-
const rb_data_type_t <%=type_name%>_data_type = {
|
73
|
+
static const rb_data_type_t <%=type_name%>_data_type = {
|
74
74
|
"<%=full_class_name%>",
|
75
75
|
{<%=type_name%>_gc_mark, <%=type_name%>_free, <%=type_name%>_memsize,},
|
76
76
|
&na_data_type,
|
@@ -80,7 +80,7 @@ const rb_data_type_t <%=type_name%>_data_type = {
|
|
80
80
|
|
81
81
|
<% else %>
|
82
82
|
|
83
|
-
const rb_data_type_t <%=type_name%>_data_type = {
|
83
|
+
static const rb_data_type_t <%=type_name%>_data_type = {
|
84
84
|
"<%=full_class_name%>",
|
85
85
|
{0, <%=type_name%>_free, <%=type_name%>_memsize,},
|
86
86
|
&na_data_type,
|
@@ -90,7 +90,7 @@ const rb_data_type_t <%=type_name%>_data_type = {
|
|
90
90
|
|
91
91
|
<% end %>
|
92
92
|
|
93
|
-
VALUE
|
93
|
+
static VALUE
|
94
94
|
<%=c_func(0)%>(VALUE klass)
|
95
95
|
{
|
96
96
|
narray_data_t *na = ALLOC(narray_data_t);
|
@@ -11,10 +11,24 @@
|
|
11
11
|
static void
|
12
12
|
<%=c_iter%>(na_loop_t *const lp)
|
13
13
|
{
|
14
|
-
size_t i
|
14
|
+
size_t i=0;
|
15
|
+
size_t n;
|
15
16
|
char *p1, *p2, *p3;
|
16
17
|
ssize_t s1, s2, s3;
|
17
18
|
|
19
|
+
<% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
|
20
|
+
size_t cnt;
|
21
|
+
size_t cnt_simd_loop = -1;
|
22
|
+
<% if is_double_precision %>
|
23
|
+
__m128d a;
|
24
|
+
__m128d b;
|
25
|
+
<% else %>
|
26
|
+
__m128 a;
|
27
|
+
__m128 b;
|
28
|
+
<% end %>
|
29
|
+
size_t num_pack; // Number of elements packed for SIMD.
|
30
|
+
num_pack = SIMD_ALIGNMENT_SIZE / sizeof(dtype);
|
31
|
+
<% end %>
|
18
32
|
INIT_COUNTER(lp, n);
|
19
33
|
INIT_PTR(lp, 0, p1, s1);
|
20
34
|
INIT_PTR(lp, 1, p2, s2);
|
@@ -28,24 +42,149 @@ static void
|
|
28
42
|
if (s1 == sizeof(dtype) &&
|
29
43
|
s2 == sizeof(dtype) &&
|
30
44
|
s3 == sizeof(dtype) ) {
|
45
|
+
<% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
|
46
|
+
// Check number of elements. & Check same alignment.
|
47
|
+
if ((n >= num_pack) && is_same_aligned3(&((dtype*)p1)[i], &((dtype*)p2)[i], &((dtype*)p3)[i], SIMD_ALIGNMENT_SIZE)){
|
48
|
+
// Calculate up to the position just before the start of SIMD computation.
|
49
|
+
cnt = get_count_of_elements_not_aligned_to_simd_size(&((dtype*)p1)[i], SIMD_ALIGNMENT_SIZE, sizeof(dtype));
|
50
|
+
if (p1 == p3) { // inplace case
|
51
|
+
for (; i < cnt; i++) {
|
52
|
+
((dtype*)p1)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
|
53
|
+
}
|
54
|
+
} else {
|
55
|
+
for (; i < cnt; i++) {
|
56
|
+
((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
|
57
|
+
}
|
58
|
+
}
|
31
59
|
|
32
|
-
|
33
|
-
|
34
|
-
|
60
|
+
// Get the count of SIMD computation loops.
|
61
|
+
cnt_simd_loop = (n - i) % num_pack;
|
62
|
+
|
63
|
+
// SIMD computation.
|
64
|
+
if (p1 == p3) { // inplace case
|
65
|
+
for(; i < n - cnt_simd_loop; i += num_pack){
|
66
|
+
a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]);
|
67
|
+
b = _mm_load_<%=simd_type%>(&((dtype*)p2)[i]);
|
68
|
+
a = _mm_<%=name%>_<%=simd_type%>(a, b);
|
69
|
+
_mm_store_<%=simd_type%>(&((dtype*)p1)[i], a);
|
70
|
+
}
|
71
|
+
} else {
|
72
|
+
for(; i < n - cnt_simd_loop; i += num_pack){
|
73
|
+
a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]);
|
74
|
+
b = _mm_load_<%=simd_type%>(&((dtype*)p2)[i]);
|
75
|
+
a = _mm_<%=name%>_<%=simd_type%>(a, b);
|
76
|
+
_mm_stream_<%=simd_type%>(&((dtype*)p3)[i], a);
|
77
|
+
}
|
78
|
+
}
|
79
|
+
}
|
80
|
+
|
81
|
+
// Compute the remainder of the SIMD operation.
|
82
|
+
if (cnt_simd_loop != 0){
|
83
|
+
<% end %>
|
84
|
+
if (p1 == p3) { // inplace case
|
85
|
+
for (; i<n; i++) {
|
86
|
+
check_intdivzero(((dtype*)p2)[i]);
|
87
|
+
((dtype*)p1)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
|
88
|
+
}
|
89
|
+
} else {
|
90
|
+
for (; i<n; i++) {
|
91
|
+
check_intdivzero(((dtype*)p2)[i]);
|
92
|
+
((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],((dtype*)p2)[i]);
|
93
|
+
}
|
94
|
+
}
|
95
|
+
<% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
|
35
96
|
}
|
97
|
+
<% end %>
|
36
98
|
return;
|
37
99
|
}
|
100
|
+
|
38
101
|
if (is_aligned_step(s1,sizeof(dtype)) &&
|
39
102
|
is_aligned_step(s2,sizeof(dtype)) &&
|
40
103
|
is_aligned_step(s3,sizeof(dtype)) ) {
|
41
104
|
//<% end %>
|
42
|
-
|
105
|
+
|
106
|
+
if (s2 == 0){ // Broadcasting from scalar value.
|
43
107
|
check_intdivzero(*(dtype*)p2);
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
108
|
+
if (s1 == sizeof(dtype) &&
|
109
|
+
s3 == sizeof(dtype) ) {
|
110
|
+
<% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
|
111
|
+
// Broadcast a scalar value and use it for SIMD computation.
|
112
|
+
b = _mm_load1_<%=simd_type%>(&((dtype*)p2)[0]);
|
113
|
+
|
114
|
+
// Check number of elements. & Check same alignment.
|
115
|
+
if ((n >= num_pack) && is_same_aligned2(&((dtype*)p1)[i], &((dtype*)p3)[i], SIMD_ALIGNMENT_SIZE)){
|
116
|
+
// Calculate up to the position just before the start of SIMD computation.
|
117
|
+
cnt = get_count_of_elements_not_aligned_to_simd_size(&((dtype*)p1)[i], SIMD_ALIGNMENT_SIZE, sizeof(dtype));
|
118
|
+
if (p1 == p3) { // inplace case
|
119
|
+
for (; i < cnt; i++) {
|
120
|
+
((dtype*)p1)[i] = m_<%=name%>(((dtype*)p1)[i],*(dtype*)p2);
|
121
|
+
}
|
122
|
+
} else {
|
123
|
+
for (; i < cnt; i++) {
|
124
|
+
((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],*(dtype*)p2);
|
125
|
+
}
|
126
|
+
}
|
127
|
+
|
128
|
+
// Get the count of SIMD computation loops.
|
129
|
+
cnt_simd_loop = (n - i) % num_pack;
|
130
|
+
|
131
|
+
// SIMD computation.
|
132
|
+
if (p1 == p3) { // inplace case
|
133
|
+
for(; i < n - cnt_simd_loop; i += num_pack){
|
134
|
+
a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]);
|
135
|
+
a = _mm_<%=name%>_<%=simd_type%>(a, b);
|
136
|
+
_mm_store_<%=simd_type%>(&((dtype*)p1)[i], a);
|
137
|
+
}
|
138
|
+
} else {
|
139
|
+
for(; i < n - cnt_simd_loop; i += num_pack){
|
140
|
+
a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]);
|
141
|
+
a = _mm_<%=name%>_<%=simd_type%>(a, b);
|
142
|
+
_mm_stream_<%=simd_type%>(&((dtype*)p3)[i], a);
|
143
|
+
}
|
144
|
+
}
|
145
|
+
}
|
146
|
+
|
147
|
+
// Compute the remainder of the SIMD operation.
|
148
|
+
if (cnt_simd_loop != 0){
|
149
|
+
<% end %>
|
150
|
+
if (p1 == p3) { // inplace case
|
151
|
+
for (; i<n; i++) {
|
152
|
+
((dtype*)p1)[i] = m_<%=name%>(((dtype*)p1)[i],*(dtype*)p2);
|
153
|
+
}
|
154
|
+
} else {
|
155
|
+
for (; i<n; i++) {
|
156
|
+
((dtype*)p3)[i] = m_<%=name%>(((dtype*)p1)[i],*(dtype*)p2);
|
157
|
+
}
|
158
|
+
}
|
159
|
+
<% if is_simd and is_float and !is_complex and !is_object and %w[add sub mul div].include? name %>
|
160
|
+
}
|
161
|
+
<% end %>
|
162
|
+
} else {
|
163
|
+
for (i=0; i<n; i++) {
|
164
|
+
*(dtype*)p3 = m_<%=name%>(*(dtype*)p1,*(dtype*)p2);
|
165
|
+
p1 += s1;
|
166
|
+
p3 += s3;
|
167
|
+
}
|
168
|
+
}
|
169
|
+
} else {
|
170
|
+
if (p1 == p3) { // inplace case
|
171
|
+
for (i=0; i<n; i++) {
|
172
|
+
check_intdivzero(*(dtype*)p2);
|
173
|
+
*(dtype*)p1 = m_<%=name%>(*(dtype*)p1,*(dtype*)p2);
|
174
|
+
p1 += s1;
|
175
|
+
p2 += s2;
|
176
|
+
}
|
177
|
+
} else {
|
178
|
+
for (i=0; i<n; i++) {
|
179
|
+
check_intdivzero(*(dtype*)p2);
|
180
|
+
*(dtype*)p3 = m_<%=name%>(*(dtype*)p1,*(dtype*)p2);
|
181
|
+
p1 += s1;
|
182
|
+
p2 += s2;
|
183
|
+
p3 += s3;
|
184
|
+
}
|
185
|
+
}
|
48
186
|
}
|
187
|
+
|
49
188
|
return;
|
50
189
|
//<% if need_align %>
|
51
190
|
}
|
@@ -86,7 +225,7 @@ static VALUE
|
|
86
225
|
<% else %>
|
87
226
|
VALUE klass, v;
|
88
227
|
|
89
|
-
klass = na_upcast(
|
228
|
+
klass = na_upcast(rb_obj_class(self),rb_obj_class(other));
|
90
229
|
if (klass==cT) {
|
91
230
|
return <%=c_func%>_self(self, other);
|
92
231
|
} else {
|