numo-narray 0.9.1.2 → 0.9.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +4 -4
  2. data/Rakefile +7 -1
  3. data/ext/numo/narray/array.c +6 -6
  4. data/ext/numo/narray/data.c +8 -8
  5. data/ext/numo/narray/depend.erb +4 -4
  6. data/ext/numo/narray/extconf.rb +2 -2
  7. data/ext/numo/narray/gen/cogen.rb +13 -0
  8. data/ext/numo/narray/gen/def/dfloat.rb +1 -0
  9. data/ext/numo/narray/gen/def/sfloat.rb +1 -0
  10. data/ext/numo/narray/gen/narray_def.rb +14 -2
  11. data/ext/numo/narray/gen/spec.rb +26 -10
  12. data/ext/numo/narray/gen/tmpl/accum_binary.c +1 -1
  13. data/ext/numo/narray/gen/tmpl/accum_index.c +11 -1
  14. data/ext/numo/narray/gen/tmpl/alloc_func.c +3 -3
  15. data/ext/numo/narray/gen/tmpl/binary.c +149 -10
  16. data/ext/numo/narray/gen/tmpl/binary2.c +1 -1
  17. data/ext/numo/narray/gen/tmpl/bincount.c +1 -1
  18. data/ext/numo/narray/gen/tmpl/cast.c +1 -1
  19. data/ext/numo/narray/gen/tmpl/cond_binary.c +1 -1
  20. data/ext/numo/narray/gen/tmpl/each.c +1 -1
  21. data/ext/numo/narray/gen/tmpl/each_with_index.c +1 -1
  22. data/ext/numo/narray/gen/tmpl/extract_data.c +3 -3
  23. data/ext/numo/narray/gen/tmpl/inspect.c +1 -1
  24. data/ext/numo/narray/gen/tmpl/lib.c +5 -0
  25. data/ext/numo/narray/gen/tmpl/map_with_index.c +1 -1
  26. data/ext/numo/narray/gen/tmpl/median.c +3 -2
  27. data/ext/numo/narray/gen/tmpl/pow.c +1 -1
  28. data/ext/numo/narray/gen/tmpl/qsort.c +118 -56
  29. data/ext/numo/narray/gen/tmpl/store.c +4 -4
  30. data/ext/numo/narray/gen/tmpl/store_bit.c +4 -4
  31. data/ext/numo/narray/gen/tmpl/to_a.c +1 -1
  32. data/ext/numo/narray/gen/tmpl/unary_s.c +55 -9
  33. data/ext/numo/narray/gen/tmpl_bit/each.c +1 -1
  34. data/ext/numo/narray/gen/tmpl_bit/each_with_index.c +1 -1
  35. data/ext/numo/narray/gen/tmpl_bit/inspect.c +1 -1
  36. data/ext/numo/narray/gen/tmpl_bit/mask.c +1 -1
  37. data/ext/numo/narray/gen/tmpl_bit/to_a.c +1 -1
  38. data/ext/numo/narray/index.c +64 -37
  39. data/ext/numo/narray/math.c +4 -4
  40. data/ext/numo/narray/narray.c +54 -29
  41. data/ext/numo/narray/ndloop.c +7 -7
  42. data/ext/numo/narray/numo/narray.h +9 -2
  43. data/ext/numo/narray/numo/template.h +18 -0
  44. data/ext/numo/narray/numo/types/bit.h +5 -0
  45. data/ext/numo/narray/numo/types/complex_macro.h +5 -0
  46. data/ext/numo/narray/numo/types/float_macro.h +5 -0
  47. data/ext/numo/narray/numo/types/int_macro.h +24 -0
  48. data/ext/numo/narray/numo/types/robj_macro.h +5 -0
  49. data/ext/numo/narray/numo/types/uint_macro.h +24 -0
  50. data/ext/numo/narray/numo/types/xint_macro.h +5 -25
  51. data/ext/numo/narray/rand.c +2 -29
  52. data/ext/numo/narray/step.c +1 -28
  53. data/ext/numo/narray/struct.c +26 -22
  54. data/lib/numo/narray/extra.rb +50 -1
  55. metadata +2 -2
@@ -48,7 +48,7 @@ static VALUE
48
48
  return <%=c_func%>_self(self, other);
49
49
  <% else %>
50
50
  VALUE klass, v;
51
- klass = na_upcast(CLASS_OF(self),CLASS_OF(other));
51
+ klass = na_upcast(rb_obj_class(self),rb_obj_class(other));
52
52
  if (klass==cT) {
53
53
  return <%=c_func%>_self(self, other);
54
54
  } else {
@@ -170,7 +170,7 @@ static VALUE
170
170
  return <%=c_func%>_32(self, length);
171
171
  }
172
172
  } else {
173
- wclass = CLASS_OF(weight);
173
+ wclass = rb_obj_class(weight);
174
174
  if (wclass == numo_cSFloat) {
175
175
  return <%=c_func%>_sf(self, weight, length);
176
176
  } else {
@@ -17,7 +17,7 @@ static VALUE
17
17
  narray_t *na;
18
18
  dtype x;
19
19
 
20
- if (CLASS_OF(obj)==cT) {
20
+ if (rb_obj_class(obj)==cT) {
21
21
  return obj;
22
22
  }
23
23
  if (RTEST(rb_obj_is_kind_of(obj,rb_cNumeric))) {
@@ -44,7 +44,7 @@ static VALUE
44
44
  return <%=c_func%>_self(self, other);
45
45
  <% else %>
46
46
  VALUE klass, v;
47
- klass = na_upcast(CLASS_OF(self),CLASS_OF(other));
47
+ klass = na_upcast(rb_obj_class(self),rb_obj_class(other));
48
48
  if (klass==cT) {
49
49
  return <%=c_func%>_self(self, other);
50
50
  } else {
@@ -1,4 +1,4 @@
1
- void
1
+ static void
2
2
  <%=c_iter%>(na_loop_t *const lp)
3
3
  {
4
4
  size_t i, s1;
@@ -11,7 +11,7 @@ yield_each_with_index(dtype x, size_t *c, VALUE *a, int nd, int md)
11
11
  }
12
12
 
13
13
 
14
- void
14
+ static void
15
15
  <%=c_iter%>(na_loop_t *const lp)
16
16
  {
17
17
  size_t i, s1;
@@ -15,7 +15,7 @@ static dtype
15
15
  if (na->size != 1) {
16
16
  rb_raise(nary_eShapeError,"narray size should be 1");
17
17
  }
18
- klass = CLASS_OF(obj);
18
+ klass = rb_obj_class(obj);
19
19
  ptr = na_get_pointer_for_read(obj);
20
20
  pos = na_get_offset(obj);
21
21
  <% find_tmpl("store").definitions.select{|x| x.class==Store}.each do |x| %>
@@ -27,14 +27,14 @@ static dtype
27
27
 
28
28
  // coerce
29
29
  r = rb_funcall(obj, rb_intern("coerce_cast"), 1, cT);
30
- if (CLASS_OF(r)==cT) {
30
+ if (rb_obj_class(r)==cT) {
31
31
  return <%=c_func%>(r);
32
32
  }
33
33
  <% if is_object %>
34
34
  return obj;
35
35
  <% else %>
36
36
  rb_raise(nary_eCastError, "unknown conversion from %s to %s",
37
- rb_class2name(CLASS_OF(obj)),
37
+ rb_class2name(rb_obj_class(obj)),
38
38
  rb_class2name(cT));
39
39
  <% end %>
40
40
  }
@@ -13,7 +13,7 @@ static VALUE
13
13
  @overload inspect
14
14
  @return [String]
15
15
  */
16
- VALUE
16
+ static VALUE
17
17
  <%=c_func(0)%>(VALUE ary)
18
18
  {
19
19
  return na_ndloop_inspect(ary, <%=c_iter%>, Qnil);
@@ -14,6 +14,11 @@
14
14
 
15
15
  #define m_map(x) m_num_to_data(rb_yield(m_data_to_num(x)))
16
16
 
17
+ <% if is_simd %>
18
+ #include <emmintrin.h>
19
+ #define SIMD_ALIGNMENT_SIZE 16
20
+ <% end %>
21
+
17
22
  <% id_decl.each do |x| %>
18
23
  <%= x %>
19
24
  <% end %>
@@ -12,7 +12,7 @@ yield_map_with_index(dtype x, size_t *c, VALUE *a, int nd, int md)
12
12
  return m_num_to_data(y);
13
13
  }
14
14
 
15
- void
15
+ static void
16
16
  <%=c_iter%>(na_loop_t *const lp)
17
17
  {
18
18
  size_t i;
@@ -47,7 +47,7 @@ static void
47
47
  static VALUE
48
48
  <%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
49
49
  {
50
- VALUE reduce;
50
+ VALUE v, reduce;
51
51
  ndfunc_arg_in_t ain[2] = {{OVERWRITE,0},{sym_reduce,0}};
52
52
  ndfunc_arg_out_t aout[1] = {{INT2FIX(0),0}};
53
53
  ndfunc_t ndf = {0, NDF_HAS_LOOP|NDF_FLAT_REDUCE, 2,1, ain,aout};
@@ -60,5 +60,6 @@ static VALUE
60
60
  ndf.func = <%=c_iter%>;
61
61
  reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
62
62
  <% end %>
63
- return na_ndloop(&ndf, 2, self, reduce);
63
+ v = na_ndloop(&ndf, 2, self, reduce);
64
+ return <%=type_name%>_extract(v);
64
65
  }
@@ -67,7 +67,7 @@ static VALUE
67
67
  return <%=c_func%>_self(self,other);
68
68
  <% else %>
69
69
  VALUE klass, v;
70
- klass = na_upcast(CLASS_OF(self),CLASS_OF(other));
70
+ klass = na_upcast(rb_obj_class(self),rb_obj_class(other));
71
71
  if (klass==cT) {
72
72
  return <%=c_func%>_self(self,other);
73
73
  } else {
@@ -53,12 +53,52 @@
53
53
  #define QSORT_INCL
54
54
  #define Min(x, y) ((x) < (y) ? (x) : (y))
55
55
 
56
- #define swap(type,a,b) \
57
- do {type tmp=*(type*)(a); *(type*)(a)=*(type*)(b); *(type*)(b)=tmp;} while(0)
56
+ /*
57
+ * Qsort routine based on J. L. Bentley and M. D. McIlroy,
58
+ * "Engineering a sort function",
59
+ * Software--Practice and Experience 23 (1993) 1249-1265.
60
+ * We have modified their original by adding a check for already-sorted input,
61
+ * which seems to be a win per discussions on pgsql-hackers around 2006-03-21.
62
+ */
63
+ #define swapcode(TYPE, parmi, parmj, n) \
64
+ do { \
65
+ size_t i = (n) / sizeof (TYPE); \
66
+ TYPE *pi = (TYPE *)(void *)(parmi); \
67
+ TYPE *pj = (TYPE *)(void *)(parmj); \
68
+ do { \
69
+ TYPE t = *pi; \
70
+ *pi++ = *pj; \
71
+ *pj++ = t; \
72
+ } while (--i > 0); \
73
+ } while (0)
74
+
75
+ #define SWAPINIT(a, es) swaptype = ((char *)(a) - (char *)0) % sizeof(long) || \
76
+ (es) % sizeof(long) ? 2 : (es) == sizeof(long)? 0 : 1;
77
+
78
+ static inline void
79
+ swapfunc(a, b, n, swaptype)
80
+ char *a,
81
+ *b;
82
+ size_t n;
83
+ int swaptype;
84
+ {
85
+ if (swaptype <= 1)
86
+ swapcode(long, a, b, n);
87
+ else
88
+ swapcode(char, a, b, n);
89
+ }
58
90
 
59
- #define vecswap(type, a, b, n) if ((n)>0) swap(type,(a),(b))
91
+ #define swap(a, b) \
92
+ if (swaptype == 0) { \
93
+ long t = *(long *)(void *)(a); \
94
+ *(long *)(void *)(a) = *(long *)(void *)(b); \
95
+ *(long *)(void *)(b) = t; \
96
+ } else \
97
+ swapfunc(a, b, es, swaptype)
60
98
 
61
- #define MED3(a,b,c) \
99
+ #define vecswap(a, b, n) if ((n) > 0) swapfunc((a), (b), (size_t)(n), swaptype)
100
+
101
+ #define med3(a,b,c,_cmp) \
62
102
  (cmpgt(b,a) ? \
63
103
  (cmpgt(c,b) ? b : (cmpgt(c,a) ? c : a)) \
64
104
  : (cmpgt(b,c) ? b : (cmpgt(c,a) ? a : c)))
@@ -76,75 +116,97 @@
76
116
  <% end %>
77
117
  <% c_func(:nodef)%>
78
118
 
79
- void
119
+ static void
80
120
  <%=type_name%>_qsort<%=suffix%>(void *a, size_t n, ssize_t es)
81
121
  {
82
- char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
83
- int d, r, presorted;
122
+ char *pa,
123
+ *pb,
124
+ *pc,
125
+ *pd,
126
+ *pl,
127
+ *pm,
128
+ *pn;
129
+ int d,
130
+ r,
131
+ swaptype,
132
+ presorted;
84
133
 
85
- loop:
86
- if (n < 7) {
87
- for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)
88
- for (pl = pm; pl > (char *) a && cmpgt(pl - es, pl);
89
- pl -= es)
90
- swap(qsort_dtype, pl, pl - es);
91
- return;
92
- }
134
+ loop:SWAPINIT(a, es);
135
+ if (n < 7)
136
+ {
137
+ for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)
138
+ for (pl = pm; pl > (char *) a && cmpgt(pl - es, pl);
139
+ pl -= es)
140
+ swap(pl, pl - es);
141
+ return;
142
+ }
93
143
  presorted = 1;
94
- for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es) {
95
- if (cmpgt(pm - es, pm)) {
96
- presorted = 0;
97
- break;
144
+ for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)
145
+ {
146
+ if (cmpgt(pm - es, pm))
147
+ {
148
+ presorted = 0;
149
+ break;
150
+ }
98
151
  }
99
- }
100
152
  if (presorted)
101
153
  return;
102
154
  pm = (char *) a + (n / 2) * es;
103
- if (n > 7) {
104
- pl = (char *) a;
105
- pn = (char *) a + (n - 1) * es;
106
- if (n > 40) {
107
- d = (n / 8) * es;
108
- pl = MED3(pl, pl + d, pl + 2 * d);
109
- pm = MED3(pm - d, pm, pm + d);
110
- pn = MED3(pn - 2 * d, pn - d, pn);
155
+ if (n > 7)
156
+ {
157
+ pl = (char *) a;
158
+ pn = (char *) a + (n - 1) * es;
159
+ if (n > 40)
160
+ {
161
+ d = (n / 8) * es;
162
+ pl = med3(pl, pl + d, pl + 2 * d, cmp);
163
+ pm = med3(pm - d, pm, pm + d, cmp);
164
+ pn = med3(pn - 2 * d, pn - d, pn, cmp);
165
+ }
166
+ pm = med3(pl, pm, pn, cmp);
111
167
  }
112
- pm = MED3(pl, pm, pn);
113
- }
114
- swap(qsort_dtype, a, pm);
168
+ swap(a, pm);
115
169
  pa = pb = (char *) a + es;
116
170
  pc = pd = (char *) a + (n - 1) * es;
117
- for (;;) {
118
- while (pb <= pc && (r = cmp(pb, a)) <= 0) {
119
- if (r == 0) {
120
- swap(qsort_dtype, pa, pb);
121
- pa += es;
122
- }
171
+ for (;;)
172
+ {
173
+ while (pb <= pc && (r = cmp(pb, a)) <= 0)
174
+ {
175
+ if (r == 0)
176
+ {
177
+ swap(pa, pb);
178
+ pa += es;
179
+ }
180
+ pb += es;
181
+ }
182
+ while (pb <= pc && (r = cmp(pc, a)) >= 0)
183
+ {
184
+ if (r == 0)
185
+ {
186
+ swap(pc, pd);
187
+ pd -= es;
188
+ }
189
+ pc -= es;
190
+ }
191
+ if (pb > pc)
192
+ break;
193
+ swap(pb, pc);
123
194
  pb += es;
124
- }
125
- while (pb <= pc && (r = cmp(pc, a)) >= 0) {
126
- if (r == 0) {
127
- swap(qsort_dtype, pc, pd);
128
- pd -= es;
129
- }
130
195
  pc -= es;
131
196
  }
132
- if (pb > pc)
133
- break;
134
- swap(qsort_dtype, pb, pc);
135
- pb += es;
136
- pc -= es;
137
- }
138
197
  pn = (char *) a + n * es;
139
198
  r = Min(pa - (char *) a, pb - pa);
140
- vecswap(qsort_dtype, a, pb - r, r);
199
+ vecswap(a, pb - r, r);
141
200
  r = Min(pd - pc, pn - pd - es);
142
- vecswap(qsort_dtype, pb, pn - r, r);
201
+ vecswap(pb, pn - r, r);
143
202
  if ((r = pb - pa) > es)
144
203
  <%=type_name%>_qsort<%=suffix%>(a, r / es, es);
145
- if ((r = pd - pc) > es) {
146
- a = pn - r;
147
- n = r / es;
148
- goto loop;
149
- }
204
+ if ((r = pd - pc) > es)
205
+ {
206
+ /* Iterate rather than recurse to save stack space */
207
+ a = pn - r;
208
+ n = r / es;
209
+ goto loop;
210
+ }
211
+ /* qsort(pn - r, r / es, es, cmp);*/
150
212
  }
@@ -13,7 +13,7 @@ static VALUE
13
13
  {
14
14
  VALUE r, klass;
15
15
 
16
- klass = CLASS_OF(obj);
16
+ klass = rb_obj_class(obj);
17
17
 
18
18
  <% definitions.each do |x| %>
19
19
  if (<%=x.condition("klass")%>) {
@@ -24,7 +24,7 @@ static VALUE
24
24
 
25
25
  if (IsNArray(obj)) {
26
26
  r = rb_funcall(obj, rb_intern("coerce_cast"), 1, cT);
27
- if (CLASS_OF(r)==cT) {
27
+ if (rb_obj_class(r)==cT) {
28
28
  <%=c_func%>(self,r);
29
29
  return self;
30
30
  }
@@ -34,8 +34,8 @@ static VALUE
34
34
  robject_store_numeric(self,obj);
35
35
  <% else %>
36
36
  rb_raise(nary_eCastError, "unknown conversion from %s to %s",
37
- rb_class2name(CLASS_OF(obj)),
38
- rb_class2name(CLASS_OF(self)));
37
+ rb_class2name(rb_obj_class(obj)),
38
+ rb_class2name(rb_obj_class(self)));
39
39
  <% end %>
40
40
  return self;
41
41
  }
@@ -16,13 +16,13 @@ static void
16
16
  if (idx1) {
17
17
  for (; i--;) {
18
18
  LOAD_BIT(a2, p2+*idx2, x); idx2++;
19
- y = m_from_real(x);
19
+ y = m_from_sint(x);
20
20
  SET_DATA_INDEX(p1,idx1,dtype,y);
21
21
  }
22
22
  } else {
23
23
  for (; i--;) {
24
24
  LOAD_BIT(a2, p2+*idx2, x); idx2++;
25
- y = m_from_real(x);
25
+ y = m_from_sint(x);
26
26
  SET_DATA_STRIDE(p1,s1,dtype,y);
27
27
  }
28
28
  }
@@ -30,13 +30,13 @@ static void
30
30
  if (idx1) {
31
31
  for (; i--;) {
32
32
  LOAD_BIT(a2, p2, x); p2 += s2;
33
- y = m_from_real(x);
33
+ y = m_from_sint(x);
34
34
  SET_DATA_INDEX(p1,idx1,dtype,y);
35
35
  }
36
36
  } else {
37
37
  for (; i--;) {
38
38
  LOAD_BIT(a2, p2, x); p2 += s2;
39
- y = m_from_real(x);
39
+ y = m_from_sint(x);
40
40
  SET_DATA_STRIDE(p1,s1,dtype,y);
41
41
  }
42
42
  }
@@ -1,4 +1,4 @@
1
- void
1
+ static void
2
2
  <%=c_iter%>(na_loop_t *const lp)
3
3
  {
4
4
  size_t i, s1;
@@ -1,23 +1,36 @@
1
1
  static void
2
2
  <%=c_iter%>(na_loop_t *const lp)
3
3
  {
4
- size_t i;
4
+ size_t i=0, n;
5
5
  char *p1, *p2;
6
6
  ssize_t s1, s2;
7
7
  size_t *idx1, *idx2;
8
8
  dtype x;
9
- INIT_COUNTER(lp, i);
9
+
10
+ <% if is_simd and !is_complex and %w[sqrt].include? name %>
11
+ size_t cnt;
12
+ size_t cnt_simd_loop = -1;
13
+ <% if is_double_precision %>
14
+ __m128d a;
15
+ <% else %>
16
+ __m128 a;
17
+ <% end %>
18
+ size_t num_pack; // Number of elements packed for SIMD.
19
+ num_pack = SIMD_ALIGNMENT_SIZE / sizeof(dtype);
20
+ <% end %>
21
+ INIT_COUNTER(lp, n);
10
22
  INIT_PTR_IDX(lp, 0, p1, s1, idx1);
11
23
  INIT_PTR_IDX(lp, 1, p2, s2, idx2);
24
+
12
25
  if (idx1) {
13
26
  if (idx2) {
14
- for (; i--;) {
27
+ for (i=0; i<n; i++) {
15
28
  GET_DATA_INDEX(p1,idx1,dtype,x);
16
29
  x = m_<%=name%>(x);
17
30
  SET_DATA_INDEX(p2,idx2,dtype,x);
18
31
  }
19
32
  } else {
20
- for (; i--;) {
33
+ for (i=0; i<n; i++) {
21
34
  GET_DATA_INDEX(p1,idx1,dtype,x);
22
35
  x = m_<%=name%>(x);
23
36
  SET_DATA_STRIDE(p2,s2,dtype,x);
@@ -25,17 +38,50 @@ static void
25
38
  }
26
39
  } else {
27
40
  if (idx2) {
28
- for (; i--;) {
41
+ for (i=0; i<n; i++) {
29
42
  GET_DATA_STRIDE(p1,s1,dtype,x);
30
43
  x = m_<%=name%>(x);
31
44
  SET_DATA_INDEX(p2,idx2,dtype,x);
32
45
  }
33
46
  } else {
34
- for (; i--;) {
35
- GET_DATA_STRIDE(p1,s1,dtype,x);
36
- x = m_<%=name%>(x);
37
- SET_DATA_STRIDE(p2,s2,dtype,x);
47
+ <% if is_simd and !is_complex and %w[sqrt].include? name %>
48
+ // Check number of elements. & Check same alignment.
49
+ if ((n >= num_pack) && is_same_aligned2(&((dtype*)p1)[i], &((dtype*)p2)[i], SIMD_ALIGNMENT_SIZE)){
50
+ // Calculate up to the position just before the start of SIMD computation.
51
+ cnt = get_count_of_elements_not_aligned_to_simd_size(&((dtype*)p1)[i], SIMD_ALIGNMENT_SIZE, sizeof(dtype));
52
+ for (i=0; i < cnt; i++) {
53
+ ((dtype*)p2)[i] = m_<%=name%>(((dtype*)p1)[i]);
54
+ }
55
+
56
+ // Get the count of SIMD computation loops.
57
+ cnt_simd_loop = (n - i) % num_pack;
58
+
59
+ // SIMD computation.
60
+ if (p1 == p2) { // inplace case
61
+ for(; i < n - cnt_simd_loop; i += num_pack){
62
+ a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]);
63
+ a = _mm_<%=name%>_<%=simd_type%>(a);
64
+ _mm_store_<%=simd_type%>(&((dtype*)p1)[i], a);
65
+ }
66
+ } else {
67
+ for(; i < n - cnt_simd_loop; i += num_pack){
68
+ a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]);
69
+ a = _mm_<%=name%>_<%=simd_type%>(a);
70
+ _mm_stream_<%=simd_type%>(&((dtype*)p2)[i], a);
71
+ }
72
+ }
73
+
74
+ }
75
+ // Compute the remainder of the SIMD operation.
76
+ if (cnt_simd_loop != 0){
77
+ <% end %>
78
+ for (; i<n; i++) {
79
+ ((dtype*)p2)[i] = m_<%=name%>(((dtype*)p1)[i]);
80
+ }
81
+ <% if is_simd and !is_complex and %w[sqrt].include? name %>
38
82
  }
83
+ <% end %>
84
+ return;
39
85
  }
40
86
  }
41
87
  }