vector_sse 0.0.1.pre

@@ -0,0 +1,340 @@
+ //
+ // Copyright (c) 2015, Robert Glissmann
+ // All rights reserved.
+ //
+ // Redistribution and use in source and binary forms, with or without
+ // modification, are permitted provided that the following conditions are met:
+ //
+ // * Redistributions of source code must retain the above copyright notice, this
+ // list of conditions and the following disclaimer.
+ //
+ // * Redistributions in binary form must reproduce the above copyright notice,
+ // this list of conditions and the following disclaimer in the documentation
+ // and/or other materials provided with the distribution.
+ //
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ //
+
+ // %% license-end-token %%
+ //
+ // Author: Robert.Glissmann@gmail.com (Robert Glissmann)
+ //
+ //
+
+ #include <stdlib.h>
+ #include <string.h>
+ #include <emmintrin.h>
+ #include "vector_sse_mul.h"
+
+ #define SSE_VECTOR_WIDTH (4)
+
+ VALUE method_mat_mul_s32( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left_cols_rb, VALUE right, VALUE right_rows_rb, VALUE right_cols_rb )
+ {
+    uint32_t left_row = 0;
+    uint32_t right_col = 0;
+    uint32_t common = 0;
+    uint32_t vector_pos = 0;
+    uint32_t input_index = 0;
+    uint32_t pos = 0;
+
+    int32_t left_segment[ SSE_VECTOR_WIDTH ];
+    int32_t right_segment[ SSE_VECTOR_WIDTH ];
+    int64_t result_segment[ SSE_VECTOR_WIDTH/2 ];
+
+    __m128i* left_vec = NULL;
+    __m128i* right_vec = NULL;
+    __m128i result_vec;
+
+    VALUE result = Qnil;
+
+    int64_t* result_native = NULL;
+
+    uint32_t left_rows = NUM2INT( left_rows_rb );
+    uint32_t left_cols = NUM2INT( left_cols_rb );
+    uint32_t right_rows = NUM2INT( right_rows_rb );
+    uint32_t right_cols = NUM2INT( right_cols_rb );
+
+    uint32_t left_length = left_rows * left_cols;
+    uint32_t right_length = right_rows * right_cols;
+    uint32_t result_length = left_rows * right_cols;
+
+    int32_t* left_native = NULL;
+    int32_t* right_native = NULL;
+    int64_t* partial_native = NULL;
+    int64_t* temp = NULL;
+
+    left_native = (int32_t*) malloc( left_length * sizeof(int32_t) );
+    right_native = (int32_t*) malloc( right_length * sizeof(int32_t) );
+    result_native = (int64_t*) malloc( result_length * sizeof(int64_t) );
+    partial_native = (int64_t*) malloc( left_cols * sizeof(int64_t) );
+
+    memset( partial_native, 0, left_cols * sizeof(int64_t) );
+
+    for ( pos = 0; pos < left_length; ++pos )
+    {
+       left_native[ pos ] = NUM2INT( rb_ary_entry( left, pos ) );
+    }
+    for ( pos = 0; pos < right_length; ++pos )
+    {
+       right_native[ pos ] = NUM2INT( rb_ary_entry( right, pos ) );
+    }
+
+    for ( left_row = 0; left_row < left_rows; ++left_row )
+    {
+       for ( right_col = 0; right_col < right_cols; ++right_col )
+       {
+          for ( common = 0; common < left_cols; common += (SSE_VECTOR_WIDTH/2) )
+          {
+             memset( left_segment, 0, sizeof( left_segment ) );
+             memset( right_segment, 0, sizeof( right_segment ) );
+
+             input_index = common;
+             left_segment[ 0 ] = left_native[ left_row * left_cols + input_index ];
+             right_segment[ 0 ] = right_native[ input_index * right_cols + right_col ];
+
+             input_index = common + 1;
+             if ( input_index < left_cols )
+             {
+                left_segment[ 2 ] = left_native[ left_row * left_cols + input_index ];
+                right_segment[ 2 ] = right_native[ input_index * right_cols + right_col ];
+             }
+
+             left_vec = ( __m128i *)left_segment;
+             right_vec = ( __m128i *)right_segment;
+             result_vec = _mm_mul_epu32( *left_vec, *right_vec );
+
+             _mm_store_si128( (__m128i*)result_segment, result_vec );
+             for ( pos = 0; pos < SSE_VECTOR_WIDTH/2; ++pos )
+             {
+                if ( (common + pos) < left_cols )
+                {
+                   partial_native[ common + pos ] = result_segment[ pos ];
+                }
+             }
+          }
+
+          result_native[ left_row * right_cols + right_col ] = 0;
+          temp = &result_native[ left_row * right_cols + right_col ];
+          for ( common = 0; common < left_cols; ++common )
+          {
+             (*temp) += partial_native[ common ];
+          }
+       }
+    }
+
+    result = rb_ary_new2( result_length );
+    for ( pos = 0; pos < result_length; ++pos )
+    {
+       rb_ary_push( result, LL2NUM( result_native[ pos ] ) );
+    }
+
+    free( left_native );
+    free( right_native );
+    free( result_native );
+    free( partial_native );
+
+    return result;
+ }
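
The indexing scheme above is plain row-major matrix multiplication: result[row * right_cols + col] accumulates left[row * left_cols + k] * right[k * right_cols + col] over k. A minimal scalar sketch of the same computation, useful as a reference point for the SSE pairing (the helper name is illustrative only):

#include <stdint.h>

/* Scalar reference: same row-major indexing as method_mat_mul_s32,
   with a 64-bit accumulator to hold sums of 32-bit products. */
static void mat_mul_s32_scalar( const int32_t* left, uint32_t left_rows, uint32_t left_cols,
                                const int32_t* right, uint32_t right_cols, int64_t* result )
{
   uint32_t row, col, k;
   for ( row = 0; row < left_rows; ++row )
   {
      for ( col = 0; col < right_cols; ++col )
      {
         int64_t acc = 0;
         for ( k = 0; k < left_cols; ++k )
         {
            acc += (int64_t)left[ row * left_cols + k ] * right[ k * right_cols + col ];
         }
         result[ row * right_cols + col ] = acc;
      }
   }
}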
+
+ VALUE method_mat_mul_s64( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left_cols_rb, VALUE right, VALUE right_rows_rb, VALUE right_cols_rb )
+ {
+    uint32_t left_row = 0;
+    uint32_t right_col = 0;
+    uint32_t common = 0;
+    uint32_t vector_pos = 0;
+    uint32_t input_index = 0;
+    uint32_t pos = 0;
+
+    int64_t left_segment[ SSE_VECTOR_WIDTH ];
+    int64_t right_segment[ SSE_VECTOR_WIDTH ];
+
+    __m128i* left_vec = NULL;
+    __m128i* right_vec = NULL;
+    __m128i result_vec;
+
+    VALUE result = Qnil;
+
+    int64_t* result_native = NULL;
+
+    uint32_t left_rows = NUM2INT( left_rows_rb );
+    uint32_t left_cols = NUM2INT( left_cols_rb );
+    uint32_t right_rows = NUM2INT( right_rows_rb );
+    uint32_t right_cols = NUM2INT( right_cols_rb );
+
+    uint32_t left_length = left_rows * left_cols;
+    uint32_t right_length = right_rows * right_cols;
+    uint32_t result_length = left_rows * right_cols;
+
+    int64_t* left_native = NULL;
+    int64_t* right_native = NULL;
+    int64_t* partial_native = NULL;
+    int64_t* temp = NULL;
+
+
+    left_native = (int64_t*) malloc( left_length * sizeof(int64_t) );
+    right_native = (int64_t*) malloc( right_length * sizeof(int64_t) );
+    result_native = (int64_t*) malloc( result_length * sizeof(int64_t) );
+    partial_native = (int64_t*) malloc( left_cols * sizeof(int64_t) );
+
+    memset( partial_native, 0, left_cols * sizeof(int64_t) );
+
+    for ( pos = 0; pos < left_length; ++pos )
+    {
+       left_native[ pos ] = NUM2LL( rb_ary_entry( left, pos ) );
+    }
+    for ( pos = 0; pos < right_length; ++pos )
+    {
+       right_native[ pos ] = NUM2LL( rb_ary_entry( right, pos ) );
+    }
+
+    for ( left_row = 0; left_row < left_rows; ++left_row )
+    {
+       for ( right_col = 0; right_col < right_cols; ++right_col )
+       {
+          for ( common = 0; common < left_cols; ++common )
+          {
+             partial_native[ common ] =
+                left_native[ left_row * left_cols + common ] *
+                right_native[ common * right_cols + right_col ];
+          }
+
+          result_native[ left_row * right_cols + right_col ] = 0;
+          temp = &result_native[ left_row * right_cols + right_col ];
+          for ( common = 0; common < left_cols; ++common )
+          {
+             (*temp) += partial_native[ common ];
+          }
+       }
+    }
+
+    result = rb_ary_new2( result_length );
+    for ( pos = 0; pos < result_length; ++pos )
+    {
+       rb_ary_push( result, LL2NUM( result_native[ pos ] ) );
+    }
+
+    free( left_native );
+    free( right_native );
+    free( result_native );
+    free( partial_native );
+
+    return result;
+ }
+
+ VALUE method_mat_mul_f32( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left_cols_rb, VALUE right, VALUE right_rows_rb, VALUE right_cols_rb )
+ {
+    uint32_t left_row = 0;
+    uint32_t right_col = 0;
+    uint32_t common = 0;
+    uint32_t vector_pos = 0;
+    uint32_t input_index = 0;
+    uint32_t pos = 0;
+
+    float left_segment[ SSE_VECTOR_WIDTH ];
+    float right_segment[ SSE_VECTOR_WIDTH ];
+    float result_segment[ SSE_VECTOR_WIDTH ];
+
+    __m128 left_vec;
+    __m128 right_vec;
+    __m128 result_vec;
+
+    VALUE result = Qnil;
+
+    float* result_native = NULL;
+
+    uint32_t left_rows = NUM2UINT( left_rows_rb );
+    uint32_t left_cols = NUM2UINT( left_cols_rb );
+    uint32_t right_rows = NUM2UINT( right_rows_rb );
+    uint32_t right_cols = NUM2UINT( right_cols_rb );
+
+    uint32_t left_length = left_rows * left_cols;
+    uint32_t right_length = right_rows * right_cols;
+    uint32_t result_length = left_rows * right_cols;
+
+    float* left_native = NULL;
+    float* right_native = NULL;
+    float* partial_native = NULL;
+    float* temp = NULL;
+
+    left_native = (float*) malloc( left_length * sizeof(float) );
+    right_native = (float*) malloc( right_length * sizeof(float) );
+    result_native = (float*) malloc( result_length * sizeof(float) );
+    partial_native = (float*) malloc( left_cols * sizeof(float) );
+
+    memset( partial_native, 0, left_cols * sizeof(float) );
+
+    for ( pos = 0; pos < left_length; ++pos )
+    {
+       left_native[ pos ] = NUM2DBL( rb_ary_entry( left, pos ) );
+    }
+    for ( pos = 0; pos < right_length; ++pos )
+    {
+       right_native[ pos ] = NUM2DBL( rb_ary_entry( right, pos ) );
+    }
+
+    for ( left_row = 0; left_row < left_rows; ++left_row )
+    {
+       for ( right_col = 0; right_col < right_cols; ++right_col )
+       {
+          for ( common = 0; common < left_cols; common += SSE_VECTOR_WIDTH )
+          {
+             for ( pos = 0; pos < SSE_VECTOR_WIDTH; ++pos )
+             {
+                input_index = common + pos;
+
+                if ( input_index < left_cols )
+                {
+                   left_segment[ pos ] = left_native[ left_row * left_cols + input_index ];
+                   right_segment[ pos ] = right_native[ input_index * right_cols + right_col ];
+                }
+                else
+                {
+                   left_segment[ pos ] = 0;
+                   right_segment[ pos ] = 0;
+                }
+             }
+
+             left_vec = _mm_loadu_ps( left_segment );
+             right_vec = _mm_loadu_ps( right_segment );
+             result_vec = _mm_mul_ps( left_vec, right_vec );
+
+             _mm_storeu_ps( result_segment, result_vec );
+             for ( pos = 0; pos < SSE_VECTOR_WIDTH; ++pos )
+             {
+                if ( (common + pos) < left_cols )
+                {
+                   partial_native[ common + pos ] = result_segment[ pos ];
+                }
+             }
+          }
+
+          result_native[ left_row * right_cols + right_col ] = 0;
+          temp = &result_native[ left_row * right_cols + right_col ];
+          for ( common = 0; common < left_cols; ++common )
+          {
+             (*temp) += partial_native[ common ];
+          }
+       }
+    }
+
+    result = rb_ary_new2( result_length );
+    for ( pos = 0; pos < result_length; ++pos )
+    {
+       rb_ary_push( result, DBL2NUM( result_native[ pos ] ) );
+    }
+
+    free( left_native );
+    free( right_native );
+    free( result_native );
+    free( partial_native );
+
+    return result;
+ }
+
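
The int32 path above relies on _mm_mul_epu32, which multiplies only the unsigned 32-bit values in lanes 0 and 2 of each vector and widens them to two 64-bit products; that is why the loop loads element pairs into segment indices 0 and 2 rather than 0 and 1. A minimal standalone sketch of that lane behaviour (illustrative only; note the intrinsic is unsigned):

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>

int main( void )
{
   /* Place a0/b0 in lane 0 and a1/b1 in lane 2; lanes 1 and 3 are ignored
      by _mm_mul_epu32 (the _mm_set_epi32 argument order is high to low). */
   __m128i a = _mm_set_epi32( 0, 40000, 0, 70000 );
   __m128i b = _mm_set_epi32( 0, 50000, 0, 90000 );

   uint64_t products[ 2 ];
   _mm_storeu_si128( (__m128i*)products, _mm_mul_epu32( a, b ) );

   /* products[0] == 70000ULL * 90000ULL and products[1] == 40000ULL * 50000ULL,
      both wider than 32 bits. */
   printf( "%llu %llu\n",
           (unsigned long long)products[ 0 ],
           (unsigned long long)products[ 1 ] );
   return 0;
}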
@@ -0,0 +1,43 @@
+ //
+ // Copyright (c) 2015, Robert Glissmann
+ // All rights reserved.
+ //
+ // Redistribution and use in source and binary forms, with or without
+ // modification, are permitted provided that the following conditions are met:
+ //
+ // * Redistributions of source code must retain the above copyright notice, this
+ // list of conditions and the following disclaimer.
+ //
+ // * Redistributions in binary form must reproduce the above copyright notice,
+ // this list of conditions and the following disclaimer in the documentation
+ // and/or other materials provided with the distribution.
+ //
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ //
+
+ // %% license-end-token %%
+ //
+ // Author: Robert.Glissmann@gmail.com (Robert Glissmann)
+ //
+ //
+
+ #ifndef VECTOR_SSE_MUL_H
+ #define VECTOR_SSE_MUL_H
+
+ #include "ruby.h"
+
+ VALUE method_mat_mul_s32( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left_cols_rb, VALUE right, VALUE right_rows_rb, VALUE right_cols_rb );
+ VALUE method_mat_mul_s64( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left_cols_rb, VALUE right, VALUE right_rows_rb, VALUE right_cols_rb );
+ VALUE method_mat_mul_f32( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left_cols_rb, VALUE right, VALUE right_rows_rb, VALUE right_cols_rb );
+ VALUE method_mat_mul_f64( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left_cols_rb, VALUE right, VALUE right_rows_rb, VALUE right_cols_rb );
+
+ #endif // VECTOR_SSE_MUL_H
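
These declarations are the C entry points the extension exposes to Ruby. A rough sketch of how such functions are typically registered with the Ruby VM is shown below; the module name VectorSSE, the Ruby-facing method names, and the Init_vector_sse entry point are assumptions for illustration, not taken from this gem's sources:

#include "ruby.h"
#include "vector_sse_mul.h"

/* Hypothetical extension entry point: all names here are illustrative only. */
void Init_vector_sse( void )
{
   VALUE mod = rb_define_module( "VectorSSE" ); /* assumed module name */

   /* Each C function receives self plus six explicit arguments. */
   rb_define_singleton_method( mod, "mat_mul_s32", RUBY_METHOD_FUNC( method_mat_mul_s32 ), 6 );
   rb_define_singleton_method( mod, "mat_mul_s64", RUBY_METHOD_FUNC( method_mat_mul_s64 ), 6 );
   rb_define_singleton_method( mod, "mat_mul_f32", RUBY_METHOD_FUNC( method_mat_mul_f32 ), 6 );
}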
@@ -0,0 +1,118 @@
+ //
+ // Copyright (c) 2015, Robert Glissmann
+ // All rights reserved.
+ //
+ // Redistribution and use in source and binary forms, with or without
+ // modification, are permitted provided that the following conditions are met:
+ //
+ // * Redistributions of source code must retain the above copyright notice, this
+ // list of conditions and the following disclaimer.
+ //
+ // * Redistributions in binary form must reproduce the above copyright notice,
+ // this list of conditions and the following disclaimer in the documentation
+ // and/or other materials provided with the distribution.
+ //
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ //
+
+ // %% license-end-token %%
+ //
+ // Author: Robert.Glissmann@gmail.com (Robert Glissmann)
+ //
+ //
+
+ #include <string.h>
+ #include <emmintrin.h>
+ #include "vector_sse_sum.h"
+
+ #define TEMPLATE_SUM_S( FUNC_NAME, TYPE, OFTYPE, TYPE_SIZE, CONV_IN, CONV_OUT, EL_PER_VEC, ADD ) \
+ VALUE FUNC_NAME( VALUE self, VALUE vector ) \
+ { \
+    uint32_t length = 0; \
+    uint32_t offset = 0; \
+    uint32_t vector_pos = 0; \
+    uint32_t input_index = 0; \
+ \
+    TYPE result = 0; \
+ \
+    TYPE left_segment[ EL_PER_VEC ]; \
+    TYPE right_segment[ EL_PER_VEC ]; \
+    TYPE result_segment[ EL_PER_VEC ]; \
+    TYPE vector_segment[ EL_PER_VEC ]; \
+ \
+    __m128i left_vec; \
+    __m128i right_vec; \
+    __m128i result_vec; \
+ \
+    __m128i sign_left; \
+    __m128i sign_right; \
+    const OFTYPE OVERFLOW_MASK = ( (OFTYPE)0x1 << (TYPE_SIZE-1) ); \
+    OFTYPE overflow[ EL_PER_VEC ]; \
+    __m128i* overflow_vec = (__m128i*)overflow; \
+ \
+    Check_Type( vector, T_ARRAY ); \
+ \
+    length = RARRAY_LEN( vector ); \
+ \
+    if ( length > 0 ) \
+    { \
+       memset( &result_vec, 0, sizeof( result_vec ) ); \
+ \
+       for ( offset = 0; offset < length; offset += EL_PER_VEC ) \
+       { \
+          for ( vector_pos = 0; vector_pos < EL_PER_VEC; ++vector_pos ) \
+          { \
+             input_index = offset + vector_pos; \
+             if ( input_index < length ) \
+             { \
+                vector_segment[ vector_pos ] = CONV_IN( rb_ary_entry( vector, input_index ) ); \
+             } \
+             else \
+             { \
+                vector_segment[ vector_pos ] = 0; \
+             } \
+          } \
+ \
+          right_vec = _mm_loadu_si128( (const __m128i *)vector_segment ); \
+          left_vec = _mm_loadu_si128( &result_vec ); \
+ \
+          result_vec = ADD( left_vec, right_vec ); \
+ \
+          sign_left = _mm_xor_si128( result_vec, left_vec ); \
+          sign_right = _mm_xor_si128( result_vec, right_vec ); \
+          *overflow_vec = _mm_and_si128( sign_left, sign_right ); \
+ \
+          for ( vector_pos = 0; vector_pos < EL_PER_VEC; ++vector_pos ) \
+          { \
+             if ( ( (OFTYPE)overflow[ vector_pos ] & OVERFLOW_MASK ) ) \
+             { \
+                rb_raise( rb_eRuntimeError, "Vector addition overflow" ); \
+             } \
+          } \
+       } \
+ \
+       _mm_store_si128( (__m128i*)result_segment, result_vec ); \
+ \
+       for ( vector_pos = 0; vector_pos < EL_PER_VEC; ++vector_pos ) \
+       { \
+          result += result_segment[ vector_pos ]; \
+       } \
+    } \
+ \
+    return CONV_OUT( result ); \
+ }
+
+ TEMPLATE_SUM_S( method_vec_sum_s32, int32_t, int32_t, 32, NUM2INT, INT2NUM, 4, _mm_add_epi32 );
+ TEMPLATE_SUM_S( method_vec_sum_s64, int64_t, int64_t, 64, NUM2LL, LL2NUM, 2, _mm_add_epi64 );
+ TEMPLATE_SUM_S( method_vec_sum_f32, float, int32_t, 32, NUM2DBL, DBL2NUM, 4, _mm_add_ps );
+ TEMPLATE_SUM_S( method_vec_sum_f64, double, int64_t, 64, NUM2DBL, DBL2NUM, 2, _mm_add_pd );
+
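
The overflow test in TEMPLATE_SUM_S is the classic two's-complement check: after r = a + b, a signed overflow occurred exactly when r differs in sign from both a and b, i.e. when (r ^ a) & (r ^ b) has its sign bit set. The macro applies it lane-by-lane with _mm_xor_si128 and _mm_and_si128; a minimal scalar sketch of the same test (illustrative only):

#include <stdint.h>

/* Returns non-zero when a + b overflows the int32_t range.
   The sum is computed in unsigned arithmetic to avoid undefined behaviour. */
static int add_overflows_s32( int32_t a, int32_t b )
{
   int32_t r = (int32_t)( (uint32_t)a + (uint32_t)b );
   return ( ( (uint32_t)( r ^ a ) & (uint32_t)( r ^ b ) ) & 0x80000000u ) != 0;
}

For example, add_overflows_s32( INT32_MAX, 1 ) reports an overflow, while add_overflows_s32( -5, 3 ) does not.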
@@ -0,0 +1,43 @@
+ //
+ // Copyright (c) 2015, Robert Glissmann
+ // All rights reserved.
+ //
+ // Redistribution and use in source and binary forms, with or without
+ // modification, are permitted provided that the following conditions are met:
+ //
+ // * Redistributions of source code must retain the above copyright notice, this
+ // list of conditions and the following disclaimer.
+ //
+ // * Redistributions in binary form must reproduce the above copyright notice,
+ // this list of conditions and the following disclaimer in the documentation
+ // and/or other materials provided with the distribution.
+ //
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ //
+
+ // %% license-end-token %%
+ //
+ // Author: Robert.Glissmann@gmail.com (Robert Glissmann)
+ //
+ //
+
+ #ifndef VECTOR_SSE_SUM_H
+ #define VECTOR_SSE_SUM_H
+
+ #include "ruby.h"
+
+ VALUE method_vec_sum_s32( VALUE self, VALUE vector );
+ VALUE method_vec_sum_s64( VALUE self, VALUE vector );
+ VALUE method_vec_sum_f32( VALUE self, VALUE vector );
+ VALUE method_vec_sum_f64( VALUE self, VALUE vector );
+
+ #endif // VECTOR_SSE_SUM_H
@@ -0,0 +1,157 @@
+ //
+ // Copyright (c) 2015, Robert Glissmann
+ // All rights reserved.
+ //
+ // Redistribution and use in source and binary forms, with or without
+ // modification, are permitted provided that the following conditions are met:
+ //
+ // * Redistributions of source code must retain the above copyright notice, this
+ // list of conditions and the following disclaimer.
+ //
+ // * Redistributions in binary form must reproduce the above copyright notice,
+ // this list of conditions and the following disclaimer in the documentation
+ // and/or other materials provided with the distribution.
+ //
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ //
+
+ // %% license-end-token %%
+ //
+ // Author: Robert.Glissmann@gmail.com (Robert Glissmann)
+ //
+ //
+
+ #include <string.h>
+ #include <emmintrin.h>
+ #ifdef __SSE4_1__ // modern CPU - use SSE 4.1
+ #include <smmintrin.h>
+ #endif
+ #include "vector_sse_vec_mul.h"
+
+ #define SSE_VECTOR_WIDTH (4)
+ // #define EL_PER_VEC SSE_VECTOR_WIDTH
+
+ static inline __m128i mul_s32( const __m128i* a, const __m128i* b )
+ {
+ #ifdef __SSE4_1__
+    return _mm_mullo_epi32( *a, *b );
+ #else // old CPU - use SSE 2
+    __m128i tmp1 = _mm_mul_epu32( *a, *b ); /* mul 2,0 */
+    __m128i tmp2 = _mm_mul_epu32( _mm_srli_si128( *a, 4 ), _mm_srli_si128( *b, 4 ) ); /* mul 3,1 */
+    return _mm_unpacklo_epi32( _mm_shuffle_epi32( tmp1, _MM_SHUFFLE( 0, 0, 2, 0 ) ), _mm_shuffle_epi32( tmp2, _MM_SHUFFLE( 0, 0, 2, 0 ) ) ); /* shuffle results to [63..0] and pack */
+ #endif
+ }
+
+ static inline __m128i mul_s64( const __m128i* left_vec, const __m128i* right_vec )
+ {
+    // a * b = ( a0 * S + a1 ) * ( b0 * S + b1 )
+    //       = a0 * b0 * S^2 + ( a0 * b1 + a1 * b0 ) * S + a1 * b1
+
+    int64_t left[ 2 ];
+    int64_t right[ 2 ];
+    int64_t result[ 2 ];
+    _mm_store_si128( (__m128i*)left, *left_vec );
+    _mm_store_si128( (__m128i*)right, *right_vec );
+    result[0] = left[0] * right[0];
+    result[1] = left[1] * right[1];
+    return _mm_loadu_si128( (const __m128i *)result );
+ }
+
+ static inline __m128i mul_f32( const __m128i* a, const __m128i* b )
+ {
+    return _mm_castps_si128( _mm_mul_ps( _mm_castsi128_ps( *a ), _mm_castsi128_ps( *b ) ) );
+ }
+
+ static inline __m128i mul_f64( const __m128i* a, const __m128i* b )
+ {
+    return _mm_castpd_si128( _mm_mul_pd( _mm_castsi128_pd( *a ), _mm_castsi128_pd( *b ) ) );
+ }
+
+
+ #define TEMPLATE_VEC_MUL_S( FUNC_NAME, TYPE, TYPE_SIZE, CONV_IN, CONV_OUT, EL_PER_VEC, MULOP ) \
+ VALUE FUNC_NAME( VALUE self, VALUE left, VALUE right ) \
+ { \
+    uint32_t length = 0; \
+    uint32_t offset = 0; \
+    uint32_t vector_pos = 0; \
+    uint32_t input_index = 0; \
+ \
+    VALUE result = Qnil; \
+ \
+    TYPE left_segment[ EL_PER_VEC ]; \
+    TYPE right_segment[ EL_PER_VEC ]; \
+ \
+    __m128i left_vec; \
+    __m128i right_vec; \
+ \
+    TYPE result_segment[ EL_PER_VEC ]; \
+    __m128i result_vec; \
+ \
+    __m128i sign_left; \
+    __m128i sign_right; \
+ \
+    Check_Type( left, T_ARRAY ); \
+    Check_Type( right, T_ARRAY ); \
+ \
+    length = RARRAY_LEN( left ); \
+    result = rb_ary_new2( length ); \
+ \
+    if ( length > 0 ) \
+    { \
+       memset( &result_vec, 0, sizeof( result_vec ) ); \
+ \
+       for ( offset = 0; offset < length; offset += EL_PER_VEC ) \
+       { \
+          for ( vector_pos = 0; vector_pos < EL_PER_VEC; ++vector_pos ) \
+          { \
+             input_index = offset + vector_pos; \
+ \
+             if ( input_index < length ) \
+             { \
+                left_segment[ vector_pos ] = CONV_IN( rb_ary_entry( left, input_index ) ); \
+                right_segment[ vector_pos ] = CONV_IN( rb_ary_entry( right, input_index ) ); \
+             } \
+             else \
+             { \
+                left_segment[ vector_pos ] = 0; \
+                right_segment[ vector_pos ] = 0; \
+             } \
+          } \
+ \
+          left_vec = _mm_loadu_si128( (const __m128i *)left_segment ); \
+          right_vec = _mm_loadu_si128( (const __m128i *)right_segment ); \
+ \
+          result_vec = MULOP( &left_vec, &right_vec ); \
+ \
+          _mm_store_si128( (__m128i*)result_segment, result_vec ); \
+ \
+          for ( vector_pos = 0; vector_pos < EL_PER_VEC; ++vector_pos ) \
+          { \
+             input_index = offset + vector_pos; \
+ \
+             if ( input_index < length ) \
+             { \
+                rb_ary_push( result, CONV_OUT( result_segment[ vector_pos ] ) ); \
+             } \
+          } \
+       } \
+    } \
+ \
+    return result; \
+ }
+
+
+ TEMPLATE_VEC_MUL_S( method_vec_mul_s32, int32_t, 32, NUM2INT, INT2NUM, 4, mul_s32 );
+ TEMPLATE_VEC_MUL_S( method_vec_mul_s64, int64_t, 64, NUM2LL, LL2NUM, 2, mul_s64 );
+ TEMPLATE_VEC_MUL_S( method_vec_mul_f32, float, 32, NUM2DBL, DBL2NUM, 4, mul_f32 );
+ TEMPLATE_VEC_MUL_S( method_vec_mul_f64, double, 64, NUM2DBL, DBL2NUM, 2, mul_f64 );
+
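
On pre-SSE4.1 hardware, mul_s32 above emulates _mm_mullo_epi32 with two widening _mm_mul_epu32 multiplies (lanes 0/2, then lanes 1/3 after a 4-byte shift) and shuffles the low 32 bits of each 64-bit product back into place; those low halves are also correct for signed operands in two's complement. A minimal sketch that checks this emulation against plain scalar multiplication (illustrative only):

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>

/* SSE2 emulation of a lane-wise 32-bit multiply, following mul_s32 above. */
static __m128i mullo_epi32_sse2( __m128i a, __m128i b )
{
   __m128i even = _mm_mul_epu32( a, b );                    /* products of lanes 0 and 2 */
   __m128i odd  = _mm_mul_epu32( _mm_srli_si128( a, 4 ),
                                 _mm_srli_si128( b, 4 ) );  /* products of lanes 1 and 3 */
   return _mm_unpacklo_epi32( _mm_shuffle_epi32( even, _MM_SHUFFLE( 0, 0, 2, 0 ) ),
                              _mm_shuffle_epi32( odd,  _MM_SHUFFLE( 0, 0, 2, 0 ) ) );
}

int main( void )
{
   int32_t a[ 4 ] = { 3, -7, 100000, -2 };
   int32_t b[ 4 ] = { 5, 11, 3, 123456 };
   int32_t out[ 4 ];
   int i;

   __m128i va = _mm_loadu_si128( (const __m128i*)a );
   __m128i vb = _mm_loadu_si128( (const __m128i*)b );
   _mm_storeu_si128( (__m128i*)out, mullo_epi32_sse2( va, vb ) );

   for ( i = 0; i < 4; ++i )
   {
      printf( "%d * %d = %d (scalar %d)\n", a[ i ], b[ i ], out[ i ], a[ i ] * b[ i ] );
   }
   return 0;
}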