vector_sse 0.0.1.pre
- checksums.yaml +7 -0
- data/.gitignore +4 -0
- data/LICENSE.txt +23 -0
- data/README.md +100 -0
- data/Rakefile +9 -0
- data/ext/vector_sse/.gitignore +1 -0
- data/ext/vector_sse/extconf.rb +13 -0
- data/ext/vector_sse/vector_sse.c +80 -0
- data/ext/vector_sse/vector_sse_add.c +127 -0
- data/ext/vector_sse/vector_sse_add.h +48 -0
- data/ext/vector_sse/vector_sse_mul.c +340 -0
- data/ext/vector_sse/vector_sse_mul.h +43 -0
- data/ext/vector_sse/vector_sse_sum.c +118 -0
- data/ext/vector_sse/vector_sse_sum.h +43 -0
- data/ext/vector_sse/vector_sse_vec_mul.c +157 -0
- data/ext/vector_sse/vector_sse_vec_mul.h +43 -0
- data/lib/.gitignore +1 -0
- data/lib/vector_sse.rb +475 -0
- data/spec/vector_mat_spec.rb +374 -0
- data/spec/vector_vec_spec.rb +150 -0
- data/vector_sse.gemspec +19 -0
- metadata +93 -0
data/ext/vector_sse/vector_sse_mul.c  @@ -0,0 +1,340 @@

//
// Copyright (c) 2015, Robert Glissmann
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice, this
//   list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// %% license-end-token %%
//
// Author: Robert.Glissmann@gmail.com (Robert Glissmann)
//

#include <stdlib.h>
#include <string.h>
#include <emmintrin.h>
#include "vector_sse_mul.h"

#define SSE_VECTOR_WIDTH (4)

// Matrix multiply for signed 32-bit elements. _mm_mul_epu32 multiplies the
// 32-bit values in lanes 0 and 2 and widens them to two 64-bit products, so
// each SSE step covers two entries of the inner (dot-product) loop.
VALUE method_mat_mul_s32( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left_cols_rb, VALUE right, VALUE right_rows_rb, VALUE right_cols_rb )
{
   uint32_t left_row = 0;
   uint32_t right_col = 0;
   uint32_t common = 0;
   uint32_t vector_pos = 0;
   uint32_t input_index = 0;
   uint32_t pos = 0;

   int32_t left_segment[ SSE_VECTOR_WIDTH ];
   int32_t right_segment[ SSE_VECTOR_WIDTH ];
   int64_t result_segment[ SSE_VECTOR_WIDTH / 2 ];

   __m128i* left_vec = NULL;
   __m128i* right_vec = NULL;
   __m128i result_vec;

   VALUE result = Qnil;

   int64_t* result_native = NULL;

   uint32_t left_rows = NUM2INT( left_rows_rb );
   uint32_t left_cols = NUM2INT( left_cols_rb );
   uint32_t right_rows = NUM2INT( right_rows_rb );
   uint32_t right_cols = NUM2INT( right_cols_rb );

   uint32_t left_length = left_rows * left_cols;
   uint32_t right_length = right_rows * right_cols;
   uint32_t result_length = left_rows * right_cols;

   int32_t* left_native = NULL;
   int32_t* right_native = NULL;
   int64_t* partial_native = NULL;
   int64_t* temp = NULL;

   left_native = (int32_t*) malloc( left_length * sizeof(int32_t) );
   right_native = (int32_t*) malloc( right_length * sizeof(int32_t) );
   result_native = (int64_t*) malloc( result_length * sizeof(int64_t) );
   partial_native = (int64_t*) malloc( left_cols * sizeof(int64_t) );

   memset( partial_native, 0, left_cols * sizeof(int64_t) );

   // Flatten the Ruby arrays into native buffers.
   for ( pos = 0; pos < left_length; ++pos )
   {
      left_native[ pos ] = NUM2INT( rb_ary_entry( left, pos ) );
   }
   for ( pos = 0; pos < right_length; ++pos )
   {
      right_native[ pos ] = NUM2INT( rb_ary_entry( right, pos ) );
   }

   for ( left_row = 0; left_row < left_rows; ++left_row )
   {
      for ( right_col = 0; right_col < right_cols; ++right_col )
      {
         for ( common = 0; common < left_cols; common += (SSE_VECTOR_WIDTH / 2) )
         {
            memset( left_segment, 0, sizeof( left_segment ) );
            memset( right_segment, 0, sizeof( right_segment ) );

            // Operand pairs go into lanes 0 and 2, the lanes _mm_mul_epu32 reads.
            input_index = common;
            left_segment[ 0 ] = left_native[ left_row * left_cols + input_index ];
            right_segment[ 0 ] = right_native[ input_index * right_cols + right_col ];

            input_index = common + 1;
            if ( input_index < left_cols )
            {
               left_segment[ 2 ] = left_native[ left_row * left_cols + input_index ];
               right_segment[ 2 ] = right_native[ input_index * right_cols + right_col ];
            }

            left_vec = (__m128i*)left_segment;
            right_vec = (__m128i*)right_segment;
            result_vec = _mm_mul_epu32( *left_vec, *right_vec );

            _mm_store_si128( (__m128i*)result_segment, result_vec );
            for ( pos = 0; pos < SSE_VECTOR_WIDTH / 2; ++pos )
            {
               if ( (common + pos) < left_cols )
               {
                  partial_native[ common + pos ] = result_segment[ pos ];
               }
            }
         }

         // Accumulate the partial products into the output cell.
         result_native[ left_row * right_cols + right_col ] = 0;
         temp = &result_native[ left_row * right_cols + right_col ];
         for ( common = 0; common < left_cols; ++common )
         {
            (*temp) += partial_native[ common ];
         }
      }
   }

   result = rb_ary_new2( result_length );
   for ( pos = 0; pos < result_length; ++pos )
   {
      rb_ary_push( result, LL2NUM( result_native[ pos ] ) );   // 64-bit accumulators
   }

   free( left_native );
   free( right_native );
   free( result_native );
   free( partial_native );

   return result;
}

// Matrix multiply for signed 64-bit elements (scalar; SSE2 has no 64x64-bit multiply).
VALUE method_mat_mul_s64( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left_cols_rb, VALUE right, VALUE right_rows_rb, VALUE right_cols_rb )
{
   uint32_t left_row = 0;
   uint32_t right_col = 0;
   uint32_t common = 0;
   uint32_t vector_pos = 0;
   uint32_t input_index = 0;
   uint32_t pos = 0;

   int64_t left_segment[ SSE_VECTOR_WIDTH ];
   int64_t right_segment[ SSE_VECTOR_WIDTH ];

   __m128i* left_vec = NULL;
   __m128i* right_vec = NULL;
   __m128i result_vec;

   VALUE result = Qnil;

   int64_t* result_native = NULL;

   uint32_t left_rows = NUM2INT( left_rows_rb );
   uint32_t left_cols = NUM2INT( left_cols_rb );
   uint32_t right_rows = NUM2INT( right_rows_rb );
   uint32_t right_cols = NUM2INT( right_cols_rb );

   uint32_t left_length = left_rows * left_cols;
   uint32_t right_length = right_rows * right_cols;
   uint32_t result_length = left_rows * right_cols;

   int64_t* left_native = NULL;
   int64_t* right_native = NULL;
   int64_t* partial_native = NULL;
   int64_t* temp = NULL;

   left_native = (int64_t*) malloc( left_length * sizeof(int64_t) );
   right_native = (int64_t*) malloc( right_length * sizeof(int64_t) );
   result_native = (int64_t*) malloc( result_length * sizeof(int64_t) );
   partial_native = (int64_t*) malloc( left_cols * sizeof(int64_t) );

   memset( partial_native, 0, left_cols * sizeof(int64_t) );

   for ( pos = 0; pos < left_length; ++pos )
   {
      left_native[ pos ] = NUM2LL( rb_ary_entry( left, pos ) );
   }
   for ( pos = 0; pos < right_length; ++pos )
   {
      right_native[ pos ] = NUM2LL( rb_ary_entry( right, pos ) );
   }

   for ( left_row = 0; left_row < left_rows; ++left_row )
   {
      for ( right_col = 0; right_col < right_cols; ++right_col )
      {
         for ( common = 0; common < left_cols; ++common )
         {
            partial_native[ common ] =
               left_native[ left_row * left_cols + common ] *
               right_native[ common * right_cols + right_col ];
         }

         result_native[ left_row * right_cols + right_col ] = 0;
         temp = &result_native[ left_row * right_cols + right_col ];
         for ( common = 0; common < left_cols; ++common )
         {
            (*temp) += partial_native[ common ];
         }
      }
   }

   result = rb_ary_new2( result_length );
   for ( pos = 0; pos < result_length; ++pos )
   {
      rb_ary_push( result, LL2NUM( result_native[ pos ] ) );
   }

   free( left_native );
   free( right_native );
   free( result_native );
   free( partial_native );

   return result;
}

// Matrix multiply for single-precision floats: four products per _mm_mul_ps.
VALUE method_mat_mul_f32( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left_cols_rb, VALUE right, VALUE right_rows_rb, VALUE right_cols_rb )
{
   uint32_t left_row = 0;
   uint32_t right_col = 0;
   uint32_t common = 0;
   uint32_t vector_pos = 0;
   uint32_t input_index = 0;
   uint32_t pos = 0;

   float left_segment[ SSE_VECTOR_WIDTH ];
   float right_segment[ SSE_VECTOR_WIDTH ];
   float result_segment[ SSE_VECTOR_WIDTH ];

   __m128* left_vec = NULL;
   __m128* right_vec = NULL;
   __m128 result_vec;

   VALUE result = Qnil;

   float* result_native = NULL;

   uint32_t left_rows = NUM2UINT( left_rows_rb );
   uint32_t left_cols = NUM2UINT( left_cols_rb );
   uint32_t right_rows = NUM2UINT( right_rows_rb );
   uint32_t right_cols = NUM2UINT( right_cols_rb );

   uint32_t left_length = left_rows * left_cols;
   uint32_t right_length = right_rows * right_cols;
   uint32_t result_length = left_rows * right_cols;

   float* left_native = NULL;
   float* right_native = NULL;
   float* partial_native = NULL;
   float* temp = NULL;

   left_native = (float*) malloc( left_length * sizeof(float) );
   right_native = (float*) malloc( right_length * sizeof(float) );
   result_native = (float*) malloc( result_length * sizeof(float) );
   partial_native = (float*) malloc( left_cols * sizeof(float) );

   memset( partial_native, 0, left_cols * sizeof(float) );

   for ( pos = 0; pos < left_length; ++pos )
   {
      left_native[ pos ] = NUM2DBL( rb_ary_entry( left, pos ) );
   }
   for ( pos = 0; pos < right_length; ++pos )
   {
      right_native[ pos ] = NUM2DBL( rb_ary_entry( right, pos ) );
   }

   for ( left_row = 0; left_row < left_rows; ++left_row )
   {
      for ( right_col = 0; right_col < right_cols; ++right_col )
      {
         for ( common = 0; common < left_cols; common += SSE_VECTOR_WIDTH )
         {
            // Gather up to four row/column operand pairs, zero-padding the tail.
            for ( pos = 0; pos < SSE_VECTOR_WIDTH; ++pos )
            {
               input_index = common + pos;

               if ( input_index < left_cols )
               {
                  left_segment[ pos ] = left_native[ left_row * left_cols + input_index ];
                  right_segment[ pos ] = right_native[ input_index * right_cols + right_col ];
               }
               else
               {
                  left_segment[ pos ] = 0;
                  right_segment[ pos ] = 0;
               }
            }

            left_vec = (__m128*)left_segment;
            right_vec = (__m128*)right_segment;
            result_vec = _mm_mul_ps( *left_vec, *right_vec );

            _mm_storeu_ps( result_segment, result_vec );
            for ( pos = 0; pos < SSE_VECTOR_WIDTH; ++pos )
            {
               if ( (common + pos) < left_cols )
               {
                  partial_native[ common + pos ] = result_segment[ pos ];
               }
            }
         }

         result_native[ left_row * right_cols + right_col ] = 0;
         temp = &result_native[ left_row * right_cols + right_col ];
         for ( common = 0; common < left_cols; ++common )
         {
            (*temp) += partial_native[ common ];
         }
      }
   }

   result = rb_ary_new2( result_length );
   for ( pos = 0; pos < result_length; ++pos )
   {
      rb_ary_push( result, DBL2NUM( result_native[ pos ] ) );
   }

   free( left_native );
   free( right_native );
   free( result_native );
   free( partial_native );

   return result;
}
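The reason method_mat_mul_s32 writes its operands into segment slots 0 and 2 is the lane behaviour of _mm_mul_epu32: the instruction multiplies only the 32-bit values in lanes 0 and 2 of each register and widens them to two 64-bit products. A minimal standalone check (not part of the gem; assumes an SSE2-capable x86 toolchain):

// Standalone sketch: _mm_mul_epu32 reads lanes 0 and 2 and produces two
// 64-bit products, which is why the gem loads operand pairs into those lanes.
#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>

int main( void )
{
   int32_t a[4] = { 7, 0, 9, 0 };   // operands in lanes 0 and 2; lanes 1 and 3 unused
   int32_t b[4] = { 6, 0, 5, 0 };
   int64_t p[2];

   __m128i va = _mm_loadu_si128( (const __m128i*)a );
   __m128i vb = _mm_loadu_si128( (const __m128i*)b );
   _mm_storeu_si128( (__m128i*)p, _mm_mul_epu32( va, vb ) );

   printf( "%lld %lld\n", (long long)p[0], (long long)p[1] );  // prints: 42 45
   return 0;
}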
data/ext/vector_sse/vector_sse_mul.h  @@ -0,0 +1,43 @@

// Copyright (c) 2015, Robert Glissmann. All rights reserved.
// (BSD 2-clause license text and author note identical to vector_sse_mul.c above.)

#ifndef VECTOR_SSE_MUL_H
#define VECTOR_SSE_MUL_H

#include "ruby.h"

VALUE method_mat_mul_s32( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left_cols_rb, VALUE right, VALUE right_rows_rb, VALUE right_cols_rb );
VALUE method_mat_mul_s64( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left_cols_rb, VALUE right, VALUE right_rows_rb, VALUE right_cols_rb );
VALUE method_mat_mul_f32( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left_cols_rb, VALUE right, VALUE right_rows_rb, VALUE right_cols_rb );
VALUE method_mat_mul_f64( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left_cols_rb, VALUE right, VALUE right_rows_rb, VALUE right_cols_rb );

#endif // VECTOR_SSE_MUL_H
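These declarations are the C entry points that the extension's init file (vector_sse.c, listed above but not reproduced in this excerpt) registers with the Ruby runtime. A hedged sketch of what such wiring typically looks like; the module and method names below are assumptions for illustration, not the gem's actual API:

// Hypothetical binding sketch; the real registration lives in vector_sse.c.
#include "ruby.h"
#include "vector_sse_mul.h"

void Init_vector_sse( void )
{
   VALUE mod = rb_define_module( "VectorSSE" );   // assumed module name

   // Expose the C matrix-multiply entry point as a module function taking
   // (left, left_rows, left_cols, right, right_rows, right_cols).
   rb_define_module_function( mod, "mat_mul_s32",
                              RUBY_METHOD_FUNC( method_mat_mul_s32 ), 6 );
}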
data/ext/vector_sse/vector_sse_sum.c  @@ -0,0 +1,118 @@

// Copyright (c) 2015, Robert Glissmann. All rights reserved.
// (BSD 2-clause license text and author note identical to vector_sse_mul.c above.)

#include <string.h>
#include <emmintrin.h>
#include "vector_sse_sum.h"

// Generic reduction: accumulate the Ruby array EL_PER_VEC elements at a time
// into one SSE register, then fold the register's lanes into a single scalar.
#define TEMPLATE_SUM_S( FUNC_NAME, TYPE, OFTYPE, TYPE_SIZE, CONV_IN, CONV_OUT, EL_PER_VEC, ADD ) \
VALUE FUNC_NAME( VALUE self, VALUE vector ) \
{ \
   uint32_t length = 0; \
   uint32_t offset = 0; \
   uint32_t vector_pos = 0; \
   uint32_t input_index = 0; \
\
   TYPE result = 0; \
\
   TYPE left_segment[ EL_PER_VEC ]; \
   TYPE right_segment[ EL_PER_VEC ]; \
   TYPE result_segment[ EL_PER_VEC ]; \
   TYPE vector_segment[ EL_PER_VEC ]; \
\
   __m128i left_vec; \
   __m128i right_vec; \
   __m128i result_vec; \
\
   __m128i sign_left; \
   __m128i sign_right; \
   const OFTYPE OVERFLOW_MASK = ( (OFTYPE)0x1 << (TYPE_SIZE - 1) ); \
   OFTYPE overflow[ EL_PER_VEC ]; \
   __m128i* overflow_vec = (__m128i*)overflow; \
\
   Check_Type( vector, T_ARRAY ); \
\
   length = RARRAY_LEN( vector ); \
\
   if ( length > 0 ) \
   { \
      memset( &result_vec, 0, sizeof( result_vec ) ); \
\
      for ( offset = 0; offset < length; offset += EL_PER_VEC ) \
      { \
         /* Gather the next EL_PER_VEC elements, zero-padding past the end. */ \
         for ( vector_pos = 0; vector_pos < EL_PER_VEC; ++vector_pos ) \
         { \
            input_index = offset + vector_pos; \
            if ( input_index < length ) \
            { \
               vector_segment[ vector_pos ] = CONV_IN( rb_ary_entry( vector, input_index ) ); \
            } \
            else \
            { \
               vector_segment[ vector_pos ] = 0; \
            } \
         } \
\
         right_vec = _mm_loadu_si128( (const __m128i *)vector_segment ); \
         left_vec = _mm_loadu_si128( &result_vec ); \
\
         result_vec = ADD( left_vec, right_vec ); \
\
         /* Two's-complement overflow test: a lane overflowed iff the sum's */ \
         /* sign differs from the signs of both addends. */ \
         sign_left = _mm_xor_si128( result_vec, left_vec ); \
         sign_right = _mm_xor_si128( result_vec, right_vec ); \
         *overflow_vec = _mm_and_si128( sign_left, sign_right ); \
\
         for ( vector_pos = 0; vector_pos < EL_PER_VEC; ++vector_pos ) \
         { \
            if ( ( (OFTYPE)overflow[ vector_pos ] & OVERFLOW_MASK ) ) \
            { \
               rb_raise( rb_eRuntimeError, "Vector addition overflow" ); \
            } \
         } \
      } \
\
      _mm_store_si128( (__m128i*)result_segment, result_vec ); \
\
      for ( vector_pos = 0; vector_pos < EL_PER_VEC; ++vector_pos ) \
      { \
         result += result_segment[ vector_pos ]; \
      } \
   } \
\
   return CONV_OUT( result ); \
}

TEMPLATE_SUM_S( method_vec_sum_s32, int32_t, int32_t, 32, NUM2INT, INT2NUM, 4, _mm_add_epi32 );
TEMPLATE_SUM_S( method_vec_sum_s64, int64_t, int64_t, 64, NUM2LL, LL2NUM, 2, _mm_add_epi64 );
TEMPLATE_SUM_S( method_vec_sum_f32, float, int32_t, 32, NUM2DBL, DBL2NUM, 4, _mm_add_ps );
TEMPLATE_SUM_S( method_vec_sum_f64, double, int64_t, 64, NUM2DBL, DBL2NUM, 2, _mm_add_pd );
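The xor/and sequence in TEMPLATE_SUM_S is the standard two's-complement overflow test: a signed addition overflowed exactly when the result's sign bit differs from the sign bits of both addends. A scalar sketch of the same test (not part of the gem):

// Scalar equivalent of the per-lane overflow check used above.
#include <assert.h>
#include <stdint.h>

static int add_overflows_s32( int32_t a, int32_t b )
{
   // Wrap modulo 2^32 via unsigned arithmetic, then inspect the sign bits.
   int32_t sum = (int32_t)( (uint32_t)a + (uint32_t)b );
   return (int)( (uint32_t)( (sum ^ a) & (sum ^ b) ) >> 31 );
}

int main( void )
{
   assert( !add_overflows_s32( 1000, 2000 ) );
   assert(  add_overflows_s32( INT32_MAX, 1 ) );
   assert(  add_overflows_s32( INT32_MIN, -1 ) );
   return 0;
}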
data/ext/vector_sse/vector_sse_sum.h  @@ -0,0 +1,43 @@

// Copyright (c) 2015, Robert Glissmann. All rights reserved.
// (BSD 2-clause license text and author note identical to vector_sse_mul.c above.)

#ifndef VECTOR_SSE_SUM_H
#define VECTOR_SSE_SUM_H

#include "ruby.h"

VALUE method_vec_sum_s32( VALUE self, VALUE vector );
VALUE method_vec_sum_s64( VALUE self, VALUE vector );
VALUE method_vec_sum_f32( VALUE self, VALUE vector );
VALUE method_vec_sum_f64( VALUE self, VALUE vector );

#endif // VECTOR_SSE_SUM_H
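The 4 and 2 passed as EL_PER_VEC in the vector_sse_sum.c instantiations above follow directly from the 16-byte width of an SSE register. A minimal sketch (not part of the gem) that checks the arithmetic at compile time:

// 16 bytes per __m128i register: 4 x 32-bit lanes or 2 x 64-bit lanes.
#include <stdint.h>
#include <emmintrin.h>

#define LANES( type ) ( sizeof(__m128i) / sizeof(type) )

typedef char assert_s32_lanes[ LANES( int32_t ) == 4 ? 1 : -1 ];  // 4 lanes
typedef char assert_s64_lanes[ LANES( int64_t ) == 2 ? 1 : -1 ];  // 2 lanes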
data/ext/vector_sse/vector_sse_vec_mul.c  @@ -0,0 +1,157 @@

// Copyright (c) 2015, Robert Glissmann. All rights reserved.
// (BSD 2-clause license text and author note identical to vector_sse_mul.c above.)

#include <string.h>
#include <emmintrin.h>
#ifdef __SSE4_1__ // modern CPU - use SSE 4.1
#include <smmintrin.h>
#endif
#include "vector_sse_vec_mul.h"

#define SSE_VECTOR_WIDTH (4)
// #define EL_PER_VEC SSE_VECTOR_WIDTH

// Lane-wise 32-bit multiply: native _mm_mullo_epi32 on SSE4.1, emulated on SSE2.
static inline __m128i mul_s32( const __m128i* a, const __m128i* b )
{
#ifdef __SSE4_1__
   return _mm_mullo_epi32( *a, *b );
#else // old CPU - use SSE 2
   __m128i tmp1 = _mm_mul_epu32( *a, *b );                                            /* mul 2,0 */
   __m128i tmp2 = _mm_mul_epu32( _mm_srli_si128( *a, 4 ), _mm_srli_si128( *b, 4 ) );  /* mul 3,1 */
   return _mm_unpacklo_epi32( _mm_shuffle_epi32( tmp1, _MM_SHUFFLE( 0, 0, 2, 0 ) ),
                              _mm_shuffle_epi32( tmp2, _MM_SHUFFLE( 0, 0, 2, 0 ) ) ); /* shuffle results to [63..0] and pack */
#endif
}

// Lane-wise 64-bit multiply (scalar fallback; SSE2 has no 64x64-bit multiply).
static inline __m128i mul_s64( const __m128i* left_vec, const __m128i* right_vec )
{
   // a * b = ( a0 * S + a1 ) * ( b0 * S + b1 )
   //       = a0 * b0 * S^2 + ( a0 * b1 + a1 * b0 ) * S + a1 * b1

   int64_t left[ 2 ];
   int64_t right[ 2 ];
   int64_t result[ 2 ];
   _mm_store_si128( (__m128i*)left, *left_vec );
   _mm_store_si128( (__m128i*)right, *right_vec );
   result[ 0 ] = left[ 0 ] * right[ 0 ];
   result[ 1 ] = left[ 1 ] * right[ 1 ];
   return _mm_loadu_si128( (const __m128i *)result );
}

static inline __m128i mul_f32( const __m128i* a, const __m128i* b )
{
   // Reinterpret the integer registers as packed floats, multiply, reinterpret back.
   return _mm_castps_si128( _mm_mul_ps( _mm_castsi128_ps( *a ), _mm_castsi128_ps( *b ) ) );
}

static inline __m128i mul_f64( const __m128i* a, const __m128i* b )
{
   return _mm_castpd_si128( _mm_mul_pd( _mm_castsi128_pd( *a ), _mm_castsi128_pd( *b ) ) );
}

// Generic element-wise product: load EL_PER_VEC operands at a time, multiply
// with MULOP, and push the converted results onto a Ruby array.
#define TEMPLATE_VEC_MUL_S( FUNC_NAME, TYPE, TYPE_SIZE, CONV_IN, CONV_OUT, EL_PER_VEC, MULOP ) \
VALUE FUNC_NAME( VALUE self, VALUE left, VALUE right ) \
{ \
   uint32_t length = 0; \
   uint32_t offset = 0; \
   uint32_t vector_pos = 0; \
   uint32_t input_index = 0; \
\
   VALUE result = Qnil; \
\
   TYPE left_segment[ EL_PER_VEC ]; \
   TYPE right_segment[ EL_PER_VEC ]; \
\
   __m128i left_vec; \
   __m128i right_vec; \
\
   TYPE result_segment[ EL_PER_VEC ]; \
   __m128i result_vec; \
\
   __m128i sign_left; \
   __m128i sign_right; \
\
   Check_Type( left, T_ARRAY ); \
   Check_Type( right, T_ARRAY ); \
\
   length = RARRAY_LEN( left ); \
   result = rb_ary_new2( length ); \
\
   if ( length > 0 ) \
   { \
      memset( &result_vec, 0, sizeof( result_vec ) ); \
\
      for ( offset = 0; offset < length; offset += EL_PER_VEC ) \
      { \
         /* Gather the next EL_PER_VEC element pairs, zero-padding past the end. */ \
         for ( vector_pos = 0; vector_pos < EL_PER_VEC; ++vector_pos ) \
         { \
            input_index = offset + vector_pos; \
\
            if ( input_index < length ) \
            { \
               left_segment[ vector_pos ] = CONV_IN( rb_ary_entry( left, input_index ) ); \
               right_segment[ vector_pos ] = CONV_IN( rb_ary_entry( right, input_index ) ); \
            } \
            else \
            { \
               left_segment[ vector_pos ] = 0; \
               right_segment[ vector_pos ] = 0; \
            } \
         } \
\
         left_vec = _mm_loadu_si128( (const __m128i *)left_segment ); \
         right_vec = _mm_loadu_si128( (const __m128i *)right_segment ); \
\
         result_vec = MULOP( &left_vec, &right_vec ); \
\
         _mm_store_si128( (__m128i*)result_segment, result_vec ); \
\
         for ( vector_pos = 0; vector_pos < EL_PER_VEC; ++vector_pos ) \
         { \
            input_index = offset + vector_pos; \
\
            if ( input_index < length ) \
            { \
               rb_ary_push( result, CONV_OUT( result_segment[ vector_pos ] ) ); \
            } \
         } \
      } \
   } \
\
   return result; \
}

TEMPLATE_VEC_MUL_S( method_vec_mul_s32, int32_t, 32, NUM2INT, INT2NUM, 4, mul_s32 );
TEMPLATE_VEC_MUL_S( method_vec_mul_s64, int64_t, 64, NUM2LL, LL2NUM, 2, mul_s64 );
TEMPLATE_VEC_MUL_S( method_vec_mul_f32, float, 32, NUM2DBL, DBL2NUM, 4, mul_f32 );
TEMPLATE_VEC_MUL_S( method_vec_mul_f64, double, 64, NUM2DBL, DBL2NUM, 2, mul_f64 );
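The SSE2 branch of mul_s32 is a common emulation of a four-lane 32-bit low multiply: two _mm_mul_epu32 calls produce the even- and odd-lane 64-bit products, and the shuffle/unpack step keeps only their low 32 bits, which match a per-lane 32-bit multiply even for negative inputs. A standalone check (not part of the gem; assumes an SSE2-capable x86 toolchain):

// Standalone sketch comparing the SSE2 fallback against scalar products.
#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>

static __m128i mullo_epi32_sse2( __m128i a, __m128i b )
{
   __m128i even = _mm_mul_epu32( a, b );                      // products of lanes 0 and 2
   __m128i odd  = _mm_mul_epu32( _mm_srli_si128( a, 4 ),
                                 _mm_srli_si128( b, 4 ) );    // products of lanes 1 and 3
   // Keep the low 32 bits of each product and re-interleave into lane order.
   return _mm_unpacklo_epi32( _mm_shuffle_epi32( even, _MM_SHUFFLE( 0, 0, 2, 0 ) ),
                              _mm_shuffle_epi32( odd,  _MM_SHUFFLE( 0, 0, 2, 0 ) ) );
}

int main( void )
{
   int32_t a[4] = { 3, -4, 12345, 7 };
   int32_t b[4] = { 9, 5, 678, -2 };
   int32_t out[4];
   int i;

   _mm_storeu_si128( (__m128i*)out,
                     mullo_epi32_sse2( _mm_loadu_si128( (const __m128i*)a ),
                                       _mm_loadu_si128( (const __m128i*)b ) ) );

   for ( i = 0; i < 4; ++i )
   {
      printf( "%d * %d = %d (scalar %d)\n", a[i], b[i], out[i], a[i] * b[i] );
   }
   return 0;
}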