vector_sse 0.0.1.pre
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +4 -0
- data/LICENSE.txt +23 -0
- data/README.md +100 -0
- data/Rakefile +9 -0
- data/ext/vector_sse/.gitignore +1 -0
- data/ext/vector_sse/extconf.rb +13 -0
- data/ext/vector_sse/vector_sse.c +80 -0
- data/ext/vector_sse/vector_sse_add.c +127 -0
- data/ext/vector_sse/vector_sse_add.h +48 -0
- data/ext/vector_sse/vector_sse_mul.c +340 -0
- data/ext/vector_sse/vector_sse_mul.h +43 -0
- data/ext/vector_sse/vector_sse_sum.c +118 -0
- data/ext/vector_sse/vector_sse_sum.h +43 -0
- data/ext/vector_sse/vector_sse_vec_mul.c +157 -0
- data/ext/vector_sse/vector_sse_vec_mul.h +43 -0
- data/lib/.gitignore +1 -0
- data/lib/vector_sse.rb +475 -0
- data/spec/vector_mat_spec.rb +374 -0
- data/spec/vector_vec_spec.rb +150 -0
- data/vector_sse.gemspec +19 -0
- metadata +93 -0
data/ext/vector_sse/vector_sse_mul.c
@@ -0,0 +1,340 @@
+//
+// Copyright (c) 2015, Robert Glissmann
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice, this
+// list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// %% license-end-token %%
+//
+// Author: Robert.Glissmann@gmail.com (Robert Glissmann)
+//
+//
+
+#include <emmintrin.h>
+#include "vector_sse_mul.h"
+
+#define SSE_VECTOR_WIDTH (4)
+
+VALUE method_mat_mul_s32( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left_cols_rb, VALUE right, VALUE right_rows_rb, VALUE right_cols_rb )
+{
+   uint32_t left_row = 0;
+   uint32_t right_col = 0;
+   uint32_t common = 0;
+   uint32_t vector_pos = 0;
+   uint32_t input_index = 0;
+   uint32_t pos = 0;
+
+   int32_t left_segment[ SSE_VECTOR_WIDTH ];
+   int32_t right_segment[ SSE_VECTOR_WIDTH ];
+   int64_t result_segment[ SSE_VECTOR_WIDTH/2 ];
+
+   __m128i* left_vec = NULL;
+   __m128i* right_vec = NULL;
+   __m128i result_vec;
+
+   VALUE result = Qnil;
+
+   int64_t* result_native = NULL;
+
+   uint32_t left_rows = NUM2INT( left_rows_rb );
+   uint32_t left_cols = NUM2INT( left_cols_rb );
+   uint32_t right_rows = NUM2INT( right_rows_rb );
+   uint32_t right_cols = NUM2INT( right_cols_rb );
+
+   uint32_t left_length = left_rows * left_cols;
+   uint32_t right_length = right_rows * right_cols;
+   uint32_t result_length = left_rows * right_cols;
+
+   int32_t* left_native = NULL;
+   int32_t* right_native = NULL;
+   int64_t* partial_native = NULL;
+   int64_t* temp = NULL;
+
+   left_native = (int32_t*) malloc( left_length * sizeof(int32_t) );
+   right_native = (int32_t*) malloc( right_length * sizeof(int32_t) );
+   result_native = (int64_t*) malloc( result_length * sizeof(int64_t) );
+   partial_native = (int64_t*) malloc( left_cols * sizeof(int64_t) );
+
+   memset( partial_native, 0, left_cols * sizeof(int64_t) );
+
+   for ( pos = 0; pos < left_length; ++pos )
+   {
+      left_native[ pos ] = NUM2INT( rb_ary_entry( left, pos ) );
+   }
+   for ( pos = 0; pos < right_length; ++pos )
+   {
+      right_native[ pos ] = NUM2INT( rb_ary_entry( right, pos ) );
+   }
+
+   for ( left_row = 0; left_row < left_rows; ++left_row )
+   {
+      for ( right_col = 0; right_col < right_cols; ++right_col )
+      {
+         for ( common = 0; common < left_cols; common += (SSE_VECTOR_WIDTH/2) )
+         {
+            memset( left_segment, 0, sizeof( left_segment ) );
+            memset( right_segment, 0, sizeof( right_segment ) );
+
+            input_index = common;
+            left_segment[ 0 ] = left_native[ left_row * left_cols + input_index ];
+            right_segment[ 0 ] = right_native[ input_index * right_cols + right_col ];
+
+            input_index = common + 1;
+            if ( input_index < left_cols )
+            {
+               left_segment[ 2 ] = left_native[ left_row * left_cols + input_index ];
+               right_segment[ 2 ] = right_native[ input_index * right_cols + right_col ];
+            }
+
+            left_vec = ( __m128i *)left_segment;
+            right_vec = ( __m128i *)right_segment;
+            result_vec = _mm_mul_epu32( *left_vec, *right_vec );
+
+            _mm_store_si128( (__m128i*)result_segment, result_vec );
+            for ( pos = 0; pos < SSE_VECTOR_WIDTH/2; ++pos )
+            {
+               if ( (common + pos) < left_cols )
+               {
+                  partial_native[ common + pos ] = result_segment[ pos ];
+               }
+            }
+         }
+
+         result_native[ left_row * right_cols + right_col ] = 0;
+         temp = &result_native[ left_row * right_cols + right_col ];
+         for ( common = 0; common < left_cols; ++common )
+         {
+            (*temp) += partial_native[ common ];
+         }
+      }
+   }
+
+   result = rb_ary_new2( result_length );
+   for ( pos = 0; pos < result_length; ++pos )
+   {
+      rb_ary_push( result, INT2NUM( result_native[ pos ] ) );
+   }
+
+   free( left_native );
+   free( right_native );
+   free( result_native );
+   free( partial_native );
+
+   return result;
+}
+
+VALUE method_mat_mul_s64( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left_cols_rb, VALUE right, VALUE right_rows_rb, VALUE right_cols_rb )
+{
+   uint32_t left_row = 0;
+   uint32_t right_col = 0;
+   uint32_t common = 0;
+   uint32_t vector_pos = 0;
+   uint32_t input_index = 0;
+   uint32_t pos = 0;
+
+   int64_t left_segment[ SSE_VECTOR_WIDTH ];
+   int64_t right_segment[ SSE_VECTOR_WIDTH ];
+
+   __m128i* left_vec = NULL;
+   __m128i* right_vec = NULL;
+   __m128i result_vec;
+
+   VALUE result = Qnil;
+
+   int64_t* result_native = NULL;
+
+   uint32_t left_rows = NUM2INT( left_rows_rb );
+   uint32_t left_cols = NUM2INT( left_cols_rb );
+   uint32_t right_rows = NUM2INT( right_rows_rb );
+   uint32_t right_cols = NUM2INT( right_cols_rb );
+
+   uint32_t left_length = left_rows * left_cols;
+   uint32_t right_length = right_rows * right_cols;
+   uint32_t result_length = left_rows * right_cols;
+
+   int64_t* left_native = NULL;
+   int64_t* right_native = NULL;
+   int64_t* partial_native = NULL;
+   int64_t* temp = NULL;
+
+
+   left_native = (int64_t*) malloc( left_length * sizeof(int64_t) );
+   right_native = (int64_t*) malloc( right_length * sizeof(int64_t) );
+   result_native = (int64_t*) malloc( result_length * sizeof(int64_t) );
+   partial_native = (int64_t*) malloc( left_cols * sizeof(int64_t) );
+
+   memset( partial_native, 0, left_cols * sizeof(int64_t) );
+
+   for ( pos = 0; pos < left_length; ++pos )
+   {
+      left_native[ pos ] = NUM2LL( rb_ary_entry( left, pos ) );
+   }
+   for ( pos = 0; pos < right_length; ++pos )
+   {
+      right_native[ pos ] = NUM2LL( rb_ary_entry( right, pos ) );
+   }
+
+   for ( left_row = 0; left_row < left_rows; ++left_row )
+   {
+      for ( right_col = 0; right_col < right_cols; ++right_col )
+      {
+         for ( common = 0; common < left_cols; ++common )
+         {
+            partial_native[ common ] =
+               left_native[ left_row * left_cols + common ] *
+               right_native[ common * right_cols + right_col ];
+         }
+
+         result_native[ left_row * right_cols + right_col ] = 0;
+         temp = &result_native[ left_row * right_cols + right_col ];
+         for ( common = 0; common < left_cols; ++common )
+         {
+            (*temp) += partial_native[ common ];
+         }
+      }
+   }
+
+   result = rb_ary_new2( result_length );
+   for ( pos = 0; pos < result_length; ++pos )
+   {
+      rb_ary_push( result, LL2NUM( result_native[ pos ] ) );
+   }
+
+   free( left_native );
+   free( right_native );
+   free( result_native );
+   free( partial_native );
+
+   return result;
+}
+
+VALUE method_mat_mul_f32( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left_cols_rb, VALUE right, VALUE right_rows_rb, VALUE right_cols_rb )
+{
+   uint32_t left_row = 0;
+   uint32_t right_col = 0;
+   uint32_t common = 0;
+   uint32_t vector_pos = 0;
+   uint32_t input_index = 0;
+   uint32_t pos = 0;
+
+   float left_segment[ SSE_VECTOR_WIDTH ];
+   float right_segment[ SSE_VECTOR_WIDTH ];
+   float result_segment[ SSE_VECTOR_WIDTH ];
+
+   __m128i* left_vec = NULL;
+   __m128i* right_vec = NULL;
+   __m128i result_vec;
+
+   VALUE result = Qnil;
+
+   float* result_native = NULL;
+
+   uint32_t left_rows = NUM2UINT( left_rows_rb );
+   uint32_t left_cols = NUM2UINT( left_cols_rb );
+   uint32_t right_rows = NUM2UINT( right_rows_rb );
+   uint32_t right_cols = NUM2UINT( right_cols_rb );
+
+   uint32_t left_length = left_rows * left_cols;
+   uint32_t right_length = right_rows * right_cols;
+   uint32_t result_length = left_rows * right_cols;
+
+   float* left_native = NULL;
+   float* right_native = NULL;
+   float* partial_native = NULL;
+   float* temp = NULL;
+
+   left_native = (float*) malloc( left_length * sizeof(float) );
+   right_native = (float*) malloc( right_length * sizeof(float) );
+   result_native = (float*) malloc( result_length * sizeof(float) );
+   partial_native = (float*) malloc( left_cols * sizeof(float) );
+
+   memset( partial_native, 0, left_cols * sizeof(float) );
+
+   for ( pos = 0; pos < left_length; ++pos )
+   {
+      left_native[ pos ] = NUM2DBL( rb_ary_entry( left, pos ) );
+   }
+   for ( pos = 0; pos < right_length; ++pos )
+   {
+      right_native[ pos ] = NUM2DBL( rb_ary_entry( right, pos ) );
+   }
+
+   for ( left_row = 0; left_row < left_rows; ++left_row )
+   {
+      for ( right_col = 0; right_col < right_cols; ++right_col )
+      {
+         for ( common = 0; common < left_cols; common += SSE_VECTOR_WIDTH )
+         {
+            for ( pos = 0; pos < SSE_VECTOR_WIDTH; ++pos )
+            {
+               input_index = common + pos;
+
+               if ( input_index < left_cols )
+               {
+                  left_segment[ pos ] = left_native[ left_row * left_cols + input_index ];
+                  right_segment[ pos ] = right_native[ input_index * right_cols + right_col ];
+               }
+               else
+               {
+                  left_segment[ pos ] = 0;
+                  right_segment[ pos ] = 0;
+               }
+            }
+
+            left_vec = ( __m128i *)left_segment;
+            right_vec = ( __m128i *)right_segment;
+            result_vec = _mm_mul_ps( *left_vec, *right_vec );
+
+            _mm_store_si128( (__m128i*)result_segment, result_vec );
+            for ( pos = 0; pos < SSE_VECTOR_WIDTH; ++pos )
+            {
+               if ( (common + pos) < left_cols )
+               {
+                  partial_native[ common + pos ] = result_segment[ pos ];
+               }
+            }
+         }
+
+         result_native[ left_row * right_cols + right_col ] = 0;
+         temp = &result_native[ left_row * right_cols + right_col ];
+         for ( common = 0; common < left_cols; ++common )
+         {
+            (*temp) += partial_native[ common ];
+         }
+      }
+   }
+
+   result = rb_ary_new2( result_length );
+   for ( pos = 0; pos < result_length; ++pos )
+   {
+      rb_ary_push( result, DBL2NUM( result_native[ pos ] ) );
+   }
+
+   free( left_native );
+   free( right_native );
+   free( result_native );
+   free( partial_native );
+
+   return result;
+}
+
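The s32 kernel above fills only elements 0 and 2 of each four-lane segment because _mm_mul_epu32 multiplies just the 32-bit values in lanes 0 and 2 of its operands and widens the two products to 64 bits; that is also why result_segment holds two int64_t values. The standalone sketch below is an editor's illustration of that lane behaviour, not part of the gem. Note that _mm_mul_epu32 is an unsigned multiply, so the widened products match the signed products only for non-negative inputs.

// Standalone illustration (not gem code): _mm_mul_epu32 uses lanes 0 and 2 only.
#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>

int main( void )
{
   int32_t a[4] = { 3, 99, 5, 99 };    // lanes 1 and 3 are ignored by the intrinsic
   int32_t b[4] = { 7, 99, 11, 99 };
   int64_t prod[2];

   __m128i av = _mm_loadu_si128( (const __m128i*)a );
   __m128i bv = _mm_loadu_si128( (const __m128i*)b );
   _mm_storeu_si128( (__m128i*)prod, _mm_mul_epu32( av, bv ) );

   // Expected output: 21 55  (lane0*lane0 and lane2*lane2, widened to 64 bits)
   printf( "%lld %lld\n", (long long)prod[0], (long long)prod[1] );
   return 0;
}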
data/ext/vector_sse/vector_sse_mul.h
@@ -0,0 +1,43 @@
+//
+// Copyright (c) 2015, Robert Glissmann
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice, this
+// list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// %% license-end-token %%
+//
+// Author: Robert.Glissmann@gmail.com (Robert Glissmann)
+//
+//
+
+#ifndef VECTOR_SSE_MUL_H
+#define VECTOR_SSE_MUL_H
+
+#include "ruby.h"
+
+VALUE method_mat_mul_s32( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left_cols_rb, VALUE right, VALUE right_rows_rb, VALUE right_cols_rb );
+VALUE method_mat_mul_s64( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left_cols_rb, VALUE right, VALUE right_rows_rb, VALUE right_cols_rb );
+VALUE method_mat_mul_f32( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left_cols_rb, VALUE right, VALUE right_rows_rb, VALUE right_cols_rb );
+VALUE method_mat_mul_f64( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left_cols_rb, VALUE right, VALUE right_rows_rb, VALUE right_cols_rb );
+
+#endif // VECTOR_SSE_MUL_H
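These declarations are presumably bound to Ruby in data/ext/vector_sse/vector_sse.c, which is listed in the file summary above but not shown in this excerpt. The sketch below is a hypothetical registration using the standard Ruby C extension API; the module and method names are assumptions made for illustration, not the gem's actual interface.

#include "ruby.h"
#include "vector_sse_mul.h"

// Hypothetical registration sketch. The real wiring lives in
// data/ext/vector_sse/vector_sse.c (not shown here), so the module and
// method names below are assumptions.
void Init_vector_sse( void )
{
   VALUE mod = rb_define_module( "VectorSSE" );

   // Each matrix-multiply entry point takes six Ruby arguments after self,
   // matching the declarations in vector_sse_mul.h.
   rb_define_module_function( mod, "mat_mul_s32", RUBY_METHOD_FUNC( method_mat_mul_s32 ), 6 );
   rb_define_module_function( mod, "mat_mul_f32", RUBY_METHOD_FUNC( method_mat_mul_f32 ), 6 );
}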
data/ext/vector_sse/vector_sse_sum.c
@@ -0,0 +1,118 @@
+//
+// Copyright (c) 2015, Robert Glissmann
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice, this
+// list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// %% license-end-token %%
+//
+// Author: Robert.Glissmann@gmail.com (Robert Glissmann)
+//
+//
+
+#include <string.h>
+#include <emmintrin.h>
+#include "vector_sse_sum.h"
+
+#define TEMPLATE_SUM_S( FUNC_NAME, TYPE, OFTYPE, TYPE_SIZE, CONV_IN, CONV_OUT, EL_PER_VEC, ADD ) \
+VALUE FUNC_NAME( VALUE self, VALUE vector ) \
+{ \
+   uint32_t length = 0; \
+   uint32_t offset = 0; \
+   uint32_t vector_pos = 0; \
+   uint32_t input_index = 0; \
+   \
+   TYPE result = 0; \
+   \
+   TYPE left_segment[ EL_PER_VEC ]; \
+   TYPE right_segment[ EL_PER_VEC ]; \
+   TYPE result_segment[ EL_PER_VEC ]; \
+   TYPE vector_segment[ EL_PER_VEC ]; \
+   \
+   __m128i left_vec; \
+   __m128i right_vec; \
+   __m128i result_vec; \
+   \
+   __m128i sign_left; \
+   __m128i sign_right; \
+   const OFTYPE OVERFLOW_MASK = ( (OFTYPE)0x1 << (TYPE_SIZE-1) ); \
+   OFTYPE overflow[ EL_PER_VEC ]; \
+   __m128i* overflow_vec = (__m128i*)overflow; \
+   \
+   Check_Type( vector, T_ARRAY ); \
+   \
+   length = RARRAY_LEN( vector ); \
+   \
+   if ( length > 0 ) \
+   { \
+      memset( &result_vec, 0, sizeof( result_vec ) ); \
+      \
+      for ( offset = 0; offset < length; offset += EL_PER_VEC ) \
+      { \
+         for ( vector_pos = 0; vector_pos < EL_PER_VEC; ++vector_pos ) \
+         { \
+            input_index = offset + vector_pos; \
+            if ( input_index < length ) \
+            { \
+               vector_segment[ vector_pos ] = CONV_IN( rb_ary_entry( vector, input_index ) ); \
+            } \
+            else \
+            { \
+               vector_segment[ vector_pos ] = 0; \
+            } \
+         } \
+         \
+         right_vec = _mm_loadu_si128( (const __m128i *)vector_segment ); \
+         left_vec = _mm_loadu_si128( &result_vec ); \
+         \
+         result_vec = ADD( left_vec, right_vec ); \
+         \
+         sign_left = _mm_xor_si128(result_vec, left_vec); \
+         sign_right = _mm_xor_si128(result_vec, right_vec); \
+         *overflow_vec = _mm_and_si128(sign_left, sign_right); \
+         \
+         for ( vector_pos = 0; vector_pos < EL_PER_VEC; ++vector_pos ) \
+         { \
+            if ( ( (OFTYPE)overflow[ vector_pos ] & OVERFLOW_MASK ) ) \
+            { \
+               rb_raise( rb_eRuntimeError, "Vector addition overflow" ); \
+            } \
+         } \
+      } \
+      \
+      _mm_store_si128( (__m128i*)result_segment, result_vec ); \
+      \
+      for ( vector_pos = 0; vector_pos < EL_PER_VEC; ++vector_pos ) \
+      { \
+         result += result_segment[ vector_pos ]; \
+      } \
+   } \
+   \
+   return CONV_OUT( result ); \
+}
+
+TEMPLATE_SUM_S( method_vec_sum_s32, int32_t, int32_t, 32, NUM2INT, INT2NUM, 4, _mm_add_epi32 );
+TEMPLATE_SUM_S( method_vec_sum_s64, int64_t, int64_t, 64, NUM2LL, LL2NUM, 2, _mm_add_epi64 );
+TEMPLATE_SUM_S( method_vec_sum_f32, float, int32_t, 32, NUM2DBL, DBL2NUM, 4, _mm_add_ps );
+TEMPLATE_SUM_S( method_vec_sum_f64, double, int64_t, 32, NUM2DBL, DBL2NUM, 2, _mm_add_pd );
+
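TEMPLATE_SUM_S guards each packed addition with a signed-overflow test: it XORs the running sum with each operand and ANDs the two masks, so a lane ends up with its sign bit set exactly when the sum's sign differs from the signs of both inputs. The scalar sketch below restates that rule for a single 32-bit addition; it is an editor's illustration, not gem code.

// Standalone illustration (not gem code): for r = a + b, signed overflow
// occurred exactly when ((r ^ a) & (r ^ b)) has the sign bit set.
#include <stdio.h>
#include <stdint.h>

static int add_overflows_s32( int32_t a, int32_t b )
{
   uint32_t r = (uint32_t)a + (uint32_t)b;   // wraparound addition
   return (int)( ( ( r ^ (uint32_t)a ) & ( r ^ (uint32_t)b ) ) >> 31 );
}

int main( void )
{
   printf( "%d\n", add_overflows_s32( 2000000000, 2000000000 ) );  // prints 1: overflow
   printf( "%d\n", add_overflows_s32( -5, 3 ) );                   // prints 0: no overflow
   return 0;
}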
data/ext/vector_sse/vector_sse_sum.h
@@ -0,0 +1,43 @@
+//
+// Copyright (c) 2015, Robert Glissmann
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice, this
+// list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// %% license-end-token %%
+//
+// Author: Robert.Glissmann@gmail.com (Robert Glissmann)
+//
+//
+
+#ifndef VECTOR_SSE_SUM_H
+#define VECTOR_SSE_SUM_H
+
+#include "ruby.h"
+
+VALUE method_vec_sum_s32( VALUE self, VALUE vector );
+VALUE method_vec_sum_s64( VALUE self, VALUE vector );
+VALUE method_vec_sum_f32( VALUE self, VALUE vector );
+VALUE method_vec_sum_f64( VALUE self, VALUE vector );
+
+#endif // VECTOR_SSE_SUM_H
data/ext/vector_sse/vector_sse_vec_mul.c
@@ -0,0 +1,157 @@
+//
+// Copyright (c) 2015, Robert Glissmann
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice, this
+// list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// %% license-end-token %%
+//
+// Author: Robert.Glissmann@gmail.com (Robert Glissmann)
+//
+//
+
+#include <string.h>
+#include <emmintrin.h>
+#ifdef __SSE4_1__ // modern CPU - use SSE 4.1
+#include <smmintrin.h>
+#endif
+#include "vector_sse_vec_mul.h"
+
+#define SSE_VECTOR_WIDTH (4)
+// #define EL_PER_VEC SSE_VECTOR_WIDTH
+
+static inline __m128i mul_s32( const __m128i* a, const __m128i* b )
+{
+#ifdef __SSE4_1__
+   return _mm_mullo_epi32(*a, *b);
+#else // old CPU - use SSE 2
+   __m128i tmp1 = _mm_mul_epu32(*a,*b); /* mul 2,0*/
+   __m128i tmp2 = _mm_mul_epu32( _mm_srli_si128(*a,4), _mm_srli_si128(*b,4)); /* mul 3,1 */
+   return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE (0,0,2,0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE (0,0,2,0))); /* shuffle results to [63..0] and pack */
+#endif
+}
+
+static inline __m128i mul_s64( const __m128i* left_vec, const __m128i* right_vec )
+{
+   // a * b = ( a0 * S + a1 ) * ( b0 * S + b1 )
+   //       = a0 * b0 * S^2 + ( a0 * b1 + a1 * b0 ) * S + a1 * b1
+
+   int64_t left[ 2 ];
+   int64_t right[ 2 ];
+   int64_t result[ 2 ];
+   _mm_store_si128( (__m128i*)left, *left_vec );
+   _mm_store_si128( (__m128i*)right, *right_vec );
+   result[0] = left[0] * right[1];
+   result[1] = left[1] * right[1];
+   return _mm_loadu_si128( (const __m128i *)result );
+}
+
+static inline __m128i mul_f32(const __m128i* a, const __m128i* b )
+{
+   return _mm_mul_ps( *a, *b );
+}
+
+static inline __m128i mul_f64(const __m128i* a, const __m128i* b )
+{
+   return _mm_mul_pd( *a, *b );
+}
+
+
+#define TEMPLATE_VEC_MUL_S( FUNC_NAME, TYPE, TYPE_SIZE, CONV_IN, CONV_OUT, EL_PER_VEC, MULOP ) \
+VALUE FUNC_NAME( VALUE self, VALUE left, VALUE right ) \
+{ \
+   uint32_t length = 0; \
+   uint32_t offset = 0; \
+   uint32_t vector_pos = 0; \
+   uint32_t input_index = 0; \
+   \
+   VALUE result = Qnil; \
+   \
+   TYPE left_segment[ EL_PER_VEC ]; \
+   TYPE right_segment[ EL_PER_VEC ]; \
+   \
+   __m128i left_vec; \
+   __m128i right_vec; \
+   \
+   TYPE result_segment[ EL_PER_VEC ]; \
+   __m128i result_vec; \
+   \
+   __m128i sign_left; \
+   __m128i sign_right; \
+   \
+   Check_Type( left, T_ARRAY ); \
+   Check_Type( right, T_ARRAY ); \
+   \
+   length = RARRAY_LEN( left ); \
+   result = rb_ary_new2( length ); \
+   \
+   if ( length > 0 ) \
+   { \
+      memset( &result_vec, 0, sizeof( result_vec ) ); \
+      \
+      for ( offset = 0; offset < length; offset += EL_PER_VEC ) \
+      { \
+         for ( vector_pos = 0; vector_pos < EL_PER_VEC; ++vector_pos ) \
+         { \
+            input_index = offset + vector_pos; \
+            \
+            if ( input_index < length ) \
+            { \
+               left_segment[ vector_pos ] = CONV_IN( rb_ary_entry( left, input_index ) ); \
+               right_segment[ vector_pos ] = CONV_IN( rb_ary_entry( right, input_index ) ); \
+            } \
+            else \
+            { \
+               left_segment[ vector_pos ] = 0; \
+               right_segment[ vector_pos ] = 0; \
+            } \
+         } \
+         \
+         left_vec = _mm_loadu_si128( (const __m128i *)left_segment ); \
+         right_vec = _mm_loadu_si128( (const __m128i *)right_segment ); \
+         \
+         result_vec = MULOP( &left_vec, &right_vec ); \
+         \
+         _mm_store_si128( (__m128i*)result_segment, result_vec ); \
+         \
+         for ( vector_pos = 0; vector_pos < EL_PER_VEC; ++vector_pos ) \
+         { \
+            input_index = offset + vector_pos; \
+            \
+            if ( input_index < length ) \
+            { \
+               rb_ary_push( result, CONV_OUT( result_segment[ vector_pos ] ) ); \
+            } \
+         } \
+      } \
+   } \
+   \
+   return result; \
+}
+
+
+TEMPLATE_VEC_MUL_S( method_vec_mul_s32, int32_t, 32, NUM2INT, INT2NUM, 4, mul_s32 );
+TEMPLATE_VEC_MUL_S( method_vec_mul_s64, int64_t, 64, NUM2LL, LL2NUM, 2, mul_s64 );
+TEMPLATE_VEC_MUL_S( method_vec_mul_f32, float, 32, NUM2DBL, DBL2NUM, 4, mul_f32 );
+TEMPLATE_VEC_MUL_S( method_vec_mul_f64, double, 64, NUM2DBL, DBL2NUM, 2, mul_f64 );
+
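On CPUs without SSE 4.1, mul_s32 above emulates _mm_mullo_epi32 with two _mm_mul_epu32 calls (lanes 0/2, then lanes 1/3 after a 4-byte shift) followed by shuffles that gather the low 32 bits of each product. The standalone sketch below is an editor's illustration of that same sequence, not part of the gem; each printed lane should match ordinary 32-bit multiplication, since the low 32 bits of a product are identical for signed and unsigned operands.

// Standalone illustration (not gem code): SSE2 emulation of a per-lane
// 32-bit multiply, cross-checked against scalar multiplication.
#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>

static __m128i mullo_epi32_sse2( __m128i a, __m128i b )
{
   __m128i even = _mm_mul_epu32( a, b );                    // 64-bit products of lanes 0 and 2
   __m128i odd  = _mm_mul_epu32( _mm_srli_si128( a, 4 ),
                                 _mm_srli_si128( b, 4 ) );  // 64-bit products of lanes 1 and 3
   // Keep the low 32 bits of each product and interleave them back into lane order.
   return _mm_unpacklo_epi32( _mm_shuffle_epi32( even, _MM_SHUFFLE( 0, 0, 2, 0 ) ),
                              _mm_shuffle_epi32( odd,  _MM_SHUFFLE( 0, 0, 2, 0 ) ) );
}

int main( void )
{
   int32_t a[4] = { 3, -4, 5, 6 };
   int32_t b[4] = { 7, 8, -9, 10 };
   int32_t out[4];
   int i;

   _mm_storeu_si128( (__m128i*)out,
                     mullo_epi32_sse2( _mm_loadu_si128( (const __m128i*)a ),
                                       _mm_loadu_si128( (const __m128i*)b ) ) );

   for ( i = 0; i < 4; ++i )
   {
      // Each lane should equal a[i] * b[i]: 21, -32, -45, 60.
      printf( "%d * %d = %d\n", a[i], b[i], out[i] );
   }
   return 0;
}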