vector_sse 0.0.1.pre → 0.0.2.pre
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +5 -1
- data/Gemfile +5 -0
- data/ext/.gitignore +1 -0
- data/ext/vector_sse/extconf.rb +1 -1
- data/ext/vector_sse/vector_sse_add.c +5 -4
- data/ext/vector_sse/vector_sse_common.c +31 -0
- data/ext/vector_sse/vector_sse_common.h +13 -0
- data/ext/vector_sse/vector_sse_mul.c +2 -12
- data/ext/vector_sse/vector_sse_sum.c +30 -28
- data/ext/vector_sse/vector_sse_vec_mul.c +7 -9
- data/lib/vector_sse.rb +1 -1
- metadata +13 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fdabd1511ca1ab68168a0723f06d26afe3312627
|
4
|
+
data.tar.gz: 9bb853808e5c0a8ec8e8b1592e84a0537954f6dc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 213dffc68fc752e4520f0d5f7e910467b3fb51faba6ed5f6052b38d6a677928ead0b899b7658427fa78e08c7400f9ac874112b8c325174a328a9c137cdf34d6d
|
7
|
+
data.tar.gz: 10b7386669a7a1acbbcad0a9f24323e703487997d18969f9b02a6f0d5ba02ac2bd9ff1c50ba5df71f33d18413e38545454eb48700570046bc2c2c5fb1f54f712
|
data/.gitignore
CHANGED
data/Gemfile
ADDED
data/ext/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
*DS_Store
|
data/ext/vector_sse/extconf.rb
CHANGED
@@ -32,6 +32,7 @@
|
|
32
32
|
|
33
33
|
#include <emmintrin.h>
|
34
34
|
#include "vector_sse_add.h"
|
35
|
+
#include "vector_sse_common.h"
|
35
36
|
|
36
37
|
#define TEMPLATE_ADD_S( FUNC_NAME, TYPE, OFTYPE, TYPE_SIZE, CONV_IN, CONV_OUT, EL_PER_VEC, ADD ) \
|
37
38
|
VALUE FUNC_NAME( VALUE self, VALUE left, VALUE right ) \
|
@@ -118,10 +119,10 @@ VALUE FUNC_NAME( VALUE self, VALUE left, VALUE right ) \
|
|
118
119
|
|
119
120
|
TEMPLATE_ADD_S( method_vec_add_s32, int32_t, int32_t, 32, NUM2INT, INT2NUM, 4, _mm_add_epi32 );
|
120
121
|
TEMPLATE_ADD_S( method_vec_add_s64, int64_t, int64_t, 64, NUM2LL, LL2NUM, 2, _mm_add_epi64 );
|
121
|
-
TEMPLATE_ADD_S( method_vec_add_f32, float, int32_t, 32, NUM2DBL, DBL2NUM, 4,
|
122
|
-
TEMPLATE_ADD_S( method_vec_add_f64, double, int64_t, 64, NUM2DBL, DBL2NUM, 2,
|
122
|
+
TEMPLATE_ADD_S( method_vec_add_f32, float, int32_t, 32, NUM2DBL, DBL2NUM, 4, add_f32 );
|
123
|
+
TEMPLATE_ADD_S( method_vec_add_f64, double, int64_t, 64, NUM2DBL, DBL2NUM, 2, add_f64 );
|
123
124
|
|
124
125
|
TEMPLATE_ADD_S( method_vec_sub_s32, int32_t, int32_t, 32, NUM2INT, INT2NUM, 4, _mm_sub_epi32 );
|
125
126
|
TEMPLATE_ADD_S( method_vec_sub_s64, int64_t, int64_t, 64, NUM2LL, LL2NUM, 2, _mm_sub_epi64 );
|
126
|
-
TEMPLATE_ADD_S( method_vec_sub_f32, float, int32_t, 32, NUM2DBL, DBL2NUM, 4,
|
127
|
-
TEMPLATE_ADD_S( method_vec_sub_f64, double, int64_t, 64, NUM2DBL, DBL2NUM, 2,
|
127
|
+
TEMPLATE_ADD_S( method_vec_sub_f32, float, int32_t, 32, NUM2DBL, DBL2NUM, 4, sub_f32 );
|
128
|
+
TEMPLATE_ADD_S( method_vec_sub_f64, double, int64_t, 64, NUM2DBL, DBL2NUM, 2, sub_f64 );
|
@@ -0,0 +1,31 @@
|
|
1
|
+
#include "vector_sse_common.h"
|
2
|
+
|
3
|
+
/*
 * Lane-wise addition of two packed single-precision vectors carried in
 * integer-typed registers: both operands are passed through
 * _mm_castsi128_ps, added with _mm_add_ps, and the sum is handed back
 * through _mm_castps_si128 so the signature matches the integer adders.
 */
__m128i add_f32( const __m128i left, const __m128i right )
|
4
|
+
{
|
5
|
+
return _mm_castps_si128( _mm_add_ps( _mm_castsi128_ps( left ), _mm_castsi128_ps( right ) ) );
|
6
|
+
}
|
7
|
+
|
8
|
+
/*
 * Lane-wise addition of two packed double-precision vectors carried in
 * integer-typed SSE registers.
 *
 * left, right: 128-bit registers whose bits are read as two doubles each.
 * Returns the two sums with their bit patterns unchanged, in an
 * integer-typed register.
 *
 * The result is returned through _mm_castpd_si128, a bit-preserving cast
 * (the double counterpart of the _mm_castps_si128 used by add_f32).
 * _mm_cvtpd_epi32 must not be used here: it converts the sums to 32-bit
 * integers, so callers that store the register into a double buffer would
 * read garbage.
 */
__m128i add_f64( const __m128i left, const __m128i right )
{
    return _mm_castpd_si128( _mm_add_pd( _mm_castsi128_pd( left ), _mm_castsi128_pd( right ) ) );
}
|
12
|
+
|
13
|
+
/*
 * Lane-wise subtraction (left minus right) of two packed single-precision
 * vectors carried in integer-typed registers: both operands are passed
 * through _mm_castsi128_ps, subtracted with _mm_sub_ps, and the difference
 * is handed back through _mm_castps_si128.
 */
__m128i sub_f32( const __m128i left, const __m128i right )
|
14
|
+
{
|
15
|
+
return _mm_castps_si128( _mm_sub_ps( _mm_castsi128_ps( left ), _mm_castsi128_ps( right ) ) );
|
16
|
+
}
|
17
|
+
|
18
|
+
/*
 * Lane-wise subtraction (left minus right) of two packed double-precision
 * vectors carried in integer-typed SSE registers.
 *
 * left, right: 128-bit registers whose bits are read as two doubles each.
 * Returns the two differences with their bit patterns unchanged, in an
 * integer-typed register.
 *
 * The result is returned through _mm_castpd_si128, a bit-preserving cast
 * (the double counterpart of the _mm_castps_si128 used by sub_f32).
 * _mm_cvtpd_epi32 must not be used here: it converts the differences to
 * 32-bit integers instead of preserving the double bit patterns.
 */
__m128i sub_f64( const __m128i left, const __m128i right )
{
    return _mm_castpd_si128( _mm_sub_pd( _mm_castsi128_pd( left ), _mm_castsi128_pd( right ) ) );
}
|
22
|
+
|
23
|
+
/*
 * Lane-wise multiplication of two packed single-precision vectors carried
 * in integer-typed registers: both operands are passed through
 * _mm_castsi128_ps, multiplied with _mm_mul_ps, and the product is handed
 * back through _mm_castps_si128.
 */
__m128i mul_f32( const __m128i left, const __m128i right )
|
24
|
+
{
|
25
|
+
return _mm_castps_si128( _mm_mul_ps( _mm_castsi128_ps( left ), _mm_castsi128_ps( right ) ) );
|
26
|
+
}
|
27
|
+
|
28
|
+
/*
 * Lane-wise multiplication of two packed double-precision vectors carried
 * in integer-typed SSE registers.
 *
 * left, right: 128-bit registers whose bits are read as two doubles each.
 * Returns the two products with their bit patterns unchanged, in an
 * integer-typed register.
 *
 * The result is returned through _mm_castpd_si128, a bit-preserving cast
 * (the double counterpart of the _mm_castps_si128 used by mul_f32).
 * _mm_cvtpd_epi32 must not be used here: it converts the products to
 * 32-bit integers instead of preserving the double bit patterns.
 */
__m128i mul_f64( const __m128i left, const __m128i right )
{
    return _mm_castpd_si128( _mm_mul_pd( _mm_castsi128_pd( left ), _mm_castsi128_pd( right ) ) );
}
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#ifndef VECTOR_SSE_COMMON_H
|
2
|
+
#define VECTOR_SSE_COMMON_H
|
3
|
+
|
4
|
+
#include <emmintrin.h>
|
5
|
+
|
6
|
+
__m128i add_f32( const __m128i left, const __m128i right );
|
7
|
+
__m128i add_f64( const __m128i left, const __m128i right );
|
8
|
+
__m128i sub_f32( const __m128i left, const __m128i right );
|
9
|
+
__m128i sub_f64( const __m128i left, const __m128i right );
|
10
|
+
__m128i mul_f32( const __m128i left, const __m128i right );
|
11
|
+
__m128i mul_f64( const __m128i left, const __m128i right );
|
12
|
+
|
13
|
+
#endif // VECTOR_SSE_COMMON_H
|
@@ -32,6 +32,7 @@
|
|
32
32
|
|
33
33
|
#include <emmintrin.h>
|
34
34
|
#include "vector_sse_mul.h"
|
35
|
+
#include "vector_sse_common.h"
|
35
36
|
|
36
37
|
#define SSE_VECTOR_WIDTH (4)
|
37
38
|
|
@@ -40,7 +41,6 @@ VALUE method_mat_mul_s32( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left
|
|
40
41
|
uint32_t left_row = 0;
|
41
42
|
uint32_t right_col = 0;
|
42
43
|
uint32_t common = 0;
|
43
|
-
uint32_t vector_pos = 0;
|
44
44
|
uint32_t input_index = 0;
|
45
45
|
uint32_t pos = 0;
|
46
46
|
|
@@ -148,17 +148,8 @@ VALUE method_mat_mul_s64( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left
|
|
148
148
|
uint32_t left_row = 0;
|
149
149
|
uint32_t right_col = 0;
|
150
150
|
uint32_t common = 0;
|
151
|
-
uint32_t vector_pos = 0;
|
152
|
-
uint32_t input_index = 0;
|
153
151
|
uint32_t pos = 0;
|
154
152
|
|
155
|
-
int64_t left_segment[ SSE_VECTOR_WIDTH ];
|
156
|
-
int64_t right_segment[ SSE_VECTOR_WIDTH ];
|
157
|
-
|
158
|
-
__m128i* left_vec = NULL;
|
159
|
-
__m128i* right_vec = NULL;
|
160
|
-
__m128i result_vec;
|
161
|
-
|
162
153
|
VALUE result = Qnil;
|
163
154
|
|
164
155
|
int64_t* result_native = NULL;
|
@@ -233,7 +224,6 @@ VALUE method_mat_mul_f32( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left
|
|
233
224
|
uint32_t left_row = 0;
|
234
225
|
uint32_t right_col = 0;
|
235
226
|
uint32_t common = 0;
|
236
|
-
uint32_t vector_pos = 0;
|
237
227
|
uint32_t input_index = 0;
|
238
228
|
uint32_t pos = 0;
|
239
229
|
|
@@ -303,7 +293,7 @@ VALUE method_mat_mul_f32( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left
|
|
303
293
|
|
304
294
|
left_vec = ( __m128i *)left_segment;
|
305
295
|
right_vec = ( __m128i *)right_segment;
|
306
|
-
result_vec =
|
296
|
+
result_vec = mul_f32( *left_vec, *right_vec );
|
307
297
|
|
308
298
|
_mm_store_si128( (__m128i*)result_segment, result_vec );
|
309
299
|
for ( pos = 0; pos < SSE_VECTOR_WIDTH; ++pos )
|
@@ -32,9 +32,31 @@
|
|
32
32
|
|
33
33
|
#include <string.h>
|
34
34
|
#include <emmintrin.h>
|
35
|
+
#include <ruby.h>
|
35
36
|
#include "vector_sse_sum.h"
|
37
|
+
#include "vector_sse_common.h"
|
36
38
|
|
37
|
-
|
39
|
+
|
40
|
+
// Check for overflow
|
41
|
+
// __m128i sign_left;
|
42
|
+
// __m128i sign_right;
|
43
|
+
// const int32_t OVERFLOW_MASK = ( (int32_t)0x1 << (32-1) );
|
44
|
+
// int32_t overflow[ 4 ];
|
45
|
+
// __m128i* overflow_vec = (__m128i*)overflow;
|
46
|
+
// sign_left = _mm_xor_si128(result_vec, left_vec);
|
47
|
+
// sign_right = _mm_xor_si128(result_vec, right_vec);
|
48
|
+
// *overflow_vec = _mm_and_si128(sign_left, sign_right);
|
49
|
+
|
50
|
+
// for ( vector_pos = 0; vector_pos < 4; ++vector_pos )
|
51
|
+
// {
|
52
|
+
// if ( ( (int32_t)overflow[ vector_pos ] & OVERFLOW_MASK ) )
|
53
|
+
// {
|
54
|
+
// rb_raise( rb_eRuntimeError, "Vector addition overflow" );
|
55
|
+
// }
|
56
|
+
// }
|
57
|
+
|
58
|
+
|
59
|
+
#define TEMPLATE_SUM_S( FUNC_NAME, TYPE, CONV_IN, CONV_OUT, EL_PER_VEC, ADDER ) \
|
38
60
|
VALUE FUNC_NAME( VALUE self, VALUE vector ) \
|
39
61
|
{ \
|
40
62
|
uint32_t length = 0; \
|
@@ -44,20 +66,12 @@ VALUE FUNC_NAME( VALUE self, VALUE vector ) \
|
|
44
66
|
\
|
45
67
|
TYPE result = 0; \
|
46
68
|
\
|
47
|
-
TYPE left_segment[ EL_PER_VEC ]; \
|
48
|
-
TYPE right_segment[ EL_PER_VEC ]; \
|
49
69
|
TYPE result_segment[ EL_PER_VEC ]; \
|
50
70
|
TYPE vector_segment[ EL_PER_VEC ]; \
|
51
71
|
\
|
52
|
-
__m128i left_vec;
|
53
|
-
__m128i right_vec;
|
72
|
+
__m128i left_vec; \
|
73
|
+
__m128i right_vec; \
|
54
74
|
__m128i result_vec; \
|
55
|
-
\
|
56
|
-
__m128i sign_left; \
|
57
|
-
__m128i sign_right; \
|
58
|
-
const OFTYPE OVERFLOW_MASK = ( (OFTYPE)0x1 << (TYPE_SIZE-1) ); \
|
59
|
-
OFTYPE overflow[ EL_PER_VEC ]; \
|
60
|
-
__m128i* overflow_vec = (__m128i*)overflow; \
|
61
75
|
\
|
62
76
|
Check_Type( vector, T_ARRAY ); \
|
63
77
|
\
|
@@ -85,19 +99,7 @@ VALUE FUNC_NAME( VALUE self, VALUE vector ) \
|
|
85
99
|
right_vec = _mm_loadu_si128( (const __m128i *)vector_segment ); \
|
86
100
|
left_vec = _mm_loadu_si128( &result_vec ); \
|
87
101
|
\
|
88
|
-
result_vec =
|
89
|
-
\
|
90
|
-
sign_left = _mm_xor_si128(result_vec, left_vec); \
|
91
|
-
sign_right = _mm_xor_si128(result_vec, right_vec); \
|
92
|
-
*overflow_vec = _mm_and_si128(sign_left, sign_right); \
|
93
|
-
\
|
94
|
-
for ( vector_pos = 0; vector_pos < EL_PER_VEC; ++vector_pos ) \
|
95
|
-
{ \
|
96
|
-
if ( ( (OFTYPE)overflow[ vector_pos ] & OVERFLOW_MASK ) ) \
|
97
|
-
{ \
|
98
|
-
rb_raise( rb_eRuntimeError, "Vector addition overflow" ); \
|
99
|
-
} \
|
100
|
-
} \
|
102
|
+
result_vec = ADDER( left_vec, right_vec ); \
|
101
103
|
} \
|
102
104
|
\
|
103
105
|
_mm_store_si128( (__m128i*)result_segment, result_vec ); \
|
@@ -111,8 +113,8 @@ VALUE FUNC_NAME( VALUE self, VALUE vector ) \
|
|
111
113
|
return CONV_OUT( result ); \
|
112
114
|
}
|
113
115
|
|
114
|
-
TEMPLATE_SUM_S( method_vec_sum_s32, int32_t,
|
115
|
-
TEMPLATE_SUM_S( method_vec_sum_s64, int64_t,
|
116
|
-
TEMPLATE_SUM_S( method_vec_sum_f32, float,
|
117
|
-
TEMPLATE_SUM_S( method_vec_sum_f64, double,
|
116
|
+
TEMPLATE_SUM_S( method_vec_sum_s32, int32_t, NUM2INT, INT2NUM, 4, _mm_add_epi32 );
|
117
|
+
TEMPLATE_SUM_S( method_vec_sum_s64, int64_t, NUM2LL, LL2NUM, 2, _mm_add_epi64 );
|
118
|
+
TEMPLATE_SUM_S( method_vec_sum_f32, float, NUM2DBL, DBL2NUM, 4, add_f32 );
|
119
|
+
TEMPLATE_SUM_S( method_vec_sum_f64, double, NUM2DBL, DBL2NUM, 2, add_f64 );
|
118
120
|
|
@@ -36,6 +36,7 @@
|
|
36
36
|
#include <smmintrin.h>
|
37
37
|
#endif
|
38
38
|
#include "vector_sse_vec_mul.h"
|
39
|
+
#include "vector_sse_common.h"
|
39
40
|
|
40
41
|
#define SSE_VECTOR_WIDTH (4)
|
41
42
|
// #define EL_PER_VEC SSE_VECTOR_WIDTH
|
@@ -66,14 +67,14 @@ static inline __m128i mul_s64( const __m128i* left_vec, const __m128i* right_vec
|
|
66
67
|
return _mm_loadu_si128( (const __m128i *)result );
|
67
68
|
}
|
68
69
|
|
69
|
-
static inline __m128i
|
70
|
+
static inline __m128i mul_f32_ptr(const __m128i* a, const __m128i* b )
|
70
71
|
{
|
71
|
-
return
|
72
|
+
return mul_f32( *a, *b );
|
72
73
|
}
|
73
74
|
|
74
|
-
static inline __m128i
|
75
|
+
static inline __m128i mul_f64_ptr(const __m128i* a, const __m128i* b )
|
75
76
|
{
|
76
|
-
return
|
77
|
+
return mul_f64( *a, *b );
|
77
78
|
}
|
78
79
|
|
79
80
|
|
@@ -95,9 +96,6 @@ VALUE FUNC_NAME( VALUE self, VALUE left, VALUE right ) \
|
|
95
96
|
\
|
96
97
|
TYPE result_segment[ EL_PER_VEC ]; \
|
97
98
|
__m128i result_vec; \
|
98
|
-
\
|
99
|
-
__m128i sign_left; \
|
100
|
-
__m128i sign_right; \
|
101
99
|
\
|
102
100
|
Check_Type( left, T_ARRAY ); \
|
103
101
|
Check_Type( right, T_ARRAY ); \
|
@@ -152,6 +150,6 @@ VALUE FUNC_NAME( VALUE self, VALUE left, VALUE right ) \
|
|
152
150
|
|
153
151
|
TEMPLATE_VEC_MUL_S( method_vec_mul_s32, int32_t, 32, NUM2INT, INT2NUM, 4, mul_s32 );
|
154
152
|
TEMPLATE_VEC_MUL_S( method_vec_mul_s64, int64_t, 64, NUM2LL, LL2NUM, 2, mul_s64 );
|
155
|
-
TEMPLATE_VEC_MUL_S( method_vec_mul_f32, float, 32, NUM2DBL, DBL2NUM, 4,
|
156
|
-
TEMPLATE_VEC_MUL_S( method_vec_mul_f64, double, 64, NUM2DBL, DBL2NUM, 2,
|
153
|
+
TEMPLATE_VEC_MUL_S( method_vec_mul_f32, float, 32, NUM2DBL, DBL2NUM, 4, mul_f32_ptr );
|
154
|
+
TEMPLATE_VEC_MUL_S( method_vec_mul_f64, double, 64, NUM2DBL, DBL2NUM, 2, mul_f64_ptr );
|
157
155
|
|
data/lib/vector_sse.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: vector_sse
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2.pre
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Robert Glissmann
|
@@ -14,28 +14,28 @@ dependencies:
|
|
14
14
|
name: rake-compiler
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: 0.9.5
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - ~>
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 0.9.5
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rspec
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - ~>
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: 3.1.0
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - ~>
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 3.1.0
|
41
41
|
description: VectorSse employs x86 Streaming SIMD Extensions (SSE), v3 or greater,
|
@@ -46,15 +46,19 @@ extensions:
|
|
46
46
|
- ext/vector_sse/extconf.rb
|
47
47
|
extra_rdoc_files: []
|
48
48
|
files:
|
49
|
-
- .gitignore
|
49
|
+
- ".gitignore"
|
50
|
+
- Gemfile
|
50
51
|
- LICENSE.txt
|
51
52
|
- README.md
|
52
53
|
- Rakefile
|
54
|
+
- ext/.gitignore
|
53
55
|
- ext/vector_sse/.gitignore
|
54
56
|
- ext/vector_sse/extconf.rb
|
55
57
|
- ext/vector_sse/vector_sse.c
|
56
58
|
- ext/vector_sse/vector_sse_add.c
|
57
59
|
- ext/vector_sse/vector_sse_add.h
|
60
|
+
- ext/vector_sse/vector_sse_common.c
|
61
|
+
- ext/vector_sse/vector_sse_common.h
|
58
62
|
- ext/vector_sse/vector_sse_mul.c
|
59
63
|
- ext/vector_sse/vector_sse_mul.h
|
60
64
|
- ext/vector_sse/vector_sse_sum.c
|
@@ -76,17 +80,17 @@ require_paths:
|
|
76
80
|
- lib
|
77
81
|
required_ruby_version: !ruby/object:Gem::Requirement
|
78
82
|
requirements:
|
79
|
-
- -
|
83
|
+
- - ">="
|
80
84
|
- !ruby/object:Gem::Version
|
81
85
|
version: '0'
|
82
86
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
83
87
|
requirements:
|
84
|
-
- -
|
88
|
+
- - ">"
|
85
89
|
- !ruby/object:Gem::Version
|
86
90
|
version: 1.3.1
|
87
91
|
requirements: []
|
88
92
|
rubyforge_project:
|
89
|
-
rubygems_version: 2.
|
93
|
+
rubygems_version: 2.4.8
|
90
94
|
signing_key:
|
91
95
|
specification_version: 4
|
92
96
|
summary: SIMD accelerated vector and matrix operations
|