vector_sse 0.0.1.pre → 0.0.2.pre
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +5 -1
- data/Gemfile +5 -0
- data/ext/.gitignore +1 -0
- data/ext/vector_sse/extconf.rb +1 -1
- data/ext/vector_sse/vector_sse_add.c +5 -4
- data/ext/vector_sse/vector_sse_common.c +31 -0
- data/ext/vector_sse/vector_sse_common.h +13 -0
- data/ext/vector_sse/vector_sse_mul.c +2 -12
- data/ext/vector_sse/vector_sse_sum.c +30 -28
- data/ext/vector_sse/vector_sse_vec_mul.c +7 -9
- data/lib/vector_sse.rb +1 -1
- metadata +13 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fdabd1511ca1ab68168a0723f06d26afe3312627
|
4
|
+
data.tar.gz: 9bb853808e5c0a8ec8e8b1592e84a0537954f6dc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 213dffc68fc752e4520f0d5f7e910467b3fb51faba6ed5f6052b38d6a677928ead0b899b7658427fa78e08c7400f9ac874112b8c325174a328a9c137cdf34d6d
|
7
|
+
data.tar.gz: 10b7386669a7a1acbbcad0a9f24323e703487997d18969f9b02a6f0d5ba02ac2bd9ff1c50ba5df71f33d18413e38545454eb48700570046bc2c2c5fb1f54f712
|
data/.gitignore
CHANGED
data/Gemfile
ADDED
data/ext/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
*DS_Store
|
data/ext/vector_sse/extconf.rb
CHANGED
data/ext/vector_sse/vector_sse_add.c
CHANGED
@@ -32,6 +32,7 @@
|
|
32
32
|
|
33
33
|
#include <emmintrin.h>
|
34
34
|
#include "vector_sse_add.h"
|
35
|
+
#include "vector_sse_common.h"
|
35
36
|
|
36
37
|
#define TEMPLATE_ADD_S( FUNC_NAME, TYPE, OFTYPE, TYPE_SIZE, CONV_IN, CONV_OUT, EL_PER_VEC, ADD ) \
|
37
38
|
VALUE FUNC_NAME( VALUE self, VALUE left, VALUE right ) \
|
@@ -118,10 +119,10 @@ VALUE FUNC_NAME( VALUE self, VALUE left, VALUE right ) \
|
|
118
119
|
|
119
120
|
TEMPLATE_ADD_S( method_vec_add_s32, int32_t, int32_t, 32, NUM2INT, INT2NUM, 4, _mm_add_epi32 );
|
120
121
|
TEMPLATE_ADD_S( method_vec_add_s64, int64_t, int64_t, 64, NUM2LL, LL2NUM, 2, _mm_add_epi64 );
|
121
|
-
TEMPLATE_ADD_S( method_vec_add_f32, float, int32_t, 32, NUM2DBL, DBL2NUM, 4,
|
122
|
-
TEMPLATE_ADD_S( method_vec_add_f64, double, int64_t, 64, NUM2DBL, DBL2NUM, 2,
|
122
|
+
TEMPLATE_ADD_S( method_vec_add_f32, float, int32_t, 32, NUM2DBL, DBL2NUM, 4, add_f32 );
|
123
|
+
TEMPLATE_ADD_S( method_vec_add_f64, double, int64_t, 64, NUM2DBL, DBL2NUM, 2, add_f64 );
|
123
124
|
|
124
125
|
TEMPLATE_ADD_S( method_vec_sub_s32, int32_t, int32_t, 32, NUM2INT, INT2NUM, 4, _mm_sub_epi32 );
|
125
126
|
TEMPLATE_ADD_S( method_vec_sub_s64, int64_t, int64_t, 64, NUM2LL, LL2NUM, 2, _mm_sub_epi64 );
|
126
|
-
TEMPLATE_ADD_S( method_vec_sub_f32, float, int32_t, 32, NUM2DBL, DBL2NUM, 4,
|
127
|
-
TEMPLATE_ADD_S( method_vec_sub_f64, double, int64_t, 64, NUM2DBL, DBL2NUM, 2,
|
127
|
+
TEMPLATE_ADD_S( method_vec_sub_f32, float, int32_t, 32, NUM2DBL, DBL2NUM, 4, sub_f32 );
|
128
|
+
TEMPLATE_ADD_S( method_vec_sub_f64, double, int64_t, 64, NUM2DBL, DBL2NUM, 2, sub_f64 );
|
data/ext/vector_sse/vector_sse_common.c
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
#include "vector_sse_common.h"
|
2
|
+
|
3
|
+
/*
 * Lane-wise addition of four packed single-precision floats.
 *
 * Both operands arrive as __m128i; their bits are reinterpreted as four
 * floats each, added, and the sum is reinterpreted back to __m128i without
 * any numeric conversion.
 */
__m128i add_f32( const __m128i left, const __m128i right )
{
    const __m128 lhs = _mm_castsi128_ps( left );
    const __m128 rhs = _mm_castsi128_ps( right );
    const __m128 sum = _mm_add_ps( lhs, rhs );

    return _mm_castps_si128( sum );
}
|
7
|
+
|
8
|
+
/*
 * Lane-wise addition of two packed double-precision values.
 *
 * Both operands arrive as __m128i; their bits are reinterpreted as two
 * doubles each and added. The sum is returned as __m128i with its bit
 * pattern preserved.
 *
 * The result is bit-cast with _mm_castpd_si128. Using _mm_cvtpd_epi32 here
 * would numerically convert the sums to 32-bit integers, so a caller that
 * stores the vector into a double buffer would read back garbage.
 */
__m128i add_f64( const __m128i left, const __m128i right )
{
    return _mm_castpd_si128( _mm_add_pd( _mm_castsi128_pd( left ), _mm_castsi128_pd( right ) ) );
}
|
12
|
+
|
13
|
+
/*
 * Lane-wise subtraction (left - right) of four packed single-precision
 * floats.
 *
 * Both operands arrive as __m128i; their bits are reinterpreted as four
 * floats each, subtracted, and the difference is reinterpreted back to
 * __m128i without any numeric conversion.
 */
__m128i sub_f32( const __m128i left, const __m128i right )
{
    const __m128 minuend    = _mm_castsi128_ps( left );
    const __m128 subtrahend = _mm_castsi128_ps( right );
    const __m128 difference = _mm_sub_ps( minuend, subtrahend );

    return _mm_castps_si128( difference );
}
|
17
|
+
|
18
|
+
/*
 * Lane-wise subtraction (left - right) of two packed double-precision
 * values.
 *
 * Both operands arrive as __m128i; their bits are reinterpreted as two
 * doubles each and subtracted. The difference is returned as __m128i with
 * its bit pattern preserved.
 *
 * The result is bit-cast with _mm_castpd_si128. Using _mm_cvtpd_epi32 here
 * would numerically convert the differences to 32-bit integers, so a caller
 * that stores the vector into a double buffer would read back garbage.
 */
__m128i sub_f64( const __m128i left, const __m128i right )
{
    return _mm_castpd_si128( _mm_sub_pd( _mm_castsi128_pd( left ), _mm_castsi128_pd( right ) ) );
}
|
22
|
+
|
23
|
+
/*
 * Lane-wise multiplication of four packed single-precision floats.
 *
 * Both operands arrive as __m128i; their bits are reinterpreted as four
 * floats each, multiplied, and the product is reinterpreted back to __m128i
 * without any numeric conversion.
 */
__m128i mul_f32( const __m128i left, const __m128i right )
{
    const __m128 lhs     = _mm_castsi128_ps( left );
    const __m128 rhs     = _mm_castsi128_ps( right );
    const __m128 product = _mm_mul_ps( lhs, rhs );

    return _mm_castps_si128( product );
}
|
27
|
+
|
28
|
+
/*
 * Lane-wise multiplication of two packed double-precision values.
 *
 * Both operands arrive as __m128i; their bits are reinterpreted as two
 * doubles each and multiplied. The product is returned as __m128i with its
 * bit pattern preserved.
 *
 * The result is bit-cast with _mm_castpd_si128. Using _mm_cvtpd_epi32 here
 * would numerically convert the products to 32-bit integers, so a caller
 * that stores the vector into a double buffer would read back garbage.
 */
__m128i mul_f64( const __m128i left, const __m128i right )
{
    return _mm_castpd_si128( _mm_mul_pd( _mm_castsi128_pd( left ), _mm_castsi128_pd( right ) ) );
}
|
data/ext/vector_sse/vector_sse_common.h
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
#ifndef VECTOR_SSE_COMMON_H
|
2
|
+
#define VECTOR_SSE_COMMON_H
|
3
|
+
|
4
|
+
#include <emmintrin.h>
|
5
|
+
|
6
|
+
__m128i add_f32( const __m128i left, const __m128i right );
|
7
|
+
__m128i add_f64( const __m128i left, const __m128i right );
|
8
|
+
__m128i sub_f32( const __m128i left, const __m128i right );
|
9
|
+
__m128i sub_f64( const __m128i left, const __m128i right );
|
10
|
+
__m128i mul_f32( const __m128i left, const __m128i right );
|
11
|
+
__m128i mul_f64( const __m128i left, const __m128i right );
|
12
|
+
|
13
|
+
#endif // VECTOR_SSE_COMMON_H
|
data/ext/vector_sse/vector_sse_mul.c
CHANGED
@@ -32,6 +32,7 @@
|
|
32
32
|
|
33
33
|
#include <emmintrin.h>
|
34
34
|
#include "vector_sse_mul.h"
|
35
|
+
#include "vector_sse_common.h"
|
35
36
|
|
36
37
|
#define SSE_VECTOR_WIDTH (4)
|
37
38
|
|
@@ -40,7 +41,6 @@ VALUE method_mat_mul_s32( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left
|
|
40
41
|
uint32_t left_row = 0;
|
41
42
|
uint32_t right_col = 0;
|
42
43
|
uint32_t common = 0;
|
43
|
-
uint32_t vector_pos = 0;
|
44
44
|
uint32_t input_index = 0;
|
45
45
|
uint32_t pos = 0;
|
46
46
|
|
@@ -148,17 +148,8 @@ VALUE method_mat_mul_s64( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left
|
|
148
148
|
uint32_t left_row = 0;
|
149
149
|
uint32_t right_col = 0;
|
150
150
|
uint32_t common = 0;
|
151
|
-
uint32_t vector_pos = 0;
|
152
|
-
uint32_t input_index = 0;
|
153
151
|
uint32_t pos = 0;
|
154
152
|
|
155
|
-
int64_t left_segment[ SSE_VECTOR_WIDTH ];
|
156
|
-
int64_t right_segment[ SSE_VECTOR_WIDTH ];
|
157
|
-
|
158
|
-
__m128i* left_vec = NULL;
|
159
|
-
__m128i* right_vec = NULL;
|
160
|
-
__m128i result_vec;
|
161
|
-
|
162
153
|
VALUE result = Qnil;
|
163
154
|
|
164
155
|
int64_t* result_native = NULL;
|
@@ -233,7 +224,6 @@ VALUE method_mat_mul_f32( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left
|
|
233
224
|
uint32_t left_row = 0;
|
234
225
|
uint32_t right_col = 0;
|
235
226
|
uint32_t common = 0;
|
236
|
-
uint32_t vector_pos = 0;
|
237
227
|
uint32_t input_index = 0;
|
238
228
|
uint32_t pos = 0;
|
239
229
|
|
@@ -303,7 +293,7 @@ VALUE method_mat_mul_f32( VALUE self, VALUE left, VALUE left_rows_rb, VALUE left
|
|
303
293
|
|
304
294
|
left_vec = ( __m128i *)left_segment;
|
305
295
|
right_vec = ( __m128i *)right_segment;
|
306
|
-
result_vec =
|
296
|
+
result_vec = mul_f32( *left_vec, *right_vec );
|
307
297
|
|
308
298
|
_mm_store_si128( (__m128i*)result_segment, result_vec );
|
309
299
|
for ( pos = 0; pos < SSE_VECTOR_WIDTH; ++pos )
|
data/ext/vector_sse/vector_sse_sum.c
CHANGED
@@ -32,9 +32,31 @@
|
|
32
32
|
|
33
33
|
#include <string.h>
|
34
34
|
#include <emmintrin.h>
|
35
|
+
#include <ruby.h>
|
35
36
|
#include "vector_sse_sum.h"
|
37
|
+
#include "vector_sse_common.h"
|
36
38
|
|
37
|
-
|
39
|
+
|
40
|
+
// Check for overflow
|
41
|
+
// __m128i sign_left;
|
42
|
+
// __m128i sign_right;
|
43
|
+
// const int32_t OVERFLOW_MASK = ( (int32_t)0x1 << (32-1) );
|
44
|
+
// int32_t overflow[ 4 ];
|
45
|
+
// __m128i* overflow_vec = (__m128i*)overflow;
|
46
|
+
// sign_left = _mm_xor_si128(result_vec, left_vec);
|
47
|
+
// sign_right = _mm_xor_si128(result_vec, right_vec);
|
48
|
+
// *overflow_vec = _mm_and_si128(sign_left, sign_right);
|
49
|
+
|
50
|
+
// for ( vector_pos = 0; vector_pos < 4; ++vector_pos )
|
51
|
+
// {
|
52
|
+
// if ( ( (int32_t)overflow[ vector_pos ] & OVERFLOW_MASK ) )
|
53
|
+
// {
|
54
|
+
// rb_raise( rb_eRuntimeError, "Vector addition overflow" );
|
55
|
+
// }
|
56
|
+
// }
|
57
|
+
|
58
|
+
|
59
|
+
#define TEMPLATE_SUM_S( FUNC_NAME, TYPE, CONV_IN, CONV_OUT, EL_PER_VEC, ADDER ) \
|
38
60
|
VALUE FUNC_NAME( VALUE self, VALUE vector ) \
|
39
61
|
{ \
|
40
62
|
uint32_t length = 0; \
|
@@ -44,20 +66,12 @@ VALUE FUNC_NAME( VALUE self, VALUE vector ) \
|
|
44
66
|
\
|
45
67
|
TYPE result = 0; \
|
46
68
|
\
|
47
|
-
TYPE left_segment[ EL_PER_VEC ]; \
|
48
|
-
TYPE right_segment[ EL_PER_VEC ]; \
|
49
69
|
TYPE result_segment[ EL_PER_VEC ]; \
|
50
70
|
TYPE vector_segment[ EL_PER_VEC ]; \
|
51
71
|
\
|
52
|
-
__m128i left_vec;
|
53
|
-
__m128i right_vec;
|
72
|
+
__m128i left_vec; \
|
73
|
+
__m128i right_vec; \
|
54
74
|
__m128i result_vec; \
|
55
|
-
\
|
56
|
-
__m128i sign_left; \
|
57
|
-
__m128i sign_right; \
|
58
|
-
const OFTYPE OVERFLOW_MASK = ( (OFTYPE)0x1 << (TYPE_SIZE-1) ); \
|
59
|
-
OFTYPE overflow[ EL_PER_VEC ]; \
|
60
|
-
__m128i* overflow_vec = (__m128i*)overflow; \
|
61
75
|
\
|
62
76
|
Check_Type( vector, T_ARRAY ); \
|
63
77
|
\
|
@@ -85,19 +99,7 @@ VALUE FUNC_NAME( VALUE self, VALUE vector ) \
|
|
85
99
|
right_vec = _mm_loadu_si128( (const __m128i *)vector_segment ); \
|
86
100
|
left_vec = _mm_loadu_si128( &result_vec ); \
|
87
101
|
\
|
88
|
-
result_vec =
|
89
|
-
\
|
90
|
-
sign_left = _mm_xor_si128(result_vec, left_vec); \
|
91
|
-
sign_right = _mm_xor_si128(result_vec, right_vec); \
|
92
|
-
*overflow_vec = _mm_and_si128(sign_left, sign_right); \
|
93
|
-
\
|
94
|
-
for ( vector_pos = 0; vector_pos < EL_PER_VEC; ++vector_pos ) \
|
95
|
-
{ \
|
96
|
-
if ( ( (OFTYPE)overflow[ vector_pos ] & OVERFLOW_MASK ) ) \
|
97
|
-
{ \
|
98
|
-
rb_raise( rb_eRuntimeError, "Vector addition overflow" ); \
|
99
|
-
} \
|
100
|
-
} \
|
102
|
+
result_vec = ADDER( left_vec, right_vec ); \
|
101
103
|
} \
|
102
104
|
\
|
103
105
|
_mm_store_si128( (__m128i*)result_segment, result_vec ); \
|
@@ -111,8 +113,8 @@ VALUE FUNC_NAME( VALUE self, VALUE vector ) \
|
|
111
113
|
return CONV_OUT( result ); \
|
112
114
|
}
|
113
115
|
|
114
|
-
TEMPLATE_SUM_S( method_vec_sum_s32, int32_t,
|
115
|
-
TEMPLATE_SUM_S( method_vec_sum_s64, int64_t,
|
116
|
-
TEMPLATE_SUM_S( method_vec_sum_f32, float,
|
117
|
-
TEMPLATE_SUM_S( method_vec_sum_f64, double,
|
116
|
+
TEMPLATE_SUM_S( method_vec_sum_s32, int32_t, NUM2INT, INT2NUM, 4, _mm_add_epi32 );
|
117
|
+
TEMPLATE_SUM_S( method_vec_sum_s64, int64_t, NUM2LL, LL2NUM, 2, _mm_add_epi64 );
|
118
|
+
TEMPLATE_SUM_S( method_vec_sum_f32, float, NUM2DBL, DBL2NUM, 4, add_f32 );
|
119
|
+
TEMPLATE_SUM_S( method_vec_sum_f64, double, NUM2DBL, DBL2NUM, 2, add_f64 );
|
118
120
|
|
data/ext/vector_sse/vector_sse_vec_mul.c
CHANGED
@@ -36,6 +36,7 @@
|
|
36
36
|
#include <smmintrin.h>
|
37
37
|
#endif
|
38
38
|
#include "vector_sse_vec_mul.h"
|
39
|
+
#include "vector_sse_common.h"
|
39
40
|
|
40
41
|
#define SSE_VECTOR_WIDTH (4)
|
41
42
|
// #define EL_PER_VEC SSE_VECTOR_WIDTH
|
@@ -66,14 +67,14 @@ static inline __m128i mul_s64( const __m128i* left_vec, const __m128i* right_vec
|
|
66
67
|
return _mm_loadu_si128( (const __m128i *)result );
|
67
68
|
}
|
68
69
|
|
69
|
-
static inline __m128i
|
70
|
+
static inline __m128i mul_f32_ptr(const __m128i* a, const __m128i* b )
|
70
71
|
{
|
71
|
-
return
|
72
|
+
return mul_f32( *a, *b );
|
72
73
|
}
|
73
74
|
|
74
|
-
static inline __m128i
|
75
|
+
static inline __m128i mul_f64_ptr(const __m128i* a, const __m128i* b )
|
75
76
|
{
|
76
|
-
return
|
77
|
+
return mul_f64( *a, *b );
|
77
78
|
}
|
78
79
|
|
79
80
|
|
@@ -95,9 +96,6 @@ VALUE FUNC_NAME( VALUE self, VALUE left, VALUE right ) \
|
|
95
96
|
\
|
96
97
|
TYPE result_segment[ EL_PER_VEC ]; \
|
97
98
|
__m128i result_vec; \
|
98
|
-
\
|
99
|
-
__m128i sign_left; \
|
100
|
-
__m128i sign_right; \
|
101
99
|
\
|
102
100
|
Check_Type( left, T_ARRAY ); \
|
103
101
|
Check_Type( right, T_ARRAY ); \
|
@@ -152,6 +150,6 @@ VALUE FUNC_NAME( VALUE self, VALUE left, VALUE right ) \
|
|
152
150
|
|
153
151
|
TEMPLATE_VEC_MUL_S( method_vec_mul_s32, int32_t, 32, NUM2INT, INT2NUM, 4, mul_s32 );
|
154
152
|
TEMPLATE_VEC_MUL_S( method_vec_mul_s64, int64_t, 64, NUM2LL, LL2NUM, 2, mul_s64 );
|
155
|
-
TEMPLATE_VEC_MUL_S( method_vec_mul_f32, float, 32, NUM2DBL, DBL2NUM, 4,
|
156
|
-
TEMPLATE_VEC_MUL_S( method_vec_mul_f64, double, 64, NUM2DBL, DBL2NUM, 2,
|
153
|
+
TEMPLATE_VEC_MUL_S( method_vec_mul_f32, float, 32, NUM2DBL, DBL2NUM, 4, mul_f32_ptr );
|
154
|
+
TEMPLATE_VEC_MUL_S( method_vec_mul_f64, double, 64, NUM2DBL, DBL2NUM, 2, mul_f64_ptr );
|
157
155
|
|
data/lib/vector_sse.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: vector_sse
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2.pre
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Robert Glissmann
|
@@ -14,28 +14,28 @@ dependencies:
|
|
14
14
|
name: rake-compiler
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: 0.9.5
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - ~>
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 0.9.5
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rspec
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - ~>
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: 3.1.0
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - ~>
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 3.1.0
|
41
41
|
description: VectorSse employs x86 Streaming SIMD Extensions (SSE), v3 or greater,
|
@@ -46,15 +46,19 @@ extensions:
|
|
46
46
|
- ext/vector_sse/extconf.rb
|
47
47
|
extra_rdoc_files: []
|
48
48
|
files:
|
49
|
-
- .gitignore
|
49
|
+
- ".gitignore"
|
50
|
+
- Gemfile
|
50
51
|
- LICENSE.txt
|
51
52
|
- README.md
|
52
53
|
- Rakefile
|
54
|
+
- ext/.gitignore
|
53
55
|
- ext/vector_sse/.gitignore
|
54
56
|
- ext/vector_sse/extconf.rb
|
55
57
|
- ext/vector_sse/vector_sse.c
|
56
58
|
- ext/vector_sse/vector_sse_add.c
|
57
59
|
- ext/vector_sse/vector_sse_add.h
|
60
|
+
- ext/vector_sse/vector_sse_common.c
|
61
|
+
- ext/vector_sse/vector_sse_common.h
|
58
62
|
- ext/vector_sse/vector_sse_mul.c
|
59
63
|
- ext/vector_sse/vector_sse_mul.h
|
60
64
|
- ext/vector_sse/vector_sse_sum.c
|
@@ -76,17 +80,17 @@ require_paths:
|
|
76
80
|
- lib
|
77
81
|
required_ruby_version: !ruby/object:Gem::Requirement
|
78
82
|
requirements:
|
79
|
-
- -
|
83
|
+
- - ">="
|
80
84
|
- !ruby/object:Gem::Version
|
81
85
|
version: '0'
|
82
86
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
83
87
|
requirements:
|
84
|
-
- -
|
88
|
+
- - ">"
|
85
89
|
- !ruby/object:Gem::Version
|
86
90
|
version: 1.3.1
|
87
91
|
requirements: []
|
88
92
|
rubyforge_project:
|
89
|
-
rubygems_version: 2.
|
93
|
+
rubygems_version: 2.4.8
|
90
94
|
signing_key:
|
91
95
|
specification_version: 4
|
92
96
|
summary: SIMD accelerated vector and matrix operations
|