cglm 0.1.0 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +2 -1
- data/Gemfile.lock +18 -16
- data/README.md +2 -0
- data/cglm.gemspec +3 -3
- data/ext/cglm/cglm-0.6.2/include/cglm/affine-mat.h +168 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/affine.h +490 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/applesimd.h +95 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/bezier.h +154 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/box.h +279 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/call/affine.h +117 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/call/bezier.h +31 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/call/box.h +79 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/call/cam.h +143 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/call/curve.h +23 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/call/ease.h +143 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/call/euler.h +55 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/call/frustum.h +41 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/call/io.h +44 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/call/mat3.h +86 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/call/mat4.h +127 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/call/plane.h +23 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/call/project.h +33 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/call/quat.h +159 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/call/sphere.h +39 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/call/vec3.h +312 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/call/vec4.h +290 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/call.h +36 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/cam.h +585 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/cglm.h +32 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/color.h +26 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/common.h +37 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/curve.h +40 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/ease.h +317 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/euler.h +453 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/frustum.h +255 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/io.h +203 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/mat3.h +422 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/mat4.h +726 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/plane.h +36 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/project.h +118 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/quat.h +828 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/simd/arm.h +83 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/simd/avx/affine.h +66 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/simd/avx/mat4.h +66 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/simd/intrin.h +90 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/simd/neon/mat4.h +57 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/simd/sse2/affine.h +111 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/simd/sse2/mat3.h +59 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/simd/sse2/mat4.h +405 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/simd/sse2/quat.h +46 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/simd/x86.h +192 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/sphere.h +99 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/struct/affine.h +337 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/struct/box.h +256 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/struct/cam.h +451 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/struct/color.h +27 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/struct/curve.h +40 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/struct/euler.h +152 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/struct/frustum.h +155 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/struct/io.h +82 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/struct/mat3.h +285 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/struct/mat4.h +459 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/struct/plane.h +40 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/struct/project.h +104 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/struct/quat.h +532 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/struct/sphere.h +93 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/struct/vec3-ext.h +257 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/struct/vec3.h +970 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/struct/vec4-ext.h +257 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/struct/vec4.h +814 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/struct.h +36 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/types-struct.h +129 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/types.h +76 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/util.h +328 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/vec3-ext.h +272 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/vec3.h +1078 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/vec4-ext.h +315 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/vec4.h +1078 -0
- data/ext/cglm/cglm-0.6.2/include/cglm/version.h +15 -0
- data/ext/cglm/extconf.rb +2 -3
- data/ext/cglm/rb_cglm.h +5 -3
- data/ext/cglm/rb_cglm_mat3.c +3 -3
- data/ext/cglm/rb_cglm_mat4.c +3 -3
- data/ext/cglm/rb_cglm_quat.c +2 -2
- data/ext/cglm/rb_cglm_vec3.c +63 -61
- data/ext/cglm/rb_cglm_vec4.c +2 -0
- data/ext/cglm/ruby_pre27.h +35 -0
- data/lib/cglm/vec3.rb +2 -2
- data/lib/cglm/vec4.rb +2 -2
- data/lib/cglm/vector_type.rb +15 -0
- data/lib/cglm/version.rb +1 -1
- metadata +89 -13
@@ -0,0 +1,83 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c), Recep Aslantas.
|
3
|
+
*
|
4
|
+
* MIT License (MIT), http://opensource.org/licenses/MIT
|
5
|
+
* Full license can be found in the LICENSE file
|
6
|
+
*/
|
7
|
+
|
8
|
+
#ifndef cglm_simd_arm_h
|
9
|
+
#define cglm_simd_arm_h
|
10
|
+
#include "intrin.h"
|
11
|
+
#ifdef CGLM_SIMD_ARM
|
12
|
+
|
13
|
+
/* four-float load/store helpers, mapped directly onto the NEON q-register
   load and store intrinsics */
#define glmm_load(p)      vld1q_f32(p)
#define glmm_store(p, a)  vst1q_f32(p, a)
|
15
|
+
|
16
|
+
/*!
 * @brief lane-wise absolute value of a four-float vector
 *
 * @param[in] v input vector
 * @return vector whose lanes are |v[i]|
 */
static inline
float32x4_t
glmm_abs(float32x4_t v) {
  float32x4_t magnitude;

  magnitude = vabsq_f32(v);

  return magnitude;
}
|
21
|
+
|
22
|
+
/*!
 * @brief horizontal add: sum of the four lanes of v
 *
 * @param[in] v input vector
 * @return v[0] + v[1] + v[2] + v[3]
 */
static inline
float
glmm_hadd(float32x4_t v) {
#if !defined(__aarch64__)
  float32x4_t pairs, total;

  /* add each lane to its neighbour inside the same 64-bit half ... */
  pairs = vaddq_f32(v, vrev64q_f32(v));

  /* ... then add the two halves, leaving the full sum in every lane */
  total = vaddq_f32(pairs,
                    vcombine_f32(vget_high_f32(pairs), vget_low_f32(pairs)));

  return vgetq_lane_f32(total, 0);
#else
  /* AArch64 has a single across-vector add */
  return vaddvq_f32(v);
#endif
}
|
33
|
+
|
34
|
+
/*!
 * @brief horizontal minimum: smallest of the four lanes of v
 *
 * @param[in] v input vector
 * @return minimum lane value
 */
static inline
float
glmm_hmin(float32x4_t v) {
  float32x2_t halves, lowest;

  /* pairwise min of (v0, v1) and (v2, v3) */
  halves = vpmin_f32(vget_low_f32(v), vget_high_f32(v));

  /* pairwise min of the two partial results */
  lowest = vpmin_f32(halves, halves);

  return vget_lane_f32(lowest, 0);
}
|
42
|
+
|
43
|
+
/*!
 * @brief horizontal maximum: largest of the four lanes of v
 *
 * @param[in] v input vector
 * @return maximum lane value
 */
static inline
float
glmm_hmax(float32x4_t v) {
  float32x2_t halves, highest;

  /* pairwise max of (v0, v1) and (v2, v3) */
  halves = vpmax_f32(vget_low_f32(v), vget_high_f32(v));

  /* pairwise max of the two partial results */
  highest = vpmax_f32(halves, halves);

  return vget_lane_f32(highest, 0);
}
|
51
|
+
|
52
|
+
/*!
 * @brief four-component dot product
 *
 * @param[in] a first vector
 * @param[in] b second vector
 * @return sum over all four lanes of a[i] * b[i]
 */
static inline
float
glmm_dot(float32x4_t a, float32x4_t b) {
  float32x4_t products;

  products = vmulq_f32(a, b);

  return glmm_hadd(products);
}
|
57
|
+
|
58
|
+
/*!
 * @brief Euclidean (L2) norm of a four-float vector
 *
 * @param[in] a input vector
 * @return square root of the dot product of a with itself
 */
static inline
float
glmm_norm(float32x4_t a) {
  float squared;

  squared = glmm_dot(a, a);

  return sqrtf(squared);
}
|
63
|
+
|
64
|
+
/*!
 * @brief squared Euclidean norm of a four-float vector
 *
 * @param[in] a input vector
 * @return dot product of a with itself (no square root taken)
 */
static inline
float
glmm_norm2(float32x4_t a) {
  float squared;

  squared = glmm_dot(a, a);

  return squared;
}
|
69
|
+
|
70
|
+
/*!
 * @brief L1 norm: sum of the absolute values of the four lanes
 *
 * @param[in] a input vector
 * @return |a[0]| + |a[1]| + |a[2]| + |a[3]|
 */
static inline
float
glmm_norm_one(float32x4_t a) {
  float32x4_t magnitudes;

  magnitudes = glmm_abs(a);

  return glmm_hadd(magnitudes);
}
|
75
|
+
|
76
|
+
/*!
 * @brief infinity norm: largest absolute value among the four lanes
 *
 * @param[in] a input vector
 * @return max(|a[0]|, |a[1]|, |a[2]|, |a[3]|)
 */
static inline
float
glmm_norm_inf(float32x4_t a) {
  float32x4_t magnitudes;

  magnitudes = glmm_abs(a);

  return glmm_hmax(magnitudes);
}
|
81
|
+
|
82
|
+
#endif
|
83
|
+
#endif /* cglm_simd_arm_h */
|
@@ -0,0 +1,66 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c), Recep Aslantas.
|
3
|
+
*
|
4
|
+
* MIT License (MIT), http://opensource.org/licenses/MIT
|
5
|
+
* Full license can be found in the LICENSE file
|
6
|
+
*/
|
7
|
+
|
8
|
+
#ifndef cglm_affine_mat_avx_h
|
9
|
+
#define cglm_affine_mat_avx_h
|
10
|
+
#ifdef __AVX__
|
11
|
+
|
12
|
+
#include "../../common.h"
|
13
|
+
#include "../intrin.h"
|
14
|
+
|
15
|
+
#include <immintrin.h>
|
16
|
+
|
17
|
+
CGLM_INLINE
|
18
|
+
void
|
19
|
+
glm_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
|
20
|
+
/* D = R * L (Column-Major) */
|
21
|
+
|
22
|
+
__m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
|
23
|
+
|
24
|
+
y0 = glmm_load256(m2[0]); /* h g f e d c b a */
|
25
|
+
y1 = glmm_load256(m2[2]); /* p o n m l k j i */
|
26
|
+
|
27
|
+
y2 = glmm_load256(m1[0]); /* h g f e d c b a */
|
28
|
+
y3 = glmm_load256(m1[2]); /* p o n m l k j i */
|
29
|
+
|
30
|
+
/* 0x03: 0b00000011 */
|
31
|
+
y4 = _mm256_permute2f128_ps(y2, y2, 0x03); /* d c b a h g f e */
|
32
|
+
y5 = _mm256_permute2f128_ps(y3, y3, 0x03); /* l k j i p o n m */
|
33
|
+
|
34
|
+
/* f f f f a a a a */
|
35
|
+
/* h h h h c c c c */
|
36
|
+
/* e e e e b b b b */
|
37
|
+
/* g g g g d d d d */
|
38
|
+
y6 = _mm256_permutevar_ps(y0, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0));
|
39
|
+
y7 = _mm256_permutevar_ps(y0, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2));
|
40
|
+
y8 = _mm256_permutevar_ps(y0, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
|
41
|
+
y9 = _mm256_permutevar_ps(y0, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));
|
42
|
+
|
43
|
+
glmm_store256(dest[0],
|
44
|
+
_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
|
45
|
+
_mm256_mul_ps(y3, y7)),
|
46
|
+
_mm256_add_ps(_mm256_mul_ps(y4, y8),
|
47
|
+
_mm256_mul_ps(y5, y9))));
|
48
|
+
|
49
|
+
/* n n n n i i i i */
|
50
|
+
/* p p p p k k k k */
|
51
|
+
/* m m m m j j j j */
|
52
|
+
/* o o o o l l l l */
|
53
|
+
y6 = _mm256_permutevar_ps(y1, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0));
|
54
|
+
y7 = _mm256_permutevar_ps(y1, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2));
|
55
|
+
y8 = _mm256_permutevar_ps(y1, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
|
56
|
+
y9 = _mm256_permutevar_ps(y1, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));
|
57
|
+
|
58
|
+
glmm_store256(dest[2],
|
59
|
+
_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
|
60
|
+
_mm256_mul_ps(y3, y7)),
|
61
|
+
_mm256_add_ps(_mm256_mul_ps(y4, y8),
|
62
|
+
_mm256_mul_ps(y5, y9))));
|
63
|
+
}
|
64
|
+
|
65
|
+
#endif
|
66
|
+
#endif /* cglm_affine_mat_avx_h */
|
@@ -0,0 +1,66 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c), Recep Aslantas.
|
3
|
+
*
|
4
|
+
* MIT License (MIT), http://opensource.org/licenses/MIT
|
5
|
+
* Full license can be found in the LICENSE file
|
6
|
+
*/
|
7
|
+
|
8
|
+
#ifndef cglm_mat_simd_avx_h
|
9
|
+
#define cglm_mat_simd_avx_h
|
10
|
+
#ifdef __AVX__
|
11
|
+
|
12
|
+
#include "../../common.h"
|
13
|
+
#include "../intrin.h"
|
14
|
+
|
15
|
+
#include <immintrin.h>
|
16
|
+
|
17
|
+
CGLM_INLINE
|
18
|
+
void
|
19
|
+
glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
|
20
|
+
/* D = R * L (Column-Major) */
|
21
|
+
|
22
|
+
__m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
|
23
|
+
|
24
|
+
y0 = glmm_load256(m2[0]); /* h g f e d c b a */
|
25
|
+
y1 = glmm_load256(m2[2]); /* p o n m l k j i */
|
26
|
+
|
27
|
+
y2 = glmm_load256(m1[0]); /* h g f e d c b a */
|
28
|
+
y3 = glmm_load256(m1[2]); /* p o n m l k j i */
|
29
|
+
|
30
|
+
/* 0x03: 0b00000011 */
|
31
|
+
y4 = _mm256_permute2f128_ps(y2, y2, 0x03); /* d c b a h g f e */
|
32
|
+
y5 = _mm256_permute2f128_ps(y3, y3, 0x03); /* l k j i p o n m */
|
33
|
+
|
34
|
+
/* f f f f a a a a */
|
35
|
+
/* h h h h c c c c */
|
36
|
+
/* e e e e b b b b */
|
37
|
+
/* g g g g d d d d */
|
38
|
+
y6 = _mm256_permutevar_ps(y0, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0));
|
39
|
+
y7 = _mm256_permutevar_ps(y0, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2));
|
40
|
+
y8 = _mm256_permutevar_ps(y0, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
|
41
|
+
y9 = _mm256_permutevar_ps(y0, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));
|
42
|
+
|
43
|
+
glmm_store256(dest[0],
|
44
|
+
_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
|
45
|
+
_mm256_mul_ps(y3, y7)),
|
46
|
+
_mm256_add_ps(_mm256_mul_ps(y4, y8),
|
47
|
+
_mm256_mul_ps(y5, y9))));
|
48
|
+
|
49
|
+
/* n n n n i i i i */
|
50
|
+
/* p p p p k k k k */
|
51
|
+
/* m m m m j j j j */
|
52
|
+
/* o o o o l l l l */
|
53
|
+
y6 = _mm256_permutevar_ps(y1, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0));
|
54
|
+
y7 = _mm256_permutevar_ps(y1, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2));
|
55
|
+
y8 = _mm256_permutevar_ps(y1, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
|
56
|
+
y9 = _mm256_permutevar_ps(y1, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));
|
57
|
+
|
58
|
+
glmm_store256(dest[2],
|
59
|
+
_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
|
60
|
+
_mm256_mul_ps(y3, y7)),
|
61
|
+
_mm256_add_ps(_mm256_mul_ps(y4, y8),
|
62
|
+
_mm256_mul_ps(y5, y9))));
|
63
|
+
}
|
64
|
+
|
65
|
+
#endif
|
66
|
+
#endif /* cglm_mat_simd_avx_h */
|
@@ -0,0 +1,90 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c), Recep Aslantas.
|
3
|
+
*
|
4
|
+
* MIT License (MIT), http://opensource.org/licenses/MIT
|
5
|
+
* Full license can be found in the LICENSE file
|
6
|
+
*/
|
7
|
+
|
8
|
+
#ifndef cglm_intrin_h
|
9
|
+
#define cglm_intrin_h
|
10
|
+
|
11
|
+
#if defined( _MSC_VER )
|
12
|
+
# if (defined(_M_AMD64) || defined(_M_X64)) || _M_IX86_FP == 2
|
13
|
+
# ifndef __SSE2__
|
14
|
+
# define __SSE2__
|
15
|
+
# endif
|
16
|
+
# elif _M_IX86_FP == 1
|
17
|
+
# ifndef __SSE__
|
18
|
+
# define __SSE__
|
19
|
+
# endif
|
20
|
+
# endif
|
21
|
+
/* do not use alignment for older visual studio versions */
|
22
|
+
# if _MSC_VER < 1913 /* Visual Studio 2017 version 15.6 */
|
23
|
+
# define CGLM_ALL_UNALIGNED
|
24
|
+
# endif
|
25
|
+
#endif
|
26
|
+
|
27
|
+
#if defined( __SSE__ ) || defined( __SSE2__ )
|
28
|
+
# include <xmmintrin.h>
|
29
|
+
# include <emmintrin.h>
|
30
|
+
# define CGLM_SSE_FP 1
|
31
|
+
# ifndef CGLM_SIMD_x86
|
32
|
+
# define CGLM_SIMD_x86
|
33
|
+
# endif
|
34
|
+
#endif
|
35
|
+
|
36
|
+
#if defined(__SSE3__)
|
37
|
+
# include <x86intrin.h>
|
38
|
+
# ifndef CGLM_SIMD_x86
|
39
|
+
# define CGLM_SIMD_x86
|
40
|
+
# endif
|
41
|
+
#endif
|
42
|
+
|
43
|
+
#if defined(__SSE4_1__)
|
44
|
+
# include <smmintrin.h>
|
45
|
+
# ifndef CGLM_SIMD_x86
|
46
|
+
# define CGLM_SIMD_x86
|
47
|
+
# endif
|
48
|
+
#endif
|
49
|
+
|
50
|
+
#if defined(__SSE4_2__)
|
51
|
+
# include <nmmintrin.h>
|
52
|
+
# ifndef CGLM_SIMD_x86
|
53
|
+
# define CGLM_SIMD_x86
|
54
|
+
# endif
|
55
|
+
#endif
|
56
|
+
|
57
|
+
#ifdef __AVX__
|
58
|
+
# include <immintrin.h>
|
59
|
+
# define CGLM_AVX_FP 1
|
60
|
+
# ifndef CGLM_SIMD_x86
|
61
|
+
# define CGLM_SIMD_x86
|
62
|
+
# endif
|
63
|
+
#endif
|
64
|
+
|
65
|
+
/* ARM Neon */
|
66
|
+
#if defined(__ARM_NEON)
|
67
|
+
# include <arm_neon.h>
|
68
|
+
# if defined(__ARM_NEON_FP)
|
69
|
+
# define CGLM_NEON_FP 1
|
70
|
+
# ifndef CGLM_SIMD_ARM
|
71
|
+
# define CGLM_SIMD_ARM
|
72
|
+
# endif
|
73
|
+
# endif
|
74
|
+
#endif
|
75
|
+
|
76
|
+
#if defined(CGLM_SIMD_x86) || defined(CGLM_NEON_FP)
|
77
|
+
# ifndef CGLM_SIMD
|
78
|
+
# define CGLM_SIMD
|
79
|
+
# endif
|
80
|
+
#endif
|
81
|
+
|
82
|
+
#if defined(CGLM_SIMD_x86)
|
83
|
+
# include "x86.h"
|
84
|
+
#endif
|
85
|
+
|
86
|
+
#if defined(CGLM_SIMD_ARM)
|
87
|
+
# include "arm.h"
|
88
|
+
#endif
|
89
|
+
|
90
|
+
#endif /* cglm_intrin_h */
|
@@ -0,0 +1,57 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c), Recep Aslantas.
|
3
|
+
*
|
4
|
+
* MIT License (MIT), http://opensource.org/licenses/MIT
|
5
|
+
* Full license can be found in the LICENSE file
|
6
|
+
*/
|
7
|
+
|
8
|
+
#ifndef cglm_mat4_neon_h
|
9
|
+
#define cglm_mat4_neon_h
|
10
|
+
#if defined(__ARM_NEON_FP)
|
11
|
+
|
12
|
+
#include "../../common.h"
|
13
|
+
#include "../intrin.h"
|
14
|
+
|
15
|
+
/*!
 * @brief 4x4 matrix product using NEON
 *
 * For every column j the result is
 *   dest[j] = m1[0]*m2[j][0] + m1[1]*m2[j][1] + m1[2]*m2[j][2] + m1[3]*m2[j][3]
 * Both inputs are read completely before dest is written.
 *
 * @param[in]  m1   matrix whose columns are linearly combined
 * @param[in]  m2   matrix supplying the per-column weights
 * @param[out] dest result matrix
 */
CGLM_INLINE
void
glm_mat4_mul_neon(mat4 m1, mat4 m2, mat4 dest) {
  float32x4_t a0, a1, a2, a3;  /* columns of m1 */
  float32x4_t b0, b1, b2, b3;  /* columns of m2 */
  float32x4_t c0, c1, c2, c3;  /* accumulated result columns */

  b0 = vld1q_f32(m2[0]);
  b1 = vld1q_f32(m2[1]);
  b2 = vld1q_f32(m2[2]);
  b3 = vld1q_f32(m2[3]);

  a0 = vld1q_f32(m1[0]);
  a1 = vld1q_f32(m1[1]);
  a2 = vld1q_f32(m1[2]);
  a3 = vld1q_f32(m1[3]);

  /* column 0 */
  c0 = vmulq_lane_f32(a0, vget_low_f32(b0), 0);
  c0 = vmlaq_lane_f32(c0, a1, vget_low_f32(b0), 1);
  c0 = vmlaq_lane_f32(c0, a2, vget_high_f32(b0), 0);
  c0 = vmlaq_lane_f32(c0, a3, vget_high_f32(b0), 1);

  /* column 1 */
  c1 = vmulq_lane_f32(a0, vget_low_f32(b1), 0);
  c1 = vmlaq_lane_f32(c1, a1, vget_low_f32(b1), 1);
  c1 = vmlaq_lane_f32(c1, a2, vget_high_f32(b1), 0);
  c1 = vmlaq_lane_f32(c1, a3, vget_high_f32(b1), 1);

  /* column 2 */
  c2 = vmulq_lane_f32(a0, vget_low_f32(b2), 0);
  c2 = vmlaq_lane_f32(c2, a1, vget_low_f32(b2), 1);
  c2 = vmlaq_lane_f32(c2, a2, vget_high_f32(b2), 0);
  c2 = vmlaq_lane_f32(c2, a3, vget_high_f32(b2), 1);

  /* column 3 */
  c3 = vmulq_lane_f32(a0, vget_low_f32(b3), 0);
  c3 = vmlaq_lane_f32(c3, a1, vget_low_f32(b3), 1);
  c3 = vmlaq_lane_f32(c3, a2, vget_high_f32(b3), 0);
  c3 = vmlaq_lane_f32(c3, a3, vget_high_f32(b3), 1);

  vst1q_f32(dest[0], c0);
  vst1q_f32(dest[1], c1);
  vst1q_f32(dest[2], c2);
  vst1q_f32(dest[3], c3);
}
|
55
|
+
|
56
|
+
#endif
|
57
|
+
#endif /* cglm_mat4_neon_h */
|
@@ -0,0 +1,111 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c), Recep Aslantas.
|
3
|
+
*
|
4
|
+
* MIT License (MIT), http://opensource.org/licenses/MIT
|
5
|
+
* Full license can be found in the LICENSE file
|
6
|
+
*/
|
7
|
+
|
8
|
+
#ifndef cglm_affine_mat_sse2_h
|
9
|
+
#define cglm_affine_mat_sse2_h
|
10
|
+
#if defined( __SSE__ ) || defined( __SSE2__ )
|
11
|
+
|
12
|
+
#include "../../common.h"
|
13
|
+
#include "../intrin.h"
|
14
|
+
|
15
|
+
CGLM_INLINE
|
16
|
+
void
|
17
|
+
glm_mul_sse2(mat4 m1, mat4 m2, mat4 dest) {
|
18
|
+
/* D = R * L (Column-Major) */
|
19
|
+
__m128 l0, l1, l2, l3, r;
|
20
|
+
|
21
|
+
l0 = glmm_load(m1[0]);
|
22
|
+
l1 = glmm_load(m1[1]);
|
23
|
+
l2 = glmm_load(m1[2]);
|
24
|
+
l3 = glmm_load(m1[3]);
|
25
|
+
|
26
|
+
r = glmm_load(m2[0]);
|
27
|
+
glmm_store(dest[0],
|
28
|
+
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
|
29
|
+
_mm_mul_ps(glmm_shuff1x(r, 1), l1)),
|
30
|
+
_mm_mul_ps(glmm_shuff1x(r, 2), l2)));
|
31
|
+
|
32
|
+
r = glmm_load(m2[1]);
|
33
|
+
glmm_store(dest[1],
|
34
|
+
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
|
35
|
+
_mm_mul_ps(glmm_shuff1x(r, 1), l1)),
|
36
|
+
_mm_mul_ps(glmm_shuff1x(r, 2), l2)));
|
37
|
+
|
38
|
+
r = glmm_load(m2[2]);
|
39
|
+
glmm_store(dest[2],
|
40
|
+
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
|
41
|
+
_mm_mul_ps(glmm_shuff1x(r, 1), l1)),
|
42
|
+
_mm_mul_ps(glmm_shuff1x(r, 2), l2)));
|
43
|
+
|
44
|
+
r = glmm_load(m2[3]);
|
45
|
+
glmm_store(dest[3],
|
46
|
+
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
|
47
|
+
_mm_mul_ps(glmm_shuff1x(r, 1), l1)),
|
48
|
+
_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 2), l2),
|
49
|
+
_mm_mul_ps(glmm_shuff1x(r, 3), l3))));
|
50
|
+
}
|
51
|
+
|
52
|
+
CGLM_INLINE
|
53
|
+
void
|
54
|
+
glm_mul_rot_sse2(mat4 m1, mat4 m2, mat4 dest) {
|
55
|
+
/* D = R * L (Column-Major) */
|
56
|
+
__m128 l0, l1, l2, l3, r;
|
57
|
+
|
58
|
+
l0 = glmm_load(m1[0]);
|
59
|
+
l1 = glmm_load(m1[1]);
|
60
|
+
l2 = glmm_load(m1[2]);
|
61
|
+
l3 = glmm_load(m1[3]);
|
62
|
+
|
63
|
+
r = glmm_load(m2[0]);
|
64
|
+
glmm_store(dest[0],
|
65
|
+
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
|
66
|
+
_mm_mul_ps(glmm_shuff1x(r, 1), l1)),
|
67
|
+
_mm_mul_ps(glmm_shuff1x(r, 2), l2)));
|
68
|
+
|
69
|
+
r = glmm_load(m2[1]);
|
70
|
+
glmm_store(dest[1],
|
71
|
+
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
|
72
|
+
_mm_mul_ps(glmm_shuff1x(r, 1), l1)),
|
73
|
+
_mm_mul_ps(glmm_shuff1x(r, 2), l2)));
|
74
|
+
|
75
|
+
r = glmm_load(m2[2]);
|
76
|
+
glmm_store(dest[2],
|
77
|
+
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
|
78
|
+
_mm_mul_ps(glmm_shuff1x(r, 1), l1)),
|
79
|
+
_mm_mul_ps(glmm_shuff1x(r, 2), l2)));
|
80
|
+
|
81
|
+
glmm_store(dest[3], l3);
|
82
|
+
}
|
83
|
+
|
84
|
+
CGLM_INLINE
|
85
|
+
void
|
86
|
+
glm_inv_tr_sse2(mat4 mat) {
|
87
|
+
__m128 r0, r1, r2, r3, x0, x1;
|
88
|
+
|
89
|
+
r0 = glmm_load(mat[0]);
|
90
|
+
r1 = glmm_load(mat[1]);
|
91
|
+
r2 = glmm_load(mat[2]);
|
92
|
+
r3 = glmm_load(mat[3]);
|
93
|
+
x1 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
|
94
|
+
|
95
|
+
_MM_TRANSPOSE4_PS(r0, r1, r2, x1);
|
96
|
+
|
97
|
+
x0 = _mm_add_ps(_mm_mul_ps(r0, glmm_shuff1(r3, 0, 0, 0, 0)),
|
98
|
+
_mm_mul_ps(r1, glmm_shuff1(r3, 1, 1, 1, 1)));
|
99
|
+
x0 = _mm_add_ps(x0, _mm_mul_ps(r2, glmm_shuff1(r3, 2, 2, 2, 2)));
|
100
|
+
x0 = _mm_xor_ps(x0, _mm_set1_ps(-0.f));
|
101
|
+
|
102
|
+
x0 = _mm_add_ps(x0, x1);
|
103
|
+
|
104
|
+
glmm_store(mat[0], r0);
|
105
|
+
glmm_store(mat[1], r1);
|
106
|
+
glmm_store(mat[2], r2);
|
107
|
+
glmm_store(mat[3], x0);
|
108
|
+
}
|
109
|
+
|
110
|
+
#endif
|
111
|
+
#endif /* cglm_affine_mat_sse2_h */
|
@@ -0,0 +1,59 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c), Recep Aslantas.
|
3
|
+
*
|
4
|
+
* MIT License (MIT), http://opensource.org/licenses/MIT
|
5
|
+
* Full license can be found in the LICENSE file
|
6
|
+
*/
|
7
|
+
|
8
|
+
#ifndef cglm_mat3_sse_h
|
9
|
+
#define cglm_mat3_sse_h
|
10
|
+
#if defined( __SSE__ ) || defined( __SSE2__ )
|
11
|
+
|
12
|
+
#include "../../common.h"
|
13
|
+
#include "../intrin.h"
|
14
|
+
|
15
|
+
CGLM_INLINE
|
16
|
+
void
|
17
|
+
glm_mat3_mul_sse2(mat3 m1, mat3 m2, mat3 dest) {
|
18
|
+
__m128 l0, l1, l2;
|
19
|
+
__m128 r0, r1, r2;
|
20
|
+
__m128 x0, x1, x2;
|
21
|
+
|
22
|
+
l0 = _mm_loadu_ps(m1[0]);
|
23
|
+
l1 = _mm_loadu_ps(&m1[1][1]);
|
24
|
+
l2 = _mm_set1_ps(m1[2][2]);
|
25
|
+
|
26
|
+
r0 = _mm_loadu_ps(m2[0]);
|
27
|
+
r1 = _mm_loadu_ps(&m2[1][1]);
|
28
|
+
r2 = _mm_set1_ps(m2[2][2]);
|
29
|
+
|
30
|
+
x1 = glmm_shuff2(l0, l1, 1, 0, 3, 3, 0, 3, 2, 0);
|
31
|
+
x2 = glmm_shuff2(l1, l2, 0, 0, 3, 2, 0, 2, 1, 0);
|
32
|
+
|
33
|
+
x0 = _mm_add_ps(_mm_mul_ps(glmm_shuff1(l0, 0, 2, 1, 0),
|
34
|
+
glmm_shuff1(r0, 3, 0, 0, 0)),
|
35
|
+
_mm_mul_ps(x1, glmm_shuff2(r0, r1, 0, 0, 1, 1, 2, 0, 0, 0)));
|
36
|
+
|
37
|
+
x0 = _mm_add_ps(x0,
|
38
|
+
_mm_mul_ps(x2, glmm_shuff2(r0, r1, 1, 1, 2, 2, 2, 0, 0, 0)));
|
39
|
+
|
40
|
+
_mm_storeu_ps(dest[0], x0);
|
41
|
+
|
42
|
+
x0 = _mm_add_ps(_mm_mul_ps(glmm_shuff1(l0, 1, 0, 2, 1),
|
43
|
+
_mm_shuffle_ps(r0, r1, _MM_SHUFFLE(2, 2, 3, 3))),
|
44
|
+
_mm_mul_ps(glmm_shuff1(x1, 1, 0, 2, 1),
|
45
|
+
glmm_shuff1(r1, 3, 3, 0, 0)));
|
46
|
+
|
47
|
+
x0 = _mm_add_ps(x0,
|
48
|
+
_mm_mul_ps(glmm_shuff1(x2, 1, 0, 2, 1),
|
49
|
+
_mm_shuffle_ps(r1, r2, _MM_SHUFFLE(0, 0, 1, 1))));
|
50
|
+
|
51
|
+
_mm_storeu_ps(&dest[1][1], x0);
|
52
|
+
|
53
|
+
dest[2][2] = m1[0][2] * m2[2][0]
|
54
|
+
+ m1[1][2] * m2[2][1]
|
55
|
+
+ m1[2][2] * m2[2][2];
|
56
|
+
}
|
57
|
+
|
58
|
+
#endif
|
59
|
+
#endif /* cglm_mat3_sse_h */
|