cglm 0.1.0 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (93) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +2 -1
  3. data/Gemfile.lock +18 -16
  4. data/README.md +2 -0
  5. data/cglm.gemspec +3 -3
  6. data/ext/cglm/cglm-0.6.2/include/cglm/affine-mat.h +168 -0
  7. data/ext/cglm/cglm-0.6.2/include/cglm/affine.h +490 -0
  8. data/ext/cglm/cglm-0.6.2/include/cglm/applesimd.h +95 -0
  9. data/ext/cglm/cglm-0.6.2/include/cglm/bezier.h +154 -0
  10. data/ext/cglm/cglm-0.6.2/include/cglm/box.h +279 -0
  11. data/ext/cglm/cglm-0.6.2/include/cglm/call/affine.h +117 -0
  12. data/ext/cglm/cglm-0.6.2/include/cglm/call/bezier.h +31 -0
  13. data/ext/cglm/cglm-0.6.2/include/cglm/call/box.h +79 -0
  14. data/ext/cglm/cglm-0.6.2/include/cglm/call/cam.h +143 -0
  15. data/ext/cglm/cglm-0.6.2/include/cglm/call/curve.h +23 -0
  16. data/ext/cglm/cglm-0.6.2/include/cglm/call/ease.h +143 -0
  17. data/ext/cglm/cglm-0.6.2/include/cglm/call/euler.h +55 -0
  18. data/ext/cglm/cglm-0.6.2/include/cglm/call/frustum.h +41 -0
  19. data/ext/cglm/cglm-0.6.2/include/cglm/call/io.h +44 -0
  20. data/ext/cglm/cglm-0.6.2/include/cglm/call/mat3.h +86 -0
  21. data/ext/cglm/cglm-0.6.2/include/cglm/call/mat4.h +127 -0
  22. data/ext/cglm/cglm-0.6.2/include/cglm/call/plane.h +23 -0
  23. data/ext/cglm/cglm-0.6.2/include/cglm/call/project.h +33 -0
  24. data/ext/cglm/cglm-0.6.2/include/cglm/call/quat.h +159 -0
  25. data/ext/cglm/cglm-0.6.2/include/cglm/call/sphere.h +39 -0
  26. data/ext/cglm/cglm-0.6.2/include/cglm/call/vec3.h +312 -0
  27. data/ext/cglm/cglm-0.6.2/include/cglm/call/vec4.h +290 -0
  28. data/ext/cglm/cglm-0.6.2/include/cglm/call.h +36 -0
  29. data/ext/cglm/cglm-0.6.2/include/cglm/cam.h +585 -0
  30. data/ext/cglm/cglm-0.6.2/include/cglm/cglm.h +32 -0
  31. data/ext/cglm/cglm-0.6.2/include/cglm/color.h +26 -0
  32. data/ext/cglm/cglm-0.6.2/include/cglm/common.h +37 -0
  33. data/ext/cglm/cglm-0.6.2/include/cglm/curve.h +40 -0
  34. data/ext/cglm/cglm-0.6.2/include/cglm/ease.h +317 -0
  35. data/ext/cglm/cglm-0.6.2/include/cglm/euler.h +453 -0
  36. data/ext/cglm/cglm-0.6.2/include/cglm/frustum.h +255 -0
  37. data/ext/cglm/cglm-0.6.2/include/cglm/io.h +203 -0
  38. data/ext/cglm/cglm-0.6.2/include/cglm/mat3.h +422 -0
  39. data/ext/cglm/cglm-0.6.2/include/cglm/mat4.h +726 -0
  40. data/ext/cglm/cglm-0.6.2/include/cglm/plane.h +36 -0
  41. data/ext/cglm/cglm-0.6.2/include/cglm/project.h +118 -0
  42. data/ext/cglm/cglm-0.6.2/include/cglm/quat.h +828 -0
  43. data/ext/cglm/cglm-0.6.2/include/cglm/simd/arm.h +83 -0
  44. data/ext/cglm/cglm-0.6.2/include/cglm/simd/avx/affine.h +66 -0
  45. data/ext/cglm/cglm-0.6.2/include/cglm/simd/avx/mat4.h +66 -0
  46. data/ext/cglm/cglm-0.6.2/include/cglm/simd/intrin.h +90 -0
  47. data/ext/cglm/cglm-0.6.2/include/cglm/simd/neon/mat4.h +57 -0
  48. data/ext/cglm/cglm-0.6.2/include/cglm/simd/sse2/affine.h +111 -0
  49. data/ext/cglm/cglm-0.6.2/include/cglm/simd/sse2/mat3.h +59 -0
  50. data/ext/cglm/cglm-0.6.2/include/cglm/simd/sse2/mat4.h +405 -0
  51. data/ext/cglm/cglm-0.6.2/include/cglm/simd/sse2/quat.h +46 -0
  52. data/ext/cglm/cglm-0.6.2/include/cglm/simd/x86.h +192 -0
  53. data/ext/cglm/cglm-0.6.2/include/cglm/sphere.h +99 -0
  54. data/ext/cglm/cglm-0.6.2/include/cglm/struct/affine.h +337 -0
  55. data/ext/cglm/cglm-0.6.2/include/cglm/struct/box.h +256 -0
  56. data/ext/cglm/cglm-0.6.2/include/cglm/struct/cam.h +451 -0
  57. data/ext/cglm/cglm-0.6.2/include/cglm/struct/color.h +27 -0
  58. data/ext/cglm/cglm-0.6.2/include/cglm/struct/curve.h +40 -0
  59. data/ext/cglm/cglm-0.6.2/include/cglm/struct/euler.h +152 -0
  60. data/ext/cglm/cglm-0.6.2/include/cglm/struct/frustum.h +155 -0
  61. data/ext/cglm/cglm-0.6.2/include/cglm/struct/io.h +82 -0
  62. data/ext/cglm/cglm-0.6.2/include/cglm/struct/mat3.h +285 -0
  63. data/ext/cglm/cglm-0.6.2/include/cglm/struct/mat4.h +459 -0
  64. data/ext/cglm/cglm-0.6.2/include/cglm/struct/plane.h +40 -0
  65. data/ext/cglm/cglm-0.6.2/include/cglm/struct/project.h +104 -0
  66. data/ext/cglm/cglm-0.6.2/include/cglm/struct/quat.h +532 -0
  67. data/ext/cglm/cglm-0.6.2/include/cglm/struct/sphere.h +93 -0
  68. data/ext/cglm/cglm-0.6.2/include/cglm/struct/vec3-ext.h +257 -0
  69. data/ext/cglm/cglm-0.6.2/include/cglm/struct/vec3.h +970 -0
  70. data/ext/cglm/cglm-0.6.2/include/cglm/struct/vec4-ext.h +257 -0
  71. data/ext/cglm/cglm-0.6.2/include/cglm/struct/vec4.h +814 -0
  72. data/ext/cglm/cglm-0.6.2/include/cglm/struct.h +36 -0
  73. data/ext/cglm/cglm-0.6.2/include/cglm/types-struct.h +129 -0
  74. data/ext/cglm/cglm-0.6.2/include/cglm/types.h +76 -0
  75. data/ext/cglm/cglm-0.6.2/include/cglm/util.h +328 -0
  76. data/ext/cglm/cglm-0.6.2/include/cglm/vec3-ext.h +272 -0
  77. data/ext/cglm/cglm-0.6.2/include/cglm/vec3.h +1078 -0
  78. data/ext/cglm/cglm-0.6.2/include/cglm/vec4-ext.h +315 -0
  79. data/ext/cglm/cglm-0.6.2/include/cglm/vec4.h +1078 -0
  80. data/ext/cglm/cglm-0.6.2/include/cglm/version.h +15 -0
  81. data/ext/cglm/extconf.rb +2 -3
  82. data/ext/cglm/rb_cglm.h +5 -3
  83. data/ext/cglm/rb_cglm_mat3.c +3 -3
  84. data/ext/cglm/rb_cglm_mat4.c +3 -3
  85. data/ext/cglm/rb_cglm_quat.c +2 -2
  86. data/ext/cglm/rb_cglm_vec3.c +63 -61
  87. data/ext/cglm/rb_cglm_vec4.c +2 -0
  88. data/ext/cglm/ruby_pre27.h +35 -0
  89. data/lib/cglm/vec3.rb +2 -2
  90. data/lib/cglm/vec4.rb +2 -2
  91. data/lib/cglm/vector_type.rb +15 -0
  92. data/lib/cglm/version.rb +1 -1
  93. metadata +89 -13
@@ -0,0 +1,83 @@
1
+ /*
2
+ * Copyright (c), Recep Aslantas.
3
+ *
4
+ * MIT License (MIT), http://opensource.org/licenses/MIT
5
+ * Full license can be found in the LICENSE file
6
+ */
7
+
8
+ #ifndef cglm_simd_arm_h
9
+ #define cglm_simd_arm_h
10
+ #include "intrin.h"
11
+ #ifdef CGLM_SIMD_ARM
12
+
13
+ #define glmm_load(p) vld1q_f32(p)
14
+ #define glmm_store(p, a) vst1q_f32(p, a)
15
+
16
/* lane-wise absolute value of a 4-float NEON vector */
static inline
float32x4_t
glmm_abs(float32x4_t v) {
  return vabsq_f32(v);
}
21
+
22
/*!
 * @brief horizontal add: sum of all four lanes of v.
 *
 * AArch64 has a single-instruction reduction (vaddvq_f32); on 32-bit
 * ARM it is built from two shuffle+add steps, then lane 0 is extracted.
 */
static inline
float
glmm_hadd(float32x4_t v) {
#if defined(__aarch64__)
  return vaddvq_f32(v);
#else
  /* after adding the 64-bit-pair-reversed vector: [a+b, b+a, c+d, d+c] */
  v = vaddq_f32(v, vrev64q_f32(v));
  /* add swapped halves so every lane holds a+b+c+d */
  v = vaddq_f32(v, vcombine_f32(vget_high_f32(v), vget_low_f32(v)));
  return vgetq_lane_f32(v, 0);
#endif
}
33
+
34
/* horizontal minimum: smallest of the four lanes of v */
static inline
float
glmm_hmin(float32x4_t v) {
  /* pairwise-min the two halves, then min the surviving pair */
  float32x2_t folded = vpmin_f32(vget_low_f32(v), vget_high_f32(v));
  return vget_lane_f32(vpmin_f32(folded, folded), 0);
}
42
+
43
/* horizontal maximum: largest of the four lanes of v */
static inline
float
glmm_hmax(float32x4_t v) {
  /* pairwise-max the two halves, then max the surviving pair */
  float32x2_t folded = vpmax_f32(vget_low_f32(v), vget_high_f32(v));
  return vget_lane_f32(vpmax_f32(folded, folded), 0);
}
51
+
52
/* four-lane dot product of a and b */
static inline
float
glmm_dot(float32x4_t a, float32x4_t b) {
  float32x4_t prod = vmulq_f32(a, b);
  return glmm_hadd(prod);
}
57
+
58
/* euclidean (L2) norm: sqrt(a . a) */
static inline
float
glmm_norm(float32x4_t a) {
  float norm2 = glmm_dot(a, a);
  return sqrtf(norm2);
}
63
+
64
/* squared L2 norm: a . a (same as glmm_dot(a, a), no sqrt) */
static inline
float
glmm_norm2(float32x4_t a) {
  return glmm_hadd(vmulq_f32(a, a));
}
69
+
70
/* L1 norm: sum of the lane-wise absolute values */
static inline
float
glmm_norm_one(float32x4_t a) {
  float32x4_t mag = glmm_abs(a);
  return glmm_hadd(mag);
}
75
+
76
/* L-infinity norm: maximum lane-wise absolute value */
static inline
float
glmm_norm_inf(float32x4_t a) {
  float32x4_t mag = glmm_abs(a);
  return glmm_hmax(mag);
}
81
+
82
+ #endif
83
+ #endif /* cglm_simd_arm_h */
@@ -0,0 +1,66 @@
1
+ /*
2
+ * Copyright (c), Recep Aslantas.
3
+ *
4
+ * MIT License (MIT), http://opensource.org/licenses/MIT
5
+ * Full license can be found in the LICENSE file
6
+ */
7
+
8
+ #ifndef cglm_affine_mat_avx_h
9
+ #define cglm_affine_mat_avx_h
10
+ #ifdef __AVX__
11
+
12
+ #include "../../common.h"
13
+ #include "../intrin.h"
14
+
15
+ #include <immintrin.h>
16
+
17
/*!
 * @brief 4x4 matrix multiply with AVX: dest = m1 * m2 (column-major),
 *        holding two matrix columns per 256-bit register.
 *
 * All input registers are loaded before the first store, so dest may
 * alias m1 or m2.
 */
CGLM_INLINE
void
glm_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
  /* D = R * L (Column-Major) */

  __m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;

  y0 = glmm_load256(m2[0]); /* h g f e d c b a */
  y1 = glmm_load256(m2[2]); /* p o n m l k j i */

  y2 = glmm_load256(m1[0]); /* h g f e d c b a */
  y3 = glmm_load256(m1[2]); /* p o n m l k j i */

  /* 0x03: 0b00000011 — swap the 128-bit halves of m1's register pair */
  y4 = _mm256_permute2f128_ps(y2, y2, 0x03); /* d c b a h g f e */
  y5 = _mm256_permute2f128_ps(y3, y3, 0x03); /* l k j i p o n m */

  /* broadcast m2 column-0/1 elements across each 128-bit half: */
  /* f f f f a a a a */
  /* h h h h c c c c */
  /* e e e e b b b b */
  /* g g g g d d d d */
  y6 = _mm256_permutevar_ps(y0, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0));
  y7 = _mm256_permutevar_ps(y0, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2));
  y8 = _mm256_permutevar_ps(y0, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
  y9 = _mm256_permutevar_ps(y0, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));

  /* dest columns 0-1: m1 columns scaled by m2 columns 0-1 elements */
  glmm_store256(dest[0],
                _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
                                            _mm256_mul_ps(y3, y7)),
                              _mm256_add_ps(_mm256_mul_ps(y4, y8),
                                            _mm256_mul_ps(y5, y9))));

  /* same broadcast pattern for m2 columns 2-3: */
  /* n n n n i i i i */
  /* p p p p k k k k */
  /* m m m m j j j j */
  /* o o o o l l l l */
  y6 = _mm256_permutevar_ps(y1, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0));
  y7 = _mm256_permutevar_ps(y1, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2));
  y8 = _mm256_permutevar_ps(y1, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
  y9 = _mm256_permutevar_ps(y1, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));

  /* dest columns 2-3 */
  glmm_store256(dest[2],
                _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
                                            _mm256_mul_ps(y3, y7)),
                              _mm256_add_ps(_mm256_mul_ps(y4, y8),
                                            _mm256_mul_ps(y5, y9))));
}
64
+
65
+ #endif
66
+ #endif /* cglm_affine_mat_avx_h */
@@ -0,0 +1,66 @@
1
+ /*
2
+ * Copyright (c), Recep Aslantas.
3
+ *
4
+ * MIT License (MIT), http://opensource.org/licenses/MIT
5
+ * Full license can be found in the LICENSE file
6
+ */
7
+
8
+ #ifndef cglm_mat_simd_avx_h
9
+ #define cglm_mat_simd_avx_h
10
+ #ifdef __AVX__
11
+
12
+ #include "../../common.h"
13
+ #include "../intrin.h"
14
+
15
+ #include <immintrin.h>
16
+
17
/*!
 * @brief 4x4 matrix multiply with AVX: dest = m1 * m2 (column-major),
 *        holding two matrix columns per 256-bit register.
 *
 * All input registers are loaded before the first store, so dest may
 * alias m1 or m2.
 */
CGLM_INLINE
void
glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
  /* D = R * L (Column-Major) */

  __m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;

  y0 = glmm_load256(m2[0]); /* h g f e d c b a */
  y1 = glmm_load256(m2[2]); /* p o n m l k j i */

  y2 = glmm_load256(m1[0]); /* h g f e d c b a */
  y3 = glmm_load256(m1[2]); /* p o n m l k j i */

  /* 0x03: 0b00000011 — swap the 128-bit halves of m1's register pair */
  y4 = _mm256_permute2f128_ps(y2, y2, 0x03); /* d c b a h g f e */
  y5 = _mm256_permute2f128_ps(y3, y3, 0x03); /* l k j i p o n m */

  /* broadcast m2 column-0/1 elements across each 128-bit half: */
  /* f f f f a a a a */
  /* h h h h c c c c */
  /* e e e e b b b b */
  /* g g g g d d d d */
  y6 = _mm256_permutevar_ps(y0, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0));
  y7 = _mm256_permutevar_ps(y0, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2));
  y8 = _mm256_permutevar_ps(y0, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
  y9 = _mm256_permutevar_ps(y0, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));

  /* dest columns 0-1: m1 columns scaled by m2 columns 0-1 elements */
  glmm_store256(dest[0],
                _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
                                            _mm256_mul_ps(y3, y7)),
                              _mm256_add_ps(_mm256_mul_ps(y4, y8),
                                            _mm256_mul_ps(y5, y9))));

  /* same broadcast pattern for m2 columns 2-3: */
  /* n n n n i i i i */
  /* p p p p k k k k */
  /* m m m m j j j j */
  /* o o o o l l l l */
  y6 = _mm256_permutevar_ps(y1, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0));
  y7 = _mm256_permutevar_ps(y1, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2));
  y8 = _mm256_permutevar_ps(y1, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
  y9 = _mm256_permutevar_ps(y1, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));

  /* dest columns 2-3 */
  glmm_store256(dest[2],
                _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
                                            _mm256_mul_ps(y3, y7)),
                              _mm256_add_ps(_mm256_mul_ps(y4, y8),
                                            _mm256_mul_ps(y5, y9))));
}
64
+
65
+ #endif
66
+ #endif /* cglm_mat_simd_avx_h */
@@ -0,0 +1,90 @@
1
+ /*
2
+ * Copyright (c), Recep Aslantas.
3
+ *
4
+ * MIT License (MIT), http://opensource.org/licenses/MIT
5
+ * Full license can be found in the LICENSE file
6
+ */
7
+
8
+ #ifndef cglm_intrin_h
9
+ #define cglm_intrin_h
10
+
11
+ #if defined( _MSC_VER )
12
+ # if (defined(_M_AMD64) || defined(_M_X64)) || _M_IX86_FP == 2
13
+ # ifndef __SSE2__
14
+ # define __SSE2__
15
+ # endif
16
+ # elif _M_IX86_FP == 1
17
+ # ifndef __SSE__
18
+ # define __SSE__
19
+ # endif
20
+ # endif
21
+ /* do not use alignment for older visual studio versions */
22
+ # if _MSC_VER < 1913 /* Visual Studio 2017 version 15.6 */
23
+ # define CGLM_ALL_UNALIGNED
24
+ # endif
25
+ #endif
26
+
27
+ #if defined( __SSE__ ) || defined( __SSE2__ )
28
+ # include <xmmintrin.h>
29
+ # include <emmintrin.h>
30
+ # define CGLM_SSE_FP 1
31
+ # ifndef CGLM_SIMD_x86
32
+ # define CGLM_SIMD_x86
33
+ # endif
34
+ #endif
35
+
36
+ #if defined(__SSE3__)
37
+ # include <x86intrin.h>
38
+ # ifndef CGLM_SIMD_x86
39
+ # define CGLM_SIMD_x86
40
+ # endif
41
+ #endif
42
+
43
+ #if defined(__SSE4_1__)
44
+ # include <smmintrin.h>
45
+ # ifndef CGLM_SIMD_x86
46
+ # define CGLM_SIMD_x86
47
+ # endif
48
+ #endif
49
+
50
+ #if defined(__SSE4_2__)
51
+ # include <nmmintrin.h>
52
+ # ifndef CGLM_SIMD_x86
53
+ # define CGLM_SIMD_x86
54
+ # endif
55
+ #endif
56
+
57
+ #ifdef __AVX__
58
+ # include <immintrin.h>
59
+ # define CGLM_AVX_FP 1
60
+ # ifndef CGLM_SIMD_x86
61
+ # define CGLM_SIMD_x86
62
+ # endif
63
+ #endif
64
+
65
+ /* ARM Neon */
66
+ #if defined(__ARM_NEON)
67
+ # include <arm_neon.h>
68
+ # if defined(__ARM_NEON_FP)
69
+ # define CGLM_NEON_FP 1
70
+ # ifndef CGLM_SIMD_ARM
71
+ # define CGLM_SIMD_ARM
72
+ # endif
73
+ # endif
74
+ #endif
75
+
76
+ #if defined(CGLM_SIMD_x86) || defined(CGLM_NEON_FP)
77
+ # ifndef CGLM_SIMD
78
+ # define CGLM_SIMD
79
+ # endif
80
+ #endif
81
+
82
+ #if defined(CGLM_SIMD_x86)
83
+ # include "x86.h"
84
+ #endif
85
+
86
+ #if defined(CGLM_SIMD_ARM)
87
+ # include "arm.h"
88
+ #endif
89
+
90
+ #endif /* cglm_intrin_h */
@@ -0,0 +1,57 @@
1
+ /*
2
+ * Copyright (c), Recep Aslantas.
3
+ *
4
+ * MIT License (MIT), http://opensource.org/licenses/MIT
5
+ * Full license can be found in the LICENSE file
6
+ */
7
+
8
+ #ifndef cglm_mat4_neon_h
9
+ #define cglm_mat4_neon_h
10
+ #if defined(__ARM_NEON_FP)
11
+
12
+ #include "../../common.h"
13
+ #include "../intrin.h"
14
+
15
/*!
 * @brief 4x4 matrix multiply with NEON: dest = m1 * m2 (column-major).
 *
 * dest column j = sum over k of (m1 column k) * m2[j][k]; the scalar
 * factors are taken as lanes of the preloaded m2 columns via the
 * multiply/multiply-accumulate-by-lane intrinsics.  Every load happens
 * before the first store, so dest may alias m1 or m2.
 */
CGLM_INLINE
void
glm_mat4_mul_neon(mat4 m1, mat4 m2, mat4 dest) {
  /* D = R * L (Column-Major) */
  float32x4_t l0, l1, l2, l3, r, d0, d1, d2, d3;

  l0 = vld1q_f32(m2[0]);
  l1 = vld1q_f32(m2[1]);
  l2 = vld1q_f32(m2[2]);
  l3 = vld1q_f32(m2[3]);

  /* first terms: d_j = m1 column 0 * m2[j][0] */
  r  = vld1q_f32(m1[0]);
  d0 = vmulq_lane_f32(r, vget_low_f32(l0), 0);
  d1 = vmulq_lane_f32(r, vget_low_f32(l1), 0);
  d2 = vmulq_lane_f32(r, vget_low_f32(l2), 0);
  d3 = vmulq_lane_f32(r, vget_low_f32(l3), 0);

  /* accumulate m1 column 1 * m2[j][1] */
  r  = vld1q_f32(m1[1]);
  d0 = vmlaq_lane_f32(d0, r, vget_low_f32(l0), 1);
  d1 = vmlaq_lane_f32(d1, r, vget_low_f32(l1), 1);
  d2 = vmlaq_lane_f32(d2, r, vget_low_f32(l2), 1);
  d3 = vmlaq_lane_f32(d3, r, vget_low_f32(l3), 1);

  /* accumulate m1 column 2 * m2[j][2] */
  r  = vld1q_f32(m1[2]);
  d0 = vmlaq_lane_f32(d0, r, vget_high_f32(l0), 0);
  d1 = vmlaq_lane_f32(d1, r, vget_high_f32(l1), 0);
  d2 = vmlaq_lane_f32(d2, r, vget_high_f32(l2), 0);
  d3 = vmlaq_lane_f32(d3, r, vget_high_f32(l3), 0);

  /* accumulate m1 column 3 * m2[j][3] */
  r  = vld1q_f32(m1[3]);
  d0 = vmlaq_lane_f32(d0, r, vget_high_f32(l0), 1);
  d1 = vmlaq_lane_f32(d1, r, vget_high_f32(l1), 1);
  d2 = vmlaq_lane_f32(d2, r, vget_high_f32(l2), 1);
  d3 = vmlaq_lane_f32(d3, r, vget_high_f32(l3), 1);

  vst1q_f32(dest[0], d0);
  vst1q_f32(dest[1], d1);
  vst1q_f32(dest[2], d2);
  vst1q_f32(dest[3], d3);
}
55
+
56
+ #endif
57
+ #endif /* cglm_mat4_neon_h */
@@ -0,0 +1,111 @@
1
+ /*
2
+ * Copyright (c), Recep Aslantas.
3
+ *
4
+ * MIT License (MIT), http://opensource.org/licenses/MIT
5
+ * Full license can be found in the LICENSE file
6
+ */
7
+
8
+ #ifndef cglm_affine_mat_sse2_h
9
+ #define cglm_affine_mat_sse2_h
10
+ #if defined( __SSE__ ) || defined( __SSE2__ )
11
+
12
+ #include "../../common.h"
13
+ #include "../intrin.h"
14
+
15
/*!
 * @brief affine-specialized 4x4 multiply with SSE: dest = m1 * m2.
 *
 * Columns 0-2 of the product omit the l3 (w) term, i.e. m2's columns
 * 0-2 are assumed to have w == 0 (affine matrix, last row 0 0 0 1);
 * only the translation column (dest[3]) uses all four components.
 * m1 is fully preloaded and each m2 column is read before the matching
 * dest column is stored, so dest may alias either input.
 */
CGLM_INLINE
void
glm_mul_sse2(mat4 m1, mat4 m2, mat4 dest) {
  /* D = R * L (Column-Major) */
  __m128 l0, l1, l2, l3, r;

  l0 = glmm_load(m1[0]);
  l1 = glmm_load(m1[1]);
  l2 = glmm_load(m1[2]);
  l3 = glmm_load(m1[3]);

  /* dest[j] = l0*r[0] + l1*r[1] + l2*r[2]  (w term dropped) */
  r = glmm_load(m2[0]);
  glmm_store(dest[0],
             _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
                                   _mm_mul_ps(glmm_shuff1x(r, 1), l1)),
                        _mm_mul_ps(glmm_shuff1x(r, 2), l2)));

  r = glmm_load(m2[1]);
  glmm_store(dest[1],
             _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
                                   _mm_mul_ps(glmm_shuff1x(r, 1), l1)),
                        _mm_mul_ps(glmm_shuff1x(r, 2), l2)));

  r = glmm_load(m2[2]);
  glmm_store(dest[2],
             _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
                                   _mm_mul_ps(glmm_shuff1x(r, 1), l1)),
                        _mm_mul_ps(glmm_shuff1x(r, 2), l2)));

  /* translation column uses the full 4-term sum including l3 */
  r = glmm_load(m2[3]);
  glmm_store(dest[3],
             _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
                                   _mm_mul_ps(glmm_shuff1x(r, 1), l1)),
                        _mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 2), l2),
                                   _mm_mul_ps(glmm_shuff1x(r, 3), l3))));
}
51
+
52
/*!
 * @brief rotation-specialized 4x4 multiply with SSE: dest = m1 * m2.
 *
 * Like glm_mul_sse2 but m2 is treated as rotation-only: its columns
 * contribute no w term and m1's 4th (translation) column is copied to
 * dest unchanged.
 */
CGLM_INLINE
void
glm_mul_rot_sse2(mat4 m1, mat4 m2, mat4 dest) {
  /* D = R * L (Column-Major) */
  __m128 l0, l1, l2, l3, r;

  l0 = glmm_load(m1[0]);
  l1 = glmm_load(m1[1]);
  l2 = glmm_load(m1[2]);
  l3 = glmm_load(m1[3]);

  /* dest[j] = l0*r[0] + l1*r[1] + l2*r[2] for the rotation columns */
  r = glmm_load(m2[0]);
  glmm_store(dest[0],
             _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
                                   _mm_mul_ps(glmm_shuff1x(r, 1), l1)),
                        _mm_mul_ps(glmm_shuff1x(r, 2), l2)));

  r = glmm_load(m2[1]);
  glmm_store(dest[1],
             _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
                                   _mm_mul_ps(glmm_shuff1x(r, 1), l1)),
                        _mm_mul_ps(glmm_shuff1x(r, 2), l2)));

  r = glmm_load(m2[2]);
  glmm_store(dest[2],
             _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
                                   _mm_mul_ps(glmm_shuff1x(r, 1), l1)),
                        _mm_mul_ps(glmm_shuff1x(r, 2), l2)));

  /* m1's translation column passes through untouched */
  glmm_store(dest[3], l3);
}
83
+
84
/*!
 * @brief in-place inverse of a rotation + translation matrix with SSE.
 *
 * Transposes the upper-left 3x3 block (valid as an inverse only when
 * that block is orthonormal, e.g. a pure rotation) and replaces the
 * translation with -(R^T * t), writing w = 1 in the last column.
 * Assumes the input is affine (last row 0 0 0 1) — the x1 vector below
 * supplies that row for the transpose and the final +1 w term.
 */
CGLM_INLINE
void
glm_inv_tr_sse2(mat4 mat) {
  __m128 r0, r1, r2, r3, x0, x1;

  r0 = glmm_load(mat[0]);
  r1 = glmm_load(mat[1]);
  r2 = glmm_load(mat[2]);
  r3 = glmm_load(mat[3]);
  /* x1 = (0, 0, 0, 1) */
  x1 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);

  /* transpose the rotation part; x1 receives the old w components */
  _MM_TRANSPOSE4_PS(r0, r1, r2, x1);

  /* x0 = R^T * t, where t is the old translation (lanes 0-2 of r3) */
  x0 = _mm_add_ps(_mm_mul_ps(r0, glmm_shuff1(r3, 0, 0, 0, 0)),
                  _mm_mul_ps(r1, glmm_shuff1(r3, 1, 1, 1, 1)));
  x0 = _mm_add_ps(x0, _mm_mul_ps(r2, glmm_shuff1(r3, 2, 2, 2, 2)));
  /* negate every lane by flipping the sign bit */
  x0 = _mm_xor_ps(x0, _mm_set1_ps(-0.f));

  /* add (0, 0, 0, 1) so the w component of the new column is 1 */
  x0 = _mm_add_ps(x0, x1);

  glmm_store(mat[0], r0);
  glmm_store(mat[1], r1);
  glmm_store(mat[2], r2);
  glmm_store(mat[3], x0);
}
109
+
110
+ #endif
111
+ #endif /* cglm_affine_mat_sse2_h */
@@ -0,0 +1,59 @@
1
+ /*
2
+ * Copyright (c), Recep Aslantas.
3
+ *
4
+ * MIT License (MIT), http://opensource.org/licenses/MIT
5
+ * Full license can be found in the LICENSE file
6
+ */
7
+
8
+ #ifndef cglm_mat3_sse_h
9
+ #define cglm_mat3_sse_h
10
+ #if defined( __SSE__ ) || defined( __SSE2__ )
11
+
12
+ #include "../../common.h"
13
+ #include "../intrin.h"
14
+
15
+ CGLM_INLINE
16
+ void
17
+ glm_mat3_mul_sse2(mat3 m1, mat3 m2, mat3 dest) {
18
+ __m128 l0, l1, l2;
19
+ __m128 r0, r1, r2;
20
+ __m128 x0, x1, x2;
21
+
22
+ l0 = _mm_loadu_ps(m1[0]);
23
+ l1 = _mm_loadu_ps(&m1[1][1]);
24
+ l2 = _mm_set1_ps(m1[2][2]);
25
+
26
+ r0 = _mm_loadu_ps(m2[0]);
27
+ r1 = _mm_loadu_ps(&m2[1][1]);
28
+ r2 = _mm_set1_ps(m2[2][2]);
29
+
30
+ x1 = glmm_shuff2(l0, l1, 1, 0, 3, 3, 0, 3, 2, 0);
31
+ x2 = glmm_shuff2(l1, l2, 0, 0, 3, 2, 0, 2, 1, 0);
32
+
33
+ x0 = _mm_add_ps(_mm_mul_ps(glmm_shuff1(l0, 0, 2, 1, 0),
34
+ glmm_shuff1(r0, 3, 0, 0, 0)),
35
+ _mm_mul_ps(x1, glmm_shuff2(r0, r1, 0, 0, 1, 1, 2, 0, 0, 0)));
36
+
37
+ x0 = _mm_add_ps(x0,
38
+ _mm_mul_ps(x2, glmm_shuff2(r0, r1, 1, 1, 2, 2, 2, 0, 0, 0)));
39
+
40
+ _mm_storeu_ps(dest[0], x0);
41
+
42
+ x0 = _mm_add_ps(_mm_mul_ps(glmm_shuff1(l0, 1, 0, 2, 1),
43
+ _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(2, 2, 3, 3))),
44
+ _mm_mul_ps(glmm_shuff1(x1, 1, 0, 2, 1),
45
+ glmm_shuff1(r1, 3, 3, 0, 0)));
46
+
47
+ x0 = _mm_add_ps(x0,
48
+ _mm_mul_ps(glmm_shuff1(x2, 1, 0, 2, 1),
49
+ _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(0, 0, 1, 1))));
50
+
51
+ _mm_storeu_ps(&dest[1][1], x0);
52
+
53
+ dest[2][2] = m1[0][2] * m2[2][0]
54
+ + m1[1][2] * m2[2][1]
55
+ + m1[2][2] * m2[2][2];
56
+ }
57
+
58
+ #endif
59
+ #endif /* cglm_mat3_sse_h */