llama-cpp-capacitor 0.0.5 → 0.0.7

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. package/cpp/LICENSE +21 -0
  2. package/cpp/README.md +4 -0
  3. package/cpp/anyascii.c +22223 -0
  4. package/cpp/anyascii.h +42 -0
  5. package/cpp/chat-parser.cpp +393 -0
  6. package/cpp/chat-parser.h +120 -0
  7. package/cpp/chat.cpp +2315 -0
  8. package/cpp/chat.h +221 -0
  9. package/cpp/common.cpp +1619 -0
  10. package/cpp/common.h +744 -0
  11. package/cpp/ggml-alloc.c +1028 -0
  12. package/cpp/ggml-alloc.h +76 -0
  13. package/cpp/ggml-backend-impl.h +255 -0
  14. package/cpp/ggml-backend-reg.cpp +600 -0
  15. package/cpp/ggml-backend.cpp +2118 -0
  16. package/cpp/ggml-backend.h +354 -0
  17. package/cpp/ggml-common.h +1878 -0
  18. package/cpp/ggml-cpp.h +39 -0
  19. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  20. package/cpp/ggml-cpu/amx/amx.h +8 -0
  21. package/cpp/ggml-cpu/amx/common.h +91 -0
  22. package/cpp/ggml-cpu/amx/mmq.cpp +2512 -0
  23. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  24. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  25. package/cpp/ggml-cpu/arch/arm/quants.c +3650 -0
  26. package/cpp/ggml-cpu/arch/arm/repack.cpp +1891 -0
  27. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  28. package/cpp/ggml-cpu/arch/x86/quants.c +3820 -0
  29. package/cpp/ggml-cpu/arch/x86/repack.cpp +6307 -0
  30. package/cpp/ggml-cpu/arch-fallback.h +215 -0
  31. package/cpp/ggml-cpu/binary-ops.cpp +158 -0
  32. package/cpp/ggml-cpu/binary-ops.h +16 -0
  33. package/cpp/ggml-cpu/common.h +73 -0
  34. package/cpp/ggml-cpu/ggml-cpu-impl.h +525 -0
  35. package/cpp/ggml-cpu/ggml-cpu.c +3578 -0
  36. package/cpp/ggml-cpu/ggml-cpu.cpp +672 -0
  37. package/cpp/ggml-cpu/ops.cpp +10587 -0
  38. package/cpp/ggml-cpu/ops.h +114 -0
  39. package/cpp/ggml-cpu/quants.c +1193 -0
  40. package/cpp/ggml-cpu/quants.h +97 -0
  41. package/cpp/ggml-cpu/repack.cpp +1982 -0
  42. package/cpp/ggml-cpu/repack.h +120 -0
  43. package/cpp/ggml-cpu/simd-mappings.h +1184 -0
  44. package/cpp/ggml-cpu/traits.cpp +36 -0
  45. package/cpp/ggml-cpu/traits.h +38 -0
  46. package/cpp/ggml-cpu/unary-ops.cpp +186 -0
  47. package/cpp/ggml-cpu/unary-ops.h +28 -0
  48. package/cpp/ggml-cpu/vec.cpp +348 -0
  49. package/cpp/ggml-cpu/vec.h +1121 -0
  50. package/cpp/ggml-cpu.h +145 -0
  51. package/cpp/ggml-impl.h +622 -0
  52. package/cpp/ggml-metal-impl.h +688 -0
  53. package/cpp/ggml-metal.h +66 -0
  54. package/cpp/ggml-metal.m +6833 -0
  55. package/cpp/ggml-opt.cpp +1093 -0
  56. package/cpp/ggml-opt.h +256 -0
  57. package/cpp/ggml-quants.c +5324 -0
  58. package/cpp/ggml-quants.h +106 -0
  59. package/cpp/ggml-threading.cpp +12 -0
  60. package/cpp/ggml-threading.h +14 -0
  61. package/cpp/ggml.c +7108 -0
  62. package/cpp/ggml.h +2492 -0
  63. package/cpp/gguf.cpp +1358 -0
  64. package/cpp/gguf.h +202 -0
  65. package/cpp/json-partial.cpp +256 -0
  66. package/cpp/json-partial.h +38 -0
  67. package/cpp/json-schema-to-grammar.cpp +985 -0
  68. package/cpp/json-schema-to-grammar.h +21 -0
  69. package/cpp/llama-adapter.cpp +388 -0
  70. package/cpp/llama-adapter.h +76 -0
  71. package/cpp/llama-arch.cpp +2355 -0
  72. package/cpp/llama-arch.h +499 -0
  73. package/cpp/llama-batch.cpp +875 -0
  74. package/cpp/llama-batch.h +160 -0
  75. package/cpp/llama-chat.cpp +783 -0
  76. package/cpp/llama-chat.h +65 -0
  77. package/cpp/llama-context.cpp +2748 -0
  78. package/cpp/llama-context.h +306 -0
  79. package/cpp/llama-cparams.cpp +5 -0
  80. package/cpp/llama-cparams.h +41 -0
  81. package/cpp/llama-cpp.h +30 -0
  82. package/cpp/llama-grammar.cpp +1229 -0
  83. package/cpp/llama-grammar.h +173 -0
  84. package/cpp/llama-graph.cpp +1891 -0
  85. package/cpp/llama-graph.h +810 -0
  86. package/cpp/llama-hparams.cpp +180 -0
  87. package/cpp/llama-hparams.h +233 -0
  88. package/cpp/llama-impl.cpp +167 -0
  89. package/cpp/llama-impl.h +61 -0
  90. package/cpp/llama-io.cpp +15 -0
  91. package/cpp/llama-io.h +35 -0
  92. package/cpp/llama-kv-cache-iswa.cpp +318 -0
  93. package/cpp/llama-kv-cache-iswa.h +135 -0
  94. package/cpp/llama-kv-cache.cpp +2059 -0
  95. package/cpp/llama-kv-cache.h +374 -0
  96. package/cpp/llama-kv-cells.h +491 -0
  97. package/cpp/llama-memory-hybrid.cpp +258 -0
  98. package/cpp/llama-memory-hybrid.h +137 -0
  99. package/cpp/llama-memory-recurrent.cpp +1146 -0
  100. package/cpp/llama-memory-recurrent.h +179 -0
  101. package/cpp/llama-memory.cpp +59 -0
  102. package/cpp/llama-memory.h +119 -0
  103. package/cpp/llama-mmap.cpp +600 -0
  104. package/cpp/llama-mmap.h +68 -0
  105. package/cpp/llama-model-loader.cpp +1164 -0
  106. package/cpp/llama-model-loader.h +170 -0
  107. package/cpp/llama-model-saver.cpp +282 -0
  108. package/cpp/llama-model-saver.h +37 -0
  109. package/cpp/llama-model.cpp +19042 -0
  110. package/cpp/llama-model.h +491 -0
  111. package/cpp/llama-sampling.cpp +2575 -0
  112. package/cpp/llama-sampling.h +32 -0
  113. package/cpp/llama-vocab.cpp +3792 -0
  114. package/cpp/llama-vocab.h +176 -0
  115. package/cpp/llama.cpp +358 -0
  116. package/cpp/llama.h +1373 -0
  117. package/cpp/log.cpp +427 -0
  118. package/cpp/log.h +103 -0
  119. package/cpp/minja/chat-template.hpp +550 -0
  120. package/cpp/minja/minja.hpp +3009 -0
  121. package/cpp/nlohmann/json.hpp +25526 -0
  122. package/cpp/nlohmann/json_fwd.hpp +187 -0
  123. package/cpp/regex-partial.cpp +204 -0
  124. package/cpp/regex-partial.h +56 -0
  125. package/cpp/rn-completion.cpp +681 -0
  126. package/cpp/rn-completion.h +116 -0
  127. package/cpp/rn-llama.cpp +345 -0
  128. package/cpp/rn-llama.h +149 -0
  129. package/cpp/rn-mtmd.hpp +602 -0
  130. package/cpp/rn-tts.cpp +591 -0
  131. package/cpp/rn-tts.h +59 -0
  132. package/cpp/sampling.cpp +579 -0
  133. package/cpp/sampling.h +107 -0
  134. package/cpp/tools/mtmd/clip-impl.h +473 -0
  135. package/cpp/tools/mtmd/clip.cpp +4322 -0
  136. package/cpp/tools/mtmd/clip.h +106 -0
  137. package/cpp/tools/mtmd/miniaudio/miniaudio.h +93468 -0
  138. package/cpp/tools/mtmd/mtmd-audio.cpp +769 -0
  139. package/cpp/tools/mtmd/mtmd-audio.h +47 -0
  140. package/cpp/tools/mtmd/mtmd-helper.cpp +460 -0
  141. package/cpp/tools/mtmd/mtmd-helper.h +91 -0
  142. package/cpp/tools/mtmd/mtmd.cpp +1066 -0
  143. package/cpp/tools/mtmd/mtmd.h +298 -0
  144. package/cpp/tools/mtmd/stb/stb_image.h +7988 -0
  145. package/cpp/unicode-data.cpp +7034 -0
  146. package/cpp/unicode-data.h +20 -0
  147. package/cpp/unicode.cpp +1061 -0
  148. package/cpp/unicode.h +68 -0
  149. package/package.json +2 -1
package/cpp/ggml-cpu/vec.h (new file)
@@ -0,0 +1,1121 @@
+ // Vectorized functions for fundamental operations
+
+ #pragma once
+
+ #include "ggml-impl.h"
+ #include "simd-mappings.h"
+ #include "ggml.h"
+ #include "ggml-cpu.h"
+
+ #if defined(LM_GGML_USE_ACCELERATE)
+ #include <Accelerate/Accelerate.h>
+ #endif
+
+ // floating point type used to accumulate sums
+ typedef double lm_ggml_float;
+
+ #define LM_GGML_GELU_FP16
+ #define LM_GGML_GELU_QUICK_FP16
+
+ #define LM_GGML_SOFT_MAX_UNROLL 4
+ #define LM_GGML_VEC_DOT_UNROLL 2
+ #define LM_GGML_VEC_MAD_UNROLL 32
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ //
+ // global data
+ //
+
+ // precomputed gelu table for f16 (128 KB)
+ extern lm_ggml_fp16_t lm_ggml_table_gelu_f16[1 << 16];
+
+ // precomputed quick gelu table for f16 (128 KB)
+ extern lm_ggml_fp16_t lm_ggml_table_gelu_quick_f16[1 << 16];
+
+ //
+ // fundamental operations
+ //
+
+ void lm_ggml_vec_dot_f32(int n, float * LM_GGML_RESTRICT s, size_t bs, const float * LM_GGML_RESTRICT x, size_t bx, const float * LM_GGML_RESTRICT y, size_t by, int nrc);
+ void lm_ggml_vec_dot_bf16(int n, float * LM_GGML_RESTRICT s, size_t bs, lm_ggml_bf16_t * LM_GGML_RESTRICT x, size_t bx, lm_ggml_bf16_t * LM_GGML_RESTRICT y, size_t by, int nrc);
+ void lm_ggml_vec_dot_f16(int n, float * LM_GGML_RESTRICT s, size_t bs, lm_ggml_fp16_t * LM_GGML_RESTRICT x, size_t bx, lm_ggml_fp16_t * LM_GGML_RESTRICT y, size_t by, int nrc);
+
+ void lm_ggml_vec_silu_f32(const int n, float * y, const float * x);
+ lm_ggml_float lm_ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
+ lm_ggml_float lm_ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
+
+ inline static void lm_ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+ inline static void lm_ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+
+ inline static void lm_ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+ inline static void lm_ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
+
+ inline static void lm_ggml_vec_set_f16(const int n, lm_ggml_fp16_t * x, const lm_ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+ inline static void lm_ggml_vec_set_bf16(const int n, lm_ggml_bf16_t * x, const lm_ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+
+ inline static void lm_ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
+     int i = 0;
+ #if defined(__AVX2__)
+     for (; i + 7 < n; i += 8) {
+         __m256 vx = _mm256_loadu_ps(x + i);
+         __m256 vy = _mm256_loadu_ps(y + i);
+         __m256 vz = _mm256_add_ps(vx, vy);
+         _mm256_storeu_ps(z + i, vz);
+     }
+ #endif
+     for (; i < n; ++i) {
+         z[i] = x[i] + y[i];
+     }
+ }
+
+ inline static void lm_ggml_vec_add_f16 (const int n, lm_ggml_fp16_t * z, const lm_ggml_fp16_t * x, const lm_ggml_fp16_t * y) {
+     for (int i = 0; i < n; ++i) {
+         z[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(x[i]) + LM_GGML_CPU_FP16_TO_FP32(y[i]));
+     }
+ }
+ inline static void lm_ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
+ inline static void lm_ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
+ inline static void lm_ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
+ inline static void lm_ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
+ inline static void lm_ggml_vec_sub_f16 (const int n, lm_ggml_fp16_t * z, const lm_ggml_fp16_t * x, const lm_ggml_fp16_t * y) {
+     for (int i = 0; i < n; ++i) {
+         z[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(x[i]) - LM_GGML_CPU_FP16_TO_FP32(y[i]));
+     }
+ }
+ inline static void lm_ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
+ inline static void lm_ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
+ inline static void lm_ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
+ inline static void lm_ggml_vec_neg_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
+     for (int i = 0; i < n; ++i) {
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(-LM_GGML_CPU_FP16_TO_FP32(x[i]));
+     }
+ }
+
+ inline static void lm_ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
+ inline static void lm_ggml_vec_mul_f16 (const int n, lm_ggml_fp16_t * z, const lm_ggml_fp16_t * x, const lm_ggml_fp16_t * y) {
+     for (int i = 0; i < n; ++i) {
+         z[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(x[i]) * LM_GGML_CPU_FP16_TO_FP32(y[i]));
+     }
+ }
+ inline static void lm_ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
+ inline static void lm_ggml_vec_div_f16 (const int n, lm_ggml_fp16_t * z, const lm_ggml_fp16_t * x, const lm_ggml_fp16_t * y) {
+     for (int i = 0; i < n; ++i) {
+         z[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(x[i]) / LM_GGML_CPU_FP16_TO_FP32(y[i]));
+     }
+ }
+
+ // compute LM_GGML_VEC_DOT_UNROLL dot products at once
+ // xs - x row stride in bytes
+ inline static void lm_ggml_vec_dot_f16_unroll(const int n, const int xs, float * LM_GGML_RESTRICT s, void * LM_GGML_RESTRICT xv, lm_ggml_fp16_t * LM_GGML_RESTRICT y) {
+     lm_ggml_float sumf[LM_GGML_VEC_DOT_UNROLL] = { 0.0 };
+
+     lm_ggml_fp16_t * LM_GGML_RESTRICT x[LM_GGML_VEC_DOT_UNROLL];
+
+     for (int i = 0; i < LM_GGML_VEC_DOT_UNROLL; ++i) {
+         x[i] = (lm_ggml_fp16_t *) ((char *) xv + i*xs);
+     }
+
+ #if defined(LM_GGML_SIMD)
+     const int np = (n & ~(LM_GGML_F16_STEP - 1));
+
+     LM_GGML_F16_VEC sum[LM_GGML_VEC_DOT_UNROLL][LM_GGML_F16_ARR] = { { LM_GGML_F16_VEC_ZERO } };
+
+     LM_GGML_F16_VEC ax[LM_GGML_F16_ARR];
+     LM_GGML_F16_VEC ay[LM_GGML_F16_ARR];
+
+     for (int i = 0; i < np; i += LM_GGML_F16_STEP) {
+         for (int j = 0; j < LM_GGML_F16_ARR; j++) {
+             ay[j] = LM_GGML_F16_VEC_LOAD(y + i + j*LM_GGML_F16_EPR, j);
+
+             for (int k = 0; k < LM_GGML_VEC_DOT_UNROLL; ++k) {
+                 ax[j] = LM_GGML_F16_VEC_LOAD(x[k] + i + j*LM_GGML_F16_EPR, j);
+
+                 sum[k][j] = LM_GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
+             }
+         }
+     }
+
+     // reduce sum0..sum3 to sum0
+     for (int k = 0; k < LM_GGML_VEC_DOT_UNROLL; ++k) {
+         LM_GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
+     }
+
+     // leftovers
+     for (int i = np; i < n; ++i) {
+         for (int j = 0; j < LM_GGML_VEC_DOT_UNROLL; ++j) {
+             sumf[j] += (lm_ggml_float)(LM_GGML_CPU_FP16_TO_FP32(x[j][i])*LM_GGML_CPU_FP16_TO_FP32(y[i]));
+         }
+     }
+ #else
+     for (int i = 0; i < n; ++i) {
+         for (int j = 0; j < LM_GGML_VEC_DOT_UNROLL; ++j) {
+             sumf[j] += (lm_ggml_float)(LM_GGML_CPU_FP16_TO_FP32(x[j][i])*LM_GGML_CPU_FP16_TO_FP32(y[i]));
+         }
+     }
+ #endif
+
+     for (int i = 0; i < LM_GGML_VEC_DOT_UNROLL; ++i) {
+         s[i] = (float)sumf[i];
+     }
+ }
+
+ inline static void lm_ggml_vec_mad_f32(const int n, float * LM_GGML_RESTRICT y, const float * LM_GGML_RESTRICT x, const float v) {
+ #if defined(LM_GGML_SIMD)
+ #if defined(__ARM_FEATURE_SVE)
+
+     const int sve_register_length = lm_ggml_cpu_get_sve_cnt() * 8;
+     const int lm_ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
+     const int lm_ggml_f32_step = 8 * lm_ggml_f32_epr; // choose 8 SVE registers
+     LM_GGML_F32_VEC vx = LM_GGML_F32_VEC_SET1(v);
+
+     const int np = (n & ~(lm_ggml_f32_step - 1));
+     svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+     svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+     for (int i = 0; i < np; i += lm_ggml_f32_step) {
+
+         ax1 = LM_GGML_F32_VEC_LOAD(x + i);
+         ay1 = LM_GGML_F32_VEC_LOAD(y + i);
+         ay1 = LM_GGML_F32_VEC_FMA(ay1, ax1, vx);
+
+         LM_GGML_F32_VEC_STORE(y + i, ay1);
+
+         ax2 = LM_GGML_F32_VEC_LOAD(x + i + 1*lm_ggml_f32_epr);
+         ay2 = LM_GGML_F32_VEC_LOAD(y + i + 1*lm_ggml_f32_epr);
+         ay2 = LM_GGML_F32_VEC_FMA(ay2, ax2, vx);
+
+         LM_GGML_F32_VEC_STORE(y + i + 1*lm_ggml_f32_epr, ay2);
+
+         ax3 = LM_GGML_F32_VEC_LOAD(x + i + 2*lm_ggml_f32_epr);
+         ay3 = LM_GGML_F32_VEC_LOAD(y + i + 2*lm_ggml_f32_epr);
+         ay3 = LM_GGML_F32_VEC_FMA(ay3, ax3, vx);
+
+         LM_GGML_F32_VEC_STORE(y + i + 2*lm_ggml_f32_epr, ay3);
+
+         ax4 = LM_GGML_F32_VEC_LOAD(x + i + 3*lm_ggml_f32_epr);
+         ay4 = LM_GGML_F32_VEC_LOAD(y + i + 3*lm_ggml_f32_epr);
+         ay4 = LM_GGML_F32_VEC_FMA(ay4, ax4, vx);
+
+         LM_GGML_F32_VEC_STORE(y + i + 3*lm_ggml_f32_epr, ay4);
+
+         ax5 = LM_GGML_F32_VEC_LOAD(x + i + 4*lm_ggml_f32_epr);
+         ay5 = LM_GGML_F32_VEC_LOAD(y + i + 4*lm_ggml_f32_epr);
+         ay5 = LM_GGML_F32_VEC_FMA(ay5, ax5, vx);
+
+         LM_GGML_F32_VEC_STORE(y + i + 4*lm_ggml_f32_epr, ay5);
+
+         ax6 = LM_GGML_F32_VEC_LOAD(x + i + 5*lm_ggml_f32_epr);
+         ay6 = LM_GGML_F32_VEC_LOAD(y + i + 5*lm_ggml_f32_epr);
+         ay6 = LM_GGML_F32_VEC_FMA(ay6, ax6, vx);
+
+         LM_GGML_F32_VEC_STORE(y + i + 5*lm_ggml_f32_epr, ay6);
+
+         ax7 = LM_GGML_F32_VEC_LOAD(x + i + 6*lm_ggml_f32_epr);
+         ay7 = LM_GGML_F32_VEC_LOAD(y + i + 6*lm_ggml_f32_epr);
+         ay7 = LM_GGML_F32_VEC_FMA(ay7, ax7, vx);
+
+         LM_GGML_F32_VEC_STORE(y + i + 6*lm_ggml_f32_epr, ay7);
+
+         ax8 = LM_GGML_F32_VEC_LOAD(x + i + 7*lm_ggml_f32_epr);
+         ay8 = LM_GGML_F32_VEC_LOAD(y + i + 7*lm_ggml_f32_epr);
+         ay8 = LM_GGML_F32_VEC_FMA(ay8, ax8, vx);
+
+         LM_GGML_F32_VEC_STORE(y + i + 7*lm_ggml_f32_epr, ay8);
+     }
+     // leftovers
+     // Since 8 unrolls are done in above loop, leftovers lie in range [0, lm_ggml_f32_step] which is handled in below loop
+     const int np2 = (n & ~(lm_ggml_f32_epr - 1));
+     for (int i = np; i < np2; i += lm_ggml_f32_epr) {
+         ax1 = LM_GGML_F32_VEC_LOAD(x + i);
+         ay1 = LM_GGML_F32_VEC_LOAD(y + i);
+         ay1 = LM_GGML_F32_VEC_FMA(ay1, ax1, vx);
+
+         LM_GGML_F32_VEC_STORE(y + i, ay1);
+     }
+     // maximum number of leftover elements will be less than lm_ggml_f32_epr. Apply predicated svmad on available elements only
+     if (np2 < n) {
+         svbool_t pg = svwhilelt_b32(np2, n);
+         ax1 = svld1_f32(pg, x + np2);
+         ay1 = svld1_f32(pg, y + np2);
+         ay1 = svmad_f32_m(pg, ax1, vx, ay1);
+
+         svst1_f32(pg, y + np2, ay1);
+     }
+ #else
+     const int np = (n & ~(LM_GGML_F32_STEP - 1));
+
+     LM_GGML_F32_VEC vx = LM_GGML_F32_VEC_SET1(v);
+
+     LM_GGML_F32_VEC ax[LM_GGML_F32_ARR];
+     LM_GGML_F32_VEC ay[LM_GGML_F32_ARR];
+
+     for (int i = 0; i < np; i += LM_GGML_F32_STEP) {
+         for (int j = 0; j < LM_GGML_F32_ARR; j++) {
+             ax[j] = LM_GGML_F32_VEC_LOAD(x + i + j*LM_GGML_F32_EPR);
+             ay[j] = LM_GGML_F32_VEC_LOAD(y + i + j*LM_GGML_F32_EPR);
+             ay[j] = LM_GGML_F32_VEC_FMA(ay[j], ax[j], vx);
+
+             LM_GGML_F32_VEC_STORE(y + i + j*LM_GGML_F32_EPR, ay[j]);
+         }
+     }
+
+     // leftovers
+     for (int i = np; i < n; ++i) {
+         y[i] += x[i]*v;
+     }
+ #endif
+ #else
+     // scalar
+     for (int i = 0; i < n; ++i) {
+         y[i] += x[i]*v;
+     }
+ #endif
+ }
+
+ inline static void lm_ggml_vec_mad_f16(const int n, lm_ggml_fp16_t * LM_GGML_RESTRICT y, const lm_ggml_fp16_t * LM_GGML_RESTRICT x, const float v) {
+ #if defined(LM_GGML_SIMD)
+     const int np = (n & ~(LM_GGML_F16_STEP - 1));
+
+     LM_GGML_F16_VEC vx = LM_GGML_F16_VEC_SET1(v);
+
+     LM_GGML_F16_VEC ax[LM_GGML_F16_ARR];
+     LM_GGML_F16_VEC ay[LM_GGML_F16_ARR];
+
+     for (int i = 0; i < np; i += LM_GGML_F16_STEP) {
+         for (int j = 0; j < LM_GGML_F16_ARR; j++) {
+             ax[j] = LM_GGML_F16_VEC_LOAD(x + i + j*LM_GGML_F16_EPR, j);
+             ay[j] = LM_GGML_F16_VEC_LOAD(y + i + j*LM_GGML_F16_EPR, j);
+             ay[j] = LM_GGML_F16_VEC_FMA(ay[j], ax[j], vx);
+
+             LM_GGML_F16_VEC_STORE(y + i + j*LM_GGML_F16_EPR, ay, j);
+         }
+     }
+
+     // leftovers
+     for (int i = np; i < n; ++i) {
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(y[i]) + LM_GGML_CPU_FP16_TO_FP32(x[i])*v);
+     }
+ #else
+     // scalar
+     for (int i = 0; i < n; ++i) {
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(y[i]) + LM_GGML_CPU_FP16_TO_FP32(x[i])*v);
+     }
+ #endif
+ }
+
+ // xs and vs are byte strides of x and v
+ inline static void lm_ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * LM_GGML_RESTRICT y, const float * LM_GGML_RESTRICT xv, const float * LM_GGML_RESTRICT vv) {
+
+     const float * LM_GGML_RESTRICT x[LM_GGML_VEC_MAD_UNROLL];
+     const float * LM_GGML_RESTRICT v[LM_GGML_VEC_MAD_UNROLL];
+
+     for (int i = 0; i < LM_GGML_VEC_MAD_UNROLL; ++i) {
+         x[i] = (const float *) ((const char *) xv + i*xs);
+         v[i] = (const float *) ((const char *) vv + i*vs);
+     }
+
+ #if defined(LM_GGML_SIMD)
+ #if defined(__ARM_FEATURE_SVE)
+     // route to scalar implementation; TODO: write SVE code
+     for (int k = 0; k < LM_GGML_VEC_MAD_UNROLL; ++k) {
+         for (int i = 0; i < n; ++i) {
+             y[i] += x[k][i]*v[k][0];
+         }
+     }
+ #else
+     const int np = (n & ~(LM_GGML_F32_STEP - 1));
+
+     LM_GGML_F32_VEC vx[LM_GGML_VEC_MAD_UNROLL];
+
+     for (int k = 0; k < LM_GGML_VEC_MAD_UNROLL; ++k) {
+         vx[k] = LM_GGML_F32_VEC_SET1(v[k][0]);
+     }
+
+     LM_GGML_F32_VEC ax[LM_GGML_VEC_MAD_UNROLL][LM_GGML_F32_ARR];
+     LM_GGML_F32_VEC ay[LM_GGML_F32_ARR];
+
+     for (int i = 0; i < np; i += LM_GGML_F32_STEP) {
+         for (int j = 0; j < LM_GGML_F32_ARR; j++) {
+             ay[j] = LM_GGML_F32_VEC_LOAD(y + i + j*LM_GGML_F32_EPR);
+
+             for (int k = 0; k < LM_GGML_VEC_MAD_UNROLL; ++k) {
+                 ax[k][j] = LM_GGML_F32_VEC_LOAD(x[k] + i + j*LM_GGML_F32_EPR);
+                 ay[j] = LM_GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
+             }
+
+             LM_GGML_F32_VEC_STORE(y + i + j*LM_GGML_F32_EPR, ay[j]);
+         }
+     }
+
+     // leftovers
+     for (int k = 0; k < LM_GGML_VEC_MAD_UNROLL; ++k) {
+         for (int i = np; i < n; ++i) {
+             y[i] += x[k][i]*v[k][0];
+         }
+     }
+ #endif
+ #else
+     // scalar
+     for (int k = 0; k < LM_GGML_VEC_MAD_UNROLL; ++k) {
+         for (int i = 0; i < n; ++i) {
+             y[i] += x[k][i]*v[k][0];
+         }
+     }
+ #endif
+ }
+
+ inline static void lm_ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
+ #if defined(LM_GGML_USE_ACCELERATE)
+     vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
+ #elif defined(LM_GGML_SIMD)
+ #if defined(__ARM_FEATURE_SVE)
+     // scalar; TODO: write SVE code
+     for (int i = 0; i < n; ++i) {
+         y[i] = x[i]*s + b;
+     }
+ #else
+     const int np = (n & ~(LM_GGML_F32_STEP - 1));
+
+     LM_GGML_F32_VEC vs = LM_GGML_F32_VEC_SET1(s);
+     LM_GGML_F32_VEC vb = LM_GGML_F32_VEC_SET1(b);
+
+     LM_GGML_F32_VEC ay[LM_GGML_F32_ARR];
+
+     for (int i = 0; i < np; i += LM_GGML_F32_STEP) {
+         for (int j = 0; j < LM_GGML_F32_ARR; j++) {
+             ay[j] = LM_GGML_F32_VEC_LOAD(x + i + j*LM_GGML_F32_EPR);
+             ay[j] = LM_GGML_F32_VEC_FMA(ay[j], vs, vb);
+
+             LM_GGML_F32_VEC_STORE(y + i + j*LM_GGML_F32_EPR, ay[j]);
+         }
+     }
+
+     // leftovers
+     for (int i = np; i < n; ++i) {
+         y[i] = x[i]*s + b;
+     }
+ #endif
+ #else
+     // scalar
+     for (int i = 0; i < n; ++i) {
+         y[i] = x[i]*s + b;
+     }
+ #endif
+ }
+
+ //inline static void lm_ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
+ inline static void lm_ggml_vec_scale_f32(const int n, float * y, const float v) {
+ #if defined(LM_GGML_USE_ACCELERATE)
+     vDSP_vsmul(y, 1, &v, y, 1, n);
+ #elif defined(LM_GGML_SIMD)
+ #if defined(__ARM_FEATURE_SVE)
+     const int sve_register_length = lm_ggml_cpu_get_sve_cnt() * 8;
+     const int lm_ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
+     const int lm_ggml_f32_step = 2 * lm_ggml_f32_epr;
+
+     LM_GGML_F32_VEC vx = LM_GGML_F32_VEC_SET1(v);
+     const int np = (n & ~(lm_ggml_f32_step - 1));
+     svfloat32_t ay1;
+     svfloat32_t ay2;
+     for (int i = 0; i < np; i += lm_ggml_f32_step) {
+         ay1 = LM_GGML_F32_VEC_LOAD(y + i);
+         ay1 = LM_GGML_F32_VEC_MUL(ay1, vx);
+         LM_GGML_F32_VEC_STORE(y + i, ay1);
+
+         ay2 = LM_GGML_F32_VEC_LOAD(y + i + 1*lm_ggml_f32_epr);
+         ay2 = LM_GGML_F32_VEC_MUL(ay2, vx);
+         LM_GGML_F32_VEC_STORE(y + i + 1*lm_ggml_f32_epr, ay2);
+     }
+     // leftovers
+     // maximum number of leftover elements will be less than lm_ggml_f32_epr. Apply predicated svmul on available elements only
+     if (np < n) {
+         svbool_t pg = svwhilelt_b32(np, n);
+         ay1 = svld1_f32(pg, y + np);
+         ay1 = svmul_f32_m(pg, ay1, vx);
+         svst1_f32(pg, y + np, ay1);
+     }
+ #else
+     const int np = (n & ~(LM_GGML_F32_STEP - 1));
+
+     LM_GGML_F32_VEC vx = LM_GGML_F32_VEC_SET1(v);
+
+     LM_GGML_F32_VEC ay[LM_GGML_F32_ARR];
+
+     for (int i = 0; i < np; i += LM_GGML_F32_STEP) {
+         for (int j = 0; j < LM_GGML_F32_ARR; j++) {
+             ay[j] = LM_GGML_F32_VEC_LOAD(y + i + j*LM_GGML_F32_EPR);
+             ay[j] = LM_GGML_F32_VEC_MUL(ay[j], vx);
+
+             LM_GGML_F32_VEC_STORE(y + i + j*LM_GGML_F32_EPR, ay[j]);
+         }
+     }
+
+     // leftovers
+     for (int i = np; i < n; ++i) {
+         y[i] *= v;
+     }
+ #endif
+ #else
+     // scalar
+     for (int i = 0; i < n; ++i) {
+         y[i] *= v;
+     }
+ #endif
+ }
+
+ inline static void lm_ggml_vec_scale_f16(const int n, lm_ggml_fp16_t * y, const float v) {
+ #if defined(LM_GGML_SIMD)
+     const int np = (n & ~(LM_GGML_F16_STEP - 1));
+
+     LM_GGML_F16_VEC vx = LM_GGML_F16_VEC_SET1(v);
+
+     LM_GGML_F16_VEC ay[LM_GGML_F16_ARR];
+
+     for (int i = 0; i < np; i += LM_GGML_F16_STEP) {
+         for (int j = 0; j < LM_GGML_F16_ARR; j++) {
+             ay[j] = LM_GGML_F16_VEC_LOAD(y + i + j*LM_GGML_F16_EPR, j);
+             ay[j] = LM_GGML_F16_VEC_MUL(ay[j], vx);
+
+             LM_GGML_F16_VEC_STORE(y + i + j*LM_GGML_F16_EPR, ay, j);
+         }
+     }
+
+     // leftovers
+     for (int i = np; i < n; ++i) {
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(y[i])*v);
+     }
+ #else
+     // scalar
+     for (int i = 0; i < n; ++i) {
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(y[i])*v);
+     }
+ #endif
+ }
+
+ inline static void lm_ggml_vec_norm_f32 (const int n, float * s, const float * x) { lm_ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
+ inline static void lm_ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
+ inline static void lm_ggml_vec_sqr_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
+     for (int i = 0; i < n; ++i) {
+         float v = LM_GGML_CPU_FP16_TO_FP32(x[i]);
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(v*v);
+     }
+ }
+ inline static void lm_ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
+ inline static void lm_ggml_vec_sqrt_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
+     for (int i = 0; i < n; ++i) {
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(sqrtf(LM_GGML_CPU_FP16_TO_FP32(x[i])));
+     }
+ }
+ inline static void lm_ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
+ inline static void lm_ggml_vec_log_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
+     for (int i = 0; i < n; ++i) {
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(logf(LM_GGML_CPU_FP16_TO_FP32(x[i])));
+     }
+ }
+ inline static void lm_ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
+ inline static void lm_ggml_vec_sin_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
+     for (int i = 0; i < n; ++i) {
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(sinf(LM_GGML_CPU_FP16_TO_FP32(x[i])));
+     }
+ }
+ inline static void lm_ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
+ inline static void lm_ggml_vec_cos_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
+     for (int i = 0; i < n; ++i) {
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(cosf(LM_GGML_CPU_FP16_TO_FP32(x[i])));
+     }
+ }
+ inline static void lm_ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
+ inline static void lm_ggml_vec_abs_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
+     for (int i = 0; i < n; ++i) {
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(fabsf(LM_GGML_CPU_FP16_TO_FP32(x[i])));
+     }
+ }
+ inline static void lm_ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
+ inline static void lm_ggml_vec_sgn_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
+     for (int i = 0; i < n; ++i) {
+         float v = LM_GGML_CPU_FP16_TO_FP32(x[i]);
+         y[i] = LM_GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
+     }
+ }
+ inline static void lm_ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
+ inline static void lm_ggml_vec_step_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
+     for (int i = 0; i < n; ++i) {
+         y[i] = LM_GGML_CPU_FP32_TO_FP16((LM_GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
+     }
+ }
+ inline static void lm_ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
+ inline static void lm_ggml_vec_tanh_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
+     for (int i = 0; i < n; ++i) {
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(tanhf(LM_GGML_CPU_FP16_TO_FP32(x[i])));
+     }
+ }
+ inline static void lm_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
+ inline static void lm_ggml_vec_elu_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
+     for (int i = 0; i < n; ++i) {
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(expm1f(LM_GGML_CPU_FP16_TO_FP32(x[i])));
+     }
+ }
+ inline static void lm_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
+ inline static void lm_ggml_vec_relu_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
+     for (int i = 0; i < n; ++i) {
+         float v = LM_GGML_CPU_FP16_TO_FP32(x[i]);
+         y[i] = LM_GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f);
+     }
+ }
+ inline static void lm_ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
+ inline static void lm_ggml_vec_leaky_relu_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x, const float ns) {
+     for (int i = 0; i < n; ++i) {
+         float v = LM_GGML_CPU_FP16_TO_FP32(x[i]);
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
+     }
+ }
+ inline static void lm_ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
+ inline static void lm_ggml_vec_sigmoid_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
+     for (int i = 0; i < n; ++i) {
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-LM_GGML_CPU_FP16_TO_FP32(x[i]))));
+     }
+ }
+ // TODO: optimize performance
+ inline static void lm_ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
+ inline static void lm_ggml_vec_hardswish_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
+     for (int i = 0; i < n; ++i) {
+         float v = LM_GGML_CPU_FP16_TO_FP32(x[i]);
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
+     }
+ }
+ inline static void lm_ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
+ inline static void lm_ggml_vec_hardsigmoid_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
+     for (int i = 0; i < n; ++i) {
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (LM_GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
+     }
+ }
+ inline static void lm_ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
+ inline static void lm_ggml_vec_exp_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
+     for (int i = 0; i < n; ++i) {
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(expf(LM_GGML_CPU_FP16_TO_FP32(x[i])));
+     }
+ }
+
+ static const float GELU_COEF_A = 0.044715f;
+ static const float GELU_QUICK_COEF = -1.702f;
+ static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+ static const float SQRT_2_INV = 0.70710678118654752440084436210484f;
+
+ inline static float lm_ggml_gelu_f32(float x) {
+     return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
+ }
+
+ inline static void lm_ggml_vec_gelu_f16(const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
+     const uint16_t * i16 = (const uint16_t *) x;
+     for (int i = 0; i < n; ++i) {
+         y[i] = lm_ggml_table_gelu_f16[i16[i]];
+     }
+ }
+
+ inline static void lm_ggml_vec_gelu_erf_f16(const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
+     for (int i = 0; i < n; ++i) {
+         float xi = LM_GGML_CPU_FP16_TO_FP32(x[i]);
+         float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(res);
+     }
+ }
+
+ #ifdef LM_GGML_GELU_FP16
+ inline static void lm_ggml_vec_gelu_f32(const int n, float * y, const float * x) {
+     uint16_t t;
+     for (int i = 0; i < n; ++i) {
+         if (x[i] <= -10.0f) {
+             y[i] = 0.0f;
+         } else if (x[i] >= 10.0f) {
+             y[i] = x[i];
+         } else {
+             lm_ggml_fp16_t fp16 = LM_GGML_CPU_FP32_TO_FP16(x[i]);
+             memcpy(&t, &fp16, sizeof(uint16_t));
+             y[i] = LM_GGML_CPU_FP16_TO_FP32(lm_ggml_table_gelu_f16[t]);
+         }
+     }
+ }
+ #else
+ inline static void lm_ggml_vec_gelu_f32(const int n, float * y, const float * x) {
+     for (int i = 0; i < n; ++i) {
+         y[i] = lm_ggml_gelu_f32(x[i]);
+     }
+ }
+ #endif
+
+ inline static void lm_ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) {
+     for (int i = 0; i < n; ++i) {
+         float xi = x[i];
+         y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
+     }
+ }
+
+ inline static float lm_ggml_gelu_quick_f32(float x) {
+     return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
+ }
+
+ //inline static void lm_ggml_vec_gelu_quick_f16(const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
+ //    const uint16_t * i16 = (const uint16_t *) x;
+ //    for (int i = 0; i < n; ++i) {
+ //        y[i] = lm_ggml_table_gelu_quick_f16[i16[i]];
+ //    }
+ //}
+
+ #ifdef LM_GGML_GELU_QUICK_FP16
+ inline static void lm_ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
+     uint16_t t;
+     for (int i = 0; i < n; ++i) {
+         lm_ggml_fp16_t fp16 = LM_GGML_CPU_FP32_TO_FP16(x[i]);
+         memcpy(&t, &fp16, sizeof(uint16_t));
+         y[i] = LM_GGML_CPU_FP16_TO_FP32(lm_ggml_table_gelu_quick_f16[t]);
+     }
+ }
+ #else
+ inline static void lm_ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
+     for (int i = 0; i < n; ++i) {
+         y[i] = lm_ggml_gelu_quick_f32(x[i]);
+     }
+ }
+ #endif
+
+ inline static void lm_ggml_vec_gelu_quick_f16(const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
+     for (int i = 0; i < n; ++i) {
+         float v = LM_GGML_CPU_FP16_TO_FP32(x[i]);
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
+     }
+ }
+
+ // Sigmoid Linear Unit (SiLU) function
+ inline static float lm_ggml_silu_f32(float x) {
+     return x/(1.0f + expf(-x));
+ }
+ inline static lm_ggml_fp16_t lm_ggml_silu_f16(lm_ggml_fp16_t x) {
+     float v = LM_GGML_CPU_FP16_TO_FP32(x);
+     return LM_GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v)));
+ }
+
+ #if __FINITE_MATH_ONLY__
+ #error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
+ #error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
+ #endif
+
+ /* Below function was borrowed from the GitHub repository:
+ https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */
+ #if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+ inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) {
+     // Constants
+     const svfloat32_t log2_e = svdup_n_f32(1.4426950409f);
+     const svfloat32_t ln2 = svdup_n_f32(0.6931473921f);
+     const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f);
+     const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1));
+     const svfloat32_t one = svdup_n_f32(1.0f);
+     const svfloat32_t inactive1 = svdup_n_f32(0.0f);
+     const svint32_t inactive2 = svdup_n_s32(0);
+
+     // Algorithm starts here
+     svfloat32_t t0 = svmul_f32_m(pg, src, log2_e); // y = x * log2(e)
+     svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0); // round to int (float)
+     svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1); // n
+
+     t1 = svsub_f32_m(pg, t0, t1); // a = y - floor(y)
+     t1 = svadd_f32_m(pg, t1, one); // b = a + 1
+
+     svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17); // v = b >> 17 (u32)
+     svfloat32_t t4 = svexpa_f32(t3); // c = fexpa(v)
+     t4 = svscale_f32_m(pg, t4, t2); // fexpa(v) * 2^(n)
+
+     // and_(t2.d, t1.d, not_mask17.d)
+     svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17));
+     t5 = svsub_f32_m(pg, t1, t5); // z
+     t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq); // ln2 + half_ln2_sq * z
+     t0 = svmla_f32_m(pg, one, t5, t0); // 1 + (ln2 * z) + (half_ln2_sq * z * z)
+     t0 = svmul_f32_m(pg, t0, t4); // Final result
+
+     return t0;
+ }
+ #endif
+
+ #if defined(__ARM_NEON) && defined(__aarch64__)
+
+ // adapted from arm limited optimized routine
+ // the maximum error is 1.45358 plus 0.5 ulps
+ // numbers above 88.38 will flush to infinity
+ // numbers beneath -103.97 will flush to zero
+ inline static float32x4_t lm_ggml_v_expf(float32x4_t x) {
+     const float32x4_t r = vdupq_n_f32(0x1.8p23f);
+     const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
+     const float32x4_t n = vsubq_f32(z, r);
+     const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
+                                     vdupq_n_f32(0x1.7f7d1cp-20f));
+     const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
+     const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
+     const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
+     const float32x4_t u = vmulq_f32(b, b);
+     const float32x4_t j = vfmaq_f32(
+         vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
+         vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
+                   vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
+     if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
+         return vfmaq_f32(k, j, k);
+     const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
+     const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
+     const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
+     return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
+                      vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
+ }
+
+ // computes silu x/(1+exp(-x)) in single precision vector
+ inline static float32x4_t lm_ggml_v_silu(float32x4_t x) {
+     const float32x4_t one = vdupq_n_f32(1.0f);
+     const float32x4_t zero = vdupq_n_f32(0.0f);
+     const float32x4_t neg_x = vsubq_f32(zero, x);
+     const float32x4_t exp_neg_x = lm_ggml_v_expf(neg_x);
+     const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
+     return vdivq_f32(x, one_plus_exp_neg_x);
+ }
+
+ #elif defined(__AVX512F__) && defined(__AVX512DQ__)
+
+ // adapted from arm limited optimized routine
+ // the maximum error is 1.45358 plus 0.5 ulps
+ // numbers above 88.38 will flush to infinity
+ // numbers beneath -103.97 will flush to zero
+ inline static __m512 lm_ggml_v_expf(__m512 x) {
+     const __m512 r = _mm512_set1_ps(0x1.8p23f);
+     const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
+     const __m512 n = _mm512_sub_ps(z, r);
+     const __m512 b =
+         _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
+                          _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
+     const __mmask16 d =
+         _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
+     const __m512 u = _mm512_mul_ps(b, b);
+     const __m512 j = _mm512_fmadd_ps(
+         _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
+                                         _mm512_set1_ps(0x1.573e2ep-5f)),
+                         u,
+                         _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
+                                         _mm512_set1_ps(0x1.fffdb6p-2f))),
+         u,
+         _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
+     const __m512 res = _mm512_scalef_ps(j, n);
+     if (_mm512_kortestz(d, d))
+         return res;
+     const __m512 zero = _mm512_setzero_ps();
+     const __m512 alt = _mm512_mask_blend_ps(
+         _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
+     return _mm512_mask_blend_ps(d, res, alt);
+ }
+
+ // computes silu x/(1+exp(-x)) in single precision vector
+ inline static __m512 lm_ggml_v_silu(__m512 x) {
+     const __m512 one = _mm512_set1_ps(1);
+     const __m512 zero = _mm512_setzero_ps();
+     const __m512 neg_x = _mm512_sub_ps(zero, x);
+     const __m512 exp_neg_x = lm_ggml_v_expf(neg_x);
+     const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
+     return _mm512_div_ps(x, one_plus_exp_neg_x);
+ }
+
+ #elif defined(__AVX2__) && defined(__FMA__)
+
+ // adapted from arm limited optimized routine
+ // the maximum error is 1.45358 plus 0.5 ulps
+ // numbers above 88.38 will flush to infinity
+ // numbers beneath -103.97 will flush to zero
+ inline static __m256 lm_ggml_v_expf(__m256 x) {
+     const __m256 r = _mm256_set1_ps(0x1.8p23f);
+     const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
+     const __m256 n = _mm256_sub_ps(z, r);
+     const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
+                                       _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
+     const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
+     const __m256 k = _mm256_castsi256_ps(
+         _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
+     const __m256i c = _mm256_castps_si256(
+         _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
+                       _mm256_set1_ps(126), _CMP_GT_OQ));
+     const __m256 u = _mm256_mul_ps(b, b);
+     const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
+                                                                      _mm256_set1_ps(0x1.573e2ep-5f)), u,
+                                                      _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
+                                                                      _mm256_set1_ps(0x1.fffdb6p-2f))),
+                                      u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
+     if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
+         return _mm256_fmadd_ps(j, k, k);
+     const __m256i g = _mm256_and_si256(
+         _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
+         _mm256_set1_epi32(0x82000000u));
+     const __m256 s1 =
+         _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
+     const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
+     const __m256i d = _mm256_castps_si256(
+         _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
+                       _mm256_set1_ps(192), _CMP_GT_OQ));
+     return _mm256_or_ps(
+         _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
+         _mm256_andnot_ps(
+             _mm256_castsi256_ps(d),
+             _mm256_or_ps(
+                 _mm256_and_ps(_mm256_castsi256_ps(c),
+                               _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
+                 _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
+ }
+
+ // computes silu x/(1+exp(-x)) in single precision vector
+ inline static __m256 lm_ggml_v_silu(__m256 x) {
+     const __m256 one = _mm256_set1_ps(1);
+     const __m256 zero = _mm256_setzero_ps();
+     const __m256 neg_x = _mm256_sub_ps(zero, x);
+     const __m256 exp_neg_x = lm_ggml_v_expf(neg_x);
+     const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
+     return _mm256_div_ps(x, one_plus_exp_neg_x);
+ }
+
+ #elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
+
+ #if defined(__FMA__)
+ #define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
+ #define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
+ #else
+ #define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
+ #define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
+ #endif
+
+ // adapted from arm limited optimized routine
+ // the maximum error is 1.45358 plus 0.5 ulps
+ // numbers above 88.38 will flush to infinity
+ // numbers beneath -103.97 will flush to zero
+ inline static __m128 lm_ggml_v_expf(__m128 x) {
+     const __m128 r = _mm_set1_ps(0x1.8p23f);
+     const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
+     const __m128 n = _mm_sub_ps(z, r);
+     const __m128 b =
+         NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
+     const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
+     const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
+     const __m128i c =
+         _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
+     const __m128 u = _mm_mul_ps(b, b);
+     const __m128 j =
+         MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
+                         MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
+                 u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
+     if (!_mm_movemask_epi8(c))
+         return MADD128(j, k, k);
+     const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
+                                     _mm_set1_epi32(0x82000000u));
+     const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
+     const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
+     const __m128i d =
+         _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
+     return _mm_or_ps(
+         _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
+         _mm_andnot_ps(_mm_castsi128_ps(d),
+                       _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
+                                 _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
+ }
+
+ // computes silu x/(1+exp(-x)) in single precision vector
+ inline static __m128 lm_ggml_v_silu(__m128 x) {
+     const __m128 one = _mm_set1_ps(1);
+     const __m128 zero = _mm_setzero_ps();
+     const __m128 neg_x = _mm_sub_ps(zero, x);
+     const __m128 exp_neg_x = lm_ggml_v_expf(neg_x);
+     const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
+     return _mm_div_ps(x, one_plus_exp_neg_x);
+ }
+
+ #endif // __ARM_NEON / __AVX2__ / __SSE2__
+
+ inline static void lm_ggml_vec_silu_f16(const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
+     for (int i = 0; i < n; ++i) {
+         y[i] = lm_ggml_silu_f16(x[i]);
+     }
+ }
+
+ inline static float lm_ggml_silu_backward_f32(float x, float dy) {
+     const float s = 1.0f/(1.0f + expf(-x));
+     return dy*s*(1.0f + x*(1.0f - s));
+ }
+
+ inline static lm_ggml_fp16_t lm_ggml_silu_backward_f16(lm_ggml_fp16_t x, lm_ggml_fp16_t dy) {
+     const float v = LM_GGML_CPU_FP16_TO_FP32(x);
+     const float s = 1.0f/(1.0f + expf(-v));
+     return LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
+ }
+
+ inline static void lm_ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
+     for (int i = 0; i < n; ++i) {
+         dx[i] = lm_ggml_silu_backward_f32(x[i], dy[i]);
+     }
+ }
+
+ inline static void lm_ggml_vec_silu_backward_f16(const int n, lm_ggml_fp16_t * dx, const lm_ggml_fp16_t * x, const lm_ggml_fp16_t * dy) {
+     for (int i = 0; i < n; ++i) {
+         dx[i] = lm_ggml_silu_backward_f16(x[i], dy[i]);
+     }
+ }
+
+ inline static void lm_ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
+     for (int i = 0; i < n; ++i) {
+         y[i] = (x[i] > 0.f) ? x[i] * g[i] : 0.f;
+     }
+ }
+
+ inline static void lm_ggml_vec_reglu_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x, const lm_ggml_fp16_t * g) {
+     for (int i = 0; i < n; ++i) {
+         float v = LM_GGML_CPU_FP16_TO_FP32(x[i]);
+         y[i] = LM_GGML_CPU_FP32_TO_FP16((v > 0.f) ? v * LM_GGML_CPU_FP16_TO_FP32(g[i]) : 0.f);
+     }
+ }
+
+ #ifdef LM_GGML_GELU_FP16
+ inline static void lm_ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
+     uint16_t t;
+     for (int i = 0; i < n; ++i) {
+         if (x[i] <= -10.0f) {
+             y[i] = 0.0f;
+         } else if (x[i] >= 10.0f) {
+             y[i] = x[i] * g[i];
+         } else {
+             lm_ggml_fp16_t fp16 = LM_GGML_CPU_FP32_TO_FP16(x[i]);
+             memcpy(&t, &fp16, sizeof(uint16_t));
+             y[i] = LM_GGML_CPU_FP16_TO_FP32(lm_ggml_table_gelu_f16[t]) * g[i];
+         }
+     }
+ }
+ #else
+ inline static void lm_ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
+     for (int i = 0; i < n; ++i) {
+         y[i] = lm_ggml_gelu_f32(x[i]) * g[i];
+     }
+ }
+ #endif
+
+ inline static void lm_ggml_vec_geglu_f16(const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x, const lm_ggml_fp16_t * g) {
+     const uint16_t * i16 = (const uint16_t *) x;
+     for (int i = 0; i < n; ++i) {
+         float v = LM_GGML_CPU_FP16_TO_FP32(g[i]);
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(lm_ggml_table_gelu_f16[i16[i]]) * v);
+     }
+ }
+
+ void lm_ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g);
+
+ inline static void lm_ggml_vec_swiglu_f16(const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x, const lm_ggml_fp16_t * g) {
+     for (int i = 0; i < n; ++i) {
+         float xi = LM_GGML_CPU_FP16_TO_FP32(x[i]);
+         float gi = LM_GGML_CPU_FP16_TO_FP32(g[i]);
+         y[i] = LM_GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
+     }
+ }
+
+ inline static void lm_ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
+     for (int i = 0; i < n; ++i) {
+         float xi = x[i];
+         y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
+     }
+ }
+
+ inline static void lm_ggml_vec_geglu_erf_f16(const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x, const lm_ggml_fp16_t * g) {
+     for (int i = 0; i < n; ++i) {
+         float xi = LM_GGML_CPU_FP16_TO_FP32(x[i]);
+         float gi = LM_GGML_CPU_FP16_TO_FP32(g[i]);
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
+     }
+ }
+
+ #ifdef LM_GGML_GELU_QUICK_FP16
+ inline static void lm_ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
+     uint16_t t;
+     for (int i = 0; i < n; ++i) {
+         lm_ggml_fp16_t fp16 = LM_GGML_CPU_FP32_TO_FP16(x[i]);
+         memcpy(&t, &fp16, sizeof(uint16_t));
+         y[i] = LM_GGML_CPU_FP16_TO_FP32(lm_ggml_table_gelu_quick_f16[t]) * g[i];
+     }
+ }
+ #else
+ inline static void lm_ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
+     for (int i = 0; i < n; ++i) {
+         y[i] = lm_ggml_gelu_quick_f32(x[i]) * g[i];
+     }
+ }
+ #endif
+
+ inline static void lm_ggml_vec_geglu_quick_f16(const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x, const lm_ggml_fp16_t * g) {
+     const uint16_t * i16 = (const uint16_t *) x;
+     for (int i = 0; i < n; ++i) {
+         float v = LM_GGML_CPU_FP16_TO_FP32(g[i]);
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(lm_ggml_table_gelu_quick_f16[i16[i]]) * v);
+     }
+ }
+
+ inline static void lm_ggml_vec_sum_f32(const int n, float * s, const float * x) {
+ #ifndef LM_GGML_USE_ACCELERATE
+     lm_ggml_float sum = 0.0;
+     for (int i = 0; i < n; ++i) {
+         sum += (lm_ggml_float)x[i];
+     }
+     *s = (float)sum;
+ #else
+     vDSP_sve(x, 1, s, n);
+ #endif
+ }
+
+ inline static void lm_ggml_vec_sum_f32_ggf(const int n, lm_ggml_float * s, const float * x) {
+     lm_ggml_float sum = 0.0;
+     for (int i = 0; i < n; ++i) {
+         sum += (lm_ggml_float)x[i];
+     }
+     *s = sum;
+ }
+
+ inline static void lm_ggml_vec_sum_f16_ggf(const int n, float * s, const lm_ggml_fp16_t * x) {
+     float sum = 0.0f;
+     for (int i = 0; i < n; ++i) {
+         sum += LM_GGML_CPU_FP16_TO_FP32(x[i]);
+     }
+     *s = sum;
+ }
+
+ inline static void lm_ggml_vec_sum_bf16_ggf(const int n, float * s, const lm_ggml_bf16_t * x) {
+     float sum = 0.0f;
+     for (int i = 0; i < n; ++i) {
+         sum += LM_GGML_BF16_TO_FP32(x[i]);
+     }
+     *s = sum;
+ }
+
+ inline static void lm_ggml_vec_max_f32(const int n, float * s, const float * x) {
+ #ifndef LM_GGML_USE_ACCELERATE
+     float max = -INFINITY;
+     for (int i = 0; i < n; ++i) {
+         max = MAX(max, x[i]);
+     }
+     *s = max;
+ #else
+     vDSP_maxv(x, 1, s, n);
+ #endif
+ }
+
+ inline static void lm_ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
+     lm_ggml_vec_norm_f32(n, s, x);
+     *s = 1.f/(*s);
+ }
+
+ inline static void lm_ggml_vec_argmax_f32(const int n, int * s, const float * x) {
+     float max = -INFINITY;
+     int idx = 0;
+     for (int i = 0; i < n; ++i) {
+         max = MAX(max, x[i]);
+         if (max == x[i]) { idx = i; }
+     }
+     *s = idx;
+ }
+
+ #ifdef __cplusplus
+ }
+ #endif
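
Note on the LM_GGML_GELU_FP16 paths in the file above: an fp16 value has only 1 << 16 bit patterns, so lm_ggml_table_gelu_f16 and lm_ggml_table_gelu_quick_f16 can hold GELU precomputed for every possible input, and the hot loop (see lm_ggml_vec_gelu_f16) becomes a single indexed load per element. A minimal standalone sketch of the same technique, assuming a compiler that provides the _Float16 type; ggml itself fills these tables at init time via its own FP16/FP32 conversion macros, and gelu_table_init/gelu_f16 below are illustrative names, not part of the package:

#include <math.h>
#include <stdint.h>
#include <string.h>

static _Float16 gelu_table[1 << 16];   // one entry per fp16 bit pattern (128 KB)

static float gelu_ref(float x) {
    // same tanh approximation as lm_ggml_gelu_f32 above
    const float a = 0.044715f;
    const float s = 0.79788456080286535587989211986876f; // sqrt(2/pi)
    return 0.5f*x*(1.0f + tanhf(s*x*(1.0f + a*x*x)));
}

static void gelu_table_init(void) {
    for (uint32_t t = 0; t < (1u << 16); ++t) {
        uint16_t bits = (uint16_t) t;
        _Float16 h;
        memcpy(&h, &bits, sizeof(h));            // reinterpret the bits as an fp16 value
        gelu_table[t] = (_Float16) gelu_ref((float) h);
    }
}

// after init, GELU over fp16 data is one table lookup per element
static void gelu_f16(int n, _Float16 * y, const _Float16 * x) {
    const uint16_t * i16 = (const uint16_t *) x;
    for (int i = 0; i < n; ++i) {
        y[i] = gelu_table[i16[i]];
    }
}

The fp32 variant guarded by LM_GGML_GELU_FP16 rounds its input to fp16 before the lookup, which is presumably why it special-cases |x| >= 10, where GELU is effectively 0 or x and the fp16 rounding would otherwise be visible.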
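
The __ARM_FEATURE_SVE branches of lm_ggml_vec_mad_f32 and lm_ggml_vec_scale_f32 share one pattern: full-width vectors in the unrolled main loop, then a single predicated pass over the ragged tail (svwhilelt_b32 plus masked load/op/store) instead of a scalar cleanup loop. A minimal sketch of that tail handling in isolation, assuming an SVE-enabled AArch64 toolchain; axpy_sve is an illustrative name, not part of the package:

#include <arm_sve.h>

// y[i] += x[i]*v, with the tail handled by a predicate instead of scalar code
static void axpy_sve(int n, float * y, const float * x, float v) {
    const int epr = (int) svcntw();          // fp32 lanes per SVE vector
    int i = 0;
    for (; i + epr <= n; i += epr) {         // full vectors
        svbool_t pg = svptrue_b32();
        svfloat32_t ax = svld1_f32(pg, x + i);
        svfloat32_t ay = svld1_f32(pg, y + i);
        svst1_f32(pg, y + i, svmla_n_f32_m(pg, ay, ax, v));   // ay + ax*v
    }
    if (i < n) {                             // leftovers: only lanes [i, n) active
        svbool_t pg = svwhilelt_b32(i, n);
        svfloat32_t ax = svld1_f32(pg, x + i);
        svfloat32_t ay = svld1_f32(pg, y + i);
        svst1_f32(pg, y + i, svmla_n_f32_m(pg, ay, ax, v));
    }
}

Inactive lanes in the predicated store are simply not written, so there is no out-of-bounds access even when n is not a multiple of the vector length.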