@fugood/llama.node 0.3.16 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202) hide show
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +238 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +6 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  130. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
  131. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  133. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  135. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  136. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
  142. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  143. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  144. package/src/llama.cpp/include/llama.h +30 -11
  145. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  147. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  149. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  150. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  151. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  152. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  153. package/src/llama.cpp/src/llama-arch.cpp +160 -17
  154. package/src/llama.cpp/src/llama-arch.h +16 -0
  155. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  156. package/src/llama.cpp/src/llama-chat.h +6 -2
  157. package/src/llama.cpp/src/llama-context.cpp +108 -92
  158. package/src/llama.cpp/src/llama-context.h +1 -2
  159. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  160. package/src/llama.cpp/src/llama-graph.h +26 -6
  161. package/src/llama.cpp/src/llama-hparams.h +13 -0
  162. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  163. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  164. package/src/llama.cpp/src/llama-memory.h +1 -1
  165. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  166. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  167. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  168. package/src/llama.cpp/src/llama-model.cpp +1760 -534
  169. package/src/llama.cpp/src/llama-model.h +13 -1
  170. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  171. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  172. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  173. package/src/llama.cpp/src/llama.cpp +1 -1
  174. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  175. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  176. package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
  177. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  178. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  179. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  180. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  181. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  182. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  183. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  184. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  185. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  186. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  188. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  189. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  190. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  191. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  192. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  193. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
@@ -0,0 +1,802 @@
1
+ // Vectorized functions for fundamental operations
2
+
3
+ #pragma once
4
+
5
+ #include "ggml-impl.h"
6
+ #include "simd-mappings.h"
7
+ #include "ggml.h"
8
+
9
+ #if defined(GGML_USE_ACCELERATE)
10
+ #include <Accelerate/Accelerate.h>
11
+ #endif
12
+
13
+ // floating point type used to accumulate sums
14
+ typedef double ggml_float;
15
+
16
+ #define GGML_GELU_FP16
17
+ #define GGML_GELU_QUICK_FP16
18
+
19
+ #define GGML_SOFT_MAX_UNROLL 4
20
+ #define GGML_VEC_DOT_UNROLL 2
21
+ #define GGML_VEC_MAD_UNROLL 32
22
+
23
+ #ifdef __cplusplus
24
+ extern "C" {
25
+ #endif
26
+
27
+ //
28
+ // global data
29
+ //
30
+
31
+ // precomputed gelu table for f16 (128 KB)
32
+ extern ggml_fp16_t ggml_table_gelu_f16[1 << 16];
33
+
34
+ // precomputed quick gelu table for f16 (128 KB)
35
+ extern ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
36
+
37
+ //
38
+ // fundamental operations
39
+ //
40
+
41
+ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
42
+ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
43
+ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
44
+
45
+ void ggml_vec_silu_f32(const int n, float * y, const float * x);
46
+ ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
47
+ ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
48
+
49
// Integer fill / copy helpers: write the scalar v into every element of x,
// or copy n elements from x into y.
inline static void ggml_vec_set_i8 (const int n, int8_t * x, const int8_t v) {
    for (int k = 0; k < n; ++k) {
        x[k] = v;
    }
}
inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) {
    for (int k = 0; k < n; ++k) {
        x[k] = v;
    }
}

inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) {
    for (int k = 0; k < n; ++k) {
        x[k] = v;
    }
}
inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) {
    for (int k = 0; k < n; ++k) {
        y[k] = x[k];
    }
}
54
+
55
+ inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
56
+ inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
57
+ inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
58
+ inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
59
+ for (int i = 0; i < n; ++i) {
60
+ z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) + GGML_FP16_TO_FP32(y[i]));
61
+ }
62
+ }
63
+ inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
64
+ inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
65
+ inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
66
+ inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
67
+ inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
68
+ for (int i = 0; i < n; ++i) {
69
+ z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) - GGML_FP16_TO_FP32(y[i]));
70
+ }
71
+ }
72
+ inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
73
+ inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
74
+ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
75
+ inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
76
+ for (int i = 0; i < n; ++i) {
77
+ y[i] = GGML_FP32_TO_FP16(-GGML_FP16_TO_FP32(x[i]));
78
+ }
79
+ }
80
+
81
+ inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
82
+ inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
83
+ for (int i = 0; i < n; ++i) {
84
+ z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) * GGML_FP16_TO_FP32(y[i]));
85
+ }
86
+ }
87
+ inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
88
+ inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
89
+ for (int i = 0; i < n; ++i) {
90
+ z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) / GGML_FP16_TO_FP32(y[i]));
91
+ }
92
+ }
93
+
94
+ // compute GGML_VEC_DOT_UNROLL dot products at once
95
+ // xs - x row stride in bytes
96
+ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
97
+ ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
98
+
99
+ ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];
100
+
101
+ for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
102
+ x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
103
+ }
104
+
105
+ #if defined(GGML_SIMD)
106
+ const int np = (n & ~(GGML_F16_STEP - 1));
107
+
108
+ GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
109
+
110
+ GGML_F16_VEC ax[GGML_F16_ARR];
111
+ GGML_F16_VEC ay[GGML_F16_ARR];
112
+
113
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
114
+ for (int j = 0; j < GGML_F16_ARR; j++) {
115
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
116
+
117
+ for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
118
+ ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
119
+
120
+ sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
121
+ }
122
+ }
123
+ }
124
+
125
+ // reduce sum0..sum3 to sum0
126
+ for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
127
+ GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
128
+ }
129
+
130
+ // leftovers
131
+ for (int i = np; i < n; ++i) {
132
+ for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
133
+ sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
134
+ }
135
+ }
136
+ #else
137
+ for (int i = 0; i < n; ++i) {
138
+ for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
139
+ sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
140
+ }
141
+ }
142
+ #endif
143
+
144
+ for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
145
+ s[i] = (float)sumf[i];
146
+ }
147
+ }
148
+
149
+ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
150
+ #if defined(GGML_SIMD)
151
+ const int np = (n & ~(GGML_F32_STEP - 1));
152
+
153
+ GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
154
+
155
+ GGML_F32_VEC ax[GGML_F32_ARR];
156
+ GGML_F32_VEC ay[GGML_F32_ARR];
157
+
158
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
159
+ for (int j = 0; j < GGML_F32_ARR; j++) {
160
+ ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
161
+ ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
162
+ ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
163
+
164
+ GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
165
+ }
166
+ }
167
+
168
+ // leftovers
169
+ for (int i = np; i < n; ++i) {
170
+ y[i] += x[i]*v;
171
+ }
172
+ #else
173
+ // scalar
174
+ for (int i = 0; i < n; ++i) {
175
+ y[i] += x[i]*v;
176
+ }
177
+ #endif
178
+ }
179
+
180
+ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
181
+ #if defined(GGML_SIMD)
182
+ const int np = (n & ~(GGML_F16_STEP - 1));
183
+
184
+ GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
185
+
186
+ GGML_F16_VEC ax[GGML_F16_ARR];
187
+ GGML_F16_VEC ay[GGML_F16_ARR];
188
+
189
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
190
+ for (int j = 0; j < GGML_F16_ARR; j++) {
191
+ ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
192
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
193
+ ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
194
+
195
+ GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
196
+ }
197
+ }
198
+
199
+ // leftovers
200
+ for (int i = np; i < n; ++i) {
201
+ y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
202
+ }
203
+ #else
204
+ // scalar
205
+ for (int i = 0; i < n; ++i) {
206
+ y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
207
+ }
208
+ #endif
209
+ }
210
+
211
+ // xs and vs are byte strides of x and v
212
+ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {
213
+
214
+ const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
215
+ const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];
216
+
217
+ for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
218
+ x[i] = (const float *) ((const char *) xv + i*xs);
219
+ v[i] = (const float *) ((const char *) vv + i*vs);
220
+ }
221
+
222
+ #if defined(GGML_SIMD)
223
+ const int np = (n & ~(GGML_F32_STEP - 1));
224
+
225
+ GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
226
+
227
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
228
+ vx[k] = GGML_F32_VEC_SET1(v[k][0]);
229
+ }
230
+
231
+ GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
232
+ GGML_F32_VEC ay[GGML_F32_ARR];
233
+
234
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
235
+ for (int j = 0; j < GGML_F32_ARR; j++) {
236
+ ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
237
+
238
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
239
+ ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
240
+ ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
241
+ }
242
+
243
+ GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
244
+ }
245
+ }
246
+
247
+ // leftovers
248
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
249
+ for (int i = np; i < n; ++i) {
250
+ y[i] += x[k][i]*v[k][0];
251
+ }
252
+ }
253
+ #else
254
+ // scalar
255
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
256
+ for (int i = 0; i < n; ++i) {
257
+ y[i] += x[k][i]*v[k][0];
258
+ }
259
+ }
260
+ #endif
261
+ }
262
+
263
// y *= v
inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
#if defined(GGML_USE_ACCELERATE)
    vDSP_vsmul(y, 1, &v, y, 1, n);
#elif defined(GGML_SIMD)
    // np: largest multiple of GGML_F32_STEP not exceeding n
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // scalar tail
    for (int i = np; i < n; ++i) {
        y[i] *= v;
    }
#else
    // scalar fallback
    for (int i = 0; i < n; ++i) {
        y[i] *= v;
    }
#endif
}
294
+
295
+ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
296
+ #if defined(GGML_SIMD)
297
+ const int np = (n & ~(GGML_F16_STEP - 1));
298
+
299
+ GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
300
+
301
+ GGML_F16_VEC ay[GGML_F16_ARR];
302
+
303
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
304
+ for (int j = 0; j < GGML_F16_ARR; j++) {
305
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
306
+ ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
307
+
308
+ GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
309
+ }
310
+ }
311
+
312
+ // leftovers
313
+ for (int i = np; i < n; ++i) {
314
+ y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
315
+ }
316
+ #else
317
+ // scalar
318
+ for (int i = 0; i < n; ++i) {
319
+ y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
320
+ }
321
+ #endif
322
+ }
323
+
324
+ inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
325
+ inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
326
+ inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
327
+ for (int i = 0; i < n; ++i) {
328
+ float v = GGML_FP16_TO_FP32(x[i]);
329
+ y[i] = GGML_FP32_TO_FP16(v*v);
330
+ }
331
+ }
332
+ inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
333
+ inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
334
+ for (int i = 0; i < n; ++i) {
335
+ y[i] = GGML_FP32_TO_FP16(sqrtf(GGML_FP16_TO_FP32(x[i])));
336
+ }
337
+ }
338
+ inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
339
+ inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
340
+ for (int i = 0; i < n; ++i) {
341
+ y[i] = GGML_FP32_TO_FP16(logf(GGML_FP16_TO_FP32(x[i])));
342
+ }
343
+ }
344
+ inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
345
+ inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
346
+ for (int i = 0; i < n; ++i) {
347
+ y[i] = GGML_FP32_TO_FP16(sinf(GGML_FP16_TO_FP32(x[i])));
348
+ }
349
+ }
350
+ inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
351
+ inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
352
+ for (int i = 0; i < n; ++i) {
353
+ y[i] = GGML_FP32_TO_FP16(cosf(GGML_FP16_TO_FP32(x[i])));
354
+ }
355
+ }
356
+ inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
357
+ inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
358
+ for (int i = 0; i < n; ++i) {
359
+ y[i] = GGML_FP32_TO_FP16(fabsf(GGML_FP16_TO_FP32(x[i])));
360
+ }
361
+ }
362
+ inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
363
+ inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
364
+ for (int i = 0; i < n; ++i) {
365
+ float v = GGML_FP16_TO_FP32(x[i]);
366
+ y[i] = GGML_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
367
+ }
368
+ }
369
+ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
370
+ inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
371
+ for (int i = 0; i < n; ++i) {
372
+ y[i] = GGML_FP32_TO_FP16((GGML_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
373
+ }
374
+ }
375
+ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
376
+ inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
377
+ for (int i = 0; i < n; ++i) {
378
+ y[i] = GGML_FP32_TO_FP16(tanhf(GGML_FP16_TO_FP32(x[i])));
379
+ }
380
+ }
381
+ inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
382
+ inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
383
+ for (int i = 0; i < n; ++i) {
384
+ y[i] = GGML_FP32_TO_FP16(expm1f(GGML_FP16_TO_FP32(x[i])));
385
+ }
386
+ }
387
+ inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
388
+ inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
389
+ for (int i = 0; i < n; ++i) {
390
+ float v = GGML_FP16_TO_FP32(x[i]);
391
+ y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v : 0.f);
392
+ }
393
+ }
394
+ inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
395
+ inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
396
+ for (int i = 0; i < n; ++i) {
397
+ float v = GGML_FP16_TO_FP32(x[i]);
398
+ y[i] = GGML_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
399
+ }
400
+ }
401
+ inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
402
+ inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
403
+ for (int i = 0; i < n; ++i) {
404
+ y[i] = GGML_FP32_TO_FP16(1.f / (1.f + expf(-GGML_FP16_TO_FP32(x[i]))));
405
+ }
406
+ }
407
+ // TODO: optimize performance
408
+ inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
409
+ inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
410
+ for (int i = 0; i < n; ++i) {
411
+ float v = GGML_FP16_TO_FP32(x[i]);
412
+ y[i] = GGML_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
413
+ }
414
+ }
415
+ inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
416
+ inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
417
+ for (int i = 0; i < n; ++i) {
418
+ y[i] = GGML_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
419
+ }
420
+ }
421
+ inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
422
+ inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
423
+ for (int i = 0; i < n; ++i) {
424
+ y[i] = GGML_FP32_TO_FP16(expf(GGML_FP16_TO_FP32(x[i])));
425
+ }
426
+ }
427
+
428
+ static const float GELU_COEF_A = 0.044715f;
429
+ static const float GELU_QUICK_COEF = -1.702f;
430
+ static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
431
+
432
+ inline static float ggml_gelu_f32(float x) {
433
+ return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
434
+ }
435
+
436
+ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
437
+ const uint16_t * i16 = (const uint16_t *) x;
438
+ for (int i = 0; i < n; ++i) {
439
+ y[i] = ggml_table_gelu_f16[i16[i]];
440
+ }
441
+ }
442
+
443
#ifdef GGML_GELU_FP16
// GELU via the fp16 lookup table. Inputs outside [-10, 10] are saturated
// directly (gelu(x) ~ 0 for x <= -10, gelu(x) ~ x for x >= 10) instead of
// going through the table.
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        const float v = x[i];
        if (v <= -10.0f) {
            y[i] = 0.0f;
        } else if (v >= 10.0f) {
            y[i] = v;
        } else {
            const ggml_fp16_t h = GGML_FP32_TO_FP16(v);
            memcpy(&t, &h, sizeof(uint16_t)); // reinterpret fp16 bits as table index
            y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
        }
    }
}
#else
// Exact (non-table) path: evaluate the tanh approximation per element.
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_f32(x[i]);
    }
}
#endif
465
+
466
+ inline static float ggml_gelu_quick_f32(float x) {
467
+ return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
468
+ }
469
+
470
+ //inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
471
+ // const uint16_t * i16 = (const uint16_t *) x;
472
+ // for (int i = 0; i < n; ++i) {
473
+ // y[i] = ggml_table_gelu_quick_f16[i16[i]];
474
+ // }
475
+ //}
476
+
477
#ifdef GGML_GELU_QUICK_FP16
// Quick-GELU via the precomputed fp16 lookup table: round each input to
// fp16, reinterpret the bits as a table index, widen the result to f32.
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        const ggml_fp16_t h = GGML_FP32_TO_FP16(x[i]);
        memcpy(&t, &h, sizeof(uint16_t));
        y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
    }
}
#else
// Exact path: evaluate the quick-GELU formula per element.
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_quick_f32(x[i]);
    }
}
#endif
493
+
494
+ inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
495
+ for (int i = 0; i < n; ++i) {
496
+ float v = GGML_FP16_TO_FP32(x[i]);
497
+ y[i] = GGML_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
498
+ }
499
+ }
500
+
501
+ // Sigmoid Linear Unit (SiLU) function
502
+ inline static float ggml_silu_f32(float x) {
503
+ return x/(1.0f + expf(-x));
504
+ }
505
+ inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
506
+ float v = GGML_FP16_TO_FP32(x);
507
+ return GGML_FP32_TO_FP16(v/(1.0f + expf(-v)));
508
+ }
509
+
510
+ #if __FINITE_MATH_ONLY__
511
+ #error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
512
+ #error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
513
+ #endif
514
+
515
+ #if defined(__ARM_NEON) && defined(__aarch64__)
516
+
517
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
//
// Standard exp range reduction: x = n*ln(2) + b, so exp(x) = 2^n * exp(b),
// with exp(b)-1 approximated by a polynomial and 2^n built from exponent
// bits. Constants: 0x1.715476p+0 ~ log2(e); 0x1.62e4p-1 / 0x1.7f7d1cp-20
// are a hi/lo split of ln(2).
inline static float32x4_t ggml_v_expf(float32x4_t x) {
    // adding the "shifter" 0x1.8p23 rounds x*log2(e) to an integer held in
    // the low mantissa bits of z; n recovers that integer as a float
    const float32x4_t r = vdupq_n_f32(0x1.8p23f);
    const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
    const float32x4_t n = vsubq_f32(z, r);
    // b = x - n*ln(2), subtracted in two pieces for extra precision
    const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
                                    vdupq_n_f32(0x1.7f7d1cp-20f));
    // e holds n shifted into the float exponent field; k = 2^n
    const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
    const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
    // c flags lanes with |n| > 126, where 2^n alone would over/underflow
    const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
    const float32x4_t u = vmulq_f32(b, b);
    // polynomial approximation of expm1(b)
    const float32x4_t j = vfmaq_f32(
        vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
        vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
                  vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
    // fast path: no extreme lane -> exp(x) = k + k*j
    if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
        return vfmaq_f32(k, j, k);
    // slow path: split the 2^n scaling into s1*s2 so each factor stays
    // representable; lanes with |n| > 192 saturate to s1*s1 (inf or 0)
    const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
    const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
    const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
    return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
                     vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static float32x4_t ggml_v_silu(float32x4_t x) {
    const float32x4_t one = vdupq_n_f32(1.0f);
    const float32x4_t zero = vdupq_n_f32(0.0f);
    const float32x4_t neg_x = vsubq_f32(zero, x);
    const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
    const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
    return vdivq_f32(x, one_plus_exp_neg_x);
}
553
+
554
+ #elif defined(__AVX512F__) && defined(__AVX512DQ__)
555
+
556
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
//
// Same range reduction as the NEON variant (x = n*ln2 + b), but AVX-512's
// scalef instruction performs the j * 2^n scaling directly, so only the
// |n| > 192 lanes need special handling.
inline static __m512 ggml_v_expf(__m512 x) {
  const __m512 r = _mm512_set1_ps(0x1.8p23f);                          // round-to-int shifter
  const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r); // x*log2(e) + shifter
  const __m512 n = _mm512_sub_ps(z, r);                                // n = round(x/ln2)
  // b = x - n*ln(2), hi/lo split for accuracy
  const __m512 b =
      _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
                       _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
  // d flags lanes that saturate to 0 or infinity
  const __mmask16 d =
      _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
  const __m512 u = _mm512_mul_ps(b, b);
  // polynomial approximation of exp(b) (constant term 1.0 included here)
  const __m512 j = _mm512_fmadd_ps(
      _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
                                      _mm512_set1_ps(0x1.573e2ep-5f)),
                      u,
                      _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
                                      _mm512_set1_ps(0x1.fffdb6p-2f))),
      u,
      _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
  const __m512 res = _mm512_scalef_ps(j, n);                           // res = j * 2^n
  if (_mm512_kortestz(d, d))                                           // no saturating lanes
    return res;
  // saturating lanes: 0 for n <= 0 (underflow), +inf otherwise (overflow)
  const __m512 zero = _mm512_setzero_ps();
  const __m512 alt = _mm512_mask_blend_ps(
      _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
  return _mm512_mask_blend_ps(d, res, alt);
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static __m512 ggml_v_silu(__m512 x) {
  const __m512 one = _mm512_set1_ps(1);
  const __m512 zero = _mm512_setzero_ps();
  const __m512 neg_x = _mm512_sub_ps(zero, x);
  const __m512 exp_neg_x = ggml_v_expf(neg_x);
  const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
  return _mm512_div_ps(x, one_plus_exp_neg_x);
}
596
+
597
+ #elif defined(__AVX2__) && defined(__FMA__)
598
+
599
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
//
// Same algorithm as the NEON variant: exp(x) = 2^n * exp(b) with
// n = round(x/ln2) and b = x - n*ln2; blends replace NEON's vbslq.
inline static __m256 ggml_v_expf(__m256 x) {
  const __m256 r = _mm256_set1_ps(0x1.8p23f);                          // round-to-int shifter
  const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
  const __m256 n = _mm256_sub_ps(z, r);                                // n = round(x/ln2)
  // b = x - n*ln(2), hi/lo split for accuracy
  const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
                                    _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
  const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);     // n in exponent bits
  const __m256 k = _mm256_castsi256_ps(
      _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));    // k = 2^n
  // c: lanes with |n| > 126 (plain 2^n would over/underflow)
  const __m256i c = _mm256_castps_si256(
      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),         // andnot clears sign -> |n|
                    _mm256_set1_ps(126), _CMP_GT_OQ));
  const __m256 u = _mm256_mul_ps(b, b);
  // polynomial approximation of expm1(b)
  const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
                                                                   _mm256_set1_ps(0x1.573e2ep-5f)), u,
                                                   _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
                                                                   _mm256_set1_ps(0x1.fffdb6p-2f))),
                                   u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
  if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))                     // fast path: no extreme lane
    return _mm256_fmadd_ps(j, k, k);                                   // exp(x) = k + k*j
  // slow path: factor 2^n into s1*s2 so each factor is representable
  const __m256i g = _mm256_and_si256(
      _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
      _mm256_set1_epi32(0x82000000u));
  const __m256 s1 =
      _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
  const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
  // d: lanes with |n| > 192 saturate to s1*s1 (inf or 0)
  const __m256i d = _mm256_castps_si256(
      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
                    _mm256_set1_ps(192), _CMP_GT_OQ));
  // mask-based select: d ? s1*s1 : (c ? (s2 + s2*j)*s1 : k + k*j)
  return _mm256_or_ps(
      _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
      _mm256_andnot_ps(
          _mm256_castsi256_ps(d),
          _mm256_or_ps(
              _mm256_and_ps(_mm256_castsi256_ps(c),
                            _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
              _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static __m256 ggml_v_silu(__m256 x) {
  const __m256 one = _mm256_set1_ps(1);
  const __m256 zero = _mm256_setzero_ps();
  const __m256 neg_x = _mm256_sub_ps(zero, x);
  const __m256 exp_neg_x = ggml_v_expf(neg_x);
  const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
  return _mm256_div_ps(x, one_plus_exp_neg_x);
}
651
+
652
+ #elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
653
+
654
// fused multiply-add helpers: real FMA when available, otherwise the
// two-instruction (mul + add/sub) fallback with an extra rounding step
#if defined(__FMA__)
#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
#else
#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
#endif

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
//
// 128-bit version of the same algorithm as the AVX2 variant:
// exp(x) = 2^n * exp(b), n = round(x/ln2), b = x - n*ln2.
inline static __m128 ggml_v_expf(__m128 x) {
  const __m128 r = _mm_set1_ps(0x1.8p23f);                        // round-to-int shifter
  const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
  const __m128 n = _mm_sub_ps(z, r);                              // n = round(x/ln2)
  // b = x - n*ln(2), hi/lo split for accuracy
  const __m128 b =
      NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
  const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);      // n in exponent bits
  const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1)))); // 2^n
  // c: lanes with |n| > 126 (plain 2^n would over/underflow)
  const __m128i c =
      _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
  const __m128 u = _mm_mul_ps(b, b);
  // polynomial approximation of expm1(b)
  const __m128 j =
      MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
                      MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
              u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
  if (!_mm_movemask_epi8(c))                                      // fast path: no extreme lane
    return MADD128(j, k, k);                                      // exp(x) = k + k*j
  // slow path: factor 2^n into s1*s2 so each factor is representable;
  // lanes with |n| > 192 saturate to s1*s1 (inf or 0)
  const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
                                  _mm_set1_epi32(0x82000000u));
  const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
  const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
  const __m128i d =
      _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
  // mask-based select: d ? s1*s1 : (c ? (s2 + s2*j)*s1 : k + k*j)
  return _mm_or_ps(
      _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
      _mm_andnot_ps(_mm_castsi128_ps(d),
                    _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
                              _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static __m128 ggml_v_silu(__m128 x) {
  const __m128 one = _mm_set1_ps(1);
  const __m128 zero = _mm_setzero_ps();
  const __m128 neg_x = _mm_sub_ps(zero, x);
  const __m128 exp_neg_x = ggml_v_expf(neg_x);
  const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
  return _mm_div_ps(x, one_plus_exp_neg_x);
}
705
+
706
+ #endif // __ARM_NEON / __AVX2__ / __SSE2__
707
+
708
+ inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
709
+ for (int i = 0; i < n; ++i) {
710
+ y[i] = ggml_silu_f16(x[i]);
711
+ }
712
+ }
713
+
714
+ inline static float ggml_silu_backward_f32(float x, float dy) {
715
+ const float s = 1.0f/(1.0f + expf(-x));
716
+ return dy*s*(1.0f + x*(1.0f - s));
717
+ }
718
+
719
+ inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
720
+ const float v = GGML_FP16_TO_FP32(x);
721
+ const float s = 1.0f/(1.0f + expf(-v));
722
+ return GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
723
+ }
724
+
725
+ inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
726
+ for (int i = 0; i < n; ++i) {
727
+ dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
728
+ }
729
+ }
730
+
731
+ inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, const ggml_fp16_t * x, const ggml_fp16_t * dy) {
732
+ for (int i = 0; i < n; ++i) {
733
+ dx[i] = ggml_silu_backward_f16(x[i], dy[i]);
734
+ }
735
+ }
736
+
737
+ inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
738
+ #ifndef GGML_USE_ACCELERATE
739
+ ggml_float sum = 0.0;
740
+ for (int i = 0; i < n; ++i) {
741
+ sum += (ggml_float)x[i];
742
+ }
743
+ *s = (float)sum;
744
+ #else
745
+ vDSP_sve(x, 1, s, n);
746
+ #endif
747
+ }
748
+
749
+ inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
750
+ ggml_float sum = 0.0;
751
+ for (int i = 0; i < n; ++i) {
752
+ sum += (ggml_float)x[i];
753
+ }
754
+ *s = sum;
755
+ }
756
+
757
+ inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
758
+ float sum = 0.0f;
759
+ for (int i = 0; i < n; ++i) {
760
+ sum += GGML_FP16_TO_FP32(x[i]);
761
+ }
762
+ *s = sum;
763
+ }
764
+
765
+ inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) {
766
+ float sum = 0.0f;
767
+ for (int i = 0; i < n; ++i) {
768
+ sum += GGML_BF16_TO_FP32(x[i]);
769
+ }
770
+ *s = sum;
771
+ }
772
+
773
// Maximum element of an f32 vector; -INFINITY for an empty vector.
// Scalar path uses the project MAX macro; Accelerate path uses vDSP.
inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
    float best = -INFINITY;
    for (int i = 0; i < n; ++i) {
        best = MAX(best, x[i]);
    }
    *s = best;
#else
    vDSP_maxv(x, 1, s, n);
#endif
}
784
+
785
// Reciprocal of the vector norm computed by ggml_vec_norm_f32.
inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
    float norm;
    ggml_vec_norm_f32(n, &norm, x);
    *s = 1.f/norm;
}
789
+
790
// Index of the maximum element; 0 for an empty vector. On ties the LAST
// index with the maximal value wins (the equality check fires again for
// each repeat of the running maximum).
inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
    float best = -INFINITY;
    int best_idx = 0;
    for (int i = 0; i < n; ++i) {
        best = MAX(best, x[i]);
        if (best == x[i]) {
            best_idx = i;
        }
    }
    *s = best_idx;
}
799
+
800
+ #ifdef __cplusplus
801
+ }
802
+ #endif